Skip to content

Commit

Permalink
optimize: drop seed context & add text temps
Browse files Browse the repository at this point in the history
- add param `manual_seed`
- add missing parameters to `refine_text`
  • Loading branch information
fumiama committed Jul 31, 2024
1 parent 63f4868 commit e675a59
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 30 deletions.
6 changes: 3 additions & 3 deletions ChatTTS/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,6 @@ def has_loaded(self, use_decoder=False):
self.logger.warning(f"{module} not initialized.")
not_finish = True

if not not_finish:
self.logger.info("all models has been initialized.")

return not not_finish

def download_models(
Expand Down Expand Up @@ -186,6 +183,7 @@ class RefineTextParams:
min_new_token: int = 0
show_tqdm: bool = True
ensure_non_empty: bool = True
manual_seed: Optional[int] = None

@dataclass(repr=False, eq=False)
class InferCodeParams(RefineTextParams):
Expand Down Expand Up @@ -578,6 +576,7 @@ def _infer_code(
show_tqdm=params.show_tqdm,
ensure_non_empty=params.ensure_non_empty,
stream_batch=params.stream_batch,
manual_seed=params.manual_seed,
context=self.context,
)

Expand Down Expand Up @@ -667,6 +666,7 @@ def _refine_text(
stream=False,
show_tqdm=params.show_tqdm,
ensure_non_empty=params.ensure_non_empty,
manual_seed=params.manual_seed,
context=self.context,
)
)
Expand Down
11 changes: 10 additions & 1 deletion ChatTTS/model/gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def __init__(
self.device = device
self.device_gpt = device_gpt

self.generator = torch.Generator(device=device)

self.config = gpt_config
self.num_vq = int(gpt_config["num_vq"])
self.num_audio_tokens = int(gpt_config["num_audio_tokens"])
Expand Down Expand Up @@ -416,6 +418,7 @@ def generate(
show_tqdm=True,
ensure_non_empty=True,
stream_batch=24,
manual_seed: Optional[int] = None,
context=Context(),
):

Expand Down Expand Up @@ -581,7 +584,13 @@ def generate(

del logits

idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)
if manual_seed is None:
idx_next = torch.multinomial(scores, num_samples=1).to(finish.device)
else:
idx_next = torch.multinomial(
scores, num_samples=1,
generator=self.generator.manual_seed(manual_seed),
).to(finish.device)

del scores

Expand Down
7 changes: 2 additions & 5 deletions examples/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ class ChatTTSParams(BaseModel):
use_decoder: bool = True
do_text_normalization: bool = True
do_homophone_replacement: bool = False
audio_seed: int
text_seed: int
params_refine_text: ChatTTS.Chat.RefineTextParams
params_infer_code: ChatTTS.Chat.InferCodeParams

Expand All @@ -63,13 +61,12 @@ async def generate_voice(params: ChatTTSParams):
logger.info("Text input: %s", str(params.text))

# audio seed
if params.audio_seed:
torch.manual_seed(params.audio_seed)
if params.params_infer_code.manual_seed is not None:
torch.manual_seed(params.params_infer_code.manual_seed)
params.params_infer_code.spk_emb = chat.sample_random_speaker()

# text seed for text refining
if params.params_refine_text:
torch.manual_seed(params.text_seed)
text = chat.infer(
text=params.text, skip_refine_text=False, refine_text_only=True
)
Expand Down
50 changes: 29 additions & 21 deletions examples/web/funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,19 +133,27 @@ def refine_text(
text,
text_seed_input,
refine_text_flag,
temperature,
top_P,
top_K,
):
global chat

if not refine_text_flag:
sleep(1) # to skip fast answer of loading mark
return text

with TorchSeedContext(text_seed_input):
text = chat.infer(
text,
skip_refine_text=False,
refine_text_only=True,
)
text = chat.infer(
text,
skip_refine_text=False,
refine_text_only=True,
params_refine_text=ChatTTS.Chat.RefineTextParams(
temperature=temperature,
top_P=top_P,
top_K=top_K,
manual_seed=text_seed_input,
),
)

return text[0] if isinstance(text, list) else text

Expand All @@ -171,28 +179,28 @@ def generate_audio(
temperature=temperature,
top_P=top_P,
top_K=top_K,
manual_seed=audio_seed_input,
)

if sample_text_input and sample_audio_code_input:
params_infer_code.txt_smp = sample_text_input
params_infer_code.spk_smp = sample_audio_code_input
params_infer_code.spk_emb = None

with TorchSeedContext(audio_seed_input):
wav = chat.infer(
text,
skip_refine_text=True,
params_infer_code=params_infer_code,
stream=stream,
)
if stream:
for gen in wav:
audio = gen[0]
if audio is not None and len(audio) > 0:
yield 24000, float_to_int16(audio).T
del audio
else:
yield 24000, float_to_int16(wav[0]).T
wav = chat.infer(
text,
skip_refine_text=True,
params_infer_code=params_infer_code,
stream=stream,
)
if stream:
for gen in wav:
audio = gen[0]
if audio is not None and len(audio) > 0:
yield 24000, float_to_int16(audio).T
del audio
else:
yield 24000, float_to_int16(wav[0]).T


def interrupt_generate():
Expand Down
3 changes: 3 additions & 0 deletions examples/web/webui.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,9 @@ def make_audio(autoplay, stream):
text_input,
text_seed_input,
refine_text_checkbox,
temperature_slider,
top_p_slider,
top_k_slider,
],
outputs=text_output,
).then(
Expand Down

0 comments on commit e675a59

Please sign in to comment.