Spaces: Running on Zero

Commit: c4fe16f · 1 Parent(s): 04be12f
sync from github

Files changed:
- .gitattributes +1 -0
- .gitignore +2 -1
- examples/cases.jsonl +5 -5
- indextts/cli.py +7 -4
- indextts/infer.py +73 -60
- indextts/infer_v2.py +144 -98
- indextts/s2mel/modules/openvoice/api.py +4 -4
- indextts/s2mel/modules/openvoice/openvoice_app.py +1 -1
- indextts/s2mel/modules/openvoice/utils.py +35 -35
- indextts/utils/front.py +53 -53
- indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 +3 -0
- tools/i18n/locale/en_US.json +24 -21
- tools/i18n/locale/zh_CN.json +7 -3
- webui.py +149 -76

.gitattributes CHANGED
@@ -47,3 +47,4 @@ examples/emo_hate.wav filter=lfs diff=lfs merge=lfs -text
 examples/voice_01.wav filter=lfs diff=lfs merge=lfs -text
 examples/voice_03.wav filter=lfs diff=lfs merge=lfs -text
 examples/voice_04.wav filter=lfs diff=lfs merge=lfs -text
+indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7 filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED
@@ -13,4 +13,5 @@ build/
 *.py[cod]
 *.egg-info/
 .venv
-checkpoints/*
+checkpoints/*
+__MACOSX

examples/cases.jsonl CHANGED
@@ -1,12 +1,12 @@
-{"prompt_audio":"voice_01.wav","text":"Translate for me…
+{"prompt_audio":"voice_01.wav","text":"Translate for me, what is a surprise!","emo_mode":0}
 {"prompt_audio":"voice_02.wav","text":"The palace is strict, no false rumors, Lady Qi!","emo_mode":0}
 {"prompt_audio":"voice_03.wav","text":"这个呀,就是我们精心制作准备的纪念品,大家可以看到这个色泽和这个材质啊,哎呀多么的光彩照人。","emo_mode":0}
 {"prompt_audio":"voice_04.wav","text":"你就需要我这种专业人士的帮助,就像手无缚鸡之力的人进入雪山狩猎,一定需要最老练的猎人指导。","emo_mode":0}
 {"prompt_audio":"voice_05.wav","text":"在真正的日本剑道中,格斗过程极其短暂,常常短至半秒,最长也不超过两秒,利剑相击的转瞬间,已有一方倒在血泊中。但在这电光石火的对决之前,双方都要以一个石雕般凝固的姿势站定,长时间的逼视对方,这一过程可能长达十分钟!","emo_mode":0}
 {"prompt_audio":"voice_06.wav","text":"今天呢,咱们开一部新书,叫《赛博朋克二零七七》。这词儿我听着都新鲜。这赛博朋克啊,简单理解就是“高科技,低生活”。这一听,我就明白了,于老师就爱用那高科技的东西,手机都得拿脚纹开,大冬天为了解锁脱得一丝不挂,冻得跟王八蛋似的。","emo_mode":0}
-{"prompt_audio":"voice_07.wav","emo_audio":"emo_sad.wav","emo_weight": 0…
+{"prompt_audio":"voice_07.wav","emo_audio":"emo_sad.wav","emo_weight": 1.0, "emo_mode":1,"text":"酒楼丧尽天良,开始借机竞拍房间,哎,一群蠢货。"}
-{"prompt_audio":"voice_08.wav","emo_audio":"emo_hate.wav","emo_weight": 0…
+{"prompt_audio":"voice_08.wav","emo_audio":"emo_hate.wav","emo_weight": 1.0, "emo_mode":1,"text":"你看看你,对我还有没有一点父子之间的信任了。"}
-{"prompt_audio":"voice_09.wav","emo_vec_3":0.…
+{"prompt_audio":"voice_09.wav","emo_vec_3":0.8,"emo_mode":2,"text":"对不起嘛!我的记性真的不太好,但是和你在一起的事情,我都会努力记住的~"}
-{"prompt_audio":"voice_10.wav","emo_vec_7":0…
+{"prompt_audio":"voice_10.wav","emo_vec_7":1.0,"emo_mode":2,"text":"哇塞!这个爆率也太高了!欧皇附体了!"}
 {"prompt_audio":"voice_11.wav","emo_mode":3,"emo_text":"极度悲伤","text":"这些年的时光终究是错付了... "}
 {"prompt_audio":"voice_12.wav","emo_mode":3,"emo_text":"You scared me to death! What are you, a ghost?","text":"快躲起来!是他要来了!他要来抓我们了!"}
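
As a quick sanity check of the updated cases file, the short sketch below (not part of the commit) loads examples/cases.jsonl and dispatches on emo_mode; the mapping 0 = no emotion control, 1 = emotion reference audio, 2 = emotion vector, 3 = emotion text is an assumption inferred from the fields each entry uses.

import json

with open("examples/cases.jsonl", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        case = json.loads(line)
        mode = case.get("emo_mode", 0)  # assumed meaning of emo_mode, see note above
        if mode == 1:
            print(case["prompt_audio"], "-> emotion audio:", case["emo_audio"], "weight:", case["emo_weight"])
        elif mode == 2:
            vec = {k: v for k, v in case.items() if k.startswith("emo_vec_")}
            print(case["prompt_audio"], "-> emotion vector:", vec)
        elif mode == 3:
            print(case["prompt_audio"], "-> emotion text:", case["emo_text"])
        else:
            print(case["prompt_audio"], "-> neutral")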

indextts/cli.py CHANGED
@@ -12,9 +12,9 @@ def main():
     parser.add_argument("-o", "--output_path", type=str, default="gen.wav", help="Path to the output wav file")
     parser.add_argument("-c", "--config", type=str, default="checkpoints/config.yaml", help="Path to the config file. Default is 'checkpoints/config.yaml'")
     parser.add_argument("--model_dir", type=str, default="checkpoints", help="Path to the model directory. Default is 'checkpoints'")
-    parser.add_argument("--fp16", action="store_true", default=…
+    parser.add_argument("--fp16", action="store_true", default=False, help="Use FP16 for inference if available")
     parser.add_argument("-f", "--force", action="store_true", default=False, help="Force to overwrite the output file if it exists")
-    parser.add_argument("-d", "--device", type=str, default=None, help="Device to run the model on (cpu, cuda, mps)." )
+    parser.add_argument("-d", "--device", type=str, default=None, help="Device to run the model on (cpu, cuda, mps, xpu)." )
     args = parser.parse_args()
     if len(args.text.strip()) == 0:
         print("ERROR: Text is empty.")
@@ -47,15 +47,18 @@ def main():
     if args.device is None:
         if torch.cuda.is_available():
             args.device = "cuda:0"
-        elif torch.…
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            args.device = "xpu"
+        elif hasattr(torch, "mps") and torch.mps.is_available():
             args.device = "mps"
         else:
            args.device = "cpu"
            args.fp16 = False  # Disable FP16 on CPU
            print("WARNING: Running on CPU may be slow.")

+    # TODO: Add CLI support for IndexTTS2.
     from indextts.infer import IndexTTS
-    tts = IndexTTS(cfg_path=args.config, model_dir=args.model_dir,…
+    tts = IndexTTS(cfg_path=args.config, model_dir=args.model_dir, use_fp16=args.fp16, device=args.device)
     tts.infer(audio_prompt=args.voice, text=args.text.strip(), output_path=output_path)

 if __name__ == "__main__":
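
For reference, the snippet below mirrors the call shape that cli.py now uses: IndexTTS is constructed with the new use_fp16 and device arguments, and device=None triggers the cuda/xpu/mps/cpu auto-selection shown above. It is a minimal sketch that assumes the default checkpoints directory and one of the bundled example voices.

from indextts.infer import IndexTTS

# device=None -> IndexTTS auto-selects cuda / xpu / mps / cpu, as in cli.py
tts = IndexTTS(cfg_path="checkpoints/config.yaml", model_dir="checkpoints",
               use_fp16=False, device=None)
tts.infer(audio_prompt="examples/voice_01.wav",
          text="Translate for me, what is a surprise!",
          output_path="gen.wav")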

indextts/infer.py CHANGED
@@ -26,38 +26,42 @@ from indextts.utils.front import TextNormalizer, TextTokenizer
 
 class IndexTTS:
     def __init__(
-        self, cfg_path="checkpoints/config.yaml", model_dir="checkpoints",…
+        self, cfg_path="checkpoints/config.yaml", model_dir="checkpoints", use_fp16=True, device=None,
         use_cuda_kernel=None,
     ):
         """
         Args:
             cfg_path (str): path to the config file.
             model_dir (str): path to the model directory.
-            …
+            use_fp16 (bool): whether to use fp16.
             device (str): device to use (e.g., 'cuda:0', 'cpu'). If None, it will be set automatically based on the availability of CUDA or MPS.
             use_cuda_kernel (None | bool): whether to use BigVGan custom fused activation CUDA kernel, only for CUDA device.
         """
         if device is not None:
             self.device = device
-            self.…
+            self.use_fp16 = False if device == "cpu" else use_fp16
             self.use_cuda_kernel = use_cuda_kernel is not None and use_cuda_kernel and device.startswith("cuda")
         elif torch.cuda.is_available():
             self.device = "cuda:0"
-            self.…
+            self.use_fp16 = use_fp16
             self.use_cuda_kernel = use_cuda_kernel is None or use_cuda_kernel
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            self.device = "xpu"
+            self.use_fp16 = use_fp16
+            self.use_cuda_kernel = False
         elif hasattr(torch, "mps") and torch.backends.mps.is_available():
             self.device = "mps"
-            self.…
+            self.use_fp16 = False  # Use float16 on MPS is overhead than float32
             self.use_cuda_kernel = False
         else:
             self.device = "cpu"
-            self.…
+            self.use_fp16 = False
             self.use_cuda_kernel = False
             print(">> Be patient, it may take a while to run in CPU mode.")
 
         self.cfg = OmegaConf.load(cfg_path)
         self.model_dir = model_dir
-        self.dtype = torch.float16 if self.…
+        self.dtype = torch.float16 if self.use_fp16 else None
         self.stop_mel_token = self.cfg.gpt.stop_mel_token
 
         # Comment-off to load the VQ-VAE model for debugging tokenizer
@@ -68,7 +72,7 @@ class IndexTTS:
         # self.dvae_path = os.path.join(self.model_dir, self.cfg.dvae_checkpoint)
         # load_checkpoint(self.dvae, self.dvae_path)
         # self.dvae = self.dvae.to(self.device)
-        # if self.…
+        # if self.use_fp16:
         #     self.dvae.eval().half()
         # else:
         #     self.dvae.eval()
@@ -77,12 +81,12 @@ class IndexTTS:
         self.gpt_path = os.path.join(self.model_dir, self.cfg.gpt_checkpoint)
         load_checkpoint(self.gpt, self.gpt_path)
         self.gpt = self.gpt.to(self.device)
-        if self.…
+        if self.use_fp16:
             self.gpt.eval().half()
         else:
             self.gpt.eval()
         print(">> GPT weights restored from:", self.gpt_path)
-        if self.…
+        if self.use_fp16:
             try:
                 import deepspeed
 
@@ -184,17 +188,17 @@ class IndexTTS:
         code_lens = torch.tensor(code_lens, dtype=torch.long, device=device)
         return codes, code_lens
 
-    def …
+    def bucket_segments(self, segments, bucket_max_size=4) -> List[List[Dict]]:
         """
-        …
-        if ``bucket_max_size=1``, return all…
+        Segment data bucketing.
+        if ``bucket_max_size=1``, return all segments in one bucket.
         """
         outputs: List[Dict] = []
-        for idx, sent in enumerate(…
+        for idx, sent in enumerate(segments):
             outputs.append({"idx": idx, "sent": sent, "len": len(sent)})
 
         if len(outputs) > bucket_max_size:
-            # split…
+            # split segments into buckets by segment length
             buckets: List[List[Dict]] = []
             factor = 1.5
             last_bucket = None
@@ -203,7 +207,7 @@ class IndexTTS:
             for sent in sorted(outputs, key=lambda x: x["len"]):
                 current_sent_len = sent["len"]
                 if current_sent_len == 0:
-                    print(">> skip empty…
+                    print(">> skip empty segment")
                     continue
                 if last_bucket is None \
                         or current_sent_len >= int(last_bucket_sent_len_median * factor) \
@@ -213,7 +217,7 @@ class IndexTTS:
                     last_bucket = buckets[-1]
                     last_bucket_sent_len_median = current_sent_len
                 else:
-                    # current bucket can hold more…
+                    # current bucket can hold more segments
                     last_bucket.append(sent)  # sorted
                     mid = len(last_bucket) // 2
                     last_bucket_sent_len_median = last_bucket[mid]["len"]
@@ -276,20 +280,20 @@ class IndexTTS:
             self.gr_progress(value, desc=desc)
 
     # 快速推理:对于“多句长文本”,可实现至少 2~10 倍以上的速度提升~ (First modified by sunnyboxs 2025-04-16)
-    def infer_fast(self, audio_prompt, text, output_path, verbose=False,…
-    …
+    def infer_fast(self, audio_prompt, text, output_path, verbose=False, max_text_tokens_per_segment=100,
+                   segments_bucket_max_size=4, **generation_kwargs):
         """
         Args:
-            ``…
+            ``max_text_tokens_per_segment``: 分句的最大token数,默认``100``,可以根据GPU硬件情况调整
                 - 越小,batch 越多,推理速度越*快*,占用内存更多,可能影响质量
                 - 越大,batch 越少,推理速度越*慢*,占用内存和质量更接近于非快速推理
-            ``…
+            ``segments_bucket_max_size``: 分句分桶的最大容量,默认``4``,可以根据GPU内存调整
                 - 越大,bucket数量越少,batch越多,推理速度越*快*,占用内存更多,可能影响质量
                 - 越小,bucket数量越多,batch越少,推理速度越*慢*,占用内存和质量更接近于非快速推理
         """
-        print(">> …
+        print(">> starting fast inference...")
 
-        self._set_gr_progress(0, "…
+        self._set_gr_progress(0, "starting fast inference...")
         if verbose:
             print(f"origin text:{text}")
         start_time = time.perf_counter()
@@ -301,6 +305,15 @@ class IndexTTS:
         if audio.shape[0] > 1:
             audio = audio[0].unsqueeze(0)
         audio = torchaudio.transforms.Resample(sr, 24000)(audio)
+
+        max_audio_length_seconds = 50
+        max_audio_samples = int(max_audio_length_seconds * 24000)
+
+        if audio.shape[1] > max_audio_samples:
+            if verbose:
+                print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
+            audio = audio[:, :max_audio_samples]
+
         cond_mel = MelSpectrogramFeatures()(audio).to(self.device)
         cond_mel_frame = cond_mel.shape[-1]
         if verbose:
@@ -319,13 +332,13 @@ class IndexTTS:
         # text_tokens
         text_tokens_list = self.tokenizer.tokenize(text)
 
-        …
-        …
+        segments = self.tokenizer.split_segments(text_tokens_list,
+                                                 max_text_tokens_per_segment=max_text_tokens_per_segment)
         if verbose:
             print(">> text token count:", len(text_tokens_list))
-            print("…
-            print("…
-            print(*…
+            print(" segments count:", len(segments))
+            print(" max_text_tokens_per_segment:", max_text_tokens_per_segment)
+            print(*segments, sep="\n")
         do_sample = generation_kwargs.pop("do_sample", True)
         top_p = generation_kwargs.pop("top_p", 0.8)
         top_k = generation_kwargs.pop("top_k", 30)
@@ -346,17 +359,17 @@ class IndexTTS:
         # text processing
         all_text_tokens: List[List[torch.Tensor]] = []
         self._set_gr_progress(0.1, "text processing...")
-        bucket_max_size = …
-        …
-        bucket_count = len(…
+        bucket_max_size = segments_bucket_max_size if self.device != "cpu" else 1
+        all_segments = self.bucket_segments(segments, bucket_max_size=bucket_max_size)
+        bucket_count = len(all_segments)
         if verbose:
-            print(">> …
-                  "bucket sizes:", [(len(s), [t["idx"] for t in s]) for s in …
+            print(">> segments bucket_count:", bucket_count,
+                  "bucket sizes:", [(len(s), [t["idx"] for t in s]) for s in all_segments],
                   "bucket_max_size:", bucket_max_size)
-        for …
+        for segments in all_segments:
             temp_tokens: List[torch.Tensor] = []
             all_text_tokens.append(temp_tokens)
-            for item in …
+            for item in segments:
                 sent = item["sent"]
                 text_tokens = self.tokenizer.convert_tokens_to_ids(sent)
                 text_tokens = torch.tensor(text_tokens, dtype=torch.int32, device=self.device).unsqueeze(0)
@@ -365,11 +378,11 @@ class IndexTTS:
                     print(f"text_tokens shape: {text_tokens.shape}, text_tokens type: {text_tokens.dtype}")
                     # debug tokenizer
                     text_token_syms = self.tokenizer.convert_ids_to_tokens(text_tokens[0].tolist())
-                    print("text_token_syms is same as …
+                    print("text_token_syms is same as segment tokens", text_token_syms == sent)
                 temp_tokens.append(text_tokens)
 
         # Sequential processing of bucketing data
-        all_batch_num = sum(len(s) for s in …
+        all_batch_num = sum(len(s) for s in all_segments)
         all_batch_codes = []
         processed_num = 0
         for item_tokens in all_text_tokens:
@@ -381,7 +394,7 @@ class IndexTTS:
             processed_num += batch_num
             # gpt speech
             self._set_gr_progress(0.2 + 0.3 * processed_num / all_batch_num,
-                                  f"gpt inference…
+                                  f"gpt speech inference {processed_num}/{all_batch_num}...")
             m_start_time = time.perf_counter()
             with torch.no_grad():
                 with torch.amp.autocast(batch_text_tokens.device.type, enabled=self.dtype is not None,
@@ -403,17 +416,17 @@ class IndexTTS:
         gpt_gen_time += time.perf_counter() - m_start_time
 
         # gpt latent
-        self._set_gr_progress(0.5, "gpt inference…
+        self._set_gr_progress(0.5, "gpt latents inference...")
         all_idxs = []
         all_latents = []
         has_warned = False
-        for batch_codes, batch_tokens, …
+        for batch_codes, batch_tokens, batch_segments in zip(all_batch_codes, all_text_tokens, all_segments):
             for i in range(batch_codes.shape[0]):
                 codes = batch_codes[i]  # [x]
                 if not has_warned and codes[-1] != self.stop_mel_token:
                     warnings.warn(
                         f"WARN: generation stopped due to exceeding `max_mel_tokens` ({max_mel_tokens}). "
-                        f"Consider reducing `…
+                        f"Consider reducing `max_text_tokens_per_segment`({max_text_tokens_per_segment}) or increasing `max_mel_tokens`.",
                         category=RuntimeWarning
                     )
                     has_warned = True
@@ -427,7 +440,7 @@ class IndexTTS:
                     print(codes)
                     print("code_lens:", code_lens)
                 text_tokens = batch_tokens[i]
-                all_idxs.append(…
+                all_idxs.append(batch_segments[i]["idx"])
                 m_start_time = time.perf_counter()
                 with torch.no_grad():
                     with torch.amp.autocast(text_tokens.device.type, enabled=self.dtype is not None, dtype=self.dtype):
@@ -440,7 +453,7 @@ class IndexTTS:
                             return_latent=True, clip_inputs=False)
                 gpt_forward_time += time.perf_counter() - m_start_time
                 all_latents.append(latent)
-        del all_batch_codes, all_text_tokens, …
+        del all_batch_codes, all_text_tokens, all_segments
         # bigvgan chunk
         chunk_size = 2
         all_latents = [all_latents[all_idxs.index(i)] for i in range(len(all_latents))]
@@ -452,7 +465,7 @@ class IndexTTS:
         latent_length = len(all_latents)
 
         # bigvgan chunk decode
-        self._set_gr_progress(0.7, "bigvgan…
+        self._set_gr_progress(0.7, "bigvgan decoding...")
        tqdm_progress = tqdm(total=latent_length, desc="bigvgan")
        for items in chunk_latents:
            tqdm_progress.update(len(items))
@@ -474,7 +487,7 @@ class IndexTTS:
         self.torch_empty_cache()
 
         # wav audio output
-        self._set_gr_progress(0.9, "…
+        self._set_gr_progress(0.9, "saving audio...")
         wav = torch.cat(wavs, dim=1)
         wav_length = wav.shape[-1] / sampling_rate
         print(f">> Reference audio length: {cond_mel_frame * 256 / sampling_rate:.2f} seconds")
@@ -503,10 +516,10 @@ class IndexTTS:
         return (sampling_rate, wav_data)
 
     # 原始推理模式
-    def infer(self, audio_prompt, text, output_path, verbose=False,…
+    def infer(self, audio_prompt, text, output_path, verbose=False, max_text_tokens_per_segment=120,
              **generation_kwargs):
-        print(">> …
-        self._set_gr_progress(0, "…
+        print(">> starting inference...")
+        self._set_gr_progress(0, "starting inference...")
        if verbose:
            print(f"origin text:{text}")
        start_time = time.perf_counter()
@@ -533,12 +546,12 @@ class IndexTTS:
         self._set_gr_progress(0.1, "text processing...")
         auto_conditioning = cond_mel
         text_tokens_list = self.tokenizer.tokenize(text)
-        …
+        segments = self.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment)
         if verbose:
             print("text token count:", len(text_tokens_list))
-            print("…
-            print("…
-            print(*…
+            print("segments count:", len(segments))
+            print("max_text_tokens_per_segment:", max_text_tokens_per_segment)
+            print(*segments, sep="\n")
         do_sample = generation_kwargs.pop("do_sample", True)
         top_p = generation_kwargs.pop("top_p", 0.8)
         top_k = generation_kwargs.pop("top_k", 30)
@@ -557,7 +570,7 @@ class IndexTTS:
         bigvgan_time = 0
         progress = 0
         has_warned = False
-        for sent in …
+        for sent in segments:
             text_tokens = self.tokenizer.convert_tokens_to_ids(sent)
             text_tokens = torch.tensor(text_tokens, dtype=torch.int32, device=self.device).unsqueeze(0)
             # text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
@@ -568,13 +581,13 @@ class IndexTTS:
                 print(f"text_tokens shape: {text_tokens.shape}, text_tokens type: {text_tokens.dtype}")
                 # debug tokenizer
                 text_token_syms = self.tokenizer.convert_ids_to_tokens(text_tokens[0].tolist())
-                print("text_token_syms is same as …
+                print("text_token_syms is same as segment tokens", text_token_syms == sent)
 
             # text_len = torch.IntTensor([text_tokens.size(1)], device=text_tokens.device)
             # print(text_len)
             progress += 1
-            self._set_gr_progress(0.2 + 0.4 * (progress - 1) / len(…
-                                  f"gpt inference…
+            self._set_gr_progress(0.2 + 0.4 * (progress - 1) / len(segments),
+                                  f"gpt latents inference {progress}/{len(segments)}...")
             m_start_time = time.perf_counter()
             with torch.no_grad():
                 with torch.amp.autocast(text_tokens.device.type, enabled=self.dtype is not None, dtype=self.dtype):
@@ -597,7 +610,7 @@ class IndexTTS:
                     warnings.warn(
                         f"WARN: generation stopped due to exceeding `max_mel_tokens` ({max_mel_tokens}). "
                         f"Input text tokens: {text_tokens.shape[1]}. "
-                        f"Consider reducing `…
+                        f"Consider reducing `max_text_tokens_per_segment`({max_text_tokens_per_segment}) or increasing `max_mel_tokens`.",
                         category=RuntimeWarning
                     )
                     has_warned = True
@@ -615,8 +628,8 @@ class IndexTTS:
                 print(codes, type(codes))
                 print(f"fix codes shape: {codes.shape}, codes type: {codes.dtype}")
                 print(f"code len: {code_lens}")
-            self._set_gr_progress(0.2 + 0.4 * progress / len(…
-                                  f"gpt inference…
+            self._set_gr_progress(0.2 + 0.4 * progress / len(segments),
+                                  f"gpt speech inference {progress}/{len(segments)}...")
             m_start_time = time.perf_counter()
             # latent, text_lens_out, code_lens_out = \
             with torch.amp.autocast(text_tokens.device.type, enabled=self.dtype is not None, dtype=self.dtype):
@@ -640,7 +653,7 @@ class IndexTTS:
             # wavs.append(wav[:, :-512])
             wavs.append(wav.cpu())  # to cpu before saving
         end_time = time.perf_counter()
-        self._set_gr_progress(0.9, "…
+        self._set_gr_progress(0.9, "saving audio...")
         wav = torch.cat(wavs, dim=1)
         wav_length = wav.shape[-1] / sampling_rate
         print(f">> Reference audio length: {cond_mel_frame * 256 / sampling_rate:.2f} seconds")
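
The renamed parameters above (max_text_tokens_per_segment, segments_bucket_max_size) are ordinary keyword arguments on infer_fast. A minimal sketch of a call, assuming the default checkpoints directory and one of the bundled example voices:

from indextts.infer import IndexTTS

tts = IndexTTS(cfg_path="checkpoints/config.yaml", model_dir="checkpoints")
tts.infer_fast(
    audio_prompt="examples/voice_02.wav",
    text="The palace is strict, no false rumors, Lady Qi!",
    output_path="gen_fast.wav",
    max_text_tokens_per_segment=100,  # smaller -> more, shorter segments; faster but may affect quality
    segments_bucket_max_size=4,       # larger -> fewer buckets, bigger batches; uses more memory
)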

indextts/infer_v2.py CHANGED
@@ -1,6 +1,9 @@
 import os
 from subprocess import CalledProcessError
 
+os.environ['HF_HUB_CACHE'] = './checkpoints/hf_cache'
+import json
+import re
 import time
 import librosa
 import torch
@@ -34,38 +37,43 @@ import torch.nn.functional as F
 
 class IndexTTS2:
     def __init__(
-        self, cfg_path="checkpoints/config.yaml", model_dir="checkpoints",…
+        self, cfg_path="checkpoints/config.yaml", model_dir="checkpoints", use_fp16=False, device=None,
         use_cuda_kernel=None,use_deepspeed=False
     ):
         """
         Args:
             cfg_path (str): path to the config file.
             model_dir (str): path to the model directory.
-            …
+            use_fp16 (bool): whether to use fp16.
             device (str): device to use (e.g., 'cuda:0', 'cpu'). If None, it will be set automatically based on the availability of CUDA or MPS.
             use_cuda_kernel (None | bool): whether to use BigVGan custom fused activation CUDA kernel, only for CUDA device.
+            use_deepspeed (bool): whether to use DeepSpeed or not.
         """
         if device is not None:
             self.device = device
-            self.…
+            self.use_fp16 = False if device == "cpu" else use_fp16
             self.use_cuda_kernel = use_cuda_kernel is not None and use_cuda_kernel and device.startswith("cuda")
         elif torch.cuda.is_available():
             self.device = "cuda:0"
-            self.…
+            self.use_fp16 = use_fp16
             self.use_cuda_kernel = use_cuda_kernel is None or use_cuda_kernel
+        elif hasattr(torch, "xpu") and torch.xpu.is_available():
+            self.device = "xpu"
+            self.use_fp16 = use_fp16
+            self.use_cuda_kernel = False
         elif hasattr(torch, "mps") and torch.backends.mps.is_available():
             self.device = "mps"
-            self.…
+            self.use_fp16 = False  # Use float16 on MPS is overhead than float32
             self.use_cuda_kernel = False
         else:
             self.device = "cpu"
-            self.…
+            self.use_fp16 = False
             self.use_cuda_kernel = False
             print(">> Be patient, it may take a while to run in CPU mode.")
 
         self.cfg = OmegaConf.load(cfg_path)
         self.model_dir = model_dir
-        self.dtype = torch.float16 if self.…
+        self.dtype = torch.float16 if self.use_fp16 else None
         self.stop_mel_token = self.cfg.gpt.stop_mel_token
 
         self.qwen_emo = QwenEmotion(os.path.join(self.model_dir, self.cfg.qwen_emo_path))
@@ -74,32 +82,30 @@ class IndexTTS2:
         self.gpt_path = os.path.join(self.model_dir, self.cfg.gpt_checkpoint)
         load_checkpoint(self.gpt, self.gpt_path)
         self.gpt = self.gpt.to(self.device)
-        if self.…
+        if self.use_fp16:
             self.gpt.eval().half()
         else:
             self.gpt.eval()
         print(">> GPT weights restored from:", self.gpt_path)
-        …
+
+        if use_deepspeed:
             try:
                 import deepspeed
-                …
             except (ImportError, OSError, CalledProcessError) as e:
                 use_deepspeed = False
-                print(f">> DeepSpeed…
+                print(f">> Failed to load DeepSpeed. Falling back to normal inference. Error: {e}")
 
-        …
-        else:
-            self.gpt.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=True, half=False)
+        self.gpt.post_init_gpt2_config(use_deepspeed=use_deepspeed, kv_cache=True, half=self.use_fp16)
 
         if self.use_cuda_kernel:
             # preload the CUDA kernel for BigVGAN
             try:
-                from indextts.…
+                from indextts.s2mel.modules.bigvgan.alias_free_activation.cuda import activation1d
 
-                …
-                …
-            except:
+                print(">> Preload custom CUDA kernel for BigVGAN", activation1d.anti_alias_activation_cuda)
+            except Exception as e:
                 print(">> Failed to load custom CUDA kernel for BigVGAN. Falling back to torch.")
+                print(f"{e!r}")
                 self.use_cuda_kernel = False
 
         self.extract_features = SeamlessM4TFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
@@ -143,7 +149,7 @@ class IndexTTS2:
         print(">> campplus_model weights restored from:", campplus_ckpt_path)
 
         bigvgan_name = self.cfg.vocoder.name
-        self.bigvgan = bigvgan.BigVGAN.from_pretrained(bigvgan_name, use_cuda_kernel=…
+        self.bigvgan = bigvgan.BigVGAN.from_pretrained(bigvgan_name, use_cuda_kernel=self.use_cuda_kernel)
         self.bigvgan = self.bigvgan.to(self.device)
         self.bigvgan.remove_weight_norm()
         self.bigvgan.eval()
@@ -261,7 +267,7 @@ class IndexTTS2:
 
     def insert_interval_silence(self, wavs, sampling_rate=22050, interval_silence=200):
         """
-        Insert silences between…
+        Insert silences between generated segments.
         wavs: List[torch.tensor]
         """
 
@@ -286,47 +292,69 @@ class IndexTTS2:
         if self.gr_progress is not None:
             self.gr_progress(value, desc=desc)
 
+    def _load_and_cut_audio(self,audio_path,max_audio_length_seconds,verbose=False,sr=None):
+        if not sr:
+            audio, sr = librosa.load(audio_path)
+        else:
+            audio, _ = librosa.load(audio_path,sr=sr)
+        audio = torch.tensor(audio).unsqueeze(0)
+        max_audio_samples = int(max_audio_length_seconds * sr)
+
+        if audio.shape[1] > max_audio_samples:
+            if verbose:
+                print(f"Audio too long ({audio.shape[1]} samples), truncating to {max_audio_samples} samples")
+            audio = audio[:, :max_audio_samples]
+        return audio, sr
+
     # 原始推理模式
     def infer(self, spk_audio_prompt, text, output_path,
               emo_audio_prompt=None, emo_alpha=1.0,
               emo_vector=None,
               use_emo_text=False, emo_text=None, use_random=False, interval_silence=200,
-              verbose=False,…
-        print(">> …
-        self._set_gr_progress(0, "…
+              verbose=False, max_text_tokens_per_segment=120, **generation_kwargs):
+        print(">> starting inference...")
+        self._set_gr_progress(0, "starting inference...")
         if verbose:
-            print(f"origin text:{text}, spk_audio_prompt:{spk_audio_prompt},"
-                  f"…
+            print(f"origin text:{text}, spk_audio_prompt:{spk_audio_prompt}, "
+                  f"emo_audio_prompt:{emo_audio_prompt}, emo_alpha:{emo_alpha}, "
                  f"emo_vector:{emo_vector}, use_emo_text:{use_emo_text}, "
                  f"emo_text:{emo_text}")
        start_time = time.perf_counter()

-        if use_emo_text:
+        if use_emo_text or emo_vector is not None:
+            # we're using a text or emotion vector guidance; so we must remove
+            # "emotion reference voice", to ensure we use correct emotion mixing!
             emo_audio_prompt = None
-            …
-            …
-            # …
+
+        if use_emo_text:
+            # automatically generate emotion vectors from text prompt
             if emo_text is None:
-                emo_text = text
-            emo_dict…
-            print(emo_dict)
+                emo_text = text  # use main text prompt
+            emo_dict = self.qwen_emo.inference(emo_text)
+            print(f"detected emotion vectors from text: {emo_dict}")
+            # convert ordered dict to list of vectors; the order is VERY important!
             emo_vector = list(emo_dict.values())
 
         if emo_vector is not None:
-            …
-            …
-            # …
-            …
+            # we have emotion vectors; they can't be blended via alpha mixing
+            # in the main inference process later, so we must pre-calculate
+            # their new strengths here based on the alpha instead!
+            emo_vector_scale = max(0.0, min(1.0, emo_alpha))
+            if emo_vector_scale != 1.0:
+                # scale each vector and truncate to 4 decimals (for nicer printing)
+                emo_vector = [int(x * emo_vector_scale * 10000) / 10000 for x in emo_vector]
+                print(f"scaled emotion vectors to {emo_vector_scale}x: {emo_vector}")
 
         if emo_audio_prompt is None:
+            # we are not using any external "emotion reference voice"; use
+            # speaker's voice as the main emotion reference audio.
             emo_audio_prompt = spk_audio_prompt
+            # must always use alpha=1.0 when we don't have an external reference voice
             emo_alpha = 1.0
-        # assert emo_alpha == 1.0
 
         # 如果参考音频改变了,才需要重新生成, 提升速度
         if self.cache_spk_cond is None or self.cache_spk_audio_prompt != spk_audio_prompt:
-            audio,…
-            audio = torch.tensor(audio).unsqueeze(0)
+            audio,sr = self._load_and_cut_audio(spk_audio_prompt,15,verbose)
             audio_22k = torchaudio.transforms.Resample(sr, 22050)(audio)
             audio_16k = torchaudio.transforms.Resample(sr, 16000)(audio)
 
@@ -377,7 +405,7 @@ class IndexTTS2:
             emovec_mat = emovec_mat.unsqueeze(0)
 
         if self.cache_emo_cond is None or self.cache_emo_audio_prompt != emo_audio_prompt:
-            emo_audio, _ = …
+            emo_audio, _ = self._load_and_cut_audio(emo_audio_prompt,15,verbose,sr=16000)
             emo_inputs = self.extract_features(emo_audio, sampling_rate=16000, return_tensors="pt")
             emo_input_features = emo_inputs["input_features"]
             emo_attention_mask = emo_inputs["attention_mask"]
@@ -392,12 +420,13 @@ class IndexTTS2:
 
         self._set_gr_progress(0.1, "text processing...")
         text_tokens_list = self.tokenizer.tokenize(text)
-        …
+        segments = self.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment)
+        segments_count = len(segments)
         if verbose:
             print("text_tokens_list:", text_tokens_list)
-            print("…
-            print("…
-            print(*…
+            print("segments count:", segments_count)
+            print("max_text_tokens_per_segment:", max_text_tokens_per_segment)
+            print(*segments, sep="\n")
         do_sample = generation_kwargs.pop("do_sample", True)
         top_p = generation_kwargs.pop("top_p", 0.8)
         top_k = generation_kwargs.pop("top_k", 30)
@@ -414,9 +443,11 @@ class IndexTTS2:
         gpt_forward_time = 0
         s2mel_time = 0
         bigvgan_time = 0
-        progress = 0
         has_warned = False
-        for sent in …
+        for seg_idx, sent in enumerate(segments):
+            self._set_gr_progress(0.2 + 0.7 * seg_idx / segments_count,
+                                  f"speech synthesis {seg_idx + 1}/{segments_count}...")
+
             text_tokens = self.tokenizer.convert_tokens_to_ids(sent)
             text_tokens = torch.tensor(text_tokens, dtype=torch.int32, device=self.device).unsqueeze(0)
             if verbose:
@@ -424,7 +455,7 @@ class IndexTTS2:
                 print(f"text_tokens shape: {text_tokens.shape}, text_tokens type: {text_tokens.dtype}")
                 # debug tokenizer
                 text_token_syms = self.tokenizer.convert_ids_to_tokens(text_tokens[0].tolist())
-                print("text_token_syms is same as …
+                print("text_token_syms is same as segment tokens", text_token_syms == sent)
 
             m_start_time = time.perf_counter()
             with torch.no_grad():
@@ -465,7 +496,7 @@ class IndexTTS2:
                     warnings.warn(
                         f"WARN: generation stopped due to exceeding `max_mel_tokens` ({max_mel_tokens}). "
                         f"Input text tokens: {text_tokens.shape[1]}. "
-                        f"Consider reducing `…
+                        f"Consider reducing `max_text_tokens_per_segment`({max_text_tokens_per_segment}) or increasing `max_mel_tokens`.",
                         category=RuntimeWarning
                     )
                     has_warned = True
@@ -546,7 +577,8 @@ class IndexTTS2:
             # wavs.append(wav[:, :-512])
             wavs.append(wav.cpu())  # to cpu before saving
         end_time = time.perf_counter()
-        …
+
+        self._set_gr_progress(0.9, "saving audio...")
         wavs = self.insert_interval_silence(wavs, sampling_rate=sampling_rate, interval_silence=interval_silence)
         wav = torch.cat(wavs, dim=1)
         wav_length = wav.shape[-1] / sampling_rate
@@ -595,59 +627,52 @@ class QwenEmotion:
             device_map="auto"
         )
         self.prompt = "文本情感分类"
-        self.…
-            "愤怒": "angry",
+        self.cn_key_to_en = {
             "高兴": "happy",
-            "…
-            "反感": "hate",
+            "愤怒": "angry",
             "悲伤": "sad",
-            "…
-            "…
-            …
+            "恐惧": "afraid",
+            "反感": "disgusted",
+            # TODO: the "低落" (melancholic) emotion will always be mapped to
+            # "悲伤" (sad) by QwenEmotion's text analysis. it doesn't know the
+            # difference between those emotions even if user writes exact words.
+            # SEE: `self.melancholic_words` for current workaround.
+            "低落": "melancholic",
+            "惊讶": "surprised",
+            "自然": "calm",
+        }
+        self.desired_vector_order = ["高兴", "愤怒", "悲伤", "恐惧", "反感", "低落", "惊讶", "自然"]
+        self.melancholic_words = {
+            # emotion text phrases that will force QwenEmotion's "悲伤" (sad) detection
+            # to become "低落" (melancholic) instead, to fix limitations mentioned above.
+            "低落",
+            "melancholy",
+            "melancholic",
+            "depression",
+            "depressed",
+            "gloomy",
         }
-        self.backup_dict = {"happy": 0, "angry": 0, "sad": 0, "fear": 0, "hate": 0, "low": 0, "surprise": 0,
-                            "neutral": 1.0}
         self.max_score = 1.2
         self.min_score = 0.0
 
+    def clamp_score(self, value):
+        return max(self.min_score, min(self.max_score, value))
+
     def convert(self, content):
-        …
-        ordered_parts = [parts_dict[key] for key in desired_order if key in parts_dict]
-        parts = ordered_parts
-        if len(parts) != len(self.convert_dict):
-            return self.backup_dict
-
-        emotion_dict = {}
-        for part in parts:
-            key_value = part.strip().split(':')
-            if len(key_value) == 2:
-                try:
-                    key = self.convert_dict[key_value[0].strip()]
-                    value = float(key_value[1].strip())
-                    value = max(self.min_score, min(self.max_score, value))
-                    emotion_dict[key] = value
-                except Exception:
-                    continue
-
-        for key in self.backup_dict:
-            if key not in emotion_dict:
-                emotion_dict[key] = 0.0
-
-        if sum(emotion_dict.values()) <= 0:
-            return self.backup_dict
+        # generate emotion vector dictionary:
+        # - insert values in desired order (Python 3.7+ `dict` remembers insertion order)
+        # - convert Chinese keys to English
+        # - clamp all values to the allowed min/max range
+        # - use 0.0 for any values that were missing in `content`
+        emotion_dict = {
+            self.cn_key_to_en[cn_key]: self.clamp_score(content.get(cn_key, 0.0))
+            for cn_key in self.desired_vector_order
+        }
+
+        # default to a calm/neutral voice if all emotion vectors were empty
+        if all(val <= 0.0 for val in emotion_dict.values()):
+            print(">> no emotions detected; using default calm/neutral voice")
+            emotion_dict["calm"] = 1.0
 
         return emotion_dict
 
@@ -680,9 +705,30 @@ class QwenEmotion:
         except ValueError:
             index = 0
 
-        content = self.tokenizer.decode(output_ids[index:], skip_special_tokens=True)
-        …
-        …
+        content = self.tokenizer.decode(output_ids[index:], skip_special_tokens=True)
+
+        # decode the JSON emotion detections as a dictionary
+        try:
+            content = json.loads(content)
+        except json.decoder.JSONDecodeError:
+            # invalid JSON; fallback to manual string parsing
+            # print(">> parsing QwenEmotion response", content)
+            content = {
+                m.group(1): float(m.group(2))
+                for m in re.finditer(r'([^\s":.,]+?)"?\s*:\s*([\d.]+)', content)
+            }
+            # print(">> dict result", content)
+
+        # workaround for QwenEmotion's inability to distinguish "悲伤" (sad) vs "低落" (melancholic).
+        # if we detect any of the IndexTTS "melancholic" words, we swap those vectors
+        # to encode the "sad" emotion as "melancholic" (instead of sadness).
+        text_input_lower = text_input.lower()
+        if any(word in text_input_lower for word in self.melancholic_words):
+            # print(">> before vec swap", content)
+            content["悲伤"], content["低落"] = content.get("低落", 0.0), content.get("悲伤", 0.0)
+            # print(">> after vec swap", content)
+
+        return self.convert(content)
 
 
 if __name__ == "__main__":
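
To illustrate the emotion handling that changes in this diff, the sketch below calls IndexTTS2.infer once with text-derived emotion (use_emo_text, which now also clears any emotion reference audio) and once with an explicit vector scaled by emo_alpha. It assumes the default checkpoints directory; the eight-element vector order (高兴, 愤怒, 悲伤, 恐惧, 反感, 低落, 惊讶, 自然) is taken from desired_vector_order above.

from indextts.infer_v2 import IndexTTS2

tts = IndexTTS2(cfg_path="checkpoints/config.yaml", model_dir="checkpoints", use_fp16=False)

# Emotion inferred from text by QwenEmotion; the emotion vector is built internally.
tts.infer("examples/voice_11.wav", "这些年的时光终究是错付了... ", "gen_sad.wav",
          use_emo_text=True, emo_text="极度悲伤")

# Explicit emotion vector ("surprised" slot set); emo_alpha < 1.0 pre-scales the vector.
tts.infer("examples/voice_10.wav", "哇塞!这个爆率也太高了!欧皇附体了!", "gen_surprised.wav",
          emo_vector=[0, 0, 0, 0, 0, 0, 1.0, 0], emo_alpha=0.8,
          max_text_tokens_per_segment=120)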

indextts/s2mel/modules/openvoice/api.py CHANGED
@@ -63,9 +63,9 @@ class BaseSpeakerTTS(OpenVoiceBaseClass):
         return audio_segments
 
     @staticmethod
-    def …
-        texts = utils.…
-        print(" > Text…
+    def split_segments_into_pieces(text, language_str):
+        texts = utils.split_segment(text, language_str=language_str)
+        print(" > Text split into segments.")
         print('\n'.join(texts))
         print(" > ===========================")
         return texts
@@ -74,7 +74,7 @@ class BaseSpeakerTTS(OpenVoiceBaseClass):
         mark = self.language_marks.get(language.lower(), None)
         assert mark is not None, f"language {language} is not supported"
 
-        texts = self.…
+        texts = self.split_segments_into_pieces(text, mark)
 
         audio_list = []
         for t in texts:
indextts/s2mel/modules/openvoice/openvoice_app.py
CHANGED
@@ -233,7 +233,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             input_text_gr = gr.Textbox(
                 label="Text Prompt",
+                info="One or two sentences at a time produces the best results. Up to 200 text characters.",
                 value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
             )
             style_gr = gr.Dropdown(
indextts/s2mel/modules/openvoice/utils.py
CHANGED
@@ -75,23 +75,23 @@ def bits_to_string(bits_array):
     return output_string
 
 
+def split_segment(text, min_len=10, language_str='[EN]'):
     if language_str in ['EN']:
+        segments = split_segments_latin(text, min_len=min_len)
     else:
+        segments = split_segments_zh(text, min_len=min_len)
+    return segments
 
+def split_segments_latin(text, min_len=10):
+    """Split Long sentences into list of short segments.
 
    Args:
        str: Input sentences.
 
    Returns:
+        List[str]: list of output segments.
    """
+    # deal with dirty text characters
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    text = re.sub('[“”]', '"', text)
@@ -100,36 +100,36 @@ def split_sentences_latin(text, min_len=10):
    text = re.sub('[\n\t ]+', ' ', text)
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # split
+    segments = [s.strip() for s in text.split('$#!')]
+    if len(segments[-1]) == 0: del segments[-1]
 
+    new_segments = []
    new_sent = []
    count_len = 0
+    for ind, sent in enumerate(segments):
        # print(sent)
        new_sent.append(sent)
        count_len += len(sent.split(" "))
+        if count_len > min_len or ind == len(segments) - 1:
            count_len = 0
+            new_segments.append(' '.join(new_sent))
            new_sent = []
+    return merge_short_segments_latin(new_segments)
 
 
+def merge_short_segments_latin(sens):
+    """Avoid short segments by merging them with the following segment.
 
    Args:
+        List[str]: list of input segments.
 
    Returns:
+        List[str]: list of output segments.
    """
    sens_out = []
    for s in sens:
+        # If the previous segment is too short, merge them with
+        # the current segment.
        if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
@@ -142,7 +142,7 @@ def merge_short_sentences_latin(sens):
            pass
    return sens_out
 
+def split_segments_zh(text, min_len=10):
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    # 将文本中的换行符、空格和制表符替换为空格
@@ -150,37 +150,37 @@ def split_sentences_zh(text, min_len=10):
    # 在标点符号后添加一个空格
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # 分隔句子并去除前后空格
+    # segments = [s.strip() for s in re.split('(。|!|?|;)', text)]
+    segments = [s.strip() for s in text.split('$#!')]
+    if len(segments[-1]) == 0: del segments[-1]
 
+    new_segments = []
    new_sent = []
    count_len = 0
+    for ind, sent in enumerate(segments):
        new_sent.append(sent)
        count_len += len(sent)
+        if count_len > min_len or ind == len(segments) - 1:
            count_len = 0
+            new_segments.append(' '.join(new_sent))
            new_sent = []
+    return merge_short_segments_zh(new_segments)
 
 
+def merge_short_segments_zh(sens):
    # return sens
+    """Avoid short segments by merging them with the following segment.
 
    Args:
+        List[str]: list of input segments.
 
    Returns:
+        List[str]: list of output segments.
    """
    sens_out = []
    for s in sens:
        # If the previous sentense is too short, merge them with
+        # the current segment.
        if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
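For illustration, a simplified, self-contained sketch of the segmentation rule used by split_segments_latin above (not an import of the module, and it omits the merge_short_segments_latin pass): punctuation is tagged with a '$#!' sentinel, the text is split there, and pieces are joined until they exceed `min_len` words.

    import re

    def split_latin_sketch(text, min_len=10):
        text = re.sub('([,.!?;])', r'\1 $#!', re.sub('[\n\t ]+', ' ', text))
        pieces = [s.strip() for s in text.split('$#!') if s.strip()]
        out, cur, count = [], [], 0
        for i, piece in enumerate(pieces):
            cur.append(piece)
            count += len(piece.split(" "))
            if count > min_len or i == len(pieces) - 1:
                out.append(' '.join(cur))
                cur, count = [], 0
        return out

    print(split_latin_sketch("Hi there. This is a slightly longer sentence, used for the demo. Bye."))
    # ['Hi there. This is a slightly longer sentence, used for the demo.', 'Bye.']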
indextts/utils/front.py
CHANGED
@@ -91,7 +91,7 @@ class TextNormalizer:
         import platform
         if self.zh_normalizer is not None and self.en_normalizer is not None:
             return
+        if platform.system() != "Linux":  # Mac and Windows
             from wetext import Normalizer
 
             self.zh_normalizer = Normalizer(remove_erhua=False, lang="zh", operator="tn")
@@ -342,8 +342,8 @@ class TextTokenizer:
         return de_tokenized_by_CJK_char(decoded, do_lower_case=do_lower_case)
 
     @staticmethod
+    def split_segments_by_token(
+        tokenized_str: List[str], split_tokens: List[str], max_text_tokens_per_segment: int
     ) -> List[List[str]]:
         """
         将tokenize后的结果按特定token进一步分割
@@ -351,67 +351,67 @@ class TextTokenizer:
         # 处理特殊情况
         if len(tokenized_str) == 0:
             return []
+        segments: List[List[str]] = []
+        current_segment = []
+        current_segment_tokens_len = 0
         for i in range(len(tokenized_str)):
             token = tokenized_str[i]
+            current_segment.append(token)
+            current_segment_tokens_len += 1
+            if current_segment_tokens_len <= max_text_tokens_per_segment:
+                if token in split_tokens and current_segment_tokens_len > 2:
                     if i < len(tokenized_str) - 1:
                         if tokenized_str[i + 1] in ["'", "▁'"]:
                             # 后续token是',则不切分
+                            current_segment.append(tokenized_str[i + 1])
                             i += 1
+                    segments.append(current_segment)
+                    current_segment = []
+                    current_segment_tokens_len = 0
                 continue
             # 如果当前tokens的长度超过最大限制
+            if not ("," in split_tokens or "▁," in split_tokens) and ("," in current_segment or "▁," in current_segment):
                 # 如果当前tokens中有,,则按,分割
+                sub_segments = TextTokenizer.split_segments_by_token(
+                    current_segment, [",", "▁,"], max_text_tokens_per_segment=max_text_tokens_per_segment
                 )
+            elif "-" not in split_tokens and "-" in current_segment:
                 # 没有,,则按-分割
+                sub_segments = TextTokenizer.split_segments_by_token(
+                    current_segment, ["-"], max_text_tokens_per_segment=max_text_tokens_per_segment
                 )
             else:
                 # 按照长度分割
+                sub_segments = []
+                for j in range(0, len(current_segment), max_text_tokens_per_segment):
+                    if j + max_text_tokens_per_segment < len(current_segment):
+                        sub_segments.append(current_segment[j : j + max_text_tokens_per_segment])
                     else:
+                        sub_segments.append(current_segment[j:])
                 warnings.warn(
+                    f"The tokens length of segment exceeds limit: {max_text_tokens_per_segment}, "
+                    f"Tokens in segment: {current_segment}."
                     "Maybe unexpected behavior",
                     RuntimeWarning,
                 )
+            segments.extend(sub_segments)
+            current_segment = []
+            current_segment_tokens_len = 0
+        if current_segment_tokens_len > 0:
+            assert current_segment_tokens_len <= max_text_tokens_per_segment
+            segments.append(current_segment)
         # 如果相邻的句子加起来长度小于最大限制,则合并
+        merged_segments = []
+        for segment in segments:
+            if len(segment) == 0:
                 continue
+            if len(merged_segments) == 0:
+                merged_segments.append(segment)
+            elif len(merged_segments[-1]) + len(segment) <= max_text_tokens_per_segment:
+                merged_segments[-1] = merged_segments[-1] + segment
             else:
+                merged_segments.append(segment)
+        return merged_segments
 
     punctuation_marks_tokens = [
         ".",
@@ -422,9 +422,9 @@ class TextTokenizer:
         "▁?",
         "▁...",  # ellipsis
     ]
+    def split_segments(self, tokenized: List[str], max_text_tokens_per_segment=120) -> List[List[str]]:
+        return TextTokenizer.split_segments_by_token(
+            tokenized, self.punctuation_marks_tokens, max_text_tokens_per_segment=max_text_tokens_per_segment
         )
 
 
@@ -516,19 +516,19 @@ if __name__ == "__main__":
         # 测试 normalize后的字符能被分词器识别
         print(f"`{ch}`", "->", tokenizer.sp_model.Encode(ch, out_type=str))
         print(f"` {ch}`", "->", tokenizer.sp_model.Encode(f" {ch}", out_type=str))
+    max_text_tokens_per_segment = 120
     for i in range(len(cases)):
         print(f"原始文本: {cases[i]}")
         print(f"Normalized: {text_normalizer.normalize(cases[i])}")
         tokens = tokenizer.tokenize(cases[i])
         print("Tokenzied: ", ", ".join([f"`{t}`" for t in tokens]))
+        segments = tokenizer.split_segments(tokens, max_text_tokens_per_segment=max_text_tokens_per_segment)
+        print("Segments count:", len(segments))
+        if len(segments) > 1:
+            for j in range(len(segments)):
+                print(f"  {j}, count:", len(segments[j]), ", tokens:", "".join(segments[j]))
+                if len(segments[j]) > max_text_tokens_per_segment:
+                    print(f"Warning: segment {j} is too long, length: {len(segments[j])}")
         #print(f"Token IDs (first 10): {codes[i][:10]}")
         if tokenizer.unk_token in codes[i]:
             print(f"Warning: `{cases[i]}` contains UNKNOWN token")
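A rough, self-contained sketch of the splitting policy that split_segments_by_token implements (simplified: it drops the comma/hyphen fallback splits and the final merge step, so it is not the repository code): cut after punctuation tokens, never letting a segment exceed `max_tokens`.

    def split_sketch(tokens, split_tokens, max_tokens):
        segments, cur = [], []
        for tok in tokens:
            cur.append(tok)
            if (tok in split_tokens and len(cur) > 2) or len(cur) >= max_tokens:
                segments.append(cur)
                cur = []
        if cur:
            segments.append(cur)
        return segments

    tokens = ["▁hello", "▁world", ".", "▁this", "▁is", "▁a", "▁test", "!"]
    print(split_sketch(tokens, split_tokens=[".", "!"], max_tokens=120))
    # [['▁hello', '▁world', '.'], ['▁this', '▁is', '▁a', '▁test', '!']]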
indextts/utils/maskgct/models/codec/facodec/modules/JDC/bst.t7
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:54dc94364b97e18ac1dfa6287714ed121248cfaac4cfd39d061c6e0a089ef169
+size 21029926
tools/i18n/locale/en_US.json
CHANGED
@@ -1,46 +1,49 @@
 {
+    "本软件以自拟协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.": "This software is open-sourced under customized license. The author has no control over the software, and users of the software, as well as those who distribute the audio generated by the software, assume full responsibility.",
+    "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "If you do not agree to these terms, you are not permitted to use or reference any code or files within the software package. For further details, please refer to the LICENSE files in the root directory.",
     "时长必须为正数": "Duration must be a positive number",
     "请输入有效的浮点数": "Please enter a valid floating-point number",
     "使用情感参考音频": "Use emotion reference audio",
+    "使用情感向量控制": "Use emotion vectors",
     "使用情感描述文本控制": "Use text description to control emotion",
     "上传情感参考音频": "Upload emotion reference audio",
     "情感权重": "Emotion control weight",
     "喜": "Happy",
     "怒": "Angry",
     "哀": "Sad",
+    "惧": "Afraid",
+    "厌恶": "Disgusted",
+    "低落": "Melancholic",
+    "惊喜": "Surprised",
+    "平静": "Calm",
     "情感描述文本": "Emotion description",
+    "请输入情绪描述(或留空以自动使用目标文本作为情绪描述)": "Please input an emotion description (or leave blank to automatically use the main text prompt)",
     "高级生成参数设置": "Advanced generation parameter settings",
     "情感向量之和不能超过1.5,请调整后重试。": "The sum of the emotion vectors cannot exceed 1.5. Please adjust and try again.",
+    "音色参考音频": "Voice Reference",
     "音频生成": "Speech Synthesis",
     "文本": "Text",
     "生成语音": "Synthesize",
     "生成结果": "Synthesis Result",
     "功能设置": "Settings",
+    "分句设置": "Text segmentation settings",
+    "参数会影响音频质量和生成速度": "These parameters affect the audio quality and generation speed.",
+    "分句最大Token数": "Max tokens per generation segment",
+    "建议80~200之间,值越大,分句越长;值越小,分句越碎;过小过大都可能导致音频质量不高": "Recommended range: 80 - 200. Larger values require more VRAM but improves the flow of the speech, while lower values require less VRAM but means more fragmented sentences. Values that are too small or too large may lead to less coherent speech.",
+    "预览分句结果": "Preview of the audio generation segments",
     "序号": "Index",
     "分句内容": "Content",
     "Token数": "Token Count",
     "情感控制方式": "Emotion control method",
     "GPT2 采样设置": "GPT-2 Sampling Configuration",
+    "参数会影响音频多样性和生成速度详见": "Influences both the diversity of the generated audio and the generation speed. For further details, refer to",
+    "是否进行采样": "Enable GPT-2 sampling",
+    "生成Token最大数量,过小导致音频被截断": "Maximum number of tokens to generate. If text exceeds this, the audio will be cut off.",
+    "请上传情感参考音频": "Please upload the emotion reference audio",
+    "当前模型版本": "Current model version: ",
+    "请输入目标文本": "Please input the text to synthesize",
+    "例如:委屈巴巴、危险在悄悄逼近": "e.g. deeply sad, danger is creeping closer",
     "与音色参考音频相同": "Same as the voice reference",
+    "情感随机采样": "Randomize emotion sampling",
+    "显示实验功能": "Show experimental features"
 }
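The locale files are flat key-to-value maps keyed by the original Chinese UI strings. As a rough sketch of how such a table can be consumed (an assumption about the lookup behaviour, not the actual tools/i18n/i18n.py implementation), a missing entry simply falls back to the untranslated key:

    import json

    def make_i18n(locale_path):
        # load the flat locale table, e.g. tools/i18n/locale/en_US.json
        with open(locale_path, "r", encoding="utf-8") as f:
            table = json.load(f)
        return lambda key: table.get(key, key)  # fall back to the untranslated key

    # i18n = make_i18n("tools/i18n/locale/en_US.json")
    # i18n("生成语音")  # -> "Synthesize"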
tools/i18n/locale/zh_CN.json
CHANGED
@@ -1,5 +1,5 @@
 {
+    "本软件以自拟协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.": "本软件以自拟协议开源, 作者不对软件具备任何控制力, 使用软件者、传播软件导出的声音者自负全责.",
     "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.": "如不认可该条款, 则不能使用或引用软件包内任何代码和文件. 详见根目录LICENSE.",
     "时长必须为正数": "时长必须为正数",
     "请输入有效的浮点数": "请输入有效的浮点数",
@@ -17,7 +17,7 @@
     "惊喜": "惊喜",
     "平静": "平静",
     "情感描述文本": "情感描述文本",
+    "请输入情绪描述(或留空以自动使用目标文本作为情绪描述)": "请输入情绪描述(或留空以自动使用目标文本作为情绪描述)",
     "高级生成参数设置": "高级生成参数设置",
     "情感向量之和不能超过1.5,请调整后重试。": "情感向量之和不能超过1.5,请调整后重试。",
     "音色参考音频": "音色参考音频",
@@ -36,5 +36,9 @@
     "Token数": "Token数",
     "情感控制方式": "情感控制方式",
     "GPT2 采样设置": "GPT2 采样设置",
-    "参数会影响音频多样性和生成速度详见": "参数会影响音频多样性和生成速度详见"
+    "参数会影响音频多样性和生成速度详见": "参数会影响音频多样性和生成速度详见",
+    "是否进行采样": "是否进行采样",
+    "生成Token最大数量,过小导致音频被截断": "生成Token最大数量,过小导致音频被截断",
+    "显示实验功能": "显示实验功能",
+    "例如:委屈巴巴、危险在悄悄逼近": "例如:委屈巴巴、危险在悄悄逼近"
 }
webui.py
CHANGED
@@ -1,6 +1,4 @@
 import json
-import logging
-import spaces
 import os
 import sys
 import threading
@@ -8,40 +6,60 @@ import time
 
 import warnings
 
+import numpy as np
 
 warnings.filterwarnings("ignore", category=FutureWarning)
 warnings.filterwarnings("ignore", category=UserWarning)
 
+import pandas as pd
+
 current_dir = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(current_dir)
 sys.path.append(os.path.join(current_dir, "indextts"))
 
 import argparse
+parser = argparse.ArgumentParser(
+    description="IndexTTS WebUI",
+    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+)
 parser.add_argument("--verbose", action="store_true", default=False, help="Enable verbose mode")
 parser.add_argument("--port", type=int, default=7860, help="Port to run the web UI on")
 parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to run the web UI on")
-parser.add_argument("--model_dir", type=str, default="checkpoints", help="Model checkpoints directory")
+parser.add_argument("--model_dir", type=str, default="./checkpoints", help="Model checkpoints directory")
+parser.add_argument("--fp16", action="store_true", default=False, help="Use FP16 for inference if available")
+parser.add_argument("--deepspeed", action="store_true", default=False, help="Use DeepSpeed to accelerate if available")
+parser.add_argument("--cuda_kernel", action="store_true", default=False, help="Use CUDA kernel for inference if available")
+parser.add_argument("--gui_seg_tokens", type=int, default=120, help="GUI: Max tokens per generation segment")
 cmd_args = parser.parse_args()
 
+if not os.path.exists(cmd_args.model_dir):
+    print(f"Model directory {cmd_args.model_dir} does not exist. Please download the model first.")
+    sys.exit(1)
+
+for file in [
+    "bpe.model",
+    "gpt.pth",
+    "config.yaml",
+    "s2mel.pth",
+    "wav2vec2bert_stats.pt"
+]:
+    file_path = os.path.join(cmd_args.model_dir, file)
+    if not os.path.exists(file_path):
+        print(f"Required file {file_path} does not exist. Please download it.")
+        sys.exit(1)
 
 import gradio as gr
-from indextts import infer
 from indextts.infer_v2 import IndexTTS2
 from tools.i18n.i18n import I18nAuto
-from modelscope.hub import api
 
 i18n = I18nAuto(language="Auto")
 MODE = 'local'
 tts = IndexTTS2(model_dir=cmd_args.model_dir,
                 cfg_path=os.path.join(cmd_args.model_dir, "config.yaml"),
+                use_fp16=cmd_args.fp16,
+                use_deepspeed=cmd_args.deepspeed,
+                use_cuda_kernel=cmd_args.cuda_kernel,
+                )
 # 支持的语言列表
 LANGUAGES = {
     "中文": "zh_CN",
@@ -51,6 +69,9 @@ EMO_CHOICES = [i18n("与音色参考音频相同"),
                i18n("使用情感参考音频"),
                i18n("使用情感向量控制"),
                i18n("使用情感描述文本控制")]
+EMO_CHOICES_BASE = EMO_CHOICES[:3]  # 基础选项
+EMO_CHOICES_EXPERIMENTAL = EMO_CHOICES  # 全部选项(包括文本描述)
+
 os.makedirs("outputs/tasks",exist_ok=True)
 os.makedirs("prompts",exist_ok=True)
@@ -79,15 +100,23 @@ with open("examples/cases.jsonl", "r", encoding="utf-8") as f:
                 example.get("emo_vec_5",0),
                 example.get("emo_vec_6",0),
                 example.get("emo_vec_7",0),
-                example.get("emo_vec_8",0)
+                example.get("emo_vec_8",0),
+                example.get("emo_text") is not None]
                 )
 
+def normalize_emo_vec(emo_vec):
+    # emotion factors for better user experience
+    k_vec = [0.75,0.70,0.80,0.80,0.75,0.75,0.55,0.45]
+    tmp = np.array(k_vec) * np.array(emo_vec)
+    if np.sum(tmp) > 0.8:
+        tmp = tmp * 0.8/ np.sum(tmp)
+    return tmp.tolist()
+
 def gen_single(emo_control_method,prompt, text,
                emo_ref_path, emo_weight,
                vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
                emo_text,emo_random,
+               max_text_tokens_per_segment=120,
               *args, progress=gr.Progress()):
     output_path = None
     if not output_path:
@@ -110,28 +139,31 @@ def gen_single(emo_control_method,prompt, text,
     }
     if type(emo_control_method) is not int:
         emo_control_method = emo_control_method.value
-    if emo_control_method == 0:
-        emo_ref_path = None
-        emo_weight = emo_weight
+    if emo_control_method == 0:  # emotion from speaker
+        emo_ref_path = None  # remove external reference audio
+    if emo_control_method == 1:  # emotion from reference audio
+        # normalize emo_alpha for better user experience
+        emo_weight = emo_weight * 0.8
+        pass
+    if emo_control_method == 2:  # emotion from custom vectors
         vec = [vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8]
-        if vec_sum > 1.5:
-            gr.Warning(i18n("情感向量之和不能超过1.5,请调整后重试。"))
-            return
+        vec = normalize_emo_vec(vec)
     else:
+        # don't use the emotion vector inputs for the other modes
        vec = None
 
+    if emo_text == "":
+        # erase empty emotion descriptions; `infer()` will then automatically use the main prompt
+        emo_text = None
+
+    print(f"Emo control mode:{emo_control_method},weight:{emo_weight},vec:{vec}")
     output = tts.infer(spk_audio_prompt=prompt, text=text,
                        output_path=output_path,
                        emo_audio_prompt=emo_ref_path, emo_alpha=emo_weight,
                        emo_vector=vec,
                        use_emo_text=(emo_control_method==3), emo_text=emo_text,use_random=emo_random,
                        verbose=cmd_args.verbose,
+                       max_text_tokens_per_segment=int(max_text_tokens_per_segment),
                        **kwargs)
     return gr.update(value=output,visible=True)
@@ -147,6 +179,7 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
     <a href='https://arxiv.org/abs/2506.21619'><img src='https://img.shields.io/badge/ArXiv-2506.21619-red'></a>
     </p>
     ''')
+
     with gr.Tab(i18n("音频生成")):
         with gr.Row():
             os.makedirs("prompts",exist_ok=True)
@@ -160,49 +193,54 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
             input_text_single = gr.TextArea(label=i18n("文本"),key="input_text_single", placeholder=i18n("请输入目标文本"), info=f"{i18n('当前模型版本')}{tts.model_version or '1.0'}")
             gen_button = gr.Button(i18n("生成语音"), key="gen_button",interactive=True)
             output_audio = gr.Audio(label=i18n("生成结果"), visible=True,key="output_audio")
+            experimental_checkbox = gr.Checkbox(label=i18n("显示实验功能"),value=False)
         with gr.Accordion(i18n("功能设置")):
             # 情感控制选项部分
             with gr.Row():
                 emo_control_method = gr.Radio(
+                    choices=EMO_CHOICES_BASE,
                     type="index",
+                    value=EMO_CHOICES_BASE[0],label=i18n("情感控制方式"))
             # 情感参考音频部分
             with gr.Group(visible=False) as emotion_reference_group:
                 with gr.Row():
                     emo_upload = gr.Audio(label=i18n("上传情感参考音频"), type="filepath")
 
-            with gr.Row():
-                emo_weight = gr.Slider(label=i18n("情感权重"), minimum=0.0, maximum=1.6, value=0.8, step=0.01)
-
             # 情感随机采样
+            with gr.Row(visible=False) as emotion_randomize_group:
+                emo_random = gr.Checkbox(label=i18n("情感随机采样"), value=False)
 
             # 情感向量控制部分
             with gr.Group(visible=False) as emotion_vector_group:
                 with gr.Row():
                     with gr.Column():
+                        vec1 = gr.Slider(label=i18n("喜"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
+                        vec2 = gr.Slider(label=i18n("怒"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
+                        vec3 = gr.Slider(label=i18n("哀"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
+                        vec4 = gr.Slider(label=i18n("惧"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
                     with gr.Column():
+                        vec5 = gr.Slider(label=i18n("厌恶"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
+                        vec6 = gr.Slider(label=i18n("低落"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
+                        vec7 = gr.Slider(label=i18n("惊喜"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
+                        vec8 = gr.Slider(label=i18n("平静"), minimum=0.0, maximum=1.0, value=0.0, step=0.05)
 
             with gr.Group(visible=False) as emo_text_group:
                 with gr.Row():
+                    emo_text = gr.Textbox(label=i18n("情感描述文本"),
+                                          placeholder=i18n("请输入情绪描述(或留空以自动使用目标文本作为情绪描述)"),
+                                          value="",
+                                          info=i18n("例如:委屈巴巴、危险在悄悄逼近"))
+
+            with gr.Row(visible=False) as emo_weight_group:
+                emo_weight = gr.Slider(label=i18n("情感权重"), minimum=0.0, maximum=1.0, value=0.8, step=0.01)
 
-        with gr.Accordion(i18n("高级生成参数设置"), open=False):
+        with gr.Accordion(i18n("高级生成参数设置"), open=False,visible=False) as advanced_settings_group:
             with gr.Row():
                 with gr.Column(scale=1):
-                    gr.Markdown(f"**{i18n('GPT2 采样设置')}** _{i18n('参数会影响音频多样性和生成速度详见')}[Generation strategies](https://huggingface.co/docs/transformers/main/en/generation_strategies)_")
+                    gr.Markdown(f"**{i18n('GPT2 采样设置')}** _{i18n('参数会影响音频多样性和生成速度详见')} [Generation strategies](https://huggingface.co/docs/transformers/main/en/generation_strategies)._")
                     with gr.Row():
-                        do_sample = gr.Checkbox(label="do_sample", value=True, info="是否进行采样")
+                        do_sample = gr.Checkbox(label="do_sample", value=True, info=i18n("是否进行采样"))
                         temperature = gr.Slider(label="temperature", minimum=0.1, maximum=2.0, value=0.8, step=0.1)
                     with gr.Row():
                        top_p = gr.Slider(label="top_p", minimum=0.0, maximum=1.0, value=0.8, step=0.01)
@@ -211,21 +249,22 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
                    with gr.Row():
                        repetition_penalty = gr.Number(label="repetition_penalty", precision=None, value=10.0, minimum=0.1, maximum=20.0, step=0.1)
                        length_penalty = gr.Number(label="length_penalty", precision=None, value=0.0, minimum=-2.0, maximum=2.0, step=0.1)
-                    max_mel_tokens = gr.Slider(label="max_mel_tokens", value=1500, minimum=50, maximum=tts.cfg.gpt.max_mel_tokens, step=10, info="生成Token最大数量,过小导致音频被截断", key="max_mel_tokens")
+                    max_mel_tokens = gr.Slider(label="max_mel_tokens", value=1500, minimum=50, maximum=tts.cfg.gpt.max_mel_tokens, step=10, info=i18n("生成Token最大数量,过小导致音频被截断"), key="max_mel_tokens")
                    # with gr.Row():
                    #     typical_sampling = gr.Checkbox(label="typical_sampling", value=False, info="不建议使用")
                    #     typical_mass = gr.Slider(label="typical_mass", value=0.9, minimum=0.0, maximum=1.0, step=0.1)
                with gr.Column(scale=2):
                    gr.Markdown(f'**{i18n("分句设置")}** _{i18n("参数会影响音频质量和生成速度")}_')
                    with gr.Row():
+                        initial_value = max(20, min(tts.cfg.gpt.max_text_tokens, cmd_args.gui_seg_tokens))
+                        max_text_tokens_per_segment = gr.Slider(
+                            label=i18n("分句最大Token数"), value=initial_value, minimum=20, maximum=tts.cfg.gpt.max_text_tokens, step=2, key="max_text_tokens_per_segment",
                            info=i18n("建议80~200之间,值越大,分句越长;值越小,分句越碎;过小过大都可能导致音频质量不高"),
                        )
+                    with gr.Accordion(i18n("预览分句结果"), open=True) as segments_settings:
+                        segments_preview = gr.Dataframe(
                            headers=[i18n("序号"), i18n("分句内容"), i18n("Token数")],
+                            key="segments_preview",
                            wrap=True,
                        )
            advanced_params = [
@@ -234,8 +273,20 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
                # typical_sampling, typical_mass,
            ]
 
-            if len(example_cases) >
-                gr.Examples(
+            if len(example_cases) > 2:
+                example_table = gr.Examples(
+                    examples=example_cases[:-2],
+                    examples_per_page=20,
+                    inputs=[prompt_audio,
+                            emo_control_method,
+                            input_text_single,
+                            emo_upload,
+                            emo_weight,
+                            emo_text,
+                            vec1,vec2,vec3,vec4,vec5,vec6,vec7,vec8,experimental_checkbox]
+                )
+            elif len(example_cases) > 0:
+                example_table = gr.Examples(
                    examples=example_cases,
                    examples_per_page=20,
                    inputs=[prompt_audio,
@@ -244,71 +295,93 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
                            emo_upload,
                            emo_weight,
                            emo_text,
-                            vec1,vec2,vec3,vec4,vec5,vec6,vec7,vec8]
+                            vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, experimental_checkbox]
                )
 
+            def on_input_text_change(text, max_text_tokens_per_segment):
                if text and len(text) > 0:
                    text_tokens_list = tts.tokenizer.tokenize(text)
 
+                    segments = tts.tokenizer.split_segments(text_tokens_list, max_text_tokens_per_segment=int(max_text_tokens_per_segment))
                    data = []
+                    for i, s in enumerate(segments):
+                        segment_str = ''.join(s)
                        tokens_count = len(s)
+                        data.append([i, segment_str, tokens_count])
                    return {
+                        segments_preview: gr.update(value=data, visible=True, type="array"),
                    }
                else:
                    df = pd.DataFrame([], columns=[i18n("序号"), i18n("分句内容"), i18n("Token数")])
                    return {
+                        segments_preview: gr.update(value=df),
                    }
+
            def on_method_select(emo_control_method):
-                if emo_control_method == 1:
+                if emo_control_method == 1:  # emotion reference audio
                    return (gr.update(visible=True),
                            gr.update(visible=False),
                            gr.update(visible=False),
-                            gr.update(visible=False)
+                            gr.update(visible=False),
+                            gr.update(visible=True)
                            )
-                elif emo_control_method == 2:
+                elif emo_control_method == 2:  # emotion vectors
                    return (gr.update(visible=False),
                            gr.update(visible=True),
                            gr.update(visible=True),
+                            gr.update(visible=False),
                            gr.update(visible=False)
                            )
-                elif emo_control_method == 3:
+                elif emo_control_method == 3:  # emotion text description
                    return (gr.update(visible=False),
                            gr.update(visible=True),
                            gr.update(visible=False),
+                            gr.update(visible=True),
                            gr.update(visible=True)
                            )
-                else:
+                else:  # 0: same as speaker voice
                    return (gr.update(visible=False),
+                            gr.update(visible=False),
                            gr.update(visible=False),
                            gr.update(visible=False),
                            gr.update(visible=False)
                            )
 
+            def on_experimental_change(is_exp):
+                # 切换情感控制选项
+                # 第三个返回值实际没有起作用
+                if is_exp:
+                    return gr.update(choices=EMO_CHOICES_EXPERIMENTAL, value=EMO_CHOICES_EXPERIMENTAL[0]), gr.update(visible=True),gr.update(value=example_cases)
+                else:
+                    return gr.update(choices=EMO_CHOICES_BASE, value=EMO_CHOICES_BASE[0]), gr.update(visible=False),gr.update(value=example_cases[:-2])
+
            emo_control_method.select(on_method_select,
                inputs=[emo_control_method],
                outputs=[emotion_reference_group,
+                        emotion_randomize_group,
                         emotion_vector_group,
+                        emo_text_group,
+                        emo_weight_group]
                )
 
            input_text_single.change(
                on_input_text_change,
+                inputs=[input_text_single, max_text_tokens_per_segment],
+                outputs=[segments_preview]
            )
+
+            experimental_checkbox.change(
+                on_experimental_change,
+                inputs=[experimental_checkbox],
+                outputs=[emo_control_method, advanced_settings_group,example_table.dataset]  # 高级参数Accordion
+            )
+
+            max_text_tokens_per_segment.change(
                on_input_text_change,
+                inputs=[input_text_single, max_text_tokens_per_segment],
+                outputs=[segments_preview]
            )
+
            prompt_audio.upload(update_prompt_audio,
                inputs=[],
                outputs=[gen_button])
@@ -317,7 +390,7 @@ with gr.Blocks(title="IndexTTS Demo") as demo:
        inputs=[emo_control_method,prompt_audio, input_text_single, emo_upload, emo_weight,
                vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8,
                emo_text,emo_random,
+                max_text_tokens_per_segment,
                *advanced_params,
                ],
        outputs=[output_audio])
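A worked example of the normalize_emo_vec() scaling introduced above (a standalone snippet mirroring that function): each emotion slider is damped by a fixed per-emotion factor, and if the damped values still sum to more than 0.8 the whole vector is rescaled to that cap.

    import numpy as np

    # same per-emotion factors as in webui.py:
    # [happy, angry, sad, afraid, disgusted, melancholic, surprised, calm]
    k_vec = [0.75, 0.70, 0.80, 0.80, 0.75, 0.75, 0.55, 0.45]

    def normalize_emo_vec(emo_vec):
        tmp = np.array(k_vec) * np.array(emo_vec)
        if np.sum(tmp) > 0.8:
            tmp = tmp * 0.8 / np.sum(tmp)
        return tmp.tolist()

    # "happy" and "surprised" both at 1.0: damped sum is 0.75 + 0.55 = 1.30 > 0.8,
    # so the vector is rescaled to roughly [0.4615, 0, 0, 0, 0, 0, 0.3385, 0]
    print(normalize_emo_vec([1.0, 0, 0, 0, 0, 0, 1.0, 0]))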