From ebbd7bd91ff5f8f5d07d9176493e0d2bee33aa22 Mon Sep 17 00:00:00 2001
From: Danh Tran
Date: Tue, 24 Jun 2025 22:23:00 +0700
Subject: [PATCH] =?UTF-8?q?Update=20WAV=20File=20Naming=20and=20Dependenci?=
 =?UTF-8?q?es=20=F0=9F=93=9D=F0=9F=94=8A=20(#1091)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Update infer_cli.py

* Update pyproject.toml

* formalized

---------

Co-authored-by: SWivid
---
 pyproject.toml                |  1 +
 src/f5_tts/infer/infer_cli.py | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 69e4c9c..bbd633e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
     "tqdm>=4.65.0",
     "transformers",
     "transformers_stream_generator",
+    "unidecode",
     "vocos",
     "wandb",
     "x_transformers>=1.31.14",
diff --git a/src/f5_tts/infer/infer_cli.py b/src/f5_tts/infer/infer_cli.py
index 95800fd..5bd20ce 100644
--- a/src/f5_tts/infer/infer_cli.py
+++ b/src/f5_tts/infer/infer_cli.py
@@ -12,6 +12,7 @@ import tomli
 from cached_path import cached_path
 from hydra.utils import get_class
 from omegaconf import OmegaConf
+from unidecode import unidecode
 
 from f5_tts.infer.utils_infer import (
     cfg_strength,
@@ -112,6 +113,11 @@ parser.add_argument(
     action="store_true",
     help="To save each audio chunks during inference",
 )
+parser.add_argument(
+    "--no_legacy_text",
+    action="store_false",
+    help="Not to use lossy ASCII transliterations of unicode text in saved file names.",
+)
 parser.add_argument(
     "--remove_silence",
     action="store_true",
@@ -197,6 +203,14 @@ output_file = args.output_file or config.get(
 )
 
 save_chunk = args.save_chunk or config.get("save_chunk", False)
+# args.no_legacy_text comes from a store_false flag: it is True by default and False once
+# --no_legacy_text is passed, so legacy naming stays on only if neither the CLI nor the config opts out.
+use_legacy_text = args.no_legacy_text and not config.get("no_legacy_text", False)
+if save_chunk and use_legacy_text:
+    print(
+        "\nWarning to --save_chunk: lossy ASCII transliterations of unicode text for legacy (.wav) file names, --no_legacy_text to disable.\n"
+    )
+
 remove_silence = args.remove_silence or config.get("remove_silence", False)
 load_vocoder_from_local = args.load_vocoder_from_local or config.get("load_vocoder_from_local", False)
 
@@ -344,6 +358,8 @@ def main():
         if save_chunk:
             if len(gen_text_) > 200:
                 gen_text_ = gen_text_[:200] + " ... "
+            if use_legacy_text:
+                gen_text_ = unidecode(gen_text_)
             sf.write(
                 os.path.join(output_chunk_dir, f"{len(generated_audio_segments) - 1}_{gen_text_}.wav"),
                 audio_segment,