Spaces:
Runtime error
Runtime error
Commit
·
a692a02
1
Parent(s):
975e651
update
Browse files- cosyvoice/cli/cosyvoice.py +31 -2
cosyvoice/cli/cosyvoice.py
CHANGED
|
@@ -26,11 +26,13 @@ class CosyVoice:
|
|
| 26 |
@spaces.GPU
|
| 27 |
def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
|
| 28 |
instruct = True if '-Instruct' in model_dir else False
|
|
|
|
| 29 |
self.model_dir = model_dir
|
| 30 |
if not os.path.exists(model_dir):
|
| 31 |
model_dir = snapshot_download(model_dir)
|
| 32 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
| 33 |
configs = load_hyperpyyaml(f)
|
|
|
|
| 34 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
| 35 |
configs['feat_extractor'],
|
| 36 |
'{}/campplus.onnx'.format(model_dir),
|
|
@@ -53,15 +55,25 @@ class CosyVoice:
|
|
| 53 |
'{}/flow.encoder.fp32.zip'.format(model_dir))
|
| 54 |
if load_onnx:
|
| 55 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
| 56 |
-
del configs
|
| 57 |
|
| 58 |
@spaces.GPU
|
| 59 |
def list_avaliable_spks(self):
|
| 60 |
spks = list(self.frontend.spk2info.keys())
|
| 61 |
return spks
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
@spaces.GPU
|
| 64 |
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
|
|
|
|
| 65 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
| 66 |
model_input = self.frontend.frontend_sft(i, spk_id)
|
| 67 |
start_time = time.time()
|
|
@@ -74,6 +86,7 @@ class CosyVoice:
|
|
| 74 |
|
| 75 |
@spaces.GPU
|
| 76 |
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
|
| 77 |
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
| 78 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
| 79 |
if len(i) < 0.5 * len(prompt_text):
|
|
@@ -89,6 +102,7 @@ class CosyVoice:
|
|
| 89 |
|
| 90 |
@spaces.GPU
|
| 91 |
def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
|
| 92 |
if self.frontend.instruct is True:
|
| 93 |
raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
|
| 94 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
|
@@ -103,6 +117,7 @@ class CosyVoice:
|
|
| 103 |
|
| 104 |
@spaces.GPU
|
| 105 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
|
|
|
| 106 |
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
| 107 |
if self.frontend.instruct is False:
|
| 108 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
|
@@ -119,6 +134,7 @@ class CosyVoice:
|
|
| 119 |
|
| 120 |
@spaces.GPU
|
| 121 |
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
|
| 122 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
| 123 |
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
|
| 124 |
start_time = time.time()
|
|
@@ -131,6 +147,7 @@ class CosyVoice:
|
|
| 131 |
|
| 132 |
@spaces.GPU
|
| 133 |
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
|
|
|
|
| 134 |
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
|
| 135 |
start_time = time.time()
|
| 136 |
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
|
|
@@ -143,11 +160,13 @@ class CosyVoice2(CosyVoice):
|
|
| 143 |
@spaces.GPU
|
| 144 |
def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
|
| 145 |
instruct = True if '-Instruct' in model_dir else False
|
|
|
|
| 146 |
self.model_dir = model_dir
|
| 147 |
if not os.path.exists(model_dir):
|
| 148 |
model_dir = snapshot_download(model_dir)
|
| 149 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
| 150 |
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
|
|
|
|
| 151 |
# print(f"Loading configs:{configs}")
|
| 152 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
| 153 |
configs['feat_extractor'],
|
|
@@ -177,4 +196,14 @@ class CosyVoice2(CosyVoice):
|
|
| 177 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
| 178 |
if load_trt:
|
| 179 |
self.model.load_trt('{}/flow.decoder.estimator.fp16.l20.plan'.format(model_dir))
|
| 180 |
-
del configs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
@spaces.GPU
|
| 27 |
def __init__(self, model_dir, load_jit=True, load_onnx=False, fp16=True):
|
| 28 |
instruct = True if '-Instruct' in model_dir else False
|
| 29 |
+
self.instruct = instruct
|
| 30 |
self.model_dir = model_dir
|
| 31 |
if not os.path.exists(model_dir):
|
| 32 |
model_dir = snapshot_download(model_dir)
|
| 33 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
| 34 |
configs = load_hyperpyyaml(f)
|
| 35 |
+
self.configs = configs
|
| 36 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
| 37 |
configs['feat_extractor'],
|
| 38 |
'{}/campplus.onnx'.format(model_dir),
|
|
|
|
| 55 |
'{}/flow.encoder.fp32.zip'.format(model_dir))
|
| 56 |
if load_onnx:
|
| 57 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
|
|
|
| 58 |
|
| 59 |
@spaces.GPU
|
| 60 |
def list_avaliable_spks(self):
|
| 61 |
spks = list(self.frontend.spk2info.keys())
|
| 62 |
return spks
|
| 63 |
|
| 64 |
+
@spaces.GPU
|
| 65 |
+
def reload_frontend(self):
|
| 66 |
+
self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
|
| 67 |
+
self.configs['feat_extractor'],
|
| 68 |
+
'{}/campplus.onnx'.format(self.model_dir),
|
| 69 |
+
'{}/speech_tokenizer_v1.onnx'.format(self.model_dir),
|
| 70 |
+
'{}/spk2info.pt'.format(self.model_dir),
|
| 71 |
+
self.instruct,
|
| 72 |
+
self.configs['allowed_special'])
|
| 73 |
+
|
| 74 |
@spaces.GPU
|
| 75 |
def inference_sft(self, tts_text, spk_id, stream=False, speed=1.0):
|
| 76 |
+
self.reload_frontend()
|
| 77 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
| 78 |
model_input = self.frontend.frontend_sft(i, spk_id)
|
| 79 |
start_time = time.time()
|
|
|
|
| 86 |
|
| 87 |
@spaces.GPU
|
| 88 |
def inference_zero_shot(self, tts_text, prompt_text, prompt_speech_16k, stream=False, speed=1.0):
|
| 89 |
+
self.reload_frontend()
|
| 90 |
prompt_text = self.frontend.text_normalize(prompt_text, split=False)
|
| 91 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
| 92 |
if len(i) < 0.5 * len(prompt_text):
|
|
|
|
| 102 |
|
| 103 |
@spaces.GPU
|
| 104 |
def inference_cross_lingual(self, tts_text, prompt_speech_16k, stream=False, speed=1.0):
|
| 105 |
+
self.reload_frontend()
|
| 106 |
if self.frontend.instruct is True:
|
| 107 |
raise ValueError('{} do not support cross_lingual inference'.format(self.model_dir))
|
| 108 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
|
|
|
| 117 |
|
| 118 |
@spaces.GPU
|
| 119 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
| 120 |
+
self.reload_frontend()
|
| 121 |
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
| 122 |
if self.frontend.instruct is False:
|
| 123 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
|
|
|
| 134 |
|
| 135 |
@spaces.GPU
|
| 136 |
def inference_instruct2(self, tts_text, instruct_text, prompt_speech_16k, stream=False, speed=1.0):
|
| 137 |
+
self.reload_frontend()
|
| 138 |
for i in tqdm(self.frontend.text_normalize(tts_text, split=True)):
|
| 139 |
model_input = self.frontend.frontend_instruct2(i, instruct_text, prompt_speech_16k, self.sample_rate)
|
| 140 |
start_time = time.time()
|
|
|
|
| 147 |
|
| 148 |
@spaces.GPU
|
| 149 |
def inference_vc(self, source_speech_16k, prompt_speech_16k, stream=False, speed=1.0):
|
| 150 |
+
self.reload_frontend()
|
| 151 |
model_input = self.frontend.frontend_vc(source_speech_16k, prompt_speech_16k, self.sample_rate)
|
| 152 |
start_time = time.time()
|
| 153 |
for model_output in self.model.vc(**model_input, stream=stream, speed=speed):
|
|
|
|
| 160 |
@spaces.GPU
|
| 161 |
def __init__(self, model_dir, load_jit=False, load_onnx=False, load_trt=False):
|
| 162 |
instruct = True if '-Instruct' in model_dir else False
|
| 163 |
+
self.instruct = instruct
|
| 164 |
self.model_dir = model_dir
|
| 165 |
if not os.path.exists(model_dir):
|
| 166 |
model_dir = snapshot_download(model_dir)
|
| 167 |
with open('{}/cosyvoice.yaml'.format(model_dir), 'r') as f:
|
| 168 |
configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': os.path.join(model_dir, 'CosyVoice-BlankEN')})
|
| 169 |
+
self.configs = configs
|
| 170 |
# print(f"Loading configs:{configs}")
|
| 171 |
self.frontend = CosyVoiceFrontEnd(configs['get_tokenizer'],
|
| 172 |
configs['feat_extractor'],
|
|
|
|
| 196 |
self.model.load_onnx('{}/flow.decoder.estimator.fp32.onnx'.format(model_dir))
|
| 197 |
if load_trt:
|
| 198 |
self.model.load_trt('{}/flow.decoder.estimator.fp16.l20.plan'.format(model_dir))
|
| 199 |
+
del configs
|
| 200 |
+
|
| 201 |
+
@spaces.GPU
|
| 202 |
+
def reload_frontend(self):
|
| 203 |
+
self.frontend = CosyVoiceFrontEnd(self.configs['get_tokenizer'],
|
| 204 |
+
self.configs['feat_extractor'],
|
| 205 |
+
'{}/campplus.onnx'.format(self.model_dir),
|
| 206 |
+
'{}/speech_tokenizer_v2.onnx'.format(self.model_dir),
|
| 207 |
+
'{}/spk2info.pt'.format(self.model_dir),
|
| 208 |
+
self.instruct,
|
| 209 |
+
self.configs['allowed_special'])
|