Use multi-speaker model
- app.py +24 -13
- language_ids.json +4 -0
- speakers.pth +3 -0
app.py CHANGED
@@ -6,7 +6,7 @@ import torch
 
 CUDA = torch.cuda.is_available()
 
-REPO_ID = "ayymen/Coqui-TTS-Vits-
+REPO_ID = "ayymen/Coqui-TTS-Vits-Multispeaker"
 
 VOICE_CONVERSION_MODELS = {
     'freevc24': 'voice_conversion_models/multilingual/vctk/freevc24',
@@ -14,26 +14,36 @@ VOICE_CONVERSION_MODELS = {
     'openvoice_v2': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
 }
 
+VARIANTS = {"Tachelhit": "shi", "Tarifit": "rif"}
+
+SPEAKERS = ["yan", "sin", "idj"]
+
 my_title = "ⴰⴹⵕⵉⵚ ⵙ ⵉⵎⵙⵍⵉ - Tamazight Text-to-Speech"
 my_description = "This model is based on [VITS](https://github.com/jaywalnut310/vits), thanks to 🐸 [Coqui.ai](https://coqui.ai/)."
 
 my_examples = [
-
-
-
-
+    ["ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?", "shi", "yan", True],
+    ["ⵡⴰ ⵜⴰⵎⵖⴰⵔⵜ ⵎⴰ ⴷ ⵓⴽⴰⵏ ⵜⵙⴽⵔⵜ?", "shi", "sin", False],
+    ["ⴳⵏ ⴰⴷ ⴰⴽ ⵉⵙⵙⴳⵏ ⵕⴱⴱⵉ ⵉⵜⵜⵓ ⴽ.", "shi", "yan", False],
+    ["ⴰⵔⵔⴰⵡ ⵏ ⵍⵀⵎⵎ ⵢⵓⴽⵔ ⴰⵖ ⵉⵀⴷⵓⵎⵏ ⵏⵏⵖ!", "shi", "yan", False],
+    ["ⴰⵣⵓⵍ. ⵎⴰⵎⵛ ⵜⴷⵊⵉⵜ?", "rif", "idj", True],
+    ["ⴰⵇⵎⵎⵓⵎ ⵉⵇⵏⴻⵏ ⵓⵔ ⵜ ⵜⵜⵉⴷⴼⵏ ⵉⵣⴰⵏ.", "rif", "idj", False],
+    ["ⵇⵇⵉⵎ ⵅ ⵜⴰⴷⴷⴰⵔⵜ ⵏⵏⵛ!", "rif", "idj", False],
+    ["ⵜⴻⵜⵜⵏ ⴰⴳ ⵡⵓⵛⵛⵏ, ⵜⵜⵔⵓⵏ ⵅ ⵓⵎⴽⵙⴰ.", "rif", "idj", False]
 ]
 
 my_inputs = [
-
-
-
-
+    gr.Textbox(lines=5, label="Input Text", placeholder="The only available characters are: ⴰⴱⴳⴷⴹⴻⴼⴽⵀⵃⵄⵅⵇⵉⵊⵍⵎⵏⵓⵔⵕⵖⵙⵚⵛⵜⵟⵡⵢⵣⵥⵯ !,.:?"),
+    gr.Dropdown(label="Variant", choices=list(VARIANTS.items()), value="shi"),
+    gr.Dropdown(label="Speaker", choices=SPEAKERS, value="yan"),
+    gr.Checkbox(label="Split Sentences (each sentence will be generated separately)", value=False),
+    gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
+    gr.Dropdown(label="Voice Conversion Model", choices=list(VOICE_CONVERSION_MODELS.keys())),
 ]
 
 my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)
 
-best_model_path = hf_hub_download(repo_id=REPO_ID, filename="
+best_model_path = hf_hub_download(repo_id=REPO_ID, filename="checkpoint_390000.pth")
 config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
 
 api = TTS(model_path=best_model_path, config_path=config_path).to("cuda" if CUDA else "cpu")
@@ -42,21 +52,22 @@ api = TTS(model_path=best_model_path, config_path=config_path).to("cuda" if CUDA
 for model in VOICE_CONVERSION_MODELS.values():
     api.load_vc_model_by_name(model, gpu=CUDA)
 
-def tts(text: str,
+def tts(text: str, variant: str = "shi", speaker: str = "yan", split_sentences: bool = False, speaker_wav: str = None, voice_cv_model: str = 'freevc24'):
     # replace oov characters
     text = text.replace("\n", ". ")
     text = text.replace("(", ",")
     text = text.replace(")", ",")
     text = text.replace('"', ",")
+    text = text.replace("'", ",")
     text = text.replace(";", ",")
     text = text.replace("-", " ")
 
     with tempfile.NamedTemporaryFile(suffix = ".wav", delete = False) as fp:
         if speaker_wav:
             api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[voice_cv_model], gpu=CUDA)
-            api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=fp.name, split_sentences=split_sentences)
+            api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=fp.name, split_sentences=split_sentences, speaker=speaker, language=variant)
         else:
-            api.tts_to_file(text, file_path=fp.name, split_sentences=split_sentences)
+            api.tts_to_file(text, file_path=fp.name, split_sentences=split_sentences, speaker=speaker, language=variant)
 
     return fp.name
 
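For context, a minimal sketch of how the updated tts() signature could be exercised directly, outside the Gradio UI. The example text, variant codes, and speaker names are taken from the diff above; the reference clip path and the idea of calling the function as a smoke test are purely illustrative, not part of the commit:

# Assumes the definitions from app.py above are already loaded.
# "shi"/"rif" and "yan"/"idj" come from VARIANTS and SPEAKERS in the diff;
# "my_clip.wav" is a made-up reference recording for voice conversion.

# Tachelhit variant, built-in speaker "yan", no voice conversion:
wav_path = tts("ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?", variant="shi", speaker="yan", split_sentences=True)

# Tarifit variant with voice cloning from a reference clip:
cloned_path = tts("ⴰⵣⵓⵍ. ⵎⴰⵎⵛ ⵜⴷⵊⵉⵜ?", variant="rif", speaker="idj",
                  speaker_wav="my_clip.wav", voice_cv_model="openvoice_v2")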
language_ids.json ADDED
@@ -0,0 +1,4 @@
+{
+    "rif": 0,
+    "shi": 1
+}
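The two codes match the VARIANTS values wired into the Gradio dropdown, and the integers are presumably the language-embedding indices the multilingual VITS checkpoint was trained with. A quick sanity check, assuming the file is read from the working directory (illustrative only):

import json

# language_ids.json maps variant codes ("rif" = Tarifit, "shi" = Tachelhit)
# to language-embedding ids used by the multi-speaker checkpoint.
with open("language_ids.json") as f:
    language_ids = json.load(f)

assert set(language_ids) == {"rif", "shi"}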
speakers.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a9df430489a8bf3eac98f38325dbdbd8d986fa731787724406062bacac5a471
+size 864
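Because speakers.pth is tracked with Git LFS, the diff only shows the pointer file; the 864-byte payload itself is presumably Coqui's speaker-id mapping. One hedged way to inspect it locally (the deserialized structure is an assumption, not something the commit shows):

import torch

# speakers.pth is assumed to map the speaker names from app.py
# ("yan", "sin", "idj") to speaker-embedding ids of the checkpoint.
speakers = torch.load("speakers.pth")
print(speakers)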