-
Notifications
You must be signed in to change notification settings - Fork 2
/
shn_tts_combined.py
49 lines (36 loc) · 1.62 KB
/
shn_tts_combined.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import sys

import numpy as np
from ftlangdetect import detect
from pydub import AudioSegment
from ttsmms import TTS
# text = "ၸၢမ်းတူၺ်း this is english word လိၵ်ႈတႆး"
# text2 = "english letter containe english word english word"
# text3 = "ၵူၼ်းဢမ်ႇမီးႁိူၼ်းတၢႆထပ်းၵၼ်ဝၼ်းလဵဝ်ဢမ်ႇယွမ်း သၢမ်ၵေႃႉတီႈဝဵင်းလႃႈသဵဝ်ႈ"
# text4 = "ငဝ်ႈငုၼ်းၶွင်ႇသီႇဢဝ်ၶိုၼ်းၸိုင်ႈတႆး RCSS-SSA လႄႈ ပႃႇတီႇမႂ်ႇသုင်ၸိုင်ႈတႆး တပ်ႉသိုၵ်းၸိုင်ႈတႆး SSPP/SSA"
# Locations of the two TTS model directories and of the generated file.
SHN_MODEL_DIR = "./model/shn"
ENG_MODEL_DIR = "./model/eng"
OUTPUT_PATH = "output/generate.wav"


def float_to_pcm16(samples):
    """Convert a float waveform to 16-bit signed PCM samples.

    Samples are scaled by 32767 and clipped to the int16 range before the
    cast, so values outside [-1.0, 1.0] saturate instead of wrapping around.

    Args:
        samples: Array-like of float audio samples.

    Returns:
        A numpy ``int16`` array with the same shape as *samples*.
    """
    limits = np.iinfo(np.int16)
    scaled = np.asarray(samples) * limits.max
    return np.clip(scaled, limits.min, limits.max).astype(np.int16)


def synthesize_chunk(chunk, shn_model, eng_model):
    """Synthesize one whitespace-delimited chunk of text.

    The chunk is spoken with *eng_model* when the language detector reports
    ``"en"`` and with *shn_model* otherwise.

    Args:
        chunk: Text to synthesize.
        shn_model: TTS model used for non-English text.
        eng_model: TTS model used for English text.

    Returns:
        A mono, 16-bit ``AudioSegment`` at the model's sampling rate.
    """
    tts = eng_model if detect(chunk)["lang"] == "en" else shn_model
    tts_output = tts.synthesis(chunk)
    pcm = float_to_pcm16(tts_output["x"])
    return AudioSegment(
        pcm.tobytes(),
        frame_rate=tts_output["sampling_rate"],
        sample_width=2,  # bytes per sample, matching the int16 PCM data
        channels=1,
    )


def main():
    """Synthesize the command-line text and write it to ``OUTPUT_PATH``.

    Exits with status 1 when the argument count is wrong or the text
    contains no non-whitespace characters.
    """
    if len(sys.argv) != 2:
        print("Usage: python shn_tts_combined.py 'some Shan sentences'")
        sys.exit(1)

    chunks = sys.argv[1].split()
    if not chunks:
        # Without this guard sum([]) would be 0 and .export would fail.
        print("Error: the input text is empty.", file=sys.stderr)
        sys.exit(1)

    # Load the models only after the input has been validated.
    shn_model = TTS(SHN_MODEL_DIR)
    eng_model = TTS(ENG_MODEL_DIR)

    segments = [
        synthesize_chunk(chunk, shn_model, eng_model) for chunk in chunks
    ]

    # Concatenate the per-chunk audio in order; sum() starts from 0, which
    # AudioSegment accepts on the left-hand side of "+".
    combined_audio = sum(segments)

    # Make sure the target directory exists before exporting.
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    combined_audio.export(OUTPUT_PATH, format="wav")


if __name__ == "__main__":
    main()