AppOnly.py
#!/usr/bin/env python
# coding: utf-8
# In[1]:
class_names = ['cothe', 'khong', 'nguoi', 'toi', 'va']
import librosa
import numpy as np
import os
import math
from sklearn.cluster import KMeans
import hmmlearn.hmm
def get_mfcc(file_path):
    y, sr = librosa.load(file_path)  # read .wav file (resampled to 22050 Hz by default)
    hop_length = math.floor(sr*0.010)  # 10ms hop
    win_length = math.floor(sr*0.025)  # 25ms frame
    # mfcc is 12 x T matrix
    mfcc = librosa.feature.mfcc(
        y=y, sr=sr, n_mfcc=12, n_fft=1024,
        hop_length=hop_length, win_length=win_length)
    # subtract mean from mfcc --> normalize mfcc
    mfcc = mfcc - np.mean(mfcc, axis=1).reshape((-1, 1))
    # delta feature 1st order and 2nd order
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    # X is 36 x T
    X = np.concatenate([mfcc, delta1, delta2], axis=0)  # O^r
    # return T x 36 (transpose of X)
    return X.T  # hmmlearn uses a T x N matrix
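# A quick sanity check of the feature extractor (the file path below is
# hypothetical, shown only for illustration): the result has one row per
# 10 ms frame and 36 columns (12 MFCC + 12 delta + 12 delta-delta).
#   feats = get_mfcc("data/toi/001.wav")
#   print(feats.shape)  # -> (T, 36)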
# In[2]:
# load models
import pickle
model = {}
for key in class_names:
    name = os.path.join("models", f"model_{key}.pkl")
    with open(name, 'rb') as file:
        model[key] = pickle.load(file)
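# The pickled files are assumed to hold hmmlearn HMMs trained offline on the
# same 36-dimensional features; anything exposing .score() on a T x 36 matrix
# would work. A minimal, hypothetical training sketch for one word `key`
# (word_wav_paths is an assumed list of training .wav files):
#   feats = [get_mfcc(p) for p in word_wav_paths]
#   hmm = hmmlearn.hmm.GMMHMM(n_components=6, n_mix=2, n_iter=200)
#   hmm.fit(np.concatenate(feats), lengths=[len(f) for f in feats])
#   with open(os.path.join("models", f"model_{key}.pkl"), "wb") as f:
#       pickle.dump(hmm, f)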
# In[3]:
from tkinter import messagebox
import winsound
from pydub import AudioSegment
import ffmpeg
# Adjust the threshold to the background noise: the noisier the environment, the higher the threshold
def detect_leading_silence(sound, silence_threshold=-42.0, chunk_size=10):
    '''
    sound is a pydub.AudioSegment
    silence_threshold in dB
    chunk_size in ms
    iterate over chunks until you find the first one with sound
    '''
    trim_ms = 0  # ms
    assert chunk_size > 0  # to avoid infinite loop
    while trim_ms < len(sound) and sound[trim_ms:trim_ms+chunk_size].dBFS < silence_threshold:
        trim_ms += chunk_size
    return trim_ms
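# For example, with the defaults above a clip whose first 300 ms stay below
# -42 dBFS returns trim_ms == 300; predict() below runs this on the clip and
# on its reverse to trim silence from both ends.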
def record():
    import pyaudio
    import wave
    from base64 import b64decode
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 22050  # same rate librosa.load resamples to, so features stay consistent
    RECORD_SECONDS = 2
    WAVE_OUTPUT_FILENAME = "record.wav"
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    # capture RECORD_SECONDS of audio, one CHUNK of frames at a time
    frames = []
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
    # write the raw frames out as a 16-bit mono WAV file
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()
def play():
    filename = 'record.wav'
    winsound.PlaySound(filename, winsound.SND_FILENAME)

def playtrimmed():
    filename = 'trimmed.wav'
    winsound.PlaySound(filename, winsound.SND_FILENAME)
def predict():
    # Trim silence from both ends of the recording
    sound = AudioSegment.from_file("record.wav", format="wav")
    start_trim = detect_leading_silence(sound)
    end_trim = detect_leading_silence(sound.reverse())
    duration = len(sound)
    trimmed_sound = sound[start_trim:duration-end_trim]
    trimmed_sound.export("trimmed.wav", format="wav")
    # Predict: score the trimmed clip against every word model
    record_mfcc = get_mfcc("trimmed.wav")
    scores = [model[cname].score(record_mfcc) for cname in class_names]
    pred = np.argmax(scores)
    messagebox.showinfo("result", class_names[pred])
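# Classification is a max-likelihood decision: .score() returns the
# log-likelihood of the feature matrix under each word HMM and argmax picks
# the best-explaining word. A hypothetical scores list such as
#   [-4100.2, -3987.5, -4251.0, -3900.1, -4012.8]
# would predict class_names[3] == 'toi'.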
# In[4]:
import tkinter as tk
window = tk.Tk()
window.geometry("300x200")
window.title("Speech recognition")
frame0 = tk.Frame(master=window)
frame0.pack()
frame1 = tk.Frame(master=window)
frame1.pack()
frame2 = tk.Frame(master=window)
frame2.pack()
label = tk.Label(master=frame0, text="Speech recognition")
label.pack(padx=5, pady=10)
btn_record = tk.Button(master=frame1, width=13, height=2, text="record", command=record)
btn_record.pack(side=tk.LEFT, padx=5, pady=5)
btn_playback = tk.Button(master=frame1, width=13, height=2, text="playback", command=play)
btn_playback.pack(side=tk.LEFT, padx=5, pady=5)
btn_predict = tk.Button(master=frame2, width=13, height=2, text="trim & predict", command=predict)
btn_predict.pack(side=tk.LEFT, padx=5, pady=5)
btn_play_trimmed = tk.Button(master=frame2, width=13, height=2, text="playback trimmed", command=playtrimmed)
btn_play_trimmed.pack(side=tk.LEFT, padx=5, pady=5)
window.mainloop()