-
Notifications
You must be signed in to change notification settings - Fork 0
/
NuanceSpeech.py
149 lines (118 loc) · 4.74 KB
/
NuanceSpeech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#==================================================================================
# Nuance Speech Recognition for the RocConf Project
# - Jeffery A. White - September 2016
#
# Based upon the SpeechRecognition Library
# SpeechRecognition - Zhang, A. (2015).
# Speech Recognition (Version 3.1) [Software].
# Available from https://github.com/Uberi/speech_recognition#readme.
#
# Modified here to take .flac as input since we couldn't get the
# flac encoder in that library to work. Uses FFMPEG converted .flac files.
#
# Usage - NuanceSpeech.py {Filename}
#==================================================================================
import pymongo
from pymongo import MongoClient
import pprint
import sys
import os, base64, requests, wave
import json
from sys import stdout
from time import sleep
from urllib import urlencode
from urllib2 import Request, urlopen, URLError, HTTPError
class RequestError(Exception):
    """Exception type declared by this module.

    It adds nothing to ``Exception``; no code visible in this file raises it.
    """
# Pretty-printer shared by the functions below for debug output.
pp = pprint.PrettyPrinter(indent=2)

# Identifiers stored alongside each transcript; both start out empty.
session_key = ""
user_id = ""

# MongoDB handles. MongoClient() is called without arguments, so whatever
# connection defaults pymongo applies are used.
client = MongoClient()
database = client["rocconf"]
#==================================================================================
# Access keys for using Nuance.
#==================================================================================
# Service location and the requestor id sent with every call.
NUANCE_URL = "https://dictation.nuancemobility.net"
NUANCE_ENDPOINT = "/NMDPAsrCmdServlet/dictation"
NUANCE_REQUESTORID = "RocConf"
# NOTE(security): these credentials are committed to source. They can be
# overridden through the NUANCE_APPID / NUANCE_APPKEY environment variables;
# the literals are kept only as backward-compatible defaults and should be
# rotated and removed from the repository.
NUANCE_APPID = os.environ.get(
    "NUANCE_APPID",
    "NMDPTRIAL_jwhite637_gmail_com20161009231933",
)
NUANCE_APPKEY = os.environ.get(
    "NUANCE_APPKEY",
    "811afaaf29e1ce63dd9c591634233a2a3ca55cdbb1e6a1659d00b180cb93394dad33fcd615ca39f26792089bb643c5f2e821ce9a086fffa56fb0253a435e87f0",
)
#==================================================================================
# Send the .wav data to Nuance
#==================================================================================
def nuance_call(filename):
    """POST the audio frames of a .wav file to the Nuance dictation endpoint.

    ``filename`` is resolved relative to the directory containing this
    script. The frames are read through the ``wave`` module, so only the
    sample data (not the RIFF header) is uploaded. The ``requests`` response
    object is pretty-printed; nothing is returned.

    Raises whatever ``wave.open`` raises for a missing or malformed file and
    whatever ``requests.post`` raises on transport errors.
    """
    basepath = os.path.dirname(__file__)
    filepath = os.path.abspath(os.path.join(basepath, filename))

    wav_file = wave.open(filepath, 'rb')
    try:
        num_frames = wav_file.getnframes()
        sample_rate = wav_file.getframerate()
        bits_per_sample = wav_file.getsampwidth() * 8
        print("Sample Rate: " + str(sample_rate))
        print("Reading in " + str(num_frames) + " for the file")
        # One bulk read instead of a per-frame "+=" loop: same bytes, linear
        # time, and no str/bytes mixing.
        wav_data = wav_file.readframes(num_frames)
    finally:
        wav_file.close()

    # len() is the payload size in bytes; sys.getsizeof() would also count
    # the Python object overhead and produce a wrong Content-Length.
    total_size = len(wav_data)
    print("Total Size: " + str(total_size))

    hdrs = {
        # Describe the audio as it actually is rather than assuming
        # 16-bit / 8000 Hz.
        u"Content-Type": u"audio/x-wav;codec=pcm;bit=%d;rate=%d" % (bits_per_sample, sample_rate),
        u"Accept-Language": u"en_US",
        # Header values are sent as text, not ints.
        u"Content-Length": str(total_size),
        u"Accept": u"text/plain",
        u"Accept-Topic": u"Dictation"
    }
    url = "%s%s?appId=%s&appKey=%s&id=%s" % (NUANCE_URL, NUANCE_ENDPOINT, NUANCE_APPID, NUANCE_APPKEY, NUANCE_REQUESTORID)
    # NOTE(review): this echoes the app key to stdout.
    print(url)
    res = requests.post(url, data=wav_data, headers=hdrs)
    pp.pprint(res)
    # Disabled follow-up steps, kept from the original for reference:
    #result = json.loads(response_text)
    # Dump to MongoDB for later analysis!
    #final_dict = {}
    #final_dict['session_key'] = session_key
    #final_dict['user'] = user_id
    #final_dict['data'] = result
    #raw_data_collection.insert_one(final_dict).inserted_id
    #return result
def read_wav_file_in_chunks(filepath):
    """Yield the audio data of a .wav file in chunks of up to 2048 frames.

    A progress line is rewritten on stdout before each chunk is yielded, and
    the generator sleeps 0.05 s per chunk. The file is closed when the
    generator finishes or is closed early.

    Yields:
        Byte strings as returned by ``Wave_read.readframes(2048)``; the last
        chunk may be shorter.
    """
    wav_file = wave.open(filepath, 'rb')
    try:
        # Size of the audio payload only, so the counter can reach 100%
        # (the on-disk size also includes the wave header).
        total_size = (wav_file.getnframes()
                      * wav_file.getsampwidth()
                      * wav_file.getnchannels())
        print(" Audio File %s" % filepath)
        bytes_sent = 0
        data = wav_file.readframes(2048)
        # Truthiness ends the loop for both '' and b''; comparing against ''
        # never matches bytes.
        while data:
            bytes_sent += len(data)
            stdout.write("\r Bytes Sent %d/%d \t%d%% " % (bytes_sent, total_size, 100 * bytes_sent // total_size))
            stdout.flush()
            sleep(0.05)
            yield data
            data = wav_file.readframes(2048)
        stdout.write("\n\n")
    finally:
        wav_file.close()
#================================================================================================
# Processing a text transcript from bluemix results data
#================================================================================================
class UnknownValueError(Exception):
    """Raised when a recognition result entry has no "alternatives" key."""


def process_transcript(raw_data):
    """Build a transcript from recognition results and store it in MongoDB.

    ``raw_data`` is indexed as ``raw_data["results"]``; each entry must have
    an "alternatives" list whose items may carry a "transcript" string. All
    transcripts found are joined with ". " and inserted together with the
    module-level ``session_key`` and ``user_id``.

    Raises:
        UnknownValueError: if an entry in "results" lacks "alternatives".
        KeyError: if ``raw_data`` has no "results" key.
    """
    pp.pprint(raw_data)
    transcription = []
    for utterance in raw_data["results"]:
        if "alternatives" not in utterance:
            raise UnknownValueError()
        for hypothesis in utterance["alternatives"]:
            if "transcript" in hypothesis:
                transcription.append(hypothesis["transcript"])
    transcript = ". ".join(transcription)
    # Dump transcript into MongoDB for display and later analysis!
    final_dict = {
        'session_key': session_key,
        'user': user_id,
        'transcript': transcript,
    }
    # NOTE(review): transcript_collection is not defined anywhere in this
    # file; it must be bound (presumably to a collection of `database`)
    # before this function can run -- confirm the intended collection name.
    transcript_collection.insert_one(final_dict)
#=======================================================
# Main Caller
#=======================================================
if __name__ == "__main__":
    # Expect exactly one thing from the command line: the .wav file to send.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("Usage - %s {Filename}" % os.path.basename(sys.argv[0]))
    # Session/user extraction from the file name is currently disabled:
    #final = sys.argv[1].split('_')
    #session_key = final[1]
    #user_id = final[2]
    raw_data = nuance_call(sys.argv[1])
    #process_transcript(raw_data)