Skip to content

Commit

Permalink
Fixing google stt plugin (#184)
Browse files Browse the repository at this point in the history
I have reviewed these changes and they work for me on my Raspberry Pi 3B+ running Raspbian Stretch.

The instructions at https://cloud.google.com/speech-to-text/docs/quickstart-protocol (as noted in the source code) worked for me. The documentation as it currently stands could be updated/improved but it's not strictly necessary, since the current link does eventually get the user to the instructions for creating an account, setting up billing, downloading the json authorization file, and setting the environment variable.

It's nice to have the google stt option available again. Thanks for the help!
  • Loading branch information
olest authored and aaronchantrill committed Apr 29, 2019
1 parent 03cc6a5 commit 3bd3fcf
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 95 deletions.
142 changes: 47 additions & 95 deletions plugins/stt/google-stt/google.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,22 @@
import json
import logging
import urllib
import wave
import requests
from naomi import plugin

from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from google.api_core.exceptions import GoogleAPICallError,RetryError


class GoogleSTTPlugin(plugin.STTPlugin):
"""
Speech-To-Text implementation which relies on the Google Speech API.
This implementation requires a Google API key to be present in profile.yml
To obtain an API key:
1. Join the Chromium Dev group:
https://groups.google.com/a/chromium.org/
forum/?fromgroups#!forum/chromium-dev
2. Create a project through the Google Developers console:
https://console.developers.google.com/project
3. Select your project. In the sidebar, navigate to "APIs & Auth." Activate
the Speech API.
4. Under "APIs & Auth," navigate to "Credentials." Create a new key for
public API access.
5. Add your credentials to your profile.yml. Add an entry to the 'keys'
section using the key name 'GOOGLE_SPEECH.' Sample configuration:
6. Set the value of the 'stt_engine' key in your profile.yml to 'google'
Speech-To-Text implementation using the Google Speech API.
Uses the google python client:
https://googleapis.github.io/google-cloud-python/latest/speech/index.html
You need to download an google cloud API key and set its location using the
environment variable GOOGLE_APPLICATION_CREDENTIALS. Details here:
Excerpt from sample profile.yml:
...
timezone: US/Pacific
stt_engine: google
keys:
GOOGLE_SPEECH: $YOUR_KEY_HERE
https://cloud.google.com/speech-to-text/docs/quickstart-protocol
"""

Expand All @@ -42,21 +25,18 @@ def __init__(self, *args, **kwargs):
# FIXME: get init args from config

self._logger = logging.getLogger(__name__)
self._request_url = None
self._language = None
self._api_key = None
self._http = requests.Session()
self._client = speech.SpeechClient()
self._config = None
try:
language = self.profile['language']
except KeyError:
language = 'en-US'

self.language = language.lower()
self.api_key = self.profile['keys']['GOOGLE_SPEECH']

@property
def request_url(self):
return self._request_url
self._regenerate_config()


@property
def language(self):
Expand All @@ -65,84 +45,56 @@ def language(self):
@language.setter
def language(self, value):
self._language = value
self._regenerate_request_url()
self._regenerate_config()

def _regenerate_config(self):
keyword = None
try:
keyword = self.profile["keyword"]
except:
pass

self._config = types.RecognitionConfig(
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
language_code=self.language,
speech_contexts=[speech.types.SpeechContext(phrases=[keyword])],
model="command_and_search"
)

@property
def api_key(self):
return self._api_key

@api_key.setter
def api_key(self, value):
self._api_key = value
self._regenerate_request_url()

def _regenerate_request_url(self):
if self.api_key and self.language:
query = urllib.urlencode({'output': 'json',
'client': 'chromium',
'key': self.api_key,
'lang': self.language,
'maxresults': 6,
'pfilter': 2})
self._request_url = urllib.parse.urlunparse(
('https', 'www.google.com', '/speech-api/v2/recognize', '',
query, ''))
else:
self._request_url = None

def transcribe(self, fp):
"""
Performs STT via the Google Speech API, transcribing an audio file and
returning an English string.
Arguments:
audio_file_path -- the path to the .wav file to be transcribed
fp -- the path to the .wav file to be transcribed
"""

if not self.api_key:
self._logger.critical('API key missing, transcription request ' +
'aborted.')
return []
elif not self.language:
if not self.language:
self._logger.critical('Language info missing, transcription ' +
'request aborted.')
return []

wav = wave.open(fp, 'rb')
frame_rate = wav.getframerate()
wav.close()
data = fp.read()

headers = {'content-type': 'audio/l16; rate=%s' % frame_rate}
r = self._http.post(self.request_url, data=data, headers=headers)
try:
r.raise_for_status()
except requests.exceptions.HTTPError:
self._logger.critical('Request failed with http status %d',
r.status_code)
if r.status_code == requests.codes['forbidden']:
self._logger.warning('Status 403 is probably caused by an ' +
'invalid Google API key.')
return []
r.encoding = 'utf-8'
content = fp.read()
audio = types.RecognitionAudio(content=content)
try:
# We cannot simply use r.json() because Google sends invalid json
# (i.e. multiple json objects, seperated by newlines. We only want
# the last one).
response = json.loads(list(r.text.strip().split('\n', 1))[-1])
if len(response['result']) == 0:
# Response result is empty
raise ValueError('Nothing has been transcribed.')
results = [alt['transcript'] for alt
in response['result'][0]['alternative']]
response = self._client.recognize(self._config, audio)
except GoogleAPICallError as e:
# request failed for any reason
self._logger.warning('Google STT retry error. response: %s', e.args[0])
results = []
except RetryError as e:
# failed due to a retryable error and retry attempts failed
self._logger.warning('Google STT retry error. response: %s', e.args[0])
results = []
except ValueError as e:
self._logger.warning('Empty response: %s', e.args[0])
results = []
except (KeyError, IndexError):
self._logger.warning('Cannot parse response.', exc_info=True)
results = []
else:
# Convert all results to uppercase
results = tuple(result.upper() for result in results)
results = [ str(result.alternatives[0].transcript).upper() for
result in response.results]
self._logger.info('Transcribed: %r', results)

return results

7 changes: 7 additions & 0 deletions python_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,10 @@ python-dateutil==2.7.3

# MPDControl module
python-mpd==0.3.0

# google stt
google-api-core==1.9.0
google-auth==1.6.3
google-cloud-speech==1.0.0
googleapis-common-protos==1.5.9

0 comments on commit 3bd3fcf

Please sign in to comment.