Fixing google stt plugin (#184)

I have reviewed these changes and they work for me on my Raspberry Pi 3B+ running Raspbian Stretch. The instructions at https://cloud.google.com/speech-to-text/docs/quickstart-protocol (as noted in the source code) worked for me. The documentation as it currently stands could be updated/improved but it's not strictly necessary, since the current link does eventually get the user to the instructions for creating an account, setting up billing, downloading the json authorization file, and setting the environment variable. It's nice to have the google stt option available again. Thanks for the help!
NaomiProject · Apr 29, 2019 · 3bd3fcf · 3bd3fcf
1 parent 03cc6a5
commit 3bd3fcf
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 95 deletions.
diff --git a/plugins/stt/google-stt/google.py b/plugins/stt/google-stt/google.py
@@ -1,39 +1,22 @@
-import json
 import logging
-import urllib
-import wave
-import requests
 from naomi import plugin
 
+from google.cloud import speech
+from google.cloud.speech import enums
+from google.cloud.speech import types
+from google.api_core.exceptions import GoogleAPICallError,RetryError
+
 
 class GoogleSTTPlugin(plugin.STTPlugin):
     """
-    Speech-To-Text implementation which relies on the Google Speech API.
-
-    This implementation requires a Google API key to be present in profile.yml
-
-    To obtain an API key:
-    1. Join the Chromium Dev group:
-       https://groups.google.com/a/chromium.org/
-            forum/?fromgroups#!forum/chromium-dev
-    2. Create a project through the Google Developers console:
-       https://console.developers.google.com/project
-    3. Select your project. In the sidebar, navigate to "APIs & Auth." Activate
-       the Speech API.
-    4. Under "APIs & Auth," navigate to "Credentials." Create a new key for
-       public API access.
-    5. Add your credentials to your profile.yml. Add an entry to the 'keys'
-       section using the key name 'GOOGLE_SPEECH.' Sample configuration:
-    6. Set the value of the 'stt_engine' key in your profile.yml to 'google'
+    Speech-To-Text implementation using the Google Speech API.
+    Uses the google python client:
+    https://googleapis.github.io/google-cloud-python/latest/speech/index.html
 
+    You need to download an google cloud API key and set its location using the 
+    environment variable GOOGLE_APPLICATION_CREDENTIALS. Details here:
 
-    Excerpt from sample profile.yml:
-
-        ...
-        timezone: US/Pacific
-        stt_engine: google
-        keys:
-            GOOGLE_SPEECH: $YOUR_KEY_HERE
+    https://cloud.google.com/speech-to-text/docs/quickstart-protocol
 
     """
 
@@ -42,21 +25,18 @@ def __init__(self, *args, **kwargs):
         # FIXME: get init args from config
 
         self._logger = logging.getLogger(__name__)
-        self._request_url = None
         self._language = None
-        self._api_key = None
-        self._http = requests.Session()
+        self._client = speech.SpeechClient()
+        self._config = None
         try:
             language = self.profile['language']
         except KeyError:
             language = 'en-US'
 
         self.language = language.lower()
-        self.api_key = self.profile['keys']['GOOGLE_SPEECH']
 
-    @property
-    def request_url(self):
-        return self._request_url
+        self._regenerate_config()
+
 
     @property
     def language(self):
@@ -65,84 +45,56 @@ def language(self):
     @language.setter
     def language(self, value):
         self._language = value
-        self._regenerate_request_url()
+        self._regenerate_config()
+
+    def _regenerate_config(self):
+        keyword = None
+        try:
+            keyword = self.profile["keyword"]
+        except:
+            pass
+
+        self._config = types.RecognitionConfig(
+                           encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+                           language_code=self.language,
+                           speech_contexts=[speech.types.SpeechContext(phrases=[keyword])],
+                           model="command_and_search"
+                       )
 
-    @property
-    def api_key(self):
-        return self._api_key
-
-    @api_key.setter
-    def api_key(self, value):
-        self._api_key = value
-        self._regenerate_request_url()
-
-    def _regenerate_request_url(self):
-        if self.api_key and self.language:
-            query = urllib.urlencode({'output': 'json',
-                                      'client': 'chromium',
-                                      'key': self.api_key,
-                                      'lang': self.language,
-                                      'maxresults': 6,
-                                      'pfilter': 2})
-            self._request_url = urllib.parse.urlunparse(
-                ('https', 'www.google.com', '/speech-api/v2/recognize', '',
-                 query, ''))
-        else:
-            self._request_url = None
 
     def transcribe(self, fp):
         """
         Performs STT via the Google Speech API, transcribing an audio file and
         returning an English string.
 
         Arguments:
-        audio_file_path -- the path to the .wav file to be transcribed
+        fp -- the path to the .wav file to be transcribed
         """
-
-        if not self.api_key:
-            self._logger.critical('API key missing, transcription request ' +
-                                  'aborted.')
-            return []
-        elif not self.language:
+        if not self.language:
             self._logger.critical('Language info missing, transcription ' +
                                   'request aborted.')
             return []
 
-        wav = wave.open(fp, 'rb')
-        frame_rate = wav.getframerate()
-        wav.close()
-        data = fp.read()
-
-        headers = {'content-type': 'audio/l16; rate=%s' % frame_rate}
-        r = self._http.post(self.request_url, data=data, headers=headers)
-        try:
-            r.raise_for_status()
-        except requests.exceptions.HTTPError:
-            self._logger.critical('Request failed with http status %d',
-                                  r.status_code)
-            if r.status_code == requests.codes['forbidden']:
-                self._logger.warning('Status 403 is probably caused by an ' +
-                                     'invalid Google API key.')
-            return []
-        r.encoding = 'utf-8'
+        content = fp.read()
+        audio = types.RecognitionAudio(content=content)
         try:
-            # We cannot simply use r.json() because Google sends invalid json
-            # (i.e. multiple json objects, seperated by newlines. We only want
-            # the last one).
-            response = json.loads(list(r.text.strip().split('\n', 1))[-1])
-            if len(response['result']) == 0:
-                # Response result is empty
-                raise ValueError('Nothing has been transcribed.')
-            results = [alt['transcript'] for alt
-                       in response['result'][0]['alternative']]
+            response = self._client.recognize(self._config, audio)
+        except GoogleAPICallError as e: 
+            # request failed for any reason
+            self._logger.warning('Google STT retry error. response: %s', e.args[0])
+            results = []
+        except RetryError as e:
+            # failed due to a retryable error and retry attempts failed
+            self._logger.warning('Google STT retry error. response: %s', e.args[0])
+            results = []
         except ValueError as e:
             self._logger.warning('Empty response: %s', e.args[0])
             results = []
-        except (KeyError, IndexError):
-            self._logger.warning('Cannot parse response.', exc_info=True)
-            results = []
         else:
             # Convert all results to uppercase
-            results = tuple(result.upper() for result in results)
+            results = [ str(result.alternatives[0].transcript).upper() for 
+                            result in response.results]
             self._logger.info('Transcribed: %r', results)
+
         return results
+
diff --git a/python_requirements.txt b/python_requirements.txt
@@ -30,3 +30,10 @@ python-dateutil==2.7.3
 
 # MPDControl module
 python-mpd==0.3.0
+
+# google stt
+google-api-core==1.9.0
+google-auth==1.6.3
+google-cloud-speech==1.0.0
+googleapis-common-protos==1.5.9
+