Fix DocExtract crashes caused by raise_for_status(): build the error-response preview from resp.content instead of resp.json()/resp.text

Fix the name of the DocExtract final output column ("snippet" / "sections")
GooeyAI · Feb 12, 2024 · 0a9c0db · 0a9c0db
1 parent 3609a12
commit 0a9c0db
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 38 deletions.
diff --git a/daras_ai_v2/exceptions.py b/daras_ai_v2/exceptions.py
@@ -2,8 +2,8 @@
 
 import requests
 from requests import HTTPError
-from requests.exceptions import JSONDecodeError
 
+from daras_ai.image_input import truncate_filename
 
 logger = getLogger(__name__)
 
@@ -24,20 +24,15 @@ def raise_for_status(resp: requests.Response):
     else:
         reason = resp.reason
 
-    try:
-        response_body = str(resp.json())
-    except JSONDecodeError:
-        try:
-            response_body = resp.text
-        except ValueError:
-            response_body = resp.content
-    response_body = response_body[:500]  # truncate to at max 500 characters
-
     if 400 <= resp.status_code < 500:
-        http_error_msg = f"{resp.status_code} Client Error: {reason} | URL: {resp.url} | Response: {response_body!r}"
+        http_error_msg = f"{resp.status_code} Client Error: {reason} | URL: {resp.url} | Response: {_response_preview(resp)!r}"
 
     elif 500 <= resp.status_code < 600:
-        http_error_msg = f"{resp.status_code} Server Error: {reason} | URL: {resp.url} | Response: {response_body!r}"
+        http_error_msg = f"{resp.status_code} Server Error: {reason} | URL: {resp.url} | Response: {_response_preview(resp)!r}"
 
     if http_error_msg:
         raise HTTPError(http_error_msg, response=resp)
+
+
+def _response_preview(resp: requests.Response) -> bytes:
+    return truncate_filename(resp.content, 500, sep=b"...")
diff --git a/recipes/DocExtract.py b/recipes/DocExtract.py
@@ -48,12 +48,13 @@
 class Columns(IntegerChoices):
     webpage_url = 1, "url"
     title = 2, "title"
-    description = 3, "Description"
-    content_url = 4, "Content"
-    transcript = 5, "Transcript"
-    translation = 6, "Translation"
-    summary = 7, "Summarized"
-    status = 8, "Status"
+    final_output = 3, "snippet/sections"
+    description = 4, "Description"
+    content_url = 5, "Content URL"
+    transcript = 6, "Transcript"
+    translation = 7, "Translation"
+    summary = 8, "Summarized"
+    status = 9, "Status"
 
 
 class DocExtractPage(BasePage):
@@ -409,8 +410,7 @@ def process_source(
             )
         update_cell(spreadsheet_id, row, Columns.content_url.value, content_url)
 
-    usable_out_col = (Columns.transcript.value, "snippet")
-
+    final_col_name = "snippet"
     transcript = existing_values[Columns.transcript.value]
     if not transcript:
         if (
@@ -427,12 +427,12 @@ def process_source(
             else:
                 params = None
             transcript = str(azure_doc_extract_pages(content_url, params=params)[0])
-            usable_out_col = (Columns.transcript.value, "sections")
         else:
             raise NotImplementedError(
                 f"Unsupported type {doc_meta and doc_meta.mime_type} for {webpage_url}"
             )
         update_cell(spreadsheet_id, row, Columns.transcript.value, transcript)
+    final_value = transcript
 
     if request.google_translate_target:
         translation = existing_values[Columns.translation.value]
@@ -445,30 +445,39 @@ def process_source(
                 glossary_url=request.glossary_document,
             )[0]
             update_cell(spreadsheet_id, row, Columns.translation.value, translation)
-        usable_out_col = (Columns.translation.value, "snippet")
+            final_col_name = "sections"
+        final_value = translation
     else:
         translation = transcript
         update_cell(spreadsheet_id, row, Columns.translation.value, "")
 
     summary = existing_values[Columns.summary.value]
-    if not summary and request.task_instructions:
-        yield "Summarizing"
-        prompt = request.task_instructions.strip() + "\n\n" + translation
-        summary = "\n---\n".join(
-            run_language_model(
-                model=request.selected_model,
-                quality=request.quality,
-                num_outputs=request.num_outputs,
-                temperature=request.sampling_temperature,
-                prompt=prompt,
-                max_tokens=request.max_tokens,
-                avoid_repetition=request.avoid_repetition,
+    if request.task_instructions:
+        if not summary:
+            yield "Summarizing"
+            prompt = request.task_instructions.strip() + "\n\n" + translation
+            summary = "\n---\n".join(
+                run_language_model(
+                    model=request.selected_model,
+                    quality=request.quality,
+                    num_outputs=request.num_outputs,
+                    temperature=request.sampling_temperature,
+                    prompt=prompt,
+                    max_tokens=request.max_tokens,
+                    avoid_repetition=request.avoid_repetition,
+                )
             )
-        )
-        update_cell(spreadsheet_id, row, Columns.summary.value, summary)
+            update_cell(spreadsheet_id, row, Columns.summary.value, summary)
+        if final_col_name == "snippet":
+            final_value = f"content={final_value}\ncontent={summary}"
+            final_col_name = "sections"
+        else:
+            final_value = f"{final_value}\ncontent={summary}"
+    else:
+        update_cell(spreadsheet_id, row, Columns.summary.value, "")
 
-    if usable_out_col:
-        update_cell(spreadsheet_id, 1, *usable_out_col)
+    update_cell(spreadsheet_id, 1, Columns.final_output.value, final_col_name)
+    update_cell(spreadsheet_id, row, Columns.final_output.value, final_value)
 
 
 def google_api_should_retry(e: Exception) -> bool: