Merge pull request #39 from nimh-dsst/improve-bulk-upload
Improve bulk upload
leej3 authored Aug 25, 2024
2 parents 8b81a3c + b16d211 commit 9a6fa93
Showing 12 changed files with 604 additions and 560 deletions.
56 changes: 54 additions & 2 deletions osm/_utils.py
@@ -6,15 +6,38 @@
import shlex
import subprocess
import time
import types
from pathlib import Path

import pandas as pd
import requests

from osm._version import __version__

DEFAULT_OUTPUT_DIR = "./osm_output"
logger = logging.getLogger(__name__)

ERROR_CSV_PATH = Path("error_log.csv")
ERROR_LOG_PATH = Path("error.log")


def write_error_to_file(row: pd.Series, error: Exception):
    with ERROR_CSV_PATH.open("a") as csv_file, ERROR_LOG_PATH.open("a") as log_file:
        # Write the problematic row data to the CSV, add header if not yet populated.
        row.to_csv(
            csv_file,
            header=not ERROR_CSV_PATH.exists() or ERROR_CSV_PATH.stat().st_size == 0,
            index=False,
        )

        # Drop string values as they tend to be too long
        display_row = (
            row.apply(lambda x: x if not isinstance(x, str) else None)
            .dropna()
            .to_dict()
        )
        log_file.write(f"Error processing data:\n {display_row}\nError: {error}\n\n")
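For context, a minimal sketch of how this helper might be called from a bulk-upload loop. The loop and the upload_one callable below are hypothetical illustrations, not part of this commit:

# Hypothetical usage sketch: append failing rows to error_log.csv / error.log
# while iterating over a dataframe during a bulk upload.
def upload_rows(df: pd.DataFrame, upload_one) -> None:
    for _, row in df.iterrows():
        try:
            upload_one(row)  # placeholder for the real per-row upload call
        except Exception as e:
            write_error_to_file(row, e)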


def _get_metrics_dir(output_dir: Path = DEFAULT_OUTPUT_DIR) -> Path:
    metrics_dir = Path(output_dir) / "metrics"
@@ -81,15 +104,15 @@ def wait_for_containers():


def compose_up():
    cmd = shlex.split("docker-compose up -d --build")
    cmd = shlex.split("docker compose up -d --build")
    subprocess.run(
        cmd,
        check=True,
    )


def compose_down():
    cmd = shlex.split("docker-compose down")
    cmd = shlex.split("docker compose down")
    subprocess.run(
        cmd,
        check=True,
@@ -119,3 +142,32 @@ def _setup(args):
print("Waiting for containers to be ready...")
wait_for_containers()
return xml_path, metrics_path


def coerce_to_string(v):
    if isinstance(v, (int, float, bool)):
        return str(v)
    elif isinstance(v, types.NoneType):
        return None
    elif pd.isna(v):
        return None
    elif not isinstance(v, str):
        raise ValueError("string required or a type that can be coerced to a string")
    return v
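A few illustrative calls, assuming the behaviour implied by the branches above (examples are not part of the commit):

coerce_to_string(42)          # -> "42"
coerce_to_string(4.5)         # -> "4.5"
coerce_to_string(True)        # -> "True"
coerce_to_string(None)        # -> None
coerce_to_string(pd.NA)       # -> None  (caught by pd.isna)
coerce_to_string("PMC12345")  # -> "PMC12345" (strings pass through unchanged)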


def flatten_dict(d):
    """
    Recursively flattens a nested dictionary without prepending parent keys.
    :param d: Dictionary to flatten.
    :return: Flattened dictionary.
    """
    items = []
    for k, v in d.items():
        if isinstance(v, dict):
            # If the value is a dictionary, flatten it without the parent key
            items.extend(flatten_dict(v).items())
        else:
            items.append((k, v))
    return dict(items)
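A quick illustration of the flattening behaviour with hypothetical data; because parent keys are not prepended, duplicate keys in different branches would overwrite each other:

nested = {"metrics": {"is_open_data": True, "jif": 3.2}, "pmid": 12345}
flatten_dict(nested)
# -> {"is_open_data": True, "jif": 3.2, "pmid": 12345}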
45 changes: 10 additions & 35 deletions osm/schemas/metrics_schemas.py
@@ -1,16 +1,22 @@
import types
from typing import Optional

from odmantic import EmbeddedModel
from pydantic import field_validator

from osm._utils import coerce_to_string

from .custom_fields import LongStr


# The rtransparent tool can extract from parsed pdfs or from XML directly from pubmed central. The latter has many more fields.
# all_indicators.csv from the rtransparent publication has both but has the following extra fields:
# code_text,com_code,com_data_availibility,com_file_formats,com_general_db,com_github_data,com_specific_db,com_suppl_code,com_supplemental_data,data_text,dataset,eigenfactor_score,field,is_art,is_code_pred,is_data_pred,is_relevant_code,is_relevant_data,jif,n_cite,score,year,
class RtransparentMetrics(EmbeddedModel):
    model_config = {
        "json_encoders": {
            LongStr: lambda v: v.get_value(),
        },
    }
    # Mandatory fields
    is_open_code: Optional[bool]
    is_open_data: Optional[bool]
@@ -146,7 +152,7 @@ class RtransparentMetrics(EmbeddedModel):
    is_success: Optional[bool] = None
    is_art: Optional[bool] = None
    field: Optional[str] = None
    score: Optional[int] = None
    score: Optional[float] = None
    jif: Optional[float] = None
    eigenfactor_score: Optional[float] = None
    n_cite: Optional[float] = None
@@ -189,36 +195,5 @@ class RtransparentMetrics(EmbeddedModel):
    is_code_pred: Optional[bool] = None

    @field_validator("article")
    def coerce_to_string(cls, v):
        if isinstance(v, (int, float, bool)):
            return str(v)
        elif isinstance(v, types.NoneType):
            return None
        elif not isinstance(v, str):
            raise ValueError(
                "string required or a type that can be coerced to a string"
            )
        return v


# Tried to define programmatically but both ways seemed to yield a model class without type annotated fields...

# 1
# RtransparentMetrics = type(
#     "RtransparentMetrics",
#     (Model,),
#     {n: Optional[t] for n, t in rtransparent_metric_types.items()},
# )

# 2
# Use Field to explicitly define the fields in the model
# namespace = {
#     n: (Optional[t], Field(default=None))
#     for n, t in rtransparent_metric_types.items()
# }
# Dynamically create the Pydantic/ODMantic model
# RtransparentMetrics: Type[Model] = type(
#     "RtransparentMetrics",
#     (Model,),
#     namespace,
# )
    def fix_string(cls, v):
        return coerce_to_string(v)
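For context, a sketch of what the renamed validator does at model construction time. The example is abridged and assumes the article field (and any other required fields) are declared elsewhere in this file, so the exact constructor arguments shown here are an assumption:

# Abridged sketch (other required fields omitted): a numeric article
# identifier is coerced to a string by fix_string via coerce_to_string.
metrics = RtransparentMetrics(
    is_open_code=None,
    is_open_data=None,
    article=3010964,  # hypothetical numeric identifier
)
assert metrics.article == "3010964"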