Merge pull request #39 from nimh-dsst/improve-bulk-upload
Improve bulk upload
leej3 authored Aug 25, 2024
2 parents 8b81a3c + b16d211 commit 9a6fa93
Showing 12 changed files with 604 additions and 560 deletions.
56 changes: 54 additions & 2 deletions osm/_utils.py
@@ -6,15 +6,38 @@
import shlex
import subprocess
import time
import types
from pathlib import Path

import pandas as pd
import requests

from osm._version import __version__

DEFAULT_OUTPUT_DIR = "./osm_output"
logger = logging.getLogger(__name__)

ERROR_CSV_PATH = Path("error_log.csv")
ERROR_LOG_PATH = Path("error.log")


def write_error_to_file(row: pd.Series, error: Exception):
    with ERROR_CSV_PATH.open("a") as csv_file, ERROR_LOG_PATH.open("a") as log_file:
        # Write the problematic row data to the CSV, add header if not yet populated.
        row.to_csv(
            csv_file,
            header=not ERROR_CSV_PATH.exists() or ERROR_CSV_PATH.stat().st_size == 0,
            index=False,
        )

        # Drop string values as they tend to be too long
        display_row = (
            row.apply(lambda x: x if not isinstance(x, str) else None)
            .dropna()
            .to_dict()
        )
        log_file.write(f"Error processing data:\n {display_row}\nError: {error}\n\n")
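For context, a minimal sketch of how this helper might be called from a bulk-upload loop. The loop and the upload_one callable below are hypothetical illustrations, not part of this commit:

# Hypothetical usage sketch: append failing rows to error_log.csv / error.log
# while iterating over a dataframe during a bulk upload.
def upload_rows(df: pd.DataFrame, upload_one) -> None:
    for _, row in df.iterrows():
        try:
            upload_one(row)  # placeholder for the real per-row upload call
        except Exception as e:
            write_error_to_file(row, e)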


def _get_metrics_dir(output_dir: Path = DEFAULT_OUTPUT_DIR) -> Path:
    metrics_dir = Path(output_dir) / "metrics"
@@ -81,15 +104,15 @@ def wait_for_containers():


def compose_up():
    cmd = shlex.split("docker-compose up -d --build")
    cmd = shlex.split("docker compose up -d --build")
    subprocess.run(
        cmd,
        check=True,
    )


def compose_down():
    cmd = shlex.split("docker-compose down")
    cmd = shlex.split("docker compose down")
    subprocess.run(
        cmd,
        check=True,
@@ -119,3 +142,32 @@ def _setup(args):
print("Waiting for containers to be ready...")
wait_for_containers()
return xml_path, metrics_path


def coerce_to_string(v):
    if isinstance(v, (int, float, bool)):
        return str(v)
    elif isinstance(v, types.NoneType):
        return None
    elif pd.isna(v):
        return None
    elif not isinstance(v, str):
        raise ValueError("string required or a type that can be coerced to a string")
    return v
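A few illustrative calls, assuming the behaviour implied by the branches above (examples are not part of the commit):

coerce_to_string(42)          # -> "42"
coerce_to_string(4.5)         # -> "4.5"
coerce_to_string(True)        # -> "True"
coerce_to_string(None)        # -> None
coerce_to_string(pd.NA)       # -> None  (caught by pd.isna)
coerce_to_string("PMC12345")  # -> "PMC12345" (strings pass through unchanged)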


def flatten_dict(d):
    """
    Recursively flattens a nested dictionary without prepending parent keys.
    :param d: Dictionary to flatten.
    :return: Flattened dictionary.
    """
    items = []
    for k, v in d.items():
        if isinstance(v, dict):
            # If the value is a dictionary, flatten it without the parent key
            items.extend(flatten_dict(v).items())
        else:
            items.append((k, v))
    return dict(items)
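A quick illustration of the flattening behaviour with hypothetical data; because parent keys are not prepended, duplicate keys in different branches would overwrite each other:

nested = {"metrics": {"is_open_data": True, "jif": 3.2}, "pmid": 12345}
flatten_dict(nested)
# -> {"is_open_data": True, "jif": 3.2, "pmid": 12345}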
45 changes: 10 additions & 35 deletions osm/schemas/metrics_schemas.py
@@ -1,16 +1,22 @@
import types
from typing import Optional

from odmantic import EmbeddedModel
from pydantic import field_validator

from osm._utils import coerce_to_string

from .custom_fields import LongStr


# The rtransparent tool can extract from parsed pdfs or from XML directly from pubmed central. The latter has many more fields.
# all_indicators.csv from the rtransparent publication has both but has the following extra fields:
# code_text,com_code,com_data_availibility,com_file_formats,com_general_db,com_github_data,com_specific_db,com_suppl_code,com_supplemental_data,data_text,dataset,eigenfactor_score,field,is_art,is_code_pred,is_data_pred,is_relevant_code,is_relevant_data,jif,n_cite,score,year,
class RtransparentMetrics(EmbeddedModel):
    model_config = {
        "json_encoders": {
            LongStr: lambda v: v.get_value(),
        },
    }
    # Mandatory fields
    is_open_code: Optional[bool]
    is_open_data: Optional[bool]
@@ -146,7 +152,7 @@ class RtransparentMetrics(EmbeddedModel):
    is_success: Optional[bool] = None
    is_art: Optional[bool] = None
    field: Optional[str] = None
    score: Optional[int] = None
    score: Optional[float] = None
    jif: Optional[float] = None
    eigenfactor_score: Optional[float] = None
    n_cite: Optional[float] = None
@@ -189,36 +195,5 @@ class RtransparentMetrics(EmbeddedModel):
    is_code_pred: Optional[bool] = None

    @field_validator("article")
    def coerce_to_string(cls, v):
        if isinstance(v, (int, float, bool)):
            return str(v)
        elif isinstance(v, types.NoneType):
            return None
        elif not isinstance(v, str):
            raise ValueError(
                "string required or a type that can be coerced to a string"
            )
        return v


# Tried to define programmatically but both ways seemed to yield a model class without type annotated fields...

# 1
# RtransparentMetrics = type(
#     "RtransparentMetrics",
#     (Model,),
#     {n: Optional[t] for n, t in rtransparent_metric_types.items()},
# )

# 2
# Use Field to explicitly define the fields in the model
# namespace = {
#     n: (Optional[t], Field(default=None))
#     for n, t in rtransparent_metric_types.items()
# }
# Dynamically create the Pydantic/ODMantic model
# RtransparentMetrics: Type[Model] = type(
#     "RtransparentMetrics",
#     (Model,),
#     namespace,
# )
    def fix_string(cls, v):
        return coerce_to_string(v)
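For context, a sketch of what the renamed validator does at model construction time. The example is abridged and assumes the article field (and any other required fields) are declared elsewhere in this file, so the exact constructor arguments shown here are an assumption:

# Abridged sketch (other required fields omitted): a numeric article
# identifier is coerced to a string by fix_string via coerce_to_string.
metrics = RtransparentMetrics(
    is_open_code=None,
    is_open_data=None,
    article=3010964,  # hypothetical numeric identifier
)
assert metrics.article == "3010964"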