Skip to content

Commit

Permalink
Merge pull request #45 from nimh-dsst/fix-cli
Browse files Browse the repository at this point in the history
Fix cli
  • Loading branch information
leej3 authored Aug 27, 2024
2 parents 7b33d0d + 1434111 commit 57d1f1c
Show file tree
Hide file tree
Showing 14 changed files with 177 additions and 119 deletions.
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,14 @@ osm -f path/to/pdf-or-xml2 -u uuid2 --user-managed-compose

# Contributing

N.B. On Apple silicon you must use emulation i.e. `export DOCKER_DEFAULT_PLATFORM=linux/amd64`
N.B. On Apple silicon you must use emulation and download the mongo container in advance:

If you wish to contribute to this project you can set up a development environment with the following:
```
export DOCKER_DEFAULT_PLATFORM=linux/amd64
docker pull mongo:4.4.6
```

To contribute to the project please run the following commands to set up a development environment:

```
pip install -e .
Expand Down
6 changes: 3 additions & 3 deletions compose.development.override.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ services:
web_api:
container_name: web_api
environment:
- MONGODB_URI=mongodb://db:27017/test
- MONGODB_URI=mongodb://db:27017/osm
build:
context: .
dockerfile: ./web/api/Dockerfile
Expand All @@ -30,7 +30,7 @@ services:
context: .
dockerfile: ./web/dashboard/Dockerfile
environment:
- MONGODB_URI=mongodb://db:27017/test
- MONGODB_URI=mongodb://db:27017/osm
working_dir: /app
ports:
- "8501:8501"
Expand All @@ -45,4 +45,4 @@ services:
ports:
- 27017:27017
environment:
- MONGO_INITDB_DATABASE=test
- MONGO_INITDB_DATABASE=osm
5 changes: 0 additions & 5 deletions compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,3 @@ services:
image: nimhdsst/rtransparent:staging
ports:
- "8071:8071"
healthcheck:
test: ["CMD", "curl", "--include", "--request", "GET", "http://localhost:8071/health"]
interval: 1s
timeout: 3s
retries: 3
26 changes: 11 additions & 15 deletions external_components/rtransparent/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pandas as pd
import psutil
import rpy2.robjects as ro
from fastapi import FastAPI, HTTPException, Query, Request, status
from fastapi import FastAPI, File, HTTPException, Query, UploadFile, status
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from rpy2.robjects import pandas2ri
Expand Down Expand Up @@ -51,15 +51,14 @@ def rtransparent_metric_extraction(
future = importr("future")
future.plan(future.multisession, workers=workers)

# Write the XML content to a temporary file
with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as temp_xml_file:
temp_xml_file.write(xml_content)
temp_xml_file_path = temp_xml_file.name
if parser == "pmc":
if parser == "PMCParser":
# XML files from pubmedcentral can have extra metadata extracted
df = extract_from_pmc_xml(temp_xml_file_path, rtransparent)
else:
df = extract_from_xml(temp_xml_file_path, rtransparent)
# Clean up the temporary file
temp_xml_file.close()
Path(temp_xml_file_path).unlink()
return df
Expand All @@ -81,7 +80,11 @@ def extract_from_xml(temp_xml_file_path, rtransparent):

def extract_from_pmc_xml(temp_xml_file_path, rtransparent):
    """Extract transparency metrics from a PMC-derived XML file.

    Not implemented. PMC XMLs do not reliably contain the data-sharing
    statements, and the values returned from R contain NA sentinels that
    would first need converting to appropriate Python types.

    Args:
        temp_xml_file_path: Path to the temporary XML file on disk.
        rtransparent: Handle to the imported rtransparent R package.

    Raises:
        NotImplementedError: always, until PMC support is prioritized.
    """
    # Fixed typos in the message ("a not a priority" -> "not a priority",
    # "R Na types" -> "R NA types").
    raise NotImplementedError(
        """
        Not all XML files provided at pubmedcentral include the datasharing
        statements, so this is not a priority. The data returned contains R NA
        types which need to be converted to an appropriate python type.
        """
    )
# dfs = {}
# with (ro.default_converter + pandas2ri.converter).context():
Expand All @@ -105,24 +108,17 @@ def extract_from_pmc_xml(temp_xml_file_path, rtransparent):


@app.post("/extract-metrics/")
async def extract_metrics(request: Request, parser: str = Query("other")):
async def extract_metrics(file: UploadFile = File(...), parser: str = Query("other")):
try:
# Attempt to read the XML content from the request body
xml_content = await request.body()
xml_content = await file.read()
if not xml_content:
raise NotImplementedError(
"""For now the XML content must be provided. Check the output of
the parsing stage."""
)

metrics_df = rtransparent_metric_extraction(xml_content, parser)

# Log the extracted metrics
logger.info(metrics_df)

# Return the first row as a JSON response
logger.info(metrics_df.info())
return JSONResponse(content=metrics_df.iloc[0].to_dict(), status_code=200)

except Exception as e:
# Handle exceptions and return a 500 Internal Server Error
raise HTTPException(status_code=500, detail=str(e))
45 changes: 29 additions & 16 deletions osm/_utils.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import argparse
import datetime
import logging
import os
import shlex
import subprocess
import time
import types
from pathlib import Path
from time import sleep

import pandas as pd
import requests
Expand Down Expand Up @@ -47,6 +47,12 @@ def _get_text_dir(output_dir: Path = DEFAULT_OUTPUT_DIR) -> Path:
return text_dir


def _get_logs_dir(output_dir: Path = DEFAULT_OUTPUT_DIR) -> Path:
    """Return the logs directory under *output_dir*, creating it if absent."""
    target = Path(output_dir, "logs")
    target.mkdir(parents=True, exist_ok=True)
    return target


def _existing_file(path_string):
path = Path(path_string)
if not path.exists():
Expand All @@ -59,6 +65,10 @@ def get_compute_context_id():


def wait_for_containers():
"""
A hack for now, on Apple Silicon, the parser container fails. Ideally we
would just use the wait kwargs for docker compose up
"""
while True:
try:
response = requests.get("http://localhost:8071/health")
Expand All @@ -71,19 +81,24 @@ def wait_for_containers():


def compose_up():
    """Build and start the docker compose services, waiting until ready.

    python_on_whales is imported lazily so the rest of the module can be
    used without it (e.g. with --user-managed-compose).
    """
    from python_on_whales import docker

    print("Waiting for containers to be ready...")
    # wait=True blocks until compose reports the services as up/healthy.
    docker.compose.up(detach=True, wait=True)
    print("Containers ready!")
    # presumably gives services a moment to settle after the wait -- TODO confirm
    sleep(5)


def compose_down():
    """Dump container logs to a timestamped file, then stop the services.

    Logs are captured before `down` so they are not lost when the
    containers are removed.
    """
    from python_on_whales import docker

    docker_log = _get_logs_dir() / (
        datetime.datetime.now().strftime("docker_log_%Y%m%d_%H%M%S.txt")
    )
    docker_log.write_text(docker.compose.logs())
    docker.compose.down()
    print(f"Logs of docker containers are saved at {docker_log}")


def _setup(args):
Expand All @@ -104,17 +119,15 @@ def _setup(args):
metrics_path = _get_metrics_dir() / f"{args.uid}.json"
if metrics_path.exists():
raise FileExistsError(metrics_path)
# create logs directory if necessary
_ = _get_logs_dir()
if not args.user_managed_compose:
compose_up()

logger.info("Waiting for containers to be ready...")
print("Waiting for containers to be ready...")
wait_for_containers()
print("Containers ready!")
return xml_path, metrics_path


def coerce_to_string(v):
"Can be useful for schemas with permissive string fields"
if isinstance(v, (int, float, bool)):
return str(v)
elif isinstance(v, types.NoneType):
Expand Down
7 changes: 4 additions & 3 deletions osm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from osm._utils import DEFAULT_OUTPUT_DIR, _existing_file, _setup, compose_down
from osm.pipeline.core import Pipeline, Savers
from osm.pipeline.extractors import RTransparentExtractor
from osm.pipeline.parsers import NoopParser, ScienceBeamParser
from osm.pipeline.parsers import NoopParser, PMCParser, ScienceBeamParser
from osm.pipeline.savers import FileSaver, JSONSaver, OSMSaver

PARSERS = {
"sciencebeam": ScienceBeamParser,
"no-op": NoopParser,
"pmc": PMCParser,
}
EXTRACTORS = {
"rtransparent": RTransparentExtractor,
Expand Down Expand Up @@ -41,7 +42,7 @@ def parse_args():
choices=PARSERS.keys(),
default=["sciencebeam"],
nargs="+",
help="Select the tool for parsing the input document. Default is 'sciencebeam'.",
help="Select the tool for parsing the input document. Default is 'sciencebeam'. Choose 'pmc' for xml files from pubmed central.",
)
parser.add_argument(
"--metrics-type",
Expand Down Expand Up @@ -76,7 +77,7 @@ def main():
xml_path, metrics_path = _setup(args)

pipeline = Pipeline(
filepath=args.filepath,
input_path=args.filepath,
xml_path=xml_path,
metrics_path=metrics_path,
parsers=[PARSERS[p]() for p in args.parser],
Expand Down
30 changes: 17 additions & 13 deletions osm/pipeline/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,14 @@ def __init__(self, version: str = "0.0.1"):
self._orm_model = None

@abstractmethod
def run(self, data: Any, **kwargs) -> Any:
def _run(self, data: bytes, **kwargs) -> Any:
"""Abstract method that subclasses must implement."""
pass

def run(self, data: bytes, *args, **kwargs) -> Any:
print(f"{self.name} (version {self.version}) is running.")
return self._run(data, *args, **kwargs)

def _get_orm_fields(self) -> dict[str, Any]:
fields = {}
for fieldname in self.orm_model_class.model_fields.keys():
Expand Down Expand Up @@ -60,19 +65,19 @@ def __iter__(self):
yield self.osm_saver

def save_file(self, data: str, path: Path):
self.file_saver.run(data, path)
self.file_saver.run(data, path=path)

def save_json(self, data: dict, path: Path):
self.json_saver.run(data, path)
self.json_saver.run(data, path=path)

def save_osm(
self,
file_in: bytes,
data: bytes,
metrics: dict,
components: list,
):
# Call the method to save or upload the data
self.osm_saver.run(file_in, metrics, components)
self.osm_saver.run(data, metrics=metrics, components=components)


class Pipeline:
Expand All @@ -82,14 +87,14 @@ def __init__(
parsers: list[Component],
extractors: list[Component],
savers: Savers,
filepath: str,
input_path: str,
xml_path: Optional[str] = None,
metrics_path: Optional[str] = None,
):
self.parsers = parsers
self.extractors = extractors
self.savers = savers
self.filepath = filepath
self.input_path = input_path
self._file_data = None
self.xml_path = xml_path
self.metrics_path = metrics_path
Expand All @@ -100,22 +105,21 @@ def run(self):
if isinstance(parsed_data, str):
self.savers.save_file(parsed_data, self.xml_path)
for extractor in self.extractors:
# extracted_metrics = extractor.run(parsed_data,parser=parser.name)
extracted_metrics = extractor.run(parsed_data)
extracted_metrics = extractor.run(parsed_data, parser=parser.name)
self.savers.save_osm(
file_in=self.file_data,
data=self.file_data,
metrics=extracted_metrics,
components=[*self.parsers, *self.extractors, *self.savers],
)
self.savers.save_json(extracted_metrics, self.metrics_path)

@staticmethod
def read_file(filepath: str) -> bytes:
with open(filepath, "rb") as file:
def read_file(input_path: str) -> bytes:
with open(input_path, "rb") as file:
return file.read()

@property
def file_data(self):
if not self._file_data:
self._file_data = self.read_file(self.filepath)
self._file_data = self.read_file(self.input_path)
return self._file_data
22 changes: 15 additions & 7 deletions osm/pipeline/extractors.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
import io
import logging

import requests

from osm.schemas.custom_fields import LongBytes

from .core import Component

logger = logging.getLogger(__name__)


class RTransparentExtractor(Component):
def run(self, data: str, parser: str = None) -> dict:
headers = {"Content-Type": "application/octet-stream"}
def _run(self, data: bytes, parser: str = None) -> dict:
self.sample = LongBytes(data)

# Prepare the file to be sent as a part of form data
files = {"file": ("input.xml", io.BytesIO(data), "application/xml")}

# Send the request with the file
response = requests.post(
"http://localhost:8071/extract-metrics",
data=data,
headers=headers,
"http://localhost:8071/extract-metrics/",
files=files,
params={"parser": parser},
)

if response.status_code == 200:
metrics = response.json()
# pmid only exists when input filename is correct
metrics.pop("pmid")
# replace bizarre sentinel value
metrics.pop("pmid", None) # Use .pop() with a default to avoid KeyError
# Replace NA value
for k, v in metrics.items():
if v == -2147483648:
metrics[k] = None
Expand Down
19 changes: 15 additions & 4 deletions osm/pipeline/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,24 @@
class NoopParser(Component):
    """Used if the input is xml and so needs no parsing."""

    def _run(self, data: bytes) -> bytes:
        # Pass the input bytes through unchanged.
        return data


class PMCParser(NoopParser):
    """
    Used if the input is a PMC derived XML and so needs no parsing. PMC
    parsed XMLs can have unique features. For example RTransparent extracts
    additional metadata from them aided by the document structure. Sometimes
    data sharing statements etc. may not be in the PMC parsed XML despite being
    in the pdf version of the publication.
    """

    # Inherits _run from NoopParser: the XML bytes are passed through unchanged.
    # The distinct class name lets downstream consumers branch on parser.name.
    pass


class ScienceBeamParser(Component):
def run(self, data: bytes) -> str:
def _run(self, data: bytes) -> str:
self.sample = LongBytes(data)
headers = {"Accept": "application/tei+xml", "Content-Type": "application/pdf"}
response = requests.post(SCIENCEBEAM_URL, data=data, headers=headers)
Expand Down
Loading

0 comments on commit 57d1f1c

Please sign in to comment.