Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reorg with fixes #32

Merged
merged 1 commit into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.terraform
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ jobs:

- name: Build and push Docker image
run: |
DOCKER_BUILDKIT=1 docker build -t osm_web_api:${{ github.event.inputs.environment || 'production' }} -f ./docker_images/web_api/Dockerfile .
DOCKER_BUILDKIT=1 docker build -t osm_web_api:${{ github.event.inputs.environment || 'production' }} -f ./web/app/Dockerfile .
docker tag osm_web_api:${{ github.event.inputs.environment || 'production' }}:latest ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/osm_web_api:${{ github.event.inputs.environment || 'production' }}
docker push ${{ secrets.AWS_ACCOUNT_ID }}.dkr.ecr.${{ secrets.AWS_REGION }}.amazonaws.com/osm_web_api:${{ github.event.inputs.environment || 'production' }}

Expand Down
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ repos:
hooks:
- id: trailing-whitespace
files: ".*\\.py"
exclude: "examples|docs/examples"
exclude: "examples|docs/examples|tests/data"
- id: check-added-large-files
- id: check-toml
- id: end-of-file-fixer
exclude: "examples|docs/examples"
exclude: "examples|docs/examples|tests/data"

# - repo: https://github.com/pre-commit/mirrors-prettier
# rev: v4.0.0-alpha.8
Expand Down
30 changes: 29 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,41 @@ OpenSciMetrics (OSM) applies NLP and LLM-based metrics and indicators related to

N.B. pdf parsing does not work on Apple silicon...

- With docker-compose and python >3.11 installed, runng the following from the project's root directory:
- With docker-compose and python >=3.11 installed, run the following from the project's root directory:

```
pip install .
osm -f path/to/pdf-or-xml -u uuid
```

If you have many files to upload, you may wish to start up the docker-compose dependencies in a separate terminal window:

```
docker compose up # docker-compose on some systems
```

And then tell the osm tool that this has been handled:

```
osm -f path/to/pdf-or-xml -u uuid --user-managed-compose
osm -f path/to/pdf-or-xml2 -u uuid2 --user-managed-compose
```

# Contributing

If you wish to contribute to this project you can set up a development environment with the following:

```
pip install -e .
docker compose -f compose.yaml -f compose.development.override.yaml up --build
```
And in another terminal:

```
osm -f path/to/pdf-or-xml -u uuid --user-managed-compose
```


## Using pre-commit for commit checks

Pre-commit will run all of its hooks on every commit you make. To install
Expand Down
21 changes: 16 additions & 5 deletions compose.override.yaml → compose.development.override.yaml
Original file line number Diff line number Diff line change
@@ -1,26 +1,37 @@
name: local-osm
services:
rtransparent:
build:
context: .
dockerfile: docker_images/rtransparent/Dockerfile
dockerfile: ./external_components/rtransparent/Dockerfile
volumes:
- ./docker_images/rtransparent:/app
- ./external_components/rtransparent:/app

osm_web_api:
web_api:
environment:
- MONGODB_URI=mongodb://db:27017/test
build:
context: .
dockerfile: ./docker_images/web_api/Dockerfile
dockerfile: ./web/api/Dockerfile
ports:
- 80:80
volumes:
- ./docker_images/web_api:/app/app
- ./web/api:/app/app
working_dir: /app/app
command: ["fastapi","dev","--host","0.0.0.0","--port","80"]
depends_on:
- db

dashboard:
build:
context: .
dockerfile: ./web/dashboard/Dockerfile
environment:
- MONGODB_URI=mongodb://db:27017/test
working_dir: /app
ports:
- "8501:8501"

db:
# use old version of mongo to avoid Apple Instruction set error
image: mongo:4.4.6
Expand Down
19 changes: 4 additions & 15 deletions external_components/rtransparent/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,7 @@ SHELL ["/bin/bash", "--login", "-c"]
# Set working directory
WORKDIR /app

# Install debugging tools
RUN apt-get update && apt-get install -y \
git \
curl \
iputils-ping \
net-tools \
&& rm -rf /var/lib/apt/lists/*

COPY docker_images/rtransparent/environment.yaml /app
COPY external_components/rtransparent/environment.yaml /app

# Create the environment
RUN conda env create -f environment.yaml
Expand All @@ -20,16 +12,13 @@ RUN conda env create -f environment.yaml
RUN echo "source /opt/conda/etc/profile.d/conda.sh && conda activate osm" | tee -a ~/.bashrc /etc/profile /etc/profile.d/conda.sh /etc/skel/.bashrc /etc/skel/.profile > /dev/null

RUN R -e '\
install.packages("roadoi", repos = "http://cran.us.r-project.org"); \
devtools::install_github("quest-bih/oddpub"); \
devtools::install_github("cran/crminer"); \
devtools::install_github("serghiou/metareadr"); \
devtools::install_github("quest-bih/oddpub",ref="c5b091c7e82ed6177192dc380a515b3dc6304863"); \
devtools::install_github("serghiou/rtransparent", build_vignettes = F)'

# # Copy the project files and install the package
COPY docker_images/rtransparent/app.py /app
COPY external_components/rtransparent/app.py /app

# Make entrypoint etc. convenient for users
COPY docker_images/_entrypoint.sh /usr/local/bin/_entrypoint.sh
COPY external_components/_entrypoint.sh /usr/local/bin/_entrypoint.sh
ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"]
CMD ["fastapi", "dev", "--host", "0.0.0.0", "--port", "8071"]
78 changes: 63 additions & 15 deletions external_components/rtransparent/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
import tempfile
from pathlib import Path

import pandas as pd
import psutil
import rpy2.robjects as ro
from fastapi import FastAPI, HTTPException, Request, status
from fastapi import FastAPI, HTTPException, Query, Request, status
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from rpy2.robjects import pandas2ri
Expand Down Expand Up @@ -44,7 +45,7 @@ def get_health() -> HealthCheck:


def rtransparent_metric_extraction(
xml_content: bytes, workers: int = psutil.cpu_count()
xml_content: bytes, parser: str, workers: int = psutil.cpu_count()
):
rtransparent = importr("rtransparent")
future = importr("future")
Expand All @@ -54,27 +55,74 @@ def rtransparent_metric_extraction(
with tempfile.NamedTemporaryFile(delete=False, suffix=".xml") as temp_xml_file:
temp_xml_file.write(xml_content)
temp_xml_file_path = temp_xml_file.name

with (ro.default_converter + pandas2ri.converter).context():
df = ro.conversion.get_conversion().rpy2py(
rtransparent.rt_all(temp_xml_file_path)
)

if parser == "pmc":
df = extract_from_pmc_xml(temp_xml_file_path, rtransparent)
else:
df = extract_from_xml(temp_xml_file_path, rtransparent)
# Clean up the temporary file
temp_xml_file.close()
Path(temp_xml_file_path).unlink()

return df


# from osm.schemas import Invocation
@app.post("/extract-metrics")
async def extract_metrics(request: Request):
def extract_from_xml(temp_xml_file_path, rtransparent):
dfs = {}
with (ro.default_converter + pandas2ri.converter).context():
dfs["data_code"] = ro.conversion.get_conversion().rpy2py(
rtransparent.rt_data_code(temp_xml_file_path)
)
# "all" contains fund, register, and coi outputs
with (ro.default_converter + pandas2ri.converter).context():
dfs["all"] = ro.conversion.get_conversion().rpy2py(
rtransparent.rt_all(temp_xml_file_path)
)
return pd.concat([dfs["all"], dfs["data_code"].drop(columns=["article"])], axis=1)


def extract_from_pmc_xml(temp_xml_file_path, rtransparent):
raise NotImplementedError(
"Not all XML files provided at pubmedcentral include the datasharing statements."
)
# dfs = {}
# with (ro.default_converter + pandas2ri.converter).context():
# dfs["meta_pmc"] = ro.conversion.get_conversion().rpy2py(
# rtransparent.rt_meta_pmc(temp_xml_file_path)
# )
# # data_code_pmc is a subset of all_pmc
# with (ro.default_converter + pandas2ri.converter).context():
# dfs["all_pmc"] = ro.conversion.get_conversion().rpy2py(
# rtransparent.rt_all_pmc(temp_xml_file_path)
# )
# return pd.concat(
# [
# dfs["all_pmc"],
# dfs["meta_pmc"].drop(
# columns=["doi", "filename", "is_success", "pmcid_pmc", "pmid"]
# ),
# ],
# axis=1,
# )


@app.post("/extract-metrics/")
async def extract_metrics(request: Request, parser: str = Query("other")):
try:
# Attempt to read the XML content from the request body
xml_content = await request.body()
metrics_df = rtransparent_metric_extraction(xml_content)
if not xml_content:
raise NotImplementedError(
"""For now the XML content must be provided. Check the output of
the parsing stage."""
)

metrics_df = rtransparent_metric_extraction(xml_content, parser)

# Log the extracted metrics
logger.info(metrics_df)
metrics_json = metrics_df.to_json(orient="records")
return JSONResponse(content=metrics_json, status_code=200)

# Return the first row as a JSON response
return JSONResponse(content=metrics_df.iloc[0].to_dict(), status_code=200)

except Exception as e:
# Handle exceptions and return a 500 Internal Server Error
raise HTTPException(status_code=500, detail=str(e))
11 changes: 8 additions & 3 deletions osm/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ def get_compute_context_id():


def _upload_data(args, file_in, xml, metrics, components):
"""
TODO: add in derivatives and components
"""
osm_api = os.environ.get("OSM_API", "http://localhost:80")

payload = {
Expand Down Expand Up @@ -106,11 +103,19 @@ def _setup(args):
if args.filepath.name.endswith(".pdf"):
if xml_path.exists():
raise FileExistsError(xml_path)
elif args.filepath.name.endswith(".xml"):
logger.warning(
"""The input file is an xml file. Skipping the pdf to text
conversion and so ignoring requested parsers."""
)
args.parser = ["no-op"]
metrics_path = _get_metrics_dir() / f"{args.uid}.json"
if metrics_path.exists():
raise FileExistsError(metrics_path)
if not args.user_managed_compose:
compose_up()

logger.info("Waiting for containers to be ready...")
print("Waiting for containers to be ready...")
wait_for_containers()
return xml_path, metrics_path
35 changes: 15 additions & 20 deletions osm/cli.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import argparse

from osm._utils import DEFAULT_OUTPUT_DIR, _existing_file, _setup, compose_down
from osm.pipeline.core import Pipeline
from osm.pipeline.core import Pipeline, Savers
from osm.pipeline.extractors import RTransparentExtractor
from osm.pipeline.parsers import ScienceBeamParser
from osm.pipeline.savers import FileSaver, JSONSaver, OSMSaver, Savers
from osm.pipeline.parsers import NoopParser, ScienceBeamParser
from osm.pipeline.savers import FileSaver, JSONSaver, OSMSaver

PARSERS = {
"sciencebeam": ScienceBeamParser,
"no-op": NoopParser,
}
EXTRACTORS = {
"rtransparent": RTransparentExtractor,
Expand Down Expand Up @@ -72,14 +73,22 @@ def main():
args = parse_args()
try:
xml_path, metrics_path = _setup(args)

pipeline = Pipeline(
filepath=args.filepath,
xml_path=xml_path,
metrics_path=metrics_path,
parsers=[PARSERS[p] for p in args.parser],
extractors=[EXTRACTORS[m] for m in args.metrics_type],
parsers=[PARSERS[p]() for p in args.parser],
extractors=[EXTRACTORS[m]() for m in args.metrics_type],
savers=Savers(
file_saver=FileSaver(), json_saver=JSONSaver(), osm_saver=OSMSaver()
file_saver=FileSaver(),
json_saver=JSONSaver(),
osm_saver=OSMSaver(
comment=args.comment,
email=args.email,
user_defined_id=args.uid,
filename=args.filepath.name,
),
),
)
pipeline.run()
Expand All @@ -90,17 +99,3 @@ def main():

if __name__ == "__main__":
main()

# def main():
# args = parse_args()
# try:
# pipeline = _setup(args)
# pipeline.parse()
# pipeline.extract()
# pipeline.save()
# xml_path, metrics_path, parser, extractor = _setup(args)
# xml = parser.parse()
# xml_path.write_bytes(xml)
# metrics = _extract(xml)
# metrics_path.write_text(json.dumps(metrics))
# _upload_data(args, file_in, xml, metrics,components)
Loading
Loading