
Commit

start to integrate
leej3 committed Aug 28, 2024
1 parent c30ea88 commit c5243f1
Showing 7 changed files with 252 additions and 208 deletions.
84 changes: 0 additions & 84 deletions external_components/chat_gpt_4o_2024_08_06/extract.py

This file was deleted.

123 changes: 0 additions & 123 deletions external_components/chat_gpt_4o_2024_08_06/extract_2.py

This file was deleted.

24 changes: 24 additions & 0 deletions external_components/llm_extraction/Dockerfile
@@ -0,0 +1,24 @@
FROM condaforge/mambaforge:24.3.0-0
SHELL ["/bin/bash", "--login", "-c"]
# Set working directory
WORKDIR /app

COPY external_components/rtransparent/environment.yaml /app

# Create the environment
RUN conda env create -f environment.yaml

# Ensure the conda environment is activated
RUN echo "source /opt/conda/etc/profile.d/conda.sh && conda activate osm" | tee -a ~/.bashrc /etc/profile /etc/profile.d/conda.sh /etc/skel/.bashrc /etc/skel/.profile > /dev/null

RUN R -e '\
devtools::install_github("quest-bih/oddpub",ref="c5b091c7e82ed6177192dc380a515b3dc6304863"); \
devtools::install_github("serghiou/rtransparent", build_vignettes = F)'

# Copy the project files and install the package
COPY external_components/rtransparent/app.py /app

# Make entrypoint etc. convenient for users
COPY external_components/_entrypoint.sh /usr/local/bin/_entrypoint.sh
ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"]
CMD ["fastapi", "dev", "--host", "0.0.0.0", "--port", "8071"]
77 changes: 77 additions & 0 deletions external_components/llm_extraction/app.py
@@ -0,0 +1,77 @@
import logging

from fastapi import FastAPI, File, HTTPException, Query, UploadFile
from llama_index.core import ChatPromptTemplate
from llama_index.core.llms import LLM, ChatMessage
from llama_index.llms.openai import OpenAI
from llama_index.program.openai import OpenAIPydanticProgram

# from pydantic import BaseModel, Field
from osm.schemas.metrics_schemas import LLMExtractor

LLM_MODELS = {"gpt-4o-2024-08-06": OpenAI(model="gpt-4o-2024-08-06")}


logger = logging.getLogger(__name__)
app = FastAPI()


def get_program(llm: LLM) -> OpenAIPydanticProgram:
    prompt = ChatPromptTemplate(
        message_templates=[
            ChatMessage(
                role="system",
                content=(
                    "You are an expert at extracting information from scientific publications with a keen eye for details that when combined together allows you to summarize aspects of the publication"
                ),
            ),
            ChatMessage(
                role="user",
                content=(
                    "The llm model is {llm_model}. The publication in xml follows below:\n"
                    "------\n"
                    "{xml_content}\n"
                    "------"
                ),
            ),
        ]
    )

    program = OpenAIPydanticProgram.from_defaults(
        output_cls=LLMExtractor,
        llm=llm,
        prompt=prompt,
        verbose=True,
    )
    return program


def extract_with_llm(xml_content: bytes, llm: LLM) -> LLMExtractor:
    program = get_program(llm=llm)
    return program(xml_content=xml_content, llm_model=llm.model)


def llm_metric_extraction(
    xml_content: bytes,
    llm_model: str,
):
    return extract_with_llm(xml_content, LLM_MODELS[llm_model])


@app.post("/extract-metrics/", response_model=LLMExtractor)
async def extract_metrics(
    file: UploadFile = File(...), llm_model: str = Query("other")
):
    try:
        xml_content = await file.read()
        if not xml_content:
            raise NotImplementedError(
                """For now the XML content must be provided. Check the output of
                the parsing stage."""
            )
        metrics = llm_metric_extraction(xml_content, llm_model)
        logger.info(metrics)
        return metrics

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
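For context, a minimal client sketch (not part of this commit) of how the new /extract-metrics/ endpoint could be exercised once the container is running. The host, port (8071, taken from the Dockerfile CMD above), and the filename publication.xml are illustrative assumptions only.

import requests

# Illustrative only: host, port, and filename are assumptions, not part of the commit.
with open("publication.xml", "rb") as f:
    response = requests.post(
        "http://localhost:8071/extract-metrics/",
        files={"file": ("publication.xml", f, "application/xml")},
        params={"llm_model": "gpt-4o-2024-08-06"},
    )

response.raise_for_status()
print(response.json())  # fields defined by osm.schemas.metrics_schemas.LLMExtractor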
47 changes: 47 additions & 0 deletions external_components/llm_extraction/environment.yaml
@@ -0,0 +1,47 @@
name: osm
channels:
  - conda-forge
  - nodefaults
dependencies:
  - fastapi
  - lxml
  - pandas
  - pip
  - psutil
  - python
  - requests
  - rpy2
  - uvicorn
  # Dependencies for rtransparent
  - r-crul
  - r-devtools
  - r-dplyr
  - r-furrr
  - r-future
  - r-globals
  - r-hoardr
  - r-httpcode
  - r-lazyeval
  - r-lubridate
  - r-magrittr
  - r-pbapply
  - r-pdftools
  - r-plyr
  - r-purrr
  - r-qpdf
  - r-readr
  # - r-rentrez
  - r-rlang
  - r-stringr
  - r-tibble
  - r-tidyr
  - r-tidyselect
  - r-timechange
  - r-tokenizers
  - r-triebeard
  - r-urltools
  - r-utf8
  # - r-XML
  - r-xml2
  - pip:
      - metapub
22 changes: 22 additions & 0 deletions osm/pipeline/extractors.py
@@ -38,6 +38,28 @@ def _run(self, data: bytes, parser: str = None) -> dict:
            response.raise_for_status()


class LLMExtractor(Component):
    def _run(self, data: bytes, llm_model: str = None) -> dict:
        self.sample = LongBytes(data)

        # Prepare the file to be sent as a part of form data
        files = {"file": ("input.xml", io.BytesIO(data), "application/xml")}

        # Send the request with the file
        response = requests.post(
            "http://localhost:8072/extract-metrics/",
            files=files,
            params={"llm_model": llm_model},
        )

        if response.status_code == 200:
            metrics = response.json()
            return metrics
        else:
            logger.error(f"Error: {response.text}")
            response.raise_for_status()


# import psutil
# # Adjust the logging level for rpy2
# rpy2_logger = logging.getLogger("rpy2")
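Likewise, a hedged sketch (not part of this commit) of driving the new pipeline component directly. The Component base class in osm.pipeline.core is not shown in this diff, so the zero-argument constructor and the direct call to _run are assumptions; in the pipeline the hook is presumably invoked by the framework rather than by hand.

from osm.pipeline.extractors import LLMExtractor

# Assumptions: LLMExtractor() needs no constructor arguments and _run can be
# called directly; the diff only shows the _run hook itself.
with open("publication.xml", "rb") as f:
    xml_bytes = f.read()

extractor = LLMExtractor()
metrics = extractor._run(xml_bytes, llm_model="gpt-4o-2024-08-06")
print(metrics)  # dict parsed from the extraction service response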
