Skip to content

Commit

Permalink
move towards more generic
Browse files Browse the repository at this point in the history
  • Loading branch information
leej3 committed Aug 28, 2024
1 parent 041a521 commit c30ea88
Showing 1 changed file with 49 additions and 31 deletions.
80 changes: 49 additions & 31 deletions external_components/chat_gpt_4o_2024_08_06/extract_2.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
from llama_index.core import ChatPromptTemplate
from llama_index.core.llms import ChatMessage
from llama_index.core.llms import LLM, ChatMessage
from llama_index.llms.openai import OpenAI
from llama_index.program.openai import OpenAIPydanticProgram
from pydantic import BaseModel, Field

llm = OpenAI(model="gpt-4o-2024-08-06")

# response = llm.complete(
# "Generate a sales call transcript, use real names, talk about a product, discuss some action items"
# )


class GPT_4o_2024_08_06(BaseModel):
class LLMExtractor(BaseModel):
"""
Model for extracting information from scientific publications. These metrics
are a summary of the publications adherence to transparent or open
Expand Down Expand Up @@ -48,19 +44,28 @@ class GPT_4o_2024_08_06(BaseModel):
is_open_code: bool = Field(
description="Whether there is evidence that the code used for analysis in the paper has been shared online",
)
code_sharing_statement: str = Field(
code_sharing_statement: list[str] = Field(
description="The statement in the paper that indicates whether the code used for analysis has been shared online",
)
is_open_data: bool = Field(
description="Whether there is evidence that the data used for analysis in the paper has been shared online",
)
data_sharing_statement: str = Field(
data_sharing_statement: list[str] = Field(
description="The statement in the paper that indicates whether the data used for analysis has been shared online",
)
data_repository_url: str = Field(
description="The URL of the repository where the data can be found"
)
dataset_unique_identifier: list[str] = Field(
description="Any unique identifiers the dataset may have"
)
code_repository_url: str = Field(
description="The URL of the repository where the code and data can be found"
)
has_coi_statement: bool = Field(
description="Whether there is a conflict of interest statement in the paper",
)
coi_statement: str = Field(
coi_statement: list[str] = Field(
description="The conflict of interest statement in the paper"
)
funder: list[str] = Field(
Expand All @@ -69,37 +74,50 @@ class GPT_4o_2024_08_06(BaseModel):
has_funding_statement: bool = Field(
description="Whether there is a funding statement in the paper"
)
funding_statement: str = Field(description="The funding statement in the paper")
funding_statement: list[str] = Field(
description="The funding statement in the paper"
)
has_registration_statement: bool = Field(
description="Whether there is a registration statement in the paper",
)
registration_statement: str = Field(
registration_statement: list[str] = Field(
description="The registration statement in the paper"
)
reasoning_steps: list[str] = Field(
description="The reasoning steps used to extract the information from the paper",
)


prompt = ChatPromptTemplate(
message_templates=[
ChatMessage(
role="system",
content=(
"You are an expert at extracting information from scientific publications with a keen eye for details that when combined together allows you to summarize aspects of the publication"
def get_program(llm: LLM) -> OpenAIPydanticProgram:
    """Build the pydantic-extraction program for scientific publications.

    Constructs a chat prompt (system persona + user message carrying the
    publication XML) and wires it to ``OpenAIPydanticProgram`` so the LLM's
    structured output is parsed into an ``LLMExtractor`` instance.

    Parameters
    ----------
    llm : LLM
        The llama-index LLM that will execute the extraction.

    Returns
    -------
    OpenAIPydanticProgram
        A callable program; invoke with ``xml_content`` and ``llm_model``
        keyword arguments (they fill the template variables below).
    """
    prompt = ChatPromptTemplate(
        message_templates=[
            ChatMessage(
                role="system",
                content=(
                    "You are an expert at extracting information from scientific publications with a keen eye for details that when combined together allows you to summarize aspects of the publication"
                ),
            ),
            ChatMessage(
                role="user",
                content=(
                    # {llm_model} and {xml_content} are template variables
                    # substituted when the returned program is called.
                    "The llm model is {llm_model}. The publication in xml follows below:\n"
                    "------\n"
                    "{xml_content}\n"
                    "------"
                ),
            ),
        ]
    )
    # verbose=True prints the raw function-calling payloads, useful while
    # iterating on the prompt/schema.
    program = OpenAIPydanticProgram.from_defaults(
        output_cls=LLMExtractor,
        llm=llm,
        prompt=prompt,
        verbose=True,
    )
    return program


def extract_with_llm(xml_content: bytes, llm: LLM) -> LLMExtractor:
    """Extract transparency metrics from a publication's XML using *llm*.

    Builds the extraction program for the given LLM and runs it over
    *xml_content*, returning the parsed ``LLMExtractor`` result.

    NOTE(review): assumes ``llm`` exposes a ``model`` attribute (true for
    the OpenAI LLM used here) — confirm for other ``LLM`` subclasses.
    """
    extraction_program = get_program(llm=llm)
    result = extraction_program(xml_content=xml_content, llm_model=llm.model)
    return result

0 comments on commit c30ea88

Please sign in to comment.