From 36967c1e9d1d9b075a539d630f3bf612d90bb0b3 Mon Sep 17 00:00:00 2001 From: leej3 Date: Wed, 28 Aug 2024 15:04:42 +0100 Subject: [PATCH] try approach not limited by tokens --- .../chat_gpt_4o_2024_08_06/extract.py | 45 +++------ .../chat_gpt_4o_2024_08_06/extract_2.py | 98 +++++++++++++++++++ 2 files changed, 111 insertions(+), 32 deletions(-) create mode 100644 external_components/chat_gpt_4o_2024_08_06/extract_2.py diff --git a/external_components/chat_gpt_4o_2024_08_06/extract.py b/external_components/chat_gpt_4o_2024_08_06/extract.py index 6245fa77..d97911f2 100644 --- a/external_components/chat_gpt_4o_2024_08_06/extract.py +++ b/external_components/chat_gpt_4o_2024_08_06/extract.py @@ -1,6 +1,5 @@ from typing import Optional -import openai from odmantic import EmbeddedModel, Field from openai import OpenAI @@ -9,76 +8,60 @@ class GPT_4o_2024_08_06(EmbeddedModel): year: Optional[int] = Field( - default=None, description="Best attempt at extracting the year of the publication", ) journal: Optional[str] = Field( - default=None, description="The journal in which the paper was published" + description="The journal in which the paper was published" ) article_type: list[str] = Field( - default=None, description="The type of article e.g. 
research article, review, erratum, meta-analysis etc.", ) affiliation_country: list[str] = Field( - default_factory=list, description="The countries of the affiliations of the authors", ) institute: list[str] = Field( - default_factory=list, description="The institutes of the affiliations of the authors", ) - doi: Optional[str] = Field(default=None, description="The DOI of the paper") - pmid: Optional[int] = Field(default=None, description="The PMID of the paper") - pmcid: Optional[str] = Field(default=None, description="The PMCID of the paper") - title: Optional[str] = Field(default=None, description="The title of the paper") - authors: list[str] = Field( - default_factory=list, description="The authors of the paper" - ) - publisher: Optional[str] = Field( - default=None, description="The publisher of the paper" - ) + doi: Optional[str] = Field(description="The DOI of the paper") + pmid: Optional[int] = Field(description="The PMID of the paper") + pmcid: Optional[str] = Field(description="The PMCID of the paper") + title: Optional[str] = Field(description="The title of the paper") + authors: list[str] = Field(description="The authors of the paper") + publisher: Optional[str] = Field(description="The publisher of the paper") is_open_code: Optional[bool] = Field( - default=None, description="Whether there is evidence that the code used for analysis in the paper has been shared online", ) code_sharing_statement: Optional[str] = Field( - default=None, description="The statement in the paper that indicates whether the code used for analysis has been shared online", ) is_open_data: Optional[bool] = Field( - default=None, description="Whether there is evidence that the data used for analysis in the paper has been shared online", ) data_sharing_statement: Optional[str] = Field( - default=None, description="The statement in the paper that indicates whether the data used for analysis has been shared online", ) has_coi_statement: Optional[bool] = Field( - default=None, 
description="Whether there is a conflict of interest statement in the paper", ) coi_statement: Optional[str] = Field( - default=None, description="The conflict of interest statement in the paper" + description="The conflict of interest statement in the paper" ) funder: list[str] = Field( - default_factory=list, description="The funders of the research, may contain multiple funders", ) has_funding_statement: Optional[bool] = Field( - default=None, description="Whether there is a funding statement in the paper" + description="Whether there is a funding statement in the paper" ) funding_statement: Optional[str] = Field( - default=None, description="The funding statement in the paper" + description="The funding statement in the paper" ) has_registration_statement: Optional[bool] = Field( - default=None, description="Whether there is a registration statement in the paper", ) registration_statement: Optional[str] = Field( - default=None, description="The registration statement in the paper" + description="The registration statement in the paper" ) reasoning_steps: list[str] = Field( - default_factory=list, description="The reasoning steps used to extract the information from the paper", ) @@ -93,11 +76,9 @@ def parse_xml_content(xml_content: bytes) -> GPT_4o_2024_08_06: }, { "role": "user", - "content": f'{xml_content.decode("utf-8")}', + "content": f'{xml_content.decode("utf-8")[:100]}', }, ], - tools=[ - openai.pydantic_function_tool(GPT_4o_2024_08_06.__pydantic_model__), - ], + response_format=GPT_4o_2024_08_06.__pydantic_model__, ) return completion.messages[-1].content diff --git a/external_components/chat_gpt_4o_2024_08_06/extract_2.py b/external_components/chat_gpt_4o_2024_08_06/extract_2.py new file mode 100644 index 00000000..b59ade5f --- /dev/null +++ b/external_components/chat_gpt_4o_2024_08_06/extract_2.py @@ -0,0 +1,98 @@ +from llama_index.core import ChatPromptTemplate +from llama_index.core.llms import ChatMessage +from llama_index.llms.openai import 
OpenAI +from llama_index.program.openai import OpenAIPydanticProgram +from pydantic import BaseModel, Field + +llm = OpenAI(model="gpt-4o-2024-08-06") + +# response = llm.complete( +# "Generate a sales call transcript, use real names, talk about a product, discuss some action items" +# ) + + +class GPT_4o_2024_08_06(BaseModel): + """ + Model for extracting information from scientific publications. These metrics + are a summary of the publications adherence to transparent or open + scientific practices. + Many unavailable identifiers (PMID, PMCID etc) can be found using pubmed: https://pubmed.ncbi.nlm.nih.gov/advanced/ + """ + + year: int = Field( + description="Best attempt at extracting the year of the publication", + ) + journal: str = Field(description="The journal in which the paper was published") + article_type: list[str] = Field( + description="The type of article e.g. research article, review, erratum, meta-analysis etc.", + ) + affiliation_country: list[str] = Field( + description="The countries of the affiliations of the authors", + ) + institute: list[str] = Field( + description="The institutes of the affiliations of the authors", + ) + doi: str = Field(description="The DOI of the paper") + pmid: int = Field(description="The PMID of the paper") + pmcid: str = Field(description="The PMCID of the paper") + title: str = Field(description="The title of the paper") + authors: list[str] = Field(description="The authors of the paper") + publisher: str = Field(description="The publisher of the paper") + is_open_code: bool = Field( + description="Whether there is evidence that the code used for analysis in the paper has been shared online", + ) + code_sharing_statement: str = Field( + description="The statement in the paper that indicates whether the code used for analysis has been shared online", + ) + is_open_data: bool = Field( + description="Whether there is evidence that the data used for analysis in the paper has been shared online", + ) + 
data_sharing_statement: str = Field( + description="The statement in the paper that indicates whether the data used for analysis has been shared online", + ) + has_coi_statement: bool = Field( + description="Whether there is a conflict of interest statement in the paper", + ) + coi_statement: str = Field( + description="The conflict of interest statement in the paper" + ) + funder: list[str] = Field( + description="The funders of the research, may contain multiple funders", + ) + has_funding_statement: bool = Field( + description="Whether there is a funding statement in the paper" + ) + funding_statement: str = Field(description="The funding statement in the paper") + has_registration_statement: bool = Field( + description="Whether there is a registration statement in the paper", + ) + registration_statement: str = Field( + description="The registration statement in the paper" + ) + reasoning_steps: list[str] = Field( + description="The reasoning steps used to extract the information from the paper", + ) + + +prompt = ChatPromptTemplate( + message_templates=[ + ChatMessage( + role="system", + content=( + "You are an expert at extracting information from scientific publications with a keen eye for details that when combined together allows you to summarize aspects of the publication" + ), + ), + ChatMessage( + role="user", + content=( + "Here is the transcript: \n" "------\n" "{xml_content}\n" "------" + ), + ), + ] +) +program = OpenAIPydanticProgram.from_defaults( + output_cls=GPT_4o_2024_08_06, + llm=llm, + prompt=prompt, + verbose=True, +)