diff --git a/mind_palace/app.py b/mind_palace/app.py
index e5e7423..5423d3b 100644
--- a/mind_palace/app.py
+++ b/mind_palace/app.py
@@ -56,7 +56,13 @@ def load_index(model):
 
     st.markdown("### Sources")
     for i, source_node in enumerate(response.source_nodes):
-        with st.expander(f"[{i + 1}] {source_node.node.node_id}"):
-            st.write(f"relevancy score: {source_node.score}")
+        node_metadata = source_node.node.metadata
+        # Fall back to the node id when a node carries no citation metadata.
+        label = node_metadata.get("citation", source_node.node.node_id)
+        with st.expander(f"[{i + 1}] {label}"):
+            # Only body paragraphs are tagged with a paragraph number.
+            paragraph_number = node_metadata.get("paragraph_number")
+            if paragraph_number:
+                st.write(f"paragraph number: {paragraph_number}")
             st.write("original text:")
             st.write(source_node.node.get_text().split(":", 1)[1])
diff --git a/mind_palace/docs.py b/mind_palace/docs.py
index a62fa5f..f942180 100644
--- a/mind_palace/docs.py
+++ b/mind_palace/docs.py
@@ -1,5 +1,5 @@
-from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
 import grobid_tei_xml
+from llama_index.schema import NodeRelationship, RelatedNodeInfo, TextNode
 
 
 def load_tei_xml(file_path):
@@ -8,6 +8,39 @@ def load_tei_xml(file_path):
         return grobid_tei_xml.parse_document_xml(xml_file.read())
 
 
+def cite_authors(xml):
+    """Return the first author as "Surname, Given name".
+
+    ", et al" is appended when the document has more than one author. An
+    empty string is returned when the header lists no authors at all.
+    """
+    authors = xml.header.authors
+    if not authors:
+        return ""
+    first_author = authors[0]
+    display_name = f"{first_author.surname}, {first_author.given_name}"
+    return f"{display_name}, et al" if len(authors) > 1 else display_name
+
+
+def cite_journal(xml):
+    """Return the journal, date, volume(issue) and DOI part of a citation."""
+    header = xml.header
+    return (
+        f"{header.journal_abbrev} {header.date};"
+        f"{header.volume}({header.issue}). doi:{header.doi}"
+    )
+
+
+def cite(xml):
+    """Return the full citation: authors, title and journal reference.
+
+    The author part is left out when the header lists no authors.
+    """
+    authors = cite_authors(xml)
+    reference = f"{xml.header.title}. {cite_journal(xml)}."
+    return f"{authors}. {reference}" if authors else reference
+
+
 def title(xml, doc_id):
     return TextNode(
         text=xml.header.title,
@@ -44,6 +77,7 @@ def body(xml, doc_id):
     return [
         TextNode(
             text=line,
+            metadata={"paragraph_number": index + 1},
             id_=f"{doc_id}-body-paragraph-{index}",
         )
         for index, line in enumerate(xml.body.split("\n"))
@@ -64,3 +98,21 @@ def set_relationships(title_node, abstract_node, body_nodes):
     set_prev_relationships(body_nodes)
     set_next_relationships(body_nodes)
     return
+
+
+def _set_citation_metadata(citation, node):
+    """Store ``citation`` under the "citation" key of the node's metadata."""
+    node.metadata["citation"] = citation
+
+
+def set_citations(xml, nodes):
+    """Attach the document's citation to every node in ``nodes``.
+
+    ``nodes`` may mix single nodes with lists of nodes (e.g. the body
+    paragraphs); nodes inside a list each receive the citation.
+    """
+    citation = cite(xml)
+    for node in nodes:
+        group = node if isinstance(node, list) else [node]
+        for member in group:
+            _set_citation_metadata(citation, member)
diff --git a/mind_palace/extract.py b/mind_palace/extract.py
index 79cd5db..7100c9b 100644
--- a/mind_palace/extract.py
+++ b/mind_palace/extract.py
@@ -1,7 +1,7 @@
 import os
-from llama_index.schema import TextNode
 
 import docs
+from llama_index.schema import TextNode
 
 
 def _gen_document_dict(file_path) -> dict[str, TextNode]:
@@ -15,6 +15,8 @@ def _gen_document_dict(file_path) -> dict[str, TextNode]:
     body_nodes = docs.body(xml, doi)
     docs.set_relationships(title_node, abstract_node, body_nodes)
 
+    docs.set_citations(xml=xml, nodes=[title_node, abstract_node, body_nodes])
+
     return {
         "title": title_node,
         "abstract": abstract_node,
diff --git a/tests/unit/test_docs.py b/tests/unit/test_docs.py
index fc7547d..3d6d3d2 100644
--- a/tests/unit/test_docs.py
+++ b/tests/unit/test_docs.py
@@ -1,10 +1,50 @@
-from tests.context import docs
+from unittest.mock import MagicMock, patch
+
 import grobid_tei_xml.types as grobid_types
-from llama_index.schema import TextNode, NodeRelationship
+from llama_index.schema import NodeRelationship, TextNode
+
+from tests.context import docs
 
 XML_PATH = "./resources/xmls/12-pdfs-from-steve-aug-22/"
 
 
+def test_cite_authors():
+    xml = MagicMock()
+    xml.header.authors = [
+        MagicMock(surname="Doe", given_name="John"),
+        MagicMock(surname="Smith", given_name="Jane"),
+    ]
+    assert docs.cite_authors(xml) == "Doe, John, et al"
+
+    xml.header.authors = [MagicMock(surname="Doe", given_name="John")]
+    assert docs.cite_authors(xml) == "Doe, John"
+
+    xml.header.authors = []
+    assert docs.cite_authors(xml) == ""
+
+
+def test_cite():
+    xml = MagicMock()
+
+    with patch("docs.cite_authors", return_value="Doe, John"):
+        xml.header.title = "This is a title"
+        xml.header.journal_abbrev = "J. Abbrev."
+        xml.header.date = "2023"
+        xml.header.volume = "1"
+        xml.header.issue = "9"
+        xml.header.doi = "10.1234/1234"
+        assert (
+            docs.cite(xml)
+            == "Doe, John. This is a title. J. Abbrev. 2023;1(9). doi:10.1234/1234."
+        )
+
+    with patch("docs.cite_authors", return_value=""):
+        assert (
+            docs.cite(xml)
+            == "This is a title. J. Abbrev. 2023;1(9). doi:10.1234/1234."
+        )
+
+
 def gen_nodes():
     return [
         TextNode(text="this is first"),
@@ -63,6 +103,22 @@ def test_node_relationships():
     )
 
 
+def test_set_citations():
+    xml = docs.load_tei_xml(
+        XML_PATH
+        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
+    )
+    title_node = TextNode(text="this is title")
+    abstract_node = TextNode(text="this is abstract")
+    body_nodes = gen_nodes()
+    docs.set_citations(xml=xml, nodes=[title_node, abstract_node, body_nodes])
+    expected_citation = docs.cite(xml)
+    assert title_node.metadata["citation"] == expected_citation
+    assert abstract_node.metadata["citation"] == expected_citation
+    for body_node in body_nodes:
+        assert body_node.metadata["citation"] == expected_citation
+
+
 def test_load_tei_xml():
     xml = docs.load_tei_xml(
         XML_PATH