def cite_authors(xml):
    """Return a short author citation for a parsed TEI document.

    The citation is built from the first entry of ``xml.header.authors`` as
    ``"Surname, Given"``. It is followed by ``", et al."`` when the document
    lists more than one author, and by ``"."`` otherwise.

    Args:
        xml: Parsed document exposing ``header.authors``, a sequence of
            author objects with ``surname`` and ``given_name`` attributes
            (and optionally ``full_name``).

    Returns:
        The citation string, or ``""`` when the document has no authors or
        the first author carries no usable name.
    """
    authors = xml.header.authors
    # Documents without extracted authors used to raise IndexError on
    # ``authors[0]``; an empty citation is the safe, string-typed answer.
    if not authors:
        return ""

    first_author = authors[0]
    # Drop missing parts so a lone surname does not render as "Doe, None".
    name_parts = [
        part for part in (first_author.surname, first_author.given_name) if part
    ]
    display_name = ", ".join(name_parts)
    if not display_name:
        # Neither structured part is present: fall back to the unsplit name,
        # if the author object provides one.
        display_name = getattr(first_author, "full_name", None) or ""
    if not display_name:
        return ""

    suffix = ", et al." if len(authors) > 1 else "."
    return display_name + suffix