Skip to content

Commit

Permalink
refactor out title(), abstract(), and set_relationships()
Browse files Browse the repository at this point in the history
  • Loading branch information
Quantisan committed Sep 13, 2023
1 parent 6bc6b0f commit d58c882
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 12 deletions.
37 changes: 25 additions & 12 deletions backend/mind_palace/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,25 +34,38 @@ def _load_tei_xml(filepath):
return grobid_tei_xml.parse_document_xml(xml_file.read())


def title(xml, doc_id):
    """Create the text node holding a document's title.

    Args:
        xml: Parsed document object; only ``xml.header.title`` is read.
        doc_id: Identifier used as the prefix of the node id.

    Returns:
        A ``TextNode`` whose text is the header title and whose id is
        ``"<doc_id>-title"``.
    """
    title_text = xml.header.title
    node_id = f"{doc_id}-title"
    return TextNode(text=title_text, id_=node_id)


def abstract(xml, doc_id):
    """Create the text node holding a document's abstract.

    Args:
        xml: Parsed document object; only ``xml.abstract`` is read.
        doc_id: Identifier used as the prefix of the node id.

    Returns:
        A ``TextNode`` whose text is the abstract and whose id is
        ``"<doc_id>-abstract"``.
    """
    abstract_text = xml.abstract
    node_id = f"{doc_id}-abstract"
    return TextNode(text=abstract_text, id_=node_id)


def set_relationships(title_node, abstract_node):
    """Register the title node as the parent of the abstract node.

    Only ``abstract_node.relationships`` is modified (in place); the title
    node is just read for its ``node_id``.

    Args:
        title_node: Node that becomes the parent.
        abstract_node: Node that receives the ``PARENT`` relationship entry.

    Returns:
        None.
    """
    parent_info = RelatedNodeInfo(node_id=title_node.node_id)
    abstract_node.relationships[NodeRelationship.PARENT] = parent_info


def _gen_document_dict(xml) -> dict[str, TextNode]:
    """Build the title and abstract nodes for one parsed document.

    The document's DOI (``xml.header.doi``) is used as the id prefix for both
    nodes, and the abstract node is linked to the title node as its parent.

    Args:
        xml: Parsed document exposing ``header.doi``, ``header.title`` and
            ``abstract`` (presumably the value returned by
            ``_load_tei_xml`` -- confirm against the caller).

    Returns:
        ``{"title": <title node>, "abstract": <abstract node>}`` on success,
        or an empty dict if building or linking the nodes raised.

    Raises:
        AssertionError: If the document has no DOI (only while assertions
            are enabled).
    """
    doi = xml.header.doi
    assert doi is not None

    try:
        # Build each node once via the shared helpers; the previous inline
        # construction duplicated this work and returned before the
        # relationship helper could run.
        title_node = title(xml, doi)
        abstract_node = abstract(xml, doi)
        # TODO: load more sections

        set_relationships(title_node, abstract_node)
        return {"title": title_node, "abstract": abstract_node}
    except Exception as e:
        # Best-effort: report the failure and return an empty dict instead
        # of propagating the error.
        print(f"failed to load DOI {doi} because {e}")
        return {}
Expand Down
11 changes: 11 additions & 0 deletions backend/tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,20 @@
from .context import extract
import grobid_tei_xml.types as grobid_types
from llama_index.schema import TextNode, NodeRelationship

XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/"


def test_node_relationships():
    """set_relationships returns None and links the abstract to the title."""
    # Arrange: two standalone nodes with auto-generated ids.
    parent = TextNode(text="this is title")
    child = TextNode(text="this is abstract")

    # Act
    outcome = extract.set_relationships(parent, child)

    # Assert: nothing is returned and the PARENT entry points at the title.
    assert outcome is None
    parent_link = child.relationships[NodeRelationship.PARENT]
    assert parent_link.node_id == parent.node_id


def test_load_tei_xml():
xml = extract._load_tei_xml(
XML_PATH
Expand Down

0 comments on commit d58c882

Please sign in to comment.