Skip to content

Commit

Permalink
refactor out docs module
Browse files Browse the repository at this point in the history
  • Loading branch information
Quantisan committed Sep 13, 2023
1 parent 1ee1dfa commit 018b59b
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 64 deletions.
29 changes: 29 additions & 0 deletions backend/mind_palace/docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
import grobid_tei_xml


def load_tei_xml(file_path):
    """Read a TEI XML file from disk and parse it with ``grobid_tei_xml``.

    Args:
        file_path: Path of the TEI XML file to load.

    Returns:
        The object returned by ``grobid_tei_xml.parse_document_xml`` for the
        file's full text content.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    print(f"Loading {file_path}")
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which differs between machines and can corrupt or reject
    # non-ASCII characters in paper titles and abstracts.
    # NOTE(review): assumes the TEI files are UTF-8 encoded (the usual GROBID
    # output) -- confirm if files from other sources are ever loaded.
    with open(file_path, "r", encoding="utf-8") as xml_file:
        return grobid_tei_xml.parse_document_xml(xml_file.read())


def title(xml, doc_id):
    """Build the title node of a parsed document.

    The node text is ``xml.header.title`` and the node id is ``doc_id``
    followed by the suffix ``-title``.
    """
    heading = xml.header.title
    node_id = f"{doc_id}-title"
    return TextNode(text=heading, id_=node_id)


def abstract(xml, doc_id):
    """Build the abstract node of a parsed document.

    The node text is ``xml.abstract`` and the node id is ``doc_id`` followed
    by the suffix ``-abstract``.
    """
    summary = xml.abstract
    node_id = f"{doc_id}-abstract"
    return TextNode(text=summary, id_=node_id)


def set_relationships(title_node, abstract_node):
    """Register the title node as the PARENT of the abstract node.

    Mutates ``abstract_node.relationships`` in place and returns ``None``.
    """
    parent_ref = RelatedNodeInfo(node_id=title_node.node_id)
    abstract_node.relationships[NodeRelationship.PARENT] = parent_ref
46 changes: 9 additions & 37 deletions backend/mind_palace/extract.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
import llama_index as li
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
import grobid_tei_xml
from llama_index.schema import TextNode

import docs


def load_pdfs():
Expand All @@ -28,45 +29,18 @@ def nodes(documents, service_context=li.ServiceContext.from_defaults()):
# 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template',
# 'metadata_seperator'])

# TODO: move these into a submodule


def _load_tei_xml(filepath):
with open(filepath, "r") as xml_file:
return grobid_tei_xml.parse_document_xml(xml_file.read())


def title(xml, doc_id):
return TextNode(
text=xml.header.title,
id_=f"{doc_id}-title",
)


def abstract(xml, doc_id):
return TextNode(
text=xml.abstract,
id_=f"{doc_id}-abstract",
)


def set_relationships(title_node, abstract_node):
abstract_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(
node_id=title_node.node_id
)
return


def _gen_document_dict(xml) -> dict[str, TextNode]:
def _gen_document_dict(file_path) -> dict[str, TextNode]:
xml = docs.load_tei_xml(file_path)
doi = xml.header.doi
assert doi is not None

try:
title_node = title(xml, doi)
abstract_node = abstract(xml, doi)
title_node = docs.title(xml, doi)
abstract_node = docs.abstract(xml, doi)
# TODO: load more sections

set_relationships(title_node, abstract_node)
docs.set_relationships(title_node, abstract_node)
return {"title": title_node, "abstract": abstract_node}
except Exception as e:
print(f"failed to load DOI {doi} because {e}")
Expand All @@ -87,9 +61,7 @@ def seed_nodes(input_dir) -> list[TextNode]:
file_paths = _get_file_paths(input_dir)

for file_path in file_paths:
print(f"loading {file_path}")
xml_data = _load_tei_xml(file_path)
nodes_dict = _gen_document_dict(xml_data)
nodes_dict = _gen_document_dict(file_path)
if nodes_dict:
for node in nodes_dict.values():
nodes.append(node)
Expand Down
1 change: 1 addition & 0 deletions backend/tests/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
)

import extract
import docs
23 changes: 23 additions & 0 deletions backend/tests/test_docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from .context import docs
import grobid_tei_xml.types as grobid_types
from llama_index.schema import TextNode, NodeRelationship

XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/"


def test_node_relationships():
    """set_relationships returns None and links the abstract to the title."""
    parent = TextNode(text="this is title")
    child = TextNode(text="this is abstract")

    outcome = docs.set_relationships(parent, child)
    assert outcome is None

    # The abstract's PARENT relationship must point at the title node.
    linked = child.relationships[NodeRelationship.PARENT]
    assert linked.node_id == parent.node_id


def test_load_tei_xml():
    """Loading a sample TEI XML file yields a GrobidDocument instance."""
    sample_name = "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
    parsed = docs.load_tei_xml(XML_PATH + sample_name)
    assert isinstance(parsed, grobid_types.GrobidDocument)
31 changes: 4 additions & 27 deletions backend/tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,18 @@
from .context import extract
import grobid_tei_xml.types as grobid_types
from llama_index.schema import TextNode, NodeRelationship

XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/"


def test_node_relationships():
title_node = TextNode(text="this is title")
abstract_node = TextNode(text="this is abstract")
assert extract.set_relationships(title_node, abstract_node) is None
assert (
abstract_node.relationships[NodeRelationship.PARENT].node_id
== title_node.node_id
)


def test_load_tei_xml():
xml = extract._load_tei_xml(
XML_PATH
+ "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
)
assert isinstance(xml, grobid_types.GrobidDocument)
from . import test_docs


def test_gen_document_dict():
xml = extract._load_tei_xml(
XML_PATH
nodes_dict = extract._gen_document_dict(
test_docs.XML_PATH
+ "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
)
nodes_dict = extract._gen_document_dict(xml)
for node in nodes_dict.values():
assert isinstance(node, extract.TextNode)


def test_seed_nodes():
input_dir = XML_PATH
nodes = extract.seed_nodes(input_dir)
nodes = extract.seed_nodes(test_docs.XML_PATH)
assert isinstance(nodes, list)
for node in nodes:
assert isinstance(node, extract.TextNode)

0 comments on commit 018b59b

Please sign in to comment.