refactor out docs module

TheDataGuild committed on Sep 13, 2023
commit 018b59b (1 parent: 1ee1dfa)
Show file tree

Hide file tree

Showing 5 changed files with 66 additions and 64 deletions.
diff --git a/backend/mind_palace/docs.py b/backend/mind_palace/docs.py
@@ -0,0 +1,29 @@
+from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
+import grobid_tei_xml
+
+
+def load_tei_xml(file_path):
+    print(f"Loading {file_path}")
+    with open(file_path, "r") as xml_file:
+        return grobid_tei_xml.parse_document_xml(xml_file.read())
+
+
+def title(xml, doc_id):
+    return TextNode(
+        text=xml.header.title,
+        id_=f"{doc_id}-title",
+    )
+
+
+def abstract(xml, doc_id):
+    return TextNode(
+        text=xml.abstract,
+        id_=f"{doc_id}-abstract",
+    )
+
+
+def set_relationships(title_node, abstract_node):
+    abstract_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(
+        node_id=title_node.node_id
+    )
+    return
diff --git a/backend/mind_palace/extract.py b/backend/mind_palace/extract.py
@@ -1,7 +1,8 @@
 import os
 import llama_index as li
-from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
-import grobid_tei_xml
+from llama_index.schema import TextNode
+
+import docs
 
 
 def load_pdfs():
@@ -28,45 +29,18 @@ def nodes(documents, service_context=li.ServiceContext.from_defaults()):
 # 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template',
 # 'metadata_seperator'])
 
-# TODO: move these into a submodule
-
-
-def _load_tei_xml(filepath):
-    with open(filepath, "r") as xml_file:
-        return grobid_tei_xml.parse_document_xml(xml_file.read())
-
-
-def title(xml, doc_id):
-    return TextNode(
-        text=xml.header.title,
-        id_=f"{doc_id}-title",
-    )
-
-
-def abstract(xml, doc_id):
-    return TextNode(
-        text=xml.abstract,
-        id_=f"{doc_id}-abstract",
-    )
-
-
-def set_relationships(title_node, abstract_node):
-    abstract_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(
-        node_id=title_node.node_id
-    )
-    return
-
 
-def _gen_document_dict(xml) -> dict[str, TextNode]:
+def _gen_document_dict(file_path) -> dict[str, TextNode]:
+    xml = docs.load_tei_xml(file_path)
     doi = xml.header.doi
     assert doi is not None
 
     try:
-        title_node = title(xml, doi)
-        abstract_node = abstract(xml, doi)
+        title_node = docs.title(xml, doi)
+        abstract_node = docs.abstract(xml, doi)
         # TODO: load more sections
 
-        set_relationships(title_node, abstract_node)
+        docs.set_relationships(title_node, abstract_node)
         return {"title": title_node, "abstract": abstract_node}
     except Exception as e:
         print(f"failed to load DOI {doi} because {e}")
@@ -87,9 +61,7 @@ def seed_nodes(input_dir) -> list[TextNode]:
     file_paths = _get_file_paths(input_dir)
 
     for file_path in file_paths:
-        print(f"loading {file_path}")
-        xml_data = _load_tei_xml(file_path)
-        nodes_dict = _gen_document_dict(xml_data)
+        nodes_dict = _gen_document_dict(file_path)
         if nodes_dict:
             for node in nodes_dict.values():
                 nodes.append(node)

diff --git a/backend/tests/context.py b/backend/tests/context.py
@@ -6,3 +6,4 @@
 )
 
 import extract
+import docs
diff --git a/backend/tests/test_docs.py b/backend/tests/test_docs.py
@@ -0,0 +1,23 @@
+from .context import docs
+import grobid_tei_xml.types as grobid_types
+from llama_index.schema import TextNode, NodeRelationship
+
+XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/"
+
+
+def test_node_relationships():
+    title_node = TextNode(text="this is title")
+    abstract_node = TextNode(text="this is abstract")
+    assert docs.set_relationships(title_node, abstract_node) is None
+    assert (
+        abstract_node.relationships[NodeRelationship.PARENT].node_id
+        == title_node.node_id
+    )
+
+
+def test_load_tei_xml():
+    xml = docs.load_tei_xml(
+        XML_PATH
+        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
+    )
+    assert isinstance(xml, grobid_types.GrobidDocument)
diff --git a/backend/tests/test_extract.py b/backend/tests/test_extract.py
@@ -1,41 +1,18 @@
 from .context import extract
-import grobid_tei_xml.types as grobid_types
-from llama_index.schema import TextNode, NodeRelationship
-
-XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/"
-
-
-def test_node_relationships():
-    title_node = TextNode(text="this is title")
-    abstract_node = TextNode(text="this is abstract")
-    assert extract.set_relationships(title_node, abstract_node) is None
-    assert (
-        abstract_node.relationships[NodeRelationship.PARENT].node_id
-        == title_node.node_id
-    )
-
-
-def test_load_tei_xml():
-    xml = extract._load_tei_xml(
-        XML_PATH
-        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
-    )
-    assert isinstance(xml, grobid_types.GrobidDocument)
+from . import test_docs
 
 
 def test_gen_document_dict():
-    xml = extract._load_tei_xml(
-        XML_PATH
+    nodes_dict = extract._gen_document_dict(
+        test_docs.XML_PATH
         + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
     )
-    nodes_dict = extract._gen_document_dict(xml)
     for node in nodes_dict.values():
         assert isinstance(node, extract.TextNode)
 
 
 def test_seed_nodes():
-    input_dir = XML_PATH
-    nodes = extract.seed_nodes(input_dir)
+    nodes = extract.seed_nodes(test_docs.XML_PATH)
     assert isinstance(nodes, list)
     for node in nodes:
         assert isinstance(node, extract.TextNode)