Merge pull request #25 from TheDataGuild/refactor/docs-module

Refactor/docs module
TheDataGuild · Sep 13, 2023 · 9beb969 · 9beb969
2 parents 1ee1dfa + b8c769b
commit 9beb969
Show file tree

Hide file tree

Showing 31 changed files with 66 additions and 90 deletions.
diff --git a/backend/mind_palace/__init__.py b/backend/mind_palace/__init__.py
diff --git a/backend/mind_palace/docs.py b/backend/mind_palace/docs.py
@@ -0,0 +1,29 @@
+from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
+import grobid_tei_xml
+
+
+def load_tei_xml(file_path):
+    print(f"Loading {file_path}")
+    with open(file_path, "r") as xml_file:
+        return grobid_tei_xml.parse_document_xml(xml_file.read())
+
+
+def title(xml, doc_id):
+    return TextNode(
+        text=xml.header.title,
+        id_=f"{doc_id}-title",
+    )
+
+
+def abstract(xml, doc_id):
+    return TextNode(
+        text=xml.abstract,
+        id_=f"{doc_id}-abstract",
+    )
+
+
+def set_relationships(title_node, abstract_node):
+    abstract_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(
+        node_id=title_node.node_id
+    )
+    return
diff --git a/backend/mind_palace/extract.py b/backend/mind_palace/extract.py
@@ -1,72 +1,20 @@
 import os
-import llama_index as li
-from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
-import grobid_tei_xml
+from llama_index.schema import TextNode
 
+import docs
 
-def load_pdfs():
-    required_exts = [".pdf"]
 
-    # Q: we could use a better text extractor supporting section-aware extraction
-    return li.SimpleDirectoryReader(
-        input_dir="./resources/pdfs/12-pdfs-from-steve-aug-22",
-        required_exts=required_exts,
-    ).load_data()
-
-
-def nodes(documents, service_context=li.ServiceContext.from_defaults()):
-    return service_context.node_parser.get_nodes_from_documents(
-        documents, show_progress=True
-    )
-
-
-# a Node from load_pdfs standard pipeline
-#
-# >>> nodes[0].__dict__.keys()
-# dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys',
-# 'excluded_llm_metadata_keys', 'relationships', 'hash', 'text',
-# 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template',
-# 'metadata_seperator'])
-
-# TODO: move these into a submodule
-
-
-def _load_tei_xml(filepath):
-    with open(filepath, "r") as xml_file:
-        return grobid_tei_xml.parse_document_xml(xml_file.read())
-
-
-def title(xml, doc_id):
-    return TextNode(
-        text=xml.header.title,
-        id_=f"{doc_id}-title",
-    )
-
-
-def abstract(xml, doc_id):
-    return TextNode(
-        text=xml.abstract,
-        id_=f"{doc_id}-abstract",
-    )
-
-
-def set_relationships(title_node, abstract_node):
-    abstract_node.relationships[NodeRelationship.PARENT] = RelatedNodeInfo(
-        node_id=title_node.node_id
-    )
-    return
-
-
-def _gen_document_dict(xml) -> dict[str, TextNode]:
+def _gen_document_dict(file_path) -> dict[str, TextNode]:
+    xml = docs.load_tei_xml(file_path)
     doi = xml.header.doi
     assert doi is not None
 
     try:
-        title_node = title(xml, doi)
-        abstract_node = abstract(xml, doi)
+        title_node = docs.title(xml, doi)
+        abstract_node = docs.abstract(xml, doi)
         # TODO: load more sections
 
-        set_relationships(title_node, abstract_node)
+        docs.set_relationships(title_node, abstract_node)
         return {"title": title_node, "abstract": abstract_node}
     except Exception as e:
         print(f"failed to load DOI {doi} because {e}")
@@ -87,9 +35,7 @@ def seed_nodes(input_dir) -> list[TextNode]:
     file_paths = _get_file_paths(input_dir)
 
     for file_path in file_paths:
-        print(f"loading {file_path}")
-        xml_data = _load_tei_xml(file_path)
-        nodes_dict = _gen_document_dict(xml_data)
+        nodes_dict = _gen_document_dict(file_path)
         if nodes_dict:
             for node in nodes_dict.values():
                 nodes.append(node)

diff --git a/backend/mind_palace/repl.py b/backend/mind_palace/repl.py
@@ -4,7 +4,7 @@
 
 
 # PDF Extraction
-input_dir = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml"
+input_dir = "./resources/xmls/12-pdfs-from-steve-aug-22/"
 nodes = extract.seed_nodes(input_dir)
 
 

diff --git a/...sating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf b/...sating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf
diff --git a/...ound Med Biol_Sonoporation as a Cellular Stress Induction of Morphological Repression.pdf b/...ound Med Biol_Sonoporation as a Cellular Stress Induction of Morphological Repression.pdf
diff --git a/...from-steve-aug-22/2013_Ultrasound_in_Medicine_and_Biol_Hu_ Sonoporation_Pore_Dynamics.pdf b/...from-steve-aug-22/2013_Ultrasound_in_Medicine_and_Biol_Hu_ Sonoporation_Pore_Dynamics.pdf
diff --git a/...om-steve-aug-22/2014_J_Royal_Soc_Interface_Yu_Single-site_sonoporation_disrupts_actin.pdf b/...om-steve-aug-22/2014_J_Royal_Soc_Interface_Yu_Single-site_sonoporation_disrupts_actin.pdf
diff --git a/...dfs-from-steve-aug-22/2015_Royal_Society_Interface_Yu_Membrane blebbing as a recovery.pdf b/...dfs-from-steve-aug-22/2015_Royal_Society_Interface_Yu_Membrane blebbing as a recovery.pdf
diff --git a/...ed Release_Mechanistic understanding the bioeffects of ultrasound-driven microbubbles.pdf b/...ed Release_Mechanistic understanding the bioeffects of ultrasound-driven microbubbles.pdf
diff --git a/...nsity Pulsed Ultrasound and Sonoporation_ Platform Design and Flow Cytometry Protocol.pdf b/...nsity Pulsed Ultrasound and Sonoporation_ Platform Design and Flow Cytometry Protocol.pdf
diff --git a/...ics_Evaluation of the Properties of Daughter Bubbles Generated by Inertial Cavitation.pdf b/...ics_Evaluation of the Properties of Daughter Bubbles Generated by Inertial Cavitation.pdf
diff --git a/...urces/pdfs/12-pdfs-from-steve-aug-22/2020_Ultrasound in Medicine and BioIogy_Yu_Blebs.pdf b/...urces/pdfs/12-pdfs-from-steve-aug-22/2020_Ultrasound in Medicine and BioIogy_Yu_Blebs.pdf
diff --git a/.../pdfs/12-pdfs-from-steve-aug-22/2021_Nature Scientific Reports_Sonoporation generates.pdf b/.../pdfs/12-pdfs-from-steve-aug-22/2021_Nature Scientific Reports_Sonoporation generates.pdf
diff --git a/...g-22/2021_Scientific Reports Nature_Sonoporation generates downstream impact in vitro.pdf b/...g-22/2021_Scientific Reports Nature_Sonoporation generates downstream impact in vitro.pdf
diff --git a/...-aug-22/2022_BME Frontiers_Ultrasound-Mediated Drug Delivery Sonoporation Mechanisms,.pdf b/...-aug-22/2022_BME Frontiers_Ultrasound-Mediated Drug Delivery Sonoporation Mechanisms,.pdf
diff --git a/...Single-Cell Membrane Poration.pdf.tei.xml → ...Single-Cell Membrane Poration.pdf.tei.xml b/...Single-Cell Membrane Poration.pdf.tei.xml → ...Single-Cell Membrane Poration.pdf.tei.xml
diff --git a/...n of Morphological Repression.pdf.tei.xml → ...n of Morphological Repression.pdf.tei.xml b/...n of Morphological Repression.pdf.tei.xml → ...n of Morphological Repression.pdf.tei.xml
diff --git a/...u_ Sonoporation_Pore_Dynamics.pdf.tei.xml → ...u_ Sonoporation_Pore_Dynamics.pdf.tei.xml b/...u_ Sonoporation_Pore_Dynamics.pdf.tei.xml → ...u_ Sonoporation_Pore_Dynamics.pdf.tei.xml
diff --git a/...e_sonoporation_disrupts_actin.pdf.tei.xml → ...e_sonoporation_disrupts_actin.pdf.tei.xml b/...e_sonoporation_disrupts_actin.pdf.tei.xml → ...e_sonoporation_disrupts_actin.pdf.tei.xml
diff --git a/...mbrane blebbing as a recovery.pdf.tei.xml → ...mbrane blebbing as a recovery.pdf.tei.xml b/...mbrane blebbing as a recovery.pdf.tei.xml → ...mbrane blebbing as a recovery.pdf.tei.xml
diff --git a/...ltrasound-driven microbubbles.pdf.tei.xml → ...ltrasound-driven microbubbles.pdf.tei.xml b/...ltrasound-driven microbubbles.pdf.tei.xml → ...ltrasound-driven microbubbles.pdf.tei.xml
diff --git a/...n and Flow Cytometry Protocol.pdf.tei.xml → ...n and Flow Cytometry Protocol.pdf.tei.xml b/...n and Flow Cytometry Protocol.pdf.tei.xml → ...n and Flow Cytometry Protocol.pdf.tei.xml
diff --git a/...erated by Inertial Cavitation.pdf.tei.xml → ...erated by Inertial Cavitation.pdf.tei.xml b/...erated by Inertial Cavitation.pdf.tei.xml → ...erated by Inertial Cavitation.pdf.tei.xml
diff --git a/...Medicine and BioIogy_Yu_Blebs.pdf.tei.xml → ...Medicine and BioIogy_Yu_Blebs.pdf.tei.xml b/...Medicine and BioIogy_Yu_Blebs.pdf.tei.xml → ...Medicine and BioIogy_Yu_Blebs.pdf.tei.xml
diff --git a/...eports_Sonoporation generates.pdf.tei.xml → ...eports_Sonoporation generates.pdf.tei.xml b/...eports_Sonoporation generates.pdf.tei.xml → ...eports_Sonoporation generates.pdf.tei.xml
diff --git a/...es downstream impact in vitro.pdf.tei.xml → ...es downstream impact in vitro.pdf.tei.xml b/...es downstream impact in vitro.pdf.tei.xml → ...es downstream impact in vitro.pdf.tei.xml
diff --git a/...very Sonoporation Mechanisms,.pdf.tei.xml → ...very Sonoporation Mechanisms,.pdf.tei.xml b/...very Sonoporation Mechanisms,.pdf.tei.xml → ...very Sonoporation Mechanisms,.pdf.tei.xml
diff --git a/backend/tests/context.py b/backend/tests/context.py
@@ -6,3 +6,4 @@
 )
 
 import extract
+import docs
diff --git a/backend/tests/test_docs.py b/backend/tests/test_docs.py
@@ -0,0 +1,23 @@
+from .context import docs
+import grobid_tei_xml.types as grobid_types
+from llama_index.schema import TextNode, NodeRelationship
+
+XML_PATH = "./resources/xmls/12-pdfs-from-steve-aug-22/"
+
+
+def test_node_relationships():
+    title_node = TextNode(text="this is title")
+    abstract_node = TextNode(text="this is abstract")
+    assert docs.set_relationships(title_node, abstract_node) is None
+    assert (
+        abstract_node.relationships[NodeRelationship.PARENT].node_id
+        == title_node.node_id
+    )
+
+
+def test_load_tei_xml():
+    xml = docs.load_tei_xml(
+        XML_PATH
+        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
+    )
+    assert isinstance(xml, grobid_types.GrobidDocument)
diff --git a/backend/tests/test_extract.py b/backend/tests/test_extract.py
@@ -1,41 +1,18 @@
 from .context import extract
-import grobid_tei_xml.types as grobid_types
-from llama_index.schema import TextNode, NodeRelationship
-
-XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/"
-
-
-def test_node_relationships():
-    title_node = TextNode(text="this is title")
-    abstract_node = TextNode(text="this is abstract")
-    assert extract.set_relationships(title_node, abstract_node) is None
-    assert (
-        abstract_node.relationships[NodeRelationship.PARENT].node_id
-        == title_node.node_id
-    )
-
-
-def test_load_tei_xml():
-    xml = extract._load_tei_xml(
-        XML_PATH
-        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
-    )
-    assert isinstance(xml, grobid_types.GrobidDocument)
+from . import test_docs
 
 
 def test_gen_document_dict():
-    xml = extract._load_tei_xml(
-        XML_PATH
+    nodes_dict = extract._gen_document_dict(
+        test_docs.XML_PATH
         + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
     )
-    nodes_dict = extract._gen_document_dict(xml)
     for node in nodes_dict.values():
         assert isinstance(node, extract.TextNode)
 
 
 def test_seed_nodes():
-    input_dir = XML_PATH
-    nodes = extract.seed_nodes(input_dir)
+    nodes = extract.seed_nodes(test_docs.XML_PATH)
     assert isinstance(nodes, list)
     for node in nodes:
         assert isinstance(node, extract.TextNode)