Skip to content

Commit

Permalink
Merge pull request #25 from TheDataGuild/refactor/docs-module
Browse files Browse the repository at this point in the history
Refactor/docs module
  • Loading branch information
Quantisan authored Sep 13, 2023
2 parents 1ee1dfa + b8c769b commit 9beb969
Show file tree
Hide file tree
Showing 31 changed files with 66 additions and 90 deletions.
Empty file added backend/mind_palace/__init__.py
Empty file.
29 changes: 29 additions & 0 deletions backend/mind_palace/docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
import grobid_tei_xml


def load_tei_xml(file_path):
    """Read a TEI XML file from disk and parse it with ``grobid_tei_xml``.

    Args:
        file_path: Path of the TEI XML file to open.

    Returns:
        The value returned by ``grobid_tei_xml.parse_document_xml`` for the
        file's full text.
    """
    print(f"Loading {file_path}")
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the same file parses identically on every machine.
    # NOTE(review): assumes the TEI files are UTF-8 encoded -- confirm.
    with open(file_path, "r", encoding="utf-8") as xml_file:
        return grobid_tei_xml.parse_document_xml(xml_file.read())


def title(xml, doc_id):
    """Build the title node of a parsed document.

    Args:
        xml: Parsed document exposing ``header.title``.
        doc_id: Identifier used as the prefix of the node id.

    Returns:
        A ``TextNode`` whose text is ``xml.header.title`` and whose id is
        ``doc_id`` followed by ``-title``.
    """
    title_text = xml.header.title
    node_id = f"{doc_id}-title"
    return TextNode(text=title_text, id_=node_id)


def abstract(xml, doc_id):
    """Build the abstract node of a parsed document.

    Args:
        xml: Parsed document exposing an ``abstract`` attribute.
        doc_id: Identifier used as the prefix of the node id.

    Returns:
        A ``TextNode`` whose text is ``xml.abstract`` and whose id is
        ``doc_id`` followed by ``-abstract``.
    """
    abstract_text = xml.abstract
    node_id = f"{doc_id}-abstract"
    return TextNode(text=abstract_text, id_=node_id)


def set_relationships(title_node, abstract_node):
    """Register the title node as the PARENT of the abstract node.

    Mutates ``abstract_node.relationships`` in place and returns ``None``.

    Args:
        title_node: Node whose ``node_id`` becomes the parent reference.
        abstract_node: Node that receives the PARENT relationship entry.
    """
    parent_info = RelatedNodeInfo(node_id=title_node.node_id)
    abstract_node.relationships[NodeRelationship.PARENT] = parent_info
70 changes: 8 additions & 62 deletions backend/mind_palace/extract.py
Original file line number Diff line number Diff line change
@@ -1,72 +1,20 @@
import os
import llama_index as li
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
import grobid_tei_xml
from llama_index.schema import TextNode

import docs

def load_pdfs():
    """Load documents from the hard-coded PDF resources directory.

    The reader is restricted to the ``.pdf`` extension.

    Returns:
        The result of ``SimpleDirectoryReader(...).load_data()``.
    """
    pdf_exts = [".pdf"]

    # Q: we could use a better text extractor supporting section-aware extraction
    reader = li.SimpleDirectoryReader(
        input_dir="./resources/pdfs/12-pdfs-from-steve-aug-22",
        required_exts=pdf_exts,
    )
    return reader.load_data()


def nodes(documents, service_context=None):
    """Split documents into nodes using a service context's node parser.

    Args:
        documents: Documents passed to ``get_nodes_from_documents``.
        service_context: Object exposing ``node_parser``. When ``None``
            (the default), ``li.ServiceContext.from_defaults()`` is built
            at call time.

    Returns:
        The result of ``node_parser.get_nodes_from_documents`` with
        ``show_progress=True``.
    """
    # Build the default lazily: a default argument expression is evaluated
    # once when the function is defined, which would construct the context
    # at import time and share that single instance across every call.
    if service_context is None:
        service_context = li.ServiceContext.from_defaults()
    return service_context.node_parser.get_nodes_from_documents(
        documents, show_progress=True
    )


# a Node from load_pdfs standard pipeline
#
# >>> nodes[0].__dict__.keys()
# dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys',
# 'excluded_llm_metadata_keys', 'relationships', 'hash', 'text',
# 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template',
# 'metadata_seperator'])

# TODO: move these into a submodule


def _load_tei_xml(filepath):
    """Read a TEI XML file from disk and parse it with ``grobid_tei_xml``.

    Args:
        filepath: Path of the TEI XML file to open.

    Returns:
        The value returned by ``grobid_tei_xml.parse_document_xml`` for the
        file's full text.
    """
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # encoding, so the same file parses identically on every machine.
    # NOTE(review): assumes the TEI files are UTF-8 encoded -- confirm.
    with open(filepath, "r", encoding="utf-8") as xml_file:
        return grobid_tei_xml.parse_document_xml(xml_file.read())


def title(xml, doc_id):
    """Return a ``TextNode`` holding the document title.

    The node text is ``xml.header.title``; the node id is ``doc_id``
    suffixed with ``-title``.
    """
    text = xml.header.title
    identifier = f"{doc_id}-title"
    return TextNode(text=text, id_=identifier)


def abstract(xml, doc_id):
    """Return a ``TextNode`` holding the document abstract.

    The node text is ``xml.abstract``; the node id is ``doc_id`` suffixed
    with ``-abstract``.
    """
    text = xml.abstract
    identifier = f"{doc_id}-abstract"
    return TextNode(text=text, id_=identifier)


def set_relationships(title_node, abstract_node):
    """Point the abstract node at the title node as its PARENT.

    Updates ``abstract_node.relationships`` in place; nothing is returned.
    """
    parent_ref = RelatedNodeInfo(node_id=title_node.node_id)
    abstract_node.relationships[NodeRelationship.PARENT] = parent_ref


def _gen_document_dict(xml) -> dict[str, TextNode]:
def _gen_document_dict(file_path) -> dict[str, TextNode]:
xml = docs.load_tei_xml(file_path)
doi = xml.header.doi
assert doi is not None

try:
title_node = title(xml, doi)
abstract_node = abstract(xml, doi)
title_node = docs.title(xml, doi)
abstract_node = docs.abstract(xml, doi)
# TODO: load more sections

set_relationships(title_node, abstract_node)
docs.set_relationships(title_node, abstract_node)
return {"title": title_node, "abstract": abstract_node}
except Exception as e:
print(f"failed to load DOI {doi} because {e}")
Expand All @@ -87,9 +35,7 @@ def seed_nodes(input_dir) -> list[TextNode]:
file_paths = _get_file_paths(input_dir)

for file_path in file_paths:
print(f"loading {file_path}")
xml_data = _load_tei_xml(file_path)
nodes_dict = _gen_document_dict(xml_data)
nodes_dict = _gen_document_dict(file_path)
if nodes_dict:
for node in nodes_dict.values():
nodes.append(node)
Expand Down
2 changes: 1 addition & 1 deletion backend/mind_palace/repl.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@


# PDF Extraction
input_dir = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml"
input_dir = "./resources/xmls/12-pdfs-from-steve-aug-22/"
nodes = extract.seed_nodes(input_dir)


Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions backend/tests/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
)

import extract
import docs
23 changes: 23 additions & 0 deletions backend/tests/test_docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from .context import docs
import grobid_tei_xml.types as grobid_types
from llama_index.schema import TextNode, NodeRelationship

XML_PATH = "./resources/xmls/12-pdfs-from-steve-aug-22/"


def test_node_relationships():
    """set_relationships returns None and records the title as PARENT."""
    parent = TextNode(text="this is title")
    child = TextNode(text="this is abstract")

    outcome = docs.set_relationships(parent, child)
    assert outcome is None

    linked = child.relationships[NodeRelationship.PARENT]
    assert linked.node_id == parent.node_id


def test_load_tei_xml():
    """Loading a sample TEI file yields a ``GrobidDocument``."""
    sample_path = (
        XML_PATH
        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
    )
    parsed = docs.load_tei_xml(sample_path)
    assert isinstance(parsed, grobid_types.GrobidDocument)
31 changes: 4 additions & 27 deletions backend/tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,18 @@
from .context import extract
import grobid_tei_xml.types as grobid_types
from llama_index.schema import TextNode, NodeRelationship

XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/"


def test_node_relationships():
    """set_relationships returns None and records the title as PARENT."""
    parent = TextNode(text="this is title")
    child = TextNode(text="this is abstract")

    outcome = extract.set_relationships(parent, child)
    assert outcome is None

    linked = child.relationships[NodeRelationship.PARENT]
    assert linked.node_id == parent.node_id


def test_load_tei_xml():
    """Loading a sample TEI file yields a ``GrobidDocument``."""
    sample_path = (
        XML_PATH
        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
    )
    parsed = extract._load_tei_xml(sample_path)
    assert isinstance(parsed, grobid_types.GrobidDocument)
from . import test_docs


def test_gen_document_dict():
xml = extract._load_tei_xml(
XML_PATH
nodes_dict = extract._gen_document_dict(
test_docs.XML_PATH
+ "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
)
nodes_dict = extract._gen_document_dict(xml)
for node in nodes_dict.values():
assert isinstance(node, extract.TextNode)


def test_seed_nodes():
input_dir = XML_PATH
nodes = extract.seed_nodes(input_dir)
nodes = extract.seed_nodes(test_docs.XML_PATH)
assert isinstance(nodes, list)
for node in nodes:
assert isinstance(node, extract.TextNode)

0 comments on commit 9beb969

Please sign in to comment.