-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
66 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo | ||
import grobid_tei_xml | ||
|
||
|
||
def load_tei_xml(file_path):
    """Read a TEI XML file from disk and parse it with ``grobid_tei_xml``.

    Args:
        file_path: Path of the TEI XML file to load.

    Returns:
        The object produced by ``grobid_tei_xml.parse_document_xml`` for the
        file's contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contents are not valid UTF-8.
    """
    print(f"Loading {file_path}")
    # Decode explicitly instead of relying on the platform's locale default,
    # which differs between machines (e.g. cp1252 on Windows).
    # NOTE(review): assumes the TEI files are UTF-8 encoded -- confirm.
    with open(file_path, "r", encoding="utf-8") as xml_file:
        return grobid_tei_xml.parse_document_xml(xml_file.read())
|
||
|
||
def title(xml, doc_id):
    """Build the title node of a parsed document.

    Args:
        xml: Parsed document exposing ``header.title``.
        doc_id: Document identifier used as the prefix of the node id.

    Returns:
        A ``TextNode`` holding the title text, with id ``"<doc_id>-title"``.
    """
    title_text = xml.header.title
    node_id = f"{doc_id}-title"
    return TextNode(text=title_text, id_=node_id)
|
||
|
||
def abstract(xml, doc_id):
    """Build the abstract node of a parsed document.

    Args:
        xml: Parsed document exposing an ``abstract`` attribute.
        doc_id: Document identifier used as the prefix of the node id.

    Returns:
        A ``TextNode`` holding the abstract text, with id
        ``"<doc_id>-abstract"``.
    """
    abstract_text = xml.abstract
    node_id = f"{doc_id}-abstract"
    return TextNode(text=abstract_text, id_=node_id)
|
||
|
||
def set_relationships(title_node, abstract_node):
    """Register the title node as the parent of the abstract node.

    Only ``abstract_node`` is modified: its ``relationships`` mapping gains a
    ``NodeRelationship.PARENT`` entry referring to ``title_node.node_id``.

    Args:
        title_node: Node that becomes the parent.
        abstract_node: Node whose relationships are updated in place.

    Returns:
        None.
    """
    parent_ref = RelatedNodeInfo(node_id=title_node.node_id)
    abstract_node.relationships[NodeRelationship.PARENT] = parent_ref
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ | |
) | ||
|
||
import extract | ||
import docs |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
from .context import docs | ||
import grobid_tei_xml.types as grobid_types | ||
from llama_index.schema import TextNode, NodeRelationship | ||
|
||
XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/" | ||
|
||
|
||
def test_node_relationships():
    """set_relationships returns None and links the abstract to its title."""
    parent_node = TextNode(text="this is title")
    child_node = TextNode(text="this is abstract")

    outcome = docs.set_relationships(parent_node, child_node)
    assert outcome is None

    # The abstract's PARENT relationship must point at the title node's id.
    parent_link = child_node.relationships[NodeRelationship.PARENT]
    assert parent_link.node_id == parent_node.node_id
|
||
|
||
def test_load_tei_xml():
    """Loading a sample TEI file yields a GrobidDocument instance."""
    sample_name = "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
    parsed = docs.load_tei_xml(XML_PATH + sample_name)
    assert isinstance(parsed, grobid_types.GrobidDocument)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,18 @@ | ||
from .context import extract | ||
import grobid_tei_xml.types as grobid_types | ||
from llama_index.schema import TextNode, NodeRelationship | ||
|
||
XML_PATH = "./resources/pdfs/12-pdfs-from-steve-aug-22/xml/" | ||
|
||
|
||
def test_node_relationships(): | ||
title_node = TextNode(text="this is title") | ||
abstract_node = TextNode(text="this is abstract") | ||
assert extract.set_relationships(title_node, abstract_node) is None | ||
assert ( | ||
abstract_node.relationships[NodeRelationship.PARENT].node_id | ||
== title_node.node_id | ||
) | ||
|
||
|
||
def test_load_tei_xml(): | ||
xml = extract._load_tei_xml( | ||
XML_PATH | ||
+ "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml" | ||
) | ||
assert isinstance(xml, grobid_types.GrobidDocument) | ||
from . import test_docs | ||
|
||
|
||
def test_gen_document_dict(): | ||
xml = extract._load_tei_xml( | ||
XML_PATH | ||
nodes_dict = extract._gen_document_dict( | ||
test_docs.XML_PATH | ||
+ "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml" | ||
) | ||
nodes_dict = extract._gen_document_dict(xml) | ||
for node in nodes_dict.values(): | ||
assert isinstance(node, extract.TextNode) | ||
|
||
|
||
def test_seed_nodes(): | ||
input_dir = XML_PATH | ||
nodes = extract.seed_nodes(input_dir) | ||
nodes = extract.seed_nodes(test_docs.XML_PATH) | ||
assert isinstance(nodes, list) | ||
for node in nodes: | ||
assert isinstance(node, extract.TextNode) |