Skip to content

Commit

Permalink
Merge pull request #36 from TheDataGuild/feature/show-proper-source-c…
Browse files Browse the repository at this point in the history
…itation

Feature/show proper source citation
  • Loading branch information
Quantisan authored Sep 26, 2023
2 parents cc42a96 + 91d9764 commit 83a302b
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 6 deletions.
7 changes: 5 additions & 2 deletions mind_palace/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,10 @@ def load_index(model):

st.markdown("### Sources")
for i, source_node in enumerate(response.source_nodes):
with st.expander(f"[{i + 1}] {source_node.node.node_id}"):
st.write(f"relevancy score: {source_node.score}")
with st.expander(f"[{i + 1}] {source_node.node.metadata['citation']}"):
if source_node.metadata["paragraph_number"]:
st.write(
f"paragraph number: {source_node.metadata['paragraph_number']}"
)
st.write("original text:")
st.write(source_node.node.get_text().split(":", 1)[1])
37 changes: 36 additions & 1 deletion mind_palace/docs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo
import grobid_tei_xml
from llama_index.schema import NodeRelationship, RelatedNodeInfo, TextNode


def load_tei_xml(file_path):
Expand All @@ -8,6 +8,22 @@ def load_tei_xml(file_path):
return grobid_tei_xml.parse_document_xml(xml_file.read())


def cite_authors(xml):
    """Return the author portion of a citation for a parsed TEI document.

    The first author is rendered as "Surname, Given name" and ", et al" is
    appended when the header lists more than one author.

    Args:
        xml: Parsed document exposing ``header.authors``, a sequence of
            objects with ``surname`` and ``given_name`` attributes.

    Returns:
        The author display string, or "Unknown author" when the header has
        no authors or the first author has no usable name parts.
    """
    authors = xml.header.authors
    if not authors:
        # Indexing an empty author list used to raise IndexError and abort
        # citation generation for the whole document.
        return "Unknown author"

    lead_author = authors[0]
    # Skip missing name parts so a None field is never rendered as "None".
    name_parts = [
        part for part in (lead_author.surname, lead_author.given_name) if part
    ]
    display_name = ", ".join(name_parts) or "Unknown author"
    return display_name + (", et al" if len(authors) > 1 else "")


def cite_journal(xml):
    """Return the journal portion of a citation for a parsed TEI document.

    With every header field present the result has the form
    ``"<journal_abbrev> <date>;<volume>(<issue>). doi:<doi>"``. Fields that
    are ``None`` or empty are left out, together with the punctuation that
    would have introduced them, instead of being rendered as "None".

    Args:
        xml: Parsed document exposing ``header.journal_abbrev``,
            ``header.date``, ``header.volume``, ``header.issue`` and
            ``header.doi``.

    Returns:
        The journal citation string; empty when no field is available.
    """
    header = xml.header

    def present(value):
        # Only None and the empty string count as "missing".
        return value is not None and value != ""

    publication = " ".join(
        str(field)
        for field in (header.journal_abbrev, header.date)
        if present(field)
    )

    locator = str(header.volume) if present(header.volume) else ""
    if present(header.issue):
        locator += f"({header.issue})"

    # ";" separates the publication from the volume/issue locator only when
    # both sides exist.
    reference = ";".join(piece for piece in (publication, locator) if piece)
    doi = f"doi:{header.doi}" if present(header.doi) else ""
    return ". ".join(piece for piece in (reference, doi) if piece)


def cite(xml):
    """Build the full citation string for a parsed TEI document.

    The citation is "<authors>. <title>. <journal>." where the author and
    journal parts come from ``cite_authors`` and ``cite_journal``.
    Components that are ``None`` or empty are skipped so the result never
    contains the text "None" or a dangling ". ." sequence.

    Args:
        xml: Parsed document exposing ``header.title`` plus the header
            fields read by ``cite_authors`` and ``cite_journal``.

    Returns:
        The citation string, terminated by a period; empty when no component
        is available.
    """
    components = (cite_authors(xml), xml.header.title, cite_journal(xml))
    present = [
        str(component)
        for component in components
        if component is not None and component != ""
    ]
    if not present:
        return ""
    return ". ".join(present) + "."


def title(xml, doc_id):
return TextNode(
text=xml.header.title,
Expand Down Expand Up @@ -44,6 +60,7 @@ def body(xml, doc_id):
return [
TextNode(
text=line,
metadata={"paragraph_number": index + 1},
id_=f"{doc_id}-body-paragraph-{index}",
)
for index, line in enumerate(xml.body.split("\n"))
Expand All @@ -64,3 +81,21 @@ def set_relationships(title_node, abstract_node, body_nodes):
set_prev_relationships(body_nodes)
set_next_relationships(body_nodes)
return


def _set_citation_metadata(citation, node):
node.metadata["citation"] = citation
return


def set_citations(xml, nodes):
    """Attach the document's citation to every node in ``nodes``.

    The citation is computed once from ``xml`` via ``cite``. Each entry of
    ``nodes`` is either a single node or a list of nodes; one level of list
    nesting is unpacked. Nodes are modified in place and nothing is returned.
    """
    citation_text = cite(xml)
    for entry in nodes:
        # Treat a lone node as a one-element group so both shapes share a path.
        group = entry if isinstance(entry, list) else [entry]
        for member in group:
            _set_citation_metadata(citation_text, member)
4 changes: 3 additions & 1 deletion mind_palace/extract.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from llama_index.schema import TextNode

import docs
from llama_index.schema import TextNode


def _gen_document_dict(file_path) -> dict[str, TextNode]:
Expand All @@ -15,6 +15,8 @@ def _gen_document_dict(file_path) -> dict[str, TextNode]:
body_nodes = docs.body(xml, doi)

docs.set_relationships(title_node, abstract_node, body_nodes)
docs.set_citations(xml=xml, nodes=[title_node, abstract_node, body_nodes])

return {
"title": title_node,
"abstract": abstract_node,
Expand Down
50 changes: 48 additions & 2 deletions tests/unit/test_docs.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,41 @@
from tests.context import docs
from unittest.mock import MagicMock, patch

import grobid_tei_xml.types as grobid_types
from llama_index.schema import TextNode, NodeRelationship
from llama_index.schema import NodeRelationship, TextNode

from tests.context import docs

XML_PATH = "./resources/xmls/12-pdfs-from-steve-aug-22/"


def test_cite_authors():
    """cite_authors adds ", et al" only when several authors are listed."""
    doe = MagicMock(surname="Doe", given_name="John")
    smith = MagicMock(surname="Smith", given_name="Jane")
    document = MagicMock()

    # Two authors: first author plus the "et al" suffix.
    document.header.authors = [doe, smith]
    assert docs.cite_authors(document) == "Doe, John, et al"

    # Single author: no suffix.
    document.header.authors = [doe]
    assert docs.cite_authors(document) == "Doe, John"


def test_cite():
    """cite joins the author, title and journal parts into one citation."""
    header_fields = {
        "title": "This is a title",
        "journal_abbrev": "J. Abbrev.",
        "date": "2023",
        "volume": "1",
        "issue": "9",
        "doi": "10.1234/1234",
    }
    xml = MagicMock()
    for field_name, field_value in header_fields.items():
        setattr(xml.header, field_name, field_value)

    expected = "Doe, John. This is a title. J. Abbrev. 2023;1(9). doi:10.1234/1234."

    # The author part is stubbed so only the assembly logic is under test.
    with patch("docs.cite_authors", return_value="Doe, John"):
        assert docs.cite(xml) == expected


def gen_nodes():
return [
TextNode(text="this is first"),
Expand Down Expand Up @@ -63,6 +94,21 @@ def test_node_relationships():
)


def test_set_citations():
    """set_citations stamps the same citation on single and listed nodes."""
    tei_file = (
        XML_PATH
        + "2010_PhysRevLett_Pulsating Tandem Microbubble for Localized and Directional Single-Cell Membrane Poration.pdf.tei.xml"
    )
    xml = docs.load_tei_xml(tei_file)

    title_node = TextNode(text="this is title")
    abstract_node = TextNode(text="this is abstract")
    body_nodes = gen_nodes()

    docs.set_citations(xml=xml, nodes=[title_node, abstract_node, body_nodes])

    expected_citation = docs.cite(xml)
    for node in [title_node, abstract_node, *body_nodes]:
        assert node.metadata["citation"] == expected_citation


def test_load_tei_xml():
xml = docs.load_tei_xml(
XML_PATH
Expand Down

0 comments on commit 83a302b

Please sign in to comment.