Skip to content

Commit

Permalink
Add page number identification for legal citations
Browse files Browse the repository at this point in the history
  • Loading branch information
duckduckgrayduck committed Aug 12, 2024
1 parent 74d2385 commit 0113fe9
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 12 deletions.
27 changes: 16 additions & 11 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,29 @@
""" Requires eyecite to find legal citations """
import csv
from documentcloud.addon import AddOn
from eyecite import get_citations
import csv


class LegalCitations(AddOn):
    """DocumentCloud Add-On that uses eyecite to find legal citations in a document."""

    def main(self):
        """Scan every page of every selected document and upload a CSV of citations.

        Each CSV row holds the document title, the document id, the 1-based
        page number the citation was found on, and the citation itself.
        """
        citations_found = []

        for document in self.get_documents():
            # Work page by page (rather than on document.full_text) so each
            # citation can be tagged with the page it appears on.
            for page_number in range(1, document.page_count + 1):
                page_text = document.get_page_text(page_number)
                citations_found.extend(
                    (document.title, document.id, page_number, citation)
                    for citation in get_citations(page_text)
                )

        # Output the citations as a CSV. newline="" is what the csv module
        # asks for on writer file objects, so row terminators are not
        # translated a second time by the text layer.
        with open(
            "citations_found.csv", "w+", encoding="utf-8", newline=""
        ) as file_:
            writer = csv.writer(file_)
            writer.writerow(("title", "id", "page number", "citation"))
            # NOTE(review): citation objects are stringified by csv.writer;
            # confirm that their str() form is the desired cell content.
            writer.writerows(citations_found)
            # Upload while the handle is still open; the file was opened
            # "w+" so it can be read back — presumably upload_file rewinds
            # and reads it, confirm against the AddOn base class.
            self.upload_file(file_)

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
python-documentcloud==3.7.1
python-documentcloud==4.1.3
requests
eyecite

0 comments on commit 0113fe9

Please sign in to comment.