From 0113fe9dd6bcaf7906621a6d214ce8f55653103d Mon Sep 17 00:00:00 2001
From: duckduckgrayduck <102841251+duckduckgrayduck@users.noreply.github.com>
Date: Mon, 12 Aug 2024 13:27:09 -0500
Subject: [PATCH] Add page number identification for legal citations

---
NOTE(review): this copy was rebuilt from a version of the patch whose line
breaks had been collapsed into a single line. The hunk layout below agrees
with the @@ line counts and the diffstat, but indentation width and the
contents of whitespace-only removed lines are reconstructed, not recovered.
Confirm with `git apply --check` before relying on it.

 main.py          | 27 ++++++++++++++++-----------
 requirements.txt |  2 +-
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/main.py b/main.py
index 8d2696f..22bb409 100644
--- a/main.py
+++ b/main.py
@@ -1,24 +1,29 @@
+""" Requires eyecite to find legal citations """
+import csv
 from documentcloud.addon import AddOn
 from eyecite import get_citations
-import csv
+
 
 class LegalCitations(AddOn):
+    """ DocumentCloud Add-On that uses eyecite to find legal citations in a document """
     def main(self):
-        
+        """ Loops through each page on each document """
         citations_found = []
-        
+
         for document in self.get_documents():
-            citation_list = get_citations(document.full_text)
-            tagged_citation_list = [
-                (document.title, document.id, citation)
-                for citation in citation_list
-            ]
-            citations_found += tagged_citation_list
+            for page_number in range(1, document.page_count + 1):
+                page_text = document.get_page_text(page_number)
+                citation_list = get_citations(page_text)
+                tagged_citation_list = [
+                    (document.title, document.id, page_number, citation)
+                    for citation in citation_list
+                ]
+                citations_found += tagged_citation_list
 
         # output the citations as a CSV.
-        with open("citations_found.csv", "w+") as file_:
+        with open("citations_found.csv", "w+", encoding="utf-8") as file_:
             writer = csv.writer(file_)
-            writer.writerow(("title", "id", "citation"))
+            writer.writerow(("title", "id", "page number", "citation"))
             writer.writerows(citations_found)
 
             self.upload_file(file_)
diff --git a/requirements.txt b/requirements.txt
index b6c7255..0b010ba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
-python-documentcloud==3.7.1
+python-documentcloud==4.1.3
 requests
 eyecite