Skip to content

Commit

Permalink
feature: guess and ignore header
Browse files Browse the repository at this point in the history
  • Loading branch information
hueyy committed Oct 9, 2022
1 parent 7b2f1c8 commit 9f73c3d
Show file tree
Hide file tree
Showing 11 changed files with 360,780 additions and 69,593 deletions.
26 changes: 16 additions & 10 deletions pdf_scout/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,28 @@
from pdf_scout.extract import extract_all_words
from pdf_scout.scoring import score_words
from pdf_scout.bookmarks import generate_bookmarks, write_bookmarks
from pdf_scout.types import Word
from pdf_scout.types import HeadingScore, Word
from time import time
from typing import List, Tuple
from operator import itemgetter
import pdfplumber
import typer


def get_top_scored_words(
    scored_words: List[Tuple[HeadingScore, Word]], levels: int
) -> List[Tuple[int, Word]]:
    """Keep only the words whose overall score is among the top ``levels``.

    The distinct ``score["overall"]`` values are ranked from highest to
    lowest and the first ``levels`` of them are kept. Every surviving word is
    paired with the zero-based rank of its score (0 for the highest score).
    Words keep the order they had in ``scored_words``.

    Args:
        scored_words: ``(score, word)`` pairs; each score is indexed with
            ``"overall"``.
        levels: Number of distinct score values to keep. Zero or a negative
            number keeps nothing.

    Returns:
        ``(rank, word)`` pairs for the words that carry one of the top scores.
    """
    if levels <= 0:
        # A negative slice bound would silently drop scores from the low end
        # of the ranking instead of selecting nothing.
        return []

    ranked_scores = sorted(
        {score["overall"] for score, _ in scored_words}, reverse=True
    )
    # Build the score -> rank mapping once so each word costs one hash lookup
    # instead of a linear membership test followed by a linear index() scan.
    rank_by_score = {
        score: rank for rank, score in enumerate(ranked_scores[:levels])
    }
    return [
        (rank_by_score[score["overall"]], word)
        for score, word in scored_words
        if score["overall"] in rank_by_score
    ]


def main(
input_file_path: str,
output_file_path: str = typer.Argument(""),
Expand All @@ -30,15 +44,7 @@ def main(
extract_all_words(pdf_file)
)
scored_words = score_words(all_words, non_body_words)

all_scores = list(set([score["overall"] for score, _ in scored_words]))
all_scores.sort(reverse=True)
top_scores: List[Number] = all_scores[0:levels]
top_scored_words: List[Tuple[int, Word]] = [
(top_scores.index(score["overall"]), word)
for score, word in scored_words
if score["overall"] in top_scores
]
top_scored_words = get_top_scored_words(scored_words, levels)

bookmarks = generate_bookmarks(pdf_file, top_scored_words)
pdf_file.close()
Expand Down
52 changes: 46 additions & 6 deletions pdf_scout/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from operator import itemgetter
from typing import List, Tuple
from pdf_scout.logger import debug_log
from pdf_scout.types import RawWord, Word, DocumentWords
from pdf_scout.types import RawWord, Word, DocumentWords, Rect
import statistics
import pdfplumber

Expand Down Expand Up @@ -33,8 +33,40 @@ def guess_left_margin(words) -> List[Number]:
)


# Placeholder: performs no header detection and always returns None
# (the all_words argument is not used).
def get_header_words(all_words: List[Word]):
return
def get_header_bottom_position(pdf_file: pdfplumber.PDF) -> Number:
    """Guess where a rectangle-based page header ends.

    A rectangle counts as a header candidate when it has a positive width and
    height and its ``"bottom"`` lies within the top 20% of its page. If at
    least 70% of the pages contain such a candidate, the most common
    ``"bottom"`` value across all candidates is returned.

    Args:
        pdf_file: Document whose ``pages`` each expose ``rects`` and
            ``height``.

    Returns:
        The most common ``"bottom"`` value of the header rectangles, or ``0``
        when the document has no pages or no consistent rectangle header.
    """
    HEADER_POSITION_THRESHOLD = 0.2  # assume header is in top 20% of page
    HEADER_COUNT_THRESHOLD = 0.7  # assume header is on at least 70% of pages

    pages = pdf_file.pages
    if not pages:
        # With zero pages the count check below is vacuously true (0 >= 0),
        # and statistics.mode() raises StatisticsError on empty data.
        return 0

    # check if rectangle header
    header_rects: List[List[Rect]] = [
        [
            rect
            for rect in page.rects
            if (
                rect["height"] > 0
                and rect["width"] > 0
                and rect["bottom"] <= HEADER_POSITION_THRESHOLD * page.height
            )
        ]
        for page in pages
    ]
    pages_with_header = sum(1 for rects_in_page in header_rects if rects_in_page)
    if pages_with_header < HEADER_COUNT_THRESHOLD * len(pages):
        return 0

    return statistics.mode(
        [rect["bottom"] for rects_in_page in header_rects for rect in rects_in_page]
    )


# Placeholder: footer detection is not implemented yet, so this always returns
# None. The two thresholds below are defined but not used by any code here.
def get_footer_top_position(pdf_file: pdfplumber.PDF):
FOOTER_POSITION_THRESHOLD = 0.2 # assume footer is in bottom 20% of page
FOOTER_COUNT_THRESHOLD = 0.7 # assume footer is on >70% of pages
# TODO: implement footer detection using the thresholds above
return None


def add_line_spacing_to_words(
Expand Down Expand Up @@ -91,7 +123,9 @@ def add_line_spacing_to_word(
}


def raw_extract_words(pdf_file: pdfplumber.PDF) -> List[RawWord]:
def raw_extract_words(
pdf_file: pdfplumber.PDF, header_bottom_position: Number = 0
) -> List[RawWord]:
all_words = [
word
for page_list in (
Expand All @@ -106,13 +140,19 @@ def raw_extract_words(pdf_file: pdfplumber.PDF) -> List[RawWord]:
for page in pdf_file.pages
)
for word in page_list
if (len(word["text"]) > 0) # ignore all words that are just whitespace
if (
(len(word["text"]) > 0) # ignore all words that are just whitespace
and word["top"] > header_bottom_position # ignore header
)
]
return all_words


def extract_all_words(pdf_file: pdfplumber.PDF) -> DocumentWords:
raw_words = raw_extract_words(pdf_file)

header_bottom_position = get_header_bottom_position(pdf_file)

raw_words = raw_extract_words(pdf_file, header_bottom_position)
all_words_with_line_spacing = add_line_spacing_to_words(pdf_file, raw_words)

body_top_spacing, body_bottom_spacing = guess_body_spacing(
Expand Down
16 changes: 14 additions & 2 deletions pdf_scout/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from pdf_scout.tests.input_files import INPUT_FILES
from pdf_scout.app import open_pdf_file
from pdf_scout.extract import raw_extract_words, extract_all_words
from pdf_scout.scoring import score_words
import pytest
import pdfplumber
from operator import itemgetter


# Session-scoped fixture parametrized over INPUT_FILES; each parameter is read
# from request.param and used to open a PDF.
# NOTE(review): two return statements appear below, so as written only the
# first one can execute. This looks like an old line followed by its
# replacement; confirm which call is intended.
@pytest.fixture(scope="session", params=INPUT_FILES)
def file_output(request):
return open_pdf_file(request.param)
return pdfplumber.open(request.param)


@pytest.fixture(scope="session")
Expand All @@ -19,3 +21,13 @@ def file_raw_output(file_output):
def file_clean_output(file_output):
file = file_output
return file, extract_all_words(file)


@pytest.fixture(scope="session")
def scored_words_output(file_clean_output):
    """Provide the opened file together with its scored words.

    Unpacks the ``(file, extracted_words)`` pair from ``file_clean_output``,
    passes the ``"all_words"`` and ``"non_body_words"`` entries to
    ``score_words`` and returns ``(file, scored_words)``.
    """
    pdf, document_words = file_clean_output
    every_word = document_words["all_words"]
    heading_candidates = document_words["non_body_words"]
    return pdf, score_words(every_word, heading_candidates)
1 change: 1 addition & 0 deletions pdf_scout/tests/input_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@
"./pdf/RecordTV Pte Ltd v MediaCorp TV Singapore Pte Ltd and others [2010] SGCA 43 - Judgment.pdf",
"./pdf/PUBLIC PROSECUTOR v GCK [2020] SGCA 2 - Judgment.pdf",
"./pdf/UQP v UQQ [2019] SGHCF 7 - Judgment.pdf",
"./pdf/PDPA Key Concepts.pdf",
]
Loading

0 comments on commit 9f73c3d

Please sign in to comment.