Skip to content

Commit

Permalink
feature: tweak filtering and scoring
Browse files Browse the repository at this point in the history
  • Loading branch information
hueyy committed Oct 2, 2022
1 parent 7fc095f commit 71ecf25
Show file tree
Hide file tree
Showing 10 changed files with 157,147 additions and 50,270 deletions.
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,22 @@ pip uninstall pdf_scout

![screenshot](./assets/screenshot.png)

This project is a work in progress and will likely only generate accurate bookmarks for documents that conform to the following requirements:
This project is a work in progress and will likely only generate suitable bookmarks for documents that conform to the following requirements:

* Single column of text (not multiple columns)
* Font size of header text >= font size of body text
* Font size of header text > font size of body text
* Header text is justified or left-aligned
* Paragraph spacing for headers > body text paragraph spacing
* Consistent left margins on every page

## Supported document types

`pdf_scout` has been tested on and expressly supports the following classes of documents:

- Singapore State Court and Supreme Court Judgments (unreported)
- Singapore Law Reports

It may support other types of documents as well. If a particular class of document isn't supported or does not work well, please open an issue and I will consider adding support for it.

## Development

Expand Down
9 changes: 2 additions & 7 deletions pdf_scout/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,19 +73,14 @@ def open_pdf_file(input_path: str) -> pdfplumber.PDF:
return pdfplumber.open(input_path)


def get_words_from_pdf_file(pdf_file: pdfplumber.PDF):
    """Return whatever ``extract_all_words`` produces for ``pdf_file``.

    This is a thin pass-through: the opened PDF is handed to
    ``extract_all_words`` and its result is returned unchanged.
    """
    return extract_all_words(pdf_file)


def add_bookmarks_to_pdf(input_path: str, output_path: str = "", levels=3):
if len(output_path) == 0:
input_path_start, _ = input_path.split(".pdf")
output_path = f"{input_path_start}-out.pdf"

pdf_file = open_pdf_file(input_path)
all_words = get_words_from_pdf_file(pdf_file)
scored_words = score_words(all_words)
all_words, non_body_words = extract_all_words(pdf_file)
scored_words = score_words(all_words, non_body_words)

top_scores: List[Number] = sorted(
list(set([score["overall"] for score, _ in scored_words])), reverse=True
Expand Down
16 changes: 5 additions & 11 deletions pdf_scout/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
def guess_left_margin(words) -> List[Number]:
words_x0 = [round(word["x0"]) for word in words]
counts = [(x0, words_x0.count(x0)) for x0 in set(words_x0)]
std_dev = statistics.pstdev([count for _, count in counts])
mean = statistics.mean([count for _, count in counts])
threshold_counts = [
(left_margin, count)
for left_margin, count in counts
if count >= 0.1 * len(words_x0)
if count >= mean + std_dev * 5
]

debug_log("guess_left_margin locals:", locals())
Expand Down Expand Up @@ -51,10 +53,6 @@ def guess_body_spacing(words) -> Tuple[Number, Number]:
)


def guess_body_font_size(words) -> Number:
    """Return the most common ``"size"`` value among ``words``.

    Every item in ``words`` is indexed by its ``"size"`` key, and the
    collected sizes are passed to ``statistics.mode``.
    """
    font_sizes = []
    for entry in words:
        font_sizes.append(entry["size"])
    return statistics.mode(font_sizes)


def get_word_line_position(word) -> Number:
    """Return the value stored under the ``"top"`` key of ``word``.

    NOTE(review): presumably this is the word's vertical offset as reported
    by the PDF extractor -- confirm against the code that builds ``word``.
    """
    line_position = word["top"]
    return line_position

Expand Down Expand Up @@ -103,7 +101,6 @@ def raw_extract_words(pdf_file) -> List[dict[str, any]]:

def extract_all_words(pdf_file) -> List[dict[str, any]]:
all_words = raw_extract_words(pdf_file)
body_font_size = guess_body_font_size(all_words)
all_words = add_line_spacing_to_words(pdf_file, all_words)

body_top_spacing, body_bottom_spacing = guess_body_spacing(all_words)
Expand All @@ -112,7 +109,7 @@ def extract_all_words(pdf_file) -> List[dict[str, any]]:
# TODO: handle center-aligned text
left_margins = guess_left_margin(all_words)

all_words = [
non_body_words = [
word
for word in all_words
if (
Expand All @@ -122,9 +119,6 @@ def extract_all_words(pdf_file) -> List[dict[str, any]]:
word["top_spacing"] >= body_top_spacing * 1.05
and word["bottom_spacing"] >= body_bottom_spacing * 1.05
)
and ( # ignore all text smaller than body font size
word["size"] >= body_font_size
)
and ( # ignore all words not at left margin
round(word["x0"]) in left_margins
)
Expand All @@ -133,4 +127,4 @@ def extract_all_words(pdf_file) -> List[dict[str, any]]:

debug_log("extract_all_words locals: ", locals())

return all_words
return all_words, non_body_words
20 changes: 12 additions & 8 deletions pdf_scout/scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
def score_font_name(font_name: str) -> Number:
if re.search(r"(Bold|BoldMT)$", font_name):
return 20
elif re.search(r"Semibold", font_name):
elif re.search(r"Semibold$", font_name):
return 15
elif re.search(r"(BoldItalic|BoldItalicMT)$", font_name):
return 10
Expand Down Expand Up @@ -65,16 +65,20 @@ def guess_body_score(word_list: Tuple[HeadingScore, any]) -> Number:
return statistics.mode([score["font"] for score, _ in word_list])


def score_words(all_words: List[any]):
scored_words: List[Tuple[HeadingScore, any]] = [
def score_words(all_words: List[any], non_body_words: List[any]):
scored_all_words: List[Tuple[HeadingScore, any]] = [
(get_heading_score(word), word) for word in all_words
]
body_score = guess_body_score(scored_words)
body_score = guess_body_score(scored_all_words)

scored_non_body_words = [(get_heading_score(word), word) for word in non_body_words]
# ignore all body text
scored_words = [
(score, word) for score, word in scored_words if score["font"] != body_score
scored_non_body_words = [
(score, word)
for score, word in scored_non_body_words
if score["font"] > body_score
]

debug_log("score_words locals:", body_score, scored_words)
debug_log("score_words locals:", body_score, scored_non_body_words)

return scored_words
return scored_non_body_words
6 changes: 6 additions & 0 deletions pdf_scout/tests/input_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Paths of the sample PDF documents, each given relative to a ./pdf/ directory.
# NOTE(review): presumably consumed by the package's tests as input fixtures --
# confirm against the test modules that import this list.
INPUT_FILES = [
"./pdf/Law Society of Singapore v Loh Wai Mun Daniel [2004] SGHC 36 - Judgment.pdf",
"./pdf/RecordTV Pte Ltd v MediaCorp TV Singapore Pte Ltd and others [2010] SGCA 43 - Judgment.pdf",
"./pdf/PUBLIC PROSECUTOR v GCK [2020] SGCA 2 - Judgment.pdf",
"./pdf/UQP v UQQ [2019] SGHCF 7 - Judgment.pdf",
]
Loading

0 comments on commit 71ecf25

Please sign in to comment.