feature: tweak filtering and scoring

hueyy · Oct 2, 2022 · 71ecf25
1 parent 7fc095f
commit 71ecf25
Show file tree

Hide file tree

Showing 10 changed files with 157,147 additions and 50,270 deletions.
diff --git a/README.md b/README.md
@@ -17,11 +17,22 @@ pip uninstall pdf_scout
 
 ![screenshot](./assets/screenshot.png)
 
-This project is a work in progress and will likely only generate accurate bookmarks for documents that conform to the following requirements:
+This project is a work in progress and will likely only generate suitable bookmarks for documents that conform to the following requirements:
 
 * Single column of text (not multiple columns)
-* Font size of header text >= font size of body text
+* Font size of header text > font size of body text
 * Header text is justified or left-aligned
+* Paragraph spacing for headers > body text paragraph spacing
+* Consistent left margins on every page
+
+## Supported document types
+
+`pdf_scout` has been tested on and expressly supports the following classes of documents:
+
+- Singapore State Court and Supreme Court Judgments (unreported)
+- Singapore Law Reports
+
+It may support other types of documents as well. If a particular class of document isn't supported or does not work well, please open an issue and I will consider adding support for it.
 
 ## Development
 

diff --git a/pdf_scout/app.py b/pdf_scout/app.py
@@ -73,19 +73,14 @@ def open_pdf_file(input_path: str) -> pdfplumber.PDF:
     return pdfplumber.open(input_path)
 
 
-def get_words_from_pdf_file(pdf_file: pdfplumber.PDF):
-    all_words = extract_all_words(pdf_file)
-    return all_words
-
-
 def add_bookmarks_to_pdf(input_path: str, output_path: str = "", levels=3):
     if len(output_path) == 0:
         input_path_start, _ = input_path.split(".pdf")
         output_path = f"{input_path_start}-out.pdf"
 
     pdf_file = open_pdf_file(input_path)
-    all_words = get_words_from_pdf_file(pdf_file)
-    scored_words = score_words(all_words)
+    all_words, non_body_words = extract_all_words(pdf_file)
+    scored_words = score_words(all_words, non_body_words)
 
     top_scores: List[Number] = sorted(
         list(set([score["overall"] for score, _ in scored_words])), reverse=True

diff --git a/pdf_scout/extract.py b/pdf_scout/extract.py
@@ -9,10 +9,12 @@
 def guess_left_margin(words) -> List[Number]:
     words_x0 = [round(word["x0"]) for word in words]
     counts = [(x0, words_x0.count(x0)) for x0 in set(words_x0)]
+    std_dev = statistics.pstdev([count for _, count in counts])
+    mean = statistics.mean([count for _, count in counts])
     threshold_counts = [
         (left_margin, count)
         for left_margin, count in counts
-        if count >= 0.1 * len(words_x0)
+        if count >= mean + std_dev * 5
     ]
 
     debug_log("guess_left_margin locals:", locals())
@@ -51,10 +53,6 @@ def guess_body_spacing(words) -> Tuple[Number, Number]:
     )
 
 
-def guess_body_font_size(words) -> Number:
-    return statistics.mode([word["size"] for word in words])
-
-
 def get_word_line_position(word) -> Number:
     return word["top"]
 
@@ -103,7 +101,6 @@ def raw_extract_words(pdf_file) -> List[dict[str, any]]:
 
 def extract_all_words(pdf_file) -> List[dict[str, any]]:
     all_words = raw_extract_words(pdf_file)
-    body_font_size = guess_body_font_size(all_words)
     all_words = add_line_spacing_to_words(pdf_file, all_words)
 
     body_top_spacing, body_bottom_spacing = guess_body_spacing(all_words)
@@ -112,7 +109,7 @@ def extract_all_words(pdf_file) -> List[dict[str, any]]:
     # TODO: handle center-aligned text
     left_margins = guess_left_margin(all_words)
 
-    all_words = [
+    non_body_words = [
         word
         for word in all_words
         if (
@@ -122,9 +119,6 @@ def extract_all_words(pdf_file) -> List[dict[str, any]]:
                 word["top_spacing"] >= body_top_spacing * 1.05
                 and word["bottom_spacing"] >= body_bottom_spacing * 1.05
             )
-            and (  # ignore all text smaller than body font size
-                word["size"] >= body_font_size
-            )
             and (  # ignore all words not at left margin
                 round(word["x0"]) in left_margins
             )
@@ -133,4 +127,4 @@ def extract_all_words(pdf_file) -> List[dict[str, any]]:
 
     debug_log("extract_all_words locals: ", locals())
 
-    return all_words
+    return all_words, non_body_words
diff --git a/pdf_scout/scoring.py b/pdf_scout/scoring.py
@@ -8,7 +8,7 @@
 def score_font_name(font_name: str) -> Number:
     if re.search(r"(Bold|BoldMT)$", font_name):
         return 20
-    elif re.search(r"Semibold", font_name):
+    elif re.search(r"Semibold$", font_name):
         return 15
     elif re.search(r"(BoldItalic|BoldItalicMT)$", font_name):
         return 10
@@ -65,16 +65,20 @@ def guess_body_score(word_list: Tuple[HeadingScore, any]) -> Number:
     return statistics.mode([score["font"] for score, _ in word_list])
 
 
-def score_words(all_words: List[any]):
-    scored_words: List[Tuple[HeadingScore, any]] = [
+def score_words(all_words: List[any], non_body_words: List[any]):
+    scored_all_words: List[Tuple[HeadingScore, any]] = [
         (get_heading_score(word), word) for word in all_words
     ]
-    body_score = guess_body_score(scored_words)
+    body_score = guess_body_score(scored_all_words)
+
+    scored_non_body_words = [(get_heading_score(word), word) for word in non_body_words]
     # ignore all body text
-    scored_words = [
-        (score, word) for score, word in scored_words if score["font"] != body_score
+    scored_non_body_words = [
+        (score, word)
+        for score, word in scored_non_body_words
+        if score["font"] > body_score
     ]
 
-    debug_log("score_words locals:", body_score, scored_words)
+    debug_log("score_words locals:", body_score, scored_non_body_words)
 
-    return scored_words
+    return scored_non_body_words
diff --git a/pdf_scout/tests/input_files.py b/pdf_scout/tests/input_files.py
@@ -0,0 +1,6 @@
+INPUT_FILES = [
+    "./pdf/Law Society of Singapore v Loh Wai Mun Daniel [2004] SGHC 36 - Judgment.pdf",
+    "./pdf/RecordTV Pte Ltd v MediaCorp TV Singapore Pte Ltd and others [2010] SGCA 43 - Judgment.pdf",
+    "./pdf/PUBLIC PROSECUTOR v GCK [2020] SGCA 2 - Judgment.pdf",
+    "./pdf/UQP v UQQ [2019] SGHCF 7 - Judgment.pdf",
+]