Skip to content

Commit

Permalink
chore: move bookmarks-related code to a separate file
Browse files Browse the repository at this point in the history
  • Loading branch information
hueyy committed Oct 9, 2022
1 parent 8f81133 commit 7b2f1c8
Show file tree
Hide file tree
Showing 8 changed files with 8,958 additions and 11,107 deletions.
130 changes: 27 additions & 103 deletions pdf_scout/app.py
Original file line number Diff line number Diff line change
@@ -1,125 +1,50 @@
from numbers import Number
from pdf_scout.extract import extract_all_words
from pdf_scout.scoring import score_words
from pdf_scout.logger import debug_log
from PyPDF2 import PdfMerger
from pdf_scout.bookmarks import generate_bookmarks, write_bookmarks
from pdf_scout.types import Word
from time import time
from typing import Any, List, Tuple, TypedDict
from typing import List, Tuple
from operator import itemgetter
import pdfplumber
import typer


class Bookmark(TypedDict):
    """Outline entry consumed by ``write_bookmarks``.

    ``page_number`` is 1-based: the writer subtracts 1 from it before handing
    it to ``add_outline_item``, and it is also used (minus 1) as an index into
    ``pdf_file.pages``. It therefore has to be an integer, not a string.
    """

    title: str  # text shown for the outline item
    page_number: int  # 1-based page index (previously mis-annotated as ``str``)
    scroll_distance: Number  # passed as the argument of the "/FitH" fit mode

def main(
input_file_path: str,
output_file_path: str = typer.Argument(""),
levels: int = typer.Argument(3),
):
start_time = time()

def write_bookmarks(
input_path: str, output_path: str, bookmarks: List[Tuple[int, Bookmark]]
) -> None:
merger = PdfMerger()
merger.append(input_path, import_outline=False) # disregard existing outline
if input_file_path is None or len(input_file_path) == 0:
print("Error: file_path not provided")
raise typer.Exit(code=1)

parent_bookmarks: List[Tuple[int, Any]] = []
# last item in list is last outline item added
if len(output_file_path) == 0:
input_path_start, _ = input_file_path.split(".pdf")
output_file_path = f"{input_path_start}-out.pdf"

add_bookmark_to_writer = lambda writer, bookmark, parent: writer.add_outline_item(
bookmark["title"],
bookmark["page_number"] - 1,
parent,
None,
False,
False,
"/FitH",
bookmark["scroll_distance"],
)
get_last_bookmark = (
lambda parent_bs: parent_bs[-1][1] if len(parent_bs) >= 1 else None
pdf_file = pdfplumber.open(input_file_path)
all_words, non_body_words = itemgetter("all_words", "non_body_words")(
extract_all_words(pdf_file)
)

for rank, bookmark in bookmarks:
add_bookmark = lambda p: add_bookmark_to_writer(merger, bookmark, p)

debug_log("Rank: ", rank, bookmark["title"])

if len(parent_bookmarks) == 0:
new_bookmark = add_bookmark(None)
else:
last_rank, last_bookmark = parent_bookmarks[-1]
if last_rank < rank:
new_bookmark = add_bookmark(last_bookmark)
else:
parent_bookmarks.pop()
if last_rank == rank:
parent_bookmark = get_last_bookmark(parent_bookmarks)
new_bookmark = add_bookmark(parent_bookmark)
elif last_rank > rank:
rank_difference = last_rank - rank
for _ in range(rank_difference):
if len(parent_bookmarks) >= 1:
parent_bookmarks.pop()
parent_bookmark = get_last_bookmark(parent_bookmarks)
new_bookmark = add_bookmark(parent_bookmark)
parent_bookmarks.append((rank, new_bookmark))

merger.write(output_path)
merger.close()


def open_pdf_file(input_path: str) -> pdfplumber.PDF:
    """Open the PDF located at ``input_path`` with pdfplumber and return it."""
    opened_pdf = pdfplumber.open(input_path)
    return opened_pdf


def add_bookmarks_to_pdf(input_path: str, output_path: str = "", levels: int = 3):
if len(output_path) == 0:
input_path_start, _ = input_path.split(".pdf")
output_path = f"{input_path_start}-out.pdf"

pdf_file = open_pdf_file(input_path)
all_words, non_body_words = extract_all_words(pdf_file)
scored_words = score_words(all_words, non_body_words)

top_scores: List[Number] = sorted(
list(set([score["overall"] for score, _ in scored_words])), reverse=True
)[0:levels]
top_scored_words = [
[top_scores.index(score["overall"]), word]
all_scores = list(set([score["overall"] for score, _ in scored_words]))
all_scores.sort(reverse=True)
top_scores: List[Number] = all_scores[0:levels]
top_scored_words: List[Tuple[int, Word]] = [
(top_scores.index(score["overall"]), word)
for score, word in scored_words
if score["overall"] in top_scores
]

bookmarks: List[Tuple[int, Bookmark]] = [
(
rank,
dict(
title=word["text"],
page_number=word["page_number"],
scroll_distance=(
pdf_file.pages[word["page_number"] - 1].height
- word["top"]
+ word["bottom"]
- word["top"]
),
),
)
for rank, word in top_scored_words
]

debug_log("add_bookmarks_to_pdf locals: ", locals())

write_bookmarks(input_path, output_path, bookmarks)

bookmarks = generate_bookmarks(pdf_file, top_scored_words)
pdf_file.close()

write_bookmarks(input_file_path, output_file_path, bookmarks)

def main(input_file_path: str, output_file_path: str = typer.Argument("")):
start_time = time()

if input_file_path is None or len(input_file_path) == 0:
print("Error: file_path not provided")
raise typer.Exit(code=1)
add_bookmarks_to_pdf(input_file_path, output_file_path)
end_time = time()
print(f"Finished in {end_time - start_time}s")

Expand All @@ -129,5 +54,4 @@ def start():


if __name__ == "__main__":
main("./pdf/PDPA Key Concepts.pdf", "")
# start()
start()
83 changes: 83 additions & 0 deletions pdf_scout/bookmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from PyPDF2 import PdfMerger
from typing import Any, List, Tuple
from pdf_scout.logger import debug_log
from pdf_scout.types import Word, Bookmark
import pdfplumber


def write_bookmarks(
    input_path: str, output_path: str, bookmarks: List[Tuple[int, Bookmark]]
) -> None:
    """Write a copy of ``input_path`` to ``output_path`` with a new outline.

    Args:
        input_path: PDF to read; its existing outline is not imported.
        output_path: Destination of the PDF carrying the new outline.
        bookmarks: ``(rank, bookmark)`` pairs in document order. A lower rank
            is a higher-level heading; each bookmark is nested under the
            closest preceding bookmark whose rank is strictly lower, or placed
            at the top level when there is none.
    """
    merger = PdfMerger()
    try:
        merger.append(input_path, import_outline=False)  # disregard existing outline

        # Stack of (rank, outline item) for the chain of ancestors that is
        # still "open"; the last entry is the deepest one.
        open_ancestors: List[Tuple[int, Any]] = []

        for rank, bookmark in bookmarks:
            debug_log("Rank: ", rank, bookmark["title"])

            # Drop siblings and deeper items. Comparing ranks (instead of
            # popping a fixed "rank difference" number of entries) keeps the
            # right ancestor when heading levels are skipped, e.g. 0 -> 2 -> 1
            # must nest the rank-1 item under the rank-0 item.
            while open_ancestors and open_ancestors[-1][0] >= rank:
                open_ancestors.pop()
            parent = open_ancestors[-1][1] if open_ancestors else None

            new_bookmark = merger.add_outline_item(
                bookmark["title"],
                bookmark["page_number"] - 1,  # page_number is 1-based
                parent,
                # Next three positionals: color, bold, italic per the
                # PyPDF2 2.x signature -- confirm against the pinned version.
                None,
                False,
                False,
                "/FitH",
                bookmark["scroll_distance"],
            )
            open_ancestors.append((rank, new_bookmark))

        merger.write(output_path)
    finally:
        # Release the merger's file handles even when reading, outline
        # construction or writing fails.
        merger.close()


def generate_bookmarks(
    pdf_file: pdfplumber.PDF, top_scored_words: List[Tuple[int, Word]]
) -> List[Tuple[int, Bookmark]]:
    """Turn ranked heading words into ``(rank, bookmark)`` pairs.

    Args:
        pdf_file: Open pdfplumber document; only page heights are read.
        top_scored_words: ``(rank, word)`` pairs; each word must provide
            ``text``, ``page_number`` (1-based), ``top`` and ``bottom``.

    Returns:
        One ``(rank, bookmark)`` pair per input pair, in the same order.
    """
    bookmarks: List[Tuple[int, Bookmark]] = [
        (
            rank,
            dict(
                title=word["text"],
                page_number=word["page_number"],
                # Page height minus the word's top offset, plus the word's own
                # height (bottom - top). Presumably this converts pdfplumber's
                # top-down offset into a bottom-up PDF coordinate with one
                # line of headroom -- TODO confirm.
                scroll_distance=(
                    pdf_file.pages[word["page_number"] - 1].height
                    - word["top"]
                    + word["bottom"]
                    - word["top"]
                ),
            ),
        )
        for rank, word in top_scored_words
    ]

    # Label matches this function; the old text still named the function the
    # code was moved out of, which made debug output misleading.
    debug_log("generate_bookmarks locals: ", locals())

    return bookmarks
31 changes: 10 additions & 21 deletions pdf_scout/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from operator import itemgetter
from typing import List, Tuple
from pdf_scout.logger import debug_log
from pdf_scout.types import RawWord, Word
from pdf_scout.types import RawWord, Word, DocumentWords
import statistics
import pdfplumber

Expand Down Expand Up @@ -33,6 +33,10 @@ def guess_left_margin(words) -> List[Number]:
)


def get_header_words(all_words: List[Word]):
    """Placeholder: ignores ``all_words`` and returns ``None`` for now."""
    return None


def add_line_spacing_to_words(
pdf_file: pdfplumber.PDF, all_words: List[RawWord]
) -> List[Word]:
Expand Down Expand Up @@ -107,25 +111,7 @@ def raw_extract_words(pdf_file: pdfplumber.PDF) -> List[RawWord]:
return all_words


def extract_all_words(pdf_file: pdfplumber.PDF) -> Tuple[List[Word], List[Word]]:
"""
Returns a list of dicts something like
{
'text': 'Law Society of Singapore v Loh Wai Mun Daniel[2004] SGHC 36',
'x0': 164.60769147751603,
'x1': 430.66493976121933,
'top': 56.17809901052692,
'doctop': 56.17809901052692,
'bottom': 85.4280988218394,
'upright': True, 'direction': 1,
'fontname': 'QDBAAA+ArialRegular',
'size': 14.249999908075324,
'page_number': 1,
'top_spacing': 56.18,
'bottom_spacing': 35.35
}
"""

def extract_all_words(pdf_file: pdfplumber.PDF) -> DocumentWords:
raw_words = raw_extract_words(pdf_file)
all_words_with_line_spacing = add_line_spacing_to_words(pdf_file, raw_words)

Expand Down Expand Up @@ -155,4 +141,7 @@ def extract_all_words(pdf_file: pdfplumber.PDF) -> Tuple[List[Word], List[Word]]

debug_log("extract_all_words locals: ", locals())

return all_words_with_line_spacing, non_body_words_with_line_spacing
return {
"all_words": all_words_with_line_spacing,
"non_body_words": non_body_words_with_line_spacing,
}
14 changes: 3 additions & 11 deletions pdf_scout/scoring.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from numbers import Number
from pdf_scout.logger import debug_log
from pdf_scout.types import Word
from typing import List, Tuple, TypedDict
from pdf_scout.types import Word, HeadingScore
from typing import List, Tuple
import re
import statistics

Expand Down Expand Up @@ -35,14 +35,6 @@ def score_word_length(length: int) -> Number:
return STARTING_SCORE * length / MIN_THRESHOLD


# Scores attached to a single word when judging whether it is a heading.
# "overall" is the value callers rank words by; the remaining keys are
# presumably the per-criterion sub-scores it is derived from -- confirm
# against get_heading_score.
HeadingScore = TypedDict(
    "HeadingScore",
    {
        "font_name": Number,
        "font_size": Number,
        "word_length": Number,
        "font": Number,
        "overall": Number,
    },
)


def get_heading_score(word: Word) -> HeadingScore:
font_name: str = word["fontname"]
font_size: Number = word["size"]
Expand All @@ -68,7 +60,7 @@ def guess_body_score(word_list: Tuple[HeadingScore, Word]) -> Number:

def score_words(
all_words: List[Word], non_body_words: List[Word]
) -> Tuple(Number, Word):
) -> List[Tuple[HeadingScore, Word]]:
scored_all_words: List[Tuple[HeadingScore, Word]] = [
(get_heading_score(word), word) for word in all_words
]
Expand Down
Loading

0 comments on commit 7b2f1c8

Please sign in to comment.