-
Notifications
You must be signed in to change notification settings - Fork 0
/
glossika_pdf.rb
80 lines (56 loc) · 2.21 KB
/
glossika_pdf.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# coding: utf-8
require "pdf-reader"
# One sentence in its three renderings: English, Chinese and pinyin.
#
# Each rendering is collapsed onto a single line at construction time and the
# English and Chinese texts are checked to end with an expected character, so
# that a badly extracted sentence fails loudly instead of slipping through.
class SentencePair
  # Characters an English sentence may end with. Some sentences legitimately
  # end with unusual characters (closing bracket, quote, comma).
  ENGLISH_PUNCTUATION = ["!", "?", ".", "]", ")", "\"", ","].freeze

  # Characters a Chinese sentence may end with. Both the full-width and the
  # ASCII forms of the exclamation and question marks are accepted, because
  # the English question mark is sometimes used in the Chinese text.
  # "蓋" is carried over unchanged from the original list.
  # NOTE(review): presumably some sentence really ends with "蓋" — confirm.
  CHINESE_PUNCTUATION = ["！", "？", "。", "」", "!", "?", "蓋"].freeze

  attr_reader :english, :chinese, :pinyin

  # @param english [String] English text, possibly spanning several lines
  # @param chinese [String] Chinese text, possibly spanning several lines
  # @param pinyin [String] pinyin text, possibly spanning several lines
  # @raise [RuntimeError] if the English or Chinese text does not end with
  #   one of the accepted characters
  def initialize(english, chinese, pinyin)
    @english = single_line(english)
    @chinese = single_line(chinese)
    @pinyin = single_line(pinyin)

    # Some sentences forget the period outright.
    @english += "." if @english =~ /[a-z]\z/

    # English is validated first so its error takes precedence.
    ensure_ending("english", @english, ENGLISH_PUNCTUATION)
    ensure_ending("chinese", @chinese, CHINESE_PUNCTUATION)
  end

  private

  # Trims surrounding whitespace and replaces inner newlines with spaces.
  def single_line(text)
    text.strip.tr("\n", " ")
  end

  # Raises unless the last character of +sentence+ is listed in +allowed+.
  # An empty sentence has no last character and therefore also raises.
  def ensure_ending(language, sentence, allowed)
    last_char = sentence[-1]
    return if allowed.include?(last_char)

    raise "unexpected #{language} punctuation: #{last_char} in sentence: #{sentence}"
  end
end
# Extracts English / Chinese / pinyin sentence triples from a range of pages
# of a PDF, relying on the "EN", "繁", "PIN" and "IPA" markers found in the
# page text.
class GlossikaPDFParser
  ZH_DELIMITER = "繁".freeze

  # English text: everything after an "EN" marker (but not "ENZT") up to a
  # newline that is followed by a non-word character.
  ENGLISH_PATTERN = /EN(?!ZT)\s*?(.+?)\n[^\w]/m
  # Chinese text: everything between the Chinese marker and the "PIN" line.
  CHINESE_PATTERN = /#{ZH_DELIMITER}\s*?(.+?)\nPIN/m
  # Pinyin text: everything between the "PIN" marker and the "IPA" line.
  PINYIN_PATTERN = /PIN\s*?(.+?)\nIPA/m

  # @param path [String] location of the PDF, handed to PDF::Reader.new
  # @param start_page [Integer] first index into the reader's page list
  # @param end_page [Integer] last index into the reader's page list (inclusive)
  # @raise [ArgumentError] if start_page is greater than end_page
  def initialize(path, start_page, end_page)
    raise ArgumentError, "start_page must be <= end_page" unless start_page <= end_page

    @reader = PDF::Reader.new(path)
    @start_page = start_page
    @end_page = end_page
  end

  # Lazily walks the configured pages and yields one SentencePair per
  # sentence found. Nothing is read until the enumerator is consumed.
  #
  # @return [Enumerator] yielding SentencePair instances in page order
  # @raise [IndexError] (on consumption) if a page index does not exist
  # @raise [RuntimeError] (on consumption) if a page yields no sentences or
  #   differing numbers of English, Chinese and pinyin fragments
  def sentence_pairs
    Enumerator.new do |enum|
      # Ask for the page list once per enumeration instead of once per page.
      # NOTE(review): PDF::Reader#pages appears to rebuild the whole list on
      # every call — confirm against the pdf-reader version in use.
      pages = @reader.pages

      (@start_page..@end_page).each do |page_number|
        page = pages.fetch(page_number) do
          raise IndexError, "page index #{page_number} is out of range (document has #{pages.size} pages)"
        end

        extract_triples(compact_text(page)).each do |english, chinese, pinyin|
          enum.yield SentencePair.new(english, chinese, pinyin)
        end
      end
    end
  end

  private

  # Page text with every line trimmed and blank lines dropped.
  def compact_text(page)
    page.text.lines.map(&:strip).reject(&:empty?).join("\n")
  end

  # Returns [english, chinese, pinyin] triples found in +text+, raising when
  # the three marker patterns disagree on the count or match nothing.
  def extract_triples(text)
    english = text.scan(ENGLISH_PATTERN).flatten
    chinese = text.scan(CHINESE_PATTERN).flatten
    pinyin = text.scan(PINYIN_PATTERN).flatten
    sizes = "#{english.size}, #{chinese.size}, #{pinyin.size}"

    unless english.size == chinese.size && chinese.size == pinyin.size
      raise "bad sizes: #{sizes} original text:\n\n#{text}"
    end
    # All three sizes are equal here, so checking one of them is enough.
    raise "unexpected 0 value: #{sizes} original text:\n\n#{text}" if english.empty?

    english.zip(chinese, pinyin)
  end
end