forked from petewarden/dstk
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cruftstripper.rb
executable file
·233 lines (198 loc) · 6.62 KB
/
cruftstripper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#***********************************************************************************
#
# This module looks for strings that have the characteristics of English-language
# sentences. This means consisting of a series of space-separated words, starting
# with a capital letter, ending with a period, etc. It strips out any strings that
# don't match these patterns, and returns the result.
#
# All code (C) Pete Warden, 2011
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#***********************************************************************************
# Diagnostic hook used while classifying text. Tracing is currently switched
# off: the message is discarded and nil is returned. Re-enable the commented
# line below to send messages to STDERR.
def debug_log(message)
  # printf(STDERR, message+"\n")
  nil
end
# Strips out the lines of +input+ that do not look like English-language
# sentences and returns what is left.
#
# Processing happens in two passes:
#
# 1. Every line is split into candidate sentences on '.', '?' or '!' followed
#    by a non-alphanumeric character. A candidate is accepted when its first
#    letter is uppercase, it has at least 'min_words_in_sentence'
#    space-separated words, and more than 'words_threshold' of its word
#    characters belong to plausible words (only letters and a little
#    punctuation; one- and two-letter words must be in a list of common short
#    words). A line is marked as content when more than 'sentences_threshold'
#    of its sentence characters belong to accepted candidates.
# 2. Content lines are gathered into paragraphs. A content line contributes
#    when it holds at least 'min_sentences_in_paragraph' sentences or ends
#    with a period. Gathered text is appended to the result (followed by a
#    newline) once the running sentence count reaches
#    'min_sentences_in_paragraph'. A non-content line ends the paragraph; if
#    that paragraph had already been emitted, an extra newline is appended.
#
# @param input [String] the text to filter, one candidate line per "\n".
# @param input_settings [Hash] optional overrides keyed by the strings
#   'words_threshold', 'sentences_threshold', 'min_words_in_sentence' and
#   'min_sentences_in_paragraph'; any other keys are ignored.
# @return [String] the retained lines, each followed by a space and newline.
def strip_nonsentences(input, input_settings = { })
  # One- and two-letter words that are accepted as real words.
  common_short_words = %w[
    a i ah an as at ax be by do ex go ha he hi id if in is it ma me my
    no of oh on or ox pa so to uh um un up us we
  ].each_with_object({}) { |short_word, lookup| lookup[short_word] = true }

  default_settings = {
    'words_threshold' => 0.75,
    'sentences_threshold' => 0.5,
    'min_words_in_sentence' => 4,
    'min_sentences_in_paragraph' => 2
  }

  # Only recognised keys are honoured; everything else falls back to defaults.
  settings = {}
  default_settings.each do |key, default_value|
    settings[key] = input_settings.has_key?(key) ? input_settings[key] : default_value
  end

  # Pass 1: decide, line by line, whether the line looks like prose.
  result_lines = []
  input.split("\n").each do |line|
    sentences = line.split(/[.?!][^a-zA-Z0-9]/)

    # Go through all the 'sentences' and see which ones look valid
    sentences_length = 0
    sentences_matches = 0
    sentences_count = 0
    sentences.each do |sentence|
      sentence.strip!
      sentences_length += sentence.length

      # Is this an empty sentence?
      next if sentence.empty?

      # Does this sentence start with a capital letter?
      first_char_match = sentence.match(/[a-zA-Z]/)
      unless first_char_match
        debug_log(sentence+' - no characters found')
        next
      end
      # Compare the matched letter itself: a MatchData object cannot be
      # tested with =~ (always nil on old Rubies, NoMethodError on 3.2+).
      first_letter = first_char_match[0]
      if first_letter =~ /[a-z]/
        debug_log(sentence+' - first character isn\'t uppercase - '+first_letter)
        next
      end

      # Split sentence by spaces
      words = sentence.split(/[ ]/)

      # Is this too short to be a sentence?
      if words.length < settings['min_words_in_sentence']
        debug_log(sentence+' - too few words in sentence: '+words.length.to_s+' - '+words.inspect)
        next
      end

      # Go through all the entries and see which ones look like real words.
      # Both totals are measured in characters, so long words weigh more.
      words_length = 0
      words_matches = 0
      words.each do |word|
        words_length += word.length

        # Not all letters?
        if word =~ /[^a-zA-Z\-\'"\.,]/
          debug_log(word+' not all letters')
          next
        end

        # Is it a short word, that isn't common?
        if word.length < 3 && !common_short_words.has_key?(word.downcase)
          debug_log(word+' short, and not common')
          next
        end

        words_matches += word.length
      end

      # No words found?
      if words_length == 0
        debug_log(sentence+' - no words found')
        next
      end

      # Were there enough valid words to mark this as a sentence?
      words_ratio = words_matches / words_length.to_f
      if words_ratio > settings['words_threshold']
        sentences_matches += sentence.length
        sentences_count += 1
      else
        debug_log(sentence + ' - words ratio too low: '+words_ratio.to_s)
      end
    end

    result_line = { 'line' => line }
    if sentences_length == 0
      # No sentences found
      result_line['is_sentence'] = false
    else
      # Were there enough valid sentences to mark this line as content?
      sentences_ratio = sentences_matches / sentences_length.to_f
      if sentences_ratio > settings['sentences_threshold']
        result_line['is_sentence'] = true
        result_line['sentences_count'] = sentences_count
        # Match index (truthy, even when 0) or nil.
        result_line['ends_with_period'] = (line =~ /\.[^a-zA-Z]*$/)
      else
        result_line['is_sentence'] = false
        debug_log(line + ' - sentences ratio too low: '+sentences_ratio.to_s)
      end
    end
    result_lines.push(result_line)
  end

  # Pass 2: gather content lines into paragraphs and emit the ones that
  # contain enough sentences.
  result = ''
  found_sentences_count = 0
  found_sentences = ''
  result_lines.each do |result_line|
    if !result_line['is_sentence']
      # A non-content line closes the current paragraph.
      if found_sentences_count >= settings['min_sentences_in_paragraph']
        result += found_sentences + "\n"
        debug_log(found_sentences+' - found '+found_sentences_count.to_s)
      else
        debug_log(found_sentences+' - not enough sentences in paragraph: '+found_sentences_count.to_s)
      end
      found_sentences_count = 0
      found_sentences = ''
    else
      sentences_count = result_line['sentences_count']
      has_enough_sentences = sentences_count >= settings['min_sentences_in_paragraph']
      ends_with_period = result_line['ends_with_period']
      if has_enough_sentences || ends_with_period
        found_sentences += result_line['line'].strip+' '
        found_sentences_count += sentences_count
      else
        debug_log(result_line['line']+' - skipping, not enough sentences: '+sentences_count.to_s)
      end
    end

    # Flush as soon as the paragraph qualifies. The count is deliberately not
    # reset here, so later lines of the same paragraph are emitted as they come.
    if found_sentences_count >= settings['min_sentences_in_paragraph']
      result += found_sentences + "\n"
      found_sentences = ''
    end
  end

  result
end
#input = $stdin.read
#output = strip_nonsentences(input)
#puts output