-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhtml2txt.rb
85 lines (67 loc) · 1.82 KB
/
html2txt.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#!/usr/bin/env ruby
require 'rubygems'
require 'nokogiri'
require 'optparse'
require 'pathname'
# Command-line option definitions.
#
# Parsed values are published through globals read by the rest of the script:
#   $output_file    - output file name template (nil means standard output)
#   $input_encoding - encoding name handed to the HTML parser
#   $append_mode    - true when output files must be appended to
optparse = OptionParser.new do |opts|
  opts.banner = 'Usage: html2txt.rb [options] files'

  # Printed after the generated option summary by the help option.
  footer = <<EOF
Example: html2txt.rb -e utf -a out.txt 001.htm 002.htm
EOF

  opts.on('-h', '-?', '--help', 'display this screen') do
    puts opts
    puts
    puts footer
    exit
  end

  help = ['output to file(s)',
          'If file name is not given or given as _, input file name',
          'is used to generate output file name.']
  opts.on('-o', '--output [FILE]', *help) do |filename|
    # FILE is optional; "_" is the placeholder for the input file name.
    $output_file = filename || '_'
  end

  opts.on('-e', '--encoding ENCODING', 'input encoding') do |encoding|
    $input_encoding = encoding
  end

  opts.on('-a', '--append [FILE]', 'append to output file') do |filename|
    $append_mode = true
    # FILE is optional here too: when it is omitted, keep whatever an earlier
    # -o already selected instead of resetting the output file to nil.
    $output_file = filename || $output_file
  end
end

# Strips the recognised options from ARGV, leaving only the input file names.
optparse.parse!
# Running count of converted documents; incremented once per html2txt call.
$filenum = 0

# A run of "#" in the output file name is a placeholder for the file number.
# Its length fixes the zero-padded width, e.g. "###" yields "%03d".
hash_run = $output_file.to_s[/#+/]
$output_num_format = "%0#{hash_run.length}d" if hash_run
# True when the output file name carries no "#" number placeholder
# ($output_num_format is unset), false otherwise.
def is_single_output_file
  $output_num_format ? false : true
end
# Convert one HTML document to plain text and emit it.
#
# Parses +io+ with Nokogiri (using $input_encoding) and takes the text of the
# <body> element. Without $output_file the text is printed to standard output.
# Otherwise it is written to a file whose name is built from the $output_file
# template:
#   * every "_" is replaced by the base name of +src_filename+;
#   * ".txt" is appended when the result has no extension;
#   * when $output_num_format is set, the first run of "#" is replaced by the
#     formatted running file number ($filenum).
# The file is appended to when $append_mode is set, or when this is not the
# first document and all documents share one un-numbered output name.
#
# io           - IO-like object holding the HTML input.
# src_filename - name of the input file; may be nil (e.g. for STDIN), in
#                which case a placeholder name is used. Defaults to nil so the
#                method can be called with the stream alone.
#
# Increments $filenum on every call. Returns nil.
def html2txt(io, src_filename = nil)
  $filenum += 1
  doc = Nokogiri::HTML(io, nil, $input_encoding)
  txt = doc.css('body').text

  unless $output_file
    puts txt
    return
  end

  src_base_name = File.basename(src_filename || '-.', '.*')
  # Block form: the base name is inserted literally. A replacement *string*
  # would treat backslash sequences in the name (e.g. "\0") as back-references.
  output_file = $output_file.gsub('_') { src_base_name }
  output_file += '.txt' if File.extname(output_file) == ''
  if $output_num_format
    output_file = output_file.sub(/(#+)/, $output_num_format % $filenum)
  end

  append_to_file = $append_mode || ($filenum > 1 && is_single_output_file)
  # File.open never interprets a leading "|" as a command (Kernel#open does),
  # and the block form closes the handle even if writing raises.
  File.open(output_file, append_to_file ? 'a' : 'w') { |file| file.puts txt }
end
# Entry point: convert every file named on the command line, or standard
# input when no file arguments are left in ARGV.
if ARGV.empty?
  # There is no source file name for STDIN; nil is passed explicitly because
  # html2txt takes the name as its second argument.
  html2txt STDIN, nil
else
  ARGV.each do |filename|
    # File.open (unlike Kernel#open) never treats a leading "|" in a
    # user-supplied argument as a command to run, and the block form closes
    # the file even if the conversion raises.
    File.open(filename) { |file| html2txt file, filename }
  end
end