#!/usr/bin/env ruby
require "nokogiri"
require "net/http"
require "optparse"
require "fileutils"
require "uri"
require "json"
require "securerandom"
class WebsiteFetcher
  class BadResponseError < StandardError
  end

  def initialize(urls = [], flags = {})
    @urls = urls
    @flags = flags
  end
  def perform
    @urls.each do |url|
      begin
        @parsed_url = parse_url(url)
        @html_body = http_get(@parsed_url)
        metadata_hash = fetch_metadata_and_archive_assets
        save_to_local(html_body: @html_body, metadata: metadata_hash)
      rescue StandardError => e
        $stderr.puts "Processing failed for: #{url}\n"
        $stderr.puts "#{e.message}\n\n"
        next
      end
    end
  end

  private
  #
  # Save the HTML and metadata to the local filesystem
  # @return nil
  #
  def save_to_local(html_body:, metadata:)
    FileUtils.mkdir_p @parsed_url.host
    metadata_path = "#{@parsed_url.host}/metadata.json"
    file_data = []
    file_data = JSON.parse(File.read(metadata_path)) if File.exist?(metadata_path)
    previous_fetch_time = nil
    previous_fetch_time = file_data[-1]["fetch_time"] unless file_data.empty?
    file_data.push metadata
    File.open(metadata_path, "w") do |f|
      f.write(file_data.to_json)
    end
    File.open("#{@parsed_url.host}/#{@parsed_url.host}.html", "w") do |f|
      f.write(html_body)
    end
    if @flags[:show_metadata]
      $stdout.puts metadata_string_builder(
        metadata_hash: metadata,
        previous_fetch_time: previous_fetch_time,
      )
    end
  end
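
  # The resulting on-disk layout, one directory per host (illustrative):
  #
  #   www.google.com/
  #     www.google.com.html   # the fetched page
  #     metadata.json         # appended to on every fetch
  #     images/               # created only when -a / --archive is used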
  #
  # Fetch the response body for the target URL
  # @return [String] response body
  #
  def http_get(url)
    response = Net::HTTP.get_response(url)
    # Anything above 2xx (including redirects) is treated as a failure
    if response.code.to_i > 299
      raise BadResponseError, "Response Status:[#{response.code}] for #{url}"
    end
    response.body
  end
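
  # A minimal sketch of a redirect-following variant, left as a comment since
  # http_get above deliberately treats any non-2xx status (including 3xx) as
  # an error. This helper is hypothetical and not wired into the script:
  #
  #   def http_get_following_redirects(url, limit = 3)
  #     response = Net::HTTP.get_response(url)
  #     if response.is_a?(Net::HTTPRedirection) && limit.positive?
  #       return http_get_following_redirects(URI.parse(response["location"]), limit - 1)
  #     end
  #     response.body
  #   end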
  #
  # Fetch the metadata of the target website and download the assets
  # @return [Hash] metadata
  #
  def fetch_metadata_and_archive_assets
    parsed_source = Nokogiri.HTML(@html_body)
    link_count = parsed_source.xpath("//a[@href]").count
    images = parsed_source.xpath("//img[@src]")
    image_count = images.count
    if @flags[:archive]
      images.each do |tag|
        archive(tag, :src, File.join(@parsed_url.host, "images"))
      end
      # Re-serialize so the saved HTML points at the local asset copies
      @html_body = parsed_source.to_html
    end
    {
      site: @parsed_url.host,
      num_links: link_count,
      images: image_count,
      fetch_time: Time.now.utc.strftime("%a %b %d %Y %H:%M:%S %Z"),
    }
  end
  #
  # Build the printable string for the metadata
  # @return [String] output string
  #
  def metadata_string_builder(metadata_hash:, previous_fetch_time:)
    sb = "======================================================\n"
    metadata_hash.each { |key, value| sb += "#{key}: #{value}\n" }
    sb += "previous_fetch_time: #{previous_fetch_time || "--"}\n"
    sb + "======================================================\n"
  end
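
  # With -m, each fetch prints a block like this (values illustrative):
  #
  #   ======================================================
  #   site: www.google.com
  #   num_links: 35
  #   images: 3
  #   fetch_time: Tue Mar 16 2021 22:59:59 UTC
  #   previous_fetch_time: --
  #   ======================================================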
  #
  # Validate and parse the URL using URI.parse
  # @return [URI] parsed URI
  #
  def parse_url(url)
    parsed_url = URI.parse(url)
    return parsed_url unless parsed_url.host.nil?

    raise URI::InvalidURIError, "Invalid URL: #{url}"
  end
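
  # For example, URI.parse("https://www.google.com/") yields host
  # "www.google.com" and is accepted, while "some nonsense" raises inside
  # URI.parse and "google.com" (no scheme) parses with a nil host, so both
  # are rejected here.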
  ######################
  #### Extra Credit ####
  ######################
  #
  # Archive an asset and rewrite its URL in the HTML to a relative path
  # @return nil
  #
  def archive(html_tag, key, local_directory)
    raw_asset_url = html_tag[key]
    full_asset_url = full_url_for_asset(raw_asset_url)
    relative_path =
      transform_url_to_relative_path(raw_asset_url, local_directory)
    download_assets(full_asset_url, relative_path)
    # Strip the host directory prefix so the path is relative to the saved HTML
    html_tag[key.to_s] = relative_path.partition(
      "#{File.dirname(local_directory)}/",
    ).last
  rescue StandardError => e
    $stderr.puts "Skipping archiving one asset for #{@parsed_url.host}: #{e.message[0, 18]}"
  end
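
  # For example (illustrative): on a page at https://example.com/, an
  # <img src="https://www.example.com/img/logo.png"> is saved under
  # "example.com/images/example.com/img/logo.png" and the tag's src is
  # rewritten to "images/example.com/img/logo.png", which resolves correctly
  # from the saved example.com/example.com.html file.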
  #
  # Download the asset to the local filesystem
  # @return nil
  #
  def download_assets(asset_url, relative_path)
    FileUtils.mkdir_p File.dirname(relative_path)
    asset_uri_object = URI.parse(asset_url)
    unless asset_uri_object.host.nil?
      data = http_get asset_uri_object
      File.open(relative_path, "wb") { |f| f.write(data) } if data
    end
  end
  #
  # Transform the asset URL into a filesystem-safe relative path
  # @return [String] relative path
  #
  def transform_url_to_relative_path(raw_url, local_directory)
    relative_path = raw_url.gsub(%r{^https?://(www\.)?}, "")
    relative_path.gsub!(%r{^[./]+}, "")
    relative_path.gsub!(%r{[^-_./[:alnum:]]}, "_")
    # Fall back to a random name when the path exceeds common filesystem limits
    if relative_path.size > 255
      relative_path = SecureRandom.uuid + File.extname(relative_path)
    end
    File.join(local_directory, relative_path)
  end
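
  # A worked example (illustrative):
  #
  #   transform_url_to_relative_path(
  #     "https://www.example.com/img/logo.png?v=2", "example.com/images"
  #   )
  #   # => "example.com/images/example.com/img/logo.png_v_2"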
  #
  # Resolve the full URL of the asset against the page URL
  # @return [String] absolute asset URL
  #
  def full_url_for_asset(raw_asset_url)
    return raw_asset_url if URI.parse(raw_asset_url).absolute?

    # URI.join resolves relative references (including "../" paths and
    # protocol-relative "//host/..." URLs) against the page URL
    URI.join(@parsed_url.to_s, raw_asset_url).to_s
  end
end
# Driver
if __FILE__ == $0
  flags = {}
  opt = OptionParser.new
  opt.on("-h", "--help", "Help") do
    $stdout.puts <<~HELP
      Supported flags:
        -m / --metadata: Display the metadata of the target website
        -a / --archive: Archive the assets of the target website
        -h / --help: Show all supported flags
    HELP
    exit 0
  end
  opt.on("-m", "--metadata", "Display Metadata") do
    flags[:show_metadata] = true
  end
  opt.on("-a", "--archive", "Archive Assets") { flags[:archive] = true }
  opt.parse!(ARGV)
  if ARGV.count < 1
    $stderr.puts "Please provide the URL of a target website.\nTry \"./fetch https://www.google.com/\""
    exit 1
  end
  WebsiteFetcher.new(ARGV.uniq, flags).perform
end
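
# For reference, each run appends one record to <host>/metadata.json, built by
# fetch_metadata_and_archive_assets. A record looks roughly like this
# (values illustrative):
#
#   {
#     "site": "www.google.com",
#     "num_links": 35,
#     "images": 3,
#     "fetch_time": "Tue Mar 16 2021 22:59:59 UTC"
#   }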