Commit
* added a crawler
* added TODOs
* minor changes
* Added Allow rules in robots.txt
* added xml parser to extract urls in sitemap.xml
1 parent: a700640
Commit: 5677982
Showing 6 changed files with 325 additions and 17 deletions.
@@ -1 +1,10 @@
using HTTP, Gumbo, AbstractTrees, URIs
using Gumbo: HTMLDocument, HTMLElement
using EzXML
# using Regex

# using Robots

include("parser.jl")
include("crawl.jl")
include("extract_urls.jl")
@@ -0,0 +1,134 @@
include("parser.jl") | ||
|
||
## TODO: Make multiple dispatch for the following function | ||
function parse_robots_txt!(robots_txt::String, url_queue::Vector{<:AbstractString}) | ||
## TODO: Make a cache of rules for a quick lookup | ||
rules = Dict{String,Dict{String,Vector{String}}}() | ||
current_user_agent = "" | ||
|
||
for line in split(robots_txt, '\n') | ||
line = strip(line) | ||
if startswith(line, "User-agent:") | ||
current_user_agent = strip(split(line, ":")[2]) | ||
if !haskey(rules, current_user_agent) | ||
rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}()) | ||
end | ||
elseif startswith(line, "Disallow:") | ||
disallow_path = strip(split(line, ":")[2]) | ||
if current_user_agent != "" && disallow_path != "" | ||
push!(rules[current_user_agent]["Disallow"], disallow_path) | ||
end | ||
elseif startswith(line, "Allow:") | ||
allow_path = strip(split(line, ":")[2]) | ||
if current_user_agent != "" && allow_path != "" | ||
push!(rules[current_user_agent]["Allow"], allow_path) | ||
end | ||
elseif startswith(line, "Sitemap:") | ||
url = strip(split(line, ":")[2]) | ||
push!(url_queue, url) | ||
end | ||
|
||
end | ||
return rules | ||
end | ||
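# Illustrative example (assumed input, not from the repository): for a robots.txt containing
#   User-agent: *
#   Disallow: /private/
#   Allow: /private/docs
#   Sitemap: https://example.com/sitemap.xml
# parse_robots_txt! returns
#   Dict("*" => Dict("Disallow" => ["/private/"], "Allow" => ["/private/docs"]))
# and pushes "https://example.com/sitemap.xml" onto url_queue.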


function check_robots_txt(user_agent::AbstractString,
    url::AbstractString,
    restricted_urls::Dict{String,Set{AbstractString}},
    url_queue::Vector{<:AbstractString})

    # parsed_uri avoids shadowing the URI type exported by URIs
    parsed_uri = URIs.URI(url)
    path = parsed_uri.path
    if haskey(restricted_urls, url)
        if in(path, restricted_urls[url])
            println("Not allowed to crawl $url")
            return false
        else
            return true
        end
    end

    robots_URL = string(parsed_uri.scheme, "://", parsed_uri.host, "/robots.txt")
    try
        response = HTTP.get(robots_URL)
        robots_txt = String(response.body)
        rules = parse_robots_txt!(robots_txt, url_queue)
        user_agents = [user_agent, "*"]
        for ua in user_agents
            if haskey(rules, ua)
                allow_rules = rules[ua]["Allow"]
                disallow_rules = rules[ua]["Disallow"]

                for allow_rule in allow_rules
                    if startswith(path, allow_rule)
                        return true
                    end
                end

                for disallow_rule in disallow_rules
                    if startswith(path, disallow_rule)
                        println("Not allowed to crawl $url")
                        return false
                    end
                end
            end
        end
        return true
    catch
        println("robots.txt unavailable for $url")
        return true
    end
end


"""
    get_base_url(url::AbstractString)

Extracts the base url.

# Arguments
- `url`: The url string of which the base url needs to be extracted
"""
function get_base_url(url::AbstractString)
    parsed_url = URIs.URI(url)
    # URIs.jl reports the port as a string ("" when absent), so check for emptiness
    # and prefix it with ":" when present.
    base_url = string(parsed_url.scheme, "://", parsed_url.host,
        isempty(parsed_url.port) ? "" : ":" * string(parsed_url.port), parsed_url.path)
    return base_url
end
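# Illustrative call (assumed URL): query string and fragment are dropped, keeping
# only scheme, host, optional port, and path.
#   get_base_url("https://docs.julialang.org/en/v1/manual/strings/#man-strings")
#   # -> "https://docs.julialang.org/en/v1/manual/strings/"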


"""
    makeRAG(input_urls::Vector{<:AbstractString})

Crawls the given URLs (and the same-host pages they link to) and returns the parsed blocks.

# Arguments
- `input_urls`: vector containing URL strings to parse
"""
function makeRAG(input_urls::Vector{<:AbstractString})
    url_queue = Vector{AbstractString}(input_urls)
    visited_url_set = Set{AbstractString}()
    restricted_urls = Dict{String,Set{AbstractString}}()
    parsed_blocks = []
    ## TODO: Add parallel processing for URLs

    while !isempty(url_queue)
        url = popfirst!(url_queue)
        base_url = get_base_url(url)

        ## TODO: Show some respect to robots.txt
        if !in(base_url, visited_url_set)
            push!(visited_url_set, base_url)
            if !check_robots_txt("*", base_url, restricted_urls, url_queue)
                # skip disallowed URLs instead of aborting the whole crawl
                continue
            end
            get_urls!(base_url, url_queue)
            push!(parsed_blocks, parse_url_to_blocks(base_url))
        end
    end
    return parsed_blocks
end
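For orientation, a minimal usage sketch of the entry point added in this file; the seed URL is an assumption chosen for illustration, not something taken from the repository:

# Assumes the using/include file at the top of this commit has been loaded.
input_urls = ["https://docs.julialang.org/en/v1/"]   # illustrative seed URL
parsed_blocks = makeRAG(input_urls)                  # crawls same-host pages, checking robots.txt first
@info "Collected $(length(parsed_blocks)) parsed blocks"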
@@ -0,0 +1,139 @@
# Temporary until I find a package to simplify this

function resolve_url(base_url::String, relative_url::String)::String
    base_uri = URI(base_url)
    relative_uri = URI(relative_url)

    ## TODO: Make a list of allowed URLs which would contain Julia docs hostnames
    ## TODO: Look for version number either on the bottom left dropdown or identify on the url

    # Absolute URLs are kept only if they stay on the same host
    if startswith(relative_url, "http")
        if base_uri.host == relative_uri.host
            return relative_url
        end
        return ""
    end
    # Fragment-only links point back to the same page, so drop them
    if !isempty(relative_url) && relative_url[1] == '#'
        return ""
    end

    # Root-relative links reuse the base URL's scheme/host/port
    if !isempty(relative_uri.path) && relative_uri.path[1] == '/'
        resolved_uri = URI(
            scheme=base_uri.scheme,
            userinfo=base_uri.userinfo,
            host=base_uri.host,
            port=base_uri.port,
            path=relative_uri.path,
            query=relative_uri.query,
            fragment=relative_uri.fragment
        )
        return string(resolved_uri)
    end

    # Split the paths into segments
    base_segments = split(base_uri.path, "/")
    base_segments = filter((i) -> i != "", base_segments)

    relative_segments = split(relative_uri.path, "/")
    relative_segments = filter((i) -> i != "", relative_segments)

    # Process the relative segments, resolving "." and ".."
    for segment in relative_segments
        if segment == ".."
            if !isempty(base_segments)
                pop!(base_segments)
            end
        elseif segment != "."
            push!(base_segments, segment)
        end
    end

    # Construct the new path
    resolved_path = "/" * join(base_segments, "/")

    # Create the resolved URI
    resolved_uri = URI(
        scheme=base_uri.scheme,
        userinfo=base_uri.userinfo,
        host=base_uri.host,
        port=base_uri.port,
        path=resolved_path,
        query=relative_uri.query,
        fragment=relative_uri.fragment
    )
    return string(resolved_uri)
end
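# Illustrative calls (assumed URLs, approximate outputs):
#   resolve_url("https://docs.julialang.org/en/v1/manual/strings/", "../arrays/")
#   # -> "https://docs.julialang.org/en/v1/manual/arrays"
#   resolve_url("https://docs.julialang.org/en/v1/", "https://example.com/page")
#   # -> "" (different host, so the link is dropped)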


"""
    find_urls_html!(url::AbstractString,
                    node::Gumbo.HTMLElement,
                    url_queue::Vector{<:AbstractString})

Recursively finds <a> tags and extracts their urls.

# Arguments
- `url`: The initial input URL
- `node`: The HTML node of type Gumbo.HTMLElement
- `url_queue`: Vector in which extracted URLs will be appended
"""
function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString})
    if Gumbo.tag(node) == :a && haskey(node.attributes, "href")
        href = node.attributes["href"]
        if href !== nothing && !isempty(resolve_url(url, href))
            push!(url_queue, resolve_url(url, href))
        end
    end

    for child in node.children
        if isa(child, HTMLElement)
            find_urls_html!(url, child, url_queue)
        end
    end
end


function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})
    try
        fetched_content = HTTP.get(url)
        xml_content = String(fetched_content.body)
        # Pull every URL out of the sitemap; anything up to the next "<" closes the value
        url_pattern = r"http[^<]+"
        for m in eachmatch(url_pattern, xml_content)
            push!(url_queue, m.match)
        end
    catch
        println("Can't get sitemap: $url")
    end
end


"""
    get_urls!(url::AbstractString,
              url_queue::Vector{<:AbstractString})

Extracts the urls found inside <a> tags (or inside a sitemap.xml).

# Arguments
- `url`: url from which all other URLs will be extracted
- `url_queue`: Vector in which extracted URLs will be appended
"""
function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})
    @info "Scraping link: $url"
    try
        if endswith(url, ".xml")
            # Sitemaps are handled with the regex-based extractor, not the HTML parser
            find_urls_xml!(url, url_queue)
        else
            fetched_content = HTTP.get(url)
            parsed = Gumbo.parsehtml(String(fetched_content.body))
            find_urls_html!(url, parsed.root, url_queue)
        end
    catch e
        println("Bad URL: $url")
    end
end
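To see how these helpers fit together, a small usage sketch; the URLs are assumptions chosen for illustration:

url_queue = Vector{AbstractString}()
get_urls!("https://docs.julialang.org/en/v1/", url_queue)        # appends same-host links found on the page
get_urls!("https://docs.julialang.org/sitemap.xml", url_queue)   # .xml URLs go through find_urls_xml!
@info "Queued $(length(url_queue)) URLs"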