Parser branch (#2)
* Added a crawler

* Added TODOs

* Minor changes

* Added Allow rules in robots.txt

* Added XML parser to extract URLs in sitemap.xml
splendidbug authored Jun 20, 2024
1 parent a700640 commit 5677982
Showing 6 changed files with 325 additions and 17 deletions.
26 changes: 25 additions & 1 deletion Manifest.toml
@@ -2,7 +2,7 @@

julia_version = "1.10.2"
manifest_format = "2.0"
project_hash = "349e311d5cd42e1e3153d8d68dd0a7ecc199edb7"
project_hash = "7c1802a5f96b1ce00f602e7fbf2f1ad2f983adb5"

[[deps.AbstractTrees]]
git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
@@ -51,6 +51,12 @@ git-tree-sha1 = "dcb08a0d93ec0b1cdc4af184b26b591e9695423a"
uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4"
version = "0.1.10"

[[deps.EzXML]]
deps = ["Printf", "XML2_jll"]
git-tree-sha1 = "380053d61bb9064d6aa4a9777413b40429c79901"
uuid = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
version = "1.2.0"

[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"

@@ -109,6 +115,12 @@ version = "1.11.0+1"
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl"]
git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.17.0+0"

[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

@@ -214,6 +226,12 @@ weakdeps = ["Random", "Test"]
[deps.TranscodingStreams.extensions]
TestExt = ["Test", "Random"]

[[deps.URIParser]]
deps = ["Unicode"]
git-tree-sha1 = "53a9f49546b8d2dd2e688d216421d050c9a31d0d"
uuid = "30578b45-9adc-5946-b283-645ec420af67"
version = "0.4.1"

[[deps.URIs]]
git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b"
uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
@@ -226,6 +244,12 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[deps.XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
git-tree-sha1 = "52ff2af32e591541550bd753c0da8b9bc92bb9d9"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.12.7+0"

[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
2 changes: 2 additions & 0 deletions Project.toml
@@ -5,8 +5,10 @@ version = "0.1.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"

[compat]
9 changes: 9 additions & 0 deletions src/RAGKit.jl
@@ -1 +1,10 @@
using HTTP, Gumbo, AbstractTrees, URIs
using Gumbo: HTMLDocument, HTMLElement
using EzXML
# using Regex

# using Robots

include("parser.jl")
include("crawl.jl")
include("extract_urls.jl")
134 changes: 134 additions & 0 deletions src/crawl.jl
@@ -0,0 +1,134 @@
include("parser.jl")

## TODO: Make multiple dispatch for the following function
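"""
parse_robots_txt!(robots_txt::String, url_queue::Vector{<:AbstractString})
Parses the contents of robots.txt into a Dict of Allow/Disallow rules keyed by user agent; any Sitemap URLs encountered are pushed onto `url_queue`.
"""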
function parse_robots_txt!(robots_txt::String, url_queue::Vector{<:AbstractString})
## TODO: Make a cache of rules for a quick lookup
rules = Dict{String,Dict{String,Vector{String}}}()
current_user_agent = ""

for line in split(robots_txt, '\n')
line = strip(line)
if startswith(line, "User-agent:")
current_user_agent = strip(split(line, ":")[2])
if !haskey(rules, current_user_agent)
rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}())
end
elseif startswith(line, "Disallow:")
disallow_path = strip(split(line, ":")[2])
if current_user_agent != "" && disallow_path != ""
push!(rules[current_user_agent]["Disallow"], disallow_path)
end
elseif startswith(line, "Allow:")
allow_path = strip(split(line, ":")[2])
if current_user_agent != "" && allow_path != ""
push!(rules[current_user_agent]["Allow"], allow_path)
end
elseif startswith(line, "Sitemap:")
url = strip(split(line, ":")[2])
push!(url_queue, url)
end

end
return rules
end


"""
check_robots_txt(user_agent, url, restricted_urls, url_queue)
Returns `true` if `user_agent` may crawl `url` according to the site's robots.txt, `false` otherwise.
"""
function check_robots_txt(user_agent::AbstractString,
url::AbstractString,
restricted_urls::Dict{String,Set{AbstractString}},
url_queue::Vector{<:AbstractString})

uri = URIs.URI(url)   # avoid shadowing the URI type from URIs.jl
path = uri.path
if (haskey(restricted_urls, url))
if (in(path, restricted_urls[url]))
println("Not allowed to crawl $url")
return false
else
return true
end
end

robots_URL = string(uri.scheme, "://", uri.host, "/robots.txt")
try
response = HTTP.get(robots_URL)
robots_txt = String(response.body)
rules = parse_robots_txt!(robots_txt, url_queue)
user_agents = [user_agent, "*"]
for ua in user_agents
if haskey(rules, ua)
allow_rules = rules[ua]["Allow"]
disallow_rules = rules[ua]["Disallow"]

for allow_rule in allow_rules
if startswith(path, allow_rule)
return true
end
end

for disallow_rule in disallow_rules
if startswith(path, disallow_rule)
println("Not allowed to crawl $url")
return false
end
end
end
end
return true
catch
println("robots.txt unavailable for $url")
return true
end
end


"""
get_base_url(url::AbstractString)
Extracts the base url.
# Arguments
- `url`: The URL string from which the base URL is extracted
"""
function get_base_url(url::AbstractString)

parsed_url = URIs.URI(url)
base_url = string(parsed_url.scheme, "://", parsed_url.host,
isempty(parsed_url.port) ? "" : ":" * parsed_url.port, parsed_url.path)
return base_url
end


"""
makeRAG(input_urls::Vector{<:AbstractString})
Crawls the given URLs (and the links discovered from them), respecting robots.txt, and returns the parsed content blocks.
# Arguments
- `input_urls`: vector containing URL strings to parse
"""
function makeRAG(input_urls::Vector{<:AbstractString})

url_queue = Vector{AbstractString}(input_urls)
visited_url_set = Set{AbstractString}()
restricted_urls = Dict{String,Set{AbstractString}}()
parsed_blocks = []
## TODO: Add parallel processing for URLs

while !isempty(url_queue)
url = url_queue[1]
popfirst!(url_queue)
base_url = get_base_url(url)

## TODO: Show some respect to robots.txt
if !in(base_url, visited_url_set)
push!(visited_url_set, base_url)
if !check_robots_txt("*", base_url, restricted_urls, url_queue)
continue  # skip disallowed URLs instead of aborting the whole crawl
end
get_urls!(base_url, url_queue)
push!(parsed_blocks, parse_url_to_blocks(base_url))
end
end
return parsed_blocks
end
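
As an illustration of the rules structure returned by parse_robots_txt! above, a small hand-written robots.txt (the site and paths are made up):

example_robots_txt = """
User-agent: *
Disallow: /private/
Allow: /private/open/
Sitemap: https://example.com/sitemap.xml
"""

example_queue = Vector{AbstractString}()
example_rules = parse_robots_txt!(example_robots_txt, example_queue)

# example_rules["*"]["Disallow"] -> ["/private/"]
# example_rules["*"]["Allow"]    -> ["/private/open/"]
# example_queue                  -> ["https://example.com/sitemap.xml"]  (sitemap queued for crawling)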
139 changes: 139 additions & 0 deletions src/extract_urls.jl
@@ -0,0 +1,139 @@
# Temporary until I find a package to simplify this

function resolve_url(base_url::String, relative_url::String)::String
base_uri = URI(base_url)
relative_uri = URI(relative_url)

## TODO: Make a list of allowed URLs which would contain Julia docs hostnames
## TODO: Look for version number either on the bottom left dropdown or identify on the url

if length(relative_url) > 4 && relative_url[1:4] == "http"
if base_uri.host == relative_uri.host
return relative_url
end
return ""
end
if !isempty(relative_url) && relative_url[1] == '#'
return ""
end

if !isempty(relative_uri.path) && relative_uri.path[1] == '/'
resolved_uri = URI(
scheme=base_uri.scheme,
userinfo=base_uri.userinfo,
host=base_uri.host,
port=base_uri.port,
path=relative_uri.path,
query=relative_uri.query,
fragment=relative_uri.fragment
)
return string(resolved_uri)
end

# Split the paths into segments
base_segments = split(base_uri.path, "/")
base_segments = filter((i) -> i != "", base_segments)

relative_segments = split(relative_uri.path, "/")
relative_segments = filter((i) -> i != "", relative_segments)

# Process the relative segments
for segment in relative_segments
if segment == ".."
if !isempty(base_segments)
pop!(base_segments)
end
elseif segment != "."
push!(base_segments, segment)
end
end

# Construct the new path
resolved_path = "/" * join(base_segments, "/")

# Create the resolved URI
resolved_uri = URI(
scheme=base_uri.scheme,
userinfo=base_uri.userinfo,
host=base_uri.host,
port=base_uri.port,
path=resolved_path,
query=relative_uri.query,
fragment=relative_uri.fragment
)
return string(resolved_uri)
end


"""
find_urls_html!(url::AbstractString,
node::Gumbo.HTMLElement,
url_queue::Vector{<:AbstractString})
Recursively walks the HTML tree, finds <a> tags and appends the URLs they point to onto `url_queue`.
# Arguments
- url: The initial input URL
- node: The HTML node of type Gumbo.HTMLElement
- url_queue: Vector in which extracted URLs will be appended
"""
function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString})
if Gumbo.tag(node) == :a && haskey(node.attributes, "href")
href = node.attributes["href"]
if href !== nothing && !isempty(resolve_url(url, href))
push!(url_queue, resolve_url(url, href))
end
end

for child in node.children
if isa(child, HTMLElement)
find_urls_html!(url, child, url_queue)
end
end
end



"""
find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})
Fetches a sitemap XML file and appends the URLs found in it to `url_queue`.
# Arguments
- url: URL of the sitemap (e.g. sitemap.xml)
- url_queue: Vector in which extracted URLs will be appended
"""
function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})
try
fetched_content = HTTP.get(url)
xml_content = String(fetched_content.body)
# simple regex-based extraction of the URLs inside the sitemap's <loc> entries
url_pattern = r"http[^<]+"
for sitemap_url in eachmatch(url_pattern, xml_content)
push!(url_queue, sitemap_url.match)
end
catch
println("Can't get sitemap: $url")
end
end



"""
get_urls!(url::AbstractString,
url_queue::Vector{<:AbstractString})
Fetches `url` and appends the URLs found in it (from <a> tags, or from a sitemap XML file) to `url_queue`.
# Arguments
- url: url from which all other URLs will be extracted
- url_queue: Vector in which extracted URLs will be appended
"""
function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})

@info "Scraping link: $url"
# println(url)
try
fetched_content = HTTP.get(url)
parsed = Gumbo.parsehtml(String(fetched_content.body))
if endswith(url, ".xml")
find_urls_xml!(url, url_queue)
else
find_urls_html!(url, parsed.root, url_queue)
end
# print("-------------")
catch e
println("Bad URL: $url")
end
end
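
To illustrate the URL resolution above, a few made-up examples of what resolve_url returns (same-host relative links are resolved against the base URL; anchors and foreign hosts are dropped):

base = "https://example.com/docs/manual/"

resolve_url(base, "getting-started.html")     # -> "https://example.com/docs/manual/getting-started.html"
resolve_url(base, "../api/index.html")        # -> "https://example.com/docs/api/index.html"
resolve_url(base, "/search?q=io")             # -> "https://example.com/search?q=io"
resolve_url(base, "https://other-site.org/")  # -> ""  (different host, dropped)
resolve_url(base, "#section-2")               # -> ""  (in-page anchor, dropped)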