Parser branch (#2)
* Added a crawler

* Added TODOs

* Minor changes

* Added Allow rules in robots.txt

* Added XML parser to extract URLs in sitemap.xml
splendidbug authored Jun 20, 2024
1 parent a700640 commit 5677982
Showing 6 changed files with 325 additions and 17 deletions.
26 changes: 25 additions & 1 deletion Manifest.toml
@@ -2,7 +2,7 @@

julia_version = "1.10.2"
manifest_format = "2.0"
project_hash = "349e311d5cd42e1e3153d8d68dd0a7ecc199edb7"
project_hash = "7c1802a5f96b1ce00f602e7fbf2f1ad2f983adb5"

[[deps.AbstractTrees]]
git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177"
@@ -51,6 +51,12 @@ git-tree-sha1 = "dcb08a0d93ec0b1cdc4af184b26b591e9695423a"
uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4"
version = "0.1.10"

[[deps.EzXML]]
deps = ["Printf", "XML2_jll"]
git-tree-sha1 = "380053d61bb9064d6aa4a9777413b40429c79901"
uuid = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
version = "1.2.0"

[[deps.FileWatching]]
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"

@@ -109,6 +115,12 @@ version = "1.11.0+1"
[[deps.Libdl]]
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"

[[deps.Libiconv_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl"]
git-tree-sha1 = "f9557a255370125b405568f9767d6d195822a175"
uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
version = "1.17.0+0"

[[deps.Logging]]
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"

@@ -214,6 +226,12 @@ weakdeps = ["Random", "Test"]
[deps.TranscodingStreams.extensions]
TestExt = ["Test", "Random"]

[[deps.URIParser]]
deps = ["Unicode"]
git-tree-sha1 = "53a9f49546b8d2dd2e688d216421d050c9a31d0d"
uuid = "30578b45-9adc-5946-b283-645ec420af67"
version = "0.4.1"

[[deps.URIs]]
git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b"
uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
@@ -226,6 +244,12 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
[[deps.Unicode]]
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"

[[deps.XML2_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
git-tree-sha1 = "52ff2af32e591541550bd753c0da8b9bc92bb9d9"
uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a"
version = "2.12.7+0"

[[deps.Zlib_jll]]
deps = ["Libdl"]
uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
2 changes: 2 additions & 0 deletions Project.toml
@@ -5,8 +5,10 @@ version = "0.1.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"

[compat]
9 changes: 9 additions & 0 deletions src/RAGKit.jl
@@ -1 +1,10 @@
using HTTP, Gumbo, AbstractTrees, URIs
using Gumbo: HTMLDocument, HTMLElement
using EzXML
# using Regex

# using Robots

include("parser.jl")
include("crawl.jl")
include("extract_urls.jl")
134 changes: 134 additions & 0 deletions src/crawl.jl
@@ -0,0 +1,134 @@
include("parser.jl")

## TODO: Make multiple dispatch for the following function
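"""
parse_robots_txt!(robots_txt::String, url_queue::Vector{<:AbstractString})
Parses the contents of robots.txt into a Dict of Allow/Disallow rules keyed by user agent; any Sitemap URLs encountered are pushed onto `url_queue`.
"""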
function parse_robots_txt!(robots_txt::String, url_queue::Vector{<:AbstractString})
## TODO: Make a cache of rules for a quick lookup
rules = Dict{String,Dict{String,Vector{String}}}()
current_user_agent = ""

for line in split(robots_txt, '\n')
line = strip(line)
if startswith(line, "User-agent:")
current_user_agent = strip(split(line, ":")[2])
if !haskey(rules, current_user_agent)
rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}())
end
elseif startswith(line, "Disallow:")
disallow_path = strip(split(line, ":")[2])
if current_user_agent != "" && disallow_path != ""
push!(rules[current_user_agent]["Disallow"], disallow_path)
end
elseif startswith(line, "Allow:")
allow_path = strip(split(line, ":")[2])
if current_user_agent != "" && allow_path != ""
push!(rules[current_user_agent]["Allow"], allow_path)
end
elseif startswith(line, "Sitemap:")
url = strip(split(line, ":")[2])
push!(url_queue, url)
end

end
return rules
end


"""
check_robots_txt(user_agent, url, restricted_urls, url_queue)
Returns `true` if `user_agent` may crawl `url` according to the site's robots.txt, `false` otherwise.
"""
function check_robots_txt(user_agent::AbstractString,
url::AbstractString,
restricted_urls::Dict{String,Set{AbstractString}},
url_queue::Vector{<:AbstractString})

uri = URIs.URI(url)   # avoid shadowing the URI type from URIs.jl
path = uri.path
if (haskey(restricted_urls, url))
if (in(path, restricted_urls[url]))
println("Not allowed to crawl $url")
return false
else
return true
end
end

robots_URL = string(uri.scheme, "://", uri.host, "/robots.txt")
try
response = HTTP.get(robots_URL)
robots_txt = String(response.body)
rules = parse_robots_txt!(robots_txt, url_queue)
user_agents = [user_agent, "*"]
for ua in user_agents
if haskey(rules, ua)
allow_rules = rules[ua]["Allow"]
disallow_rules = rules[ua]["Disallow"]

for allow_rule in allow_rules
if startswith(path, allow_rule)
return true
end
end

for disallow_rule in disallow_rules
if startswith(path, disallow_rule)
println("Not allowed to crawl $url")
return false
end
end
end
end
return true
catch
println("robots.txt unavailable for $url")
return true
end
end


"""
get_base_url(url::AbstractString)
Extracts the base url.
# Arguments
- `url`: The URL string from which the base URL is extracted
"""
function get_base_url(url::AbstractString)

parsed_url = URIs.URI(url)
base_url = string(parsed_url.scheme, "://", parsed_url.host,
isempty(parsed_url.port) ? "" : ":" * parsed_url.port, parsed_url.path)
return base_url
end


"""
makeRAG(input_urls::Vector{<:AbstractString})
Crawls the given URLs (and the links discovered from them), respecting robots.txt, and returns the parsed content blocks.
# Arguments
- `input_urls`: vector containing URL strings to parse
"""
function makeRAG(input_urls::Vector{<:AbstractString})

url_queue = Vector{AbstractString}(input_urls)
visited_url_set = Set{AbstractString}()
restricted_urls = Dict{String,Set{AbstractString}}()
parsed_blocks = []
## TODO: Add parallel processing for URLs

while !isempty(url_queue)
url = url_queue[1]
popfirst!(url_queue)
base_url = get_base_url(url)

## TODO: Show some respect to robots.txt
if !in(base_url, visited_url_set)
push!(visited_url_set, base_url)
if !check_robots_txt("*", base_url, restricted_urls, url_queue)
continue  # skip disallowed URLs instead of aborting the whole crawl
end
get_urls!(base_url, url_queue)
push!(parsed_blocks, parse_url_to_blocks(base_url))
end
end
return parsed_blocks
end
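
As an illustration of the rules structure returned by parse_robots_txt! above, a small hand-written robots.txt (the site and paths are made up):

example_robots_txt = """
User-agent: *
Disallow: /private/
Allow: /private/open/
Sitemap: https://example.com/sitemap.xml
"""

example_queue = Vector{AbstractString}()
example_rules = parse_robots_txt!(example_robots_txt, example_queue)

# example_rules["*"]["Disallow"] -> ["/private/"]
# example_rules["*"]["Allow"]    -> ["/private/open/"]
# example_queue                  -> ["https://example.com/sitemap.xml"]  (sitemap queued for crawling)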
139 changes: 139 additions & 0 deletions src/extract_urls.jl
@@ -0,0 +1,139 @@
# Temporary until I find a package to simplify this

function resolve_url(base_url::String, relative_url::String)::String
base_uri = URI(base_url)
relative_uri = URI(relative_url)

## TODO: Make a list of allowed URLs which would contain Julia docs hostnames
## TODO: Look for version number either on the bottom left dropdown or identify on the url

if length(relative_url) > 4 && relative_url[1:4] == "http"
if base_uri.host == relative_uri.host
return relative_url
end
return ""
end
if !isempty(relative_url) && relative_url[1] == '#'
return ""
end

if !isempty(relative_uri.path) && relative_uri.path[1] == '/'
resolved_uri = URI(
scheme=base_uri.scheme,
userinfo=base_uri.userinfo,
host=base_uri.host,
port=base_uri.port,
path=relative_uri.path,
query=relative_uri.query,
fragment=relative_uri.fragment
)
return string(resolved_uri)
end

# Split the paths into segments
base_segments = split(base_uri.path, "/")
base_segments = filter((i) -> i != "", base_segments)

relative_segments = split(relative_uri.path, "/")
relative_segments = filter((i) -> i != "", relative_segments)

# Process the relative segments
for segment in relative_segments
if segment == ".."
if !isempty(base_segments)
pop!(base_segments)
end
elseif segment != "."
push!(base_segments, segment)
end
end

# Construct the new path
resolved_path = "/" * join(base_segments, "/")

# Create the resolved URI
resolved_uri = URI(
scheme=base_uri.scheme,
userinfo=base_uri.userinfo,
host=base_uri.host,
port=base_uri.port,
path=resolved_path,
query=relative_uri.query,
fragment=relative_uri.fragment
)
return string(resolved_uri)
end


"""
find_urls_html!(url::AbstractString,
node::Gumbo.HTMLElement,
url_queue::Vector{<:AbstractString})
Recursively walks the HTML tree, finds <a> tags and appends the URLs they point to onto `url_queue`.
# Arguments
- url: The initial input URL
- node: The HTML node of type Gumbo.HTMLElement
- url_queue: Vector in which extracted URLs will be appended
"""
function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString})
if Gumbo.tag(node) == :a && haskey(node.attributes, "href")
href = node.attributes["href"]
if href !== nothing && !isempty(resolve_url(url, href))
push!(url_queue, resolve_url(url, href))
end
end

for child in node.children
if isa(child, HTMLElement)
find_urls_html!(url, child, url_queue)
end
end
end



"""
find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})
Fetches a sitemap XML file and appends the URLs found in it to `url_queue`.
# Arguments
- url: URL of the sitemap (e.g. sitemap.xml)
- url_queue: Vector in which extracted URLs will be appended
"""
function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})
try
fetched_content = HTTP.get(url)
xml_content = String(fetched_content.body)
# simple regex-based extraction of the URLs inside the sitemap's <loc> entries
url_pattern = r"http[^<]+"
for sitemap_url in eachmatch(url_pattern, xml_content)
push!(url_queue, sitemap_url.match)
end
catch
println("Can't get sitemap: $url")
end
end



"""
get_urls!(url::AbstractString,
url_queue::Vector{<:AbstractString})
Fetches `url` and appends the URLs found in it (from <a> tags, or from a sitemap XML file) to `url_queue`.
# Arguments
- url: url from which all other URLs will be extracted
- url_queue: Vector in which extracted URLs will be appended
"""
function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})

@info "Scraping link: $url"
# println(url)
try
fetched_content = HTTP.get(url)
parsed = Gumbo.parsehtml(String(fetched_content.body))
if endswith(url, ".xml")
find_urls_xml!(url, url_queue)
else
find_urls_html!(url, parsed.root, url_queue)
end
# print("-------------")
catch e
println("Bad URL: $url")
end
end
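
To illustrate the URL resolution above, a few made-up examples of what resolve_url returns (same-host relative links are resolved against the base URL; anchors and foreign hosts are dropped):

base = "https://example.com/docs/manual/"

resolve_url(base, "getting-started.html")     # -> "https://example.com/docs/manual/getting-started.html"
resolve_url(base, "../api/index.html")        # -> "https://example.com/docs/api/index.html"
resolve_url(base, "/search?q=io")             # -> "https://example.com/search?q=io"
resolve_url(base, "https://other-site.org/")  # -> ""  (different host, dropped)
resolve_url(base, "#section-2")               # -> ""  (in-page anchor, dropped)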