Knowledge packs (#3)
* knowledge pack creation is working - need to clean up code

* added joinpath

* added tests and code improvements

* refactored code - docstrings, path changes
splendidbug authored Jul 10, 2024
1 parent 31f2f67 commit 4390d9d
Showing 8 changed files with 506 additions and 54 deletions.
4 changes: 4 additions & 0 deletions Project.toml
@@ -5,9 +5,13 @@ version = "0.1.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"

20 changes: 19 additions & 1 deletion src/RAGKit.jl
@@ -1,10 +1,28 @@
module RAGKit
using HTTP, Gumbo, AbstractTrees, URIs
using Gumbo: HTMLDocument, HTMLElement
using EzXML
using PromptingTools
const PT = PromptingTools
const RT = PromptingTools.Experimental.RAGTools
using LinearAlgebra, Unicode, SparseArrays
using HDF5
using Tar
using Inflate

using SHA
using Serialization, URIs
# using Regex

# using Robots

include("parser.jl")
include("crawl.jl")
include("extract_urls.jl")
include("extract_urls.jl")
include("preparation.jl")

include("make_embeddings.jl")
export make_embeddings


end
136 changes: 97 additions & 39 deletions src/crawl.jl
@@ -1,10 +1,16 @@
include("parser.jl")
## TODO: Make multiple dispatch for the following function to remove if-else
"""
parse_robots_txt!(robots_txt::String)
Parses the robots.txt string and returns the crawling rules along with any sitemap URLs.
# Arguments
- `robots_txt`: robots.txt as a string
"""
function parse_robots_txt!(robots_txt::String)
rules = Dict{String,Dict{String,Vector{String}}}()
current_user_agent = ""
sitemap_urls = Vector{AbstractString}()

for line in split(robots_txt, '\n')
line = strip(line)
@@ -25,35 +31,46 @@ function parse_robots_txt!(robots_txt::String, url_queue::Vector{<:AbstractString})
end
elseif startswith(line, "Sitemap:")
url = strip(split(line, ":"; limit=2)[2])  # limit=2 keeps the colon inside "https://..." intact
push!(sitemap_urls, url)
end

end
return rules, sitemap_urls
end
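For context, a quick sketch (not part of this commit) of how the reworked parser might be exercised; the exact layout of `rules` depends on the Allow/Disallow handling in the collapsed part of the hunk, so the commented result is an assumption:

robots_txt = """
User-agent: *
Disallow: /private/
Allow: /public/
Sitemap: https://example.com/sitemap.xml
"""

rules, sitemap_urls = parse_robots_txt!(robots_txt)
# sitemap_urls should now contain "https://example.com/sitemap.xml";
# rules is expected to map each user agent to its Allow/Disallow lists.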


"""
check_robots_txt(user_agent::AbstractString,
url::AbstractString)
Checks the robots.txt of a URL and returns a boolean indicating whether `user_agent` is allowed to crawl it, along with any sitemap URLs found.
# Arguments
- `user_agent`: user agent attempting to crawl the webpage
- `url`: input URL string
"""
function check_robots_txt(user_agent::AbstractString,
url::AbstractString)

## TODO: Make a cache of rules for a quick lookup
# if (haskey(restricted_urls, url))
# if (in(path, restricted_urls[url]))
# println("Not allowed to crawl $url")
# return false
# else
# return true
# end
# end

URI = URIs.URI(url)
path = URI.path

robots_URL = string(URI.scheme, "://", URI.host, "/robots.txt")
sitemap_urls = Vector{AbstractString}()
try
response = HTTP.get(robots_URL)
robots_txt = String(response.body)
rules, sitemap_urls = parse_robots_txt!(robots_txt)
user_agents = [user_agent, "*"]
for ua in user_agents
if haskey(rules, ua)
@@ -62,26 +79,25 @@ function check_robots_txt(user_agent::AbstractString,

for allow_rule in allow_rules
if startswith(path, allow_rule)
return true, sitemap_urls
end
end

for disallow_rule in disallow_rules
if startswith(path, disallow_rule)
println("Not allowed to crawl $url")
return false
@warn "Not allowed to crawl $url"
return false, sitemap_urls
end
end
end
end
return true, sitemap_urls
catch
@info "robots.txt unavailable for $url"
return true, sitemap_urls
end
end
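A small usage sketch (illustrative only, not part of the diff) of the new two-value return; the URL is a placeholder:

url_queue = Vector{AbstractString}()
crawlable, sitemap_urls = check_robots_txt("*", "https://example.com/docs/index.html")
if crawlable
    append!(url_queue, sitemap_urls)  # queue any sitemap URLs advertised by robots.txt
end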


"""
get_base_url(url::AbstractString)
@@ -100,35 +116,77 @@ end


"""
process_hostname(url::AbstractString)
Returns the hostname of an input URL.
# Arguments
- `url`: URL string
"""
function process_hostname(url::AbstractString)
URI = URIs.URI(url)
hostname = String(URI.host)
return hostname
end
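For example (not part of the commit):

host = process_hostname("https://docs.julialang.org/en/v1/manual/strings/")
# host == "docs.julialang.org"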


"""
process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})
Adds the `url` to its hostname's entry in `hostname_dict`.
# Arguments
- `url`: URL string
- `hostname_dict`: Dict with key being hostname and value being a vector of URLs
"""
function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})
hostname = process_hostname(url)

# Add the URL to the dictionary under its hostname
if haskey(hostname_dict, hostname)
push!(hostname_dict[hostname], url)
else
hostname_dict[hostname] = [url]
end
end
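A brief sketch (illustrative, not from the diff) of how the mutating variant groups URLs by hostname:

hostname_dict = Dict{AbstractString,Vector{AbstractString}}()
process_hostname!("https://docs.julialang.org/en/v1/", hostname_dict)
process_hostname!("https://docs.julialang.org/en/v1/manual/strings/", hostname_dict)
# hostname_dict["docs.julialang.org"] now holds both URLs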


"""
crawl(input_urls::Vector{<:AbstractString})
Crawls the input URLs and returns `hostname_url_dict`, a dictionary mapping each hostname to the URLs discovered under it.
# Arguments
- `input_urls`: A vector of input URLs
"""
function crawl(input_urls::Vector{<:AbstractString})

url_queue = Vector{AbstractString}(input_urls)
visited_url_set = Set{AbstractString}()
hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}()
sitemap_urls = Vector{AbstractString}()

# TODO: Add parallel processing for URLs
while !isempty(url_queue)
url = url_queue[1]
popfirst!(url_queue)
base_url = get_base_url(url)

if !in(base_url, visited_url_set)
push!(visited_url_set, base_url)
crawlable, sitemap_urls = check_robots_txt("*", base_url)
append!(url_queue, sitemap_urls)
if crawlable
try
get_urls!(base_url, url_queue)
process_hostname!(url, hostname_url_dict)
catch
@error "Bad URL: $base_url"
end
end
end
end

return hostname_url_dict

end
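A possible end-to-end sketch (assumed usage, not part of this commit); the seed URL is a placeholder:

input_urls = ["https://docs.julialang.org/en/v1/"]
hostname_url_dict = crawl(input_urls)
for (host, urls) in hostname_url_dict
    @info "Discovered $(length(urls)) URLs on $host"
end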
22 changes: 11 additions & 11 deletions src/extract_urls.jl
@@ -124,16 +124,16 @@ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})

@info "Scraping link: $url"
# println(url)
# try
fetched_content = HTTP.get(url)
parsed = Gumbo.parsehtml(String(fetched_content.body))
if endswith(url, ".xml")
find_urls_xml!(url_xml, url_queue)
else
find_urls_html!(url, parsed.root, url_queue)
end
# print("-------------")
# catch e
# println("Bad URL: $url")
# end
end
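A minimal usage sketch (not part of the diff); with the try/catch now commented out, a bad URL raises and is handled by the caller, e.g. the try/catch inside `crawl`:

url_queue = Vector{AbstractString}(["https://example.com/index.html"])
get_urls!("https://example.com/index.html", url_queue)
# url_queue now also contains the links discovered on the page (or in a sitemap .xml)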
