diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..5657bd0 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options +style = "sciml" diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..700707c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 371b418..874943f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -3,7 +3,7 @@ on: push: branches: - main - tags: ['*'] + tags: ["*"] pull_request: workflow_dispatch: concurrency: @@ -23,9 +23,8 @@ jobs: fail-fast: false matrix: version: - <<#VERSIONS>> - - '<<&.>>' - <> + - "1.10" + - "nightly" os: - ubuntu-latest arch: @@ -52,13 +51,11 @@ jobs: actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created contents: write statuses: write - pages: write - id-token: write steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: - version: '1' + version: "1" - uses: julia-actions/cache@v2 - name: Configure doc environment shell: julia --project=docs --color=yes {0} @@ -75,7 +72,6 @@ jobs: shell: julia --project=docs --color=yes {0} run: | using Documenter: DocMeta, doctest - using <<&PKG>> - DocMeta.setdocmeta!(<<&PKG>>, :DocTestSetup, :(using <<&PKG>>); recursive=true) - doctest(<<&PKG>>) - <> + using DocsScraper + DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) + doctest(DocsScraper) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..d48734a --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 1 * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..0cd3114 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,31 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: + inputs: + lookback: + default: "3" +permissions: + actions: read + checks: read + contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.gitignore b/.gitignore index 9c929a1..8e2d4ba 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ # Ignore .env files .env knowledge_packs/ -Manifest.toml \ No newline at end of file +Manifest.toml +/Manifest.toml +/docs/Manifest.toml +/docs/build/ \ No newline at end 
of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9238ca7 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "cSpell.words": [ + "eachmatch", + "postprocess" + ] +} diff --git a/MIT b/LICENSE similarity index 94% rename from MIT rename to LICENSE index 775ba1d..d7bd022 100644 --- a/MIT +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) {{{YEAR}}} {{{AUTHORS}}} +Copyright (c) Shreyas Agrawal @splendidbug and J S @svilupp Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Project.toml b/Project.toml index 0c1f6a8..705a918 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,15 @@ -name = "RAGKit" -uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17" -authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"] +name = "DocsScraper" +uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +authors = ["Shreyas Agrawal @splendidbug and J S @svilupp"] version = "0.1.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" -PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" @@ -19,7 +17,19 @@ URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" [compat] AbstractTrees = "0.4.5" +EzXML = "1.2.0" Gumbo = "0.8.2" +HDF5 = "0.17.2" HTTP = "1.10.4" +Inflate = "0.1.5" PromptingTools = "0.36.0" +URIParser = "0.4.1" URIs = "1.5.1" +Tar = "1.10.0" + +[extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Aqua", "Test"] diff --git a/docs/Project.toml b/docs/Project.toml index 6fea155..41b0b18 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -6,7 +6,6 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" -PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" diff --git a/docs/make.jl b/docs/make.jl index 38d4452..a54f0f6 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,24 +1,23 @@ -using Documenter: Documenter, makedocs, deploydocs -using PkgTemplates: PkgTemplates +using DocsScraper +using Documenter + +DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) makedocs(; - modules=[PkgTemplates], - authors="Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>", - repo="https://github.com/splendidbug/RAGKit", - sitename="RAGKit.jl", + modules=[DocsScraper], + authors="Shreyas Agrawal @splendidbug and J S @svilupp", + sitename="DocsScraper.jl", # format=Documenter.HTML(; - # repolink="https://github.com/splendidbug/RAGKit", - # canonical="https://juliaci.github.io/PkgTemplates.jl", + # canonical="https://Shreyas Agrawal.github.io/DocsScraper.jl", + # edit_link="master", # assets=String[], # ), pages=[ "Home" => "index.md", - "User Guide" => "user.md", - "Developer Guide" => 
"developer.md", - "Migrating To PkgTemplates 0.7+" => "migrating.md", ], ) deploydocs(; - repo="https://github.com/splendidbug/RAGKit", + repo="github.com/Shreyas Agrawal/DocsScraper.jl", + devbranch="main", ) diff --git a/docs/src/index.md b/docs/src/index.md index f53a411..a6f0129 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,4 +1,4 @@ -# RAGKit +# DocsScraper ## Documentation diff --git a/src/RAGKit.jl b/src/DocsScraper.jl similarity index 56% rename from src/RAGKit.jl rename to src/DocsScraper.jl index b895363..e78dde7 100644 --- a/src/RAGKit.jl +++ b/src/DocsScraper.jl @@ -1,7 +1,9 @@ -module RAGKit +module DocsScraper using HTTP, Gumbo, AbstractTrees, URIs using Gumbo: HTMLDocument, HTMLElement using EzXML +using Pkg +Pkg.develop(PackageSpec(path="C:\\Users\\shrey\\Desktop\\stuff\\assignments\\grad\\projects\\Julia\\PromptingTools.jl")) using PromptingTools const PT = PromptingTools const RT = PromptingTools.Experimental.RAGTools @@ -12,17 +14,18 @@ using Inflate using SHA using Serialization, URIs -# using Regex - -# using Robots include("parser.jl") include("crawl.jl") include("extract_urls.jl") include("preparation.jl") -include("make_embeddings.jl") -export make_embeddings +include("make_knowledge_packs.jl") +export make_knowledge_packs, just_generate + +include("user_preferences.jl") +include("utils.jl") +export remove_urls_from_index end \ No newline at end of file diff --git a/src/crawl.jl b/src/crawl.jl index b147511..a8f93c9 100644 --- a/src/crawl.jl +++ b/src/crawl.jl @@ -2,10 +2,7 @@ """ parse_robots_txt!(robots_txt::String) -Parses the robots.txt string and returns rules along with the URLs on Sitemap - -# Arguments -- `robots_txt`: robots.txt as a string +Parse the robots.txt string and return rules and the URLs on Sitemap """ function parse_robots_txt!(robots_txt::String) rules = Dict{String,Dict{String,Vector{String}}}() @@ -40,17 +37,15 @@ end """ - check_robots_txt(user_agent::AbstractString, - url::AbstractString) + check_robots_txt(user_agent::AbstractString, url::AbstractString) -Checks the robots.txt of a URL and returns a boolean representing if `user_agent` is allowed to crawl the input url +Check robots.txt of a URL and return a boolean representing if `user_agent` is allowed to crawl the input url, along with sitemap urls # Arguments - `user_agent`: user agent attempting to crawl the webpage - `url`: input URL string """ -function check_robots_txt(user_agent::AbstractString, - url::AbstractString) +function check_robots_txt(user_agent::AbstractString, url::AbstractString) ## TODO: Make a cache of rules for a quick lookup # if (haskey(restricted_urls, url)) @@ -101,10 +96,7 @@ end """ get_base_url(url::AbstractString) -Extracts the base url. 
- -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url """ function get_base_url(url::AbstractString) @@ -118,10 +110,7 @@ end """ process_hostname(url::AbstractString) -Returns the hostname of an input URL - -# Arguments -- `url`: URL string +Return the hostname of an input URL """ function process_hostname(url::AbstractString) URI = URIs.URI(url) @@ -133,7 +122,7 @@ end """ process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) -Adds the `url` to it's hostname in `hostname_dict` +Add `url` to its hostname in `hostname_dict` # Arguments - `url`: URL string @@ -154,10 +143,7 @@ end """ crawl(input_urls::Vector{<:AbstractString}) -Crawls on the input URLs and returns a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs - -# Arguments -- `input_urls`: A vector of input URLs +Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs """ function crawl(input_urls::Vector{<:AbstractString}) @@ -187,6 +173,6 @@ function crawl(input_urls::Vector{<:AbstractString}) end end - return hostname_url_dict + return hostname_url_dict, visited_url_set end diff --git a/src/extract_urls.jl b/src/extract_urls.jl index b9ea364..d5e8fcf 100644 --- a/src/extract_urls.jl +++ b/src/extract_urls.jl @@ -1,31 +1,37 @@ -# Temporary until I find a package to simplify this +""" + resolve_url(base_url::String, extracted_url::String) -function resolve_url(base_url::String, relative_url::String)::String - base_uri = URI(base_url) - relative_uri = URI(relative_url) +Check the extracted URL with the original URL. Return empty String if the extracted URL belongs to a different domain. 
+Return complete URL if there's a directory traversal paths or the extracted URL belongs to the same domain as the base_url - ## TODO: Make a list of allowed URLs which would contain Julia docs hostnames +# Arguments +- base_url: URL of the page from which other URLs are being extracted +- extracted_url: URL extracted from the base_url +""" +function resolve_url(base_url::String, extracted_url::String) + base_uri = URI(base_url) + extracted_uri = URI(extracted_url) ## TODO: Look for version number either on the bottom left dropdown or identify on the url - if length(relative_url) > 4 && relative_url[1:4] == "http" - if base_uri.host == relative_uri.host - return relative_url + if length(extracted_url) > 4 && extracted_url[1:4] == "http" + if base_uri.host == extracted_uri.host + return extracted_url end return "" end - if !isempty(relative_url) && relative_url[1] == '#' + if !isempty(extracted_url) && extracted_url[1] == '#' return "" end - if !isempty(relative_uri.path) && relative_uri.path[1] == '/' + if !isempty(extracted_uri.path) && extracted_uri.path[1] == '/' resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=relative_uri.path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = extracted_uri.path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end @@ -34,11 +40,11 @@ function resolve_url(base_url::String, relative_url::String)::String base_segments = split(base_uri.path, "/") base_segments = filter((i) -> i != "", base_segments) - relative_segments = split(relative_uri.path, "/") - relative_segments = filter((i) -> i != "", relative_segments) + extracted_segments = split(extracted_uri.path, "/") + extracted_segments = filter((i) -> i != "", extracted_segments) - # Process the relative segments - for segment in relative_segments + # Process the directory traversal paths + for segment in extracted_segments if segment == ".." 
if !isempty(base_segments) pop!(base_segments) @@ -53,31 +59,29 @@ function resolve_url(base_url::String, relative_url::String)::String # Create the resolved URI resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=resolved_path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = resolved_path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end - """ - find_urls!(url::AbstractString, - node::Gumbo.HTMLElement, - url_queue::Vector{<:AbstractString} + find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString} -Function to recursively find and extract the urls +Function to recursively find tags and extract the urls # Arguments - url: The initial input URL - node: The HTML node of type Gumbo.HTMLElement - url_queue: Vector in which extracted URLs will be appended """ -function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) +function find_urls_html!( + url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) if Gumbo.tag(node) == :a && haskey(node.attributes, "href") href = node.attributes["href"] if href !== nothing && !isempty(resolve_url(url, href)) @@ -85,6 +89,7 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end + # Go deep in the HTML tags and check if `node` is an tag for child in node.children if isa(child, HTMLElement) find_urls_html!(url, child, url_queue) @@ -92,9 +97,18 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end +""" + find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) +Identify URL through regex pattern in xml files and push in `url_queue` +# Arguments +- url: url from which all other URLs will be extracted +- url_queue: Vector in which extracted URLs will be appended +""" function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) + # If a string starts with "http" then it is considered as a URL regardless of it being valid. 
+ # Validity of URLs are checked during HTTP fetch try fetched_content = HTTP.get(url) xml_content = String(fetched_content.body) @@ -108,32 +122,23 @@ function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString} end end - - """ get_links!(url::AbstractString, url_queue::Vector{<:AbstractString}) -Function to extract urls inside tags +Extract urls inside html or xml files # Arguments - url: url from which all other URLs will be extracted - url_queue: Vector in which extracted URLs will be appended """ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString}) - @info "Scraping link: $url" - # println(url) - # try fetched_content = HTTP.get(url) parsed = Gumbo.parsehtml(String(fetched_content.body)) - if (url[end-3:end] == ".xml") + if (url[(end - 3):end] == ".xml") find_urls_xml!(url_xml, url_queue) else find_urls_html!(url, parsed.root, url_queue) end - # print("-------------") - # catch e - # println("Bad URL: $url") - # end end \ No newline at end of file diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl deleted file mode 100644 index f51c865..0000000 --- a/src/make_embeddings.jl +++ /dev/null @@ -1,173 +0,0 @@ -## TODO: Make a function to Check for version number - -""" - report_artifact() - -prints artifact information -""" -function report_artifact(fn_output) - @info("ARTIFACT: $(basename(fn_output))") - @info("sha256: ", bytes2hex(open(sha256, fn_output))) - @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) -end - - - - -""" - create_output_folders() - -Creates output folders -""" -function create_output_folders(knowledge_pack_path::String) - # Define the folder path - folder_path = joinpath(knowledge_pack_path, "packs") - println("folder_path:", folder_path) - # Check if the folder exists - if !isdir(folder_path) - mkpath(folder_path) - @info "Folder created: $folder_path" - else - @info "Folder already exists: $folder_path" - end - -end - -""" - make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}) - -Parses URLs from hostname_url_dict and saves the chunks - -# Arguments -- hostname_url_dict: Dict with key being hostname and value being a vector of URLs -""" -function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String) - output_chunks = Vector{SubString{String}}() - output_sources = Vector{String}() - SAVE_CHUNKS = true - CHUNK_SIZE = 512 - for (hostname, urls) in hostname_url_dict - for url in urls - try - chunks, sources = process_paths(url) - append!(output_chunks, chunks) - append!(output_sources, sources) - catch - @error "error!! 
check url: $url" - end - end - if SAVE_CHUNKS - serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks) - serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources) - end - - end - - -end - -function l2_norm_columns(mat::AbstractMatrix) - norm_ = norm.(eachcol(mat)) - return mat ./ norm_' -end -function l2_norm_columns(vect::AbstractVector) - norm_ = norm(vect) - return vect / norm_ -end - - -""" - generate_embeddings() - -Deserializes chunks and sources to generate embeddings -""" -function generate_embeddings(knowledge_pack_path::String) - embedder = RT.BatchEmbedder() - entries = readdir(knowledge_pack_path) - - # Initialize a dictionary to group files by hostname and chunk size - hostname_files = Dict{String,Dict{Int,Dict{String,String}}}() - - # Regular expressions to match the file patterns - chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" - sources_pattern = r"^(.*)-sources-(\d+)\.jls$" - - # Group files by hostname and chunk size - for file in entries - match_chunks = match(chunks_pattern, file) - match_sources = match(sources_pattern, file) - - if match_chunks !== nothing - hostname = match_chunks.captures[1] - chunk_size = parse(Int, match_chunks.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file) - elseif match_sources !== nothing - hostname = match_sources.captures[1] - chunk_size = parse(Int, match_sources.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["sources"] = joinpath(knowledge_pack_path, file) - end - end - - - # Process each pair of files - for (hostname, chunk_files) in hostname_files - for (chunk_size, files) in chunk_files - if haskey(files, "chunks") && haskey(files, "sources") - chunks_file = files["chunks"] - sources_file = files["sources"] - chunks = deserialize(chunks_file) - sources = deserialize(sources_file) - cost_tracker = Threads.Atomic{Float64}(0.0) - full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, dimensions=1024) - - fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") - fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5") - h5open(fn_temp, "w") do file - file["chunks"] = chunks - file["sources"] = sources - file["embeddings"] = full_embeddings[1:1024, :] |> l2_norm_columns |> x -> map(>(0), x) - file["type"] = "ChunkIndex" - # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" - end - command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` - run(command) - report_artifact(fn_output) - - else - @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" - end - end - end - -end - - - -""" - make_embeddings(input_urls::Vector{<:AbstractString}) - -Entry point to crawl, parse and create embeddings - -# Arguments -- input_urls: vector containing URL strings to parse -""" 
-function make_embeddings(input_urls::Vector{<:AbstractString}) - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() - hostname_url_dict = crawl(input_urls) - knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") - create_output_folders(knowledge_pack_path) - make_chunks(hostname_url_dict, knowledge_pack_path) - generate_embeddings(knowledge_pack_path) -end \ No newline at end of file diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl new file mode 100644 index 0000000..291a9c7 --- /dev/null +++ b/src/make_knowledge_packs.jl @@ -0,0 +1,222 @@ +""" + report_artifact(fn_output) + +Print artifact information +""" +function report_artifact(fn_output) + @info("ARTIFACT: $(basename(fn_output))") + @info("sha256: ", bytes2hex(open(sha256, fn_output))) + @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) +end + +""" + create_output_folders(knowledge_pack_path::String) + +Create output folders on the knowledge_pack_path +""" +function create_output_folders(knowledge_pack_path::String) + # Define the folder path + folder_path = joinpath(knowledge_pack_path, "packs") + # Check if the folder exists + if !isdir(folder_path) + mkpath(folder_path) + end +end + +""" + make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE, + min_chunk_size::Int=MIN_CHUNK_SIZE) + +Parse URLs from hostname_url_dict and save the chunks + +# Arguments +- hostname_url_dict: Dict with key being hostname and value being a vector of URLs +- knowledge_pack_path: Knowledge pack path +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +""" +function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}}, + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) + SAVE_CHUNKS = true + for (hostname, urls) in hostname_url_dict + output_chunks = Vector{SubString{String}}() + output_sources = Vector{String}() + for url in urls + try + chunks, sources = process_paths(url; max_chunk_size, min_chunk_size) + append!(output_chunks, chunks) + append!(output_sources, sources) + catch + @error "error!! 
check url: $url" + end + end + if SAVE_CHUNKS + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_chunks) + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_sources) + end + end +end + +""" + l2_norm_columns(mat::AbstractMatrix) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(mat::AbstractMatrix) + norm_ = norm.(eachcol(mat)) + return mat ./ norm_' +end + +""" + l2_norm_columns(vect::AbstractVector) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(vect::AbstractVector) + norm_ = norm(vect) + return vect / norm_ +end + +""" + generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + +Deserialize chunks and sources to generate embeddings + +# Arguments +- model: Embedding model +- embedding_size: Embedding dimensions +""" +function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL, + embedding_size::Int = EMBEDDING_SIZE) + embedder = RT.BatchEmbedder() + entries = readdir(knowledge_pack_path) + # Initialize a dictionary to group files by hostname and chunk size + hostname_files = Dict{String, Dict{Int, Dict{String, String}}}() + + # Regular expressions to match the file patterns of chunks and sources + chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$" + sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$" + + # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" + # sources_pattern = r"^(.*)-sources-(\d+)\.jls$" + + # Group files by hostname and chunk size + for file in entries + match_chunks = match(chunks_pattern, file) + match_sources = match(sources_pattern, file) + + if match_chunks !== nothing + hostname = match_chunks.captures[1] + chunk_size = parse(Int, match_chunks.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], chunk_size) + hostname_files[hostname][chunk_size] = Dict{String, String}() + end + hostname_files[hostname][chunk_size]["chunks"] = joinpath( + knowledge_pack_path, file) + elseif match_sources !== nothing + hostname = match_sources.captures[1] + chunk_size = parse(Int, match_sources.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], chunk_size) + hostname_files[hostname][chunk_size] = Dict{String, String}() + end + hostname_files[hostname][chunk_size]["sources"] = joinpath( + knowledge_pack_path, file) + end + end + # Process each pair of files + for (hostname, chunk_files) in hostname_files + for (chunk_size, files) in chunk_files + if haskey(files, "chunks") && haskey(files, "sources") + chunks_file = files["chunks"] + sources_file = files["sources"] + chunks = deserialize(chunks_file) + sources = deserialize(sources_file) + cost_tracker = Threads.Atomic{Float64}(0.0) + full_embeddings = RT.get_embeddings( + embedder, chunks; model, verbose = false, cost_tracker) + @info "Created embeddings for $hostname. 
Cost: \$$(round(cost_tracker[], digits=3))" + fn_output = joinpath(knowledge_pack_path, "packs", + "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") + fn_temp = joinpath(knowledge_pack_path, "packs", + "$hostname-textembedding3large-0-Float32__v1.0.hdf5") + h5open(fn_temp, "w") do file + file["chunks"] = chunks + file["sources"] = sources + file["embeddings"] = full_embeddings[1:embedding_size, :] |> + l2_norm_columns |> x -> map(>(0), x) + file["type"] = "ChunkIndex" + # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" + end + + command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` + run(command) + report_artifact(fn_output) + + else + @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" + end + end + end +end + +""" + make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[], + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + +Entry point to crawl, parse and generate embeddings + +# Arguments +- crawlable_urls: URLs that should be crawled to find more links +- single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +- model: Embedding model +- embedding_size: Embedding dimensions +""" +function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[]; + single_urls::Vector{<:AbstractString} = String[], + max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE, + model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE) + if isempty(crawlable_urls) && isempty(single_urls) + error("At least one of `input_urls` or `single_pages` must be provided.") + end + + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() + + if !isempty(crawlable_urls) + hostname_url_dict, visited_url_set = crawl(crawlable_urls) + else + visited_url_set = Set{AbstractString}() + end + for url in single_urls + base_url = get_base_url(url) + if !in(base_url, visited_url_set) + push!(visited_url_set, base_url) + crawlable, sitemap_urls = check_robots_txt("*", base_url) + if crawlable + try + process_hostname!(url, hostname_url_dict) + catch + @error "Bad URL: $base_url" + end + end + end + end + knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") + create_output_folders(knowledge_pack_path) + make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) + generate_embeddings(knowledge_pack_path; model, embedding_size) +end diff --git a/src/parser.jl b/src/parser.jl index d909280..def1a17 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -1,21 +1,3 @@ -""" -Working: - -Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks -ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion. -For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks. -If the current node is a code block, return the text inside code block with backticks. -If the node is neither heading nor code, then we'll need to go deeper in the hierarchy. 
-if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td] -it is assumed that everything inside the tag is part of a single text block with inline code. -But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false. -To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration -that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again. -We indicate this by a return flag is_text_inserted -""" - - - """ insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, parsed_blocks::Vector{Dict{String,Any}}, @@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector - text_to_insert: Text to be inserted - text_type: The text to be inserted could be heading or a code block or just text """ -function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - text_to_insert::AbstractString, - text_type::AbstractString) - +function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + text_to_insert::AbstractString, + text_type::AbstractString) if !isempty(strip(text_to_insert)) push!(parsed_blocks, Dict(text_type => strip(text_to_insert), @@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, end end - - """ process_headings!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -57,13 +36,13 @@ Process headings. If the current node is heading, directly insert into parsed_bl - parsed_blocks: Vector of Dicts to store parsed text and metadata """ function process_headings!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}) tag_name = Gumbo.tag(node) # Clear headings of equal or lower level for k in collect(keys(heading_hierarchy)) - if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) + if k != "header" && + Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) delete!(heading_hierarchy, k) end end @@ -123,11 +102,10 @@ If the node is neither heading nor code - prev_text_buffer: IO Buffer which contains previous text """ function process_generic_node!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) @@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless, # there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted. 
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header] - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, + :cite, :address, :em, :td, :a, :span, :header] + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + elseif tag_name in [:script] + continue else - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end # changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call) @@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, print(prev_text_buffer, " " * received_text) text_to_insert = text_to_insert * " " * received_text end - end # if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence, @@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if we're insert text in current node level, then we should insert the previous text if available, # otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird if !isempty(strip(text_to_insert)) - insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") is_text_inserted = true end @@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, return "", is_code_block, is_text_inserted end - """ process_docstring!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -224,11 +206,10 @@ Function to process node of class `docstring` - prev_text_buffer: IO Buffer which contains previous text """ function process_docstring!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) is_code_block = false @@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement, # Insert "header" if Gumbo.tag(children[1]) == :header heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1])) - insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") end - received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) if !isempty(strip(received_text)) insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text") @@ -279,11 +262,10 @@ Function to process a node - prev_text_buffer: IO Buffer which contains previous text """ function process_node!(node::Gumbo.HTMLElement, - 
heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) tag_name = Gumbo.tag(node) if startswith(string(tag_name), "h") && isdigit(last(string(tag_name))) return process_headings!(node, heading_hierarchy, parsed_blocks) @@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement, return process_code(node) elseif tag_name == :article && getattr(node, "class", "") == "docstring" - return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_docstring!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_generic_node!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - """ multiple dispatch for process_node!() when node is of type Gumbo.HTMLText """ @@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...) return strip(Gumbo.text(node)), is_code_block, is_text_inserted end - """ get_base_url(url::AbstractString) -Extracts the base url. - -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url. """ function get_base_url(url::AbstractString) parsed_url = URIs.URI(url) @@ -329,7 +306,7 @@ end """ get_html_content(root::Gumbo.HTMLElement) -Returns the main content of the HTML. If not found, returns the whole HTML to parse +Return the main content of the HTML. If not found, return the whole HTML to parse # Arguments - `root`: The HTML root from which content is extracted @@ -338,73 +315,34 @@ function get_html_content(root::Gumbo.HTMLElement) target_ids = Set(["VPContent", "main_content_wrap", "pages-content"]) target_classes = Set(["content", "franklin-content"]) - content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] # First try to find by ID - content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates) + content_by_id = filter( + el -> getattr(el, "id", nothing) in target_ids, content_candidates) if !isempty(content_by_id) return only(content_by_id) end # Fallback to class if no ID matches - content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates) + content_by_class = filter( + el -> getattr(el, "class", nothing) in target_classes, content_candidates) if !isempty(content_by_class) return only(content_by_class) end # Fallback to the root node if no class matches return root - end - """ parse_url(url::AbstractString) -Initiator and main function to parse HTML from url +Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata # Arguments - `url`: URL string to parse - -# Returns -- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata - -# Usage -parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/") - -# Example -Let the HTML be: - - - - -

-<html>
-<body>
-<h1>Heading 1</h1>
-<h2>Heading 2</h2>
-<p>para 1</p>
-<h3>Heading 3</h3>
-<code class="language-julia">this is my code block</code>
-<h3>This is another h3 under Heading 2</h3>
-<p>This is a paragraph with <code>inline code</code></p>
-<h2>Heading 2_2</h2>
-<p>para ewg</p>
-</body>
-</html>
- - - - -Output: -Any[ - Dict{String, Any}("URL" => "URL") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3") - Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2")) - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg") -] """ function parse_url_to_blocks(url::AbstractString) @@ -419,8 +357,8 @@ function parse_url_to_blocks(url::AbstractString) # title = [el # for el in AbstractTrees.PreOrderDFS(r_parsed.root) # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") - parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)]) - heading_hierarchy = Dict{Symbol,Any}() + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks) return parsed_blocks catch diff --git a/src/preparation.jl b/src/preparation.jl index ab8d7b5..9979155 100644 --- a/src/preparation.jl +++ b/src/preparation.jl @@ -1,9 +1,7 @@ -# include("recursive_splitter.jl") -include("utils.jl") """ get_header_path(d::Dict) -Concatenates the h1, h2, h3 keys from the metadata of a Dict +Concatenate the h1, h2, h3 keys from the metadata of a Dict # Examples ```julia @@ -12,7 +10,7 @@ get_header_path(d) # Output: "Axis/Attributes/yzoomkey" ``` """ -function get_header_path(d::Dict) +function get_header_path(d::Dict{String,Any}) metadata = get(d, "metadata", Dict{Any,Any}()) isempty(metadata) && return nothing keys_ = [:h1, :h2, :h3] @@ -21,8 +19,13 @@ function get_header_path(d::Dict) end -"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length" -function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="") + +""" + roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") + +Roll-up chunks (that have the same header!), so we can split them later by to get the desired length +""" +function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") docs = String[] io = IOBuffer() last_header = nothing @@ -35,7 +38,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? 
"" : " - $last_header") push!(sources, src) end last_header = header @@ -48,7 +51,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? "" : " - $last_header") push!(sources, src) end return docs, sources @@ -56,19 +59,23 @@ end struct DocParserChunker <: RT.AbstractChunker end -""" - RT.get_chunks(chunker::DocParserChunker, - html_files::Vector{<:AbstractString}; - sources::AbstractVector{<:AbstractString}=html_files, - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) -Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length. +""" + RT.get_chunks(chunker::DocParserChunker, url::AbstractString; + verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) + +Extract chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, +and splits them by separators to get the desired length. + +# Arguments +- chunker: DocParserChunker +- url: URL of the webpage to extract chunks +- verbose: Bool to print the log +- separators: Chunk separators +- max_chunk_size Maximum chunk size """ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) - + verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) SEP = "" sources = AbstractVector{<:AbstractString} @@ -84,8 +91,9 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; ## roll up chunks by SEP splitter, then remove it later for (doc, src) in zip(docs_, sources_) ## roll up chunks by SEP splitter, then remove it later - doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|> + doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|> x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x) + chunk_lengths = length.(doc_chunks) # skip if no chunks found isempty(doc_chunks) && continue append!(output_chunks, doc_chunks) @@ -96,20 +104,24 @@ end -"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them." -function process_paths(url::AbstractString, max_length::Int=512) +""" + process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) + +Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them. 
+""" +function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) output_chunks = Vector{SubString{String}}() output_sources = Vector{String}() - chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length) + chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size) append!(output_chunks, chunks) append!(output_sources, sources) @info "Scraping done: $(length(output_chunks)) chunks" - postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true) + output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true) return output_chunks, output_sources end diff --git a/src/user_preferences.jl b/src/user_preferences.jl new file mode 100644 index 0000000..98794c6 --- /dev/null +++ b/src/user_preferences.jl @@ -0,0 +1,4 @@ +global MIN_CHUNK_SIZE = 40 +global MAX_CHUNK_SIZE = 256 +global MODEL = "text-embedding-3-large" +global EMBEDDING_SIZE = 1024 \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index 4bf1e07..e8dc014 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,9 @@ -"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)." +""" + find_duplicates(chunks::AbstractVector{<:AbstractString}) + +Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, +where `true` indicates a duplicate (second instance of the same text). +""" function find_duplicates(chunks::AbstractVector{<:AbstractString}) # hash the chunks for easier search hashed_chunks = bytes2hex.(sha256.(chunks)) @@ -20,20 +25,34 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString}) return duplicates end -"Removes chunks that are duplicated in the input list of chunks and their corresponding sources." +""" + remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) + +Remove chunks that are duplicated in the input list of chunks and their corresponding sources. +""" function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) idxs = find_duplicates(chunks) return chunks[.!idxs], sources[.!idxs] end -"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources." -function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true) + +""" + remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) + +Remove chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources. 
+""" +function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) + + chunk_lengths = length.(chunks) idx = if skip_code - ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text) - findall(x -> length(x) >= min_length || occursin("```", x), chunks) + ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text) + findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks) else - findall(x -> length(x) >= min_length, chunks) + findall(x -> length(x) >= min_chunk_size, chunks) end + chunk_lengths = length.(chunks[idx]) return chunks[idx], sources[idx] end @@ -42,14 +61,24 @@ function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::A @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`" replacement_pairs = paths .=> websites output = map(x -> replace(x, replacement_pairs...), sources) + return output end -"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates." -function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true, - paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) + + +""" + function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, + websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) + +Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates. +""" +function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, + websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) len_ = length(chunks) - chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code) + chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code) @info "Removed $(len_ - length(chunks)) short chunks" len_ = length(chunks) @@ -63,6 +92,31 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A end return chunks, sources +end + +""" + function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + +Remove chunks and sources corresponding to URLs starting with `prefix_urls` +""" +function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + @assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)." 
+ h5open(index_path, "r+") do orig_file + # Load the sources dataset into a Julia array + sources = read(orig_file["sources"]) + chunks = read(orig_file["chunks"]) + embeddings = read(orig_file["embeddings"]) + for url_to_remove in prefix_urls + indices_to_remove = findall(x -> startswith(x, url_to_remove), sources) + sources = deleteat!(sources, indices_to_remove) + chunks = deleteat!(chunks, indices_to_remove) + embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)] + end + + # The filtered arrays are smaller than the stored datasets, so delete and recreate them + delete_object(orig_file, "sources") + delete_object(orig_file, "chunks") + delete_object(orig_file, "embeddings") + orig_file["sources"] = sources + orig_file["chunks"] = chunks + orig_file["embeddings"] = embeddings + end end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 78a78b4..4b4a92c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,21 +10,22 @@ using LinearAlgebra, Unicode, SparseArrays using HDF5 using Tar using Inflate - using SHA using Serialization, URIs -include("..\\src\\crawl.jl") -include("..\\src\\extract_urls.jl") -include("..\\src\\parser.jl") -include("..\\src\\preparation.jl") +include(joinpath("..", "src", "crawl.jl")) +include(joinpath("..", "src", "extract_urls.jl")) +include(joinpath("..", "src", "parser.jl")) +include(joinpath("..", "src", "preparation.jl")) +include(joinpath("..", "src", "user_preferences.jl")) +include(joinpath("..", "src", "utils.jl")) + urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"]) url = urls[1] queue = Vector{AbstractString}() -@testset "check robots.txt" begin +@testset "HTTP" begin @test HTTP.get(url) != nothing - result, sitemap_queue = check_robots_txt("*", url) @test result == true end @@ -38,12 +39,13 @@ end parsed_blocks = parse_url_to_blocks(url) @test length(parsed_blocks) > 0 SEP = "" - docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP) - @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) + @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && + sources_[1] != nothing end @testset "overall test" begin chunks, sources = process_paths(url) - @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing - + @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && + sources[1] != nothing end
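A minimal usage sketch of the new `make_knowledge_packs` entry point from `src/make_knowledge_packs.jl`. The crawl URL is the one exercised in `test/runtests.jl`, and the keyword values simply restate the defaults in `src/user_preferences.jl`, so everything shown is illustrative rather than required:

```julia
using DocsScraper

# Crawl the Julia manual for links, scrape and chunk every page, then embed the
# chunks and package them as a tarball under `knowledge_packs/packs/`.
make_knowledge_packs(["https://docs.julialang.org/en/v1/"];
    max_chunk_size = 256, min_chunk_size = 40,
    model = "text-embedding-3-large", embedding_size = 1024)
```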
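Because `crawl` now returns the visited-URL set alongside the hostname map (see `src/crawl.jl`), downstream callers unpack a tuple. A short sketch of the new return shape, calling the unexported function qualified by module name and reusing the same test URL:

```julia
using DocsScraper

# Returns a Dict(hostname => vector of discovered URLs) plus the Set of base URLs visited.
hostname_url_dict, visited_url_set = DocsScraper.crawl(["https://docs.julialang.org/en/v1/"])
```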
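And a sketch of the newly exported `remove_urls_from_index`; the index path is hypothetical (any `.hdf5` index produced by `generate_embeddings` works) and the prefix is only an example:

```julia
using DocsScraper

# Drop every chunk, source, and embedding column whose source URL starts with the given prefix.
index_path = joinpath("knowledge_packs", "packs",
    "docs.julialang.org-textembedding3large-0-Float32__v1.0.hdf5")
remove_urls_from_index(index_path, ["https://docs.julialang.org/en/v1/base/"])
```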