From 93185094aca8333d13f242a5ccc5897e7ec1cec2 Mon Sep 17 00:00:00 2001 From: Shreyas Shirish Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Thu, 15 Aug 2024 02:40:33 -0700 Subject: [PATCH] structured according to PkgTemplate (#4) * structured according to PkgTemplate * structured according to PkgTemplate, other changes * dependency changes * dependency changes * code imptrovements --- .JuliaFormatter.toml | 3 + .github/dependabot.yml | 7 + .github/workflows/CI.yml | 76 +++++++++ .github/workflows/CompatHelper.yml | 16 ++ .github/workflows/TagBot.yml | 31 ++++ .gitignore | 7 +- LICENSE | 21 +++ Project.toml | 46 ++++-- docs/Project.toml | 15 ++ docs/make.jl | 24 +++ docs/src/index.md | 8 + src/{RAGKit.jl => DocsScraper.jl} | 19 ++- src/crawl.jl | 50 ++---- src/extract_package_name.jl | 162 +++++++++++++++++++ src/extract_urls.jl | 97 ++++++------ src/make_embeddings.jl | 163 ------------------- src/make_knowledge_packs.jl | 245 +++++++++++++++++++++++++++++ src/parser.jl | 165 ++++++------------- src/preparation.jl | 71 +++++---- src/user_preferences.jl | 4 + src/utils.jl | 126 +++++++++++++-- test/crawl.jl | 7 + test/make_knowledge_packs.jl | 8 + test/parser.jl | 11 ++ test/runtests.jl | 38 ++--- test/utils.jl | 10 ++ 26 files changed, 977 insertions(+), 453 deletions(-) create mode 100644 .JuliaFormatter.toml create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/CI.yml create mode 100644 .github/workflows/CompatHelper.yml create mode 100644 .github/workflows/TagBot.yml create mode 100644 LICENSE create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/index.md rename src/{RAGKit.jl => DocsScraper.jl} (59%) create mode 100644 src/extract_package_name.jl delete mode 100644 src/make_embeddings.jl create mode 100644 src/make_knowledge_packs.jl create mode 100644 src/user_preferences.jl create mode 100644 test/crawl.jl create mode 100644 test/make_knowledge_packs.jl create mode 100644 test/parser.jl create mode 100644 test/utils.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..9601a61 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,3 @@ +# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options +style = "sciml" +ignore = ["knowledge_packs"] \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..700707c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..5cd2adb --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,76 @@ +name: CI +on: + push: + branches: + - main + tags: ["*"] + pull_request: + workflow_dispatch: +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + timeout-minutes: 60 + permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created + actions: write + contents: read + strategy: + fail-fast: false + matrix: + version: + - "1.10" + os: + - ubuntu-latest + arch: + - x64 + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v2 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + # - uses: codecov/codecov-action@v4 + # with: + # files: lcov.info + # token: ${{ secrets.CODECOV_TOKEN }} + # fail_ci_if_error: false + docs: + name: Documentation + runs-on: ubuntu-latest + permissions: + actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created + contents: write + statuses: write + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: "1" + - uses: julia-actions/cache@v2 + - name: Configure doc environment + shell: julia --project=docs --color=yes {0} + run: | + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + - name: Run doctests + shell: julia --project=docs --color=yes {0} + run: | + using Documenter: DocMeta, doctest + using DocsScraper + DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) + doctest(DocsScraper) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..d48734a --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 1 * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..0cd3114 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,31 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: + inputs: + lookback: + default: "3" +permissions: + actions: read + checks: read + contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.gitignore b/.gitignore index 9c929a1..4a1c7f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ # Ignore .env files .env knowledge_packs/ -Manifest.toml \ No newline at end of file +Manifest.toml +/Manifest.toml 
+/docs/Manifest.toml +/docs/build/ +.vscode/** +**/.DS_Store \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..183f1b7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Shreyas Agrawal @splendidbug and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Project.toml b/Project.toml index 964d069..1fb77c2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,22 +1,50 @@ -name = "RAGKit" -uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17" -authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"] +name = "DocsScraper" +uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +authors = ["Shreyas Agrawal @splendidbug and contributors"] version = "0.1.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" -URIParser = "30578b45-9adc-5946-b283-645ec420af67" +SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] -AbstractTrees = "0.4.5" -Gumbo = "0.8.2" -HTTP = "1.10.4" -URIs = "1.5.1" +AbstractTrees = "0.4" +Aqua = "0.8" +Dates = "1" +EzXML = "1.2" +Gumbo = "0.8" +HDF5 = "0.17" +HTTP = "1.10" +Inflate = "0.1" +LinearAlgebra = "1" +PromptingTools = "0.48" +SHA = "0.7" +Serialization = "1" +SparseArrays = "1" +Tar = "1" +Test = "1" +URIs = "1.5" +Unicode = "1" +julia = "1.10" +JSON = "0.21" + +[extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Aqua", "Test"] diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..15c39b1 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,15 @@ +[deps] +AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +DocsScraper = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +DotEnv = 
"4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" +EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" +Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" +HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" +PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" +Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +URIParser = "30578b45-9adc-5946-b283-645ec420af67" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..47bd6f5 --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,24 @@ +using DocsScraper +using Documenter + +DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive = true) + +makedocs(; + modules = [DocsScraper], + authors = "Shreyas Agrawal @splendidbug and contributors", + sitename = "DocsScraper.jl", + repo = "https://github.com/splendidbug/DocsScraper.jl/blob/{commit}{path}#{line}", + format = Documenter.HTML(; + repolink = "https://github.com/splendidbug/DocsScraper.jl", + canonical = "https://splendidbug.github.io/DocsScraper.jl", + edit_link = "main", + assets = String[]), + pages = [ + "API Index" => "index.md" + ] +) + +deploydocs(; + repo = "github.com/splendidbug/DocsScraper.jl", + devbranch = "main" +) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..c30e1af --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,8 @@ +# Reference + +```@index +``` + +```@autodocs +Modules = [DocsScraper] +``` \ No newline at end of file diff --git a/src/RAGKit.jl b/src/DocsScraper.jl similarity index 59% rename from src/RAGKit.jl rename to src/DocsScraper.jl index b895363..7f114d9 100644 --- a/src/RAGKit.jl +++ b/src/DocsScraper.jl @@ -1,4 +1,4 @@ -module RAGKit +module DocsScraper using HTTP, Gumbo, AbstractTrees, URIs using Gumbo: HTMLDocument, HTMLElement using EzXML @@ -9,20 +9,23 @@ using LinearAlgebra, Unicode, SparseArrays using HDF5 using Tar using Inflate - using SHA using Serialization, URIs -# using Regex - -# using Robots +using Dates +using JSON include("parser.jl") include("crawl.jl") include("extract_urls.jl") include("preparation.jl") +include("extract_package_name.jl") +export get_package_name -include("make_embeddings.jl") -export make_embeddings +include("make_knowledge_packs.jl") +export make_knowledge_packs +include("user_preferences.jl") +include("utils.jl") +export remove_urls_from_index, urls_for_metadata -end \ No newline at end of file +end diff --git a/src/crawl.jl b/src/crawl.jl index b147511..c972ef2 100644 --- a/src/crawl.jl +++ b/src/crawl.jl @@ -2,13 +2,10 @@ """ parse_robots_txt!(robots_txt::String) -Parses the robots.txt string and returns rules along with the URLs on Sitemap - -# Arguments -- `robots_txt`: robots.txt as a string +Parse the robots.txt string and return rules and the URLs on Sitemap """ function parse_robots_txt!(robots_txt::String) - rules = Dict{String,Dict{String,Vector{String}}}() + rules = Dict{String, Dict{String, Vector{String}}}() current_user_agent = "" sitemap_urls = Vector{AbstractString}() @@ -17,7 +14,8 @@ function parse_robots_txt!(robots_txt::String) if startswith(line, "User-agent:") current_user_agent = strip(split(line, ":")[2]) if !haskey(rules, current_user_agent) - rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}()) + rules[current_user_agent] = Dict( + "Disallow" => Vector{String}(), "Allow" => Vector{String}()) end elseif 
startswith(line, "Disallow:") disallow_path = strip(split(line, ":")[2]) @@ -33,24 +31,20 @@ function parse_robots_txt!(robots_txt::String) url = strip(split(line, ":")[2]) push!(sitemap_urls, url) end - end return rules, sitemap_urls end - """ - check_robots_txt(user_agent::AbstractString, - url::AbstractString) + check_robots_txt(user_agent::AbstractString, url::AbstractString) -Checks the robots.txt of a URL and returns a boolean representing if `user_agent` is allowed to crawl the input url +Check robots.txt of a URL and return a boolean representing if `user_agent` is allowed to crawl the input url, along with sitemap urls # Arguments - `user_agent`: user agent attempting to crawl the webpage - `url`: input URL string """ -function check_robots_txt(user_agent::AbstractString, - url::AbstractString) +function check_robots_txt(user_agent::AbstractString, url::AbstractString) ## TODO: Make a cache of rules for a quick lookup # if (haskey(restricted_urls, url)) @@ -101,27 +95,19 @@ end """ get_base_url(url::AbstractString) -Extracts the base url. - -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url """ function get_base_url(url::AbstractString) - parsed_url = URIs.URI(url) base_url = string(parsed_url.scheme, "://", parsed_url.host, parsed_url.port != nothing ? "" * string(parsed_url.port) : "", parsed_url.path) return base_url end - """ process_hostname(url::AbstractString) -Returns the hostname of an input URL - -# Arguments -- `url`: URL string +Return the hostname of an input URL """ function process_hostname(url::AbstractString) URI = URIs.URI(url) @@ -129,17 +115,17 @@ function process_hostname(url::AbstractString) return hostname end - """ process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) -Adds the `url` to it's hostname in `hostname_dict` +Add `url` to its hostname in `hostname_dict` # Arguments - `url`: URL string - `hostname_dict`: Dict with key being hostname and value being a vector of URLs """ -function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) +function process_hostname!( + url::AbstractString, hostname_dict::Dict{AbstractString, Vector{AbstractString}}) hostname = process_hostname(url) # Add the URL to the dictionary under its hostname @@ -150,20 +136,15 @@ function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractStri end end - """ crawl(input_urls::Vector{<:AbstractString}) -Crawls on the input URLs and returns a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs - -# Arguments -- `input_urls`: A vector of input URLs +Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs """ function crawl(input_urls::Vector{<:AbstractString}) - url_queue = Vector{AbstractString}(input_urls) visited_url_set = Set{AbstractString}() - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() sitemap_urls = Vector{AbstractString}() # TODO: Add parallel processing for URLs @@ -187,6 +168,5 @@ function crawl(input_urls::Vector{<:AbstractString}) end end - return hostname_url_dict - + return hostname_url_dict, visited_url_set end diff --git a/src/extract_package_name.jl b/src/extract_package_name.jl new file mode 100644 index 0000000..525cecf --- /dev/null +++ b/src/extract_package_name.jl @@ -0,0 +1,162 @@ +""" + 
clean_url(url::String) + +Strip URL of any http:// ot https:// or www. prefixes +""" +function clean_url(url::String) + # Remove http://, https://, www., or wwws. + cleaned_url = replace(url, r"^https?://(www\d?\.)?" => "") + return cleaned_url +end + +""" + base_url_segment(url::String) + +Return the base url and first path segment if all the other checks fail +""" +function base_url_segment(url::String) + # Clean the URL from unwanted prefixes + cleaned_url = clean_url(url) + + # Parse the cleaned URL + uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing + + # Extract the base URL (host) + base_url = replace(uri.host, r"^www\." => "") + + # Extract the first path segment + path_segments = split(uri.path, "/"; keepempty = false) + + if !isempty(path_segments) + first_segment = path_segments[1] + return "$base_url/$first_segment" + else + return base_url + end +end + +""" + url_package_name(url::AbstractString) + +Return the text if the URL itself contains the package name with ".jl" or "_jl" suffixes +""" +function url_package_name(url::AbstractString) + if occursin(r"\.jl", url) || occursin(r"_jl", url) + package_name = match(r"[\/]([^\/]+(?:\.jl|_jl))", url) + return package_name.captures[1] + end + return "" +end + +""" + get_base_url(url::AbstractString) + +Extract the base url +""" +function get_base_url(url::AbstractString) + parsed_url = URIs.URI(url) + base_url = string(parsed_url.scheme, "://", parsed_url.host, + parsed_url.port != nothing ? ":" * string(parsed_url.port) : "", parsed_url.path) + return base_url +end + +""" + nav_bar(url::AbstractString) + +Julia doc websites tend to have the package name under ".docs-package-name" class in the HTML tree +""" +function nav_bar(url::AbstractString) + base_url = get_base_url(url) + fetched_content = HTTP.get(base_url) + parsed = Gumbo.parsehtml(String(fetched_content.body)) + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(parsed.root) + if el isa HTMLElement] + content_by_class = filter( + el -> getattr(el, "class", nothing) in ["docs-package-name"], content_candidates) + if (!isempty(content_by_class)) + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() + process_node!(only(content_by_class), heading_hierarchy, parsed_blocks) + package_name = parsed_blocks[2]["text"] + return package_name + end + return "" +end + +""" + text_before_version(url::AbstractString) + +Return text before "stable" or "dev" or any version in URL. 
It is generally observed that doc websites have package names before their versions +""" +function text_before_version(url::AbstractString) + language_prefixes = [ + "/en/", "/es/", "/fr/", "/de/", "/it/", "/pt/", "/ru/", "/zh/", "/ja/", "/ko/"] + contains_prefix = any(occursin(prefix, url) for prefix in language_prefixes) + if contains_prefix + pattern = r"/([^/]+)/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)(?:/|$)" + else + pattern = r"/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)" + end + package_name = match(pattern, url) + if package_name !== nothing + return package_name.captures[1] + end + return "" +end + +""" + docs_in_url(url::AbstractString) + +If the base url is in the form docs.package_name.domain_extension, then return the middle word i.e., package_name +""" +function docs_in_url(url::AbstractString) + cleaned_url = clean_url(url) + + # Parse the cleaned URL + uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing + + # Extract the base URL (host) + base_url = replace(uri.host, r"^www\." => "") + pattern = r"docs\.([^.]+)\.(org|com|ai|net|io|co|tech)" + m = match(pattern, base_url) + if m !== nothing + return m.captures[1] + end + return "" +end + +""" + get_package_name(url::AbstractString) + +Return name of the package through the package URL +""" +function get_package_name(url::AbstractString) + + # try 1: look for package name in URL + package_name = url_package_name(url) + if (!isempty(package_name)) + return package_name + end + + # try 2: look for package name in nav bar + package_name = nav_bar(url) + if (!isempty(package_name)) + return package_name + end + + # try 3: if the base url is in the form docs.package_name.domain_extension + package_name = docs_in_url(url) + if (!isempty(package_name)) + return package_name + end + + # try 4: get text before "stable" or "dev" or any version in URL + package_name = text_before_version(url) + if (!isempty(package_name)) + return package_name + end + + # fallback: return base URL with first path segment + return base_url_segment(url) +end diff --git a/src/extract_urls.jl b/src/extract_urls.jl index b9ea364..d750f34 100644 --- a/src/extract_urls.jl +++ b/src/extract_urls.jl @@ -1,31 +1,37 @@ -# Temporary until I find a package to simplify this +""" + resolve_url(base_url::String, extracted_url::String) -function resolve_url(base_url::String, relative_url::String)::String - base_uri = URI(base_url) - relative_uri = URI(relative_url) +Check the extracted URL with the original URL. Return empty String if the extracted URL belongs to a different domain. 
+Return complete URL if there's a directory traversal paths or the extracted URL belongs to the same domain as the base_url - ## TODO: Make a list of allowed URLs which would contain Julia docs hostnames +# Arguments +- base_url: URL of the page from which other URLs are being extracted +- extracted_url: URL extracted from the base_url +""" +function resolve_url(base_url::String, extracted_url::String) + base_uri = URI(base_url) + extracted_uri = URI(extracted_url) ## TODO: Look for version number either on the bottom left dropdown or identify on the url - if length(relative_url) > 4 && relative_url[1:4] == "http" - if base_uri.host == relative_uri.host - return relative_url + if length(extracted_url) > 4 && extracted_url[1:4] == "http" + if base_uri.host == extracted_uri.host + return extracted_url end return "" end - if !isempty(relative_url) && relative_url[1] == '#' + if !isempty(extracted_url) && extracted_url[1] == '#' return "" end - if !isempty(relative_uri.path) && relative_uri.path[1] == '/' + if !isempty(extracted_uri.path) && extracted_uri.path[1] == '/' resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=relative_uri.path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = extracted_uri.path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end @@ -34,11 +40,11 @@ function resolve_url(base_url::String, relative_url::String)::String base_segments = split(base_uri.path, "/") base_segments = filter((i) -> i != "", base_segments) - relative_segments = split(relative_uri.path, "/") - relative_segments = filter((i) -> i != "", relative_segments) + extracted_segments = split(extracted_uri.path, "/") + extracted_segments = filter((i) -> i != "", extracted_segments) - # Process the relative segments - for segment in relative_segments + # Process the directory traversal paths + for segment in extracted_segments if segment == ".." 
if !isempty(base_segments) pop!(base_segments) @@ -53,31 +59,29 @@ function resolve_url(base_url::String, relative_url::String)::String # Create the resolved URI resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=resolved_path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = resolved_path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end - """ - find_urls!(url::AbstractString, - node::Gumbo.HTMLElement, - url_queue::Vector{<:AbstractString} + find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString} -Function to recursively find and extract the urls +Function to recursively find tags and extract the urls # Arguments - url: The initial input URL - node: The HTML node of type Gumbo.HTMLElement - url_queue: Vector in which extracted URLs will be appended """ -function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) +function find_urls_html!( + url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) if Gumbo.tag(node) == :a && haskey(node.attributes, "href") href = node.attributes["href"] if href !== nothing && !isempty(resolve_url(url, href)) @@ -85,6 +89,7 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end + # Go deep in the HTML tags and check if `node` is an tag for child in node.children if isa(child, HTMLElement) find_urls_html!(url, child, url_queue) @@ -92,9 +97,18 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end +""" + find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) +Identify URL through regex pattern in xml files and push in `url_queue` +# Arguments +- url: url from which all other URLs will be extracted +- url_queue: Vector in which extracted URLs will be appended +""" function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) + # If a string starts with "http" then it is considered as a URL regardless of it being valid. 
+ # Validity of URLs are checked during HTTP fetch try fetched_content = HTTP.get(url) xml_content = String(fetched_content.body) @@ -108,32 +122,23 @@ function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString} end end - - """ get_links!(url::AbstractString, url_queue::Vector{<:AbstractString}) -Function to extract urls inside tags +Extract urls inside html or xml files # Arguments - url: url from which all other URLs will be extracted - url_queue: Vector in which extracted URLs will be appended """ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString}) - @info "Scraping link: $url" - # println(url) - # try fetched_content = HTTP.get(url) parsed = Gumbo.parsehtml(String(fetched_content.body)) - if (url[end-3:end] == ".xml") + if (url[(end - 3):end] == ".xml") find_urls_xml!(url_xml, url_queue) else find_urls_html!(url, parsed.root, url_queue) end - # print("-------------") - # catch e - # println("Bad URL: $url") - # end -end \ No newline at end of file +end diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl deleted file mode 100644 index ba079aa..0000000 --- a/src/make_embeddings.jl +++ /dev/null @@ -1,163 +0,0 @@ -## TODO: Make a function to Check for version number - -""" - report_artifact() - -prints artifact information -""" -function report_artifact(fn_output) - @info("ARTIFACT: $(basename(fn_output))") - @info("sha256: ", bytes2hex(open(sha256, fn_output))) - @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) -end - - - - -""" - create_output_folders() - -Creates output folders -""" -function create_output_folders(knowledge_pack_path::String) - # Define the folder path - folder_path = joinpath(knowledge_pack_path, "packs") - println("folder_path:", folder_path) - # Check if the folder exists - if !isdir(folder_path) - mkpath(folder_path) - @info "Folder created: $folder_path" - else - @info "Folder already exists: $folder_path" - end - -end - -""" - make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}) - -Parses URLs from hostname_url_dict and saves the chunks - -# Arguments -- hostname_url_dict: Dict with key being hostname and value being a vector of URLs -""" -function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String) - output_chunks = Vector{SubString{String}}() - output_sources = Vector{String}() - SAVE_CHUNKS = true - CHUNK_SIZE = 512 - for (hostname, urls) in hostname_url_dict - for url in urls - try - chunks, sources = process_paths(url) - append!(output_chunks, chunks) - append!(output_sources, sources) - catch - @error "error!! 
check url: $url" - end - end - if SAVE_CHUNKS - serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks) - serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources) - end - - end - - -end - -""" - generate_embeddings() - -Deserializes chunks and sources to generate embeddings -""" -function generate_embeddings(knowledge_pack_path::String) - embedder = RT.BatchEmbedder() - entries = readdir(knowledge_pack_path) - - # Initialize a dictionary to group files by hostname and chunk size - hostname_files = Dict{String,Dict{Int,Dict{String,String}}}() - - # Regular expressions to match the file patterns - chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" - sources_pattern = r"^(.*)-sources-(\d+)\.jls$" - - # Group files by hostname and chunk size - for file in entries - match_chunks = match(chunks_pattern, file) - match_sources = match(sources_pattern, file) - - if match_chunks !== nothing - hostname = match_chunks.captures[1] - chunk_size = parse(Int, match_chunks.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file) - elseif match_sources !== nothing - hostname = match_sources.captures[1] - chunk_size = parse(Int, match_sources.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["sources"] = joinpath(knowledge_pack_path, file) - end - end - - - # Process each pair of files - for (hostname, chunk_files) in hostname_files - for (chunk_size, files) in chunk_files - if haskey(files, "chunks") && haskey(files, "sources") - chunks_file = files["chunks"] - sources_file = files["sources"] - chunks = deserialize(chunks_file) - sources = deserialize(sources_file) - cost_tracker = Threads.Atomic{Float64}(0.0) - full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, api_key=ENV["OPENAI_API_KEY"]) - - # Float32 - fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") - fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5") - h5open(fn_temp, "w") do file - file["chunks"] = chunks - file["sources"] = sources - file["embeddings"] = full_embeddings - file["type"] = "ChunkIndex" - # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" - end - run(tar - cvzf$fn_output - C$(dirname(fn_temp))$(basename(fn_temp))) - report_artifact(fn_output) - - else - @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" - end - end - end - -end - - - -""" - make_embeddings(input_urls::Vector{<:AbstractString}) - -Entry point to crawl, parse and create embeddings - -# Arguments -- input_urls: vector containing URL strings to parse -""" -function make_embeddings(input_urls::Vector{<:AbstractString}) - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() - hostname_url_dict = crawl(input_urls) - knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") - 
create_output_folders(knowledge_pack_path) - make_chunks(hostname_url_dict, knowledge_pack_path) - generate_embeddings(knowledge_pack_path) -end \ No newline at end of file diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl new file mode 100644 index 0000000..5d56ff8 --- /dev/null +++ b/src/make_knowledge_packs.jl @@ -0,0 +1,245 @@ +""" + report_artifact(fn_output) + +Print artifact information +""" +function report_artifact(fn_output) + @info("ARTIFACT: $(basename(fn_output))") + @info("sha256: ", bytes2hex(open(sha256, fn_output))) + @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) +end + +""" + create_output_folders(knowledge_pack_path::String) + +Create output folders on the knowledge_pack_path +""" +function create_output_folders(knowledge_pack_path::String) + # Define the folder path + folder_path = joinpath(knowledge_pack_path, "packs") + # Check if the folder exists + if !isdir(folder_path) + mkpath(folder_path) + end +end + +""" + make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) + +Parse URLs from hostname_url_dict and save the chunks + +# Arguments +- hostname_url_dict: Dict with key being hostname and value being a vector of URLs +- knowledge_pack_path: Knowledge pack path +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +""" +function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}}, + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) + SAVE_CHUNKS = true + for (hostname, urls) in hostname_url_dict + output_chunks = Vector{SubString{String}}() + output_sources = Vector{String}() + for url in urls + try + chunks, sources = process_paths( + url; max_chunk_size, min_chunk_size) + append!(output_chunks, chunks) + append!(output_sources, sources) + catch + @error "error!! 
check url: $url" + end + end + if SAVE_CHUNKS + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_chunks) + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_sources) + end + end +end + +""" + l2_norm_columns(mat::AbstractMatrix) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(mat::AbstractMatrix) + norm_ = norm.(eachcol(mat)) + return mat ./ norm_' +end + +""" + l2_norm_columns(vect::AbstractVector) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(vect::AbstractVector) + norm_ = norm(vect) + return vect / norm_ +end + +""" + generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, + embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString) + +Deserialize chunks and sources to generate embeddings + +# Arguments +- model: Embedding model +- embedding_size: Embedding dimensions +- custom_metadata: Custom metadata like ecosystem name if required +""" +function generate_embeddings( + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + model::AbstractString = MODEL, + embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString) + embedder = RT.BatchEmbedder() + entries = readdir(knowledge_pack_path) + # Initialize a dictionary to group files by hostname and chunk size + hostname_files = Dict{String, Dict{Int, Dict{String, String}}}() + + # Regular expressions to match the file patterns of chunks and sources + chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$" + sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$" + + # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" + # sources_pattern = r"^(.*)-sources-(\d+)\.jls$" + + # Group files by hostname and chunk size + for file in entries + match_chunks = match(chunks_pattern, file) + match_sources = match(sources_pattern, file) + + if match_chunks !== nothing + hostname = match_chunks.captures[1] + max_chunk_size = parse(Int, match_chunks.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], max_chunk_size) + hostname_files[hostname][max_chunk_size] = Dict{String, String}() + end + hostname_files[hostname][max_chunk_size]["chunks"] = joinpath( + knowledge_pack_path, file) + elseif match_sources !== nothing + hostname = match_sources.captures[1] + max_chunk_size = parse(Int, match_sources.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], max_chunk_size) + hostname_files[hostname][max_chunk_size] = Dict{String, String}() + end + hostname_files[hostname][max_chunk_size]["sources"] = joinpath( + knowledge_pack_path, file) + end + end + # Process each pair of files + for (hostname, chunk_files) in hostname_files + for (max_chunk_size, files) in chunk_files + if haskey(files, "chunks") && haskey(files, "sources") + chunks_file = files["chunks"] + sources_file = files["sources"] + chunks = deserialize(chunks_file) + sources = deserialize(sources_file) + cost_tracker = Threads.Atomic{Float64}(0.0) + full_embeddings = RT.get_embeddings( + embedder, chunks; model, verbose = false, cost_tracker) + @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))" + + trunc = embedding_size < EMBEDDING_SIZE ? 
1 : 0 + fn_output = joinpath(knowledge_pack_path, "packs", + "$hostname-$model-$trunc-Float32__v1.0.tar.gz") + fn_temp = joinpath(knowledge_pack_path, "packs", + "$hostname-$model-$trunc-Float32__v1.0.hdf5") + + h5open(fn_temp, "w") do file + file["chunks"] = chunks + file["sources"] = sources + file["embeddings"] = full_embeddings[1:embedding_size, :] |> + l2_norm_columns |> x -> map(>(0), x) + file["type"] = "ChunkIndex" + + package_url_dict = Dict{String, Vector{String}}() + package_url_dict = urls_for_metadata(sources) + + metadata = Dict( + :embedded_dt => Dates.today(), + :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size, + :embedding_size => embedding_size, :model => model, + :packages => package_url_dict) + + metadata_json = JSON.json(metadata) + file["metadata"] = metadata_json + end + + command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` + run(command) + report_artifact(fn_output) + + else + @warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size" + end + end + end +end + +""" + make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[], + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE, + custom_metadata::AbstractString) + +Entry point to crawl, parse and generate embeddings + +# Arguments +- crawlable_urls: URLs that should be crawled to find more links +- single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +- model: Embedding model +- embedding_size: Embedding dimensions +- custom_metadata: Custom metadata like ecosystem name if required +""" +function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[]; + single_urls::Vector{<:AbstractString} = String[], + max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE, + model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "") + if isempty(crawlable_urls) && isempty(single_urls) + error("At least one of `input_urls` or `single_pages` must be provided.") + end + + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() + + if !isempty(crawlable_urls) + hostname_url_dict, visited_url_set = crawl(crawlable_urls) + else + visited_url_set = Set{AbstractString}() + end + for url in single_urls + base_url = get_base_url(url) + if !in(base_url, visited_url_set) + push!(visited_url_set, base_url) + crawlable, sitemap_urls = check_robots_txt("*", base_url) + if crawlable + try + process_hostname!(url, hostname_url_dict) + catch + @error "Bad URL: $base_url" + end + end + end + end + knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") + create_output_folders(knowledge_pack_path) + make_chunks( + hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) + generate_embeddings( + knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata) +end diff --git a/src/parser.jl b/src/parser.jl index d909280..2de7035 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -1,21 +1,3 @@ -""" -Working: - -Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks -ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion. 
-For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks. -If the current node is a code block, return the text inside code block with backticks. -If the node is neither heading nor code, then we'll need to go deeper in the hierarchy. -if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td] -it is assumed that everything inside the tag is part of a single text block with inline code. -But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false. -To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration -that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again. -We indicate this by a return flag is_text_inserted -""" - - - """ insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, parsed_blocks::Vector{Dict{String,Any}}, @@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector - text_to_insert: Text to be inserted - text_type: The text to be inserted could be heading or a code block or just text """ -function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - text_to_insert::AbstractString, - text_type::AbstractString) - +function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + text_to_insert::AbstractString, + text_type::AbstractString) if !isempty(strip(text_to_insert)) push!(parsed_blocks, Dict(text_type => strip(text_to_insert), @@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, end end - - """ process_headings!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -57,13 +36,13 @@ Process headings. If the current node is heading, directly insert into parsed_bl - parsed_blocks: Vector of Dicts to store parsed text and metadata """ function process_headings!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}) tag_name = Gumbo.tag(node) # Clear headings of equal or lower level for k in collect(keys(heading_hierarchy)) - if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) + if k != "header" && + Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) delete!(heading_hierarchy, k) end end @@ -123,11 +102,10 @@ If the node is neither heading nor code - prev_text_buffer: IO Buffer which contains previous text """ function process_generic_node!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) @@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless, # there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted. 
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header] - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, + :cite, :address, :em, :td, :a, :span, :header] + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + elseif tag_name in [:script] + continue else - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end # changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call) @@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, print(prev_text_buffer, " " * received_text) text_to_insert = text_to_insert * " " * received_text end - end # if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence, @@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if we're insert text in current node level, then we should insert the previous text if available, # otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird if !isempty(strip(text_to_insert)) - insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") is_text_inserted = true end @@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, return "", is_code_block, is_text_inserted end - """ process_docstring!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -224,11 +206,10 @@ Function to process node of class `docstring` - prev_text_buffer: IO Buffer which contains previous text """ function process_docstring!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) is_code_block = false @@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement, # Insert "header" if Gumbo.tag(children[1]) == :header heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1])) - insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") end - received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) if !isempty(strip(received_text)) insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text") @@ -279,11 +262,10 @@ Function to process a node - prev_text_buffer: IO Buffer which contains previous text """ function process_node!(node::Gumbo.HTMLElement, - 
heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) tag_name = Gumbo.tag(node) if startswith(string(tag_name), "h") && isdigit(last(string(tag_name))) return process_headings!(node, heading_hierarchy, parsed_blocks) @@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement, return process_code(node) elseif tag_name == :article && getattr(node, "class", "") == "docstring" - return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_docstring!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_generic_node!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - """ multiple dispatch for process_node!() when node is of type Gumbo.HTMLText """ @@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...) return strip(Gumbo.text(node)), is_code_block, is_text_inserted end - """ get_base_url(url::AbstractString) -Extracts the base url. - -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url. """ function get_base_url(url::AbstractString) parsed_url = URIs.URI(url) @@ -329,7 +306,7 @@ end """ get_html_content(root::Gumbo.HTMLElement) -Returns the main content of the HTML. If not found, returns the whole HTML to parse +Return the main content of the HTML. If not found, return the whole HTML to parse # Arguments - `root`: The HTML root from which content is extracted @@ -338,73 +315,31 @@ function get_html_content(root::Gumbo.HTMLElement) target_ids = Set(["VPContent", "main_content_wrap", "pages-content"]) target_classes = Set(["content", "franklin-content"]) - content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] # First try to find by ID - content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates) + content_by_id = filter( + el -> getattr(el, "id", nothing) in target_ids, content_candidates) if !isempty(content_by_id) return only(content_by_id) end # Fallback to class if no ID matches - content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates) + content_by_class = filter( + el -> getattr(el, "class", nothing) in target_classes, content_candidates) if !isempty(content_by_class) return only(content_by_class) end # Fallback to the root node if no class matches return root - end - """ parse_url(url::AbstractString) -Initiator and main function to parse HTML from url - -# Arguments -- `url`: URL string to parse - -# Returns -- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata - -# Usage -parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/") - -# Example -Let the HTML be: - - - - -

-    <h1>Heading 1</h1>
-        <h2>Heading 2</h2>
-            <p>para 1</p>
-            <h3>Heading 3</h3>
-                <code>this is my code block</code>
-            <h3>This is another h3 under Heading 2</h3>
-                <p>This is a paragraph with <code>inline code</code></p>
-        <h2>Heading 2_2</h2>
-            <p>para ewg</p>

- - - - -Output: -Any[ - Dict{String, Any}("URL" => "URL") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3") - Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2")) - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg") -] +Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata """ function parse_url_to_blocks(url::AbstractString) @@ -418,9 +353,9 @@ function parse_url_to_blocks(url::AbstractString) # Getting title of the document # title = [el # for el in AbstractTrees.PreOrderDFS(r_parsed.root) - # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") - parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)]) - heading_hierarchy = Dict{Symbol,Any}() + # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks) return parsed_blocks catch diff --git a/src/preparation.jl b/src/preparation.jl index ab8d7b5..8736050 100644 --- a/src/preparation.jl +++ b/src/preparation.jl @@ -1,9 +1,7 @@ -# include("recursive_splitter.jl") -include("utils.jl") """ get_header_path(d::Dict) -Concatenates the h1, h2, h3 keys from the metadata of a Dict +Concatenate the h1, h2, h3 keys from the metadata of a Dict # Examples ```julia @@ -12,17 +10,21 @@ get_header_path(d) # Output: "Axis/Attributes/yzoomkey" ``` """ -function get_header_path(d::Dict) - metadata = get(d, "metadata", Dict{Any,Any}()) +function get_header_path(d::Dict{String, Any}) + metadata = get(d, "metadata", Dict{Any, Any}()) isempty(metadata) && return nothing keys_ = [:h1, :h2, :h3] vals = get.(Ref(metadata), keys_, "") |> x -> filter(!isempty, x) |> x -> join(x, "/") isempty(vals) ? 
nothing : vals end +""" + roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") -"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length" -function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="") +Roll-up chunks (that have the same header!), so we can split them later by to get the desired length +""" +function roll_up_chunks(parsed_blocks::Vector{Dict{String, Any}}, + url::AbstractString; separator::String = "") docs = String[] io = IOBuffer() last_header = nothing @@ -35,7 +37,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? "" : " - $last_header") push!(sources, src) end last_header = header @@ -48,28 +50,31 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? "" : " - $last_header") push!(sources, src) end return docs, sources end - struct DocParserChunker <: RT.AbstractChunker end -""" - RT.get_chunks(chunker::DocParserChunker, - html_files::Vector{<:AbstractString}; - sources::AbstractVector{<:AbstractString}=html_files, - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) -Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length. """ -function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) - - + RT.get_chunks(chunker::DocParserChunker, url::AbstractString; + verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) + +Extract chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, +and splits them by separators to get the desired length. + +# Arguments +- chunker: DocParserChunker +- url: URL of the webpage to extract chunks +- verbose: Bool to print the log +- separators: Chunk separators +- max_chunk_size Maximum chunk size +""" +function RT.get_chunks( + chunker::DocParserChunker, url::AbstractString; + verbose::Bool = true, separators = ["\n\n", ". 
", "\n", " "], max_chunk_size::Int = MAX_CHUNK_SIZE) SEP = "" sources = AbstractVector{<:AbstractString} output_chunks = Vector{SubString{String}}() @@ -79,12 +84,13 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; parsed_blocks = parse_url_to_blocks(url) ## Roll up to the same header - docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP) + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) ## roll up chunks by SEP splitter, then remove it later for (doc, src) in zip(docs_, sources_) ## roll up chunks by SEP splitter, then remove it later - doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|> + doc_chunks = PT.recursive_splitter( + doc, [SEP, separators...]; max_length = max_chunk_size) .|> x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x) # skip if no chunks found isempty(doc_chunks) && continue @@ -94,22 +100,25 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; return output_chunks, output_sources end +""" + process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) - -"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them." -function process_paths(url::AbstractString, max_length::Int=512) - +Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them. +""" +function process_paths(url::AbstractString; + max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) output_chunks = Vector{SubString{String}}() output_sources = Vector{String}() - chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length) + chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size) append!(output_chunks, chunks) append!(output_sources, sources) - @info "Scraping done: $(length(output_chunks)) chunks" - postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true) + output_chunks, output_sources = postprocess_chunks( + output_chunks, output_sources; min_chunk_size, skip_code = true) return output_chunks, output_sources end diff --git a/src/user_preferences.jl b/src/user_preferences.jl new file mode 100644 index 0000000..00c1a2f --- /dev/null +++ b/src/user_preferences.jl @@ -0,0 +1,4 @@ +global MIN_CHUNK_SIZE = 40 +global MAX_CHUNK_SIZE = 384 +global MODEL = "text-embedding-3-large" +global EMBEDDING_SIZE = 3072 diff --git a/src/utils.jl b/src/utils.jl index 4bf1e07..dfbc17c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,9 @@ -"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)." +""" + find_duplicates(chunks::AbstractVector{<:AbstractString}) + +Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, +where `true` indicates a duplicate (second instance of the same text). +""" function find_duplicates(chunks::AbstractVector{<:AbstractString}) # hash the chunks for easier search hashed_chunks = bytes2hex.(sha256.(chunks)) @@ -20,36 +25,60 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString}) return duplicates end -"Removes chunks that are duplicated in the input list of chunks and their corresponding sources." 
-function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+"""
+    remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+
+Remove chunks that are duplicated in the input list of chunks and their corresponding sources.
+"""
+function remove_duplicates(
+        chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
     idxs = find_duplicates(chunks)
     return chunks[.!idxs], sources[.!idxs]
 end
 
-"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources."
-function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true)
+"""
+    remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+        min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+Remove chunks that are shorter than a specified length (`min_chunk_size`) from the input list of chunks and their corresponding sources.
+"""
+function remove_short_chunks(
+        chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+        min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true)
+    chunk_lengths = length.(chunks)
     idx = if skip_code
-        ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text)
-        findall(x -> length(x) >= min_length || occursin("```", x), chunks)
+        ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text)
+        findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks)
     else
-        findall(x -> length(x) >= min_length, chunks)
+        findall(x -> length(x) >= min_chunk_size, chunks)
     end
+    chunk_lengths = length.(chunks[idx])
     return chunks[idx], sources[idx]
 end
 
-
-function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString}, websites::AbstractVector{<:AbstractString})
-    @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`"
+function replace_local_paths(
+        sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString},
+        websites::AbstractVector{<:AbstractString})
+    @assert length(paths)==length(websites) "Length of `paths` must match length of `websites`"
     replacement_pairs = paths .=> websites
     output = map(x -> replace(x, replacement_pairs...), sources)
+    return output
 end
 
+"""
+    postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+        min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+        websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
 
-"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates."
-function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true,
-    paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.
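+
+# Examples
+A minimal, illustrative sketch (the inputs below are made up for demonstration; defaults come from `user_preferences.jl`):
+```julia
+chunks = ["tiny", "This chunk is long enough to keep.", "This chunk is long enough to keep."]
+sources = ["https://example.org/a", "https://example.org/b", "https://example.org/c"]
+chunks, sources = postprocess_chunks(chunks, sources; min_chunk_size = 10, skip_code = true)
+# the short chunk and the duplicate are dropped; `sources` stays aligned with `chunks`
+```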
+""" +function postprocess_chunks( + chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true, + paths::Union{Nothing, AbstractVector{<:AbstractString}} = nothing, + websites::Union{Nothing, AbstractVector{<:AbstractString}} = nothing) len_ = length(chunks) - chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code) + chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code) @info "Removed $(len_ - length(chunks)) short chunks" len_ = length(chunks) @@ -63,6 +92,71 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A end return chunks, sources +end + +""" + function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + +Remove chunks and sources corresponding to URLs starting with `prefix_urls` +""" +function remove_urls_from_index( + index_path::AbstractString, prefix_urls = Vector{<:AbstractString}) + @assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)." + + h5open(index_path, "r+") do orig_file + # Load the sources dataset into a Julia array + sources = read(orig_file["sources"]) + chunks = read(orig_file["chunks"]) + embeddings = read(orig_file["embeddings"]) + + for url_to_remove in prefix_urls + indices_to_remove = findall(x -> startswith(x, url_to_remove), sources) + sources = deleteat!(sources, indices_to_remove) + chunks = deleteat!(chunks, indices_to_remove) + embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)] + end + + write(file["sources"], sources) + write(file["chunks"], chunks) + write(file["embeddings"], embeddings) + end +end + +""" + urls_for_metadata(sources::Vector{String}) + +Return a Dict of package names with their associated URLs +Note: Due to their large number, URLs are stripped down to the package name; Package subpaths are not included in metadata. +""" +function urls_for_metadata(sources::Vector{String}) + urls = [split(source, " -")[1] for source in sources] + pattern = r"(/(?:stable|dev|latest|v\d+(?:\.\d+)*))" + cleaned_urls = [endswith(String(url), "/") ? String(url)[1:(end - 1)] : String(url) + for url in urls] + unique_urls = unique(cleaned_urls) + package_names = Vector{String}() + + for url in unique_urls + push!(package_names, get_package_name(String(url))) + end + cleaned_urls = [match(pattern, url) !== nothing ? 
+                    first(split(url, pattern)) : url for url in unique_urls]
-end
\ No newline at end of file
+    zipped = zip(cleaned_urls, package_names) |> collect
+    unique_pairs = unique(zipped)
+    unique_urls = [pair[1] for pair in unique_pairs]
+    unique_package_names = [pair[2] for pair in unique_pairs]
+
+    package_url_dict = Dict{String, Vector{String}}()
+    for (url, package_name) in zip(unique_urls, unique_package_names)
+        if haskey(package_url_dict, package_name)
+            # If the package_name is already a key, append the url to the existing array
+            push!(package_url_dict[package_name], url)
+        else
+            # Otherwise, create a new entry with the package_name and the url
+            package_url_dict[package_name] = [url]
+        end
+    end
+    return package_url_dict
+end
diff --git a/test/crawl.jl b/test/crawl.jl
new file mode 100644
index 0000000..6b00ca4
--- /dev/null
+++ b/test/crawl.jl
@@ -0,0 +1,7 @@
+using DocsScraper: crawl
+
+@testset "crawl" begin
+    urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
+    hostname_url_dict = crawl(urls)
+    @test length(hostname_url_dict) > 0
+end
diff --git a/test/make_knowledge_packs.jl b/test/make_knowledge_packs.jl
new file mode 100644
index 0000000..5690725
--- /dev/null
+++ b/test/make_knowledge_packs.jl
@@ -0,0 +1,8 @@
+using DocsScraper: process_paths
+
+@testset "overall test" begin
+    url = "https://docs.julialang.org/en/v1/"
+    chunks, sources = process_paths(url)
+    @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
+          sources[1] != nothing
+end
diff --git a/test/parser.jl b/test/parser.jl
new file mode 100644
index 0000000..0faeb04
--- /dev/null
+++ b/test/parser.jl
@@ -0,0 +1,11 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+    url = "https://docs.julialang.org/en/v1/"
+    parsed_blocks = parse_url_to_blocks(url)
+    @test length(parsed_blocks) > 0
+    SEP = "<SEP>"
+    docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+    @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+          sources_[1] != nothing
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index fdde81f..6e1e7e8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,33 +1,13 @@
-
+using DocsScraper
 using Test
 
-urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
-url = urls[1]
-queue = Vector{AbstractString}()
-
-@testset "check robots.txt" begin
-    result, sitemap_queue = check_robots_txt("*", url)
-    @test result == true
-end
-
-@testset "HTTP get" begin
-    @test HTTP.get(url) != nothing
-end
-
-@testset "get_urls!" begin
-    get_urls!(url, queue)
-    @test length(queue) > 1
-end
-
-@testset "parse & roll_up" begin
-    parsed_blocks = parse_url_to_blocks(url)
-    @test length(parsed_blocks) > 0
-    SEP = "<SEP>"
-    docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP)
-    @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing
-end
+using Aqua
 
-@testset "overall test" begin
-    chunks, sources = process_paths(url)
-    @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing
+@testset "DocsScraper.jl" begin
+    @testset "Code quality (Aqua.jl)" begin
+        Aqua.test_all(DocsScraper; persistent_tasks = false)
+    end
+    include("crawl.jl")
+    include("parser.jl")
+    include("make_knowledge_packs.jl")
 end
diff --git a/test/utils.jl b/test/utils.jl
new file mode 100644
index 0000000..fbe338a
--- /dev/null
+++ b/test/utils.jl
@@ -0,0 +1,11 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+    url = "https://docs.julialang.org/en/v1/"
+    parsed_blocks = parse_url_to_blocks(url)
+    @test length(parsed_blocks) > 0
+    SEP = "<SEP>"
+    docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+    @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+          sources_[1] != nothing
+end
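
For orientation, a minimal sketch of how the pieces added in this patch compose end to end; the URL and keyword values below are illustrative only (the defaults come from `src/user_preferences.jl`), not part of the patch itself:

```julia
using DocsScraper: process_paths, urls_for_metadata

# Scrape one documentation site, chunk it and post-process the chunks
url = "https://docs.julialang.org/en/v1/"   # illustrative target, as used in the tests
chunks, sources = process_paths(url; max_chunk_size = 384, min_chunk_size = 40)

# Group the resulting sources by package name for the knowledge-pack metadata
package_url_dict = urls_for_metadata(sources)
```

This mirrors what `test/make_knowledge_packs.jl` exercises, with the metadata grouping step added on top.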