From 93185094aca8333d13f242a5ccc5897e7ec1cec2 Mon Sep 17 00:00:00 2001 From: Shreyas Shirish Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Thu, 15 Aug 2024 02:40:33 -0700 Subject: [PATCH] structured according to PkgTemplate (#4) * structured according to PkgTemplate * structured according to PkgTemplate, other changes * dependency changes * dependency changes * code imptrovements --- .JuliaFormatter.toml | 3 + .github/dependabot.yml | 7 + .github/workflows/CI.yml | 76 +++++++++ .github/workflows/CompatHelper.yml | 16 ++ .github/workflows/TagBot.yml | 31 ++++ .gitignore | 7 +- LICENSE | 21 +++ Project.toml | 46 ++++-- docs/Project.toml | 15 ++ docs/make.jl | 24 +++ docs/src/index.md | 8 + src/{RAGKit.jl => DocsScraper.jl} | 19 ++- src/crawl.jl | 50 ++---- src/extract_package_name.jl | 162 +++++++++++++++++++ src/extract_urls.jl | 97 ++++++------ src/make_embeddings.jl | 163 ------------------- src/make_knowledge_packs.jl | 245 +++++++++++++++++++++++++++++ src/parser.jl | 165 ++++++------------- src/preparation.jl | 71 +++++---- src/user_preferences.jl | 4 + src/utils.jl | 126 +++++++++++++-- test/crawl.jl | 7 + test/make_knowledge_packs.jl | 8 + test/parser.jl | 11 ++ test/runtests.jl | 38 ++--- test/utils.jl | 10 ++ 26 files changed, 977 insertions(+), 453 deletions(-) create mode 100644 .JuliaFormatter.toml create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/CI.yml create mode 100644 .github/workflows/CompatHelper.yml create mode 100644 .github/workflows/TagBot.yml create mode 100644 LICENSE create mode 100644 docs/Project.toml create mode 100644 docs/make.jl create mode 100644 docs/src/index.md rename src/{RAGKit.jl => DocsScraper.jl} (59%) create mode 100644 src/extract_package_name.jl delete mode 100644 src/make_embeddings.jl create mode 100644 src/make_knowledge_packs.jl create mode 100644 src/user_preferences.jl create mode 100644 test/crawl.jl create mode 100644 test/make_knowledge_packs.jl create mode 100644 test/parser.jl create mode 100644 test/utils.jl diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..9601a61 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,3 @@ +# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options +style = "sciml" +ignore = ["knowledge_packs"] \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..700707c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..5cd2adb --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,76 @@ +name: CI +on: + push: + branches: + - main + tags: ["*"] + pull_request: + workflow_dispatch: +concurrency: + # Skip intermediate builds: always. + # Cancel intermediate builds: only if it is a pull request build. 
+ group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} +jobs: + test: + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} + runs-on: ${{ matrix.os }} + timeout-minutes: 60 + permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created + actions: write + contents: read + strategy: + fail-fast: false + matrix: + version: + - "1.10" + os: + - ubuntu-latest + arch: + - x64 + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: ${{ matrix.version }} + arch: ${{ matrix.arch }} + - uses: julia-actions/cache@v2 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + - uses: julia-actions/julia-processcoverage@v1 + # - uses: codecov/codecov-action@v4 + # with: + # files: lcov.info + # token: ${{ secrets.CODECOV_TOKEN }} + # fail_ci_if_error: false + docs: + name: Documentation + runs-on: ubuntu-latest + permissions: + actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created + contents: write + statuses: write + steps: + - uses: actions/checkout@v4 + - uses: julia-actions/setup-julia@v2 + with: + version: "1" + - uses: julia-actions/cache@v2 + - name: Configure doc environment + shell: julia --project=docs --color=yes {0} + run: | + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate() + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-docdeploy@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + - name: Run doctests + shell: julia --project=docs --color=yes {0} + run: | + using Documenter: DocMeta, doctest + using DocsScraper + DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) + doctest(DocsScraper) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..d48734a --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 1 * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..0cd3114 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,31 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: + inputs: + lookback: + default: "3" +permissions: + actions: read + checks: read + contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.gitignore b/.gitignore index 9c929a1..4a1c7f4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ # Ignore .env files .env knowledge_packs/ -Manifest.toml \ No newline at end of file +Manifest.toml +/Manifest.toml 
+/docs/Manifest.toml +/docs/build/ +.vscode/** +**/.DS_Store \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..183f1b7 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Shreyas Agrawal @splendidbug and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Project.toml b/Project.toml index 964d069..1fb77c2 100644 --- a/Project.toml +++ b/Project.toml @@ -1,22 +1,50 @@ -name = "RAGKit" -uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17" -authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"] +name = "DocsScraper" +uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +authors = ["Shreyas Agrawal @splendidbug and contributors"] version = "0.1.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" +Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" -URIParser = "30578b45-9adc-5946-b283-645ec420af67" +SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" +Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [compat] -AbstractTrees = "0.4.5" -Gumbo = "0.8.2" -HTTP = "1.10.4" -URIs = "1.5.1" +AbstractTrees = "0.4" +Aqua = "0.8" +Dates = "1" +EzXML = "1.2" +Gumbo = "0.8" +HDF5 = "0.17" +HTTP = "1.10" +Inflate = "0.1" +LinearAlgebra = "1" +PromptingTools = "0.48" +SHA = "0.7" +Serialization = "1" +SparseArrays = "1" +Tar = "1" +Test = "1" +URIs = "1.5" +Unicode = "1" +julia = "1.10" +JSON = "0.21" + +[extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Aqua", "Test"] diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..15c39b1 --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,15 @@ +[deps] +AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +DocsScraper = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +DotEnv = 
"4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" +EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" +Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" +HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" +LiveServer = "16fef848-5104-11e9-1b77-fb7a48bbb589" +PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" +Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +URIParser = "30578b45-9adc-5946-b283-645ec420af67" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" diff --git a/docs/make.jl b/docs/make.jl new file mode 100644 index 0000000..47bd6f5 --- /dev/null +++ b/docs/make.jl @@ -0,0 +1,24 @@ +using DocsScraper +using Documenter + +DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive = true) + +makedocs(; + modules = [DocsScraper], + authors = "Shreyas Agrawal @splendidbug and contributors", + sitename = "DocsScraper.jl", + repo = "https://github.com/splendidbug/DocsScraper.jl/blob/{commit}{path}#{line}", + format = Documenter.HTML(; + repolink = "https://github.com/splendidbug/DocsScraper.jl", + canonical = "https://splendidbug.github.io/DocsScraper.jl", + edit_link = "main", + assets = String[]), + pages = [ + "API Index" => "index.md" + ] +) + +deploydocs(; + repo = "github.com/splendidbug/DocsScraper.jl", + devbranch = "main" +) diff --git a/docs/src/index.md b/docs/src/index.md new file mode 100644 index 0000000..c30e1af --- /dev/null +++ b/docs/src/index.md @@ -0,0 +1,8 @@ +# Reference + +```@index +``` + +```@autodocs +Modules = [DocsScraper] +``` \ No newline at end of file diff --git a/src/RAGKit.jl b/src/DocsScraper.jl similarity index 59% rename from src/RAGKit.jl rename to src/DocsScraper.jl index b895363..7f114d9 100644 --- a/src/RAGKit.jl +++ b/src/DocsScraper.jl @@ -1,4 +1,4 @@ -module RAGKit +module DocsScraper using HTTP, Gumbo, AbstractTrees, URIs using Gumbo: HTMLDocument, HTMLElement using EzXML @@ -9,20 +9,23 @@ using LinearAlgebra, Unicode, SparseArrays using HDF5 using Tar using Inflate - using SHA using Serialization, URIs -# using Regex - -# using Robots +using Dates +using JSON include("parser.jl") include("crawl.jl") include("extract_urls.jl") include("preparation.jl") +include("extract_package_name.jl") +export get_package_name -include("make_embeddings.jl") -export make_embeddings +include("make_knowledge_packs.jl") +export make_knowledge_packs +include("user_preferences.jl") +include("utils.jl") +export remove_urls_from_index, urls_for_metadata -end \ No newline at end of file +end diff --git a/src/crawl.jl b/src/crawl.jl index b147511..c972ef2 100644 --- a/src/crawl.jl +++ b/src/crawl.jl @@ -2,13 +2,10 @@ """ parse_robots_txt!(robots_txt::String) -Parses the robots.txt string and returns rules along with the URLs on Sitemap - -# Arguments -- `robots_txt`: robots.txt as a string +Parse the robots.txt string and return rules and the URLs on Sitemap """ function parse_robots_txt!(robots_txt::String) - rules = Dict{String,Dict{String,Vector{String}}}() + rules = Dict{String, Dict{String, Vector{String}}}() current_user_agent = "" sitemap_urls = Vector{AbstractString}() @@ -17,7 +14,8 @@ function parse_robots_txt!(robots_txt::String) if startswith(line, "User-agent:") current_user_agent = strip(split(line, ":")[2]) if !haskey(rules, current_user_agent) - rules[current_user_agent] = Dict("Disallow" => Vector{String}(), "Allow" => Vector{String}()) + rules[current_user_agent] = Dict( + "Disallow" => Vector{String}(), "Allow" => Vector{String}()) end elseif 
startswith(line, "Disallow:") disallow_path = strip(split(line, ":")[2]) @@ -33,24 +31,20 @@ function parse_robots_txt!(robots_txt::String) url = strip(split(line, ":")[2]) push!(sitemap_urls, url) end - end return rules, sitemap_urls end - """ - check_robots_txt(user_agent::AbstractString, - url::AbstractString) + check_robots_txt(user_agent::AbstractString, url::AbstractString) -Checks the robots.txt of a URL and returns a boolean representing if `user_agent` is allowed to crawl the input url +Check robots.txt of a URL and return a boolean representing if `user_agent` is allowed to crawl the input url, along with sitemap urls # Arguments - `user_agent`: user agent attempting to crawl the webpage - `url`: input URL string """ -function check_robots_txt(user_agent::AbstractString, - url::AbstractString) +function check_robots_txt(user_agent::AbstractString, url::AbstractString) ## TODO: Make a cache of rules for a quick lookup # if (haskey(restricted_urls, url)) @@ -101,27 +95,19 @@ end """ get_base_url(url::AbstractString) -Extracts the base url. - -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url """ function get_base_url(url::AbstractString) - parsed_url = URIs.URI(url) base_url = string(parsed_url.scheme, "://", parsed_url.host, parsed_url.port != nothing ? "" * string(parsed_url.port) : "", parsed_url.path) return base_url end - """ process_hostname(url::AbstractString) -Returns the hostname of an input URL - -# Arguments -- `url`: URL string +Return the hostname of an input URL """ function process_hostname(url::AbstractString) URI = URIs.URI(url) @@ -129,17 +115,17 @@ function process_hostname(url::AbstractString) return hostname end - """ process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) -Adds the `url` to it's hostname in `hostname_dict` +Add `url` to its hostname in `hostname_dict` # Arguments - `url`: URL string - `hostname_dict`: Dict with key being hostname and value being a vector of URLs """ -function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) +function process_hostname!( + url::AbstractString, hostname_dict::Dict{AbstractString, Vector{AbstractString}}) hostname = process_hostname(url) # Add the URL to the dictionary under its hostname @@ -150,20 +136,15 @@ function process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractStri end end - """ crawl(input_urls::Vector{<:AbstractString}) -Crawls on the input URLs and returns a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs - -# Arguments -- `input_urls`: A vector of input URLs +Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs """ function crawl(input_urls::Vector{<:AbstractString}) - url_queue = Vector{AbstractString}(input_urls) visited_url_set = Set{AbstractString}() - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() sitemap_urls = Vector{AbstractString}() # TODO: Add parallel processing for URLs @@ -187,6 +168,5 @@ function crawl(input_urls::Vector{<:AbstractString}) end end - return hostname_url_dict - + return hostname_url_dict, visited_url_set end diff --git a/src/extract_package_name.jl b/src/extract_package_name.jl new file mode 100644 index 0000000..525cecf --- /dev/null +++ b/src/extract_package_name.jl @@ -0,0 +1,162 @@ +""" + 
clean_url(url::String) + +Strip URL of any http:// ot https:// or www. prefixes +""" +function clean_url(url::String) + # Remove http://, https://, www., or wwws. + cleaned_url = replace(url, r"^https?://(www\d?\.)?" => "") + return cleaned_url +end + +""" + base_url_segment(url::String) + +Return the base url and first path segment if all the other checks fail +""" +function base_url_segment(url::String) + # Clean the URL from unwanted prefixes + cleaned_url = clean_url(url) + + # Parse the cleaned URL + uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing + + # Extract the base URL (host) + base_url = replace(uri.host, r"^www\." => "") + + # Extract the first path segment + path_segments = split(uri.path, "/"; keepempty = false) + + if !isempty(path_segments) + first_segment = path_segments[1] + return "$base_url/$first_segment" + else + return base_url + end +end + +""" + url_package_name(url::AbstractString) + +Return the text if the URL itself contains the package name with ".jl" or "_jl" suffixes +""" +function url_package_name(url::AbstractString) + if occursin(r"\.jl", url) || occursin(r"_jl", url) + package_name = match(r"[\/]([^\/]+(?:\.jl|_jl))", url) + return package_name.captures[1] + end + return "" +end + +""" + get_base_url(url::AbstractString) + +Extract the base url +""" +function get_base_url(url::AbstractString) + parsed_url = URIs.URI(url) + base_url = string(parsed_url.scheme, "://", parsed_url.host, + parsed_url.port != nothing ? ":" * string(parsed_url.port) : "", parsed_url.path) + return base_url +end + +""" + nav_bar(url::AbstractString) + +Julia doc websites tend to have the package name under ".docs-package-name" class in the HTML tree +""" +function nav_bar(url::AbstractString) + base_url = get_base_url(url) + fetched_content = HTTP.get(base_url) + parsed = Gumbo.parsehtml(String(fetched_content.body)) + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(parsed.root) + if el isa HTMLElement] + content_by_class = filter( + el -> getattr(el, "class", nothing) in ["docs-package-name"], content_candidates) + if (!isempty(content_by_class)) + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() + process_node!(only(content_by_class), heading_hierarchy, parsed_blocks) + package_name = parsed_blocks[2]["text"] + return package_name + end + return "" +end + +""" + text_before_version(url::AbstractString) + +Return text before "stable" or "dev" or any version in URL. 
It is generally observed that doc websites have package names before their versions +""" +function text_before_version(url::AbstractString) + language_prefixes = [ + "/en/", "/es/", "/fr/", "/de/", "/it/", "/pt/", "/ru/", "/zh/", "/ja/", "/ko/"] + contains_prefix = any(occursin(prefix, url) for prefix in language_prefixes) + if contains_prefix + pattern = r"/([^/]+)/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)(?:/|$)" + else + pattern = r"/([^/]+)/(?:stable|dev|latest|v\d+(\.\d+)*)" + end + package_name = match(pattern, url) + if package_name !== nothing + return package_name.captures[1] + end + return "" +end + +""" + docs_in_url(url::AbstractString) + +If the base url is in the form docs.package_name.domain_extension, then return the middle word i.e., package_name +""" +function docs_in_url(url::AbstractString) + cleaned_url = clean_url(url) + + # Parse the cleaned URL + uri = URI("https://" * cleaned_url) # Add https:// to ensure correct parsing + + # Extract the base URL (host) + base_url = replace(uri.host, r"^www\." => "") + pattern = r"docs\.([^.]+)\.(org|com|ai|net|io|co|tech)" + m = match(pattern, base_url) + if m !== nothing + return m.captures[1] + end + return "" +end + +""" + get_package_name(url::AbstractString) + +Return name of the package through the package URL +""" +function get_package_name(url::AbstractString) + + # try 1: look for package name in URL + package_name = url_package_name(url) + if (!isempty(package_name)) + return package_name + end + + # try 2: look for package name in nav bar + package_name = nav_bar(url) + if (!isempty(package_name)) + return package_name + end + + # try 3: if the base url is in the form docs.package_name.domain_extension + package_name = docs_in_url(url) + if (!isempty(package_name)) + return package_name + end + + # try 4: get text before "stable" or "dev" or any version in URL + package_name = text_before_version(url) + if (!isempty(package_name)) + return package_name + end + + # fallback: return base URL with first path segment + return base_url_segment(url) +end diff --git a/src/extract_urls.jl b/src/extract_urls.jl index b9ea364..d750f34 100644 --- a/src/extract_urls.jl +++ b/src/extract_urls.jl @@ -1,31 +1,37 @@ -# Temporary until I find a package to simplify this +""" + resolve_url(base_url::String, extracted_url::String) -function resolve_url(base_url::String, relative_url::String)::String - base_uri = URI(base_url) - relative_uri = URI(relative_url) +Check the extracted URL with the original URL. Return empty String if the extracted URL belongs to a different domain. 
+Return complete URL if there's a directory traversal paths or the extracted URL belongs to the same domain as the base_url - ## TODO: Make a list of allowed URLs which would contain Julia docs hostnames +# Arguments +- base_url: URL of the page from which other URLs are being extracted +- extracted_url: URL extracted from the base_url +""" +function resolve_url(base_url::String, extracted_url::String) + base_uri = URI(base_url) + extracted_uri = URI(extracted_url) ## TODO: Look for version number either on the bottom left dropdown or identify on the url - if length(relative_url) > 4 && relative_url[1:4] == "http" - if base_uri.host == relative_uri.host - return relative_url + if length(extracted_url) > 4 && extracted_url[1:4] == "http" + if base_uri.host == extracted_uri.host + return extracted_url end return "" end - if !isempty(relative_url) && relative_url[1] == '#' + if !isempty(extracted_url) && extracted_url[1] == '#' return "" end - if !isempty(relative_uri.path) && relative_uri.path[1] == '/' + if !isempty(extracted_uri.path) && extracted_uri.path[1] == '/' resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=relative_uri.path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = extracted_uri.path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end @@ -34,11 +40,11 @@ function resolve_url(base_url::String, relative_url::String)::String base_segments = split(base_uri.path, "/") base_segments = filter((i) -> i != "", base_segments) - relative_segments = split(relative_uri.path, "/") - relative_segments = filter((i) -> i != "", relative_segments) + extracted_segments = split(extracted_uri.path, "/") + extracted_segments = filter((i) -> i != "", extracted_segments) - # Process the relative segments - for segment in relative_segments + # Process the directory traversal paths + for segment in extracted_segments if segment == ".." 
if !isempty(base_segments) pop!(base_segments) @@ -53,31 +59,29 @@ function resolve_url(base_url::String, relative_url::String)::String # Create the resolved URI resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=resolved_path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = resolved_path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end - """ - find_urls!(url::AbstractString, - node::Gumbo.HTMLElement, - url_queue::Vector{<:AbstractString} + find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString} -Function to recursively find and extract the urls +Function to recursively find tags and extract the urls # Arguments - url: The initial input URL - node: The HTML node of type Gumbo.HTMLElement - url_queue: Vector in which extracted URLs will be appended """ -function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) +function find_urls_html!( + url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) if Gumbo.tag(node) == :a && haskey(node.attributes, "href") href = node.attributes["href"] if href !== nothing && !isempty(resolve_url(url, href)) @@ -85,6 +89,7 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end + # Go deep in the HTML tags and check if `node` is an tag for child in node.children if isa(child, HTMLElement) find_urls_html!(url, child, url_queue) @@ -92,9 +97,18 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end +""" + find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) +Identify URL through regex pattern in xml files and push in `url_queue` +# Arguments +- url: url from which all other URLs will be extracted +- url_queue: Vector in which extracted URLs will be appended +""" function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) + # If a string starts with "http" then it is considered as a URL regardless of it being valid. 
+ # Validity of URLs are checked during HTTP fetch try fetched_content = HTTP.get(url) xml_content = String(fetched_content.body) @@ -108,32 +122,23 @@ function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString} end end - - """ get_links!(url::AbstractString, url_queue::Vector{<:AbstractString}) -Function to extract urls inside tags +Extract urls inside html or xml files # Arguments - url: url from which all other URLs will be extracted - url_queue: Vector in which extracted URLs will be appended """ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString}) - @info "Scraping link: $url" - # println(url) - # try fetched_content = HTTP.get(url) parsed = Gumbo.parsehtml(String(fetched_content.body)) - if (url[end-3:end] == ".xml") + if (url[(end - 3):end] == ".xml") find_urls_xml!(url_xml, url_queue) else find_urls_html!(url, parsed.root, url_queue) end - # print("-------------") - # catch e - # println("Bad URL: $url") - # end -end \ No newline at end of file +end diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl deleted file mode 100644 index ba079aa..0000000 --- a/src/make_embeddings.jl +++ /dev/null @@ -1,163 +0,0 @@ -## TODO: Make a function to Check for version number - -""" - report_artifact() - -prints artifact information -""" -function report_artifact(fn_output) - @info("ARTIFACT: $(basename(fn_output))") - @info("sha256: ", bytes2hex(open(sha256, fn_output))) - @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) -end - - - - -""" - create_output_folders() - -Creates output folders -""" -function create_output_folders(knowledge_pack_path::String) - # Define the folder path - folder_path = joinpath(knowledge_pack_path, "packs") - println("folder_path:", folder_path) - # Check if the folder exists - if !isdir(folder_path) - mkpath(folder_path) - @info "Folder created: $folder_path" - else - @info "Folder already exists: $folder_path" - end - -end - -""" - make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}) - -Parses URLs from hostname_url_dict and saves the chunks - -# Arguments -- hostname_url_dict: Dict with key being hostname and value being a vector of URLs -""" -function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String) - output_chunks = Vector{SubString{String}}() - output_sources = Vector{String}() - SAVE_CHUNKS = true - CHUNK_SIZE = 512 - for (hostname, urls) in hostname_url_dict - for url in urls - try - chunks, sources = process_paths(url) - append!(output_chunks, chunks) - append!(output_sources, sources) - catch - @error "error!! 
check url: $url" - end - end - if SAVE_CHUNKS - serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks) - serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources) - end - - end - - -end - -""" - generate_embeddings() - -Deserializes chunks and sources to generate embeddings -""" -function generate_embeddings(knowledge_pack_path::String) - embedder = RT.BatchEmbedder() - entries = readdir(knowledge_pack_path) - - # Initialize a dictionary to group files by hostname and chunk size - hostname_files = Dict{String,Dict{Int,Dict{String,String}}}() - - # Regular expressions to match the file patterns - chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" - sources_pattern = r"^(.*)-sources-(\d+)\.jls$" - - # Group files by hostname and chunk size - for file in entries - match_chunks = match(chunks_pattern, file) - match_sources = match(sources_pattern, file) - - if match_chunks !== nothing - hostname = match_chunks.captures[1] - chunk_size = parse(Int, match_chunks.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file) - elseif match_sources !== nothing - hostname = match_sources.captures[1] - chunk_size = parse(Int, match_sources.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["sources"] = joinpath(knowledge_pack_path, file) - end - end - - - # Process each pair of files - for (hostname, chunk_files) in hostname_files - for (chunk_size, files) in chunk_files - if haskey(files, "chunks") && haskey(files, "sources") - chunks_file = files["chunks"] - sources_file = files["sources"] - chunks = deserialize(chunks_file) - sources = deserialize(sources_file) - cost_tracker = Threads.Atomic{Float64}(0.0) - full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, api_key=ENV["OPENAI_API_KEY"]) - - # Float32 - fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") - fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5") - h5open(fn_temp, "w") do file - file["chunks"] = chunks - file["sources"] = sources - file["embeddings"] = full_embeddings - file["type"] = "ChunkIndex" - # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" - end - run(tar - cvzf$fn_output - C$(dirname(fn_temp))$(basename(fn_temp))) - report_artifact(fn_output) - - else - @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" - end - end - end - -end - - - -""" - make_embeddings(input_urls::Vector{<:AbstractString}) - -Entry point to crawl, parse and create embeddings - -# Arguments -- input_urls: vector containing URL strings to parse -""" -function make_embeddings(input_urls::Vector{<:AbstractString}) - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() - hostname_url_dict = crawl(input_urls) - knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") - 
create_output_folders(knowledge_pack_path) - make_chunks(hostname_url_dict, knowledge_pack_path) - generate_embeddings(knowledge_pack_path) -end \ No newline at end of file diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl new file mode 100644 index 0000000..5d56ff8 --- /dev/null +++ b/src/make_knowledge_packs.jl @@ -0,0 +1,245 @@ +""" + report_artifact(fn_output) + +Print artifact information +""" +function report_artifact(fn_output) + @info("ARTIFACT: $(basename(fn_output))") + @info("sha256: ", bytes2hex(open(sha256, fn_output))) + @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) +end + +""" + create_output_folders(knowledge_pack_path::String) + +Create output folders on the knowledge_pack_path +""" +function create_output_folders(knowledge_pack_path::String) + # Define the folder path + folder_path = joinpath(knowledge_pack_path, "packs") + # Check if the folder exists + if !isdir(folder_path) + mkpath(folder_path) + end +end + +""" + make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) + +Parse URLs from hostname_url_dict and save the chunks + +# Arguments +- hostname_url_dict: Dict with key being hostname and value being a vector of URLs +- knowledge_pack_path: Knowledge pack path +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +""" +function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}}, + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) + SAVE_CHUNKS = true + for (hostname, urls) in hostname_url_dict + output_chunks = Vector{SubString{String}}() + output_sources = Vector{String}() + for url in urls + try + chunks, sources = process_paths( + url; max_chunk_size, min_chunk_size) + append!(output_chunks, chunks) + append!(output_sources, sources) + catch + @error "error!! 
check url: $url" + end + end + if SAVE_CHUNKS + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_chunks) + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_sources) + end + end +end + +""" + l2_norm_columns(mat::AbstractMatrix) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(mat::AbstractMatrix) + norm_ = norm.(eachcol(mat)) + return mat ./ norm_' +end + +""" + l2_norm_columns(vect::AbstractVector) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(vect::AbstractVector) + norm_ = norm(vect) + return vect / norm_ +end + +""" + generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, + embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString) + +Deserialize chunks and sources to generate embeddings + +# Arguments +- model: Embedding model +- embedding_size: Embedding dimensions +- custom_metadata: Custom metadata like ecosystem name if required +""" +function generate_embeddings( + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + model::AbstractString = MODEL, + embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString) + embedder = RT.BatchEmbedder() + entries = readdir(knowledge_pack_path) + # Initialize a dictionary to group files by hostname and chunk size + hostname_files = Dict{String, Dict{Int, Dict{String, String}}}() + + # Regular expressions to match the file patterns of chunks and sources + chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$" + sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$" + + # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" + # sources_pattern = r"^(.*)-sources-(\d+)\.jls$" + + # Group files by hostname and chunk size + for file in entries + match_chunks = match(chunks_pattern, file) + match_sources = match(sources_pattern, file) + + if match_chunks !== nothing + hostname = match_chunks.captures[1] + max_chunk_size = parse(Int, match_chunks.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], max_chunk_size) + hostname_files[hostname][max_chunk_size] = Dict{String, String}() + end + hostname_files[hostname][max_chunk_size]["chunks"] = joinpath( + knowledge_pack_path, file) + elseif match_sources !== nothing + hostname = match_sources.captures[1] + max_chunk_size = parse(Int, match_sources.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], max_chunk_size) + hostname_files[hostname][max_chunk_size] = Dict{String, String}() + end + hostname_files[hostname][max_chunk_size]["sources"] = joinpath( + knowledge_pack_path, file) + end + end + # Process each pair of files + for (hostname, chunk_files) in hostname_files + for (max_chunk_size, files) in chunk_files + if haskey(files, "chunks") && haskey(files, "sources") + chunks_file = files["chunks"] + sources_file = files["sources"] + chunks = deserialize(chunks_file) + sources = deserialize(sources_file) + cost_tracker = Threads.Atomic{Float64}(0.0) + full_embeddings = RT.get_embeddings( + embedder, chunks; model, verbose = false, cost_tracker) + @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))" + + trunc = embedding_size < EMBEDDING_SIZE ? 
1 : 0 + fn_output = joinpath(knowledge_pack_path, "packs", + "$hostname-$model-$trunc-Float32__v1.0.tar.gz") + fn_temp = joinpath(knowledge_pack_path, "packs", + "$hostname-$model-$trunc-Float32__v1.0.hdf5") + + h5open(fn_temp, "w") do file + file["chunks"] = chunks + file["sources"] = sources + file["embeddings"] = full_embeddings[1:embedding_size, :] |> + l2_norm_columns |> x -> map(>(0), x) + file["type"] = "ChunkIndex" + + package_url_dict = Dict{String, Vector{String}}() + package_url_dict = urls_for_metadata(sources) + + metadata = Dict( + :embedded_dt => Dates.today(), + :custom_metadata => custom_metadata, :max_chunk_size => max_chunk_size, + :embedding_size => embedding_size, :model => model, + :packages => package_url_dict) + + metadata_json = JSON.json(metadata) + file["metadata"] = metadata_json + end + + command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` + run(command) + report_artifact(fn_output) + + else + @warn "Missing pair for hostname: $hostname, max chunk size: $max_chunk_size" + end + end + end +end + +""" + make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[], + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE, + custom_metadata::AbstractString) + +Entry point to crawl, parse and generate embeddings + +# Arguments +- crawlable_urls: URLs that should be crawled to find more links +- single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +- model: Embedding model +- embedding_size: Embedding dimensions +- custom_metadata: Custom metadata like ecosystem name if required +""" +function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[]; + single_urls::Vector{<:AbstractString} = String[], + max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE, + model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE, custom_metadata::AbstractString = "") + if isempty(crawlable_urls) && isempty(single_urls) + error("At least one of `input_urls` or `single_pages` must be provided.") + end + + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() + + if !isempty(crawlable_urls) + hostname_url_dict, visited_url_set = crawl(crawlable_urls) + else + visited_url_set = Set{AbstractString}() + end + for url in single_urls + base_url = get_base_url(url) + if !in(base_url, visited_url_set) + push!(visited_url_set, base_url) + crawlable, sitemap_urls = check_robots_txt("*", base_url) + if crawlable + try + process_hostname!(url, hostname_url_dict) + catch + @error "Bad URL: $base_url" + end + end + end + end + knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") + create_output_folders(knowledge_pack_path) + make_chunks( + hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) + generate_embeddings( + knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata) +end diff --git a/src/parser.jl b/src/parser.jl index d909280..2de7035 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -1,21 +1,3 @@ -""" -Working: - -Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks -ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion. 
-For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks. -If the current node is a code block, return the text inside code block with backticks. -If the node is neither heading nor code, then we'll need to go deeper in the hierarchy. -if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td] -it is assumed that everything inside the tag is part of a single text block with inline code. -But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false. -To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration -that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again. -We indicate this by a return flag is_text_inserted -""" - - - """ insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, parsed_blocks::Vector{Dict{String,Any}}, @@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector - text_to_insert: Text to be inserted - text_type: The text to be inserted could be heading or a code block or just text """ -function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - text_to_insert::AbstractString, - text_type::AbstractString) - +function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + text_to_insert::AbstractString, + text_type::AbstractString) if !isempty(strip(text_to_insert)) push!(parsed_blocks, Dict(text_type => strip(text_to_insert), @@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, end end - - """ process_headings!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -57,13 +36,13 @@ Process headings. If the current node is heading, directly insert into parsed_bl - parsed_blocks: Vector of Dicts to store parsed text and metadata """ function process_headings!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}) tag_name = Gumbo.tag(node) # Clear headings of equal or lower level for k in collect(keys(heading_hierarchy)) - if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) + if k != "header" && + Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) delete!(heading_hierarchy, k) end end @@ -123,11 +102,10 @@ If the node is neither heading nor code - prev_text_buffer: IO Buffer which contains previous text """ function process_generic_node!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) @@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless, # there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted. 
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header] - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, + :cite, :address, :em, :td, :a, :span, :header] + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + elseif tag_name in [:script] + continue else - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end # changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call) @@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, print(prev_text_buffer, " " * received_text) text_to_insert = text_to_insert * " " * received_text end - end # if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence, @@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if we're insert text in current node level, then we should insert the previous text if available, # otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird if !isempty(strip(text_to_insert)) - insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") is_text_inserted = true end @@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, return "", is_code_block, is_text_inserted end - """ process_docstring!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -224,11 +206,10 @@ Function to process node of class `docstring` - prev_text_buffer: IO Buffer which contains previous text """ function process_docstring!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) is_code_block = false @@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement, # Insert "header" if Gumbo.tag(children[1]) == :header heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1])) - insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") end - received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) if !isempty(strip(received_text)) insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text") @@ -279,11 +262,10 @@ Function to process a node - prev_text_buffer: IO Buffer which contains previous text """ function process_node!(node::Gumbo.HTMLElement, - 
heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) tag_name = Gumbo.tag(node) if startswith(string(tag_name), "h") && isdigit(last(string(tag_name))) return process_headings!(node, heading_hierarchy, parsed_blocks) @@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement, return process_code(node) elseif tag_name == :article && getattr(node, "class", "") == "docstring" - return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_docstring!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_generic_node!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - """ multiple dispatch for process_node!() when node is of type Gumbo.HTMLText """ @@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...) return strip(Gumbo.text(node)), is_code_block, is_text_inserted end - """ get_base_url(url::AbstractString) -Extracts the base url. - -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url. """ function get_base_url(url::AbstractString) parsed_url = URIs.URI(url) @@ -329,7 +306,7 @@ end """ get_html_content(root::Gumbo.HTMLElement) -Returns the main content of the HTML. If not found, returns the whole HTML to parse +Return the main content of the HTML. If not found, return the whole HTML to parse # Arguments - `root`: The HTML root from which content is extracted @@ -338,73 +315,31 @@ function get_html_content(root::Gumbo.HTMLElement) target_ids = Set(["VPContent", "main_content_wrap", "pages-content"]) target_classes = Set(["content", "franklin-content"]) - content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] # First try to find by ID - content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates) + content_by_id = filter( + el -> getattr(el, "id", nothing) in target_ids, content_candidates) if !isempty(content_by_id) return only(content_by_id) end # Fallback to class if no ID matches - content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates) + content_by_class = filter( + el -> getattr(el, "class", nothing) in target_classes, content_candidates) if !isempty(content_by_class) return only(content_by_class) end # Fallback to the root node if no class matches return root - end - """ parse_url(url::AbstractString) -Initiator and main function to parse HTML from url - -# Arguments -- `url`: URL string to parse - -# Returns -- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata - -# Usage -parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/") - -# Example -Let the HTML be: - - - - -

-    <h1>Heading 1</h1>
-        <h2>Heading 2</h2>
-            <p>para 1</p>
-            <h3>Heading 3</h3>
-                <code>this is my code block</code>
-            <h3>This is another h3 under Heading 2</h3>
-                <p>This is a paragraph with <code>inline code</code></p>
-        <h2>Heading 2_2</h2>
-            <p>para ewg</p>

- - - - -Output: -Any[ - Dict{String, Any}("URL" => "URL") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3") - Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2")) - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg") -] +Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata """ function parse_url_to_blocks(url::AbstractString) @@ -418,9 +353,9 @@ function parse_url_to_blocks(url::AbstractString) # Getting title of the document # title = [el # for el in AbstractTrees.PreOrderDFS(r_parsed.root) - # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") - parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)]) - heading_hierarchy = Dict{Symbol,Any}() + # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks) return parsed_blocks catch diff --git a/src/preparation.jl b/src/preparation.jl index ab8d7b5..8736050 100644 --- a/src/preparation.jl +++ b/src/preparation.jl @@ -1,9 +1,7 @@ -# include("recursive_splitter.jl") -include("utils.jl") """ get_header_path(d::Dict) -Concatenates the h1, h2, h3 keys from the metadata of a Dict +Concatenate the h1, h2, h3 keys from the metadata of a Dict # Examples ```julia @@ -12,17 +10,21 @@ get_header_path(d) # Output: "Axis/Attributes/yzoomkey" ``` """ -function get_header_path(d::Dict) - metadata = get(d, "metadata", Dict{Any,Any}()) +function get_header_path(d::Dict{String, Any}) + metadata = get(d, "metadata", Dict{Any, Any}()) isempty(metadata) && return nothing keys_ = [:h1, :h2, :h3] vals = get.(Ref(metadata), keys_, "") |> x -> filter(!isempty, x) |> x -> join(x, "/") isempty(vals) ? 
nothing : vals end +""" + roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") -"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length" -function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="") +Roll-up chunks (that have the same header!), so we can split them later by to get the desired length +""" +function roll_up_chunks(parsed_blocks::Vector{Dict{String, Any}}, + url::AbstractString; separator::String = "") docs = String[] io = IOBuffer() last_header = nothing @@ -35,7 +37,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? "" : " - $last_header") push!(sources, src) end last_header = header @@ -48,28 +50,31 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? "" : " - $last_header") push!(sources, src) end return docs, sources end - struct DocParserChunker <: RT.AbstractChunker end -""" - RT.get_chunks(chunker::DocParserChunker, - html_files::Vector{<:AbstractString}; - sources::AbstractVector{<:AbstractString}=html_files, - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) -Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length. """ -function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) - - + RT.get_chunks(chunker::DocParserChunker, url::AbstractString; + verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) + +Extract chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, +and splits them by separators to get the desired length. + +# Arguments +- chunker: DocParserChunker +- url: URL of the webpage to extract chunks +- verbose: Bool to print the log +- separators: Chunk separators +- max_chunk_size Maximum chunk size +""" +function RT.get_chunks( + chunker::DocParserChunker, url::AbstractString; + verbose::Bool = true, separators = ["\n\n", ". 
", "\n", " "], max_chunk_size::Int = MAX_CHUNK_SIZE) SEP = "" sources = AbstractVector{<:AbstractString} output_chunks = Vector{SubString{String}}() @@ -79,12 +84,13 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; parsed_blocks = parse_url_to_blocks(url) ## Roll up to the same header - docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP) + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) ## roll up chunks by SEP splitter, then remove it later for (doc, src) in zip(docs_, sources_) ## roll up chunks by SEP splitter, then remove it later - doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|> + doc_chunks = PT.recursive_splitter( + doc, [SEP, separators...]; max_length = max_chunk_size) .|> x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x) # skip if no chunks found isempty(doc_chunks) && continue @@ -94,22 +100,25 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; return output_chunks, output_sources end +""" + process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) - -"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them." -function process_paths(url::AbstractString, max_length::Int=512) - +Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them. +""" +function process_paths(url::AbstractString; + max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) output_chunks = Vector{SubString{String}}() output_sources = Vector{String}() - chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length) + chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size) append!(output_chunks, chunks) append!(output_sources, sources) - @info "Scraping done: $(length(output_chunks)) chunks" - postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true) + output_chunks, output_sources = postprocess_chunks( + output_chunks, output_sources; min_chunk_size, skip_code = true) return output_chunks, output_sources end diff --git a/src/user_preferences.jl b/src/user_preferences.jl new file mode 100644 index 0000000..00c1a2f --- /dev/null +++ b/src/user_preferences.jl @@ -0,0 +1,4 @@ +global MIN_CHUNK_SIZE = 40 +global MAX_CHUNK_SIZE = 384 +global MODEL = "text-embedding-3-large" +global EMBEDDING_SIZE = 3072 diff --git a/src/utils.jl b/src/utils.jl index 4bf1e07..dfbc17c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,9 @@ -"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)." +""" + find_duplicates(chunks::AbstractVector{<:AbstractString}) + +Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, +where `true` indicates a duplicate (second instance of the same text). +""" function find_duplicates(chunks::AbstractVector{<:AbstractString}) # hash the chunks for easier search hashed_chunks = bytes2hex.(sha256.(chunks)) @@ -20,36 +25,60 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString}) return duplicates end -"Removes chunks that are duplicated in the input list of chunks and their corresponding sources." 
-function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+"""
+    remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+
+Remove chunks that are duplicated in the input list of chunks and their corresponding sources.
+"""
+function remove_duplicates(
+        chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
     idxs = find_duplicates(chunks)
     return chunks[.!idxs], sources[.!idxs]
 end
 
-"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources."
-function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true)
+"""
+    remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+        min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+Remove chunks that are shorter than a specified length (`min_chunk_size`) from the input list of chunks and their corresponding sources.
+"""
+function remove_short_chunks(
+        chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+        min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true)
+    chunk_lengths = length.(chunks)
     idx = if skip_code
-        ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text)
-        findall(x -> length(x) >= min_length || occursin("```", x), chunks)
+        ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text)
+        findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks)
     else
-        findall(x -> length(x) >= min_length, chunks)
+        findall(x -> length(x) >= min_chunk_size, chunks)
     end
+    chunk_lengths = length.(chunks[idx])
     return chunks[idx], sources[idx]
 end
 
-
-function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString}, websites::AbstractVector{<:AbstractString})
-    @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`"
+function replace_local_paths(
+        sources::AbstractVector{<:AbstractString}, paths::AbstractVector{<:AbstractString},
+        websites::AbstractVector{<:AbstractString})
+    @assert length(paths)==length(websites) "Length of `paths` must match length of `websites`"
     replacement_pairs = paths .=> websites
     output = map(x -> replace(x, replacement_pairs...), sources)
+    return output
 end
 
+"""
+    postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+        min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+        websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
 
-"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates."
-function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true,
-    paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.
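+
+# Examples
+A minimal, illustrative sketch (the inputs below are made up for demonstration; defaults come from `user_preferences.jl`):
+```julia
+chunks = ["tiny", "This chunk is long enough to keep.", "This chunk is long enough to keep."]
+sources = ["https://example.org/a", "https://example.org/b", "https://example.org/c"]
+chunks, sources = postprocess_chunks(chunks, sources; min_chunk_size = 10, skip_code = true)
+# the short chunk and the duplicate are dropped; `sources` stays aligned with `chunks`
+```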
+""" +function postprocess_chunks( + chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int = MIN_CHUNK_SIZE, skip_code::Bool = true, + paths::Union{Nothing, AbstractVector{<:AbstractString}} = nothing, + websites::Union{Nothing, AbstractVector{<:AbstractString}} = nothing) len_ = length(chunks) - chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code) + chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code) @info "Removed $(len_ - length(chunks)) short chunks" len_ = length(chunks) @@ -63,6 +92,71 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A end return chunks, sources +end + +""" + function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + +Remove chunks and sources corresponding to URLs starting with `prefix_urls` +""" +function remove_urls_from_index( + index_path::AbstractString, prefix_urls = Vector{<:AbstractString}) + @assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)." + + h5open(index_path, "r+") do orig_file + # Load the sources dataset into a Julia array + sources = read(orig_file["sources"]) + chunks = read(orig_file["chunks"]) + embeddings = read(orig_file["embeddings"]) + + for url_to_remove in prefix_urls + indices_to_remove = findall(x -> startswith(x, url_to_remove), sources) + sources = deleteat!(sources, indices_to_remove) + chunks = deleteat!(chunks, indices_to_remove) + embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)] + end + + write(file["sources"], sources) + write(file["chunks"], chunks) + write(file["embeddings"], embeddings) + end +end + +""" + urls_for_metadata(sources::Vector{String}) + +Return a Dict of package names with their associated URLs +Note: Due to their large number, URLs are stripped down to the package name; Package subpaths are not included in metadata. +""" +function urls_for_metadata(sources::Vector{String}) + urls = [split(source, " -")[1] for source in sources] + pattern = r"(/(?:stable|dev|latest|v\d+(?:\.\d+)*))" + cleaned_urls = [endswith(String(url), "/") ? String(url)[1:(end - 1)] : String(url) + for url in urls] + unique_urls = unique(cleaned_urls) + package_names = Vector{String}() + + for url in unique_urls + push!(package_names, get_package_name(String(url))) + end + cleaned_urls = [match(pattern, url) !== nothing ? 
+                    first(split(url, pattern)) : url for url in unique_urls]
-end
\ No newline at end of file
+    zipped = zip(cleaned_urls, package_names) |> collect
+    unique_pairs = unique(zipped)
+    unique_urls = [pair[1] for pair in unique_pairs]
+    unique_package_names = [pair[2] for pair in unique_pairs]
+
+    package_url_dict = Dict{String, Vector{String}}()
+    for (url, package_name) in zip(unique_urls, unique_package_names)
+        if haskey(package_url_dict, package_name)
+            # If the package_name is already a key, append the url to the existing array
+            push!(package_url_dict[package_name], url)
+        else
+            # Otherwise, create a new entry with the package_name and the url
+            package_url_dict[package_name] = [url]
+        end
+    end
+    return package_url_dict
+end
diff --git a/test/crawl.jl b/test/crawl.jl
new file mode 100644
index 0000000..6b00ca4
--- /dev/null
+++ b/test/crawl.jl
@@ -0,0 +1,7 @@
+using DocsScraper: crawl
+
+@testset "crawl" begin
+    urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
+    hostname_url_dict = crawl(urls)
+    @test length(hostname_url_dict) > 0
+end
diff --git a/test/make_knowledge_packs.jl b/test/make_knowledge_packs.jl
new file mode 100644
index 0000000..5690725
--- /dev/null
+++ b/test/make_knowledge_packs.jl
@@ -0,0 +1,8 @@
+using DocsScraper: process_paths
+
+@testset "overall test" begin
+    url = "https://docs.julialang.org/en/v1/"
+    chunks, sources = process_paths(url)
+    @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
+          sources[1] != nothing
+end
diff --git a/test/parser.jl b/test/parser.jl
new file mode 100644
index 0000000..0faeb04
--- /dev/null
+++ b/test/parser.jl
@@ -0,0 +1,11 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+    url = "https://docs.julialang.org/en/v1/"
+    parsed_blocks = parse_url_to_blocks(url)
+    @test length(parsed_blocks) > 0
+    SEP = "<SEP>"
+    docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+    @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+          sources_[1] != nothing
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index fdde81f..6e1e7e8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,33 +1,13 @@
-
+using DocsScraper
 using Test
 
-urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
-url = urls[1]
-queue = Vector{AbstractString}()
-
-@testset "check robots.txt" begin
-    result, sitemap_queue = check_robots_txt("*", url)
-    @test result == true
-end
-
-@testset "HTTP get" begin
-    @test HTTP.get(url) != nothing
-end
-
-@testset "get_urls!" begin
-    get_urls!(url, queue)
-    @test length(queue) > 1
-end
-
-@testset "parse & roll_up" begin
-    parsed_blocks = parse_url_to_blocks(url)
-    @test length(parsed_blocks) > 0
-    SEP = "<SEP>"
-    docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP)
-    @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing
-end
+using Aqua
 
-@testset "overall test" begin
-    chunks, sources = process_paths(url)
-    @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing
+@testset "DocsScraper.jl" begin
+    @testset "Code quality (Aqua.jl)" begin
+        Aqua.test_all(DocsScraper; persistent_tasks = false)
+    end
+    include("crawl.jl")
+    include("parser.jl")
+    include("make_knowledge_packs.jl")
 end
diff --git a/test/utils.jl b/test/utils.jl
new file mode 100644
index 0000000..fbe338a
--- /dev/null
+++ b/test/utils.jl
@@ -0,0 +1,11 @@
+using DocsScraper: parse_url_to_blocks, roll_up_chunks
+
+@testset "parse & roll_up" begin
+    url = "https://docs.julialang.org/en/v1/"
+    parsed_blocks = parse_url_to_blocks(url)
+    @test length(parsed_blocks) > 0
+    SEP = "<SEP>"
+    docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+    @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+          sources_[1] != nothing
+end
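
For orientation, a minimal sketch of how the pieces added in this patch compose end to end; the URL and keyword values below are illustrative only (the defaults come from `src/user_preferences.jl`), not part of the patch itself:

```julia
using DocsScraper: process_paths, urls_for_metadata

# Scrape one documentation site, chunk it and post-process the chunks
url = "https://docs.julialang.org/en/v1/"   # illustrative target, as used in the tests
chunks, sources = process_paths(url; max_chunk_size = 384, min_chunk_size = 40)

# Group the resulting sources by package name for the knowledge-pack metadata
package_url_dict = urls_for_metadata(sources)
```

This mirrors what `test/make_knowledge_packs.jl` exercises, with the metadata grouping step added on top.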