diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..5657bd0 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options +style = "sciml" diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..700707c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 371b418..874943f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -3,7 +3,7 @@ on: push: branches: - main - tags: ['*'] + tags: ["*"] pull_request: workflow_dispatch: concurrency: @@ -23,9 +23,8 @@ jobs: fail-fast: false matrix: version: - <<#VERSIONS>> - - '<<&.>>' - <> + - "1.10" + - "nightly" os: - ubuntu-latest arch: @@ -52,13 +51,11 @@ jobs: actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created contents: write statuses: write - pages: write - id-token: write steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: - version: '1' + version: "1" - uses: julia-actions/cache@v2 - name: Configure doc environment shell: julia --project=docs --color=yes {0} @@ -75,7 +72,6 @@ jobs: shell: julia --project=docs --color=yes {0} run: | using Documenter: DocMeta, doctest - using <<&PKG>> - DocMeta.setdocmeta!(<<&PKG>>, :DocTestSetup, :(using <<&PKG>>); recursive=true) - doctest(<<&PKG>>) - <> + using DocsScraper + DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) + doctest(DocsScraper) diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml new file mode 100644 index 0000000..d48734a --- /dev/null +++ b/.github/workflows/CompatHelper.yml @@ -0,0 +1,16 @@ +name: CompatHelper +on: + schedule: + - cron: 0 0 1 * * + workflow_dispatch: +jobs: + CompatHelper: + runs-on: ubuntu-latest + steps: + - name: Pkg.add("CompatHelper") + run: julia -e 'using Pkg; Pkg.add("CompatHelper")' + - name: CompatHelper.main() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} + run: julia -e 'using CompatHelper; CompatHelper.main()' diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml new file mode 100644 index 0000000..0cd3114 --- /dev/null +++ b/.github/workflows/TagBot.yml @@ -0,0 +1,31 @@ +name: TagBot +on: + issue_comment: + types: + - created + workflow_dispatch: + inputs: + lookback: + default: "3" +permissions: + actions: read + checks: read + contents: write + deployments: read + issues: read + discussions: read + packages: read + pages: read + pull-requests: read + repository-projects: read + security-events: read + statuses: read +jobs: + TagBot: + if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot' + runs-on: ubuntu-latest + steps: + - uses: JuliaRegistries/TagBot@v1 + with: + token: ${{ secrets.GITHUB_TOKEN }} + ssh: ${{ secrets.DOCUMENTER_KEY }} diff --git a/.gitignore b/.gitignore index 9c929a1..8e2d4ba 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ # Ignore .env files .env knowledge_packs/ -Manifest.toml \ No newline at end of file +Manifest.toml +/Manifest.toml +/docs/Manifest.toml +/docs/build/ \ No newline at end 
of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..9238ca7 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "cSpell.words": [ + "eachmatch", + "postprocess" + ] +} diff --git a/MIT b/LICENSE similarity index 94% rename from MIT rename to LICENSE index 775ba1d..d7bd022 100644 --- a/MIT +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) {{{YEAR}}} {{{AUTHORS}}} +Copyright (c) Shreyas Agrawal @splendidbug and J S @svilupp Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Project.toml b/Project.toml index 0c1f6a8..705a918 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,15 @@ -name = "RAGKit" -uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17" -authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"] +name = "DocsScraper" +uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649" +authors = ["Shreyas Agrawal @splendidbug and J S @svilupp"] version = "0.1.0" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" -DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" -PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" @@ -19,7 +17,19 @@ URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" [compat] AbstractTrees = "0.4.5" +EzXML = "1.2.0" Gumbo = "0.8.2" +HDF5 = "0.17.2" HTTP = "1.10.4" +Inflate = "0.1.5" PromptingTools = "0.36.0" +URIParser = "0.4.1" URIs = "1.5.1" +Tar = "1.10.0" + +[extras] +Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Aqua", "Test"] diff --git a/docs/Project.toml b/docs/Project.toml index 6fea155..41b0b18 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -6,7 +6,6 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" -PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" URIParser = "30578b45-9adc-5946-b283-645ec420af67" diff --git a/docs/make.jl b/docs/make.jl index 38d4452..a54f0f6 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,24 +1,23 @@ -using Documenter: Documenter, makedocs, deploydocs -using PkgTemplates: PkgTemplates +using DocsScraper +using Documenter + +DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true) makedocs(; - modules=[PkgTemplates], - authors="Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>", - repo="https://github.com/splendidbug/RAGKit", - sitename="RAGKit.jl", + modules=[DocsScraper], + authors="Shreyas Agrawal @splendidbug and J S @svilupp", + sitename="DocsScraper.jl", # format=Documenter.HTML(; - # repolink="https://github.com/splendidbug/RAGKit", - # canonical="https://juliaci.github.io/PkgTemplates.jl", + # canonical="https://Shreyas Agrawal.github.io/DocsScraper.jl", + # edit_link="master", # assets=String[], # ), pages=[ "Home" => "index.md", - "User Guide" => "user.md", - "Developer Guide" => 
"developer.md", - "Migrating To PkgTemplates 0.7+" => "migrating.md", ], ) deploydocs(; - repo="https://github.com/splendidbug/RAGKit", + repo="github.com/Shreyas Agrawal/DocsScraper.jl", + devbranch="main", ) diff --git a/docs/src/index.md b/docs/src/index.md index f53a411..a6f0129 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,4 +1,4 @@ -# RAGKit +# DocsScraper ## Documentation diff --git a/src/RAGKit.jl b/src/DocsScraper.jl similarity index 56% rename from src/RAGKit.jl rename to src/DocsScraper.jl index b895363..e78dde7 100644 --- a/src/RAGKit.jl +++ b/src/DocsScraper.jl @@ -1,7 +1,9 @@ -module RAGKit +module DocsScraper using HTTP, Gumbo, AbstractTrees, URIs using Gumbo: HTMLDocument, HTMLElement using EzXML +using Pkg +Pkg.develop(PackageSpec(path="C:\\Users\\shrey\\Desktop\\stuff\\assignments\\grad\\projects\\Julia\\PromptingTools.jl")) using PromptingTools const PT = PromptingTools const RT = PromptingTools.Experimental.RAGTools @@ -12,17 +14,18 @@ using Inflate using SHA using Serialization, URIs -# using Regex - -# using Robots include("parser.jl") include("crawl.jl") include("extract_urls.jl") include("preparation.jl") -include("make_embeddings.jl") -export make_embeddings +include("make_knowledge_packs.jl") +export make_knowledge_packs, just_generate + +include("user_preferences.jl") +include("utils.jl") +export remove_urls_from_index end \ No newline at end of file diff --git a/src/crawl.jl b/src/crawl.jl index b147511..a8f93c9 100644 --- a/src/crawl.jl +++ b/src/crawl.jl @@ -2,10 +2,7 @@ """ parse_robots_txt!(robots_txt::String) -Parses the robots.txt string and returns rules along with the URLs on Sitemap - -# Arguments -- `robots_txt`: robots.txt as a string +Parse the robots.txt string and return rules and the URLs on Sitemap """ function parse_robots_txt!(robots_txt::String) rules = Dict{String,Dict{String,Vector{String}}}() @@ -40,17 +37,15 @@ end """ - check_robots_txt(user_agent::AbstractString, - url::AbstractString) + check_robots_txt(user_agent::AbstractString, url::AbstractString) -Checks the robots.txt of a URL and returns a boolean representing if `user_agent` is allowed to crawl the input url +Check robots.txt of a URL and return a boolean representing if `user_agent` is allowed to crawl the input url, along with sitemap urls # Arguments - `user_agent`: user agent attempting to crawl the webpage - `url`: input URL string """ -function check_robots_txt(user_agent::AbstractString, - url::AbstractString) +function check_robots_txt(user_agent::AbstractString, url::AbstractString) ## TODO: Make a cache of rules for a quick lookup # if (haskey(restricted_urls, url)) @@ -101,10 +96,7 @@ end """ get_base_url(url::AbstractString) -Extracts the base url. 
- -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url """ function get_base_url(url::AbstractString) @@ -118,10 +110,7 @@ end """ process_hostname(url::AbstractString) -Returns the hostname of an input URL - -# Arguments -- `url`: URL string +Return the hostname of an input URL """ function process_hostname(url::AbstractString) URI = URIs.URI(url) @@ -133,7 +122,7 @@ end """ process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}}) -Adds the `url` to it's hostname in `hostname_dict` +Add `url` to its hostname in `hostname_dict` # Arguments - `url`: URL string @@ -154,10 +143,7 @@ end """ crawl(input_urls::Vector{<:AbstractString}) -Crawls on the input URLs and returns a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs - -# Arguments -- `input_urls`: A vector of input URLs +Crawl on the input URLs and return a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs """ function crawl(input_urls::Vector{<:AbstractString}) @@ -187,6 +173,6 @@ function crawl(input_urls::Vector{<:AbstractString}) end end - return hostname_url_dict + return hostname_url_dict, visited_url_set end diff --git a/src/extract_urls.jl b/src/extract_urls.jl index b9ea364..d5e8fcf 100644 --- a/src/extract_urls.jl +++ b/src/extract_urls.jl @@ -1,31 +1,37 @@ -# Temporary until I find a package to simplify this +""" + resolve_url(base_url::String, extracted_url::String) -function resolve_url(base_url::String, relative_url::String)::String - base_uri = URI(base_url) - relative_uri = URI(relative_url) +Check the extracted URL with the original URL. Return empty String if the extracted URL belongs to a different domain. 
+Return complete URL if there's a directory traversal paths or the extracted URL belongs to the same domain as the base_url - ## TODO: Make a list of allowed URLs which would contain Julia docs hostnames +# Arguments +- base_url: URL of the page from which other URLs are being extracted +- extracted_url: URL extracted from the base_url +""" +function resolve_url(base_url::String, extracted_url::String) + base_uri = URI(base_url) + extracted_uri = URI(extracted_url) ## TODO: Look for version number either on the bottom left dropdown or identify on the url - if length(relative_url) > 4 && relative_url[1:4] == "http" - if base_uri.host == relative_uri.host - return relative_url + if length(extracted_url) > 4 && extracted_url[1:4] == "http" + if base_uri.host == extracted_uri.host + return extracted_url end return "" end - if !isempty(relative_url) && relative_url[1] == '#' + if !isempty(extracted_url) && extracted_url[1] == '#' return "" end - if !isempty(relative_uri.path) && relative_uri.path[1] == '/' + if !isempty(extracted_uri.path) && extracted_uri.path[1] == '/' resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=relative_uri.path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = extracted_uri.path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end @@ -34,11 +40,11 @@ function resolve_url(base_url::String, relative_url::String)::String base_segments = split(base_uri.path, "/") base_segments = filter((i) -> i != "", base_segments) - relative_segments = split(relative_uri.path, "/") - relative_segments = filter((i) -> i != "", relative_segments) + extracted_segments = split(extracted_uri.path, "/") + extracted_segments = filter((i) -> i != "", extracted_segments) - # Process the relative segments - for segment in relative_segments + # Process the directory traversal paths + for segment in extracted_segments if segment == ".." 
if !isempty(base_segments) pop!(base_segments) @@ -53,31 +59,29 @@ function resolve_url(base_url::String, relative_url::String)::String # Create the resolved URI resolved_uri = URI( - scheme=base_uri.scheme, - userinfo=base_uri.userinfo, - host=base_uri.host, - port=base_uri.port, - path=resolved_path, - query=relative_uri.query, - fragment=relative_uri.fragment + scheme = base_uri.scheme, + userinfo = base_uri.userinfo, + host = base_uri.host, + port = base_uri.port, + path = resolved_path, + query = extracted_uri.query, + fragment = extracted_uri.fragment ) return string(resolved_uri) end - """ - find_urls!(url::AbstractString, - node::Gumbo.HTMLElement, - url_queue::Vector{<:AbstractString} + find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString} -Function to recursively find and extract the urls +Function to recursively find tags and extract the urls # Arguments - url: The initial input URL - node: The HTML node of type Gumbo.HTMLElement - url_queue: Vector in which extracted URLs will be appended """ -function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) +function find_urls_html!( + url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}) if Gumbo.tag(node) == :a && haskey(node.attributes, "href") href = node.attributes["href"] if href !== nothing && !isempty(resolve_url(url, href)) @@ -85,6 +89,7 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end + # Go deep in the HTML tags and check if `node` is an tag for child in node.children if isa(child, HTMLElement) find_urls_html!(url, child, url_queue) @@ -92,9 +97,18 @@ function find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue end end +""" + find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) +Identify URL through regex pattern in xml files and push in `url_queue` +# Arguments +- url: url from which all other URLs will be extracted +- url_queue: Vector in which extracted URLs will be appended +""" function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString}) + # If a string starts with "http" then it is considered as a URL regardless of it being valid. 
+ # Validity of URLs are checked during HTTP fetch try fetched_content = HTTP.get(url) xml_content = String(fetched_content.body) @@ -108,32 +122,23 @@ function find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString} end end - - """ get_links!(url::AbstractString, url_queue::Vector{<:AbstractString}) -Function to extract urls inside tags +Extract urls inside html or xml files # Arguments - url: url from which all other URLs will be extracted - url_queue: Vector in which extracted URLs will be appended """ function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString}) - @info "Scraping link: $url" - # println(url) - # try fetched_content = HTTP.get(url) parsed = Gumbo.parsehtml(String(fetched_content.body)) - if (url[end-3:end] == ".xml") + if (url[(end - 3):end] == ".xml") find_urls_xml!(url_xml, url_queue) else find_urls_html!(url, parsed.root, url_queue) end - # print("-------------") - # catch e - # println("Bad URL: $url") - # end end \ No newline at end of file diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl deleted file mode 100644 index f51c865..0000000 --- a/src/make_embeddings.jl +++ /dev/null @@ -1,173 +0,0 @@ -## TODO: Make a function to Check for version number - -""" - report_artifact() - -prints artifact information -""" -function report_artifact(fn_output) - @info("ARTIFACT: $(basename(fn_output))") - @info("sha256: ", bytes2hex(open(sha256, fn_output))) - @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) -end - - - - -""" - create_output_folders() - -Creates output folders -""" -function create_output_folders(knowledge_pack_path::String) - # Define the folder path - folder_path = joinpath(knowledge_pack_path, "packs") - println("folder_path:", folder_path) - # Check if the folder exists - if !isdir(folder_path) - mkpath(folder_path) - @info "Folder created: $folder_path" - else - @info "Folder already exists: $folder_path" - end - -end - -""" - make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}) - -Parses URLs from hostname_url_dict and saves the chunks - -# Arguments -- hostname_url_dict: Dict with key being hostname and value being a vector of URLs -""" -function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String) - output_chunks = Vector{SubString{String}}() - output_sources = Vector{String}() - SAVE_CHUNKS = true - CHUNK_SIZE = 512 - for (hostname, urls) in hostname_url_dict - for url in urls - try - chunks, sources = process_paths(url) - append!(output_chunks, chunks) - append!(output_sources, sources) - catch - @error "error!! 
check url: $url" - end - end - if SAVE_CHUNKS - serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks) - serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources) - end - - end - - -end - -function l2_norm_columns(mat::AbstractMatrix) - norm_ = norm.(eachcol(mat)) - return mat ./ norm_' -end -function l2_norm_columns(vect::AbstractVector) - norm_ = norm(vect) - return vect / norm_ -end - - -""" - generate_embeddings() - -Deserializes chunks and sources to generate embeddings -""" -function generate_embeddings(knowledge_pack_path::String) - embedder = RT.BatchEmbedder() - entries = readdir(knowledge_pack_path) - - # Initialize a dictionary to group files by hostname and chunk size - hostname_files = Dict{String,Dict{Int,Dict{String,String}}}() - - # Regular expressions to match the file patterns - chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" - sources_pattern = r"^(.*)-sources-(\d+)\.jls$" - - # Group files by hostname and chunk size - for file in entries - match_chunks = match(chunks_pattern, file) - match_sources = match(sources_pattern, file) - - if match_chunks !== nothing - hostname = match_chunks.captures[1] - chunk_size = parse(Int, match_chunks.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file) - elseif match_sources !== nothing - hostname = match_sources.captures[1] - chunk_size = parse(Int, match_sources.captures[2]) - if !haskey(hostname_files, hostname) - hostname_files[hostname] = Dict{Int,Dict{String,String}}() - end - if !haskey(hostname_files[hostname], chunk_size) - hostname_files[hostname][chunk_size] = Dict{String,String}() - end - hostname_files[hostname][chunk_size]["sources"] = joinpath(knowledge_pack_path, file) - end - end - - - # Process each pair of files - for (hostname, chunk_files) in hostname_files - for (chunk_size, files) in chunk_files - if haskey(files, "chunks") && haskey(files, "sources") - chunks_file = files["chunks"] - sources_file = files["sources"] - chunks = deserialize(chunks_file) - sources = deserialize(sources_file) - cost_tracker = Threads.Atomic{Float64}(0.0) - full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, dimensions=1024) - - fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") - fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5") - h5open(fn_temp, "w") do file - file["chunks"] = chunks - file["sources"] = sources - file["embeddings"] = full_embeddings[1:1024, :] |> l2_norm_columns |> x -> map(>(0), x) - file["type"] = "ChunkIndex" - # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" - end - command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` - run(command) - report_artifact(fn_output) - - else - @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" - end - end - end - -end - - - -""" - make_embeddings(input_urls::Vector{<:AbstractString}) - -Entry point to crawl, parse and create embeddings - -# Arguments -- input_urls: vector containing URL strings to parse -""" 
-function make_embeddings(input_urls::Vector{<:AbstractString}) - hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}() - hostname_url_dict = crawl(input_urls) - knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") - create_output_folders(knowledge_pack_path) - make_chunks(hostname_url_dict, knowledge_pack_path) - generate_embeddings(knowledge_pack_path) -end \ No newline at end of file diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl new file mode 100644 index 0000000..291a9c7 --- /dev/null +++ b/src/make_knowledge_packs.jl @@ -0,0 +1,222 @@ +""" + report_artifact(fn_output) + +Print artifact information +""" +function report_artifact(fn_output) + @info("ARTIFACT: $(basename(fn_output))") + @info("sha256: ", bytes2hex(open(sha256, fn_output))) + @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output)))) +end + +""" + create_output_folders(knowledge_pack_path::String) + +Create output folders on the knowledge_pack_path +""" +function create_output_folders(knowledge_pack_path::String) + # Define the folder path + folder_path = joinpath(knowledge_pack_path, "packs") + # Check if the folder exists + if !isdir(folder_path) + mkpath(folder_path) + end +end + +""" + make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE, + min_chunk_size::Int=MIN_CHUNK_SIZE) + +Parse URLs from hostname_url_dict and save the chunks + +# Arguments +- hostname_url_dict: Dict with key being hostname and value being a vector of URLs +- knowledge_pack_path: Knowledge pack path +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +""" +function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}}, + knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE, + min_chunk_size::Int = MIN_CHUNK_SIZE) + SAVE_CHUNKS = true + for (hostname, urls) in hostname_url_dict + output_chunks = Vector{SubString{String}}() + output_sources = Vector{String}() + for url in urls + try + chunks, sources = process_paths(url; max_chunk_size, min_chunk_size) + append!(output_chunks, chunks) + append!(output_sources, sources) + catch + @error "error!! 
check url: $url" + end + end + if SAVE_CHUNKS + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_chunks) + serialize( + joinpath(knowledge_pack_path, + "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"), + output_sources) + end + end +end + +""" + l2_norm_columns(mat::AbstractMatrix) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(mat::AbstractMatrix) + norm_ = norm.(eachcol(mat)) + return mat ./ norm_' +end + +""" + l2_norm_columns(vect::AbstractVector) + +Normalize the columns of the input embeddings +""" +function l2_norm_columns(vect::AbstractVector) + norm_ = norm(vect) + return vect / norm_ +end + +""" + generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + +Deserialize chunks and sources to generate embeddings + +# Arguments +- model: Embedding model +- embedding_size: Embedding dimensions +""" +function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL, + embedding_size::Int = EMBEDDING_SIZE) + embedder = RT.BatchEmbedder() + entries = readdir(knowledge_pack_path) + # Initialize a dictionary to group files by hostname and chunk size + hostname_files = Dict{String, Dict{Int, Dict{String, String}}}() + + # Regular expressions to match the file patterns of chunks and sources + chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$" + sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$" + + # chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$" + # sources_pattern = r"^(.*)-sources-(\d+)\.jls$" + + # Group files by hostname and chunk size + for file in entries + match_chunks = match(chunks_pattern, file) + match_sources = match(sources_pattern, file) + + if match_chunks !== nothing + hostname = match_chunks.captures[1] + chunk_size = parse(Int, match_chunks.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], chunk_size) + hostname_files[hostname][chunk_size] = Dict{String, String}() + end + hostname_files[hostname][chunk_size]["chunks"] = joinpath( + knowledge_pack_path, file) + elseif match_sources !== nothing + hostname = match_sources.captures[1] + chunk_size = parse(Int, match_sources.captures[2]) + if !haskey(hostname_files, hostname) + hostname_files[hostname] = Dict{Int, Dict{String, String}}() + end + if !haskey(hostname_files[hostname], chunk_size) + hostname_files[hostname][chunk_size] = Dict{String, String}() + end + hostname_files[hostname][chunk_size]["sources"] = joinpath( + knowledge_pack_path, file) + end + end + # Process each pair of files + for (hostname, chunk_files) in hostname_files + for (chunk_size, files) in chunk_files + if haskey(files, "chunks") && haskey(files, "sources") + chunks_file = files["chunks"] + sources_file = files["sources"] + chunks = deserialize(chunks_file) + sources = deserialize(sources_file) + cost_tracker = Threads.Atomic{Float64}(0.0) + full_embeddings = RT.get_embeddings( + embedder, chunks; model, verbose = false, cost_tracker) + @info "Created embeddings for $hostname. 
Cost: \$$(round(cost_tracker[], digits=3))" + fn_output = joinpath(knowledge_pack_path, "packs", + "$hostname-textembedding3large-0-Float32__v1.0.tar.gz") + fn_temp = joinpath(knowledge_pack_path, "packs", + "$hostname-textembedding3large-0-Float32__v1.0.hdf5") + h5open(fn_temp, "w") do file + file["chunks"] = chunks + file["sources"] = sources + file["embeddings"] = full_embeddings[1:embedding_size, :] |> + l2_norm_columns |> x -> map(>(0), x) + file["type"] = "ChunkIndex" + # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl" + end + + command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))` + run(command) + report_artifact(fn_output) + + else + @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size" + end + end + end +end + +""" + make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[], + max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE) + +Entry point to crawl, parse and generate embeddings + +# Arguments +- crawlable_urls: URLs that should be crawled to find more links +- single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs +- max_chunk_size: Maximum chunk size +- min_chunk_size: Minimum chunk size +- model: Embedding model +- embedding_size: Embedding dimensions +""" +function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[]; + single_urls::Vector{<:AbstractString} = String[], + max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE, + model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE) + if isempty(crawlable_urls) && isempty(single_urls) + error("At least one of `input_urls` or `single_pages` must be provided.") + end + + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() + + if !isempty(crawlable_urls) + hostname_url_dict, visited_url_set = crawl(crawlable_urls) + else + visited_url_set = Set{AbstractString}() + end + for url in single_urls + base_url = get_base_url(url) + if !in(base_url, visited_url_set) + push!(visited_url_set, base_url) + crawlable, sitemap_urls = check_robots_txt("*", base_url) + if crawlable + try + process_hostname!(url, hostname_url_dict) + catch + @error "Bad URL: $base_url" + end + end + end + end + knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs") + create_output_folders(knowledge_pack_path) + make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size) + generate_embeddings(knowledge_pack_path; model, embedding_size) +end diff --git a/src/parser.jl b/src/parser.jl index d909280..def1a17 100644 --- a/src/parser.jl +++ b/src/parser.jl @@ -1,21 +1,3 @@ -""" -Working: - -Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks -ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion. -For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks. -If the current node is a code block, return the text inside code block with backticks. -If the node is neither heading nor code, then we'll need to go deeper in the hierarchy. 
-if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td] -it is assumed that everything inside the tag is part of a single text block with inline code. -But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false. -To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration -that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again. -We indicate this by a return flag is_text_inserted -""" - - - """ insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, parsed_blocks::Vector{Dict{String,Any}}, @@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector - text_to_insert: Text to be inserted - text_type: The text to be inserted could be heading or a code block or just text """ -function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - text_to_insert::AbstractString, - text_type::AbstractString) - +function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + text_to_insert::AbstractString, + text_type::AbstractString) if !isempty(strip(text_to_insert)) push!(parsed_blocks, Dict(text_type => strip(text_to_insert), @@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, end end - - """ process_headings!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -57,13 +36,13 @@ Process headings. If the current node is heading, directly insert into parsed_bl - parsed_blocks: Vector of Dicts to store parsed text and metadata """ function process_headings!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}) tag_name = Gumbo.tag(node) # Clear headings of equal or lower level for k in collect(keys(heading_hierarchy)) - if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) + if k != "header" && + Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name))) delete!(heading_hierarchy, k) end end @@ -123,11 +102,10 @@ If the node is neither heading nor code - prev_text_buffer: IO Buffer which contains previous text """ function process_generic_node!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) @@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless, # there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted. 
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header] - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, + :cite, :address, :em, :td, :a, :span, :header] + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + elseif tag_name in [:script] + continue else - received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end # changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call) @@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, print(prev_text_buffer, " " * received_text) text_to_insert = text_to_insert * " " * received_text end - end # if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence, @@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement, # if we're insert text in current node level, then we should insert the previous text if available, # otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird if !isempty(strip(text_to_insert)) - insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") is_text_inserted = true end @@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement, return "", is_code_block, is_text_inserted end - """ process_docstring!(node::Gumbo.HTMLElement, heading_hierarchy::Dict{Symbol,Any}, @@ -224,11 +206,10 @@ Function to process node of class `docstring` - prev_text_buffer: IO Buffer which contains previous text """ function process_docstring!(node::Gumbo.HTMLElement, - heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) seekstart(prev_text_buffer) prev_text = read(prev_text_buffer, String) is_code_block = false @@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement, # Insert "header" if Gumbo.tag(children[1]) == :header heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1])) - insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") + insert_parsed_data!( + heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") end - received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + received_text, is_code_block, is_text_inserted = process_node!( + children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) if !isempty(strip(received_text)) insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text") @@ -279,11 +262,10 @@ Function to process a node - prev_text_buffer: IO Buffer which contains previous text """ function process_node!(node::Gumbo.HTMLElement, - 
heading_hierarchy::Dict{Symbol,Any}, - parsed_blocks::Vector{Dict{String,Any}}, - child_new::Bool=true, - prev_text_buffer::IO=IOBuffer(write=true)) - + heading_hierarchy::Dict{Symbol, Any}, + parsed_blocks::Vector{Dict{String, Any}}, + child_new::Bool = true, + prev_text_buffer::IO = IOBuffer(write = true)) tag_name = Gumbo.tag(node) if startswith(string(tag_name), "h") && isdigit(last(string(tag_name))) return process_headings!(node, heading_hierarchy, parsed_blocks) @@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement, return process_code(node) elseif tag_name == :article && getattr(node, "class", "") == "docstring" - return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_docstring!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) - + return process_generic_node!( + node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) end - """ multiple dispatch for process_node!() when node is of type Gumbo.HTMLText """ @@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...) return strip(Gumbo.text(node)), is_code_block, is_text_inserted end - """ get_base_url(url::AbstractString) -Extracts the base url. - -# Arguments -- `url`: The url string of which, the base url needs to be extracted +Extract the base url. """ function get_base_url(url::AbstractString) parsed_url = URIs.URI(url) @@ -329,7 +306,7 @@ end """ get_html_content(root::Gumbo.HTMLElement) -Returns the main content of the HTML. If not found, returns the whole HTML to parse +Return the main content of the HTML. If not found, return the whole HTML to parse # Arguments - `root`: The HTML root from which content is extracted @@ -338,73 +315,34 @@ function get_html_content(root::Gumbo.HTMLElement) target_ids = Set(["VPContent", "main_content_wrap", "pages-content"]) target_classes = Set(["content", "franklin-content"]) - content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] + content_candidates = [el + for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement] # First try to find by ID - content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates) + content_by_id = filter( + el -> getattr(el, "id", nothing) in target_ids, content_candidates) if !isempty(content_by_id) return only(content_by_id) end # Fallback to class if no ID matches - content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates) + content_by_class = filter( + el -> getattr(el, "class", nothing) in target_classes, content_candidates) if !isempty(content_by_class) return only(content_by_class) end # Fallback to the root node if no class matches return root - end - """ parse_url(url::AbstractString) -Initiator and main function to parse HTML from url +Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata # Arguments - `url`: URL string to parse - -# Returns -- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata - -# Usage -parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/") - -# Example -Let the HTML be: - - - - -

-<html>
-<body>
-<h1>Heading 1</h1>
-<h2>Heading 2</h2>
-<p>para 1</p>
-<h3>Heading 3</h3>
-<code class="language-julia">this is my code block</code>
-<h3>This is another h3 under Heading 2</h3>
-<p>This is a paragraph with <code>inline code</code></p>
-<h2>Heading 2_2</h2>
-<p>para ewg</p>
-</body>
-</html>
- - - - -Output: -Any[ - Dict{String, Any}("URL" => "URL") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3") - Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2")) - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2") - Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg") -] """ function parse_url_to_blocks(url::AbstractString) @@ -419,8 +357,8 @@ function parse_url_to_blocks(url::AbstractString) # title = [el # for el in AbstractTrees.PreOrderDFS(r_parsed.root) # if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ") - parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)]) - heading_hierarchy = Dict{Symbol,Any}() + parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)]) + heading_hierarchy = Dict{Symbol, Any}() process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks) return parsed_blocks catch diff --git a/src/preparation.jl b/src/preparation.jl index ab8d7b5..9979155 100644 --- a/src/preparation.jl +++ b/src/preparation.jl @@ -1,9 +1,7 @@ -# include("recursive_splitter.jl") -include("utils.jl") """ get_header_path(d::Dict) -Concatenates the h1, h2, h3 keys from the metadata of a Dict +Concatenate the h1, h2, h3 keys from the metadata of a Dict # Examples ```julia @@ -12,7 +10,7 @@ get_header_path(d) # Output: "Axis/Attributes/yzoomkey" ``` """ -function get_header_path(d::Dict) +function get_header_path(d::Dict{String,Any}) metadata = get(d, "metadata", Dict{Any,Any}()) isempty(metadata) && return nothing keys_ = [:h1, :h2, :h3] @@ -21,8 +19,13 @@ function get_header_path(d::Dict) end -"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length" -function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="") + +""" + roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") + +Roll-up chunks (that have the same header!), so we can split them later by to get the desired length +""" +function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="") docs = String[] io = IOBuffer() last_header = nothing @@ -35,7 +38,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? 
"" : " - $last_header") push!(sources, src) end last_header = header @@ -48,7 +51,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="< str = String(take!(io)) if !isempty(str) push!(docs, str) - src = url * (isnothing(last_header) ? "" : "::$last_header") + src = url * (isnothing(last_header) ? "" : " - $last_header") push!(sources, src) end return docs, sources @@ -56,19 +59,23 @@ end struct DocParserChunker <: RT.AbstractChunker end -""" - RT.get_chunks(chunker::DocParserChunker, - html_files::Vector{<:AbstractString}; - sources::AbstractVector{<:AbstractString}=html_files, - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) -Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length. +""" + RT.get_chunks(chunker::DocParserChunker, url::AbstractString; + verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) + +Extract chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, +and splits them by separators to get the desired length. + +# Arguments +- chunker: DocParserChunker +- url: URL of the webpage to extract chunks +- verbose: Bool to print the log +- separators: Chunk separators +- max_chunk_size Maximum chunk size """ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; - verbose::Bool=true, - separators=["\n\n", ". ", "\n", " "], max_length::Int=256) - + verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE) SEP = "" sources = AbstractVector{<:AbstractString} @@ -84,8 +91,9 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString; ## roll up chunks by SEP splitter, then remove it later for (doc, src) in zip(docs_, sources_) ## roll up chunks by SEP splitter, then remove it later - doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|> + doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|> x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x) + chunk_lengths = length.(doc_chunks) # skip if no chunks found isempty(doc_chunks) && continue append!(output_chunks, doc_chunks) @@ -96,20 +104,24 @@ end -"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them." -function process_paths(url::AbstractString, max_length::Int=512) +""" + process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) + +Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them. 
+""" +function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE) output_chunks = Vector{SubString{String}}() output_sources = Vector{String}() - chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length) + chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size) append!(output_chunks, chunks) append!(output_sources, sources) @info "Scraping done: $(length(output_chunks)) chunks" - postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true) + output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true) return output_chunks, output_sources end diff --git a/src/user_preferences.jl b/src/user_preferences.jl new file mode 100644 index 0000000..98794c6 --- /dev/null +++ b/src/user_preferences.jl @@ -0,0 +1,4 @@ +global MIN_CHUNK_SIZE = 40 +global MAX_CHUNK_SIZE = 256 +global MODEL = "text-embedding-3-large" +global EMBEDDING_SIZE = 1024 \ No newline at end of file diff --git a/src/utils.jl b/src/utils.jl index 4bf1e07..e8dc014 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,4 +1,9 @@ -"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)." +""" + find_duplicates(chunks::AbstractVector{<:AbstractString}) + +Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, +where `true` indicates a duplicate (second instance of the same text). +""" function find_duplicates(chunks::AbstractVector{<:AbstractString}) # hash the chunks for easier search hashed_chunks = bytes2hex.(sha256.(chunks)) @@ -20,20 +25,34 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString}) return duplicates end -"Removes chunks that are duplicated in the input list of chunks and their corresponding sources." +""" + remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) + +Remove chunks that are duplicated in the input list of chunks and their corresponding sources. +""" function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}) idxs = find_duplicates(chunks) return chunks[.!idxs], sources[.!idxs] end -"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources." -function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true) + +""" + remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) + +Remove chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources. 
+""" +function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true) + + chunk_lengths = length.(chunks) idx = if skip_code - ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text) - findall(x -> length(x) >= min_length || occursin("```", x), chunks) + ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text) + findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks) else - findall(x -> length(x) >= min_length, chunks) + findall(x -> length(x) >= min_chunk_size, chunks) end + chunk_lengths = length.(chunks[idx]) return chunks[idx], sources[idx] end @@ -42,14 +61,24 @@ function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::A @assert length(paths) == length(websites) "Length of `paths` must match length of `websites`" replacement_pairs = paths .=> websites output = map(x -> replace(x, replacement_pairs...), sources) + return output end -"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates." -function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true, - paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) + + +""" + function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, + websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) + +Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates. +""" +function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; + min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, + websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing) len_ = length(chunks) - chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code) + chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code) @info "Removed $(len_ - length(chunks)) short chunks" len_ = length(chunks) @@ -63,6 +92,31 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A end return chunks, sources +end + +""" + function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + +Remove chunks and sources corresponding to URLs starting with `prefix_urls` +""" +function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString}) + @assert endswith(file_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)." 
+ h5open(index_path, "r+") do orig_file + # Load the sources dataset into a Julia array + sources = read(orig_file["sources"]) + chunks = read(orig_file["chunks"]) + embeddings = read(orig_file["embeddings"]) + for url_to_remove in prefix_urls + indices_to_remove = findall(x -> startswith(x, url_to_remove), sources) + sources = deleteat!(sources, indices_to_remove) + chunks = deleteat!(chunks, indices_to_remove) + embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)] + end + + # The filtered arrays are smaller than the stored datasets, so delete and recreate them + delete_object(orig_file, "sources") + delete_object(orig_file, "chunks") + delete_object(orig_file, "embeddings") + orig_file["sources"] = sources + orig_file["chunks"] = chunks + orig_file["embeddings"] = embeddings + end end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 78a78b4..4b4a92c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,21 +10,22 @@ using LinearAlgebra, Unicode, SparseArrays using HDF5 using Tar using Inflate - using SHA using Serialization, URIs -include("..\\src\\crawl.jl") -include("..\\src\\extract_urls.jl") -include("..\\src\\parser.jl") -include("..\\src\\preparation.jl") +include(joinpath("..", "src", "crawl.jl")) +include(joinpath("..", "src", "extract_urls.jl")) +include(joinpath("..", "src", "parser.jl")) +include(joinpath("..", "src", "preparation.jl")) +include(joinpath("..", "src", "user_preferences.jl")) +include(joinpath("..", "src", "utils.jl")) + urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"]) url = urls[1] queue = Vector{AbstractString}() -@testset "check robots.txt" begin +@testset "HTTP" begin @test HTTP.get(url) != nothing - result, sitemap_queue = check_robots_txt("*", url) @test result == true end @@ -38,12 +39,13 @@ end parsed_blocks = parse_url_to_blocks(url) @test length(parsed_blocks) > 0 SEP = "" - docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP) - @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing + docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP) + @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && + sources_[1] != nothing end @testset "overall test" begin chunks, sources = process_paths(url) - @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing - + @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && + sources[1] != nothing end
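A minimal usage sketch of the new `make_knowledge_packs` entry point from `src/make_knowledge_packs.jl`. The crawl URL is the one exercised in `test/runtests.jl`, and the keyword values simply restate the defaults in `src/user_preferences.jl`, so everything shown is illustrative rather than required:

```julia
using DocsScraper

# Crawl the Julia manual for links, scrape and chunk every page, then embed the
# chunks and package them as a tarball under `knowledge_packs/packs/`.
make_knowledge_packs(["https://docs.julialang.org/en/v1/"];
    max_chunk_size = 256, min_chunk_size = 40,
    model = "text-embedding-3-large", embedding_size = 1024)
```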
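Because `crawl` now returns the visited-URL set alongside the hostname map (see `src/crawl.jl`), downstream callers unpack a tuple. A short sketch of the new return shape, calling the unexported function qualified by module name and reusing the same test URL:

```julia
using DocsScraper

# Returns a Dict(hostname => vector of discovered URLs) plus the Set of base URLs visited.
hostname_url_dict, visited_url_set = DocsScraper.crawl(["https://docs.julialang.org/en/v1/"])
```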
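And a sketch of the newly exported `remove_urls_from_index`; the index path is hypothetical (any `.hdf5` index produced by `generate_embeddings` works) and the prefix is only an example:

```julia
using DocsScraper

# Drop every chunk, source, and embedding column whose source URL starts with the given prefix.
index_path = joinpath("knowledge_packs", "packs",
    "docs.julialang.org-textembedding3large-0-Float32__v1.0.hdf5")
remove_urls_from_index(index_path, ["https://docs.julialang.org/en/v1/base/"])
```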