From 6c1526013005bc23414dbd658bae984d63fef486 Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Sat, 24 Aug 2024 04:42:08 -0700
Subject: [PATCH 1/3] added knowledgepack creation scripts

---
 README.md                                     |  2 +-
 docs/make.jl                                  |  2 +-
 docs/src/index.md                             |  2 +-
 .../genie_knowledge_pack.jl                   | 18 +++++++
 .../juliaData_knowledge_pack.jl               |  5 +-
 .../juliaLang_knowledge_pack.jl               | 24 +++++++++
 .../makie_knowledge_pack.jl                   | 26 ++++++++++
 .../plots_knowledge_pack.jl                   | 29 +++++++++++
 .../sciml_knowledge_pack.jl                   | 51 +++++++++++++++++++
 .../tidier_knowledge_pack.jl                  | 21 ++++++++
 .../using_with_AIHelpMe.jl                    |  2 +-
 src/make_knowledge_packs.jl                   |  9 ++--
 12 files changed, 179 insertions(+), 12 deletions(-)
 create mode 100644 example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
 rename examples/scripts/generate_knowledge_pack.jl => example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl (84%)
 create mode 100644 example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
 create mode 100644 example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
 create mode 100644 example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
 create mode 100644 example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
 create mode 100644 example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
 rename {examples/scripts => example_scripts}/using_with_AIHelpMe.jl (98%)

diff --git a/README.md b/README.md
index 3688324..35ef955 100644
--- a/README.md
+++ b/README.md
@@ -32,7 +32,7 @@ Pkg.add(url="https://github.com/JuliaGenAI/DocsScraper.jl")
 
 ## Building the Index
 ```julia
-crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"]
+crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev"]
 
 index_path = make_knowledge_packs(crawlable_urls;
     index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true, target_path="knowledge_packs")
diff --git a/docs/make.jl b/docs/make.jl
index 7dbdbce..ee07dcc 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -5,7 +5,7 @@ DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive
 
 makedocs(;
     modules = [DocsScraper],
-    authors = "Shreyas Agrawal @splendidbug  and contributors",
+    authors = "Shreyas Agrawal @splendidbug and contributors",
     sitename = "DocsScraper.jl",
     repo = "https://github.com/JuliaGenAI/DocsScraper.jl/blob/{commit}{path}#{line}",
     format = Documenter.HTML(;
diff --git a/docs/src/index.md b/docs/src/index.md
index 800fcae..0864ef4 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -30,7 +30,7 @@ Pkg.add(url="https://github.com/JuliaGenAI/DocsScraper.jl")
 
 ## Building the Index
 ```julia
-crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"]
+crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev"]
 
 index_path = make_knowledge_packs(crawlable_urls;
     index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true, target_path=joinpath(pwd(), "knowledge_packs"))
diff --git a/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
new file mode 100644
index 0000000..671f9cc
--- /dev/null
+++ b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
@@ -0,0 +1,18 @@
+# The example below demonstrates the creation of Genie knowledge pack
+
+using Pkg
+Pkg.activate(temp = true)
+Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
+using DocsScraper
+
+# The crawler will run on these URLs to look for more URLs with the same hostname
+crawlable_urls = ["https://learn.genieframework.com/"]
+
+index_path = make_knowledge_packs(crawlable_urls;
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "genie", custom_metadata = "Genie ecosystem")
+
+# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
+
+# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
+# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
+# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/examples/scripts/generate_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
similarity index 84%
rename from examples/scripts/generate_knowledge_pack.jl
rename to example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
index 65883f2..cea5906 100644
--- a/examples/scripts/generate_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
@@ -24,11 +24,10 @@ single_page_urls = ["https://docs.julialang.org/en/v1/manual/missing/",
     "https://arrow.apache.org/julia/stable/reference/"]
 
 index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
-    embedding_dimension = 1024, embedding_bool = true,
-    target_path = joinpath(pwd(), "knowledge_to_delete"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem")
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem")
 
 # The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
 
-# The above example creates the output directory (Link to the output directory). It contains the sub-directories "Scraped" and "Index". 
+# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
 # "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
 # containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
new file mode 100644
index 0000000..9fbb7d5
--- /dev/null
+++ b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
@@ -0,0 +1,24 @@
+# The example below demonstrates the creation of JuliaLang knowledge pack
+
+using Pkg
+Pkg.activate(temp = true)
+Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
+using DocsScraper
+
+# The crawler will run on these URLs to look for more URLs with the same hostname
+crawlable_urls = [
+    "https://docs.julialang.org/en/v1/", "https://julialang.github.io/IJulia.jl/stable/",
+    "https://julialang.github.io/PackageCompiler.jl/stable/", "https://pkgdocs.julialang.org/dev/",
+    "https://julialang.github.io/JuliaSyntax.jl/dev/",
+    "https://julialang.github.io/AllocCheck.jl/dev/", "https://julialang.github.io/PrecompileTools.jl/stable/",
+    "https://julialang.github.io/StyledStrings.jl/dev/"]
+
+index_path = make_knowledge_packs(crawlable_urls;
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"),
+    index_name = "julialang", custom_metadata = "JuliaLang ecosystem")
+
+# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
+
+# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
+# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
+# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
new file mode 100644
index 0000000..eb45fbc
--- /dev/null
+++ b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
@@ -0,0 +1,26 @@
+# The example below demonstrates the creation of Makie knowledge pack
+
+using Pkg
+Pkg.activate(temp = true)
+Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
+using DocsScraper
+
+# The crawler will run on these URLs to look for more URLs with the same hostname
+crawlable_urls = ["https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/",
+    "https://beautiful.makie.org/dev/",
+    "https://juliadatascience.io/DataVisualizationMakie",
+    "https://docs.makie.org/v0.21/explanations/backends/glmakie", "https://juliadatascience.io/glmakie",
+    "https://docs.makie.org/v0.21/explanations/backends/cairomakie", "https://juliadatascience.io/cairomakie", "http://juliaplots.org/WGLMakie.jl/stable/",
+    "http://juliaplots.org/WGLMakie.jl/dev/", "https://docs.makie.org/v0.21/explanations/backends/wglmakie",
+    "https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/abstractplotting_api.html", "http://juliaplots.org/StatsMakie.jl/latest/",
+    "https://docs.juliahub.com/StatsMakie/RRy0o/0.2.3/manual/tutorial/", "https://geo.makie.org/v0.7.3/", "https://geo.makie.org/dev/",
+    "https://libgeos.org/doxygen/geos__c_8h.html", "https://docs.makie.org/v0.21/"]
+
+index_path = make_knowledge_packs(crawlable_urls;
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "makie", custom_metadata = "Makie ecosystem")
+
+# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
+
+# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
+# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
+# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
new file mode 100644
index 0000000..3a908ba
--- /dev/null
+++ b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
@@ -0,0 +1,29 @@
+# The example below demonstrates the creation of plots knowledge pack
+
+using Pkg
+Pkg.activate(temp = true)
+Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
+using DocsScraper
+
+# The crawler will run on these URLs to look for more URLs with the same hostname
+crawlable_urls = [
+    "https://docs.juliaplots.org/stable/", "https://docs.juliaplots.org/dev/",
+    "https://docs.juliaplots.org/latest/",
+    "https://docs.juliaplots.org/latest/generated/statsplots/", "https://docs.juliaplots.org/latest/ecosystem/",
+    "http://juliaplots.org/PlotlyJS.jl/stable/",
+    "http://juliaplots.org/PlotlyJS.jl/stable/manipulating_plots/", "https://docs.juliaplots.org/latest/gallery/gr/",
+    "https://docs.juliaplots.org/latest/gallery/unicodeplots/",
+    "https://docs.juliaplots.org/latest/gallery/pgfplotsx/",
+    "https://juliaplots.org/RecipesBase.jl/stable/",
+    "https://juliastats.org/StatsBase.jl/stable/", "https://juliastats.org/StatsBase.jl/stable/statmodels/",
+    "http://juliagraphs.org/GraphPlot.jl/",
+    "https://docs.juliahub.com/GraphPlot/bUwXr/0.6.0/"]
+
+index_path = make_knowledge_packs(crawlable_urls;
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "plots", custom_metadata = "Plots ecosystem")
+
+# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
+
+# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
+# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
+# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
new file mode 100644
index 0000000..ca6d472
--- /dev/null
+++ b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
@@ -0,0 +1,51 @@
+# The example below demonstrates the creation of SciML knowledge pack
+
+using Pkg
+Pkg.activate(temp = true)
+Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
+using DocsScraper
+
+# The crawler will run on these URLs to look for more URLs with the same hostname
+crawlable_urls = ["https://sciml.ai/", "https://docs.sciml.ai/DiffEqDocs/stable/",
+    "https://docs.sciml.ai/DiffEqDocs/stable/types/sde_types/",
+    "https://docs.sciml.ai/ModelingToolkit/dev/", "https://docs.sciml.ai/DiffEqFlux/stable/",
+    "https://docs.sciml.ai/NeuralPDE/stable/", "https://docs.sciml.ai/NeuralPDE/stable/tutorials/pdesystem/",
+    "https://docs.sciml.ai/Optimization/stable/",
+    "https://docs.sciml.ai/SciMLSensitivity/stable/", "https://docs.sciml.ai/DataDrivenDiffEq/stable/", "https://turinglang.org/",
+    "https://turinglang.org/docs/tutorials/docs-00-getting-started/", "https://juliamath.github.io/MeasureTheory.jl/stable/",
+    "https://juliamath.github.io/MeasureTheory.jl/stable/", "https://docs.sciml.ai/DiffEqGPU/stable/",
+    "https://chevronetc.github.io/DistributedOperations.jl/dev/", "https://docs.sciml.ai/DiffEqBayes/stable/",
+    "https://turinglang.org/docs/tutorials/10-bayesian-differential-equations/index.html", "https://docs.sciml.ai/OrdinaryDiffEq/stable/",
+    "https://docs.sciml.ai/Overview/stable/", "https://docs.sciml.ai/DiffEqDocs/stable/solvers/sde_solve/",
+    "https://docs.sciml.ai/SciMLSensitivity/stable/examples/dde/delay_diffeq/", "https://docs.sciml.ai/DiffEqDocs/stable/tutorials/dde_example/",
+    "https://docs.sciml.ai/DiffEqDocs/stable/types/dae_types/", "https://docs.sciml.ai/DiffEqCallbacks/stable/",
+    "https://docs.sciml.ai/SciMLBase/stable/",
+    "https://docs.sciml.ai/DiffEqDocs/stable/features/callback_library/", "https://docs.sciml.ai/LinearSolve/stable/",
+    "https://docs.sciml.ai/ModelingToolkit/stable/",
+    "https://docs.sciml.ai/DataInterpolations/stable/", "https://docs.sciml.ai/DeepEquilibriumNetworks/stable/",
+    "https://docs.sciml.ai/DiffEqParamEstim/stable/",
+    "https://docs.sciml.ai/Integrals/stable/", "https://docs.sciml.ai/EasyModelAnalysis/stable/",
+    "https://docs.sciml.ai/GlobalSensitivity/stable/",
+    "https://docs.sciml.ai/ExponentialUtilities/stable/", "https://docs.sciml.ai/HighDimPDE/stable/",
+    "https://docs.sciml.ai/SciMLTutorialsOutput/stable/",
+    "https://docs.sciml.ai/Catalyst/stable/", "https://docs.sciml.ai/Surrogates/stable/",
+    "https://docs.sciml.ai/SciMLBenchmarksOutput/stable/",
+    "https://docs.sciml.ai/NeuralOperators/stable/", "https://docs.sciml.ai/NonlinearSolve/stable/",
+    "https://docs.sciml.ai/RecursiveArrayTools/stable/",
+    "https://docs.sciml.ai/ReservoirComputing/stable/", "https://docs.sciml.ai/MethodOfLines/stable/", "https://lux.csail.mit.edu/dev/"
+]
+
+# The crawler will not look for more URLs on these pages
+single_page_urls = [
+    "https://johnfoster.pge.utexas.edu/hpc-book/DifferentialEquations_jl.html",
+    "https://julialang.org/blog/2019/01/fluxdiffeq/", "https://juliapackages.com/p/galacticoptim",
+    "https://julianlsolvers.github.io/Optim.jl/stable/"]
+
+index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "sciml", custom_metadata = "SciML ecosystem")
+
+# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
+
+# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
+# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
+# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
new file mode 100644
index 0000000..f163225
--- /dev/null
+++ b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
@@ -0,0 +1,21 @@
+# The example below demonstrates the creation of Tidier knowledge pack
+
+using Pkg
+Pkg.activate(temp = true)
+Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
+using DocsScraper
+
+# The crawler will run on these URLs to look for more URLs with the same hostname
+crawlable_urls = ["https://tidierorg.github.io/Tidier.jl/dev/",
+    "https://tidierorg.github.io/TidierPlots.jl/latest/",
+    "https://tidierorg.github.io/TidierData.jl/latest/",
+    "https://tidierorg.github.io/TidierDB.jl/latest/"]
+
+index_path = make_knowledge_packs(crawlable_urls;
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "tidier", custom_metadata = "Tidier ecosystem")
+
+# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
+
+# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
+# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
+# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/examples/scripts/using_with_AIHelpMe.jl b/example_scripts/using_with_AIHelpMe.jl
similarity index 98%
rename from examples/scripts/using_with_AIHelpMe.jl
rename to example_scripts/using_with_AIHelpMe.jl
index 4227ac3..05b3c21 100644
--- a/examples/scripts/using_with_AIHelpMe.jl
+++ b/example_scripts/using_with_AIHelpMe.jl
@@ -7,7 +7,7 @@ using AIHelpMe
 using AIHelpMe: pprint, last_result
 
 # Creating the index
-crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"]
+crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/"]
 index_path = make_knowledge_packs(crawlable_urls;
     index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true,
     target_path = "knowledge_packs")
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
index ce05b81..68055a1 100644
--- a/src/make_knowledge_packs.jl
+++ b/src/make_knowledge_packs.jl
@@ -62,7 +62,7 @@ end
 removes all dashes ('-') from a given string
 """
 function process_text(text::AbstractString)
-    return replace(lowercase(text), "-" => "", "_" => "")
+    return replace(lowercase(text), "-" => "", "_" => "", " " => "")
 end
 
 """
@@ -84,9 +84,8 @@ function validate_args(crawlable_urls::Vector{<:AbstractString} = String[];
         error("At least one of `input_urls` or `single_pages` must be provided.")
     end
     if !ispath(target_path)
-        @error "Target path $target_path does not exist"
-        target_path = joinpath(@__DIR__, "..", "knowledge_packs")
-        @info "Index path is set to: $target_path"
+        @warn "Target path provided does not exist. Creating path $target_path"
+        mkpath(target_path)
     end
 
     index_name = process_text(index_name)
@@ -177,7 +176,7 @@ Return chunks, sources by reading the .jls files in `joinpath(target_path, "Scra
 """
 function load_chunks_sources(target_path::AbstractString)
     scraped_files_dir = joinpath(target_path, "Scraped_files")
-    entries = readdir(joinpath(target_path, scraped_files_dir))
+    entries = readdir(scraped_files_dir)
 
     # Regular expressions to match the file patterns of chunks and sources
     chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"

From 77eb3a8f0f495d1fa35d15ef5e776db563f4e21f Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Sat, 24 Aug 2024 05:27:25 -0700
Subject: [PATCH 2/3] removed installations in examples

---
 README.md                                     | 21 +++++++++++++++++++
 docs/src/index.md                             | 21 +++++++++++++++++++
 .../genie_knowledge_pack.jl                   |  9 --------
 .../juliaData_knowledge_pack.jl               |  9 --------
 .../juliaLang_knowledge_pack.jl               |  9 --------
 .../makie_knowledge_pack.jl                   |  9 --------
 .../plots_knowledge_pack.jl                   |  9 --------
 .../sciml_knowledge_pack.jl                   |  9 --------
 .../tidier_knowledge_pack.jl                  |  9 --------
 src/extract_package_name.jl                   |  2 +-
 src/make_knowledge_packs.jl                   |  2 +-
 11 files changed, 44 insertions(+), 65 deletions(-)

diff --git a/README.md b/README.md
index 35ef955..e6a7651 100644
--- a/README.md
+++ b/README.md
@@ -100,3 +100,24 @@ using AIHelpMe: last_result
 # last_result() returns the last result from the RAG pipeline, ie, same as running aihelp(; return_all=true)
 print(last_result())
 ```
+## Output
+`make_knowledge_packs` creates the following files:
+
+```
+index_name\
+│
+├── Index\
+│   ├── index_name__artifact__info.txt
+│   ├── index_name__vDate__model_embedding_size-embedding_type__v1.0.hdf5
+│   └── index_name__vDate__model_embedding_size-embedding_type__v1.0.tar.gz 
+│
+├── Scraped_files\
+│   ├── scraped_hostname-chunks-max-chunk_size-min-min_chunk_size.jls
+│   ├── scraped_hostname-sources-max-chunk_size-min-min_chunk_size.jls
+│   └── . . .
+│
+└── index_name_URL_mapping.csv
+```
+- Index directory contains the .hdf5 and .tar.gz files along with the artifact__info.txt. Artifact info contains sha256 and git-tree-sha1 hashes. 
+- Scraped_files directory contains the scraped chunks and sources. These are separated by the hostnames of the URLs.
+- URL_mapping.csv contains the scraped URLs mapping them with the estimated package name.
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index 0864ef4..4014c92 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -97,3 +97,24 @@ Tip: Use `pprint` for nicer outputs with sources and `last_result` for more deta
 using AIHelpMe: last_result
 print(last_result())
 ```
+## Output
+`make_knowledge_packs` creates the following files:
+
+```
+index_name\
+│
+├── Index\
+│   ├── index_name__artifact__info.txt
+│   ├── index_name__vDate__model_embedding_size-embedding_type__v1.0.hdf5
+│   └── index_name__vDate__model_embedding_size-embedding_type__v1.0.tar.gz  
+│
+├── Scraped_files\
+│   ├── scraped_hostname-chunks-max-chunk_size-min-min_chunk_size.jls
+│   ├── scraped_hostname-sources-max-chunk_size-min-min_chunk_size.jls
+│   └── . . .
+│
+└── index_name_URL_mapping.csv
+```
+- Index directory contains the .hdf5 and .tar.gz files along with the artifact__info.txt. Artifact info contains sha256 and git-tree-sha1 hashes. 
+- Scraped_files directory contains the scraped chunks and sources. These are separated by the hostnames of the URLs.
+- URL_mapping.csv contains the scraped URLs mapping them with the estimated package name.
\ No newline at end of file
diff --git a/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
index 671f9cc..daea594 100644
--- a/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of Genie knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -10,9 +7,3 @@ crawlable_urls = ["https://learn.genieframework.com/"]
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "genie", custom_metadata = "Genie ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
index cea5906..fa4d4b7 100644
--- a/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of JuliaData knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -25,9 +22,3 @@ single_page_urls = ["https://docs.julialang.org/en/v1/manual/missing/",
 
 index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
index 9fbb7d5..84273f7 100644
--- a/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of JuliaLang knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -16,9 +13,3 @@ crawlable_urls = [
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"),
     index_name = "julialang", custom_metadata = "JuliaLang ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
index eb45fbc..b0bb87b 100644
--- a/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of Makie knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -18,9 +15,3 @@ crawlable_urls = ["https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/",
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "makie", custom_metadata = "Makie ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
index 3a908ba..1abeeee 100644
--- a/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of plots knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -21,9 +18,3 @@ crawlable_urls = [
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "plots", custom_metadata = "Plots ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
index ca6d472..a627b56 100644
--- a/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of SciML knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -43,9 +40,3 @@ single_page_urls = [
 
 index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "sciml", custom_metadata = "SciML ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
index f163225..3e1f6d8 100644
--- a/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of Tidier knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -13,9 +10,3 @@ crawlable_urls = ["https://tidierorg.github.io/Tidier.jl/dev/",
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "tidier", custom_metadata = "Tidier ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. 
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file 
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/src/extract_package_name.jl b/src/extract_package_name.jl
index 525cecf..f5eb8ca 100644
--- a/src/extract_package_name.jl
+++ b/src/extract_package_name.jl
@@ -4,7 +4,7 @@
 Strip URL of any http:// ot https:// or www. prefixes 
 """
 function clean_url(url::String)
-    # Remove http://, https://, www., or wwws.
+    # Remove the leading http:// or https:// and an optional www. (or www<digit>.) prefix
     cleaned_url = replace(url, r"^https?://(www\d?\.)?" => "")
     return cleaned_url
 end
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
index 68055a1..0591cc5 100644
--- a/src/make_knowledge_packs.jl
+++ b/src/make_knowledge_packs.jl
@@ -70,7 +70,7 @@ end
         single_urls::Vector{<:AbstractString} = String[], target_path::AbstractString = "", index_name::AbstractString = "")
 
 Validate args. Return error if both `crawlable_urls` and `single_urls` are empty. 
-Create a target path if input path is invalid. Create a gensym index if the input index is inavlid. 
+Create the target path if it does not exist. Create a gensym index if the input index is invalid.
 
 # Arguments
 - crawlable_urls: URLs that should be crawled to find more links

From fe30323fa3418e572b5a116a78d26ed2d53de739 Mon Sep 17 00:00:00 2001
From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>
Date: Sat, 24 Aug 2024 05:29:18 -0700
Subject: [PATCH 3/3] doc updates

---
 README.md         | 4 ++--
 docs/src/index.md | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index e6a7651..4458440 100644
--- a/README.md
+++ b/README.md
@@ -118,6 +118,6 @@ index_name\
 │
 └── index_name_URL_mapping.csv
 ```
-- Index directory contains the .hdf5 and .tar.gz files along with the artifact__info.txt. Artifact info contains sha256 and git-tree-sha1 hashes. 
-- Scraped_files directory contains the scraped chunks and sources. These are separated by the hostnames of the URLs.
+- Index\: contains the .hdf5 and .tar.gz files along with the artifact__info.txt. Artifact info contains sha256 and git-tree-sha1 hashes. 
+- Scraped_files\: contains the scraped chunks and sources. These are separated by the hostnames of the URLs.
 - URL_mapping.csv contains the scraped URLs mapping them with the estimated package name.
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index 4014c92..6e9daba 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -115,6 +115,6 @@ index_name\
 │
 └── index_name_URL_mapping.csv
 ```
-- Index directory contains the .hdf5 and .tar.gz files along with the artifact__info.txt. Artifact info contains sha256 and git-tree-sha1 hashes. 
-- Scraped_files directory contains the scraped chunks and sources. These are separated by the hostnames of the URLs.
+- Index\: contains the .hdf5 and .tar.gz files along with the artifact__info.txt. Artifact info contains sha256 and git-tree-sha1 hashes. 
+- Scraped_files\: contains the scraped chunks and sources. These are separated by the hostnames of the URLs.
 - URL_mapping.csv contains the scraped URLs mapping them with the estimated package name.
\ No newline at end of file