Commit
added knowledgepack creation scripts
splendidbug committed Aug 24, 2024
1 parent f70588a commit 6c15260
Showing 12 changed files with 179 additions and 12 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -32,7 +32,7 @@ Pkg.add(url="https://github.com/JuliaGenAI/DocsScraper.jl")

## Building the Index
```julia
-crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"]
+crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev"]

index_path = make_knowledge_packs(crawlable_urls;
index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true, target_path="knowledge_packs")
```
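Once built, the index can be loaded for question answering with AIHelpMe, as the example script later in this commit does. A minimal sketch (assuming AIHelpMe's `load_index!` and the `aihelp` string macro, plus an OpenAI API key in the environment):

```julia
using AIHelpMe
using AIHelpMe: pprint, load_index!

# Load the locally built knowledge pack; index_path is returned by make_knowledge_packs above
load_index!(index_path)

# Ask a question against the indexed documentation and pretty-print the answer
pprint(aihelp"How do I build an index with DocsScraper.jl?")
```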
2 changes: 1 addition & 1 deletion docs/make.jl
@@ -5,7 +5,7 @@ DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive

makedocs(;
modules = [DocsScraper],
-    authors = "Shreyas Agrawal @splendidbug and contributors",
+    authors = "Shreyas Agrawal @splendidbug and contributors",
sitename = "DocsScraper.jl",
repo = "https://github.com/JuliaGenAI/DocsScraper.jl/blob/{commit}{path}#{line}",
format = Documenter.HTML(;
2 changes: 1 addition & 1 deletion docs/src/index.md
@@ -30,7 +30,7 @@ Pkg.add(url="https://github.com/JuliaGenAI/DocsScraper.jl")

## Building the Index
```julia
-crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"]
+crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev"]

index_path = make_knowledge_packs(crawlable_urls;
index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true, target_path=joinpath(pwd(), "knowledge_packs"))
```
18 changes: 18 additions & 0 deletions example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
@@ -0,0 +1,18 @@
# The example below demonstrates the creation of the Genie knowledge pack

using Pkg
Pkg.activate(temp = true)
Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
using DocsScraper

# The crawler will run on these URLs to look for more URLs with the same hostname
crawlable_urls = ["https://learn.genieframework.com/"]

index_path = make_knowledge_packs(crawlable_urls;
target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "genie", custom_metadata = "Genie ecosystem")

# The index created here uses the defaults: 3072 embedding dimensions, Float32 embeddings, and a max chunk size of 384, as encoded in the target path name.

# The above example creates an output directory named after `index_name`, which contains the sub-directories "Scraped_files" and "Index".
# "Scraped_files" contains .jls files with the chunks and sources of the scraped URLs. "Index" contains the created index along with a .txt file
# containing the artifact info. The output directory also contains the URL-mapping CSV.
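If different embedding settings are wanted, the same call accepts the keyword arguments shown in the README. A minimal sketch of a variant (the target directory name below is illustrative, not prescribed):

```julia
# Variant: 1024-dimensional boolean embeddings instead of the defaults
index_path = make_knowledge_packs(crawlable_urls;
    index_name = "genie", embedding_dimension = 1024, embedding_bool = true,
    target_path = joinpath("knowledge_packs", "dim=1024;chunk_size=384;Bool"))
```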
5 changes: 2 additions & 3 deletions example_scripts/creating_knowledge_packs/juliadata_knowledge_pack.jl
@@ -24,11 +24,10 @@ single_page_urls = ["https://docs.julialang.org/en/v1/manual/missing/",
"https://arrow.apache.org/julia/stable/reference/"]

index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
-    embedding_dimension = 1024, embedding_bool = true,
-    target_path = joinpath(pwd(), "knowledge_to_delete"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem")
+    target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem")

# The index created here uses the defaults: 3072 embedding dimensions, Float32 embeddings, and a max chunk size of 384, as encoded in the target path name.

-# The above example creates the output directory (Link to the output directory). It contains the sub-directories "Scraped" and "Index".
+# The above example creates an output directory named after `index_name`, which contains the sub-directories "Scraped_files" and "Index".
# "Scraped_files" contains .jls files with the chunks and sources of the scraped URLs. "Index" contains the created index along with a .txt file
# containing the artifact info. The output directory also contains the URL-mapping CSV.
24 changes: 24 additions & 0 deletions example_scripts/creating_knowledge_packs/julialang_knowledge_pack.jl
@@ -0,0 +1,24 @@
# The example below demonstrates the creation of the JuliaLang knowledge pack

using Pkg
Pkg.activate(temp = true)
Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
using DocsScraper

# The crawler will run on these URLs to look for more URLs with the same hostname
crawlable_urls = [
"https://docs.julialang.org/en/v1/", "https://julialang.github.io/IJulia.jl/stable/",
"https://julialang.github.io/PackageCompiler.jl/stable/", "https://pkgdocs.julialang.org/dev/",
"https://julialang.github.io/JuliaSyntax.jl/dev/",
"https://julialang.github.io/AllocCheck.jl/dev/", "https://julialang.github.io/PrecompileTools.jl/stable/",
"https://julialang.github.io/StyledStrings.jl/dev/"]

index_path = make_knowledge_packs(crawlable_urls;
target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"),
index_name = "julialang", custom_metadata = "JuliaLang ecosystem")

# The index created here uses the defaults: 3072 embedding dimensions, Float32 embeddings, and a max chunk size of 384, as encoded in the target path name.

# The above example creates an output directory named after `index_name`, which contains the sub-directories "Scraped_files" and "Index".
# "Scraped_files" contains .jls files with the chunks and sources of the scraped URLs. "Index" contains the created index along with a .txt file
# containing the artifact info. The output directory also contains the URL-mapping CSV.
26 changes: 26 additions & 0 deletions example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
@@ -0,0 +1,26 @@
# The example below demonstrates the creation of the Makie knowledge pack

using Pkg
Pkg.activate(temp = true)
Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
using DocsScraper

# The crawler will run on these URLs to look for more URLs with the same hostname
crawlable_urls = ["https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/",
"https://beautiful.makie.org/dev/",
"https://juliadatascience.io/DataVisualizationMakie",
"https://docs.makie.org/v0.21/explanations/backends/glmakie", "https://juliadatascience.io/glmakie",
"https://docs.makie.org/v0.21/explanations/backends/cairomakie", "https://juliadatascience.io/cairomakie", "http://juliaplots.org/WGLMakie.jl/stable/",
"http://juliaplots.org/WGLMakie.jl/dev/", "https://docs.makie.org/v0.21/explanations/backends/wglmakie",
"https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/abstractplotting_api.html", "http://juliaplots.org/StatsMakie.jl/latest/",
"https://docs.juliahub.com/StatsMakie/RRy0o/0.2.3/manual/tutorial/", "https://geo.makie.org/v0.7.3/", "https://geo.makie.org/dev/",
"https://libgeos.org/doxygen/geos__c_8h.html", "https://docs.makie.org/v0.21/"]

index_path = make_knowledge_packs(crawlable_urls;
target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "makie", custom_metadata = "Makie ecosystem")

# The index created here uses the defaults: 3072 embedding dimensions, Float32 embeddings, and a max chunk size of 384, as encoded in the target path name.

# The above example creates an output directory named after `index_name`, which contains the sub-directories "Scraped_files" and "Index".
# "Scraped_files" contains .jls files with the chunks and sources of the scraped URLs. "Index" contains the created index along with a .txt file
# containing the artifact info. The output directory also contains the URL-mapping CSV.
29 changes: 29 additions & 0 deletions example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
@@ -0,0 +1,29 @@
# The example below demonstrates the creation of the Plots knowledge pack

using Pkg
Pkg.activate(temp = true)
Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
using DocsScraper

# The crawler will run on these URLs to look for more URLs with the same hostname
crawlable_urls = [
"https://docs.juliaplots.org/stable/", "https://docs.juliaplots.org/dev/",
"https://docs.juliaplots.org/latest/",
"https://docs.juliaplots.org/latest/generated/statsplots/", "https://docs.juliaplots.org/latest/ecosystem/",
"http://juliaplots.org/PlotlyJS.jl/stable/",
"http://juliaplots.org/PlotlyJS.jl/stable/manipulating_plots/", "https://docs.juliaplots.org/latest/gallery/gr/",
"https://docs.juliaplots.org/latest/gallery/unicodeplots/",
"https://docs.juliaplots.org/latest/gallery/pgfplotsx/",
"https://juliaplots.org/RecipesBase.jl/stable/",
"https://juliastats.org/StatsBase.jl/stable/", "https://juliastats.org/StatsBase.jl/stable/statmodels/",
"http://juliagraphs.org/GraphPlot.jl/",
"https://docs.juliahub.com/GraphPlot/bUwXr/0.6.0/"]

index_path = make_knowledge_packs(crawlable_urls;
target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "plots", custom_metadata = "Plots ecosystem")

# The index created here uses the defaults: 3072 embedding dimensions, Float32 embeddings, and a max chunk size of 384, as encoded in the target path name.

# The above example creates an output directory named after `index_name`, which contains the sub-directories "Scraped_files" and "Index".
# "Scraped_files" contains .jls files with the chunks and sources of the scraped URLs. "Index" contains the created index along with a .txt file
# containing the artifact info. The output directory also contains the URL-mapping CSV.
51 changes: 51 additions & 0 deletions example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
@@ -0,0 +1,51 @@
# The example below demonstrates the creation of the SciML knowledge pack

using Pkg
Pkg.activate(temp = true)
Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
using DocsScraper

# The crawler will run on these URLs to look for more URLs with the same hostname
crawlable_urls = ["https://sciml.ai/", "https://docs.sciml.ai/DiffEqDocs/stable/",
"https://docs.sciml.ai/DiffEqDocs/stable/types/sde_types/",
"https://docs.sciml.ai/ModelingToolkit/dev/", "https://docs.sciml.ai/DiffEqFlux/stable/",
"https://docs.sciml.ai/NeuralPDE/stable/", "https://docs.sciml.ai/NeuralPDE/stable/tutorials/pdesystem/",
"https://docs.sciml.ai/Optimization/stable/",
"https://docs.sciml.ai/SciMLSensitivity/stable/", "https://docs.sciml.ai/DataDrivenDiffEq/stable/", "https://turinglang.org/",
"https://turinglang.org/docs/tutorials/docs-00-getting-started/", "https://juliamath.github.io/MeasureTheory.jl/stable/",
"https://juliamath.github.io/MeasureTheory.jl/stable/", "https://docs.sciml.ai/DiffEqGPU/stable/",
"https://chevronetc.github.io/DistributedOperations.jl/dev/", "https://docs.sciml.ai/DiffEqBayes/stable/",
"https://turinglang.org/docs/tutorials/10-bayesian-differential-equations/index.html", "https://docs.sciml.ai/OrdinaryDiffEq/stable/",
"https://docs.sciml.ai/Overview/stable/", "https://docs.sciml.ai/DiffEqDocs/stable/solvers/sde_solve/",
"https://docs.sciml.ai/SciMLSensitivity/stable/examples/dde/delay_diffeq/", "https://docs.sciml.ai/DiffEqDocs/stable/tutorials/dde_example/",
"https://docs.sciml.ai/DiffEqDocs/stable/types/dae_types/", "https://docs.sciml.ai/DiffEqCallbacks/stable/",
"https://docs.sciml.ai/SciMLBase/stable/",
"https://docs.sciml.ai/DiffEqDocs/stable/features/callback_library/", "https://docs.sciml.ai/LinearSolve/stable/",
"https://docs.sciml.ai/ModelingToolkit/stable/",
"https://docs.sciml.ai/DataInterpolations/stable/", "https://docs.sciml.ai/DeepEquilibriumNetworks/stable/",
"https://docs.sciml.ai/DiffEqParamEstim/stable/",
"https://docs.sciml.ai/Integrals/stable/", "https://docs.sciml.ai/EasyModelAnalysis/stable/",
"https://docs.sciml.ai/GlobalSensitivity/stable/",
"https://docs.sciml.ai/ExponentialUtilities/stable/", "https://docs.sciml.ai/HighDimPDE/stable/",
"https://docs.sciml.ai/SciMLTutorialsOutput/stable/",
"https://docs.sciml.ai/Catalyst/stable/", "https://docs.sciml.ai/Surrogates/stable/",
"https://docs.sciml.ai/SciMLBenchmarksOutput/stable/",
"https://docs.sciml.ai/NeuralOperators/stable/", "https://docs.sciml.ai/NonlinearSolve/stable/",
"https://docs.sciml.ai/RecursiveArrayTools/stable/",
"https://docs.sciml.ai/ReservoirComputing/stable/", "https://docs.sciml.ai/MethodOfLines/stable/", "https://lux.csail.mit.edu/dev/"
]

# The crawler will not look for more URLs on these pages
single_page_urls = [
"https://johnfoster.pge.utexas.edu/hpc-book/DifferentialEquations_jl.html",
"https://julialang.org/blog/2019/01/fluxdiffeq/", "https://juliapackages.com/p/galacticoptim",
"https://julianlsolvers.github.io/Optim.jl/stable/"]

index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "sciml", custom_metadata = "SciML ecosystem")

# The index created here uses the defaults: 3072 embedding dimensions, Float32 embeddings, and a max chunk size of 384, as encoded in the target path name.

# The above example creates an output directory named after `index_name`, which contains the sub-directories "Scraped_files" and "Index".
# "Scraped_files" contains .jls files with the chunks and sources of the scraped URLs. "Index" contains the created index along with a .txt file
# containing the artifact info. The output directory also contains the URL-mapping CSV.
21 changes: 21 additions & 0 deletions example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
@@ -0,0 +1,21 @@
# The example below demonstrates the creation of the Tidier knowledge pack

using Pkg
Pkg.activate(temp = true)
Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
using DocsScraper

# The crawler will run on these URLs to look for more URLs with the same hostname
crawlable_urls = ["https://tidierorg.github.io/Tidier.jl/dev/",
"https://tidierorg.github.io/TidierPlots.jl/latest/",
"https://tidierorg.github.io/TidierData.jl/latest/",
"https://tidierorg.github.io/TidierDB.jl/latest/"]

index_path = make_knowledge_packs(crawlable_urls;
target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "tidier", custom_metadata = "Tidier ecosystem")

# The index created here uses the defaults: 3072 embedding dimensions, Float32 embeddings, and a max chunk size of 384, as encoded in the target path name.

# The above example creates an output directory named after `index_name`, which contains the sub-directories "Scraped_files" and "Index".
# "Scraped_files" contains .jls files with the chunks and sources of the scraped URLs. "Index" contains the created index along with a .txt file
# containing the artifact info. The output directory also contains the URL-mapping CSV.
@@ -7,7 +7,7 @@ using AIHelpMe
using AIHelpMe: pprint, last_result

# Creating the index
-crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"]
+crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/"]
index_path = make_knowledge_packs(crawlable_urls;
index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true,
target_path = "knowledge_packs")
9 changes: 4 additions & 5 deletions src/make_knowledge_packs.jl
@@ -62,7 +62,7 @@ end
removes all dashes ('-'), underscores ('_'), and spaces (' ') from a given string, and lowercases it
"""
function process_text(text::AbstractString)
-    return replace(lowercase(text), "-" => "", "_" => "")
+    return replace(lowercase(text), "-" => "", "_" => "", " " => "")

end
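For reference, a quick sanity check of the updated helper (a minimal sketch; `process_text` is an internal function, so it is qualified with the module name):

```julia
using DocsScraper

# Lowercases the input and strips dashes, underscores, and (now) spaces
@assert DocsScraper.process_text("My-Index_Name 2") == "myindexname2"
```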

"""
@@ -84,9 +84,8 @@ function validate_args(crawlable_urls::Vector{<:AbstractString} = String[];
error("At least one of `input_urls` or `single_pages` must be provided.")
end
if !ispath(target_path)
@error "Target path $target_path does not exist"
target_path = joinpath(@__DIR__, "..", "knowledge_packs")
@info "Index path is set to: $target_path"
@warn "Target path provided does not exist. Creating path $target_path"
mkpath(target_path)

end
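Note: with this change, a nonexistent `target_path` is created with `mkpath` and used as provided, instead of being silently replaced by a default `knowledge_packs` directory inside the package tree.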

index_name = process_text(index_name)
@@ -177,7 +176,7 @@ Return chunks, sources by reading the .jls files in `joinpath(target_path, "Scraped_files")`
"""
function load_chunks_sources(target_path::AbstractString)
scraped_files_dir = joinpath(target_path, "Scraped_files")
-    entries = readdir(joinpath(target_path, scraped_files_dir))
+    entries = readdir(scraped_files_dir)


# Regular expressions to match the file patterns of chunks and sources
chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
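Note: the removed line applied `target_path` twice. Since `scraped_files_dir` is already `joinpath(target_path, "Scraped_files")`, wrapping it in another `joinpath(target_path, ...)` produced a nested, nonexistent path; the fix reads the directory directly.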
