From 6c1526013005bc23414dbd658bae984d63fef486 Mon Sep 17 00:00:00 2001 From: Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Sat, 24 Aug 2024 04:42:08 -0700 Subject: [PATCH] added knowledgepack creation scripts --- README.md | 2 +- docs/make.jl | 2 +- docs/src/index.md | 2 +- .../genie_knowledge_pack.jl | 18 +++++++ .../juliaData_knowledge_pack.jl | 5 +- .../juliaLang_knowledge_pack.jl | 24 +++++++++ .../makie_knowledge_pack.jl | 26 ++++++++++ .../plots_knowledge_pack.jl | 29 +++++++++++ .../sciml_knowledge_pack.jl | 51 +++++++++++++++++++ .../tidier_knowledge_pack.jl | 21 ++++++++ .../using_with_AIHelpMe.jl | 2 +- src/make_knowledge_packs.jl | 9 ++-- 12 files changed, 179 insertions(+), 12 deletions(-) create mode 100644 example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl rename examples/scripts/generate_knowledge_pack.jl => example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl (84%) create mode 100644 example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl create mode 100644 example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl create mode 100644 example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl create mode 100644 example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl create mode 100644 example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl rename {examples/scripts => example_scripts}/using_with_AIHelpMe.jl (98%) diff --git a/README.md b/README.md index 3688324..35ef955 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Pkg.add(url="https://github.com/JuliaGenAI/DocsScraper.jl") ## Building the Index ```julia -crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"] +crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev"] index_path = make_knowledge_packs(crawlable_urls; index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true, target_path="knowledge_packs") diff --git a/docs/make.jl b/docs/make.jl 
index 7dbdbce..ee07dcc 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -5,7 +5,7 @@ DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive makedocs(; modules = [DocsScraper], - authors = "Shreyas Agrawal @splendidbug and contributors", + authors = "Shreyas Agrawal @splendidbug and contributors", sitename = "DocsScraper.jl", repo = "https://github.com/JuliaGenAI/DocsScraper.jl/blob/{commit}{path}#{line}", format = Documenter.HTML(; diff --git a/docs/src/index.md b/docs/src/index.md index 800fcae..0864ef4 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -30,7 +30,7 @@ Pkg.add(url="https://github.com/JuliaGenAI/DocsScraper.jl") ## Building the Index ```julia -crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"] +crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev"] index_path = make_knowledge_packs(crawlable_urls; index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true, target_path=joinpath(pwd(), "knowledge_packs")) diff --git a/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl new file mode 100644 index 0000000..671f9cc --- /dev/null +++ b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl @@ -0,0 +1,18 @@ +# The example below demonstrates the creation of Genie knowledge pack + +using Pkg +Pkg.activate(temp = true) +Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl") +using DocsScraper + +# The crawler will run on these URLs to look for more URLs with the same hostname +crawlable_urls = ["https://learn.genieframework.com/"] + +index_path = make_knowledge_packs(crawlable_urls; + target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "genie", custom_metadata = "Genie ecosystem") + +# The index created here uses the default embedding settings; as the target path name indicates, that is 3072 embedding dimensions with Float32 embeddings and a max chunk size of 384. 
+ +# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". +# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file +# containing the artifact info. The output directory also contains the URL mapping csv. diff --git a/examples/scripts/generate_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl similarity index 84% rename from examples/scripts/generate_knowledge_pack.jl rename to example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl index 65883f2..cea5906 100644 --- a/examples/scripts/generate_knowledge_pack.jl +++ b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl @@ -24,11 +24,10 @@ single_page_urls = ["https://docs.julialang.org/en/v1/manual/missing/", "https://arrow.apache.org/julia/stable/reference/"] index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls, - embedding_dimension = 1024, embedding_bool = true, - target_path = joinpath(pwd(), "knowledge_to_delete"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem") + target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem") # The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384. -# The above example creates the output directory (Link to the output directory). It contains the sub-directories "Scraped" and "Index". +# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". # "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file # containing the artifact info. The output directory also contains the URL mapping csv. 
diff --git a/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl new file mode 100644 index 0000000..9fbb7d5 --- /dev/null +++ b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl @@ -0,0 +1,24 @@ +# The example below demonstrates the creation of JuliaLang knowledge pack + +using Pkg +Pkg.activate(temp = true) +Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl") +using DocsScraper + +# The crawler will run on these URLs to look for more URLs with the same hostname +crawlable_urls = [ + "https://docs.julialang.org/en/v1/", "https://julialang.github.io/IJulia.jl/stable/", + "https://julialang.github.io/PackageCompiler.jl/stable/", "https://pkgdocs.julialang.org/dev/", + "https://julialang.github.io/JuliaSyntax.jl/dev/", + "https://julialang.github.io/AllocCheck.jl/dev/", "https://julialang.github.io/PrecompileTools.jl/stable/", + "https://julialang.github.io/StyledStrings.jl/dev/"] + +index_path = make_knowledge_packs(crawlable_urls; + target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), + index_name = "julialang", custom_metadata = "JuliaLang ecosystem") + +# The index created here uses the default embedding settings; as the target path name indicates, that is 3072 embedding dimensions with Float32 embeddings and a max chunk size of 384. + +# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". +# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file +# containing the artifact info. The output directory also contains the URL mapping csv. 
diff --git a/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl new file mode 100644 index 0000000..eb45fbc --- /dev/null +++ b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl @@ -0,0 +1,26 @@ +# The example below demonstrates the creation of Makie knowledge pack + +using Pkg +Pkg.activate(temp = true) +Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl") +using DocsScraper + +# The crawler will run on these URLs to look for more URLs with the same hostname +crawlable_urls = ["https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/", + "https://beautiful.makie.org/dev/", + "https://juliadatascience.io/DataVisualizationMakie", + "https://docs.makie.org/v0.21/explanations/backends/glmakie", "https://juliadatascience.io/glmakie", + "https://docs.makie.org/v0.21/explanations/backends/cairomakie", "https://juliadatascience.io/cairomakie", "http://juliaplots.org/WGLMakie.jl/stable/", + "http://juliaplots.org/WGLMakie.jl/dev/", "https://docs.makie.org/v0.21/explanations/backends/wglmakie", + "https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/abstractplotting_api.html", "http://juliaplots.org/StatsMakie.jl/latest/", + "https://docs.juliahub.com/StatsMakie/RRy0o/0.2.3/manual/tutorial/", "https://geo.makie.org/v0.7.3/", "https://geo.makie.org/dev/", + "https://libgeos.org/doxygen/geos__c_8h.html", "https://docs.makie.org/v0.21/"] + +index_path = make_knowledge_packs(crawlable_urls; + target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "makie", custom_metadata = "Makie ecosystem") + +# The index created here uses the default embedding settings; as the target path name indicates, that is 3072 embedding dimensions with Float32 embeddings and a max chunk size of 384. + +# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". +# "Scraped" contains .jls files of chunks and sources of the scraped URLs. 
Index contains the created index along with a .txt file +# containing the artifact info. The output directory also contains the URL mapping csv. diff --git a/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl new file mode 100644 index 0000000..3a908ba --- /dev/null +++ b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl @@ -0,0 +1,29 @@ +# The example below demonstrates the creation of plots knowledge pack + +using Pkg +Pkg.activate(temp = true) +Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl") +using DocsScraper + +# The crawler will run on these URLs to look for more URLs with the same hostname +crawlable_urls = [ + "https://docs.juliaplots.org/stable/", "https://docs.juliaplots.org/dev/", + "https://docs.juliaplots.org/latest/", + "https://docs.juliaplots.org/latest/generated/statsplots/", "https://docs.juliaplots.org/latest/ecosystem/", + "http://juliaplots.org/PlotlyJS.jl/stable/", + "http://juliaplots.org/PlotlyJS.jl/stable/manipulating_plots/", "https://docs.juliaplots.org/latest/gallery/gr/", + "https://docs.juliaplots.org/latest/gallery/unicodeplots/", + "https://docs.juliaplots.org/latest/gallery/pgfplotsx/", + "https://juliaplots.org/RecipesBase.jl/stable/", + "https://juliastats.org/StatsBase.jl/stable/", "https://juliastats.org/StatsBase.jl/stable/statmodels/", + "http://juliagraphs.org/GraphPlot.jl/", + "https://docs.juliahub.com/GraphPlot/bUwXr/0.6.0/"] + +index_path = make_knowledge_packs(crawlable_urls; + target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "plots", custom_metadata = "Plots ecosystem") + +# The index created here uses the default embedding settings; as the target path name indicates, that is 3072 embedding dimensions with Float32 embeddings and a max chunk size of 384. + +# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". +# "Scraped" contains .jls files of chunks and sources of the scraped URLs. 
Index contains the created index along with a .txt file +# containing the artifact info. The output directory also contains the URL mapping csv. diff --git a/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl new file mode 100644 index 0000000..ca6d472 --- /dev/null +++ b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl @@ -0,0 +1,51 @@ +# The example below demonstrates the creation of SciML knowledge pack + +using Pkg +Pkg.activate(temp = true) +Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl") +using DocsScraper + +# The crawler will run on these URLs to look for more URLs with the same hostname +crawlable_urls = ["https://sciml.ai/", "https://docs.sciml.ai/DiffEqDocs/stable/", + "https://docs.sciml.ai/DiffEqDocs/stable/types/sde_types/", + "https://docs.sciml.ai/ModelingToolkit/dev/", "https://docs.sciml.ai/DiffEqFlux/stable/", + "https://docs.sciml.ai/NeuralPDE/stable/", "https://docs.sciml.ai/NeuralPDE/stable/tutorials/pdesystem/", + "https://docs.sciml.ai/Optimization/stable/", + "https://docs.sciml.ai/SciMLSensitivity/stable/", "https://docs.sciml.ai/DataDrivenDiffEq/stable/", "https://turinglang.org/", + "https://turinglang.org/docs/tutorials/docs-00-getting-started/", "https://juliamath.github.io/MeasureTheory.jl/stable/", + "https://juliamath.github.io/MeasureTheory.jl/stable/", "https://docs.sciml.ai/DiffEqGPU/stable/", + "https://chevronetc.github.io/DistributedOperations.jl/dev/", "https://docs.sciml.ai/DiffEqBayes/stable/", + "https://turinglang.org/docs/tutorials/10-bayesian-differential-equations/index.html", "https://docs.sciml.ai/OrdinaryDiffEq/stable/", + "https://docs.sciml.ai/Overview/stable/", "https://docs.sciml.ai/DiffEqDocs/stable/solvers/sde_solve/", + "https://docs.sciml.ai/SciMLSensitivity/stable/examples/dde/delay_diffeq/", "https://docs.sciml.ai/DiffEqDocs/stable/tutorials/dde_example/", + 
"https://docs.sciml.ai/DiffEqDocs/stable/types/dae_types/", "https://docs.sciml.ai/DiffEqCallbacks/stable/", + "https://docs.sciml.ai/SciMLBase/stable/", + "https://docs.sciml.ai/DiffEqDocs/stable/features/callback_library/", "https://docs.sciml.ai/LinearSolve/stable/", + "https://docs.sciml.ai/ModelingToolkit/stable/", + "https://docs.sciml.ai/DataInterpolations/stable/", "https://docs.sciml.ai/DeepEquilibriumNetworks/stable/", + "https://docs.sciml.ai/DiffEqParamEstim/stable/", + "https://docs.sciml.ai/Integrals/stable/", "https://docs.sciml.ai/EasyModelAnalysis/stable/", + "https://docs.sciml.ai/GlobalSensitivity/stable/", + "https://docs.sciml.ai/ExponentialUtilities/stable/", "https://docs.sciml.ai/HighDimPDE/stable/", + "https://docs.sciml.ai/SciMLTutorialsOutput/stable/", + "https://docs.sciml.ai/Catalyst/stable/", "https://docs.sciml.ai/Surrogates/stable/", + "https://docs.sciml.ai/SciMLBenchmarksOutput/stable/", + "https://docs.sciml.ai/NeuralOperators/stable/", "https://docs.sciml.ai/NonlinearSolve/stable/", + "https://docs.sciml.ai/RecursiveArrayTools/stable/", + "https://docs.sciml.ai/ReservoirComputing/stable/", "https://docs.sciml.ai/MethodOfLines/stable/", "https://lux.csail.mit.edu/dev/" +] + +# Crawler would not look for more URLs on these +single_page_urls = [ + "https://johnfoster.pge.utexas.edu/hpc-book/DifferentialEquations_jl.html", + "https://julialang.org/blog/2019/01/fluxdiffeq/", "https://juliapackages.com/p/galacticoptim", + "https://julianlsolvers.github.io/Optim.jl/stable/"] + +index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls, + target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "sciml", custom_metadata = "SciML ecosystem") + +# The index created here uses the default embedding settings; as the target path name indicates, that is 3072 embedding dimensions with Float32 embeddings and a max chunk size of 384. + +# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". 
+# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file +# containing the artifact info. The output directory also contains the URL mapping csv. diff --git a/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl new file mode 100644 index 0000000..f163225 --- /dev/null +++ b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl @@ -0,0 +1,21 @@ +# The example below demonstrates the creation of Tidier knowledge pack + +using Pkg +Pkg.activate(temp = true) +Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl") +using DocsScraper + +# The crawler will run on these URLs to look for more URLs with the same hostname +crawlable_urls = ["https://tidierorg.github.io/Tidier.jl/dev/", + "https://tidierorg.github.io/TidierPlots.jl/latest/", + "https://tidierorg.github.io/TidierData.jl/latest/", + "https://tidierorg.github.io/TidierDB.jl/latest/"] + +index_path = make_knowledge_packs(crawlable_urls; + target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "tidier", custom_metadata = "Tidier ecosystem") + +# The index created here uses the default embedding settings; as the target path name indicates, that is 3072 embedding dimensions with Float32 embeddings and a max chunk size of 384. + +# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index". +# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file +# containing the artifact info. The output directory also contains the URL mapping csv. 
diff --git a/examples/scripts/using_with_AIHelpMe.jl b/example_scripts/using_with_AIHelpMe.jl similarity index 98% rename from examples/scripts/using_with_AIHelpMe.jl rename to example_scripts/using_with_AIHelpMe.jl index 4227ac3..05b3c21 100644 --- a/examples/scripts/using_with_AIHelpMe.jl +++ b/example_scripts/using_with_AIHelpMe.jl @@ -7,7 +7,7 @@ using AIHelpMe using AIHelpMe: pprint, last_result # Creating the index -crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/home/"] +crawlable_urls = ["https://juliagenai.github.io/DocsScraper.jl/dev/"] index_path = make_knowledge_packs(crawlable_urls; index_name = "docsscraper", embedding_dimension = 1024, embedding_bool = true, target_path = "knowledge_packs") diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl index ce05b81..68055a1 100644 --- a/src/make_knowledge_packs.jl +++ b/src/make_knowledge_packs.jl @@ -62,7 +62,7 @@ end removes all dashes ('-') from a given string """ function process_text(text::AbstractString) - return replace(lowercase(text), "-" => "", "_" => "") + return replace(lowercase(text), "-" => "", "_" => "", " " => "") end """ @@ -84,9 +84,8 @@ function validate_args(crawlable_urls::Vector{<:AbstractString} = String[]; error("At least one of `input_urls` or `single_pages` must be provided.") end if !ispath(target_path) - @error "Target path $target_path does not exist" - target_path = joinpath(@__DIR__, "..", "knowledge_packs") - @info "Index path is set to: $target_path" + @warn "Target path provided does not exist. 
Creating path $target_path" + mkpath(target_path) end index_name = process_text(index_name) @@ -177,7 +176,7 @@ Return chunks, sources by reading the .jls files in `joinpath(target_path, "Scra """ function load_chunks_sources(target_path::AbstractString) scraped_files_dir = joinpath(target_path, "Scraped_files") - entries = readdir(joinpath(target_path, scraped_files_dir)) + entries = readdir(scraped_files_dir) # Regular expressions to match the file patterns of chunks and sources chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"