diff --git a/README.md b/README.md
index 35ef955..e6a7651 100644
--- a/README.md
+++ b/README.md
@@ -100,3 +100,24 @@ using AIHelpMe: last_result
 # last_result() returns the last result from the RAG pipeline, ie, same as running aihelp(; return_all=true)
 print(last_result())
 ```
+## Output
+`make_knowledge_packs` creates the following files:
+
+```
+index_name\
+│
+├── Index\
+│   ├── index_name__artifact__info.txt
+│   ├── index_name__vDate__model_embedding_size-embedding_type__v1.0.hdf5
+│   └── index_name__vDate__model_embedding_size-embedding_type__v1.0.tar.gz
+│
+├── Scraped_files\
+│   ├── scraped_hostname-chunks-max-chunk_size-min-min_chunk_size.jls
+│   ├── scraped_hostname-sources-max-chunk_size-min-min_chunk_size.jls
+│   └── . . .
+│
+└── index_name_URL_mapping.csv
+```
+- The Index directory contains the .hdf5 and .tar.gz files along with artifact__info.txt, which records the sha256 and git-tree-sha1 hashes.
+- The Scraped_files directory contains the scraped chunks and sources, separated by the hostnames of the URLs.
+- URL_mapping.csv maps the scraped URLs to their estimated package names.
\ No newline at end of file
diff --git a/docs/src/index.md b/docs/src/index.md
index 0864ef4..4014c92 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -97,3 +97,24 @@ Tip: Use `pprint` for nicer outputs with sources and `last_result` for more deta
 using AIHelpMe: last_result
 print(last_result())
 ```
+## Output
+`make_knowledge_packs` creates the following files:
+
+```
+index_name\
+│
+├── Index\
+│   ├── index_name__artifact__info.txt
+│   ├── index_name__vDate__model_embedding_size-embedding_type__v1.0.hdf5
+│   └── index_name__vDate__model_embedding_size-embedding_type__v1.0.tar.gz
+│
+├── Scraped_files\
+│   ├── scraped_hostname-chunks-max-chunk_size-min-min_chunk_size.jls
+│   ├── scraped_hostname-sources-max-chunk_size-min-min_chunk_size.jls
+│   └── . . .
+│
+└── index_name_URL_mapping.csv
+```
+- The Index directory contains the .hdf5 and .tar.gz files along with artifact__info.txt, which records the sha256 and git-tree-sha1 hashes.
+- The Scraped_files directory contains the scraped chunks and sources, separated by the hostnames of the URLs.
+- URL_mapping.csv maps the scraped URLs to their estimated package names.
\ No newline at end of file
diff --git a/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
index 671f9cc..daea594 100644
--- a/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/genie_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of Genie knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -10,9 +7,3 @@ crawlable_urls = ["https://learn.genieframework.com/"]
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "genie", custom_metadata = "Genie ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384.
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index".
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
index cea5906..fa4d4b7 100644
--- a/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/juliaData_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of JuliaData knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -25,9 +22,3 @@ single_page_urls = ["https://docs.julialang.org/en/v1/manual/missing/",
 
 index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "juliadata", custom_metadata = "JuliaData ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384.
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index".
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
index 9fbb7d5..84273f7 100644
--- a/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/juliaLang_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of JuliaLang knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -16,9 +13,3 @@ crawlable_urls = [
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "julialang", custom_metadata = "JuliaLang ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384.
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index".
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
index eb45fbc..b0bb87b 100644
--- a/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/makie_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of Makie knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -18,9 +15,3 @@ crawlable_urls = ["https://docs.juliahub.com/MakieGallery/Ql23q/0.2.17/",
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "makie", custom_metadata = "Makie ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384.
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index".
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
index 3a908ba..1abeeee 100644
--- a/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/plots_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of plots knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -21,9 +18,3 @@ crawlable_urls = [
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "plots", custom_metadata = "Plots ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384.
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index".
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
index ca6d472..a627b56 100644
--- a/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/sciml_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of SciML knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -43,9 +40,3 @@ single_page_urls = [
 
 index_path = make_knowledge_packs(crawlable_urls; single_urls = single_page_urls,
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "sciml", custom_metadata = "SciML ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384.
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index".
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
index f163225..3e1f6d8 100644
--- a/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
+++ b/example_scripts/creating_knowledge_packs/tidier_knowledge_pack.jl
@@ -1,8 +1,5 @@
 # The example below demonstrates the creation of Tidier knowledge pack
 
-using Pkg
-Pkg.activate(temp = true)
-Pkg.add(url = "https://github.com/JuliaGenAI/DocsScraper.jl")
 using DocsScraper
 
 # The crawler will run on these URLs to look for more URLs with the same hostname
@@ -13,9 +10,3 @@ crawlable_urls = ["https://tidierorg.github.io/Tidier.jl/dev/",
 
 index_path = make_knowledge_packs(crawlable_urls;
     target_path = joinpath("knowledge_packs", "dim=3072;chunk_size=384;Float32"), index_name = "tidier", custom_metadata = "Tidier ecosystem")
-
-# The index created here has 1024 embedding dimensions with boolean embeddings and max chunk size is 384.
-
-# The above example creates an output directory index_name which contains the sub-directories "Scraped" and "Index".
-# "Scraped" contains .jls files of chunks and sources of the scraped URLs. Index contains the created index along with a .txt file
-# containing the artifact info. The output directory also contains the URL mapping csv.
diff --git a/src/extract_package_name.jl b/src/extract_package_name.jl
index 525cecf..f5eb8ca 100644
--- a/src/extract_package_name.jl
+++ b/src/extract_package_name.jl
@@ -4,7 +4,7 @@
 Strip URL of any http:// ot https:// or www. prefixes
 """
 function clean_url(url::String)
-    # Remove http://, https://, www., or wwws.
+    # Remove http://, https://, www.
     cleaned_url = replace(url, r"^https?://(www\d?\.)?" => "")
     return cleaned_url
 end
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
index 68055a1..0591cc5 100644
--- a/src/make_knowledge_packs.jl
+++ b/src/make_knowledge_packs.jl
@@ -70,7 +70,7 @@ end
     single_urls::Vector{<:AbstractString} = String[], target_path::AbstractString = "", index_name::AbstractString = "")
 
 Validate args. Return error if both `crawlable_urls` and `single_urls` are empty.
-Create a target path if input path is invalid. Create a gensym index if the input index is inavlid.
+Create a target path if input path is invalid. Create a gensym index if the input index is invalid.
 
 # Arguments
 - crawlable_urls: URLs that should be crawled to find more links