From 51914a8831de4feab13ea1dda3f28592cee369b1 Mon Sep 17 00:00:00 2001 From: Shreyas Shirish Agrawal <48771895+splendidbug@users.noreply.github.com> Date: Sat, 17 Aug 2024 04:29:51 -0700 Subject: [PATCH] changed index name convention (#6) --- Project.toml | 2 -- docs/make.jl | 3 ++- src/DocsScraper.jl | 1 - src/make_knowledge_packs.jl | 32 ++++++++++++++++++++------------ test/crawl.jl | 34 +++++++++++++++++++++++++++++++++- 5 files changed, 55 insertions(+), 17 deletions(-) diff --git a/Project.toml b/Project.toml index ef5aaa9..b772828 100644 --- a/Project.toml +++ b/Project.toml @@ -14,7 +14,6 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" @@ -34,7 +33,6 @@ Inflate = "0.1" JSON = "0.21" LinearAlgebra = "1" PromptingTools = "0.49" -Random = "1" SHA = "0.7" Serialization = "1" SparseArrays = "1" diff --git a/docs/make.jl b/docs/make.jl index 47bd6f5..a53a41c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -20,5 +20,6 @@ makedocs(; deploydocs(; repo = "github.com/splendidbug/DocsScraper.jl", - devbranch = "main" + devbranch = "main", + branch = "gh-pages" ) diff --git a/src/DocsScraper.jl b/src/DocsScraper.jl index 0a65d57..7f114d9 100644 --- a/src/DocsScraper.jl +++ b/src/DocsScraper.jl @@ -13,7 +13,6 @@ using SHA using Serialization, URIs using Dates using JSON -using Random include("parser.jl") include("crawl.jl") diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl index a85c91f..cc5d8ff 100644 --- a/src/make_knowledge_packs.jl +++ b/src/make_knowledge_packs.jl @@ -82,6 +82,15 @@ function l2_norm_columns(vect::AbstractVector) return vect / norm_ end +""" + 
process_text(text::AbstractString) + +lowercases `text` and removes all dashes ('-') and underscores ('_') +""" +function process_text(text::AbstractString) +    return replace(lowercase(text), "-" => "", "_" => "") +end + """     generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,         embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString, @@ -160,34 +169,34 @@ function generate_embeddings(             end         end     end +    index_name = process_text(index_name)     # Generate embeddings     cost_tracker = Threads.Atomic{Float64}(0.0)     full_embeddings = RT.get_embeddings(         embedder, chunks; model, verbose = false, cost_tracker) + @info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"     full_embeddings = full_embeddings[1:embedding_size, :] |> l2_norm_columns -     if bool_embeddings         full_embeddings = map(>(0), full_embeddings)     end - -    if isempty(index_name) -        rand_int = rand(1000:100000) -        date = Dates.today() -        index_name = "$(date)-$(rand_int)" -    end - -    @info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))" -     trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0     emb_data_type = bool_embeddings ? 
"Bool" : "Float32" + date = Dates.today() + date_string = Dates.format(Dates.today(), "yyyymmdd") + if isempty(index_name) + index_name = "$(gensym("index"))" + end + + file_name = "$(index_name)__v$(date_string)__$(process_text(model))-$(embedding_size)-$(emb_data_type)__v1.0" fn_output = joinpath(knowledge_pack_path, "packs", - "$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz") + "$(file_name).tar.gz") fn_temp = joinpath(knowledge_pack_path, "packs", - "$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5") + "$(file_name).hdf5") h5open(fn_temp, "w") do file file["chunks"] = chunks @@ -269,5 +278,4 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[ generate_embeddings( knowledge_pack_path; max_chunk_size, model, embedding_size, custom_metadata, bool_embeddings, index_name) - end diff --git a/test/crawl.jl b/test/crawl.jl index 6b00ca4..efaceb4 100644 --- a/test/crawl.jl +++ b/test/crawl.jl @@ -1,4 +1,36 @@ -using DocsScraper: crawl +using DocsScraper: check_robots_txt, get_urls!, process_hostname!, get_base_url + +function crawl(input_urls::Vector{<:AbstractString}) + url_queue = Vector{AbstractString}(input_urls) + visited_url_set = Set{AbstractString}() + hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}() + sitemap_urls = Vector{AbstractString}() + + while !isempty(url_queue) + if (length(url_queue) > 2) + break + end + url = url_queue[1] + popfirst!(url_queue) + base_url = get_base_url(url) + + if !in(base_url, visited_url_set) + push!(visited_url_set, base_url) + crawlable, sitemap_urls = check_robots_txt("*", base_url) + append!(url_queue, sitemap_urls) + if crawlable + try + get_urls!(base_url, url_queue) + process_hostname!(url, hostname_url_dict) + catch + @error "Bad URL: $base_url" + end + end + end + end + + return hostname_url_dict, visited_url_set +end @testset "crawl" begin urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])