Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

changed index name convention #6

Merged
merged 1 commit into from
Aug 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Expand All @@ -34,7 +33,6 @@ Inflate = "0.1"
JSON = "0.21"
LinearAlgebra = "1"
PromptingTools = "0.49"
Random = "1"
SHA = "0.7"
Serialization = "1"
SparseArrays = "1"
Expand Down
3 changes: 2 additions & 1 deletion docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ makedocs(;

deploydocs(;
repo = "github.com/splendidbug/DocsScraper.jl",
devbranch = "main"
devbranch = "main",
branch = "gh-pages"
)
1 change: 0 additions & 1 deletion src/DocsScraper.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ using SHA
using Serialization, URIs
using Dates
using JSON
using Random

include("parser.jl")
include("crawl.jl")
Expand Down
32 changes: 20 additions & 12 deletions src/make_knowledge_packs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ function l2_norm_columns(vect::AbstractVector)
return vect / norm_
end

"""
    process_text(text::AbstractString)

Normalize `text` for use as a name component: lowercase it and remove every
dash (`-`) and underscore (`_`).

# Arguments
- `text::AbstractString`: the string to normalize.

# Returns
A lowercased copy of `text` with all `-` and `_` characters removed.

# Examples
```julia
julia> process_text("My-Index_Name")
"myindexname"
```
"""
function process_text(text::AbstractString)
    # A single character-class regex strips both separators; unlike the
    # multi-pair form of `replace`, it does not require Julia 1.7+.
    return replace(lowercase(text), r"[-_]" => "")
end

"""
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,
Expand Down Expand Up @@ -160,34 +169,34 @@ function generate_embeddings(
end
end
end
index_name = process_text(index_name)

# Generate embeddings
cost_tracker = Threads.Atomic{Float64}(0.0)
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)
@info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"

full_embeddings = full_embeddings[1:embedding_size, :] |>
l2_norm_columns

if bool_embeddings
full_embeddings = map(>(0), full_embeddings)
end

if isempty(index_name)
rand_int = rand(1000:100000)
date = Dates.today()
index_name = "$(date)-$(rand_int)"
end

@info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"

trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
emb_data_type = bool_embeddings ? "Bool" : "Float32"
date = Dates.today()
date_string = Dates.format(Dates.today(), "yyyymmdd")

if isempty(index_name)
index_name = "$(gensym("index"))"
end

file_name = "$(index_name)__v$(date_string)__$(process_text(model))-$(embedding_size)-$(emb_data_type)__v1.0"
fn_output = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz")
"$(file_name).tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5")
"$(file_name).hdf5")

h5open(fn_temp, "w") do file
file["chunks"] = chunks
Expand Down Expand Up @@ -269,5 +278,4 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
generate_embeddings(
knowledge_pack_path; max_chunk_size, model, embedding_size,
custom_metadata, bool_embeddings, index_name)

end
34 changes: 33 additions & 1 deletion test/crawl.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,36 @@
using DocsScraper: crawl
using DocsScraper: check_robots_txt, get_urls!, process_hostname!, get_base_url

"""
    crawl(input_urls::Vector{<:AbstractString})

Test-local crawl loop seeded with `input_urls`.

URLs are taken from the front of a queue. For each base URL not seen before,
`check_robots_txt` is consulted, any sitemap URLs it returns are queued, and —
when crawling is allowed — `get_urls!` and `process_hostname!` are called.
The loop stops when the queue is empty or as soon as it holds more than two
entries.

Returns the tuple `(hostname_url_dict, visited_url_set)`.
"""
function crawl(input_urls::Vector{<:AbstractString})
    pending = Vector{AbstractString}(input_urls)
    seen_bases = Set{AbstractString}()
    urls_by_host = Dict{AbstractString, Vector{AbstractString}}()

    while !isempty(pending)
        # Bail out once the queue has grown past two entries.
        length(pending) > 2 && break

        current = popfirst!(pending)
        base_url = get_base_url(current)

        # Each base URL is handled at most once.
        base_url in seen_bases && continue
        push!(seen_bases, base_url)

        allowed, sitemaps = check_robots_txt("*", base_url)
        append!(pending, sitemaps)
        allowed || continue

        try
            get_urls!(base_url, pending)
            process_hostname!(current, urls_by_host)
        catch
            @error "Bad URL: $base_url"
        end
    end

    return urls_by_host, seen_bases
end

@testset "crawl" begin
urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
Expand Down
Loading