Skip to content

Commit

Permalink
changed index name convention (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
splendidbug authored Aug 17, 2024
1 parent f6e2dc6 commit 51914a8
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 17 deletions.
2 changes: 0 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Expand All @@ -34,7 +33,6 @@ Inflate = "0.1"
JSON = "0.21"
LinearAlgebra = "1"
PromptingTools = "0.49"
Random = "1"
SHA = "0.7"
Serialization = "1"
SparseArrays = "1"
Expand Down
3 changes: 2 additions & 1 deletion docs/make.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ makedocs(;

deploydocs(;
repo = "github.com/splendidbug/DocsScraper.jl",
devbranch = "main"
devbranch = "main",
branch = "gh-pages"
)
1 change: 0 additions & 1 deletion src/DocsScraper.jl
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ using SHA
using Serialization, URIs
using Dates
using JSON
using Random

include("parser.jl")
include("crawl.jl")
Expand Down
32 changes: 20 additions & 12 deletions src/make_knowledge_packs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ function l2_norm_columns(vect::AbstractVector)
return vect / norm_
end

"""
    process_text(text::AbstractString)

Normalize `text` for use in index and file names: lowercase it and remove
every dash (`'-'`) and underscore (`'_'`) character. Returns the normalized
string (the input is not modified).
"""
function process_text(text::AbstractString)
    # lowercase first, then strip both separator characters in one pass
    return replace(lowercase(text), "-" => "", "_" => "")
end

"""
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL,
embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,
Expand Down Expand Up @@ -160,34 +169,34 @@ function generate_embeddings(
end
end
end
index_name = process_text(index_name)

# Generate embeddings
cost_tracker = Threads.Atomic{Float64}(0.0)
full_embeddings = RT.get_embeddings(
embedder, chunks; model, verbose = false, cost_tracker)
@info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"

full_embeddings = full_embeddings[1:embedding_size, :] |>
l2_norm_columns

if bool_embeddings
full_embeddings = map(>(0), full_embeddings)
end

if isempty(index_name)
rand_int = rand(1000:100000)
date = Dates.today()
index_name = "$(date)-$(rand_int)"
end

@info "Created embeddings for $index_name. Cost: \$$(round(cost_tracker[], digits=3))"

trunc = embedding_size < EMBEDDING_SIZE ? 1 : 0
emb_data_type = bool_embeddings ? "Bool" : "Float32"
date = Dates.today()
date_string = Dates.format(Dates.today(), "yyyymmdd")

if isempty(index_name)
index_name = "$(gensym("index"))"
end

file_name = "$(index_name)__v$(date_string)__$(process_text(model))-$(embedding_size)-$(emb_data_type)__v1.0"
fn_output = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.tar.gz")
"$(file_name).tar.gz")
fn_temp = joinpath(knowledge_pack_path, "packs",
"$index_name-$model-$trunc-$(emb_data_type)__v1.0.hdf5")
"$(file_name).hdf5")

h5open(fn_temp, "w") do file
file["chunks"] = chunks
Expand Down Expand Up @@ -269,5 +278,4 @@ function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[
generate_embeddings(
knowledge_pack_path; max_chunk_size, model, embedding_size,
custom_metadata, bool_embeddings, index_name)

end
34 changes: 33 additions & 1 deletion test/crawl.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,36 @@
using DocsScraper: crawl
using DocsScraper: check_robots_txt, get_urls!, process_hostname!, get_base_url

"""
    crawl(input_urls::Vector{<:AbstractString})

Test-scoped crawler stub: processes URLs from `input_urls` breadth-first,
stopping early once the work queue grows beyond 2 entries so the test stays
fast. For each unvisited base URL it consults `check_robots_txt` and, when
crawling is allowed, collects page links via `get_urls!` and records them per
hostname via `process_hostname!`. Returns `(hostname_url_dict, visited_url_set)`.
"""
function crawl(input_urls::Vector{<:AbstractString})
    pending = Vector{AbstractString}(input_urls)
    seen_bases = Set{AbstractString}()
    urls_by_host = Dict{AbstractString, Vector{AbstractString}}()

    while !isempty(pending)
        # early exit keeps the test bounded — same cutoff as before
        length(pending) > 2 && break
        current = popfirst!(pending)
        base = get_base_url(current)

        # only visit each base URL once
        base in seen_bases && continue
        push!(seen_bases, base)

        crawlable, robots_sitemap = check_robots_txt("*", base)
        append!(pending, robots_sitemap)
        if crawlable
            try
                get_urls!(base, pending)
                process_hostname!(current, urls_by_host)
            catch
                # best-effort: log and move on, matching the crawler's behavior
                @error "Bad URL: $base"
            end
        end
    end

    return urls_by_host, seen_bases
end

@testset "crawl" begin
urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
Expand Down

0 comments on commit 51914a8

Please sign in to comment.