tags
+Extract URLs from the HTML or XML content at `url`
# Arguments
- url: url from which all other URLs will be extracted
- url_queue: Vector in which extracted URLs will be appended
"""
function get_urls!(url::AbstractString, url_queue::Vector{<:AbstractString})
-
@info "Scraping link: $url"
- # println(url)
- # try
fetched_content = HTTP.get(url)
parsed = Gumbo.parsehtml(String(fetched_content.body))
- if (url[end-3:end] == ".xml")
+ if (url[(end - 3):end] == ".xml")
        find_urls_xml!(url, url_queue)
else
find_urls_html!(url, parsed.root, url_queue)
end
- # print("-------------")
- # catch e
- # println("Bad URL: $url")
- # end
end
\ No newline at end of file
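For orientation, a minimal sketch of how `get_urls!` can be driven by a crawl frontier; the seed URL and the `visited` set are illustrative, and the package's real crawler in `src/crawl.jl` additionally handles robots.txt checks and hostname grouping:

```julia
# Hypothetical crawl loop around get_urls!; names other than get_urls! are illustrative
url_queue = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
visited = Set{AbstractString}()
while !isempty(url_queue)
    url = popfirst!(url_queue)
    url in visited && continue
    push!(visited, url)
    get_urls!(url, url_queue)   # appends any newly discovered links in place
end
```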
diff --git a/src/make_embeddings.jl b/src/make_embeddings.jl
deleted file mode 100644
index f51c865..0000000
--- a/src/make_embeddings.jl
+++ /dev/null
@@ -1,173 +0,0 @@
-## TODO: Make a function to Check for version number
-
-"""
- report_artifact()
-
-prints artifact information
-"""
-function report_artifact(fn_output)
- @info("ARTIFACT: $(basename(fn_output))")
- @info("sha256: ", bytes2hex(open(sha256, fn_output)))
- @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output))))
-end
-
-
-
-
-"""
- create_output_folders()
-
-Creates output folders
-"""
-function create_output_folders(knowledge_pack_path::String)
- # Define the folder path
- folder_path = joinpath(knowledge_pack_path, "packs")
- println("folder_path:", folder_path)
- # Check if the folder exists
- if !isdir(folder_path)
- mkpath(folder_path)
- @info "Folder created: $folder_path"
- else
- @info "Folder already exists: $folder_path"
- end
-
-end
-
-"""
- make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}})
-
-Parses URLs from hostname_url_dict and saves the chunks
-
-# Arguments
-- hostname_url_dict: Dict with key being hostname and value being a vector of URLs
-"""
-function make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String)
- output_chunks = Vector{SubString{String}}()
- output_sources = Vector{String}()
- SAVE_CHUNKS = true
- CHUNK_SIZE = 512
- for (hostname, urls) in hostname_url_dict
- for url in urls
- try
- chunks, sources = process_paths(url)
- append!(output_chunks, chunks)
- append!(output_sources, sources)
- catch
- @error "error!! check url: $url"
- end
- end
- if SAVE_CHUNKS
- serialize(joinpath(knowledge_pack_path, "$(hostname)-chunks-$(CHUNK_SIZE).jls"), output_chunks)
- serialize(joinpath(knowledge_pack_path, "$(hostname)-sources-$(CHUNK_SIZE).jls"), output_sources)
- end
-
- end
-
-
-end
-
-function l2_norm_columns(mat::AbstractMatrix)
- norm_ = norm.(eachcol(mat))
- return mat ./ norm_'
-end
-function l2_norm_columns(vect::AbstractVector)
- norm_ = norm(vect)
- return vect / norm_
-end
-
-
-"""
- generate_embeddings()
-
-Deserializes chunks and sources to generate embeddings
-"""
-function generate_embeddings(knowledge_pack_path::String)
- embedder = RT.BatchEmbedder()
- entries = readdir(knowledge_pack_path)
-
- # Initialize a dictionary to group files by hostname and chunk size
- hostname_files = Dict{String,Dict{Int,Dict{String,String}}}()
-
- # Regular expressions to match the file patterns
- chunks_pattern = r"^(.*)-chunks-(\d+)\.jls$"
- sources_pattern = r"^(.*)-sources-(\d+)\.jls$"
-
- # Group files by hostname and chunk size
- for file in entries
- match_chunks = match(chunks_pattern, file)
- match_sources = match(sources_pattern, file)
-
- if match_chunks !== nothing
- hostname = match_chunks.captures[1]
- chunk_size = parse(Int, match_chunks.captures[2])
- if !haskey(hostname_files, hostname)
- hostname_files[hostname] = Dict{Int,Dict{String,String}}()
- end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String,String}()
- end
- hostname_files[hostname][chunk_size]["chunks"] = joinpath(knowledge_pack_path, file)
- elseif match_sources !== nothing
- hostname = match_sources.captures[1]
- chunk_size = parse(Int, match_sources.captures[2])
- if !haskey(hostname_files, hostname)
- hostname_files[hostname] = Dict{Int,Dict{String,String}}()
- end
- if !haskey(hostname_files[hostname], chunk_size)
- hostname_files[hostname][chunk_size] = Dict{String,String}()
- end
- hostname_files[hostname][chunk_size]["sources"] = joinpath(knowledge_pack_path, file)
- end
- end
-
-
- # Process each pair of files
- for (hostname, chunk_files) in hostname_files
- for (chunk_size, files) in chunk_files
- if haskey(files, "chunks") && haskey(files, "sources")
- chunks_file = files["chunks"]
- sources_file = files["sources"]
- chunks = deserialize(chunks_file)
- sources = deserialize(sources_file)
- cost_tracker = Threads.Atomic{Float64}(0.0)
- full_embeddings = RT.get_embeddings(embedder, chunks; model="text-embedding-3-large", verbose=false, cost_tracker, dimensions=1024)
-
- fn_output = joinpath(knowledge_pack_path, "packs", "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
- fn_temp = joinpath(knowledge_pack_path, "packs", "pack.hdf5")
- h5open(fn_temp, "w") do file
- file["chunks"] = chunks
- file["sources"] = sources
- file["embeddings"] = full_embeddings[1:1024, :] |> l2_norm_columns |> x -> map(>(0), x)
- file["type"] = "ChunkIndex"
- # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
- end
- command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
- run(command)
- report_artifact(fn_output)
-
- else
- @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
- end
- end
- end
-
-end
-
-
-
-"""
- make_embeddings(input_urls::Vector{<:AbstractString})
-
-Entry point to crawl, parse and create embeddings
-
-# Arguments
-- input_urls: vector containing URL strings to parse
-"""
-function make_embeddings(input_urls::Vector{<:AbstractString})
- hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}()
- hostname_url_dict = crawl(input_urls)
- knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
- create_output_folders(knowledge_pack_path)
- make_chunks(hostname_url_dict, knowledge_pack_path)
- generate_embeddings(knowledge_pack_path)
-end
\ No newline at end of file
diff --git a/src/make_knowledge_packs.jl b/src/make_knowledge_packs.jl
new file mode 100644
index 0000000..291a9c7
--- /dev/null
+++ b/src/make_knowledge_packs.jl
@@ -0,0 +1,222 @@
+"""
+ report_artifact(fn_output)
+
+Print artifact information
+"""
+function report_artifact(fn_output)
+ @info("ARTIFACT: $(basename(fn_output))")
+ @info("sha256: ", bytes2hex(open(sha256, fn_output)))
+ @info("git-tree-sha1: ", Tar.tree_hash(IOBuffer(inflate_gzip(fn_output))))
+end
+
+"""
+ create_output_folders(knowledge_pack_path::String)
+
+Create output folders under `knowledge_pack_path`
+"""
+function create_output_folders(knowledge_pack_path::String)
+ # Define the folder path
+ folder_path = joinpath(knowledge_pack_path, "packs")
+ # Check if the folder exists
+ if !isdir(folder_path)
+ mkpath(folder_path)
+ end
+end
+
+"""
+ make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; max_chunk_size::Int=MAX_CHUNK_SIZE,
+ min_chunk_size::Int=MIN_CHUNK_SIZE)
+
+Parse URLs from hostname_url_dict and save the chunks
+
+# Arguments
+- hostname_url_dict: Dict with key being hostname and value being a vector of URLs
+- knowledge_pack_path: Knowledge pack path
+- max_chunk_size: Maximum chunk size
+- min_chunk_size: Minimum chunk size
+"""
+function make_chunks(hostname_url_dict::Dict{AbstractString, Vector{AbstractString}},
+ knowledge_pack_path::String; max_chunk_size::Int = MAX_CHUNK_SIZE,
+ min_chunk_size::Int = MIN_CHUNK_SIZE)
+ SAVE_CHUNKS = true
+ for (hostname, urls) in hostname_url_dict
+ output_chunks = Vector{SubString{String}}()
+ output_sources = Vector{String}()
+ for url in urls
+ try
+ chunks, sources = process_paths(url; max_chunk_size, min_chunk_size)
+ append!(output_chunks, chunks)
+ append!(output_sources, sources)
+ catch
+ @error "error!! check url: $url"
+ end
+ end
+ if SAVE_CHUNKS
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-chunks-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_chunks)
+ serialize(
+ joinpath(knowledge_pack_path,
+ "$(hostname)-sources-max-$(max_chunk_size)-min-$(min_chunk_size).jls"),
+ output_sources)
+ end
+ end
+end
+
+"""
+ l2_norm_columns(mat::AbstractMatrix)
+
+Normalize the columns of the input embeddings
+"""
+function l2_norm_columns(mat::AbstractMatrix)
+ norm_ = norm.(eachcol(mat))
+ return mat ./ norm_'
+end
+
+"""
+ l2_norm_columns(vect::AbstractVector)
+
+Normalize the input embedding vector
+"""
+function l2_norm_columns(vect::AbstractVector)
+ norm_ = norm(vect)
+ return vect / norm_
+end
+
+"""
+ generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+
+Deserialize chunks and sources to generate embeddings
+
+# Arguments
+- knowledge_pack_path: Path to the folder containing the serialized chunks and sources
+- model: Embedding model
+- embedding_size: Embedding dimensions
+"""
+function generate_embeddings(knowledge_pack_path::String; model::AbstractString = MODEL,
+ embedding_size::Int = EMBEDDING_SIZE)
+ embedder = RT.BatchEmbedder()
+ entries = readdir(knowledge_pack_path)
+ # Initialize a dictionary to group files by hostname and chunk size
+ hostname_files = Dict{String, Dict{Int, Dict{String, String}}}()
+
+ # Regular expressions to match the file patterns of chunks and sources
+ chunks_pattern = r"^(.*)-chunks-max-(\d+)-min-(\d+)\.jls$"
+ sources_pattern = r"^(.*)-sources-max-(\d+)-min-(\d+)\.jls$"
+
+ # Group files by hostname and chunk size
+ for file in entries
+ match_chunks = match(chunks_pattern, file)
+ match_sources = match(sources_pattern, file)
+
+ if match_chunks !== nothing
+ hostname = match_chunks.captures[1]
+ chunk_size = parse(Int, match_chunks.captures[2])
+ if !haskey(hostname_files, hostname)
+ hostname_files[hostname] = Dict{Int, Dict{String, String}}()
+ end
+ if !haskey(hostname_files[hostname], chunk_size)
+ hostname_files[hostname][chunk_size] = Dict{String, String}()
+ end
+ hostname_files[hostname][chunk_size]["chunks"] = joinpath(
+ knowledge_pack_path, file)
+ elseif match_sources !== nothing
+ hostname = match_sources.captures[1]
+ chunk_size = parse(Int, match_sources.captures[2])
+ if !haskey(hostname_files, hostname)
+ hostname_files[hostname] = Dict{Int, Dict{String, String}}()
+ end
+ if !haskey(hostname_files[hostname], chunk_size)
+ hostname_files[hostname][chunk_size] = Dict{String, String}()
+ end
+ hostname_files[hostname][chunk_size]["sources"] = joinpath(
+ knowledge_pack_path, file)
+ end
+ end
+ # Process each pair of files
+ for (hostname, chunk_files) in hostname_files
+ for (chunk_size, files) in chunk_files
+ if haskey(files, "chunks") && haskey(files, "sources")
+ chunks_file = files["chunks"]
+ sources_file = files["sources"]
+ chunks = deserialize(chunks_file)
+ sources = deserialize(sources_file)
+ cost_tracker = Threads.Atomic{Float64}(0.0)
+ full_embeddings = RT.get_embeddings(
+ embedder, chunks; model, verbose = false, cost_tracker)
+ @info "Created embeddings for $hostname. Cost: \$$(round(cost_tracker[], digits=3))"
+ fn_output = joinpath(knowledge_pack_path, "packs",
+ "$hostname-textembedding3large-0-Float32__v1.0.tar.gz")
+ fn_temp = joinpath(knowledge_pack_path, "packs",
+ "$hostname-textembedding3large-0-Float32__v1.0.hdf5")
+ h5open(fn_temp, "w") do file
+ file["chunks"] = chunks
+ file["sources"] = sources
+ file["embeddings"] = full_embeddings[1:embedding_size, :] |>
+ l2_norm_columns |> x -> map(>(0), x)
+ file["type"] = "ChunkIndex"
+ # file["metadata"] = "$hostname ecosystem docstrings, chunk size $chunk_size, downloaded on 20240330, contains: Makie.jl, AlgebraOfGraphics.jl, GeoMakie.jl, GraphMakie.jl, MakieThemes.jl, TopoPlots.jl, Tyler.jl"
+ end
+
+ command = `tar -cvzf $fn_output -C $(dirname(fn_temp)) $(basename(fn_temp))`
+ run(command)
+ report_artifact(fn_output)
+
+ else
+ @warn "Missing pair for hostname: $hostname, chunk size: $chunk_size"
+ end
+ end
+ end
+end
+
+"""
+ make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
+ max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE)
+
+Entry point to crawl, parse and generate embeddings
+
+# Arguments
+- crawlable_urls: URLs that should be crawled to find more links
+- single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs
+- max_chunk_size: Maximum chunk size
+- min_chunk_size: Minimum chunk size
+- model: Embedding model
+- embedding_size: Embedding dimensions
+"""
+function make_knowledge_packs(crawlable_urls::Vector{<:AbstractString} = String[];
+ single_urls::Vector{<:AbstractString} = String[],
+ max_chunk_size::Int = MAX_CHUNK_SIZE, min_chunk_size::Int = MIN_CHUNK_SIZE,
+ model::AbstractString = MODEL, embedding_size::Int = EMBEDDING_SIZE)
+ if isempty(crawlable_urls) && isempty(single_urls)
+        error("At least one of `crawlable_urls` or `single_urls` must be provided.")
+ end
+
+ hostname_url_dict = Dict{AbstractString, Vector{AbstractString}}()
+
+ if !isempty(crawlable_urls)
+ hostname_url_dict, visited_url_set = crawl(crawlable_urls)
+ else
+ visited_url_set = Set{AbstractString}()
+ end
+ for url in single_urls
+ base_url = get_base_url(url)
+ if !in(base_url, visited_url_set)
+ push!(visited_url_set, base_url)
+ crawlable, sitemap_urls = check_robots_txt("*", base_url)
+ if crawlable
+ try
+ process_hostname!(url, hostname_url_dict)
+ catch
+ @error "Bad URL: $base_url"
+ end
+ end
+ end
+ end
+ knowledge_pack_path = joinpath(@__DIR__, "..", "knowledge_packs")
+ create_output_folders(knowledge_pack_path)
+ make_chunks(hostname_url_dict, knowledge_pack_path; max_chunk_size, min_chunk_size)
+ generate_embeddings(knowledge_pack_path; model, embedding_size)
+end
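For reference, a hedged usage sketch of the new entry point; the URLs and keyword values below are placeholders, and the defaults come from `src/user_preferences.jl`:

```julia
# Illustrative invocation only; the URLs are placeholders
make_knowledge_packs(
    ["https://docs.makie.org/stable/"];                    # crawlable_urls: crawled recursively
    single_urls = ["https://example.org/docs/page.html"],  # scraped as-is, no link discovery
    max_chunk_size = 384,
    min_chunk_size = 40,
    model = "text-embedding-3-large",
    embedding_size = 1024)
```

The serialized chunks/sources and the final `.tar.gz` artifact land in `knowledge_packs/` and its `packs/` subfolder next to `src/`.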
diff --git a/src/parser.jl b/src/parser.jl
index d909280..def1a17 100644
--- a/src/parser.jl
+++ b/src/parser.jl
@@ -1,21 +1,3 @@
-"""
-Working:
-
-Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks
-ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion.
-For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks.
-If the current node is a code block, return the text inside code block with backticks.
-If the node is neither heading nor code, then we'll need to go deeper in the hierarchy.
-if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td]
-it is assumed that everything inside the tag is part of a single text block with inline code.
-But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false.
-To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration
-that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again.
-We indicate this by a return flag is_text_inserted
-"""
-
-
-
"""
insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
parsed_blocks::Vector{Dict{String,Any}},
@@ -30,11 +12,10 @@ Insert the text into parsed_blocks Vector
- text_to_insert: Text to be inserted
- text_type: The text to be inserted could be heading or a code block or just text
"""
-function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- text_to_insert::AbstractString,
- text_type::AbstractString)
-
+function insert_parsed_data!(heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ text_to_insert::AbstractString,
+ text_type::AbstractString)
if !isempty(strip(text_to_insert))
push!(parsed_blocks,
Dict(text_type => strip(text_to_insert),
@@ -42,8 +23,6 @@ function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
end
end
-
-
"""
process_headings!(node::Gumbo.HTMLElement,
heading_hierarchy::Dict{Symbol,Any},
@@ -57,13 +36,13 @@ Process headings. If the current node is heading, directly insert into parsed_bl
- parsed_blocks: Vector of Dicts to store parsed text and metadata
"""
function process_headings!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}})
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}})
tag_name = Gumbo.tag(node)
# Clear headings of equal or lower level
for k in collect(keys(heading_hierarchy))
- if k != "header" && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
+ if k != "header" &&
+ Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
delete!(heading_hierarchy, k)
end
end
@@ -123,11 +102,10 @@ If the node is neither heading nor code
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_generic_node!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
seekstart(prev_text_buffer)
prev_text = read(prev_text_buffer, String)
@@ -142,10 +120,15 @@ function process_generic_node!(node::Gumbo.HTMLElement,
# if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless,
# there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted.
- if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header]
- received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer)
+ if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i,
+ :cite, :address, :em, :td, :a, :span, :header]
+ received_text, is_code_block, is_text_inserted = process_node!(
+ child, heading_hierarchy, parsed_blocks, false, prev_text_buffer)
+ elseif tag_name in [:script]
+ continue
else
- received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+ received_text, is_code_block, is_text_inserted = process_node!(
+ child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
# changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call)
@@ -180,7 +163,6 @@ function process_generic_node!(node::Gumbo.HTMLElement,
print(prev_text_buffer, " " * received_text)
text_to_insert = text_to_insert * " " * received_text
end
-
end
# if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence,
@@ -195,7 +177,8 @@ function process_generic_node!(node::Gumbo.HTMLElement,
# if we're insert text in current node level, then we should insert the previous text if available,
# otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird
if !isempty(strip(text_to_insert))
- insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text")
+ insert_parsed_data!(
+ heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text")
is_text_inserted = true
end
@@ -205,7 +188,6 @@ function process_generic_node!(node::Gumbo.HTMLElement,
return "", is_code_block, is_text_inserted
end
-
"""
process_docstring!(node::Gumbo.HTMLElement,
heading_hierarchy::Dict{Symbol,Any},
@@ -224,11 +206,10 @@ Function to process node of class `docstring`
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_docstring!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
seekstart(prev_text_buffer)
prev_text = read(prev_text_buffer, String)
is_code_block = false
@@ -248,10 +229,12 @@ function process_docstring!(node::Gumbo.HTMLElement,
# Insert "header"
if Gumbo.tag(children[1]) == :header
heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1]))
- insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header")
+ insert_parsed_data!(
+ heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header")
end
- received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+ received_text, is_code_block, is_text_inserted = process_node!(
+ children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
if !isempty(strip(received_text))
insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text")
@@ -279,11 +262,10 @@ Function to process a node
- prev_text_buffer: IO Buffer which contains previous text
"""
function process_node!(node::Gumbo.HTMLElement,
- heading_hierarchy::Dict{Symbol,Any},
- parsed_blocks::Vector{Dict{String,Any}},
- child_new::Bool=true,
- prev_text_buffer::IO=IOBuffer(write=true))
-
+ heading_hierarchy::Dict{Symbol, Any},
+ parsed_blocks::Vector{Dict{String, Any}},
+ child_new::Bool = true,
+ prev_text_buffer::IO = IOBuffer(write = true))
tag_name = Gumbo.tag(node)
if startswith(string(tag_name), "h") && isdigit(last(string(tag_name)))
return process_headings!(node, heading_hierarchy, parsed_blocks)
@@ -292,15 +274,14 @@ function process_node!(node::Gumbo.HTMLElement,
return process_code(node)
elseif tag_name == :article && getattr(node, "class", "") == "docstring"
- return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
-
+ return process_docstring!(
+ node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
- return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
-
+ return process_generic_node!(
+ node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
end
-
"""
multiple dispatch for process_node!() when node is of type Gumbo.HTMLText
"""
@@ -310,14 +291,10 @@ function process_node!(node::Gumbo.HTMLText, args...)
return strip(Gumbo.text(node)), is_code_block, is_text_inserted
end
-
"""
get_base_url(url::AbstractString)
-Extracts the base url.
-
-# Arguments
-- `url`: The url string of which, the base url needs to be extracted
+Extract the base url.
"""
function get_base_url(url::AbstractString)
parsed_url = URIs.URI(url)
@@ -329,7 +306,7 @@ end
"""
get_html_content(root::Gumbo.HTMLElement)
-Returns the main content of the HTML. If not found, returns the whole HTML to parse
+Return the main content of the HTML. If not found, return the whole HTML to parse
# Arguments
- `root`: The HTML root from which content is extracted
@@ -338,73 +315,34 @@ function get_html_content(root::Gumbo.HTMLElement)
target_ids = Set(["VPContent", "main_content_wrap", "pages-content"])
target_classes = Set(["content", "franklin-content"])
- content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
+ content_candidates = [el
+ for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
# First try to find by ID
- content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates)
+ content_by_id = filter(
+ el -> getattr(el, "id", nothing) in target_ids, content_candidates)
if !isempty(content_by_id)
return only(content_by_id)
end
# Fallback to class if no ID matches
- content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates)
+ content_by_class = filter(
+ el -> getattr(el, "class", nothing) in target_classes, content_candidates)
if !isempty(content_by_class)
return only(content_by_class)
end
# Fallback to the root node if no class matches
return root
-
end
-
"""
parse_url(url::AbstractString)
-Initiator and main function to parse HTML from url
+Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
# Arguments
- `url`: URL string to parse
-
-# Returns
-- A Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata
-
-# Usage
-parsed_blocks = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/")
-
-# Example
-Let the HTML be:
-
-<html>
-<body>
-<div>
-    <h1>Heading 1</h1>
-    <h2>Heading 2</h2>
-    <p>para 1</p>
-    <h3>Heading 3</h3>
-    <code>this is my code block</code>
-    <h3>This is another h3 under Heading 2</h3>
-    <p>This is a paragraph with <code>inline code</code></p>
-
-    <h2>Heading 2_2</h2>
-    <p>para ewg</p>
-</div>
-</body>
-</html>
-
-Output:
-Any[
- Dict{String, Any}("URL" => "URL")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
- Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
- Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
-]
"""
function parse_url_to_blocks(url::AbstractString)
@@ -419,8 +357,8 @@ function parse_url_to_blocks(url::AbstractString)
# title = [el
# for el in AbstractTrees.PreOrderDFS(r_parsed.root)
# if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
- parsed_blocks = Vector{Dict{String,Any}}([Dict("Source" => base_url)])
- heading_hierarchy = Dict{Symbol,Any}()
+ parsed_blocks = Vector{Dict{String, Any}}([Dict("Source" => base_url)])
+ heading_hierarchy = Dict{Symbol, Any}()
process_node!(get_html_content(parsed.root), heading_hierarchy, parsed_blocks)
return parsed_blocks
catch
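For context, a short sketch of the parser output shape (mirroring the example that was removed from the docstring above); the URL and the values shown in the comments are illustrative:

```julia
# Sketch only: parse a page into blocks and inspect the result
parsed_blocks = parse_url_to_blocks("https://docs.julialang.org/en/v1/base/multi-threading/")

# parsed_blocks is a Vector{Dict{String,Any}}; roughly:
#   Dict("Source"  => "<base URL of the page>")
#   Dict("heading" => "Multi-Threading", "metadata" => Dict(:h1 => "Multi-Threading"))
#   Dict("text"    => "...",             "metadata" => Dict(:h1 => "Multi-Threading"))
#   Dict("code"    => "...",             "metadata" => Dict(:h1 => "Multi-Threading"))
```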
diff --git a/src/preparation.jl b/src/preparation.jl
index ab8d7b5..9979155 100644
--- a/src/preparation.jl
+++ b/src/preparation.jl
@@ -1,9 +1,7 @@
-# include("recursive_splitter.jl")
-include("utils.jl")
"""
get_header_path(d::Dict)
-Concatenates the h1, h2, h3 keys from the metadata of a Dict
+Concatenate the h1, h2, h3 keys from the metadata of a Dict
# Examples
```julia
@@ -12,7 +10,7 @@ get_header_path(d)
# Output: "Axis/Attributes/yzoomkey"
```
"""
-function get_header_path(d::Dict)
+function get_header_path(d::Dict{String,Any})
metadata = get(d, "metadata", Dict{Any,Any}())
isempty(metadata) && return nothing
keys_ = [:h1, :h2, :h3]
@@ -21,8 +19,13 @@ function get_header_path(d::Dict)
end
-"Roll-up chunks (that have the same header!), so we can split them later by to get the desired length"
-function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="<SEP>")
+
+"""
+    roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="<SEP>")
+
+Roll up chunks (that have the same header!), so we can split them later to get the desired length
+"""
+function roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="<SEP>")
docs = String[]
io = IOBuffer()
last_header = nothing
@@ -35,7 +38,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="<SEP>")
str = String(take!(io))
if !isempty(str)
push!(docs, str)
- src = url * (isnothing(last_header) ? "" : "::$last_header")
+ src = url * (isnothing(last_header) ? "" : " - $last_header")
push!(sources, src)
end
last_header = header
@@ -48,7 +51,7 @@ function roll_up_chunks(parsed_blocks, url::AbstractString; separator::String="<SEP>")
str = String(take!(io))
if !isempty(str)
push!(docs, str)
- src = url * (isnothing(last_header) ? "" : "::$last_header")
+ src = url * (isnothing(last_header) ? "" : " - $last_header")
push!(sources, src)
end
return docs, sources
@@ -56,19 +59,23 @@ end
struct DocParserChunker <: RT.AbstractChunker end
-"""
- RT.get_chunks(chunker::DocParserChunker,
- html_files::Vector{<:AbstractString};
- sources::AbstractVector{<:AbstractString}=html_files,
- verbose::Bool=true,
- separators=["\n\n", ". ", "\n", " "], max_length::Int=256)
-Extracts chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length.
+"""
+ RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
+ verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
+
+Extract chunks from the HTML page at `url` by parsing its content, rolling up chunks by headers,
+and splitting them by separators to get the desired length.
+
+# Arguments
+- chunker: DocParserChunker
+- url: URL of the webpage to extract chunks
+- verbose: Bool to print the log
+- separators: Chunk separators
+- max_chunk_size: Maximum chunk size
"""
function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
- verbose::Bool=true,
- separators=["\n\n", ". ", "\n", " "], max_length::Int=256)
-
+ verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)
SEP = ""
sources = AbstractVector{<:AbstractString}
@@ -84,8 +91,9 @@ function RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
## roll up chunks by SEP splitter, then remove it later
for (doc, src) in zip(docs_, sources_)
## roll up chunks by SEP splitter, then remove it later
- doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length) .|>
+ doc_chunks = PT.recursive_splitter(doc, [SEP, separators...]; max_length=max_chunk_size) .|>
x -> replace(x, SEP => " ") .|> strip |> x -> filter(!isempty, x)
+ chunk_lengths = length.(doc_chunks)
# skip if no chunks found
isempty(doc_chunks) && continue
append!(output_chunks, doc_chunks)
@@ -96,20 +104,24 @@ end
-"Process folders provided in `paths`. In each, take all HTML files, scrape them, chunk them and postprocess them."
-function process_paths(url::AbstractString, max_length::Int=512)
+"""
+ process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
+
+Process the page at `url`: scrape it, chunk the content and postprocess the chunks.
+"""
+function process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)
output_chunks = Vector{SubString{String}}()
output_sources = Vector{String}()
- chunks, sources = RT.get_chunks(DocParserChunker(), url; max_length)
+ chunks, sources = RT.get_chunks(DocParserChunker(), url; max_chunk_size)
append!(output_chunks, chunks)
append!(output_sources, sources)
@info "Scraping done: $(length(output_chunks)) chunks"
- postprocess_chunks(output_chunks, output_sources; min_length=40, skip_code=true)
+ output_chunks, output_sources = postprocess_chunks(output_chunks, output_sources; min_chunk_size, skip_code=true)
return output_chunks, output_sources
end
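A minimal sketch of the roll-up-and-split step that `RT.get_chunks` performs, assuming PromptingTools (`PT`) provides `recursive_splitter` and that `<SEP>` is the roll-up marker used by `roll_up_chunks`; the document text is made up:

```julia
import PromptingTools as PT

SEP = "<SEP>"   # roll-up marker, assumed to match the separator used by roll_up_chunks
doc = "Makie / Axis" * SEP * "Attributes control the look of the axis." * SEP *
      "The yzoomkey attribute sets the zoom key."

# Split on the roll-up marker first, then on natural text boundaries, then drop the marker
doc_chunks = PT.recursive_splitter(doc, [SEP, "\n\n", ". ", "\n", " "]; max_length = 64)
doc_chunks = filter(!isempty, [strip(replace(c, SEP => " ")) for c in doc_chunks])
```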
diff --git a/src/user_preferences.jl b/src/user_preferences.jl
new file mode 100644
index 0000000..98794c6
--- /dev/null
+++ b/src/user_preferences.jl
@@ -0,0 +1,4 @@
+global MIN_CHUNK_SIZE = 40
+global MAX_CHUNK_SIZE = 256
+global MODEL = "text-embedding-3-large"
+global EMBEDDING_SIZE = 1024
\ No newline at end of file
diff --git a/src/utils.jl b/src/utils.jl
index 4bf1e07..e8dc014 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -1,4 +1,9 @@
-"Finds duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where `true` indicates a duplicate (second instance of the same text)."
+"""
+ find_duplicates(chunks::AbstractVector{<:AbstractString})
+
+Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list,
+where `true` indicates a duplicate (second instance of the same text).
+"""
function find_duplicates(chunks::AbstractVector{<:AbstractString})
# hash the chunks for easier search
hashed_chunks = bytes2hex.(sha256.(chunks))
@@ -20,20 +25,34 @@ function find_duplicates(chunks::AbstractVector{<:AbstractString})
return duplicates
end
-"Removes chunks that are duplicated in the input list of chunks and their corresponding sources."
+"""
+ remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
+
+Remove chunks that are duplicated in the input list of chunks and their corresponding sources.
+"""
function remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})
idxs = find_duplicates(chunks)
return chunks[.!idxs], sources[.!idxs]
end
-"Removes chunks that are shorter than a specified length (`min_length`) from the input list of chunks and their corresponding sources."
-function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true)
+
+"""
+ remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+Remove chunks that are shorter than a specified length (`min_chunk_size`) from the input list of chunks and their corresponding sources.
+"""
+function remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)
+
+ chunk_lengths = length.(chunks)
idx = if skip_code
- ## Keep short chunks if they contain code (might be combined with some preceding/suceeeding text)
- findall(x -> length(x) >= min_length || occursin("```", x), chunks)
+ ## Keep short chunks if they contain code (might be combined with some preceding/succeeding text)
+ findall(x -> length(x) >= min_chunk_size || occursin("```", x), chunks)
else
- findall(x -> length(x) >= min_length, chunks)
+ findall(x -> length(x) >= min_chunk_size, chunks)
end
+ chunk_lengths = length.(chunks[idx])
return chunks[idx], sources[idx]
end
@@ -42,14 +61,24 @@ function replace_local_paths(sources::AbstractVector{<:AbstractString}, paths::A
@assert length(paths) == length(websites) "Length of `paths` must match length of `websites`"
replacement_pairs = paths .=> websites
output = map(x -> replace(x, replacement_pairs...), sources)
+ return output
end
-"Post-processes the input list of chunks and their corresponding sources by removing short chunks and duplicates."
-function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString}; min_length::Int=40, skip_code::Bool=true,
- paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing, websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+
+
+"""
+    postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+ websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
+
+Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.
+"""
+function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+ min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
+ websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)
len_ = length(chunks)
- chunks, sources = remove_short_chunks(chunks, sources; min_length, skip_code)
+ chunks, sources = remove_short_chunks(chunks, sources; min_chunk_size, skip_code)
@info "Removed $(len_ - length(chunks)) short chunks"
len_ = length(chunks)
@@ -63,6 +92,31 @@ function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::A
end
return chunks, sources
+end
+
+"""
+    remove_urls_from_index(index_path::AbstractString, prefix_urls::Vector{<:AbstractString})
+
+Remove chunks and sources corresponding to URLs starting with `prefix_urls`
+"""
+function remove_urls_from_index(index_path::AbstractString, prefix_urls::Vector{<:AbstractString})
+    @assert endswith(index_path, ".hdf5") "Provided file path must end with `.hdf5` (see HDF5.jl)."
+ h5open(index_path, "r+") do orig_file
+ # Load the sources dataset into a Julia array
+ sources = read(orig_file["sources"])
+ chunks = read(orig_file["chunks"])
+ embeddings = read(orig_file["embeddings"])
+ for url_to_remove in prefix_urls
+ indices_to_remove = findall(x -> startswith(x, url_to_remove), sources)
+ sources = deleteat!(sources, indices_to_remove)
+ chunks = deleteat!(chunks, indices_to_remove)
+ embeddings = embeddings[:, setdiff(1:size(embeddings, 2), indices_to_remove)]
+ end
+
+        # Recreate the datasets, since the filtered arrays are smaller than the originals
+        delete_object(orig_file, "sources");    orig_file["sources"] = sources
+        delete_object(orig_file, "chunks");     orig_file["chunks"] = chunks
+        delete_object(orig_file, "embeddings"); orig_file["embeddings"] = embeddings
+ end
end
\ No newline at end of file
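A small sketch of the deduplication and post-processing helpers above, with made-up chunk values; it assumes the functions from `src/utils.jl` and the globals from `src/user_preferences.jl` are in scope:

```julia
chunks  = ["Axis attributes", "Axis attributes", "```julia\nlines(1:10)\n```", "Short note"]
sources = ["https://example.org/a", "https://example.org/a", "https://example.org/b", "https://example.org/c"]

# Flags later occurrences of identical text (SHA-256 based), e.g. the repeated first chunk
dups = find_duplicates(chunks)

# Removes short chunks (< min_chunk_size characters) and then duplicates;
# short chunks containing fenced code are kept when skip_code=true
chunks, sources = postprocess_chunks(chunks, sources;
    min_chunk_size = MIN_CHUNK_SIZE, skip_code = true)
```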
diff --git a/test/runtests.jl b/test/runtests.jl
index 78a78b4..4b4a92c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,21 +10,22 @@ using LinearAlgebra, Unicode, SparseArrays
using HDF5
using Tar
using Inflate
-
using SHA
using Serialization, URIs
-include("..\\src\\crawl.jl")
-include("..\\src\\extract_urls.jl")
-include("..\\src\\parser.jl")
-include("..\\src\\preparation.jl")
+include(joinpath("..", "src", "crawl.jl"))
+include(joinpath("..", "src", "extract_urls.jl"))
+include(joinpath("..", "src", "parser.jl"))
+include(joinpath("..", "src", "preparation.jl"))
+include(joinpath("..", "src", "user_preferences.jl"))
+include(joinpath("..", "src", "utils.jl"))
+
urls = Vector{AbstractString}(["https://docs.julialang.org/en/v1/"])
url = urls[1]
queue = Vector{AbstractString}()
-@testset "check robots.txt" begin
+@testset "HTTP" begin
@test HTTP.get(url) != nothing
-
result, sitemap_queue = check_robots_txt("*", url)
@test result == true
end
@@ -38,12 +39,13 @@ end
parsed_blocks = parse_url_to_blocks(url)
@test length(parsed_blocks) > 0
SEP = ""
- docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator=SEP)
- @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing && sources_[1] != nothing
+ docs_, sources_ = roll_up_chunks(parsed_blocks, url; separator = SEP)
+ @test length(docs_) > 0 && length(sources_) > 0 && docs_[1] != nothing &&
+ sources_[1] != nothing
end
@testset "overall test" begin
chunks, sources = process_paths(url)
- @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing && sources[1] != nothing
-
+ @test length(chunks) > 0 && length(sources) > 0 && chunks[1] != nothing &&
+ sources[1] != nothing
end