diff --git a/dev/.documenter-siteinfo.json b/dev/.documenter-siteinfo.json
index 9dffc0c..c69e5f9 100644
--- a/dev/.documenter-siteinfo.json
+++ b/dev/.documenter-siteinfo.json
@@ -1 +1 @@
-{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-08-15T09:43:16","documenter_version":"1.5.0"}}
\ No newline at end of file
+{"documenter":{"julia_version":"1.10.4","generation_timestamp":"2024-08-16T03:23:52","documenter_version":"1.5.0"}}
\ No newline at end of file
diff --git a/dev/index.html b/dev/index.html
index 7303503..904c2d4 100644
--- a/dev/index.html
+++ b/dev/index.html
@@ -1,30 +1,31 @@
-API Index · DocsScraper.jl

Reference

DocsScraper.check_robots_txtMethod
check_robots_txt(user_agent::AbstractString, url::AbstractString)

Check the robots.txt of a URL and return a boolean indicating whether user_agent is allowed to crawl the input url, along with any sitemap URLs

Arguments

  • user_agent: user agent attempting to crawl the webpage
  • url: input URL string
source
DocsScraper.crawlMethod
crawl(input_urls::Vector{<:AbstractString})

Crawl the input URLs and return hostname_url_dict, a dictionary whose keys are hostnames and whose values are the corresponding URLs

source
DocsScraper.docs_in_urlMethod
docs_in_url(url::AbstractString)

If the base URL is of the form docs.package_name.domain_extension, return the middle segment, i.e., package_name

source
DocsScraper.find_duplicatesMethod
find_duplicates(chunks::AbstractVector{<:AbstractString})

Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where true indicates a duplicate (second instance of the same text).

source
DocsScraper.find_urls_html!Method
find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString})

Function to recursively find <a> tags and extract the urls

Arguments

  • url: The initial input URL
  • node: The HTML node of type Gumbo.HTMLElement
  • url_queue: Vector in which extracted URLs will be appended
source
DocsScraper.find_urls_xml!Method
find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})

Identify URLs through a regex pattern in XML files and push them into url_queue

Arguments

  • url: url from which all other URLs will be extracted
  • url_queue: Vector in which extracted URLs will be appended
source
DocsScraper.generate_embeddingsMethod
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, 
-    embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)

Deserialize chunks and sources to generate embeddings

Arguments

  • model: Embedding model
  • embedding_size: Embedding dimensions
  • custom_metadata: Custom metadata like ecosystem name if required
source
DocsScraper.get_header_pathMethod
get_header_path(d::Dict)

Concatenate the h1, h2, h3 keys from the metadata of a Dict

Examples

d = Dict("metadata" => Dict{Symbol,Any}(:h1 => "Axis", :h2 => "Attributes", :h3 => "yzoomkey"), "heading" => "yzoomkey")
+API Index · DocsScraper.jl

Reference

DocsScraper.check_robots_txtMethod
check_robots_txt(user_agent::AbstractString, url::AbstractString)

Check the robots.txt of a URL and return a boolean indicating whether user_agent is allowed to crawl the input url, along with any sitemap URLs

Arguments

  • user_agent: user agent attempting to crawl the webpage
  • url: input URL string
source
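
A minimal usage sketch (not generated from the package; the user-agent string and the two-value return shape are assumptions based on the description above):

using DocsScraper

# Ask whether this user agent may crawl the page, and collect any sitemap URLs from robots.txt
allowed, sitemap_urls = check_robots_txt("DocsScraperBot", "https://docs.julialang.org/en/v1/")
allowed || @warn "Crawling disallowed by robots.txt"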
DocsScraper.crawlMethod
crawl(input_urls::Vector{<:AbstractString})

Crawl the input URLs and return hostname_url_dict, a dictionary whose keys are hostnames and whose values are the corresponding URLs

source
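
Illustrative call (the input URL is a placeholder; the dictionary shape follows the description above):

using DocsScraper

hostname_url_dict = crawl(["https://docs.sciml.ai/Overview/stable/"])
for (hostname, urls) in hostname_url_dict
    println(hostname, " => ", length(urls), " URLs")
end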
DocsScraper.docs_in_urlMethod
docs_in_url(url::AbstractString)

If the base URL is of the form docs.package_name.domain_extension, return the middle segment, i.e., package_name

source
DocsScraper.find_duplicatesMethod
find_duplicates(chunks::AbstractVector{<:AbstractString})

Find duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where true indicates a duplicate (second instance of the same text).

source
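
A small sketch of how the returned bit vector can be used (assumed usage, based on the description above):

chunks = ["alpha", "beta", "alpha"]
dups = find_duplicates(chunks)      # Bool vector; true marks the second "alpha"
unique_chunks = chunks[.!dups]      # keep only the first occurrence of each chunk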
DocsScraper.find_urls_html!Method
find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString})

Function to recursively find <a> tags and extract the urls

Arguments

  • url: The initial input URL
  • node: The HTML node of type Gumbo.HTMLElement
  • url_queue: Vector in which extracted URLs will be appended
source
DocsScraper.find_urls_xml!Method
find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})

Identify URLs through a regex pattern in XML files and push them into url_queue

Arguments

  • url: url from which all other URLs will be extracted
  • url_queue: Vector in which extracted URLs will be appended
source
DocsScraper.generate_embeddingsMethod
generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, 
+    embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,
+    bool_embeddings::Bool = true, index_name::AbstractString = "")

Deserialize chunks and sources to generate embeddings. Note: we highly recommend passing index_name; this will be the name of the generated index. Default: date-randomInt

Arguments

  • model: Embedding model
  • embedding_size: Embedding dimensions
  • custom_metadata: Custom metadata like ecosystem name if required
  • bool_embeddings: If true, the generated embeddings will be boolean; Float32 otherwise
  • index_name: Name of the index. Default: date-randomInt
source
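
A hedged example call; the path, model name, embedding size and metadata below are placeholders, not package defaults:

using DocsScraper

generate_embeddings("knowledge_packs/";
    model = "text-embedding-3-large",      # embedding model to use
    embedding_size = 1024,                 # dimensions to keep
    custom_metadata = "sciml",             # e.g. ecosystem name
    bool_embeddings = true,                # binarize the embeddings
    index_name = "sciml_index")            # recommended: a stable, human-readable index name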
DocsScraper.get_header_pathMethod
get_header_path(d::Dict)

Concatenate the h1, h2, h3 keys from the metadata of a Dict

Examples

d = Dict("metadata" => Dict{Symbol,Any}(:h1 => "Axis", :h2 => "Attributes", :h3 => "yzoomkey"), "heading" => "yzoomkey")
 get_header_path(d)
-# Output: "Axis/Attributes/yzoomkey"
source
DocsScraper.get_html_contentMethod
get_html_content(root::Gumbo.HTMLElement)

Return the main content of the HTML. If not found, return the whole HTML to parse

Arguments

  • root: The HTML root from which content is extracted
source
DocsScraper.get_urls!Method
get_urls!(url::AbstractString, 
-    url_queue::Vector{<:AbstractString})

Extract urls inside html or xml files

Arguments

  • url: url from which all other URLs will be extracted
  • url_queue: Vector in which extracted URLs will be appended
source
DocsScraper.get_html_contentMethod
get_html_content(root::Gumbo.HTMLElement)

Return the main content of the HTML. If not found, return the whole HTML to parse

Arguments

  • root: The HTML root from which content is extracted
source
DocsScraper.get_urls!Method
get_urls!(url::AbstractString, 
+    url_queue::Vector{<:AbstractString})

Extract urls inside html or xml files

Arguments

  • url: url from which all other URLs will be extracted
  • url_queue: Vector in which extracted URLs will be appended
source
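
A short sketch of seeding and growing a URL queue (the seed URL is a placeholder):

url_queue = String["https://docs.julialang.org/en/v1/"]
seed = popfirst!(url_queue)
get_urls!(seed, url_queue)           # appends URLs found in the page's HTML or sitemap XML
@info "Queued $(length(url_queue)) URLs"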
DocsScraper.insert_parsed_data!Method
insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, 
     parsed_blocks::Vector{Dict{String,Any}}, 
     text_to_insert::AbstractString, 
-    text_type::AbstractString)

Insert the text into parsed_blocks Vector

Arguments

  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • text_to_insert: Text to be inserted
  • text_type: The text to be inserted could be heading or a code block or just text
source
DocsScraper.make_chunksMethod
make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; 
-    max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)

Parse URLs from hostname_url_dict and save the chunks

Arguments

  • hostname_url_dict: Dict with key being hostname and value being a vector of URLs
  • knowledge_pack_path: Knowledge pack path
  • max_chunk_size: Maximum chunk size
  • min_chunk_size: Minimum chunk size
source
DocsScraper.make_knowledge_packsFunction
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
+    text_type::AbstractString)

Insert the text into parsed_blocks Vector

Arguments

  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • text_to_insert: Text to be inserted
  • text_type: The text to be inserted could be heading or a code block or just text
source
DocsScraper.make_chunksMethod
make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; 
+    max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)

Parse URLs from hostname_url_dict and save the chunks

Arguments

  • hostname_url_dict: Dict with key being hostname and value being a vector of URLs
  • knowledge_pack_path: Knowledge pack path
  • max_chunk_size: Maximum chunk size
  • min_chunk_size: Minimum chunk size
source
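
Illustrative call (hostname, URL, path and chunk sizes are placeholders, not the package defaults):

hostname_url_dict = Dict{AbstractString,Vector{AbstractString}}(
    "docs.julialang.org" => ["https://docs.julialang.org/en/v1/manual/strings/"])
make_chunks(hostname_url_dict, "knowledge_packs/";
    max_chunk_size = 384, min_chunk_size = 40)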
DocsScraper.make_knowledge_packsFunction
make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],
     max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE, 
-    custom_metadata::AbstractString)

Entry point to crawl, parse and generate embeddings

Arguments

  • crawlable_urls: URLs that should be crawled to find more links
  • single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs
  • max_chunk_size: Maximum chunk size
  • min_chunk_size: Minimum chunk size
  • model: Embedding model
  • embedding_size: Embedding dimensions
  • custom_metadata: Custom metadata like ecosystem name if required
source
DocsScraper.nav_barMethod
nav_bar(url::AbstractString)

Julia doc websites tend to have the package name under ".docs-package-name" class in the HTML tree

source
DocsScraper.parse_url_to_blocksMethod
parse_url_to_blocks(url::AbstractString)

Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata

source
DocsScraper.postprocess_chunksMethod
function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+    custom_metadata::AbstractString, bool_embeddings::Bool = true, index_name::AbstractString = "")

Entry point to crawl, parse and generate embeddings. Note: we highly recommend passing index_name; this will be the name of the generated index. Default: date-randomInt

Arguments

  • crawlable_urls: URLs that should be crawled to find more links
  • single_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs
  • max_chunk_size: Maximum chunk size
  • min_chunk_size: Minimum chunk size
  • model: Embedding model
  • embedding_size: Embedding dimensions
  • custom_metadata: Custom metadata like ecosystem name if required
  • bool_embeddings: If true, the generated embeddings will be boolean; Float32 otherwise
  • index_name: Name of the index. Default: date-randomInt
source
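
A hedged end-to-end example; every URL, size and name below is a placeholder chosen for illustration:

using DocsScraper

make_knowledge_packs(["https://docs.makie.org/stable/"];   # crawled for further links
    single_urls = String[],                                 # no one-off pages in this example
    max_chunk_size = 384, min_chunk_size = 40,
    model = "text-embedding-3-large", embedding_size = 1024,
    custom_metadata = "makie",
    bool_embeddings = true,
    index_name = "makie_index")                             # recommended: a stable index name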
DocsScraper.nav_barMethod
nav_bar(url::AbstractString)

Julia doc websites tend to have the package name under ".docs-package-name" class in the HTML tree

source
DocsScraper.parse_url_to_blocksMethod
parse_url_to_blocks(url::AbstractString)

Initiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata

source
DocsScraper.postprocess_chunksMethod
function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
     min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,
-    websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)

Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.

source
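
A small sketch (the two-vector return shape is an assumption based on the description above):

chunks  = ["tiny", "A sufficiently long documentation chunk that survives the length filter."]
sources = ["https://example.org/a", "https://example.org/b"]
chunks, sources = postprocess_chunks(chunks, sources; min_chunk_size = 40, skip_code = true)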
DocsScraper.process_codeMethod
process_code(node::Gumbo.HTMLElement)

Process code snippets. If the current node is a code block, return the text inside code block with backticks.

Arguments

  • node: The root HTML node
source
DocsScraper.process_docstring!Function
process_docstring!(node::Gumbo.HTMLElement,
+    websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)

Post-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.

source
DocsScraper.process_codeMethod
process_code(node::Gumbo.HTMLElement)

Process code snippets. If the current node is a code block, return the text inside code block with backticks.

Arguments

  • node: The root HTML node
source
DocsScraper.process_docstring!Function
process_docstring!(node::Gumbo.HTMLElement,
     heading_hierarchy::Dict{Symbol,Any},
     parsed_blocks::Vector{Dict{String,Any}},
     child_new::Bool=true,
-    prev_text_buffer::IO=IOBuffer(write=true))

Function to process node of class docstring

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • child_new: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsed_blocks
  • prev_text_buffer: IO Buffer which contains previous text
source
DocsScraper.process_generic_node!Function
process_generic_node!(node::Gumbo.HTMLElement,
+    prev_text_buffer::IO=IOBuffer(write=true))

Function to process node of class docstring

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • child_new: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsed_blocks
  • prev_text_buffer: IO Buffer which contains previous text
source
DocsScraper.process_generic_node!Function
process_generic_node!(node::Gumbo.HTMLElement,
     heading_hierarchy::Dict{Symbol,Any},
     parsed_blocks::Vector{Dict{String,Any}},
     child_new::Bool=true,
-    prev_text_buffer::IO=IOBuffer(write=true))

Process the node if it is neither a heading nor a code block

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • child_new: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsed_blocks
  • prev_text_buffer: IO Buffer which contains previous text
source
DocsScraper.process_headings!Method
process_headings!(node::Gumbo.HTMLElement,
+    prev_text_buffer::IO=IOBuffer(write=true))

Process the node if it is neither a heading nor a code block

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • child_new: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsed_blocks
  • prev_text_buffer: IO Buffer which contains previous text
source
DocsScraper.process_headings!Method
process_headings!(node::Gumbo.HTMLElement,
     heading_hierarchy::Dict{Symbol,Any},
-    parsed_blocks::Vector{Dict{String,Any}})

Process headings. If the current node is a heading, directly insert it into parsed_blocks.

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
source
DocsScraper.process_hostname!Method
process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})

Add url to its hostname in hostname_dict

Arguments

  • url: URL string
  • hostname_dict: Dict with key being hostname and value being a vector of URLs
source
DocsScraper.process_node!Function
process_node!(node::Gumbo.HTMLElement,
+    parsed_blocks::Vector{Dict{String,Any}})

Process headings. If the current node is a heading, directly insert it into parsed_blocks.

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
source
DocsScraper.process_hostname!Method
process_hostname!(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})

Add url to its hostname in hostname_dict

Arguments

  • url: URL string
  • hostname_dict: Dict with key being hostname and value being a vector of URLs
source
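
Illustrative use of the hostname grouping (the exact key format is an assumption):

hostname_dict = Dict{AbstractString,Vector{AbstractString}}()
process_hostname!("https://docs.julialang.org/en/v1/manual/strings/", hostname_dict)
# hostname_dict now groups the URL under its hostname key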
DocsScraper.process_node!Function
process_node!(node::Gumbo.HTMLElement,
     heading_hierarchy::Dict{Symbol,Any},
     parsed_blocks::Vector{Dict{String,Any}},
     child_new::Bool=true,
-    prev_text_buffer::IO=IOBuffer(write=true))

Function to process a node

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • child_new: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsed_blocks
  • prev_text_buffer: IO Buffer which contains previous text
source
DocsScraper.process_pathsMethod
process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)

Process folders provided in paths. In each, take all HTML files, scrape them, chunk them and postprocess them.

source
DocsScraper.remove_duplicatesMethod
remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})

Remove chunks that are duplicated in the input list of chunks and their corresponding sources.

source
DocsScraper.remove_short_chunksMethod
remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
-    min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)

Remove chunks that are shorter than a specified length (min_chunk_size) from the input list of chunks and their corresponding sources.

source
DocsScraper.remove_urls_from_indexFunction
function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString})

Remove chunks and sources corresponding to URLs starting with prefix_urls

source
DocsScraper.resolve_urlMethod
resolve_url(base_url::String, extracted_url::String)

Check the extracted URL against the original URL. Return an empty String if the extracted URL belongs to a different domain. Return the complete URL if there are directory traversal paths or the extracted URL belongs to the same domain as the base_url

Arguments

  • base_url: URL of the page from which other URLs are being extracted
  • extracted_url: URL extracted from the base_url
source
DocsScraper.roll_up_chunksMethod
roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="<SEP>")

Roll-up chunks (that have the same header!), so we can split them later by <SEP> to get the desired length

source
DocsScraper.text_before_versionMethod
text_before_version(url::AbstractString)

Return text before "stable" or "dev" or any version in URL. It is generally observed that doc websites have package names before their versions

source
DocsScraper.url_package_nameMethod
url_package_name(url::AbstractString)

Return the text if the URL itself contains the package name with ".jl" or "_jl" suffixes

source
DocsScraper.urls_for_metadataMethod
urls_for_metadata(sources::Vector{String})

Return a Dict of package names with their associated URLs. Note: due to their large number, URLs are stripped down to the package name; package subpaths are not included in metadata.

source
PromptingTools.Experimental.RAGTools.get_chunksMethod
RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
-    verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)

Extract chunks from HTML files by parsing the content in the HTML, rolling up chunks by headers, and splitting them by separators to get the desired length.

Arguments

  • chunker: DocParserChunker
  • url: URL of the webpage to extract chunks
  • verbose: Bool to print the log
  • separators: Chunk separators
  • max_chunk_size: Maximum chunk size
source
+ prev_text_buffer::IO=IOBuffer(write=true))

Function to process a node

Arguments

  • node: The root HTML node
  • heading_hierarchy: Dict used to store metadata
  • parsed_blocks: Vector of Dicts to store parsed text and metadata
  • child_new: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsed_blocks
  • prev_text_buffer: IO Buffer which contains previous text
source
DocsScraper.process_pathsMethod
process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)

Process folders provided in paths. In each, take all HTML files, scrape them, chunk them and postprocess them.

source
DocsScraper.remove_duplicatesMethod
remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})

Remove chunks that are duplicated in the input list of chunks and their corresponding sources.

source
DocsScraper.remove_short_chunksMethod
remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};
+    min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)

Remove chunks that are shorter than a specified length (min_chunk_size) from the input list of chunks and their corresponding sources.

source
DocsScraper.remove_urls_from_indexFunction
function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString})

Remove chunks and sources corresponding to URLs starting with prefix_urls

source
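
A hedged example; the index path and URL prefix below are placeholders:

# Drop chunks and sources whose URL starts with one of the given prefixes
remove_urls_from_index("knowledge_packs/sciml_index.hdf5",
    ["https://docs.sciml.ai/DiffEqDocs/dev/"])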
DocsScraper.resolve_urlMethod
resolve_url(base_url::String, extracted_url::String)

Check the extracted URL against the original URL. Return an empty String if the extracted URL belongs to a different domain. Return the complete URL if there are directory traversal paths or the extracted URL belongs to the same domain as the base_url

Arguments

  • base_url: URL of the page from which other URLs are being extracted
  • extracted_url: URL extracted from the base_url
source
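
Two illustrative calls (the exact resolved strings are assumptions; only the same-domain vs. different-domain behaviour is taken from the description above):

resolve_url("https://docs.julialang.org/en/v1/", "../manual/strings/")   # same domain: a complete URL is returned
resolve_url("https://docs.julialang.org/en/v1/", "https://example.com/") # different domain: returns ""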
DocsScraper.roll_up_chunksMethod
roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String="<SEP>")

Roll-up chunks (that have the same header!), so we can split them later by <SEP> to get the desired length

source
DocsScraper.text_before_versionMethod
text_before_version(url::AbstractString)

Return text before "stable" or "dev" or any version in URL. It is generally observed that doc websites have package names before their versions

source
DocsScraper.url_package_nameMethod
url_package_name(url::AbstractString)

Return the text if the URL itself contains the package name with ".jl" or "_jl" suffixes

source
DocsScraper.urls_for_metadataMethod
urls_for_metadata(sources::Vector{String})

Return a Dict of package names with their associated URLs. Note: due to their large number, URLs are stripped down to the package name; package subpaths are not included in metadata.

source
PromptingTools.Experimental.RAGTools.get_chunksMethod
RT.get_chunks(chunker::DocParserChunker, url::AbstractString;
+    verbose::Bool=true, separators=["\n\n", ". ", "\n", " "], max_chunk_size::Int=MAX_CHUNK_SIZE)

Extract chunks from HTML files by parsing the content in the HTML, rolling up chunks by headers, and splitting them by separators to get the desired length.

Arguments

  • chunker: DocParserChunker
  • url: URL of the webpage to extract chunks
  • verbose: Bool to print the log
  • separators: Chunk separators
  • max_chunk_size: Maximum chunk size
source
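
A hedged sketch; the no-argument DocParserChunker() constructor and the (chunks, sources) return shape follow the usual RAGTools get_chunks convention and are assumptions here:

using DocsScraper
import PromptingTools.Experimental.RAGTools as RT

chunks, sources = RT.get_chunks(DocsScraper.DocParserChunker(),
    "https://docs.julialang.org/en/v1/manual/strings/";
    verbose = true, max_chunk_size = 384)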
diff --git a/dev/search_index.js b/dev/search_index.js index 08df8c9..6264c21 100644 --- a/dev/search_index.js +++ b/dev/search_index.js @@ -1,3 +1,3 @@ var documenterSearchIndex = {"docs": -[{"location":"#Reference","page":"API Index","title":"Reference","text":"","category":"section"},{"location":"","page":"API Index","title":"API Index","text":"","category":"page"},{"location":"","page":"API Index","title":"API Index","text":"Modules = [DocsScraper]","category":"page"},{"location":"#DocsScraper.base_url_segment-Tuple{String}","page":"API Index","title":"DocsScraper.base_url_segment","text":"base_url_segment(url::String)\n\nReturn the base url and first path segment if all the other checks fail\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.check_robots_txt-Tuple{AbstractString, AbstractString}","page":"API Index","title":"DocsScraper.check_robots_txt","text":"check_robots_txt(user_agent::AbstractString, url::AbstractString)\n\nCheck robots.txt of a URL and return a boolean representing if user_agent is allowed to crawl the input url, along with sitemap urls\n\nArguments\n\nuser_agent: user agent attempting to crawl the webpage\nurl: input URL string\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.clean_url-Tuple{String}","page":"API Index","title":"DocsScraper.clean_url","text":"clean_url(url::String)\n\nStrip URL of any http:// ot https:// or www. prefixes \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.crawl-Tuple{Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.crawl","text":"crawl(input_urls::Vector{<:AbstractString})\n\nCrawl on the input URLs and return a hostname_url_dict which is a dictionary with key being hostnames and the values being the URLs\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.create_output_folders-Tuple{String}","page":"API Index","title":"DocsScraper.create_output_folders","text":"create_output_folders(knowledge_pack_path::String)\n\nCreate output folders on the knowledgepackpath\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.docs_in_url-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.docs_in_url","text":"docs_in_url(url::AbstractString)\n\nIf the base url is in the form docs.packagename.domainextension, then return the middle word i.e., package_name \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.find_duplicates-Tuple{AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.find_duplicates","text":"find_duplicates(chunks::AbstractVector{<:AbstractString})\n\nFind duplicates in a list of chunks using SHA-256 hash. 
Returns a bit vector of the same length as the input list, where true indicates a duplicate (second instance of the same text).\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.find_urls_html!-Tuple{AbstractString, Gumbo.HTMLElement, Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.find_urls_html!","text":"find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}\n\nFunction to recursively find tags and extract the urls\n\nArguments\n\nurl: The initial input URL \nnode: The HTML node of type Gumbo.HTMLElement\nurl_queue: Vector in which extracted URLs will be appended\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.find_urls_xml!-Tuple{AbstractString, Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.find_urls_xml!","text":"find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})\n\nIdentify URL through regex pattern in xml files and push in url_queue\n\nArguments\n\nurl: url from which all other URLs will be extracted\nurl_queue: Vector in which extracted URLs will be appended\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.generate_embeddings-Tuple{String}","page":"API Index","title":"DocsScraper.generate_embeddings","text":"generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, \n embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString)\n\nDeserialize chunks and sources to generate embeddings \n\nArguments\n\nmodel: Embedding model\nembedding_size: Embedding dimensions\ncustom_metadata: Custom metadata like ecosystem name if required\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_base_url-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.get_base_url","text":"get_base_url(url::AbstractString)\n\nExtract the base url\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_header_path-Tuple{Dict{String, Any}}","page":"API Index","title":"DocsScraper.get_header_path","text":"get_header_path(d::Dict)\n\nConcatenate the h1, h2, h3 keys from the metadata of a Dict\n\nExamples\n\nd = Dict(\"metadata\" => Dict{Symbol,Any}(:h1 => \"Axis\", :h2 => \"Attributes\", :h3 => \"yzoomkey\"), \"heading\" => \"yzoomkey\")\nget_header_path(d)\n# Output: \"Axis/Attributes/yzoomkey\"\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_html_content-Tuple{Gumbo.HTMLElement}","page":"API Index","title":"DocsScraper.get_html_content","text":"get_html_content(root::Gumbo.HTMLElement)\n\nReturn the main content of the HTML. 
If not found, return the whole HTML to parse\n\nArguments\n\nroot: The HTML root from which content is extracted\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_package_name-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.get_package_name","text":"get_package_name(url::AbstractString)\n\nReturn name of the package through the package URL \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_urls!-Tuple{AbstractString, Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.get_urls!","text":"get_links!(url::AbstractString, \n url_queue::Vector{<:AbstractString})\n\nExtract urls inside html or xml files \n\nArguments\n\nurl: url from which all other URLs will be extracted\nurl_queue: Vector in which extracted URLs will be appended\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.insert_parsed_data!-Tuple{Dict{Symbol, Any}, Vector{Dict{String, Any}}, AbstractString, AbstractString}","page":"API Index","title":"DocsScraper.insert_parsed_data!","text":"insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, \n parsed_blocks::Vector{Dict{String,Any}}, \n text_to_insert::AbstractString, \n text_type::AbstractString)\n\nInsert the text into parsed_blocks Vector\n\nArguments\n\nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\ntexttoinsert: Text to be inserted\ntext_type: The text to be inserted could be heading or a code block or just text\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.l2_norm_columns-Tuple{AbstractMatrix}","page":"API Index","title":"DocsScraper.l2_norm_columns","text":"l2_norm_columns(mat::AbstractMatrix)\n\nNormalize the columns of the input embeddings\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.l2_norm_columns-Tuple{AbstractVector}","page":"API Index","title":"DocsScraper.l2_norm_columns","text":"l2_norm_columns(vect::AbstractVector)\n\nNormalize the columns of the input embeddings\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.make_chunks-Tuple{Dict{AbstractString, Vector{AbstractString}}, String}","page":"API Index","title":"DocsScraper.make_chunks","text":"make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; \n max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)\n\nParse URLs from hostnameurldict and save the chunks\n\nArguments\n\nhostnameurldict: Dict with key being hostname and value being a vector of URLs\nknowledgepackpath: Knowledge pack path\nmaxchunksize: Maximum chunk size\nminchunksize: Minimum chunk size\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.make_knowledge_packs","page":"API Index","title":"DocsScraper.make_knowledge_packs","text":"make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],\n max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE, \n custom_metadata::AbstractString)\n\nEntry point to crawl, parse and generate embeddings\n\nArguments\n\ncrawlable_urls: URLs that should be crawled to find more links\nsingle_urls: Single page URLs that should just be scraped and parsed. 
The crawler won't look for more URLs\nmaxchunksize: Maximum chunk size\nminchunksize: Minimum chunk size\nmodel: Embedding model\nembedding_size: Embedding dimensions\ncustom_metadata: Custom metadata like ecosystem name if required\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.nav_bar-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.nav_bar","text":"nav_bar(url::AbstractString)\n\nJulia doc websites tend to have the package name under \".docs-package-name\" class in the HTML tree\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.parse_robots_txt!-Tuple{String}","page":"API Index","title":"DocsScraper.parse_robots_txt!","text":"parse_robots_txt!(robots_txt::String)\n\nParse the robots.txt string and return rules and the URLs on Sitemap\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.parse_url_to_blocks-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.parse_url_to_blocks","text":"parse_url(url::AbstractString)\n\nInitiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.postprocess_chunks-Tuple{AbstractVector{<:AbstractString}, AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.postprocess_chunks","text":"function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};\n min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,\n websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)\n\nPost-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_code-Tuple{Gumbo.HTMLElement}","page":"API Index","title":"DocsScraper.process_code","text":"process_code(node::Gumbo.HTMLElement)\n\nProcess code snippets. If the current node is a code block, return the text inside code block with backticks.\n\nArguments\n\nnode: The root HTML node\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_docstring!","page":"API Index","title":"DocsScraper.process_docstring!","text":"process_docstring!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}},\n child_new::Bool=true,\n prev_text_buffer::IO=IOBuffer(write=true))\n\nFunction to process node of class docstring\n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\nchildnew: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsedblocks\nprevtextbuffer: IO Buffer which contains previous text\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.process_generic_node!","page":"API Index","title":"DocsScraper.process_generic_node!","text":"process_generic_node!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}},\n child_new::Bool=true,\n prev_text_buffer::IO=IOBuffer(write=true))\n\nIf the node is neither heading nor code\n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\nchildnew: Bool to specify if the current block (child) is part of previous block or not. 
If it's not, then a new insertion needs to be created in parsedblocks\nprevtextbuffer: IO Buffer which contains previous text\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.process_headings!-Tuple{Gumbo.HTMLElement, Dict{Symbol, Any}, Vector{Dict{String, Any}}}","page":"API Index","title":"DocsScraper.process_headings!","text":"process_headings!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}})\n\nProcess headings. If the current node is heading, directly insert into parsed_blocks. \n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_hostname!-Tuple{AbstractString, Dict{AbstractString, Vector{AbstractString}}}","page":"API Index","title":"DocsScraper.process_hostname!","text":"process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})\n\nAdd url to its hostname in hostname_dict\n\nArguments\n\nurl: URL string\nhostname_dict: Dict with key being hostname and value being a vector of URLs\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_hostname-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.process_hostname","text":"process_hostname(url::AbstractString)\n\nReturn the hostname of an input URL\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_node!","page":"API Index","title":"DocsScraper.process_node!","text":"process_node!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}},\n child_new::Bool=true,\n prev_text_buffer::IO=IOBuffer(write=true))\n\nFunction to process a node\n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\nchildnew: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsedblocks\nprevtextbuffer: IO Buffer which contains previous text\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.process_node!-Tuple{Gumbo.HTMLText, Vararg{Any}}","page":"API Index","title":"DocsScraper.process_node!","text":"multiple dispatch for process_node!() when node is of type Gumbo.HTMLText\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_paths-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.process_paths","text":"process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)\n\nProcess folders provided in paths. 
In each, take all HTML files, scrape them, chunk them and postprocess them.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.remove_duplicates-Tuple{AbstractVector{<:AbstractString}, AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.remove_duplicates","text":"remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})\n\nRemove chunks that are duplicated in the input list of chunks and their corresponding sources.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.remove_short_chunks-Tuple{AbstractVector{<:AbstractString}, AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.remove_short_chunks","text":"remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};\n min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)\n\nRemove chunks that are shorter than a specified length (min_length) from the input list of chunks and their corresponding sources.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.remove_urls_from_index","page":"API Index","title":"DocsScraper.remove_urls_from_index","text":"function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString})\n\nRemove chunks and sources corresponding to URLs starting with prefix_urls \n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.report_artifact-Tuple{Any}","page":"API Index","title":"DocsScraper.report_artifact","text":"report_artifact(fn_output)\n\nPrint artifact information\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.resolve_url-Tuple{String, String}","page":"API Index","title":"DocsScraper.resolve_url","text":"resolve_url(base_url::String, extracted_url::String)\n\nCheck the extracted URL with the original URL. Return empty String if the extracted URL belongs to a different domain. Return complete URL if there's a directory traversal paths or the extracted URL belongs to the same domain as the base_url\n\nArguments\n\nbase_url: URL of the page from which other URLs are being extracted\nextractedurl: URL extracted from the baseurl \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.roll_up_chunks-Tuple{Vector{Dict{String, Any}}, AbstractString}","page":"API Index","title":"DocsScraper.roll_up_chunks","text":"roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String=\"\")\n\nRoll-up chunks (that have the same header!), so we can split them later by to get the desired length\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.text_before_version-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.text_before_version","text":"text_before_version(url::AbstractString)\n\nReturn text before \"stable\" or \"dev\" or any version in URL. 
It is generally observed that doc websites have package names before their versions \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.url_package_name-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.url_package_name","text":"url_package_name(url::AbstractString)\n\nReturn the text if the URL itself contains the package name with \".jl\" or \"_jl\" suffixes\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.urls_for_metadata-Tuple{Vector{String}}","page":"API Index","title":"DocsScraper.urls_for_metadata","text":"urls_for_metadata(sources::Vector{String})\n\nReturn a Dict of package names with their associated URLs Note: Due to their large number, URLs are stripped down to the package name; Package subpaths are not included in metadata.\n\n\n\n\n\n","category":"method"},{"location":"#PromptingTools.Experimental.RAGTools.get_chunks-Tuple{DocsScraper.DocParserChunker, AbstractString}","page":"API Index","title":"PromptingTools.Experimental.RAGTools.get_chunks","text":"RT.get_chunks(chunker::DocParserChunker, url::AbstractString;\n verbose::Bool=true, separators=[\"\n\n\", \". \", \" \", \" \"], maxchunksize::Int=MAXCHUNKSIZE)\n\nExtract chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length.\n\nArguments\n\nchunker: DocParserChunker\nurl: URL of the webpage to extract chunks\nverbose: Bool to print the log\nseparators: Chunk separators\nmaxchunksize Maximum chunk size\n\n\n\n\n\n","category":"method"}] +[{"location":"#Reference","page":"API Index","title":"Reference","text":"","category":"section"},{"location":"","page":"API Index","title":"API Index","text":"","category":"page"},{"location":"","page":"API Index","title":"API Index","text":"Modules = [DocsScraper]","category":"page"},{"location":"#DocsScraper.base_url_segment-Tuple{String}","page":"API Index","title":"DocsScraper.base_url_segment","text":"base_url_segment(url::String)\n\nReturn the base url and first path segment if all the other checks fail\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.check_robots_txt-Tuple{AbstractString, AbstractString}","page":"API Index","title":"DocsScraper.check_robots_txt","text":"check_robots_txt(user_agent::AbstractString, url::AbstractString)\n\nCheck robots.txt of a URL and return a boolean representing if user_agent is allowed to crawl the input url, along with sitemap urls\n\nArguments\n\nuser_agent: user agent attempting to crawl the webpage\nurl: input URL string\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.clean_url-Tuple{String}","page":"API Index","title":"DocsScraper.clean_url","text":"clean_url(url::String)\n\nStrip URL of any http:// ot https:// or www. 
prefixes \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.crawl-Tuple{Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.crawl","text":"crawl(input_urls::Vector{<:AbstractString})\n\nCrawl on the input URLs and return a hostname_url_dict which is a dictionary with key being hostnames and the values being the URLs\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.create_output_folders-Tuple{String}","page":"API Index","title":"DocsScraper.create_output_folders","text":"create_output_folders(knowledge_pack_path::String)\n\nCreate output folders on the knowledgepackpath\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.docs_in_url-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.docs_in_url","text":"docs_in_url(url::AbstractString)\n\nIf the base url is in the form docs.packagename.domainextension, then return the middle word i.e., package_name \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.find_duplicates-Tuple{AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.find_duplicates","text":"find_duplicates(chunks::AbstractVector{<:AbstractString})\n\nFind duplicates in a list of chunks using SHA-256 hash. Returns a bit vector of the same length as the input list, where true indicates a duplicate (second instance of the same text).\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.find_urls_html!-Tuple{AbstractString, Gumbo.HTMLElement, Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.find_urls_html!","text":"find_urls_html!(url::AbstractString, node::Gumbo.HTMLElement, url_queue::Vector{<:AbstractString}\n\nFunction to recursively find tags and extract the urls\n\nArguments\n\nurl: The initial input URL \nnode: The HTML node of type Gumbo.HTMLElement\nurl_queue: Vector in which extracted URLs will be appended\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.find_urls_xml!-Tuple{AbstractString, Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.find_urls_xml!","text":"find_urls_xml!(url::AbstractString, url_queue::Vector{<:AbstractString})\n\nIdentify URL through regex pattern in xml files and push in url_queue\n\nArguments\n\nurl: url from which all other URLs will be extracted\nurl_queue: Vector in which extracted URLs will be appended\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.generate_embeddings-Tuple{String}","page":"API Index","title":"DocsScraper.generate_embeddings","text":"generate_embeddings(knowledge_pack_path::String; model::AbstractString=MODEL, \n embedding_size::Int=EMBEDDING_SIZE, custom_metadata::AbstractString,\n bool_embeddings::Bool = true, index_name::AbstractString = \"\")\n\nDeserialize chunks and sources to generate embeddings Note: We highly recommend to pass index_name. This will be the name of the generated index. Default: date-randomInt\n\nArguments\n\nmodel: Embedding model\nembedding_size: Embedding dimensions\ncustom_metadata: Custom metadata like ecosystem name if required\nbool_embeddings: If true, embeddings generated will be boolean, Float32 otherwise\nindex_name: Name if the index. 
Default: date-randomInt\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_base_url-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.get_base_url","text":"get_base_url(url::AbstractString)\n\nExtract the base url\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_header_path-Tuple{Dict{String, Any}}","page":"API Index","title":"DocsScraper.get_header_path","text":"get_header_path(d::Dict)\n\nConcatenate the h1, h2, h3 keys from the metadata of a Dict\n\nExamples\n\nd = Dict(\"metadata\" => Dict{Symbol,Any}(:h1 => \"Axis\", :h2 => \"Attributes\", :h3 => \"yzoomkey\"), \"heading\" => \"yzoomkey\")\nget_header_path(d)\n# Output: \"Axis/Attributes/yzoomkey\"\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_html_content-Tuple{Gumbo.HTMLElement}","page":"API Index","title":"DocsScraper.get_html_content","text":"get_html_content(root::Gumbo.HTMLElement)\n\nReturn the main content of the HTML. If not found, return the whole HTML to parse\n\nArguments\n\nroot: The HTML root from which content is extracted\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_package_name-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.get_package_name","text":"get_package_name(url::AbstractString)\n\nReturn name of the package through the package URL \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.get_urls!-Tuple{AbstractString, Vector{<:AbstractString}}","page":"API Index","title":"DocsScraper.get_urls!","text":"get_links!(url::AbstractString, \n url_queue::Vector{<:AbstractString})\n\nExtract urls inside html or xml files \n\nArguments\n\nurl: url from which all other URLs will be extracted\nurl_queue: Vector in which extracted URLs will be appended\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.insert_parsed_data!-Tuple{Dict{Symbol, Any}, Vector{Dict{String, Any}}, AbstractString, AbstractString}","page":"API Index","title":"DocsScraper.insert_parsed_data!","text":"insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any}, \n parsed_blocks::Vector{Dict{String,Any}}, \n text_to_insert::AbstractString, \n text_type::AbstractString)\n\nInsert the text into parsed_blocks Vector\n\nArguments\n\nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\ntexttoinsert: Text to be inserted\ntext_type: The text to be inserted could be heading or a code block or just text\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.l2_norm_columns-Tuple{AbstractMatrix}","page":"API Index","title":"DocsScraper.l2_norm_columns","text":"l2_norm_columns(mat::AbstractMatrix)\n\nNormalize the columns of the input embeddings\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.l2_norm_columns-Tuple{AbstractVector}","page":"API Index","title":"DocsScraper.l2_norm_columns","text":"l2_norm_columns(vect::AbstractVector)\n\nNormalize the columns of the input embeddings\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.make_chunks-Tuple{Dict{AbstractString, Vector{AbstractString}}, String}","page":"API Index","title":"DocsScraper.make_chunks","text":"make_chunks(hostname_url_dict::Dict{AbstractString,Vector{AbstractString}}, knowledge_pack_path::String; \n max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)\n\nParse URLs from hostnameurldict and save the chunks\n\nArguments\n\nhostnameurldict: Dict with key being hostname and value being a vector of URLs\nknowledgepackpath: Knowledge pack path\nmaxchunksize: Maximum chunk size\nminchunksize: 
Minimum chunk size\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.make_knowledge_packs","page":"API Index","title":"DocsScraper.make_knowledge_packs","text":"make_knowledge_packs(crawlable_urls::Vector{<:AbstractString}=String[]; single_urls::Vector{<:AbstractString}=String[],\n max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE, model::AbstractString=MODEL, embedding_size::Int=EMBEDDING_SIZE, \n custom_metadata::AbstractString, bool_embeddings::Bool = true, index_name::AbstractString = \"\")\n\nEntry point to crawl, parse and generate embeddings. Note: We highly recommend to pass index_name. This will be the name of the generated index. Default: date-randomInt\n\nArguments\n\ncrawlable_urls: URLs that should be crawled to find more links\nsingle_urls: Single page URLs that should just be scraped and parsed. The crawler won't look for more URLs\nmaxchunksize: Maximum chunk size\nminchunksize: Minimum chunk size\nmodel: Embedding model\nembedding_size: Embedding dimensions\ncustom_metadata: Custom metadata like ecosystem name if required\nbool_embeddings: If true, embeddings generated will be boolean, Float32 otherwise\nindex_name: Name if the index. Default: date-randomInt\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.nav_bar-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.nav_bar","text":"nav_bar(url::AbstractString)\n\nJulia doc websites tend to have the package name under \".docs-package-name\" class in the HTML tree\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.parse_robots_txt!-Tuple{String}","page":"API Index","title":"DocsScraper.parse_robots_txt!","text":"parse_robots_txt!(robots_txt::String)\n\nParse the robots.txt string and return rules and the URLs on Sitemap\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.parse_url_to_blocks-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.parse_url_to_blocks","text":"parse_url(url::AbstractString)\n\nInitiator and main function to parse HTML from url. Return a Vector of Dict containing Heading/Text/Code along with a Dict of respective metadata\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.postprocess_chunks-Tuple{AbstractVector{<:AbstractString}, AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.postprocess_chunks","text":"function postprocess_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};\n min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true, paths::Union{Nothing,AbstractVector{<:AbstractString}}=nothing,\n websites::Union{Nothing,AbstractVector{<:AbstractString}}=nothing)\n\nPost-process the input list of chunks and their corresponding sources by removing short chunks and duplicates.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_code-Tuple{Gumbo.HTMLElement}","page":"API Index","title":"DocsScraper.process_code","text":"process_code(node::Gumbo.HTMLElement)\n\nProcess code snippets. 
If the current node is a code block, return the text inside code block with backticks.\n\nArguments\n\nnode: The root HTML node\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_docstring!","page":"API Index","title":"DocsScraper.process_docstring!","text":"process_docstring!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}},\n child_new::Bool=true,\n prev_text_buffer::IO=IOBuffer(write=true))\n\nFunction to process node of class docstring\n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\nchildnew: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsedblocks\nprevtextbuffer: IO Buffer which contains previous text\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.process_generic_node!","page":"API Index","title":"DocsScraper.process_generic_node!","text":"process_generic_node!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}},\n child_new::Bool=true,\n prev_text_buffer::IO=IOBuffer(write=true))\n\nIf the node is neither heading nor code\n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\nchildnew: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsedblocks\nprevtextbuffer: IO Buffer which contains previous text\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.process_headings!-Tuple{Gumbo.HTMLElement, Dict{Symbol, Any}, Vector{Dict{String, Any}}}","page":"API Index","title":"DocsScraper.process_headings!","text":"process_headings!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}})\n\nProcess headings. If the current node is heading, directly insert into parsed_blocks. 
\n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_hostname!-Tuple{AbstractString, Dict{AbstractString, Vector{AbstractString}}}","page":"API Index","title":"DocsScraper.process_hostname!","text":"process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})\n\nAdd url to its hostname in hostname_dict\n\nArguments\n\nurl: URL string\nhostname_dict: Dict with key being hostname and value being a vector of URLs\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_hostname-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.process_hostname","text":"process_hostname(url::AbstractString)\n\nReturn the hostname of an input URL\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_node!","page":"API Index","title":"DocsScraper.process_node!","text":"process_node!(node::Gumbo.HTMLElement,\n heading_hierarchy::Dict{Symbol,Any},\n parsed_blocks::Vector{Dict{String,Any}},\n child_new::Bool=true,\n prev_text_buffer::IO=IOBuffer(write=true))\n\nFunction to process a node\n\nArguments\n\nnode: The root HTML node \nheading_hierarchy: Dict used to store metadata\nparsed_blocks: Vector of Dicts to store parsed text and metadata\nchildnew: Bool to specify if the current block (child) is part of previous block or not. If it's not, then a new insertion needs to be created in parsedblocks\nprevtextbuffer: IO Buffer which contains previous text\n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.process_node!-Tuple{Gumbo.HTMLText, Vararg{Any}}","page":"API Index","title":"DocsScraper.process_node!","text":"multiple dispatch for process_node!() when node is of type Gumbo.HTMLText\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.process_paths-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.process_paths","text":"process_paths(url::AbstractString; max_chunk_size::Int=MAX_CHUNK_SIZE, min_chunk_size::Int=MIN_CHUNK_SIZE)\n\nProcess folders provided in paths. 
In each, take all HTML files, scrape them, chunk them and postprocess them.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.remove_duplicates-Tuple{AbstractVector{<:AbstractString}, AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.remove_duplicates","text":"remove_duplicates(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString})\n\nRemove chunks that are duplicated in the input list of chunks and their corresponding sources.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.remove_short_chunks-Tuple{AbstractVector{<:AbstractString}, AbstractVector{<:AbstractString}}","page":"API Index","title":"DocsScraper.remove_short_chunks","text":"remove_short_chunks(chunks::AbstractVector{<:AbstractString}, sources::AbstractVector{<:AbstractString};\n min_chunk_size::Int=MIN_CHUNK_SIZE, skip_code::Bool=true)\n\nRemove chunks that are shorter than a specified length (min_length) from the input list of chunks and their corresponding sources.\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.remove_urls_from_index","page":"API Index","title":"DocsScraper.remove_urls_from_index","text":"function remove_urls_from_index(index_path::AbstractString, prefix_urls=Vector{<:AbstractString})\n\nRemove chunks and sources corresponding to URLs starting with prefix_urls \n\n\n\n\n\n","category":"function"},{"location":"#DocsScraper.report_artifact-Tuple{Any}","page":"API Index","title":"DocsScraper.report_artifact","text":"report_artifact(fn_output)\n\nPrint artifact information\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.resolve_url-Tuple{String, String}","page":"API Index","title":"DocsScraper.resolve_url","text":"resolve_url(base_url::String, extracted_url::String)\n\nCheck the extracted URL with the original URL. Return empty String if the extracted URL belongs to a different domain. Return complete URL if there's a directory traversal paths or the extracted URL belongs to the same domain as the base_url\n\nArguments\n\nbase_url: URL of the page from which other URLs are being extracted\nextractedurl: URL extracted from the baseurl \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.roll_up_chunks-Tuple{Vector{Dict{String, Any}}, AbstractString}","page":"API Index","title":"DocsScraper.roll_up_chunks","text":"roll_up_chunks(parsed_blocks::Vector{Dict{String,Any}}, url::AbstractString; separator::String=\"\")\n\nRoll-up chunks (that have the same header!), so we can split them later by to get the desired length\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.text_before_version-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.text_before_version","text":"text_before_version(url::AbstractString)\n\nReturn text before \"stable\" or \"dev\" or any version in URL. 
It is generally observed that doc websites have package names before their versions \n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.url_package_name-Tuple{AbstractString}","page":"API Index","title":"DocsScraper.url_package_name","text":"url_package_name(url::AbstractString)\n\nReturn the text if the URL itself contains the package name with \".jl\" or \"_jl\" suffixes\n\n\n\n\n\n","category":"method"},{"location":"#DocsScraper.urls_for_metadata-Tuple{Vector{String}}","page":"API Index","title":"DocsScraper.urls_for_metadata","text":"urls_for_metadata(sources::Vector{String})\n\nReturn a Dict of package names with their associated URLs Note: Due to their large number, URLs are stripped down to the package name; Package subpaths are not included in metadata.\n\n\n\n\n\n","category":"method"},{"location":"#PromptingTools.Experimental.RAGTools.get_chunks-Tuple{DocsScraper.DocParserChunker, AbstractString}","page":"API Index","title":"PromptingTools.Experimental.RAGTools.get_chunks","text":"RT.get_chunks(chunker::DocParserChunker, url::AbstractString;\n verbose::Bool=true, separators=[\"\n\n\", \". \", \" \", \" \"], maxchunksize::Int=MAXCHUNKSIZE)\n\nExtract chunks from HTML files, by parsing the content in the HTML, rolling up chunks by headers, and splits them by separators to get the desired length.\n\nArguments\n\nchunker: DocParserChunker\nurl: URL of the webpage to extract chunks\nverbose: Bool to print the log\nseparators: Chunk separators\nmaxchunksize Maximum chunk size\n\n\n\n\n\n","category":"method"}] }