diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..e3b8557 --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,242 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.10.2" +manifest_format = "2.0" +project_hash = "349e311d5cd42e1e3153d8d68dd0a7ecc199edb7" + +[[deps.AbstractTrees]] +git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177" +uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +version = "0.4.5" + +[[deps.ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" + +[[deps.Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[deps.BitFlags]] +git-tree-sha1 = "2dc09997850d68179b69dafb58ae806167a32b1b" +uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35" +version = "0.1.8" + +[[deps.CodecZlib]] +deps = ["TranscodingStreams", "Zlib_jll"] +git-tree-sha1 = "59939d8a997469ee05c4b4944560a820f9ba0d73" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.7.4" + +[[deps.ConcurrentUtilities]] +deps = ["Serialization", "Sockets"] +git-tree-sha1 = "6cbbd4d241d7e6579ab354737f4dd95ca43946e1" +uuid = "f0e56b4a-5159-44fe-b623-3e5288b988bb" +version = "2.4.1" + +[[deps.Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" + +[[deps.ExceptionUnwrapping]] +deps = ["Test"] +git-tree-sha1 = "dcb08a0d93ec0b1cdc4af184b26b591e9695423a" +uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4" +version = "0.1.10" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.Gumbo]] +deps = ["AbstractTrees", "Gumbo_jll", "Libdl"] +git-tree-sha1 = "a1a138dfbf9df5bace489c7a9d5196d6afdfa140" +uuid = "708ec375-b3d6-5a57-a7ce-8257bf98657a" +version = "0.8.2" + +[[deps.Gumbo_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "29070dee9df18d9565276d68a596854b1764aa38" +uuid = "528830af-5a63-567c-a44a-034ed33b8444" +version = "0.10.2+0" + +[[deps.HTTP]] +deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] +git-tree-sha1 = "d1d712be3164d61d1fb98e7ce9bcbc6cc06b45ed" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "1.10.8" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.5.0" + +[[deps.LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.4" + +[[deps.LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "8.4.0+0" + +[[deps.LibGit2]] +deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[deps.LibGit2_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] +uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" +version = "1.6.4+0" + +[[deps.LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.11.0+1" + +[[deps.Libdl]] +uuid = 
"8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[deps.LoggingExtras]] +deps = ["Dates", "Logging"] +git-tree-sha1 = "c1dd6d7978c12545b4179fb6153b9250c96b0075" +uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36" +version = "1.0.3" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.MbedTLS]] +deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "NetworkOptions", "Random", "Sockets"] +git-tree-sha1 = "c067a280ddc25f196b5e7df3877c6b226d390aaf" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = "1.1.9" + +[[deps.MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+1" + +[[deps.MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2023.1.10" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.OpenSSL]] +deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] +git-tree-sha1 = "38cb508d080d21dc1128f7fb04f20387ed4c0af4" +uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c" +version = "1.4.3" + +[[deps.OpenSSL_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "3da7367955dcc5c54c1ba4d402ccdc09a1a3e046" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "3.0.13+1" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.10.0" + +[[deps.Preferences]] +deps = ["TOML"] +git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.4.3" + +[[deps.Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[deps.REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[deps.Random]] +deps = ["SHA"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.SimpleBufferStream]] +git-tree-sha1 = "874e8867b33a00e784c8a7e4b60afe9e037b74e1" +uuid = "777ac1f9-54b0-4bf8-805c-2214025038e7" +version = "1.1.0" + +[[deps.Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[deps.TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" + +[[deps.Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" + +[[deps.Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.TranscodingStreams]] +git-tree-sha1 = "5d54d076465da49d6746c647022f3b3674e64156" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.10.8" +weakdeps = ["Random", "Test"] + + [deps.TranscodingStreams.extensions] + TestExt = ["Test", "Random"] + +[[deps.URIs]] +git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b" +uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +version = "1.5.1" + +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[deps.Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[deps.Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+1" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = 
"8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.52.0+1" + +[[deps.p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+2" diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..17303a7 --- /dev/null +++ b/Project.toml @@ -0,0 +1,16 @@ +name = "RAGKit" +uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17" +authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"] +version = "0.1.0" + +[deps] +AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" + +[compat] +AbstractTrees = "0.4.5" +Gumbo = "0.8.2" +HTTP = "1.10.4" +URIs = "1.5.1" diff --git a/README.md b/README.md new file mode 100644 index 0000000..1ea2386 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# DocsScraper: "A document scraping and parsing tool used to create a custom RAG database for AIHelpMe.jl" + +This tool is used to collect and parse Julia's extensive documentation. + +## Requirements + +```julia +using Pkg; +Pkg.instantiate() +``` + +## Usage +1. **Basic Usage**: +```julia + parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/") +``` + +## How it works +```parse_url(url::String)``` extracts the base URL and recursively parses the URL so that all the inner lying text and code is returned in the form of a Vector of Dict along with each text/code's metadata. + +Please note that this is merely a pre-release and more work needs to be done \ No newline at end of file diff --git a/src/RAGKit.jl b/src/RAGKit.jl new file mode 100644 index 0000000..36a1254 --- /dev/null +++ b/src/RAGKit.jl @@ -0,0 +1 @@ +include("parser.jl") diff --git a/src/parser.jl b/src/parser.jl new file mode 100644 index 0000000..d1fbb49 --- /dev/null +++ b/src/parser.jl @@ -0,0 +1,429 @@ +""" +Working: + +Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks +ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion. +For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks. +If the current node is a code block, return the text inside code block with backticks. +If the node is neither heading nor code, then we'll need to go deeper in the hierarchy. +if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td] +it is assumed that everything inside the tag is part of a single text block with inline code. +But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false. +To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration +that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again. 
+We indicate this by the return flag is_text_inserted.
+"""
+
+
+"""
+    insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        text_to_insert::AbstractString,
+        text_type::AbstractString)
+
+Insert the text into the parsed_blocks Vector.
+
+# Arguments
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- text_to_insert: Text to be inserted
+- text_type: Whether the inserted text is a heading, a code block, or plain text
+"""
+function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    text_to_insert::AbstractString,
+    text_type::AbstractString)
+
+    if !isempty(strip(text_to_insert))
+        push!(parsed_blocks,
+            Dict(text_type => strip(text_to_insert),
+                "metadata" => copy(heading_hierarchy)))
+    end
+end
+
+
+"""
+    process_headings!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}})
+
+Process headings. If the current node is a heading, insert it directly into parsed_blocks.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+"""
+function process_headings!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}})
+
+    tag_name = Gumbo.tag(node)
+    # Clear headings of equal or lower level; only numeric heading keys (:h1..:h6) are
+    # considered, so keys such as :docstring_header are left untouched
+    for k in collect(keys(heading_hierarchy))
+        if isdigit(last(string(k))) && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
+            delete!(heading_hierarchy, k)
+        end
+    end
+    heading_hierarchy[tag_name] = strip(Gumbo.text(node))
+    insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(node), "heading")
+
+    is_code_block = false
+    is_text_inserted = false
+    return "", is_code_block, is_text_inserted
+end
+
+"""
+    process_code(node::Gumbo.HTMLElement)
+
+Process code snippets. If the current node is a code block, return the text inside the code block wrapped in backticks.
+
+# Arguments
+- node: The root HTML node
+"""
+function process_code(node::Gumbo.HTMLElement)
+    is_code_block = false
+
+    # Start a new code block
+    if Gumbo.tag(node.parent) == :pre
+        class_name = getattr(node, "class", "")
+        if occursin("language", class_name)
+            match_result = match(r"language-(\S+)", class_name)
+            language = match_result !== nothing ? match_result.captures[1] : "julia"
+            code_content = "```$language " * strip(Gumbo.text(node)) * "```"
+        else
+            code_content = "```julia " * strip(Gumbo.text(node)) * "```"
+        end
+        is_code_block = true
+    else
+        code_content = "`" * strip(Gumbo.text(node)) * "`"
+    end
+    is_text_inserted = false
+    return code_content, is_code_block, is_text_inserted
+end
+
+"""
+    process_generic_node!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        child_new::Bool=true,
+        prev_text_buffer::IO=IOBuffer(write=true))
+
+Process nodes that are neither headings nor code blocks.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- child_new: Bool to specify if the current block (child) is part of the previous block or not.
+ If it's not, then a new insertion needs to be created in parsed_blocks +- prev_text_buffer: IO Buffer which contains previous text +""" +function process_generic_node!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + + seekstart(prev_text_buffer) + prev_text = read(prev_text_buffer, String) + + tag_name = Gumbo.tag(node) + text_to_insert = "" + # Recursively process the child node for text content + children = collect(AbstractTrees.children(node)) + num_children = length(children) + is_code_block = false + is_text_inserted = false + for (index, child) in enumerate(children) + # if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless, + # there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted. + + if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header] + received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + else + received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + end + + # changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call) + if is_text_inserted + text_to_insert = "" + prev_text = "" + take!(prev_text_buffer) + end + + # if is_code_block is true, means the received_text is a code block, hence needs to be put as a separate entry in parsed_blocks + if !isempty(strip(received_text)) && is_code_block == true + to_insert = String(take!(prev_text_buffer)) + if (!isempty(strip(to_insert))) + insert_parsed_data!(heading_hierarchy, parsed_blocks, to_insert, "text") + text_to_insert = "" + prev_text = "" + is_text_inserted = true + end + insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "code") + is_code_block = false + received_text = "" + end + + # if the code block is last part of the loop then due to is_code_block::bool, whole text_to_insert becomes code + # (code is returned with backticks. If code is not inline and is supposed to be a separate block, + # then this case is handled earlier where size of code>50 ) + if index == num_children + is_code_block = false + end + + if !isempty(strip(received_text)) + print(prev_text_buffer, " " * received_text) + text_to_insert = text_to_insert * " " * received_text + end + + end + + # if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence, + # reset the buffer return the text. 
+ if (child_new == false) + take!(prev_text_buffer) + print(prev_text_buffer, prev_text) + return text_to_insert, is_code_block, is_text_inserted + end + + # insert text_to_insert to parsed_blocks + # if we're insert text in current node level, then we should insert the previous text if available, + # otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird + if !isempty(strip(text_to_insert)) + insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") + is_text_inserted = true + end + + # following is so that in current recursive call, we appended prev_text_buffer, which we need to remove + take!(prev_text_buffer) + print(prev_text_buffer, prev_text) + return "", is_code_block, is_text_inserted +end + + +""" + process_docstring!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + +Function to process node of class `docstring` + +# Arguments +- node: The root HTML node +- heading_hierarchy: Dict used to store metadata +- parsed_blocks: Vector of Dicts to store parsed text and metadata +- child_new: Bool to specify if the current block (child) is part of previous block or not. + If it's not, then a new insertion needs to be created in parsed_blocks +- prev_text_buffer: IO Buffer which contains previous text +""" +function process_docstring!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + + seekstart(prev_text_buffer) + prev_text = read(prev_text_buffer, String) + is_code_block = false + is_text_inserted = false + + # Recursively process the child node for text content + children = collect(AbstractTrees.children(node)) + + # Insert previously collected text + to_insert = String(take!(prev_text_buffer)) + if (!isempty(strip(to_insert))) + insert_parsed_data!(heading_hierarchy, parsed_blocks, to_insert, "text") + prev_text = "" + is_text_inserted = true + end + + # Insert "header" + if Gumbo.tag(children[1]) == :header + heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1])) + insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") + end + + received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + + if !isempty(strip(received_text)) + insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text") + end + delete!(heading_hierarchy, :docstring_header) + + return "", is_code_block, is_text_inserted +end + +""" + process_node!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + +Function to process a node + +# Arguments +- node: The root HTML node +- heading_hierarchy: Dict used to store metadata +- parsed_blocks: Vector of Dicts to store parsed text and metadata +- child_new: Bool to specify if the current block (child) is part of previous block or not. 
+ If it's not, then a new insertion needs to be created in parsed_blocks
+- prev_text_buffer: IO Buffer which contains previous text
+"""
+function process_node!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    child_new::Bool=true,
+    prev_text_buffer::IO=IOBuffer(write=true))
+
+    tag_name = Gumbo.tag(node)
+    if startswith(string(tag_name), "h") && isdigit(last(string(tag_name)))
+        return process_headings!(node, heading_hierarchy, parsed_blocks)
+
+    elseif tag_name == :code
+        return process_code(node)
+
+    elseif tag_name == :article && getattr(node, "class", "") == "docstring"
+        return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+
+    end
+
+    return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+
+end
+
+
+"""
+Method of process_node!() for when the node is of type Gumbo.HTMLText.
+"""
+function process_node!(node::Gumbo.HTMLText, args...)
+    is_code_block = false
+    is_text_inserted = false
+    return strip(Gumbo.text(node)), is_code_block, is_text_inserted
+end
+
+
+"""
+    get_base_url(url::AbstractString)
+
+Extracts the base URL.
+
+# Arguments
+- `url`: The URL string from which the base URL is extracted
+"""
+function get_base_url(url::AbstractString)
+    parsed_url = URIs.URI(url)
+    # URIs.jl stores the port as a string, which is empty when no port is given
+    base_url = string(parsed_url.scheme, "://", parsed_url.host,
+        !isempty(parsed_url.port) ? ":" * string(parsed_url.port) : "", parsed_url.path)
+    return base_url
+end
+
+"""
+    get_html_content(root::Gumbo.HTMLElement)
+
+Returns the main content element of the HTML page. If none is found, returns the whole HTML root to parse.
+
+# Arguments
+- `root`: The HTML root from which content is extracted
+"""
+function get_html_content(root::Gumbo.HTMLElement)
+    target_ids = Set(["VPContent", "main_content_wrap", "pages-content"])
+    target_classes = Set(["content", "franklin-content"])
+
+    content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
+
+    # First try to find by ID
+    content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates)
+    if !isempty(content_by_id)
+        return only(content_by_id)
+    end
+
+    # Fall back to class if no ID matches
+    content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates)
+    if !isempty(content_by_class)
+        return only(content_by_class)
+    end
+
+    # Fall back to the root node if no class matches either
+    return root
+
+end
+
+
+"""
+    parse_url_to_blocks(urls::Vector{<:AbstractString})
+
+Initiator and main function to parse HTML from the given URLs.
+
+# Arguments
+- `urls`: Vector containing URL strings to parse
+
+# Returns
+- A Vector of Dicts containing headings/text/code along with a Dict of the respective metadata
+
+# Usage
+parsed_blocks = parse_url_to_blocks(["https://docs.julialang.org/en/v1/base/multi-threading/"])
+
+# Example
+Let the HTML be:
+
+<html>
+<body>
+<h1>Heading 1</h1>
+<h2>Heading 2</h2>
+<p>para 1</p>
+<h3>Heading 3</h3>
+<pre><code>this is my code block</code></pre>
+<h3>This is another h3 under Heading 2</h3>
+<p>This is a paragraph with inline code</p>
+<h2>Heading 2_2</h2>
+<p>para ewg</p>
+</body>
+</html>
+
+Output:
+Any[
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
+    Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
+]
+"""
+function parse_url_to_blocks(urls::Vector{<:AbstractString})
+
+    ## TODO: Check if you need parallel processing for multiple urls
+
+    parsed_blocks = Vector{Dict{String,Any}}()
+
+    for url in urls
+        @info "Parsing URL: $url"
+        # Start a fresh heading hierarchy for every URL so that metadata from one page
+        # does not leak into the blocks of the next
+        heading_hierarchy = Dict{Symbol,Any}()
+        base_url = get_base_url(url)
+        r = HTTP.get(base_url)
+        r_parsed = parsehtml(String(r.body))
+        # Getting title of the document
+        # title = [el
+        #          for el in AbstractTrees.PreOrderDFS(r_parsed.root)
+        #          if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
+
+        process_node!(get_html_content(r_parsed.root), heading_hierarchy, parsed_blocks)
+    end
+    return parsed_blocks
+end
diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..aee7e9c --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,7 @@
+using Test
+
+include(joinpath("..", "src", "RAGKit.jl"))
+
+@testset "RAGKit Tests" begin
+    # Test cases go here (see the smoke-test sketch below)
+end
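+
+# A minimal smoke-test sketch: it exercises parse_url_to_blocks end-to-end and needs
+# network access, so it is gated behind a hypothetical RAGKIT_ONLINE_TESTS environment
+# variable (the URL is the same example page used in the README).
+if get(ENV, "RAGKIT_ONLINE_TESTS", "") == "true"
+    @testset "parse_url_to_blocks smoke test" begin
+        blocks = parse_url_to_blocks(["https://docs.julialang.org/en/v1/base/multi-threading/"])
+        @test !isempty(blocks)
+        # every parsed block should carry its heading-hierarchy metadata
+        @test all(b -> haskey(b, "metadata"), blocks)
+    end
+end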