diff --git a/Manifest.toml b/Manifest.toml new file mode 100644 index 0000000..e3b8557 --- /dev/null +++ b/Manifest.toml @@ -0,0 +1,242 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.10.2" +manifest_format = "2.0" +project_hash = "349e311d5cd42e1e3153d8d68dd0a7ecc199edb7" + +[[deps.AbstractTrees]] +git-tree-sha1 = "2d9c9a55f9c93e8887ad391fbae72f8ef55e1177" +uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +version = "0.4.5" + +[[deps.ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" + +[[deps.Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[deps.BitFlags]] +git-tree-sha1 = "2dc09997850d68179b69dafb58ae806167a32b1b" +uuid = "d1d4a3ce-64b1-5f1a-9ba4-7e7e69966f35" +version = "0.1.8" + +[[deps.CodecZlib]] +deps = ["TranscodingStreams", "Zlib_jll"] +git-tree-sha1 = "59939d8a997469ee05c4b4944560a820f9ba0d73" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.7.4" + +[[deps.ConcurrentUtilities]] +deps = ["Serialization", "Sockets"] +git-tree-sha1 = "6cbbd4d241d7e6579ab354737f4dd95ca43946e1" +uuid = "f0e56b4a-5159-44fe-b623-3e5288b988bb" +version = "2.4.1" + +[[deps.Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" + +[[deps.ExceptionUnwrapping]] +deps = ["Test"] +git-tree-sha1 = "dcb08a0d93ec0b1cdc4af184b26b591e9695423a" +uuid = "460bff9d-24e4-43bc-9d9f-a8973cb893f4" +version = "0.1.10" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.Gumbo]] +deps = ["AbstractTrees", "Gumbo_jll", "Libdl"] +git-tree-sha1 = "a1a138dfbf9df5bace489c7a9d5196d6afdfa140" +uuid = "708ec375-b3d6-5a57-a7ce-8257bf98657a" +version = "0.8.2" + +[[deps.Gumbo_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "29070dee9df18d9565276d68a596854b1764aa38" +uuid = "528830af-5a63-567c-a44a-034ed33b8444" +version = "0.10.2+0" + +[[deps.HTTP]] +deps = ["Base64", "CodecZlib", "ConcurrentUtilities", "Dates", "ExceptionUnwrapping", "Logging", "LoggingExtras", "MbedTLS", "NetworkOptions", "OpenSSL", "Random", "SimpleBufferStream", "Sockets", "URIs", "UUIDs"] +git-tree-sha1 = "d1d712be3164d61d1fb98e7ce9bcbc6cc06b45ed" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "1.10.8" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.JLLWrappers]] +deps = ["Artifacts", "Preferences"] +git-tree-sha1 = "7e5d6779a1e09a36db2a7b6cff50942a0a7d0fca" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.5.0" + +[[deps.LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.4" + +[[deps.LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "8.4.0+0" + +[[deps.LibGit2]] +deps = ["Base64", "LibGit2_jll", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[deps.LibGit2_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll"] +uuid = "e37daf67-58a4-590a-8e99-b0245dd2ffc5" +version = "1.6.4+0" + +[[deps.LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.11.0+1" + +[[deps.Libdl]] +uuid = 
"8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[deps.LoggingExtras]] +deps = ["Dates", "Logging"] +git-tree-sha1 = "c1dd6d7978c12545b4179fb6153b9250c96b0075" +uuid = "e6f89c97-d47a-5376-807f-9c37f3926c36" +version = "1.0.3" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.MbedTLS]] +deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "NetworkOptions", "Random", "Sockets"] +git-tree-sha1 = "c067a280ddc25f196b5e7df3877c6b226d390aaf" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = "1.1.9" + +[[deps.MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+1" + +[[deps.MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2023.1.10" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.OpenSSL]] +deps = ["BitFlags", "Dates", "MozillaCACerts_jll", "OpenSSL_jll", "Sockets"] +git-tree-sha1 = "38cb508d080d21dc1128f7fb04f20387ed4c0af4" +uuid = "4d8831e6-92b7-49fb-bdf8-b643e874388c" +version = "1.4.3" + +[[deps.OpenSSL_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl"] +git-tree-sha1 = "3da7367955dcc5c54c1ba4d402ccdc09a1a3e046" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "3.0.13+1" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.10.0" + +[[deps.Preferences]] +deps = ["TOML"] +git-tree-sha1 = "9306f6085165d270f7e3db02af26a400d580f5c6" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.4.3" + +[[deps.Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[deps.REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[deps.Random]] +deps = ["SHA"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.SimpleBufferStream]] +git-tree-sha1 = "874e8867b33a00e784c8a7e4b60afe9e037b74e1" +uuid = "777ac1f9-54b0-4bf8-805c-2214025038e7" +version = "1.1.0" + +[[deps.Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[deps.TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" + +[[deps.Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" + +[[deps.Test]] +deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[deps.TranscodingStreams]] +git-tree-sha1 = "5d54d076465da49d6746c647022f3b3674e64156" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.10.8" +weakdeps = ["Random", "Test"] + + [deps.TranscodingStreams.extensions] + TestExt = ["Test", "Random"] + +[[deps.URIs]] +git-tree-sha1 = "67db6cc7b3821e19ebe75791a9dd19c9b1188f2b" +uuid = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" +version = "1.5.1" + +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[deps.Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[deps.Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+1" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = 
"8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.52.0+1" + +[[deps.p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+2" diff --git a/Project.toml b/Project.toml new file mode 100644 index 0000000..17303a7 --- /dev/null +++ b/Project.toml @@ -0,0 +1,16 @@ +name = "RAGKit" +uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17" +authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"] +version = "0.1.0" + +[deps] +AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" +Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a" +HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" +URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4" + +[compat] +AbstractTrees = "0.4.5" +Gumbo = "0.8.2" +HTTP = "1.10.4" +URIs = "1.5.1" diff --git a/README.md b/README.md new file mode 100644 index 0000000..1ea2386 --- /dev/null +++ b/README.md @@ -0,0 +1,21 @@ +# DocsScraper: "A document scraping and parsing tool used to create a custom RAG database for AIHelpMe.jl" + +This tool is used to collect and parse Julia's extensive documentation. + +## Requirements + +```julia +using Pkg; +Pkg.instantiate() +``` + +## Usage +1. **Basic Usage**: +```julia + parsed_text = parse_url("https://docs.julialang.org/en/v1/base/multi-threading/") +``` + +## How it works +```parse_url(url::String)``` extracts the base URL and recursively parses the URL so that all the inner lying text and code is returned in the form of a Vector of Dict along with each text/code's metadata. + +Please note that this is merely a pre-release and more work needs to be done \ No newline at end of file diff --git a/src/RAGKit.jl b/src/RAGKit.jl new file mode 100644 index 0000000..36a1254 --- /dev/null +++ b/src/RAGKit.jl @@ -0,0 +1 @@ +include("parser.jl") diff --git a/src/parser.jl b/src/parser.jl new file mode 100644 index 0000000..d1fbb49 --- /dev/null +++ b/src/parser.jl @@ -0,0 +1,429 @@ +""" +Working: + +Since HTML structure is complex, we need to figure out when do we insert the extracted text in parsed_blocks +ie., should we add the text of child hierarchy and then insert or should we insert now and let the child hierarchy make another insertion. +For this we employ multiple checks. If the current node is heading, directly insert into parsed_blocks. +If the current node is a code block, return the text inside code block with backticks. +If the node is neither heading nor code, then we'll need to go deeper in the hierarchy. +if the current node's tag is from the list [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td] +it is assumed that everything inside the tag is part of a single text block with inline code. +But when we go deeper and if there is a code block with size > 50 chars, then our assumption was false. +To correct this, we first insert the previously extracted text, next we insert the current code and additionally indicate the parent recursion iteration +that the current iteration has inserted the previously parsed text, so there is no need for parent iteration to insert the text block again. 
+We indicate this by the return flag is_text_inserted.
+"""
+
+
+"""
+    insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        text_to_insert::AbstractString,
+        text_type::AbstractString)
+
+Insert the text into the parsed_blocks Vector.
+
+# Arguments
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- text_to_insert: Text to be inserted
+- text_type: Whether the inserted text is a heading, a code block, or plain text
+"""
+function insert_parsed_data!(heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    text_to_insert::AbstractString,
+    text_type::AbstractString)
+
+    if !isempty(strip(text_to_insert))
+        push!(parsed_blocks,
+            Dict(text_type => strip(text_to_insert),
+                "metadata" => copy(heading_hierarchy)))
+    end
+end
+
+
+"""
+    process_headings!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}})
+
+Process headings. If the current node is a heading, insert it directly into parsed_blocks.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+"""
+function process_headings!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}})
+
+    tag_name = Gumbo.tag(node)
+    # Clear headings of equal or lower level; only numeric heading keys (:h1..:h6) are
+    # considered, so keys such as :docstring_header are left untouched
+    for k in collect(keys(heading_hierarchy))
+        if isdigit(last(string(k))) && Base.parse(Int, last(string(k))) >= Base.parse(Int, last(string(tag_name)))
+            delete!(heading_hierarchy, k)
+        end
+    end
+    heading_hierarchy[tag_name] = strip(Gumbo.text(node))
+    insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(node), "heading")
+
+    is_code_block = false
+    is_text_inserted = false
+    return "", is_code_block, is_text_inserted
+end
+
+"""
+    process_code(node::Gumbo.HTMLElement)
+
+Process code snippets. If the current node is a code block, return the text inside the code block wrapped in backticks.
+
+# Arguments
+- node: The root HTML node
+"""
+function process_code(node::Gumbo.HTMLElement)
+    is_code_block = false
+
+    # Start a new code block
+    if Gumbo.tag(node.parent) == :pre
+        class_name = getattr(node, "class", "")
+        if occursin("language", class_name)
+            match_result = match(r"language-(\S+)", class_name)
+            language = match_result !== nothing ? match_result.captures[1] : "julia"
+            code_content = "```$language " * strip(Gumbo.text(node)) * "```"
+        else
+            code_content = "```julia " * strip(Gumbo.text(node)) * "```"
+        end
+        is_code_block = true
+    else
+        code_content = "`" * strip(Gumbo.text(node)) * "`"
+    end
+    is_text_inserted = false
+    return code_content, is_code_block, is_text_inserted
+end
+
+"""
+    process_generic_node!(node::Gumbo.HTMLElement,
+        heading_hierarchy::Dict{Symbol,Any},
+        parsed_blocks::Vector{Dict{String,Any}},
+        child_new::Bool=true,
+        prev_text_buffer::IO=IOBuffer(write=true))
+
+Process nodes that are neither headings nor code blocks.
+
+# Arguments
+- node: The root HTML node
+- heading_hierarchy: Dict used to store metadata
+- parsed_blocks: Vector of Dicts to store parsed text and metadata
+- child_new: Bool to specify if the current block (child) is part of the previous block or not.
+ If it's not, then a new insertion needs to be created in parsed_blocks +- prev_text_buffer: IO Buffer which contains previous text +""" +function process_generic_node!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + + seekstart(prev_text_buffer) + prev_text = read(prev_text_buffer, String) + + tag_name = Gumbo.tag(node) + text_to_insert = "" + # Recursively process the child node for text content + children = collect(AbstractTrees.children(node)) + num_children = length(children) + is_code_block = false + is_text_inserted = false + for (index, child) in enumerate(children) + # if the current tag belongs in the list, it is assumed that all the text/code should be part of a single paragraph/block, unless, + # there occurs a code block with >50 chars, then, previously parsed text is inserted first, then the code block is inserted. + + if tag_name in [:p, :li, :dt, :dd, :pre, :b, :strong, :i, :cite, :address, :em, :td, :a, :span, :header] + received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, false, prev_text_buffer) + else + received_text, is_code_block, is_text_inserted = process_node!(child, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + end + + # changing text_to_insert to "" to avoid inserting text_to_insert again (as it was inserted by the child recursion call) + if is_text_inserted + text_to_insert = "" + prev_text = "" + take!(prev_text_buffer) + end + + # if is_code_block is true, means the received_text is a code block, hence needs to be put as a separate entry in parsed_blocks + if !isempty(strip(received_text)) && is_code_block == true + to_insert = String(take!(prev_text_buffer)) + if (!isempty(strip(to_insert))) + insert_parsed_data!(heading_hierarchy, parsed_blocks, to_insert, "text") + text_to_insert = "" + prev_text = "" + is_text_inserted = true + end + insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "code") + is_code_block = false + received_text = "" + end + + # if the code block is last part of the loop then due to is_code_block::bool, whole text_to_insert becomes code + # (code is returned with backticks. If code is not inline and is supposed to be a separate block, + # then this case is handled earlier where size of code>50 ) + if index == num_children + is_code_block = false + end + + if !isempty(strip(received_text)) + print(prev_text_buffer, " " * received_text) + text_to_insert = text_to_insert * " " * received_text + end + + end + + # if child_new is false, this means new child (new entry in parsed_blocks) should not be created, hence, + # reset the buffer return the text. 
+ if (child_new == false) + take!(prev_text_buffer) + print(prev_text_buffer, prev_text) + return text_to_insert, is_code_block, is_text_inserted + end + + # insert text_to_insert to parsed_blocks + # if we're insert text in current node level, then we should insert the previous text if available, + # otherwise it'll be inserted when the control goes back to the parent call and hence, order of the insertion will be weird + if !isempty(strip(text_to_insert)) + insert_parsed_data!(heading_hierarchy, parsed_blocks, String(take!(prev_text_buffer)), "text") + is_text_inserted = true + end + + # following is so that in current recursive call, we appended prev_text_buffer, which we need to remove + take!(prev_text_buffer) + print(prev_text_buffer, prev_text) + return "", is_code_block, is_text_inserted +end + + +""" + process_docstring!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + +Function to process node of class `docstring` + +# Arguments +- node: The root HTML node +- heading_hierarchy: Dict used to store metadata +- parsed_blocks: Vector of Dicts to store parsed text and metadata +- child_new: Bool to specify if the current block (child) is part of previous block or not. + If it's not, then a new insertion needs to be created in parsed_blocks +- prev_text_buffer: IO Buffer which contains previous text +""" +function process_docstring!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + + seekstart(prev_text_buffer) + prev_text = read(prev_text_buffer, String) + is_code_block = false + is_text_inserted = false + + # Recursively process the child node for text content + children = collect(AbstractTrees.children(node)) + + # Insert previously collected text + to_insert = String(take!(prev_text_buffer)) + if (!isempty(strip(to_insert))) + insert_parsed_data!(heading_hierarchy, parsed_blocks, to_insert, "text") + prev_text = "" + is_text_inserted = true + end + + # Insert "header" + if Gumbo.tag(children[1]) == :header + heading_hierarchy[:docstring_header] = strip(Gumbo.text(children[1])) + insert_parsed_data!(heading_hierarchy, parsed_blocks, Gumbo.text(children[1]), "docstring_header") + end + + received_text, is_code_block, is_text_inserted = process_node!(children[2], heading_hierarchy, parsed_blocks, child_new, prev_text_buffer) + + if !isempty(strip(received_text)) + insert_parsed_data!(heading_hierarchy, parsed_blocks, received_text, "text") + end + delete!(heading_hierarchy, :docstring_header) + + return "", is_code_block, is_text_inserted +end + +""" + process_node!(node::Gumbo.HTMLElement, + heading_hierarchy::Dict{Symbol,Any}, + parsed_blocks::Vector{Dict{String,Any}}, + child_new::Bool=true, + prev_text_buffer::IO=IOBuffer(write=true)) + +Function to process a node + +# Arguments +- node: The root HTML node +- heading_hierarchy: Dict used to store metadata +- parsed_blocks: Vector of Dicts to store parsed text and metadata +- child_new: Bool to specify if the current block (child) is part of previous block or not. 
+ If it's not, then a new insertion needs to be created in parsed_blocks
+- prev_text_buffer: IO Buffer which contains previous text
+"""
+function process_node!(node::Gumbo.HTMLElement,
+    heading_hierarchy::Dict{Symbol,Any},
+    parsed_blocks::Vector{Dict{String,Any}},
+    child_new::Bool=true,
+    prev_text_buffer::IO=IOBuffer(write=true))
+
+    tag_name = Gumbo.tag(node)
+    if startswith(string(tag_name), "h") && isdigit(last(string(tag_name)))
+        return process_headings!(node, heading_hierarchy, parsed_blocks)
+
+    elseif tag_name == :code
+        return process_code(node)
+
+    elseif tag_name == :article && getattr(node, "class", "") == "docstring"
+        return process_docstring!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+
+    end
+
+    return process_generic_node!(node, heading_hierarchy, parsed_blocks, child_new, prev_text_buffer)
+
+end
+
+
+"""
+Method of process_node!() for when the node is of type Gumbo.HTMLText.
+"""
+function process_node!(node::Gumbo.HTMLText, args...)
+    is_code_block = false
+    is_text_inserted = false
+    return strip(Gumbo.text(node)), is_code_block, is_text_inserted
+end
+
+
+"""
+    get_base_url(url::AbstractString)
+
+Extracts the base URL.
+
+# Arguments
+- `url`: The URL string from which the base URL is extracted
+"""
+function get_base_url(url::AbstractString)
+    parsed_url = URIs.URI(url)
+    # URIs.jl stores the port as a string, which is empty when no port is given
+    base_url = string(parsed_url.scheme, "://", parsed_url.host,
+        !isempty(parsed_url.port) ? ":" * string(parsed_url.port) : "", parsed_url.path)
+    return base_url
+end
+
+"""
+    get_html_content(root::Gumbo.HTMLElement)
+
+Returns the main content element of the HTML page. If none is found, returns the whole HTML root to parse.
+
+# Arguments
+- `root`: The HTML root from which content is extracted
+"""
+function get_html_content(root::Gumbo.HTMLElement)
+    target_ids = Set(["VPContent", "main_content_wrap", "pages-content"])
+    target_classes = Set(["content", "franklin-content"])
+
+    content_candidates = [el for el in AbstractTrees.PreOrderDFS(root) if el isa HTMLElement]
+
+    # First try to find by ID
+    content_by_id = filter(el -> getattr(el, "id", nothing) in target_ids, content_candidates)
+    if !isempty(content_by_id)
+        return only(content_by_id)
+    end
+
+    # Fall back to class if no ID matches
+    content_by_class = filter(el -> getattr(el, "class", nothing) in target_classes, content_candidates)
+    if !isempty(content_by_class)
+        return only(content_by_class)
+    end
+
+    # Fall back to the root node if no class matches either
+    return root
+
+end
+
+
+"""
+    parse_url_to_blocks(urls::Vector{<:AbstractString})
+
+Initiator and main function to parse HTML from the given URLs.
+
+# Arguments
+- `urls`: Vector containing URL strings to parse
+
+# Returns
+- A Vector of Dicts containing headings/text/code along with a Dict of the respective metadata
+
+# Usage
+parsed_blocks = parse_url_to_blocks(["https://docs.julialang.org/en/v1/base/multi-threading/"])
+
+# Example
+Let the HTML be:
+
+<html>
+<body>
+<h1>Heading 1</h1>
+<h2>Heading 2</h2>
+<p>para 1</p>
+<h3>Heading 3</h3>
+<pre><code>this is my code block</code></pre>
+<h3>This is another h3 under Heading 2</h3>
+<p>This is a paragraph with inline code</p>
+<h2>Heading 2_2</h2>
+<p>para ewg</p>
+</body>
+</html>
+
+Output:
+Any[
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1"), "heading" => "Heading 1")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "heading" => "Heading 2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2"), "text" => "para 1")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"), "heading" => "Heading 3")
+    Dict{String, Any}("code" => "```julia this is my code block```", "metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "Heading 3", "h2" => "Heading 2"))
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "heading" => "This is another h3 under Heading 2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h3" => "This is another h3 under Heading 2", "h2" => "Heading 2"), "text" => "This is a paragraph with inline code")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "heading" => "Heading 2_2")
+    Dict{String, Any}("metadata" => Dict{Any, Any}("h1" => "Heading 1", "h2" => "Heading 2_2"), "text" => "para ewg")
+]
+"""
+function parse_url_to_blocks(urls::Vector{<:AbstractString})
+
+    ## TODO: Check if you need parallel processing for multiple urls
+
+    parsed_blocks = Vector{Dict{String,Any}}()
+
+    for url in urls
+        @info "Parsing URL: $url"
+        # Start a fresh heading hierarchy for every URL so that metadata from one page
+        # does not leak into the blocks of the next
+        heading_hierarchy = Dict{Symbol,Any}()
+        base_url = get_base_url(url)
+        r = HTTP.get(base_url)
+        r_parsed = parsehtml(String(r.body))
+        # Getting title of the document
+        # title = [el
+        #          for el in AbstractTrees.PreOrderDFS(r_parsed.root)
+        #          if el isa HTMLElement && tag(el) == :title] .|> text |> Base.Fix2(join, " / ")
+
+        process_node!(get_html_content(r_parsed.root), heading_hierarchy, parsed_blocks)
+    end
+    return parsed_blocks
+end
diff --git a/test/runtests.jl b/test/runtests.jl new file mode 100644 index 0000000..aee7e9c --- /dev/null +++ b/test/runtests.jl @@ -0,0 +1,7 @@
+using Test
+
+include(joinpath("..", "src", "RAGKit.jl"))
+
+@testset "RAGKit Tests" begin
+    # Test cases go here (see the smoke-test sketch below)
+end
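+
+# A minimal smoke-test sketch: it exercises parse_url_to_blocks end-to-end and needs
+# network access, so it is gated behind a hypothetical RAGKIT_ONLINE_TESTS environment
+# variable (the URL is the same example page used in the README).
+if get(ENV, "RAGKIT_ONLINE_TESTS", "") == "true"
+    @testset "parse_url_to_blocks smoke test" begin
+        blocks = parse_url_to_blocks(["https://docs.julialang.org/en/v1/base/multi-threading/"])
+        @test !isempty(blocks)
+        # every parsed block should carry its heading-hierarchy metadata
+        @test all(b -> haskey(b, "metadata"), blocks)
+    end
+end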