Skip to content

Commit

Permalink
structured according to PkgTemplate, other changes
Browse files Browse the repository at this point in the history
  • Loading branch information
splendidbug committed Aug 11, 2024
1 parent ed41636 commit 6dc7aa3
Show file tree
Hide file tree
Showing 22 changed files with 557 additions and 435 deletions.
2 changes: 2 additions & 0 deletions .JuliaFormatter.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# See https://domluna.github.io/JuliaFormatter.jl/stable/ for a list of options
style = "sciml"
7 changes: 7 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/" # Location of package manifests
schedule:
interval: "weekly"
18 changes: 7 additions & 11 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ on:
push:
branches:
- main
tags: ['*']
tags: ["*"]
pull_request:
workflow_dispatch:
concurrency:
Expand All @@ -23,9 +23,8 @@ jobs:
fail-fast: false
matrix:
version:
<<#VERSIONS>>
- '<<&.>>'
<</VERSIONS>>
- "1.10"
- "nightly"
os:
- ubuntu-latest
arch:
Expand All @@ -52,13 +51,11 @@ jobs:
actions: write # needed to allow julia-actions/cache to proactively delete old caches that it has created
contents: write
statuses: write
pages: write
id-token: write
steps:
- uses: actions/checkout@v4
- uses: julia-actions/setup-julia@v2
with:
version: '1'
version: "1"
- uses: julia-actions/cache@v2
- name: Configure doc environment
shell: julia --project=docs --color=yes {0}
Expand All @@ -75,7 +72,6 @@ jobs:
shell: julia --project=docs --color=yes {0}
run: |
using Documenter: DocMeta, doctest
using <<&PKG>>
DocMeta.setdocmeta!(<<&PKG>>, :DocTestSetup, :(using <<&PKG>>); recursive=true)
doctest(<<&PKG>>)
<</HAS_DOCUMENTER>>
using DocsScraper
DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)
doctest(DocsScraper)
16 changes: 16 additions & 0 deletions .github/workflows/CompatHelper.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
name: CompatHelper
on:
schedule:
- cron: 0 0 1 * *
workflow_dispatch:
jobs:
CompatHelper:
runs-on: ubuntu-latest
steps:
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
run: julia -e 'using CompatHelper; CompatHelper.main()'
31 changes: 31 additions & 0 deletions .github/workflows/TagBot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: TagBot
on:
issue_comment:
types:
- created
workflow_dispatch:
inputs:
lookback:
default: "3"
permissions:
actions: read
checks: read
contents: write
deployments: read
issues: read
discussions: read
packages: read
pages: read
pull-requests: read
repository-projects: read
security-events: read
statuses: read
jobs:
TagBot:
if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
runs-on: ubuntu-latest
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
ssh: ${{ secrets.DOCUMENTER_KEY }}
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Ignore .env files
.env
knowledge_packs/
Manifest.toml
Manifest.toml
/Manifest.toml
/docs/Manifest.toml
/docs/build/
6 changes: 6 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cSpell.words": [
"eachmatch",
"postprocess"
]
}
2 changes: 1 addition & 1 deletion MIT → LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) {{{YEAR}}} {{{AUTHORS}}}
Copyright (c) Shreyas Agrawal @splendidbug and J S @svilupp

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
20 changes: 15 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,25 +1,35 @@
name = "RAGKit"
uuid = "74e640d8-05f4-4b4f-8742-56fc934b3f17"
authors = ["Shreyas Agrawal <48771895+splendidbug@users.noreply.github.com>"]
name = "DocsScraper"
uuid = "bd71d052-5e08-40cc-a492-eb4e8da4b649"
authors = ["Shreyas Agrawal @splendidbug and J S @svilupp"]
version = "0.1.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
DotEnv = "4dc1fcf4-5e3b-5448-94ab-0c38ec0385c1"
EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615"
Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"

[compat]
AbstractTrees = "0.4.5"
EzXML = "1.2.0"
Gumbo = "0.8.2"
HDF5 = "0.17.2"
HTTP = "1.10.4"
Inflate = "0.1.5"
PromptingTools = "0.36.0"
URIParser = "0.4.1"
URIs = "1.5.1"
Tar = "1.10.0"

[extras]
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Aqua", "Test"]
1 change: 0 additions & 1 deletion docs/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ Gumbo = "708ec375-b3d6-5a57-a7ce-8257bf98657a"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Inflate = "d25df0c9-e2be-5dd7-82c8-3ad0b3e990b9"
PkgTemplates = "14b8a8f1-9102-5b29-a752-f990bacb7fe1"
PromptingTools = "670122d1-24a8-4d70-bfce-740807c42192"
Tar = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
URIParser = "30578b45-9adc-5946-b283-645ec420af67"
Expand Down
23 changes: 11 additions & 12 deletions docs/make.jl
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
using Documenter: Documenter, makedocs, deploydocs
using PkgTemplates: PkgTemplates
using DocsScraper
using Documenter

DocMeta.setdocmeta!(DocsScraper, :DocTestSetup, :(using DocsScraper); recursive=true)

makedocs(;
modules=[PkgTemplates],
authors="Shreyas Agrawal <[email protected]>",
repo="https://github.com/splendidbug/RAGKit",
sitename="RAGKit.jl",
modules=[DocsScraper],
authors="Shreyas Agrawal @splendidbug and J S @svilupp",
sitename="DocsScraper.jl",
# format=Documenter.HTML(;
# repolink="https://github.com/splendidbug/RAGKit",
# canonical="https://juliaci.github.io/PkgTemplates.jl",
# canonical="https://Shreyas Agrawal.github.io/DocsScraper.jl",
# edit_link="master",
# assets=String[],
# ),
pages=[
"Home" => "index.md",
"User Guide" => "user.md",
"Developer Guide" => "developer.md",
"Migrating To PkgTemplates 0.7+" => "migrating.md",
],
)

deploydocs(;
repo="https://github.com/splendidbug/RAGKit",
repo="github.com/Shreyas Agrawal/DocsScraper.jl",
devbranch="main",
)
2 changes: 1 addition & 1 deletion docs/src/index.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# RAGKit
# DocsScraper

## Documentation

15 changes: 9 additions & 6 deletions src/RAGKit.jl → src/DocsScraper.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
module RAGKit
module DocsScraper
using HTTP, Gumbo, AbstractTrees, URIs
using Gumbo: HTMLDocument, HTMLElement
using EzXML
using Pkg
Pkg.develop(PackageSpec(path="C:\\Users\\shrey\\Desktop\\stuff\\assignments\\grad\\projects\\Julia\\PromptingTools.jl"))
using PromptingTools
const PT = PromptingTools
const RT = PromptingTools.Experimental.RAGTools
Expand All @@ -12,17 +14,18 @@ using Inflate

using SHA
using Serialization, URIs
# using Regex

# using Robots

include("parser.jl")
include("crawl.jl")
include("extract_urls.jl")
include("preparation.jl")

include("make_embeddings.jl")
export make_embeddings
include("make_knowledge_packs.jl")
export make_knowledge_packs, just_generate

include("user_preferences.jl")
include("utils.jl")
export remove_urls_from_index


end
32 changes: 9 additions & 23 deletions src/crawl.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@
"""
parse_robots_txt!(robots_txt::String)
Parses the robots.txt string and returns rules along with the URLs on Sitemap
# Arguments
- `robots_txt`: robots.txt as a string
Parse the robots.txt string and return its rules together with the URLs listed under Sitemap
"""
function parse_robots_txt!(robots_txt::String)
rules = Dict{String,Dict{String,Vector{String}}}()
Expand Down Expand Up @@ -40,17 +37,15 @@ end


"""
check_robots_txt(user_agent::AbstractString,
url::AbstractString)
check_robots_txt(user_agent::AbstractString, url::AbstractString)
Checks the robots.txt of a URL and returns a boolean representing if `user_agent` is allowed to crawl the input url
Check the robots.txt of a URL and return a boolean indicating whether `user_agent` is allowed to crawl the input URL, along with the sitemap URLs
# Arguments
- `user_agent`: user agent attempting to crawl the webpage
- `url`: input URL string
"""
function check_robots_txt(user_agent::AbstractString,
url::AbstractString)
function check_robots_txt(user_agent::AbstractString, url::AbstractString)

## TODO: Make a cache of rules for a quick lookup
# if (haskey(restricted_urls, url))
Expand Down Expand Up @@ -101,10 +96,7 @@ end
"""
get_base_url(url::AbstractString)
Extracts the base url.
# Arguments
- `url`: The url string of which, the base url needs to be extracted
Extract the base url
"""
function get_base_url(url::AbstractString)

Expand All @@ -118,10 +110,7 @@ end
"""
process_hostname(url::AbstractString)
Returns the hostname of an input URL
# Arguments
- `url`: URL string
Return the hostname of an input URL
"""
function process_hostname(url::AbstractString)
URI = URIs.URI(url)
Expand All @@ -133,7 +122,7 @@ end
"""
process_hostname(url::AbstractString, hostname_dict::Dict{AbstractString,Vector{AbstractString}})
Adds the `url` to it's hostname in `hostname_dict`
Add `url` to its hostname in `hostname_dict`
# Arguments
- `url`: URL string
Expand All @@ -154,10 +143,7 @@ end
"""
crawl(input_urls::Vector{<:AbstractString})
Crawls on the input URLs and returns a `hostname_url_dict` which is a dictionary with key being hostnames and the values being the URLs
# Arguments
- `input_urls`: A vector of input URLs
Crawl on the input URLs and return `hostname_url_dict`, a dictionary whose keys are hostnames and whose values are the URLs, together with `visited_url_set`
"""
function crawl(input_urls::Vector{<:AbstractString})

Expand Down Expand Up @@ -187,6 +173,6 @@ function crawl(input_urls::Vector{<:AbstractString})
end
end

return hostname_url_dict
return hostname_url_dict, visited_url_set

end
Loading

0 comments on commit 6dc7aa3

Please sign in to comment.