Skip to content

Commit

Permalink
Add support for a web crawler (superagent-ai#419)
Browse files (browse the repository at this point in the history)
* Add support for a web crawler

* Small tweaks
  • Loading branch information
homanp authored Sep 25, 2023
1 parent 459cf1d commit 7cccd1f
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 9 deletions.
15 changes: 12 additions & 3 deletions libs/superagent/app/datasource/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup as Soup
from langchain.docstore.document import Document
from langchain.document_loaders import (
GitLoader,
PyPDFLoader,
RecursiveUrlLoader,
TextLoader,
UnstructuredMarkdownLoader,
UnstructuredWordDocumentLoader,
Expand Down Expand Up @@ -120,9 +122,16 @@ def load_github(self):
return loader.load_and_split()

def load_webpage(self):
RemoteDepthReader = download_loader("RemoteDepthReader")
loader = RemoteDepthReader(depth=0)
return loader.load_langchain_documents(url=self.datasource.url)
loader = RecursiveUrlLoader(
url=self.datasource.url,
max_depth=2,
extractor=lambda x: Soup(x, "html.parser").text,
)
chunks = loader.load_and_split()
for chunk in chunks:
if "language" in chunk.metadata:
del chunk.metadata["language"]
return chunks

def load_notion(self):
metadata = json.loads(self.datasource.metadata)
Expand Down
15 changes: 14 additions & 1 deletion libs/superagent/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions libs/superagent/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ langsmith = "^0.0.39"
requests = "^2.31.0"
wolframalpha = "^5.0.0"
langchain = "^0.0.300"
bs4 = "^0.0.1"


[build-system]
Expand Down
10 changes: 5 additions & 5 deletions libs/ui/app/datasources/data-table.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,10 @@ export function DataTable<TData, TValue>({
size="sm"
onClick={() => {
setSelectedType("webpage")
form.setValue("type", "URL")
form.setValue("type", "WEBPAGE")
}}
>
Add URLs
Add webpage
</Button>
</Alert>
<Alert className="flex items-center justify-between">
Expand Down Expand Up @@ -355,10 +355,10 @@ export function DataTable<TData, TValue>({
name="url"
render={({ field }) => (
<FormItem>
<FormLabel>URLs</FormLabel>
<FormLabel>URL</FormLabel>
<FormControl>
<Textarea
placeholder="Comma separeted list of URLs."
<Input
placeholder="E.g https://www.superagent.sh"
{...field}
/>
</FormControl>
Expand Down

0 comments on commit 7cccd1f

Please sign in to comment.