Skip to content

Commit

Permalink
Pass TestSearch benchmark consistently (Add browse_website TOKENS_TO_TRIGGER_SUMMARY) (Significant-Gravitas#5092)
Browse files Browse the repository at this point in the history

* Added SUMMARIZATION_TRIGGER_LENGTH: browse_website won't summarize content that's shorter than SUMMARIZATION_TRIGGER_LENGTH. It defaults to 250 characters, which is approximately 50 tokens.

* Refactor BrowserOptions

* Use tokens instead of length to trigger summarization

* Bugfix

* fix: Always return links even if not summarizing
feat: Increase the number of links returned from 5 to 20

---------

Co-authored-by: lc0rp <[email protected]>
Co-authored-by: James Collins <[email protected]>
  • Loading branch information
3 people authored Aug 1, 2023
1 parent a593c32 commit 3a2d08f
Showing 1 changed file with 23 additions and 15 deletions.
38 changes: 23 additions & 15 deletions autogpt/commands/web_selenium.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,23 @@

from __future__ import annotations

from autogpt.llm.utils.token_counter import count_string_tokens

COMMAND_CATEGORY = "web_browse"
COMMAND_CATEGORY_TITLE = "Web Browsing"

import logging
from pathlib import Path
from sys import platform
from typing import Optional, Type
from typing import Optional

from bs4 import BeautifulSoup
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeDriverService
from selenium.webdriver.chrome.webdriver import WebDriver as ChromeDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.options import ArgOptions as BrowserOptions
from selenium.webdriver.edge.options import Options as EdgeOptions
from selenium.webdriver.edge.service import Service as EdgeDriverService
from selenium.webdriver.edge.webdriver import WebDriver as EdgeDriver
Expand All @@ -38,9 +41,9 @@
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks
from autogpt.url_utils.validators import validate_url

BrowserOptions = ChromeOptions | EdgeOptions | FirefoxOptions | SafariOptions

FILE_DIR = Path(__file__).parent.parent
TOKENS_TO_TRIGGER_SUMMARY = 50
LINKS_TO_RETURN = 20


@command(
Expand All @@ -64,25 +67,30 @@ def browse_website(url: str, question: str, agent: Agent) -> str:
question (str): The question asked by the user
Returns:
Tuple[str, WebDriver]: The answer and links to the user and the webdriver
str: The answer and links to the user and the webdriver
"""
driver = None
try:
driver, text = scrape_text_with_selenium(url, agent)
add_header(driver)
if TOKENS_TO_TRIGGER_SUMMARY < count_string_tokens(text, agent.llm.name):
text = summarize_memorize_webpage(url, text, question, agent, driver)

links = scrape_links_with_selenium(driver, url)

# Limit links to LINKS_TO_RETURN
if len(links) > LINKS_TO_RETURN:
links = links[:LINKS_TO_RETURN]

return f"Answer gathered from website: {text}\n\nLinks: {links}"
except WebDriverException as e:
# These errors are often quite long and include lots of context.
# Just grab the first line.
msg = e.msg.split("\n")[0]
return f"Error: {msg}"

add_header(driver)
summary = summarize_memorize_webpage(url, text, question, agent, driver)
links = scrape_links_with_selenium(driver, url)

# Limit links to 5
if len(links) > 5:
links = links[:5]
close_browser(driver)
return f"Answer gathered from website: {summary}\n\nLinks: {links}"
finally:
if driver:
close_browser(driver)


def scrape_text_with_selenium(url: str, agent: Agent) -> tuple[WebDriver, str]:
Expand All @@ -96,7 +104,7 @@ def scrape_text_with_selenium(url: str, agent: Agent) -> tuple[WebDriver, str]:
"""
logging.getLogger("selenium").setLevel(logging.CRITICAL)

options_available: dict[str, Type[BrowserOptions]] = {
options_available: dict[str, BrowserOptions] = {
"chrome": ChromeOptions,
"edge": EdgeOptions,
"firefox": FirefoxOptions,
Expand Down

0 comments on commit 3a2d08f

Please sign in to comment.