From 467330de6e9627eff3d14066f7ad9b2ae4fe8b59 Mon Sep 17 00:00:00 2001 From: Jack Adamson Date: Thu, 28 Nov 2024 18:44:33 +0000 Subject: [PATCH 1/8] update fetch server to use readability JS if node is installed --- src/fetch/README.md | 2 ++ src/fetch/pyproject.toml | 2 +- src/fetch/src/mcp_server_fetch/server.py | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/fetch/README.md b/src/fetch/README.md index ffdd01b0..f31a2435 100644 --- a/src/fetch/README.md +++ b/src/fetch/README.md @@ -16,6 +16,8 @@ Presently the server only supports fetching HTML content. ## Installation +Optionally: Install node.js, this will cause the fetch serve to use a different HTML simplifier that is more robust. + ### Using uv (recommended) When using [`uv`](https://docs.astral.sh/uv/) no specific installation is needed. We will diff --git a/src/fetch/pyproject.toml b/src/fetch/pyproject.toml index 25eac8d8..d9015e69 100644 --- a/src/fetch/pyproject.toml +++ b/src/fetch/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "mcp-server-fetch" -version = "0.1.2" +version = "0.1.3" description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs" readme = "README.md" requires-python = ">=3.10" diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index 04ecad3c..6cec81e9 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -24,11 +24,13 @@ def extract_content(html: str) -> str: - ret = readabilipy.simple_json.simple_json_from_html_string(html) + ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=True + ) if not ret["plain_content"]: return "Page failed to be simplified from HTML" content = markdownify.markdownify( - ret["plain_content"], + ret["content"], heading_style=markdownify.ATX, ) return content From 37622d3872ef7fcf7909cdf20952a4fc70ac0515 Mon Sep 17 00:00:00 2001 From: Jack Adamson Date: 
Fri, 29 Nov 2024 11:51:41 +0000 Subject: [PATCH 2/8] add handling of non-html pages --- src/fetch/src/mcp_server_fetch/server.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index 6cec81e9..8caf0da0 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -23,11 +23,11 @@ DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)" -def extract_content(html: str) -> str: +def extract_content_from_html(html: str) -> str: ret = readabilipy.simple_json.simple_json_from_html_string( html, use_readability=True ) - if not ret["plain_content"]: + if not ret["content"]: return "Page failed to be simplified from HTML" content = markdownify.markdownify( ret["content"], @@ -105,13 +105,18 @@ async def fetch_url(url: str, user_agent: str) -> str: f"Failed to fetch {url} - status code {response.status_code}", ) - page_html = response.text + page_raw = response.text - return extract_content(page_html) + content_type = response.headers.get("content-type", "") + if " Date: Fri, 29 Nov 2024 12:23:18 +0000 Subject: [PATCH 3/8] improve error message to model on fetch failure --- src/fetch/src/mcp_server_fetch/server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index 8caf0da0..9cc8c256 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -95,10 +95,10 @@ async def fetch_url(url: str, user_agent: str) -> str: async with AsyncClient() as client: try: response = await client.get( - url, follow_redirects=True, headers={"User-Agent": user_agent} + url, follow_redirects=True, headers={"User-Agent": user_agent}, timeout=30, ) - except HTTPError: - raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}") + 
except HTTPError as e: + raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}: {e!r}") if response.status_code >= 400: raise McpError( INTERNAL_ERROR, From e8dcd29427cef216c1866f04a08d49f6a461030a Mon Sep 17 00:00:00 2001 From: Jack Adamson Date: Fri, 29 Nov 2024 13:04:16 +0000 Subject: [PATCH 4/8] add pagination of fetches so models can avoid reading a full page if it's got the information it needs --- src/fetch/src/mcp_server_fetch/server.py | 30 +++++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index 9cc8c256..c1e1c88e 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -17,7 +17,7 @@ INTERNAL_ERROR, ) from protego import Protego -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, ValidationError DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)" DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)" @@ -89,7 +89,10 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str): ) -async def fetch_url(url: str, user_agent: str) -> str: +async def fetch_url(url: str, user_agent: str) -> (str, str): + """ + Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information. 
+ """ from httpx import AsyncClient, HTTPError async with AsyncClient() as client: @@ -109,13 +112,14 @@ async def fetch_url(url: str, user_agent: str) -> str: content_type = response.headers.get("content-type", "") if " list[Prompt]: @server.call_tool() async def call_tool(name, arguments: dict) -> list[TextContent]: - url = arguments.get("url") + try: + args = Fetch(**arguments) + except ValueError as e: + raise McpError(INVALID_PARAMS, str(e)) + + url = args.url if not url: raise McpError(INVALID_PARAMS, "URL is required") if not ignore_robots_txt: await check_may_autonomously_fetch_url(url, user_agent_autonomous) - content = await fetch_url(url, user_agent_autonomous) - return [TextContent(type="text", text=f"Contents of {url}:\n{content}")] + content, prefix = await fetch_url(url, user_agent_autonomous) + if len(content) > args.max_length: + content = content[args.start_index : args.start_index + args.max_length] + content += f"\n\nContent truncated. Call the fetch tool with a start_index of {args.start_index + args.max_length} to get more content." 
+ return [TextContent(type="text", text=f"{prefix}Contents of {url}:\n{content}")] @server.get_prompt() async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult: @@ -172,7 +184,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult: url = arguments["url"] try: - content = await fetch_url(url, user_agent_manual) + content, prefix = await fetch_url(url, user_agent_manual) # TODO: after SDK bug is addressed, don't catch the exception except McpError as e: return GetPromptResult( @@ -188,7 +200,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult: description=f"Contents of {url}", messages=[ PromptMessage( - role="user", content=TextContent(type="text", text=content) + role="user", content=TextContent(type="text", text=prefix + content) ) ], ) From b6710dae1a7694544c4d76aa8f71e653d947fddd Mon Sep 17 00:00:00 2001 From: Jack Adamson Date: Fri, 29 Nov 2024 13:16:09 +0000 Subject: [PATCH 5/8] add argument to fetch raw html --- src/fetch/src/mcp_server_fetch/server.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index c1e1c88e..2a404ed3 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -89,7 +89,7 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str): ) -async def fetch_url(url: str, user_agent: str) -> (str, str): +async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, str): """ Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information. """ @@ -111,7 +111,9 @@ async def fetch_url(url: str, user_agent: str) -> (str, str): page_raw = response.text content_type = response.headers.get("content-type", "") - if " list[Tool]: return [ Tool( name="fetch", - description="""Fetches a URL from the internet and extracts its contents as markdown. 
+ description="""Fetches a URL from the internet and optionally extracts its contents as markdown. Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.""", inputSchema=Fetch.model_json_schema(), @@ -170,7 +173,7 @@ async def call_tool(name, arguments: dict) -> list[TextContent]: if not ignore_robots_txt: await check_may_autonomously_fetch_url(url, user_agent_autonomous) - content, prefix = await fetch_url(url, user_agent_autonomous) + content, prefix = await fetch_url(url, user_agent_autonomous, force_raw=args.raw) if len(content) > args.max_length: content = content[args.start_index : args.start_index + args.max_length] content += f"\n\nContent truncated. Call the fetch tool with a start_index of {args.start_index + args.max_length} to get more content." From 5552af104ca0df9b05570351691fb614a024544f Mon Sep 17 00:00:00 2001 From: Jack Adamson Date: Fri, 29 Nov 2024 13:37:35 +0000 Subject: [PATCH 6/8] format with black --- src/fetch/src/mcp_server_fetch/server.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index 2a404ed3..9588445d 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -98,7 +98,10 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, async with AsyncClient() as client: try: response = await client.get( - url, follow_redirects=True, headers={"User-Agent": user_agent}, timeout=30, + url, + follow_redirects=True, + headers={"User-Agent": user_agent}, + timeout=30, ) except HTTPError as e: raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}: {e!r}") @@ -111,19 +114,30 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, page_raw = 
response.text content_type = response.headers.get("content-type", "") - is_page_html = " list[TextContent]: if not ignore_robots_txt: await check_may_autonomously_fetch_url(url, user_agent_autonomous) - content, prefix = await fetch_url(url, user_agent_autonomous, force_raw=args.raw) + content, prefix = await fetch_url( + url, user_agent_autonomous, force_raw=args.raw + ) if len(content) > args.max_length: content = content[args.start_index : args.start_index + args.max_length] content += f"\n\nContent truncated. Call the fetch tool with a start_index of {args.start_index + args.max_length} to get more content." From c820086b35e57f1254315d04443f2e04bc4431fd Mon Sep 17 00:00:00 2001 From: Jack Adamson Date: Fri, 29 Nov 2024 14:45:57 +0000 Subject: [PATCH 7/8] update README to reflect new capabilities --- src/fetch/README.md | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/fetch/README.md b/src/fetch/README.md index f31a2435..d7ecb6d4 100644 --- a/src/fetch/README.md +++ b/src/fetch/README.md @@ -2,21 +2,26 @@ A Model Context Protocol server that provides web content fetching capabilities. This server enables LLMs to retrieve and process content from web pages, converting HTML to markdown for easier consumption. -Presently the server only supports fetching HTML content. +The fetch tool will truncate the response, but by using the `start_index` argument, you can specify where to start the content extraction. This lets models read a webpage in chunks, until they find the information they need. ### Available Tools - `fetch` - Fetches a URL from the internet and extracts its contents as markdown. 
+ - `url` (string, required): URL to fetch + - `max_length` (integer, optional): Maximum number of characters to return (default: 5000) + - `start_index` (integer, optional): Start content from this character index (default: 0) + - `raw` (boolean, optional): Get raw content without markdown conversion (default: false) ### Prompts - **fetch** - Fetch a URL and extract its contents as markdown - - Argument: `url` (string, required): URL to fetch + - Arguments: + - `url` (string, required): URL to fetch ## Installation -Optionally: Install node.js, this will cause the fetch serve to use a different HTML simplifier that is more robust. +Optionally: Install node.js, this will cause the fetch server to use a different HTML simplifier that is more robust. ### Using uv (recommended) From ea42a21078d9964b023d66b17c024cbdc6207c85 Mon Sep 17 00:00:00 2001 From: Jack Adamson Date: Fri, 29 Nov 2024 14:54:06 +0000 Subject: [PATCH 8/8] add doc strings for readability and constrain types --- src/fetch/src/mcp_server_fetch/server.py | 48 +++++++++++++++++++----- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index 9588445d..a3c0f95b 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Optional, Tuple from urllib.parse import urlparse, urlunparse import markdownify @@ -17,13 +17,21 @@ INTERNAL_ERROR, ) from protego import Protego -from pydantic import BaseModel, Field, ValidationError +from pydantic import BaseModel, Field, AnyUrl, conint DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)" DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)" def extract_content_from_html(html: str) -> str: + """Extract and convert HTML content to Markdown 
format. + + Args: + html: Raw HTML content to process + + Returns: + Simplified markdown version of the content + """ ret = readabilipy.simple_json.simple_json_from_html_string( html, use_readability=True ) @@ -36,9 +44,17 @@ def extract_content_from_html(html: str) -> str: return content -def get_robots_txt_url(url: str) -> str: +def get_robots_txt_url(url: AnyUrl | str) -> str: + """Get the robots.txt URL for a given website URL. + + Args: + url: Website URL to get robots.txt for + + Returns: + URL of the robots.txt file + """ # Parse the URL into components - parsed = urlparse(url) + parsed = urlparse(str(url)) # Reconstruct the base URL with just scheme, netloc, and /robots.txt path robots_url = urlunparse((parsed.scheme, parsed.netloc, "/robots.txt", "", "", "")) @@ -46,7 +62,7 @@ def get_robots_txt_url(url: str) -> str: return robots_url -async def check_may_autonomously_fetch_url(url: str, user_agent: str): +async def check_may_autonomously_fetch_url(url: AnyUrl | str, user_agent: str) -> None: """ Check if the URL can be fetched by the user agent according to the robots.txt file. Raises a McpError if not. @@ -89,7 +105,9 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str): ) -async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, str): +async def fetch_url( + url: AnyUrl | str, user_agent: str, force_raw: bool = False +) -> Tuple[str, str]: """ Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information. 
""" @@ -98,7 +116,7 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, async with AsyncClient() as client: try: response = await client.get( - url, + str(url), follow_redirects=True, headers={"User-Agent": user_agent}, timeout=30, @@ -128,9 +146,13 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str, class Fetch(BaseModel): - url: str = Field(..., description="URL to fetch") - max_length: int = Field(5000, description="Maximum number of characters to return.") - start_index: int = Field( + """Parameters for fetching a URL.""" + + url: AnyUrl = Field(..., description="URL to fetch") + max_length: conint(gt=0, lt=1000000) = Field( + 5000, description="Maximum number of characters to return." + ) + start_index: conint(ge=0) = Field( 0, description="On return output starting at this character index, useful if a previous fetch was truncated and more context is required.", ) @@ -143,6 +165,12 @@ class Fetch(BaseModel): async def serve( custom_user_agent: Optional[str] = None, ignore_robots_txt: bool = False ) -> None: + """Run the fetch MCP server. + + Args: + custom_user_agent: Optional custom User-Agent string to use for requests + ignore_robots_txt: Whether to ignore robots.txt restrictions + """ server = Server("mcp-fetch") user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL