feature(web-scrape): add robots.txt compliance

- Add respect_robots_txt parameter (default: True)
- Implement _get_robots_parser() with caching
- Implement _is_allowed_by_robots() check
- Return clear error when blocked by robots.txt

Fixes #23
Author: Samkit Shah
Date:   2026-01-21 23:07:39 -06:00
Parent: ffff7d0758
Commit: 5e4d2331d5
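
How the new helpers behave, as a minimal sketch (example.com and its robots.txt rules are assumptions for illustration, not part of this commit):

    # Suppose https://example.com/robots.txt contains:
    #     User-agent: *
    #     Disallow: /admin
    allowed, reason = _is_allowed_by_robots("https://example.com/blog/post")
    # -> (True, "Allowed by robots.txt")
    allowed, reason = _is_allowed_by_robots("https://example.com/admin/users")
    # -> (False, "Blocked by robots.txt for path: /admin/users")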
@@ -3,15 +3,91 @@ Web Scrape Tool - Extract content from web pages.
Uses httpx for requests and BeautifulSoup for HTML parsing.
Returns clean text content from web pages.
Respects robots.txt by default for ethical scraping.
"""

from __future__ import annotations

from typing import Any, List
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import httpx
from bs4 import BeautifulSoup
from fastmcp import FastMCP

# Cache for robots.txt parsers (domain -> parser)
_robots_cache: dict[str, RobotFileParser | None] = {}

# User-Agent used for robots.txt fetches and checks - identifies as a bot for transparency
USER_AGENT = "AdenBot/1.0 (https://adenhq.com; web scraping tool)"

# Browser-like User-Agent for actual page requests
BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

def _get_robots_parser(base_url: str, timeout: float = 10.0) -> RobotFileParser | None:
    """
    Fetch and parse robots.txt for a domain.

    Args:
        base_url: Base URL of the domain (e.g., 'https://example.com')
        timeout: Timeout for fetching robots.txt

    Returns:
        RobotFileParser if robots.txt exists and was parsed, None otherwise
    """
    if base_url in _robots_cache:
        return _robots_cache[base_url]

    robots_url = f"{base_url}/robots.txt"
    parser = RobotFileParser()
    try:
        response = httpx.get(
            robots_url,
            headers={"User-Agent": USER_AGENT},
            follow_redirects=True,
            timeout=timeout,
        )
        if response.status_code == 200:
            parser.parse(response.text.splitlines())
            _robots_cache[base_url] = parser
            return parser
        else:
            # No robots.txt or an error status (4xx/5xx) - allow all by convention
            _robots_cache[base_url] = None
            return None
    except (httpx.TimeoutException, httpx.RequestError):
        # Can't fetch robots.txt - allow, but don't cache (failure might be temporary)
        return None
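
# Illustrative cache behavior (sketch, not part of this commit): the first call
# for a domain fetches its robots.txt over the network; later calls are served
# from _robots_cache.
#
#     parser = _get_robots_parser("https://example.com")  # network fetch
#     again = _get_robots_parser("https://example.com")   # cache hit
#     assert parser is again  # holds whenever the first result was cached
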
def _is_allowed_by_robots(url: str) -> tuple[bool, str]:
    """
    Check whether a URL is allowed by robots.txt.

    Args:
        url: Full URL to check

    Returns:
        Tuple of (allowed: bool, reason: str)
    """
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    path = parsed.path or "/"

    parser = _get_robots_parser(base_url)
    if parser is None:
        # No robots.txt found or couldn't fetch - all paths allowed
        return True, "No robots.txt found or not accessible"

    # Conservative check: require the path to be allowed for both our bot's
    # user-agent and the wildcard (*) group
    if parser.can_fetch(USER_AGENT, path) and parser.can_fetch("*", path):
        return True, "Allowed by robots.txt"
    else:
        return False, f"Blocked by robots.txt for path: {path}"
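
# Worked example (illustrative, not part of this commit): given a robots.txt of
#
#     User-agent: AdenBot
#     Disallow:
#
#     User-agent: *
#     Disallow: /private
#
# the spec's most-specific-group rule alone would let AdenBot fetch /private,
# but the conservative double check above still returns
# (False, "Blocked by robots.txt for path: /private").
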
def register_tools(mcp: FastMCP) -> None:
"""Register web scrape tools with the MCP server."""
@@ -22,6 +98,7 @@ def register_tools(mcp: FastMCP) -> None:
        selector: str | None = None,
        include_links: bool = False,
        max_length: int = 50000,
        respect_robots_txt: bool = True,
    ) -> dict:
"""
Scrape and extract text content from a webpage.
@@ -34,6 +111,7 @@ def register_tools(mcp: FastMCP) -> None:
            selector: CSS selector to target specific content (e.g., 'article', '.main-content')
            include_links: Include extracted links in the response
            max_length: Maximum length of extracted text (1000-500000)
            respect_robots_txt: Whether to respect robots.txt rules (default: True)

        Returns:
            Dict with scraped content (url, title, description, content, length) or error dict
@@ -43,6 +121,16 @@ def register_tools(mcp: FastMCP) -> None:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Check robots.txt if enabled
        if respect_robots_txt:
            allowed, reason = _is_allowed_by_robots(url)
            if not allowed:
                return {
                    "error": f"Scraping blocked: {reason}",
                    "blocked_by_robots_txt": True,
                    "url": url,
                }
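
        # Caller-side sketch (the tool's public name is elided by this diff;
        # "scrape_page" is a hypothetical stand-in):
        #     scrape_page("https://example.com/admin", respect_robots_txt=False)
        # skips the check above and proceeds straight to the fetch.
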
        # Validate max_length
        if max_length < 1000:
            max_length = 1000
@@ -53,7 +141,7 @@ def register_tools(mcp: FastMCP) -> None:
        response = httpx.get(
            url,
            headers={
-               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+               "User-Agent": BROWSER_USER_AGENT,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
            },
@@ -112,6 +200,7 @@ def register_tools(mcp: FastMCP) -> None:
"description": description,
"content": text,
"length": len(text),
"robots_txt_respected": respect_robots_txt,
}
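
        # Illustrative shape of a successful response (values are examples):
        #     {"url": "https://example.com/blog/post", "title": "...",
        #      "description": "...", "content": "...", "length": 1234,
        #      "robots_txt_respected": True}
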
        # Extract links if requested