feature(web-scrape): add robots.txt compliance
- Add respect_robots_txt parameter (default: True)
- Implement _get_robots_parser() with caching
- Implement _is_allowed_by_robots() check
- Return clear error when blocked by robots.txt

Fixes #23
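The allow/deny decision itself comes from the standard library's RobotFileParser. A minimal, self-contained illustration of the behaviour the new checks build on (the rules below are invented for the example):

from urllib.robotparser import RobotFileParser

# Parse a tiny robots.txt and query it per path - this mirrors what
# _is_allowed_by_robots() does against the fetched file.
parser = RobotFileParser()
parser.parse([
    "User-agent: *",
    "Disallow: /private/",
])

print(parser.can_fetch("AdenBot/1.0", "/private/page"))  # False - the wildcard rule applies
print(parser.can_fetch("AdenBot/1.0", "/public/page"))   # True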
@@ -3,15 +3,91 @@ Web Scrape Tool - Extract content from web pages.
Uses httpx for requests and BeautifulSoup for HTML parsing.
Returns clean text content from web pages.
Respects robots.txt by default for ethical scraping.
"""

from __future__ import annotations

from typing import Any, List
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import httpx
from bs4 import BeautifulSoup
from fastmcp import FastMCP

# Cache for robots.txt parsers (domain -> parser)
_robots_cache: dict[str, RobotFileParser | None] = {}

# User-Agent for the scraper - identifies as a bot for transparency
USER_AGENT = "AdenBot/1.0 (https://adenhq.com; web scraping tool)"

# Browser-like User-Agent for actual page requests
BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"


def _get_robots_parser(base_url: str, timeout: float = 10.0) -> RobotFileParser | None:
    """
    Fetch and parse robots.txt for a domain.

    Args:
        base_url: Base URL of the domain (e.g., 'https://example.com')
        timeout: Timeout for fetching robots.txt

    Returns:
        RobotFileParser if robots.txt exists and was parsed, None otherwise
    """
    if base_url in _robots_cache:
        return _robots_cache[base_url]

    robots_url = f"{base_url}/robots.txt"
    parser = RobotFileParser()

    try:
        response = httpx.get(
            robots_url,
            headers={"User-Agent": USER_AGENT},
            follow_redirects=True,
            timeout=timeout,
        )
        if response.status_code == 200:
            parser.parse(response.text.splitlines())
            _robots_cache[base_url] = parser
            return parser
        else:
            # No robots.txt or error (4xx/5xx) - allow all by convention
            _robots_cache[base_url] = None
            return None
    except (httpx.TimeoutException, httpx.RequestError):
        # Can't fetch robots.txt - allow but don't cache (might be temporary)
        return None
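
# Illustrative only (not part of this diff): repeat lookups for the same base URL are
# answered from _robots_cache, so only the first call issues an HTTP request, while
# failed fetches (timeout / connection error) stay uncached so a transient outage does
# not pin a stale answer. The example domain below is assumed.
#
#   first = _get_robots_parser("https://example.com")   # fetches /robots.txt once
#   second = _get_robots_parser("https://example.com")  # served from _robots_cache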

def _is_allowed_by_robots(url: str) -> tuple[bool, str]:
    """
    Check if URL is allowed by robots.txt.

    Args:
        url: Full URL to check

    Returns:
        Tuple of (allowed: bool, reason: str)
    """
    parsed = urlparse(url)
    base_url = f"{parsed.scheme}://{parsed.netloc}"
    path = parsed.path or "/"

    parser = _get_robots_parser(base_url)

    if parser is None:
        # No robots.txt found or couldn't fetch - all paths allowed
        return True, "No robots.txt found or not accessible"

    # Check both our bot user-agent and wildcard
    if parser.can_fetch(USER_AGENT, path) and parser.can_fetch("*", path):
        return True, "Allowed by robots.txt"
    else:
        return False, f"Blocked by robots.txt for path: {path}"
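
# Illustrative only (not part of this diff): expected responses for a host whose
# robots.txt disallows /admin/ for all user agents (example paths are assumed).
#
#   _is_allowed_by_robots("https://example.com/admin/settings")
#   -> (False, "Blocked by robots.txt for path: /admin/settings")
#
#   _is_allowed_by_robots("https://example.com/blog/post")
#   -> (True, "Allowed by robots.txt")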

def register_tools(mcp: FastMCP) -> None:
    """Register web scrape tools with the MCP server."""
@@ -22,6 +98,7 @@ def register_tools(mcp: FastMCP) -> None:
        selector: str | None = None,
        include_links: bool = False,
        max_length: int = 50000,
        respect_robots_txt: bool = True,
    ) -> dict:
        """
        Scrape and extract text content from a webpage.
@@ -34,6 +111,7 @@ def register_tools(mcp: FastMCP) -> None:
            selector: CSS selector to target specific content (e.g., 'article', '.main-content')
            include_links: Include extracted links in the response
            max_length: Maximum length of extracted text (1000-500000)
            respect_robots_txt: Whether to respect robots.txt rules (default: True)

        Returns:
            Dict with scraped content (url, title, description, content, length) or error dict
@@ -43,6 +121,16 @@ def register_tools(mcp: FastMCP) -> None:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Check robots.txt if enabled
        if respect_robots_txt:
            allowed, reason = _is_allowed_by_robots(url)
            if not allowed:
                return {
                    "error": f"Scraping blocked: {reason}",
                    "blocked_by_robots_txt": True,
                    "url": url,
                }

        # Validate max_length
        if max_length < 1000:
            max_length = 1000
@@ -53,7 +141,7 @@ def register_tools(mcp: FastMCP) -> None:
        response = httpx.get(
            url,
            headers={
-               "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+               "User-Agent": BROWSER_USER_AGENT,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
            },
@@ -112,6 +200,7 @@ def register_tools(mcp: FastMCP) -> None:
            "description": description,
            "content": text,
            "length": len(text),
            "robots_txt_respected": respect_robots_txt,
        }

        # Extract links if requested
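Downstream callers can branch on the new blocked_by_robots_txt key. A minimal sketch of caller-side handling, using an invented result dict whose keys are taken from this diff:

# Sketch only: `result` stands in for whatever the scrape tool returns.
result = {
    "error": "Scraping blocked: Blocked by robots.txt for path: /private/page",
    "blocked_by_robots_txt": True,
    "url": "https://example.com/private/page",
}

if result.get("blocked_by_robots_txt"):
    print(f"Skipping {result['url']}: {result['error']}")      # blocked by robots.txt
elif "error" in result:
    print(f"Scrape failed: {result['error']}")                  # other scrape errors
else:
    print(f"Got {result['length']} characters from {result['url']}")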