From 182d9ca6f9fdbadcc06a8f1854f655b094a80d77 Mon Sep 17 00:00:00 2001 From: Youssef Mohammed Abdelal Mohammed <85359600+YoussefMohamedym3@users.noreply.github.com> Date: Mon, 23 Feb 2026 14:57:04 +0200 Subject: [PATCH] feat(tools): add arXiv search and download tools (#5222) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(arxiv): implement search_papers and initial download_paper tools * feat(arxiv): improve PDF download handling with temp files and validation (WIP) Switch to NamedTemporaryFile for safer temp file handling Force export.arxiv.org domain for PDF downloads Add custom User-Agent header Validate Content-Type to ensure PDF response Improve error handling and cleanup logic Add timeout to requests Work in progress – download_paper still under refinement. * feat(arxiv): replace NamedTemporaryFile with module-level TemporaryDirectory Switch from NamedTemporaryFile(delete=False) to a shared _TEMP_DIR for the lifetime of the server process. Scopes file lifetime to the session, guarantees cleanup via atexit, and removes the need for manual file handle management. Expand README with full args/returns/error reference and implementation notes explaining the temp storage design decision. 
* test(arxiv): add comprehensive tests for search_papers and download_paper fix(arxiv): return structured error instead of raising on invalid PDF content type - Add full test coverage for search_papers (validation, success, id_list, errors) - Add full test coverage for download_paper (success, network errors, invalid content, cleanup) - Mock arxiv client and requests to isolate behavior - Ensure partial files are cleaned up on failure - Align download_paper behavior with tool contract (no exceptions, structured responses) * style(tools): apply ruff formatting to arxiv tool and update lockfile --- tools/README.md | 1 + tools/pyproject.toml | 2 + tools/src/aden_tools/tools/__init__.py | 2 + .../src/aden_tools/tools/arxiv_tool/README.md | 171 +++++++++++++ .../aden_tools/tools/arxiv_tool/__init__.py | 5 + .../aden_tools/tools/arxiv_tool/arxiv_tool.py | 225 +++++++++++++++++ tools/tests/tools/test_arxiv_tool.py | 234 ++++++++++++++++++ uv.lock | 35 +++ 8 files changed, 675 insertions(+) create mode 100644 tools/src/aden_tools/tools/arxiv_tool/README.md create mode 100644 tools/src/aden_tools/tools/arxiv_tool/__init__.py create mode 100644 tools/src/aden_tools/tools/arxiv_tool/arxiv_tool.py create mode 100644 tools/tests/tools/test_arxiv_tool.py diff --git a/tools/README.md b/tools/README.md index d9543663..86261e1b 100644 --- a/tools/README.md +++ b/tools/README.md @@ -106,6 +106,7 @@ python mcp_server.py | `patents_search`, `patents_get_details` | Search patents and retrieve patent details via SerpAPI | | `exa_search`, `exa_answer`, `exa_find_similar`, `exa_get_contents` | Semantic search and content retrieval via Exa AI | | `news_search`, `news_headlines`, `news_by_company`, `news_sentiment` | Search news articles and analyse sentiment | +| `search_papers`, `download_paper` | Search arXiv for scientific papers and download PDFs | ### Communication diff --git a/tools/pyproject.toml b/tools/pyproject.toml index 63cb822e..c1117516 100644 --- a/tools/pyproject.toml +++ 
b/tools/pyproject.toml @@ -33,6 +33,8 @@ dependencies = [ "resend>=2.0.0", "framework", "stripe>=14.3.0", + "arxiv>=2.1.0", + "requests>=2.31.0", ] [project.optional-dependencies] diff --git a/tools/src/aden_tools/tools/__init__.py b/tools/src/aden_tools/tools/__init__.py index 91d6781f..c5b53fa4 100644 --- a/tools/src/aden_tools/tools/__init__.py +++ b/tools/src/aden_tools/tools/__init__.py @@ -23,6 +23,7 @@ if TYPE_CHECKING: # Import register_tools from each tool module from .account_info_tool import register_tools as register_account_info from .apollo_tool import register_tools as register_apollo +from .arxiv_tool import register_tools as register_arxiv from .bigquery_tool import register_tools as register_bigquery from .calcom_tool import register_tools as register_calcom from .calendar_tool import register_tools as register_calendar @@ -96,6 +97,7 @@ def register_all_tools( register_pdf_read(mcp) register_time(mcp) register_runtime_logs(mcp) + register_arxiv(mcp) # Tools that need credentials (pass credentials if provided) # web_search supports multiple providers (Google, Brave) with auto-detection diff --git a/tools/src/aden_tools/tools/arxiv_tool/README.md b/tools/src/aden_tools/tools/arxiv_tool/README.md new file mode 100644 index 00000000..f013a061 --- /dev/null +++ b/tools/src/aden_tools/tools/arxiv_tool/README.md @@ -0,0 +1,171 @@ +# arXiv Tool + +Search and download scientific papers from arXiv. + +## Description + +Provides two tools for interacting with the arXiv preprint repository: + +- **`search_papers`** — Search for papers by keyword, author, title, or category with flexible sorting +- **`download_paper`** — Download a paper as a PDF to a temporary local file by arXiv ID + +## Arguments + +### `search_papers` + +| Argument | Type | Required | Default | Description | +| ------------- | --------- | -------- | -------------- | ---------------------------------------------------------------------- | +| `query` | str | Yes* | `""` | Search query. 
Supports field prefixes and boolean operators (see below) | +| `id_list` | list[str] | Yes* | `None` | Specific arXiv IDs to retrieve (e.g. `["1706.03762"]`) | +| `max_results` | int | No | `10` | Maximum number of results to return (capped at 100) | +| `sort_by` | str | No | `"relevance"` | Sort criterion: `"relevance"`, `"lastUpdatedDate"`, `"submittedDate"` | +| `sort_order` | str | No | `"descending"` | Sort direction: `"descending"` or `"ascending"` | + +\* At least one of `query` or `id_list` must be provided. + +**Query syntax:** + +- Field prefixes: `ti:` (title), `au:` (author), `abs:` (abstract), `cat:` (category) +- Boolean operators: `AND`, `OR`, `ANDNOT` (must be uppercase) +- Examples: `"ti:transformer AND au:vaswani"`, `"abs:multi-agent systems"` + +### `download_paper` + +| Argument | Type | Required | Default | Description | +| ---------- | ---- | -------- | ------- | ------------------------------------------------------------------------ | +| `paper_id` | str | Yes | - | arXiv paper ID, with or without version (e.g. `"2207.13219"`, `"2207.13219v4"`) | + +## Environment Variables + +No API credentials required. arXiv is a publicly accessible repository. + +## Example Usage + +```python +# Keyword search +result = search_papers(query="multi-agent reinforcement learning") + +# Search by title and author +result = search_papers(query="ti:attention AND au:vaswani", max_results=5) + +# Search by category, sorted by submission date +result = search_papers( + query="cat:cs.LG", + sort_by="submittedDate", + sort_order="descending", + max_results=20, +) + +# Retrieve specific papers by ID +result = search_papers(id_list=["1706.03762", "2005.14165"]) + +# Download a paper as a PDF +result = download_paper(paper_id="1706.03762") +# result["file_path"] → "/tmp/arxiv_papers_/Attention_Is_All_You_Need_1706_03762_.pdf" +# Files are stored in a shared managed directory for the lifetime of the server process. 
+# No cleanup needed — the directory is automatically deleted on process exit. +``` + +## Return Values + +### `search_papers` — success + +Results are truncated to one entry for brevity; `"total"` reflects the actual count returned. + +```json +{ + "success": true, + "query": "multi-agent reinforcement learning", + "id_list": [], + "results": [ + { + "id": "2203.08975v2", + "title": "A Survey of Multi-Agent Deep Reinforcement Learning with Communication", + "summary": "Communication is an effective mechanism for coordinating the behaviors of multiple agents...", + "published": "2022-03-16", + "authors": [ + "Changxi Zhu", + "Mehdi Dastani", + "Shihan Wang" + ], + "pdf_url": "https://arxiv.org/pdf/2203.08975v2", + "categories": [ + "cs.MA", + "cs.LG" + ] + } + ], + "total": 10 +} +``` + +When using `id_list`, `"query"` is returned as an empty string and `"id_list"` echoes the requested IDs: + +```json +{ + "success": true, + "query": "", + "id_list": [ + "1706.03762", + "2005.14165" + ], + "results": ["..."], + "total": 2 +} +``` + +### `download_paper` — success + +```json +{ + "success": true, + "file_path": "/tmp/arxiv_papers_/Attention_Is_All_You_Need_1706_03762_.pdf", + "paper_id": "1706.03762" +} +``` + +## Error Handling + +All errors return `{"success": false, "error": "..."}`. + +### `search_papers` + +| Error message | Cause | +|---|---| +| `Invalid Request: You must provide either a 'query' or an 'id_list'.` | Both `query` and `id_list` are empty | +| `arXiv specific error: ` | `arxiv.ArxivError` raised by the library | +| `Network unreachable.` | `ConnectionError` — no internet connectivity | +| `arXiv search failed: ` | Any other unexpected exception | + +```json +{ + "success": false, + "error": "Invalid Request: You must provide either a 'query' or an 'id_list'." 
+} +``` + +### `download_paper` + +| Error message | Cause | +|---|---| +| `No paper found with ID: ` | The arXiv ID does not exist | +| `PDF URL not available for this paper.` | Paper metadata has no PDF link | +| `Failed during download or write: ` | `requests` network error, OS write failure, or arXiv returned an unexpected content type (e.g. HTML error page instead of PDF) | +| `arXiv library error: ` | `arxiv.ArxivError` raised during metadata lookup | +| `Network error: ` | `ConnectionError` during metadata lookup | +| `Unexpected error: ` | Any other unexpected exception (partial file is cleaned up before returning) | + +```json +{ + "success": false, + "error": "No paper found with ID: 0000.00000" +} +``` +## Implementation Notes + +**PDF download** uses `requests.get` against `export.arxiv.org` (the designated programmatic subdomain) instead of the deprecated `Result.download_pdf()` helper. The 3-second rate limit only applies to the metadata API — the PDF download itself is a plain HTTPS file transfer and has no such restriction. + +**Temporary storage** — PDFs are written to a module-level `TemporaryDirectory`, cleaned up automatically on process exit via `atexit`. This is intentional: the PDF is a transient bridge between `download_paper` and `pdf_read_tool` — not a deliverable. Using `data_dir` (the framework's session workspace) would pollute `list_data_files` with unreadable binary blobs and accumulate files with no cleanup. `_TEMP_DIR` scopes the file to exactly as long as it's needed. + +**Known limitation:** +- **Resumable sessions** — if the process restarts mid-session, `_TEMP_DIR` is wiped and any checkpointed file path becomes invalid. This is unlikely to matter in practice since `pdf_read_tool` should be called immediately after `download_paper` in the same node. 
diff --git a/tools/src/aden_tools/tools/arxiv_tool/__init__.py b/tools/src/aden_tools/tools/arxiv_tool/__init__.py new file mode 100644 index 00000000..c60b155c --- /dev/null +++ b/tools/src/aden_tools/tools/arxiv_tool/__init__.py @@ -0,0 +1,5 @@ +"""ArXiv tool package.""" + +from .arxiv_tool import register_tools + +__all__ = ["register_tools"] diff --git a/tools/src/aden_tools/tools/arxiv_tool/arxiv_tool.py b/tools/src/aden_tools/tools/arxiv_tool/arxiv_tool.py new file mode 100644 index 00000000..cecb6eba --- /dev/null +++ b/tools/src/aden_tools/tools/arxiv_tool/arxiv_tool.py @@ -0,0 +1,225 @@ +""" +arXiv Tool - Search and download scientific papers. +""" + +import atexit +import os +import re +import tempfile +from typing import Literal +from urllib.parse import urlparse + +import arxiv +import requests +from fastmcp import FastMCP + +_SHARED_ARXIV_CLIENT = arxiv.Client(page_size=100, delay_seconds=3, num_retries=3) + +_TEMP_DIR = tempfile.TemporaryDirectory(prefix="arxiv_papers_") +atexit.register(_TEMP_DIR.cleanup) + + +def register_tools(mcp: FastMCP) -> None: + """Register arXiv tools with the MCP server.""" + + @mcp.tool() + def search_papers( + query: str = "", + id_list: list[str] | None = None, + max_results: int = 10, + sort_by: Literal["relevance", "lastUpdatedDate", "submittedDate"] = "relevance", + sort_order: Literal["descending", "ascending"] = "descending", + ) -> dict: + """ + Searches arXiv for scientific papers using keywords or specific IDs. + + CRITICAL: You MUST provide either a `query` OR an `id_list`. + + Args: + query (str): The search query (e.g., "multi-agent systems"). + Default is empty. + + QUERY SYNTAX & PREFIXES: + - Use prefixes: 'ti:' (Title), 'au:' (Author), + 'abs:' (Abstract), 'cat:' (Category). + - Boolean: AND, OR, ANDNOT (Must be capitalized). + - Example: "ti:transformer AND au:vaswani" + + id_list (list[str] | None): Specific arXiv IDs (e.g., ["1706.03762"]). + Use this to retrieve specific known papers. 
+ + max_results (int): Max results to return (default 10, capped at 100). + + sort_by (Literal): The sorting criterion. + Options: "relevance", "lastUpdatedDate", "submittedDate". + Default: "relevance". + + sort_order (Literal): The order of sorting. + Options: "descending", "ascending". + Default: "descending". + + Returns: + dict: { "success": bool, "query": str, "id_list": list[str], "results": list[dict], "total": int } + """ + + # VALIDATION: Ensure the Agent didn't send an empty request + if not query and not id_list: + return { + "success": False, + "error": "Invalid Request: You must provide either a 'query' or an 'id_list'.", + } + + # Prevent the agent from accidentally requesting too much data + max_results = min(max_results, 100) + + # INTERNAL MAPS: Bridge String (Agent) -> Enum Object (Library) + sort_criteria_map = { + "relevance": arxiv.SortCriterion.Relevance, + "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate, + "submittedDate": arxiv.SortCriterion.SubmittedDate, + } + sort_order_map = { + "descending": arxiv.SortOrder.Descending, + "ascending": arxiv.SortOrder.Ascending, + } + + try: + search = arxiv.Search( + query=query, + id_list=id_list or [], + max_results=max_results, + sort_by=sort_criteria_map.get(sort_by, arxiv.SortCriterion.Relevance), + sort_order=sort_order_map.get(sort_order, arxiv.SortOrder.Descending), + ) + + result_object = _SHARED_ARXIV_CLIENT.results(search) + results = [] + + # EXECUTION & SERIALIZATION + for r in result_object: + results.append( + { + "id": r.get_short_id(), + "title": r.title, + "summary": r.summary.replace("\n", " "), + "published": str(r.published.date()), + "authors": [a.name for a in r.authors], + "pdf_url": r.pdf_url, + "categories": r.categories, + } + ) + return { + "success": True, + "query": query, + "id_list": id_list or [], + "results": results, + "total": len(results), + } + except arxiv.ArxivError as e: + return {"success": False, "error": f"arXiv specific error: {e}"} + + except ConnectionError: + return {"success": False, "error": "Network
unreachable."} + except Exception as e: + return {"success": False, "error": f"arXiv search failed: {str(e)}"} + + @mcp.tool() + def download_paper(paper_id: str) -> dict: + """ + Downloads a paper from arXiv by its ID and saves it to a managed temporary directory + for the lifetime of the server process. + + Args: + paper_id (str): The arXiv identifier (e.g., "2207.13219v4"). + + Returns: + dict: { "success": bool, "file_path": str, "paper_id": str } + The file is valid until the server process exits. No cleanup needed. + """ + local_path = None + try: + # Find the PDF Link + search = arxiv.Search(id_list=[paper_id]) + results_generator = _SHARED_ARXIV_CLIENT.results(search) + paper = next(results_generator, None) + + if not paper: + return { + "success": False, + "error": f"No paper found with ID: {paper_id}", + } + + pdf_url = paper.pdf_url + + if not pdf_url: + return { + "success": False, + "error": "PDF URL not available for this paper.", + } + + parsed_url = urlparse(pdf_url) + pdf_url = parsed_url._replace(netloc="export.arxiv.org").geturl() + + # Clean the title to make it a valid filename + clean_title = re.sub(r"[^\w\s-]", "", paper.title).strip().replace(" ", "_") + clean_id = re.sub(r"[^\w\s-]", "_", paper_id) + prefix = f"{clean_title[:50]}_{clean_id}_" + + filename = f"{prefix}.pdf" + local_path = os.path.join(_TEMP_DIR.name, filename) + + try: + # Start the Stream + # stream=True prevents loading the entire file into memory + headers = {"User-Agent": "Hive-Agent/1.0 (https://github.com/adenhq/hive)"} + + # No rate limiting needed for PDF download. + # The 3-second rule only applies to the metadata API (export.arxiv.org/api/query), + # as explicitly stated in the arXiv API User Manual. + # This is a plain HTTPS file download (export.arxiv.org/pdf/...), not an API call. 
+ # The deprecated arxiv.py helper `Result.download_pdf()` confirms this — + # it was just a bare urlretrieve() call, + # with zero rate limiting or client involvement, + # because Result objects are pure data and hold no reference back to the Client. + response = requests.get(pdf_url, stream=True, timeout=60, headers=headers) + response.raise_for_status() + + content_type = response.headers.get("Content-Type", "") + if "pdf" not in content_type.lower(): + return { + "success": False, + "error": ( + f"Failed during download or write: Expected PDF content but got " + f"'{content_type}'. arXiv may have returned an error page." + ), + } + + with open(local_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + + except (requests.RequestException, OSError) as e: + if os.path.exists(local_path): + os.remove(local_path) + local_path = None # prevent double-deletion in the outer except + + return { + "success": False, + "error": f"Failed during download or write: {str(e)}", + } + + return { + "success": True, + "file_path": local_path, + "paper_id": paper_id, + } + + except arxiv.ArxivError as e: + return {"success": False, "error": f"arXiv library error: {str(e)}"} + except ConnectionError as e: + return {"success": False, "error": f"Network error: {str(e)}"} + except Exception as e: + if local_path and os.path.exists(local_path): + os.remove(local_path) + return {"success": False, "error": f"Unexpected error: {str(e)}"} diff --git a/tools/tests/tools/test_arxiv_tool.py b/tools/tests/tools/test_arxiv_tool.py new file mode 100644 index 00000000..febba157 --- /dev/null +++ b/tools/tests/tools/test_arxiv_tool.py @@ -0,0 +1,234 @@ +""" +Tests for the arXiv search and download tool. 
+ +Covers: +- search_papers: success, id_list lookup, validation, sorting, error handling +- download_paper: success, missing paper, no PDF URL, network error, + bad content type, file cleanup on error +- Tool registration +""" + +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import arxiv +from fastmcp import FastMCP + +from aden_tools.tools.arxiv_tool.arxiv_tool import register_tools + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_mcp() -> FastMCP: + mcp = FastMCP("test-arxiv") + register_tools(mcp) + return mcp + + +def _get_tool(mcp: FastMCP, name: str): + """Return the raw callable for a registered tool by name.""" + return mcp._tool_manager._tools[name].fn + + +def _make_arxiv_result( + short_id="1706.03762", + title="Attention Is All You Need", + summary="We propose a new simple network architecture...", + published="2017-06-12", + authors=("Vaswani",), + pdf_url="https://arxiv.org/pdf/1706.03762", + categories=("cs.CL",), +) -> MagicMock: + """Build a minimal mock arxiv.Result.""" + result = MagicMock() + result.get_short_id.return_value = short_id + result.title = title + result.summary = summary + result.published.date.return_value = published + result.authors = [MagicMock(name=a) for a in authors] + result.pdf_url = pdf_url + result.categories = list(categories) + return result + + +# --------------------------------------------------------------------------- +# Tool registration +# --------------------------------------------------------------------------- + + +class TestToolRegistration: + def test_all_tools_registered(self): + mcp = _make_mcp() + registered = set(mcp._tool_manager._tools.keys()) + assert "search_papers" in registered + assert "download_paper" in registered + + +# --------------------------------------------------------------------------- +# search_papers +# 
--------------------------------------------------------------------------- + + +class TestSearchPapers: + def setup_method(self): + self.mcp = _make_mcp() + self.search_papers = _get_tool(self.mcp, "search_papers") + + def test_validation_error_missing_params(self): + result = self.search_papers(query="", id_list=None) + assert result["success"] is False + assert "query" in result["error"] or "id_list" in result["error"] + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_search_success(self, mock_client): + mock_client.results.return_value = iter([_make_arxiv_result()]) + + result = self.search_papers(query="attention transformer") + + assert result["success"] is True + assert result["total"] == 1 + paper = result["results"][0] + assert paper["id"] == "1706.03762" + assert paper["title"] == "Attention Is All You Need" + assert paper["pdf_url"] == "https://arxiv.org/pdf/1706.03762" + assert "cs.CL" in paper["categories"] + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_search_success_with_results(self, mock_client): + mock_client.results.return_value = iter( + [_make_arxiv_result(short_id=f"000{i}.0000{i}") for i in range(3)] + ) + result = self.search_papers(query="multi-agent systems", max_results=3) + assert result["success"] is True + assert result["total"] == 3 + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_search_by_id_list(self, mock_client): + mock_client.results.return_value = iter([_make_arxiv_result()]) + + result = self.search_papers(id_list=["1706.03762"]) + + assert result["success"] is True + assert result["id_list"] == ["1706.03762"] + assert result["query"] == "" + + def test_max_results_clamped(self): + """max_results above 100 should be silently capped — confirm no crash.""" + with patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") as mock_client: + mock_client.results.return_value = iter([]) + result = 
self.search_papers(query="test", max_results=9999) + assert result["success"] is True + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_arxiv_error_handling(self, mock_client): + mock_client.results.side_effect = arxiv.ArxivError( + message="arXiv is down", url="", retry=False + ) + result = self.search_papers(query="test") + assert result["success"] is False + assert "arXiv" in result["error"] + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_network_error_handling(self, mock_client): + mock_client.results.side_effect = ConnectionError("unreachable") + result = self.search_papers(query="test") + assert result["success"] is False + assert "unreachable" in result["error"].lower() or "network" in result["error"].lower() + + +# --------------------------------------------------------------------------- +# download_paper +# --------------------------------------------------------------------------- + + +class TestDownloadPaper: + def setup_method(self): + self.mcp = _make_mcp() + self.download_paper = _get_tool(self.mcp, "download_paper") + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool.requests.get") + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_download_success(self, mock_client, mock_get, tmp_path): + mock_client.results.return_value = iter([_make_arxiv_result()]) + + mock_response = MagicMock() + mock_response.raise_for_status.return_value = None + mock_response.headers = {"Content-Type": "application/pdf"} + mock_response.iter_content.return_value = [b"%PDF-1.4 fake content"] + mock_get.return_value = mock_response + + with patch("aden_tools.tools.arxiv_tool.arxiv_tool._TEMP_DIR") as mock_tmp: + mock_tmp.name = str(tmp_path) + result = self.download_paper(paper_id="1706.03762") + + assert result["success"] is True + assert result["paper_id"] == "1706.03762" + assert result["file_path"].endswith(".pdf") + + 
@patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_no_paper_found(self, mock_client): + mock_client.results.return_value = iter([]) + result = self.download_paper(paper_id="0000.00000") + assert result["success"] is False + assert "No paper found" in result["error"] + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_no_pdf_url(self, mock_client): + paper = _make_arxiv_result(pdf_url=None) + mock_client.results.return_value = iter([paper]) + result = self.download_paper(paper_id="1706.03762") + assert result["success"] is False + assert "PDF URL not available" in result["error"] + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool.requests.get") + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_download_network_error(self, mock_client, mock_get): + import requests + + mock_client.results.return_value = iter([_make_arxiv_result()]) + mock_get.side_effect = requests.RequestException("connection refused") + + result = self.download_paper(paper_id="1706.03762") + + assert result["success"] is False + assert "Failed during download" in result["error"] + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool.requests.get") + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_download_invalid_content_type(self, mock_client, mock_get): + mock_client.results.return_value = iter([_make_arxiv_result()]) + + mock_response = MagicMock() + mock_response.raise_for_status.return_value = None + mock_response.headers = {"Content-Type": "text/html"} + mock_get.return_value = mock_response + + result = self.download_paper(paper_id="1706.03762") + + assert result["success"] is False + assert "Failed during download" in result["error"] + + @patch("aden_tools.tools.arxiv_tool.arxiv_tool.requests.get") + @patch("aden_tools.tools.arxiv_tool.arxiv_tool._SHARED_ARXIV_CLIENT") + def test_file_cleanup_on_error(self, mock_client, mock_get, tmp_path): + """Partial file must be 
deleted when the download fails mid-write.""" + import requests + + mock_client.results.return_value = iter([_make_arxiv_result()]) + + mock_response = MagicMock() + mock_response.raise_for_status.return_value = None + mock_response.headers = {"Content-Type": "application/pdf"} + mock_response.iter_content.side_effect = requests.RequestException("dropped") + mock_get.return_value = mock_response + + with patch("aden_tools.tools.arxiv_tool.arxiv_tool._TEMP_DIR") as mock_tmp: + mock_tmp.name = str(tmp_path) + result = self.download_paper(paper_id="1706.03762") + + assert result["success"] is False + # No leftover partial files + assert list(tmp_path.iterdir()) == [] diff --git a/uv.lock b/uv.lock index 99a60ea6..d84881e4 100644 --- a/uv.lock +++ b/uv.lock @@ -184,6 +184,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/0e/27be9fdef66e72d64c0cdc3cc2823101b80585f8119b5c112c2e8f5f7dab/anyio-4.12.1-py3-none-any.whl", hash = "sha256:d405828884fc140aa80a3c667b8beed277f1dfedec42ba031bd6ac3db606ab6c", size = 113592, upload-time = "2026-01-06T11:45:19.497Z" }, ] +[[package]] +name = "arxiv" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "feedparser" }, + { name = "requests" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8d/aa/dc1c6c633f63fce090e7c067af8c528a5e61218a61c266ff615d46cbde0a/arxiv-2.4.0.tar.gz", hash = "sha256:cabe5470d031aa3f22d2744a7600391c62c3489653f0c62bec9019e62bb0554b", size = 74546, upload-time = "2026-01-05T02:43:16.823Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/11/63/9e71153b2d48c98f8079c90d7211bc65515cc1ad18c3328c3c0472e68f44/arxiv-2.4.0-py3-none-any.whl", hash = "sha256:c02ccb09a777aaadd75d3bc1d2627894ef9c987c651d0dacd864b9f69fb0569f", size = 12065, upload-time = "2026-01-05T02:43:12.542Z" }, +] + [[package]] name = "async-timeout" version = "5.0.1" @@ -755,6 +768,18 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/1f/f6/da4db31001e854025ffd26bc9ba0740a9cbba2c3259695f7c5834908b336/fastuuid-0.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:df61342889d0f5e7a32f7284e55ef95103f2110fee433c2ae7c2c0956d76ac8a", size = 156457, upload-time = "2025-10-19T22:33:44.579Z" }, ] +[[package]] +name = "feedparser" +version = "6.0.12" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sgmllib3k" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/dc/79/db7edb5e77d6dfbc54d7d9df72828be4318275b2e580549ff45a962f6461/feedparser-6.0.12.tar.gz", hash = "sha256:64f76ce90ae3e8ef5d1ede0f8d3b50ce26bcce71dd8ae5e82b1cd2d4a5f94228", size = 286579, upload-time = "2025-09-10T13:33:59.486Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/eb/c96d64137e29ae17d83ad2552470bafe3a7a915e85434d9942077d7fd011/feedparser-6.0.12-py3-none-any.whl", hash = "sha256:6bbff10f5a52662c00a2e3f86a38928c37c48f77b3c511aedcd51de933549324", size = 81480, upload-time = "2025-09-10T13:33:58.022Z" }, +] + [[package]] name = "filelock" version = "3.20.3" @@ -3198,6 +3223,12 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/46/f5af3402b579fd5e11573ce652019a67074317e18c1935cc0b4ba9b35552/secretstorage-3.5.0-py3-none-any.whl", hash = "sha256:0ce65888c0725fcb2c5bc0fdb8e5438eece02c523557ea40ce0703c266248137", size = 15554, upload-time = "2025-11-23T19:02:51.545Z" }, ] +[[package]] +name = "sgmllib3k" +version = "1.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/bd/3704a8c3e0942d711c1299ebf7b9091930adae6675d7c8f476a7ce48653c/sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9", size = 5750, upload-time = "2010-08-24T14:33:52.445Z" } + [[package]] name = "shellingham" version = "1.5.4" @@ -3384,6 +3415,7 @@ name = "tools" version = "0.1.0" source = { editable = "tools" } dependencies = [ + { name = "arxiv" 
}, { name = "beautifulsoup4" }, { name = "diff-match-patch" }, { name = "dnspython" }, @@ -3398,6 +3430,7 @@ dependencies = [ { name = "pydantic" }, { name = "pypdf" }, { name = "python-dotenv" }, + { name = "requests" }, { name = "resend" }, { name = "stripe" }, ] @@ -3441,6 +3474,7 @@ dev = [ [package.metadata] requires-dist = [ + { name = "arxiv", specifier = ">=2.1.0" }, { name = "beautifulsoup4", specifier = ">=4.12.0" }, { name = "diff-match-patch", specifier = ">=20230430" }, { name = "dnspython", specifier = ">=2.4.0" }, @@ -3467,6 +3501,7 @@ requires-dist = [ { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.21.0" }, { name = "python-dotenv", specifier = ">=1.0.0" }, + { name = "requests", specifier = ">=2.31.0" }, { name = "resend", specifier = ">=2.0.0" }, { name = "restrictedpython", marker = "extra == 'all'", specifier = ">=7.0" }, { name = "restrictedpython", marker = "extra == 'sandbox'", specifier = ">=7.0" },