feat(vision): add GCP Vision API integration (#4231)

* feat(vision): add GCP Vision API integration

* refactor(vision): move GCP Vision credentials to dedicated folder

* fix: clean up credentials imports and updated gitignore

* followed ruff alphabetic order for credentials
This commit is contained in:
T.Trinath Reddy
2026-02-13 11:30:15 +05:30
committed by GitHub
parent 988922304f
commit 0deeb87c63
8 changed files with 1302 additions and 1 deletions
+3 -1
View File
@@ -74,4 +74,6 @@ exports/*
docs/github-issues/*
core/tests/*dumps/*
screenshots/*
screenshots/*
@@ -54,6 +54,7 @@ from .apollo import APOLLO_CREDENTIALS
from .base import CredentialError, CredentialSpec
from .browser import get_aden_auth_url, get_aden_setup_url, open_browser
from .email import EMAIL_CREDENTIALS
from .gcp_vision import GCP_VISION_CREDENTIALS
from .github import GITHUB_CREDENTIALS
from .health_check import HealthCheckResult, check_credential_health
from .hubspot import HUBSPOT_CREDENTIALS
@@ -74,6 +75,7 @@ CREDENTIAL_SPECS = {
**LLM_CREDENTIALS,
**SEARCH_CREDENTIALS,
**EMAIL_CREDENTIALS,
**GCP_VISION_CREDENTIALS,
**APOLLO_CREDENTIALS,
**GITHUB_CREDENTIALS,
**HUBSPOT_CREDENTIALS,
@@ -106,6 +108,7 @@ __all__ = [
"LLM_CREDENTIALS",
"SEARCH_CREDENTIALS",
"EMAIL_CREDENTIALS",
"GCP_VISION_CREDENTIALS",
"GITHUB_CREDENTIALS",
"HUBSPOT_CREDENTIALS",
"SLACK_CREDENTIALS",
@@ -0,0 +1,46 @@
"""
GCP Vision tool credentials.
Contains credentials for Google Cloud Vision API integration.
"""
from .base import CredentialSpec
# Names of every MCP tool that is gated on the Vision API key.
_VISION_TOOL_NAMES = [
    "vision_detect_labels",
    "vision_detect_text",
    "vision_detect_faces",
    "vision_localize_objects",
    "vision_detect_logos",
    "vision_detect_landmarks",
    "vision_image_properties",
    "vision_web_detection",
    "vision_safe_search",
]

# Step-by-step text shown to users who need to create an API key by hand.
_VISION_API_KEY_INSTRUCTIONS = """To get a Google Cloud Vision API key:
1. Go to Google Cloud Console (console.cloud.google.com)
2. Create a new project or select existing
3. Go to APIs & Services > Library
4. Search for "Cloud Vision API" and enable it
5. Go to APIs & Services > Credentials
6. Click "Create Credentials" > "API Key"
7. Copy the API key"""

# Credential registry entry for the Google Cloud Vision tools, keyed by the
# credential name that the tools look up ("google_vision").
GCP_VISION_CREDENTIALS = {
    "google_vision": CredentialSpec(
        env_var="GOOGLE_CLOUD_VISION_API_KEY",
        tools=_VISION_TOOL_NAMES,
        required=True,
        startup_required=False,
        help_url="https://console.cloud.google.com/apis/credentials",
        description="Google Cloud Vision API key for image analysis",
        # Auth method support: direct API key only, no Aden-managed flow.
        aden_supported=False,
        aden_provider_name="",
        direct_api_key_supported=True,
        api_key_instructions=_VISION_API_KEY_INSTRUCTIONS,
        # Health check configuration (no endpoint is configured).
        health_check_endpoint="",
        health_check_method="GET",
        # Credential store mapping
        credential_id="google_vision",
        credential_key="api_key",
    ),
}
+12
View File
@@ -46,6 +46,7 @@ from .pdf_read_tool import register_tools as register_pdf_read
from .runtime_logs_tool import register_tools as register_runtime_logs
from .serpapi_tool import register_tools as register_serpapi
from .slack_tool import register_tools as register_slack
from .vision_tool import register_tools as register_vision
from .web_scrape_tool import register_tools as register_web_scrape
from .web_search_tool import register_tools as register_web_search
@@ -81,6 +82,7 @@ def register_all_tools(
register_apollo(mcp, credentials=credentials)
register_serpapi(mcp, credentials=credentials)
register_slack(mcp, credentials=credentials)
register_vision(mcp, credentials=credentials)
# Register file system toolkits
register_view_file(mcp)
@@ -219,6 +221,16 @@ def register_all_tools(
"slack_kick_user_from_channel",
"slack_delete_file",
"slack_get_team_stats",
# Vision tools
"vision_detect_labels",
"vision_detect_text",
"vision_detect_faces",
"vision_localize_objects",
"vision_detect_logos",
"vision_detect_landmarks",
"vision_image_properties",
"vision_web_detection",
"vision_safe_search",
]
@@ -0,0 +1,149 @@
# Google Cloud Vision Tool
Image analysis tool using Google Cloud Vision API.
## Features
| Tool | Description |
|------|-------------|
| `vision_detect_labels` | Identify objects, scenes, activities |
| `vision_detect_text` | Extract text from images (OCR) |
| `vision_detect_faces` | Detect faces and emotions |
| `vision_localize_objects` | Detect objects with bounding boxes |
| `vision_detect_logos` | Identify brand logos |
| `vision_detect_landmarks` | Identify famous places |
| `vision_image_properties` | Get dominant colors and crop hints |
| `vision_web_detection` | Find similar images online |
| `vision_safe_search` | Detect inappropriate content |
## Setup
### 1. Get API Key
1. Go to [Google Cloud Console](https://console.cloud.google.com)
2. Create a new project or select existing
3. Go to **APIs & Services > Library**
4. Search for "Cloud Vision API" and enable it
5. Go to **APIs & Services > Credentials**
6. Click **Create Credentials > API Key**
7. Copy the API key
### 2. Set Environment Variable
```bash
export GOOGLE_CLOUD_VISION_API_KEY=your_api_key
```
## Usage
### Label Detection
```python
result = vision_detect_labels(
image_source="https://example.com/photo.jpg",
max_labels=5
)
# {"labels": [{"description": "Dog", "score": 0.97}, ...]}
```
### Text Detection (OCR)
```python
result = vision_detect_text(image_source="/path/to/receipt.jpg")
# {"text": "Store: Amazon\nTotal: $49.99", "blocks": [...]}
```
### Face Detection
```python
result = vision_detect_faces(image_source="https://example.com/group.jpg")
# {"faces": [{"joy": "VERY_LIKELY", "anger": "VERY_UNLIKELY", ...}]}
```
### Object Localization
```python
result = vision_localize_objects(image_source="/path/to/image.jpg")
# {"objects": [{"name": "Cat", "score": 0.92, "bounds": [...]}]}
```
### Logo Detection
```python
result = vision_detect_logos(image_source="https://example.com/product.jpg")
# {"logos": [{"description": "Nike", "score": 0.95}]}
```
### Landmark Detection
```python
result = vision_detect_landmarks(image_source="/path/to/travel.jpg")
# {"landmarks": [{"description": "Eiffel Tower", "location": {"latitude": 48.85, "longitude": 2.29}}]}
```
### Image Properties
```python
result = vision_image_properties(image_source="https://example.com/art.jpg")
# {"colors": [{"red": 255, "green": 128, "blue": 0, "score": 0.5}], "crop_hints": [...]}
```
### Web Detection
```python
result = vision_web_detection(image_source="/path/to/image.jpg")
# {"web_entities": [...], "similar_images": [...], "pages_with_image": [...]}
```
### Safe Search
```python
result = vision_safe_search(image_source="https://example.com/upload.jpg")
# {"adult": "VERY_UNLIKELY", "violence": "VERY_UNLIKELY", "racy": "POSSIBLE", ...}
```
## Input Types
| Type | Example |
|------|---------|
| URL | `https://example.com/image.jpg` |
| Local file | `/path/to/image.jpg` |
**Supported formats:** JPEG, PNG, GIF, BMP, WEBP, ICO
**Max file size:** 10MB
## Error Handling
```python
# File not found
{"error": "File not found: /path/to/missing.jpg"}
# File too large
{"error": "File exceeds 10MB limit (12.5MB)"}
# Missing credentials
{"error": "GOOGLE_CLOUD_VISION_API_KEY not configured", "help": "..."}
# API errors
{"error": "Invalid API key"}
{"error": "Rate limit exceeded. Try again later."}
```
## Pricing
- **First 1,000 units/month (per feature):** Free
- **After:** ~$1.50 per 1,000 units for most features (a unit is one feature applied to one image)
See [Cloud Vision Pricing](https://cloud.google.com/vision/pricing) for details.
## Likelihood Values
Face detection and safe search return likelihood values (a field the API omits is reported as `UNKNOWN`):
| Value | Meaning |
|-------|---------|
| `VERY_UNLIKELY` | Very unlikely |
| `UNLIKELY` | Unlikely |
| `POSSIBLE` | Possible |
| `LIKELY` | Likely |
| `VERY_LIKELY` | Very likely |
@@ -0,0 +1,5 @@
"""Google Cloud Vision tool for image analysis."""
from .vision_tool import register_tools
__all__ = ["register_tools"]
@@ -0,0 +1,536 @@
"""
Google Cloud Vision Tool - Image analysis using Google Cloud Vision API.
Supports:
- Label detection (objects, scenes, activities)
- Text detection (OCR)
- Face detection (emotions)
- Object localization (bounding boxes)
- Logo detection
- Landmark detection
- Image properties (colors, crop hints)
- Web detection (similar images)
- Safe search (content moderation)
API Reference: https://cloud.google.com/vision/docs
"""
from __future__ import annotations
import base64
import os
from pathlib import Path
from typing import TYPE_CHECKING, Any
import httpx
from fastmcp import FastMCP
if TYPE_CHECKING:
from aden_tools.credentials import CredentialStoreAdapter
# Vision API v1 images:annotate endpoint; every request in this module is POSTed here.
VISION_API_URL = "https://vision.googleapis.com/v1/images:annotate"
# Upper bound, in bytes, on a local image file that _load_image will read and upload.
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
class _VisionClient:
    """Internal client for the Google Cloud Vision ``images:annotate`` endpoint.

    Every public method submits a single image and returns a simplified dict.
    Expected failures (unreadable file, HTTP error, API-reported error) are
    returned as ``{"error": "<message>"}`` instead of being raised.
    """

    def __init__(self, api_key: str):
        self._api_key = api_key

    def _load_image(self, image_source: str) -> dict[str, Any] | dict[str, str]:
        """Turn a URL or local path into the ``image`` part of an API request.

        Args:
            image_source: An ``http(s)://`` URL or a path to a local file.

        Returns:
            ``{"source": {"imageUri": ...}}`` for URLs, ``{"content": <base64>}``
            for local files, or ``{"error": ...}`` if the file is missing, is
            not a regular file, is larger than ``MAX_FILE_SIZE`` or cannot be
            read.
        """
        # URL schemes are case-insensitive, so "HTTPS://..." must not be
        # mistaken for a local path. The URI is forwarded exactly as given.
        if image_source.lower().startswith(("http://", "https://")):
            return {"source": {"imageUri": image_source}}

        file_path = Path(image_source)
        # All filesystem access sits inside one try block: exists()/stat() can
        # fail (e.g. permission denied) just like the read itself.
        try:
            if not file_path.exists():
                return {"error": f"File not found: {image_source}"}
            if not file_path.is_file():
                return {"error": f"Not a file: {image_source}"}

            # Check the size before reading so an oversized file is never loaded.
            file_size = file_path.stat().st_size
            if file_size > MAX_FILE_SIZE:
                size_mb = file_size / (1024 * 1024)
                return {"error": f"File exceeds 10MB limit ({size_mb:.1f}MB)"}

            content = file_path.read_bytes()
        except (OSError, ValueError) as e:
            return {"error": f"Failed to read file: {e}"}

        return {"content": base64.b64encode(content).decode("utf-8")}

    def _call_api(
        self, image_data: dict[str, Any], features: list[dict[str, Any]]
    ) -> dict[str, Any]:
        """POST one annotate request and return the parsed result or an error dict."""
        try:
            response = httpx.post(
                VISION_API_URL,
                params={"key": self._api_key},
                json={"requests": [{"image": image_data, "features": features}]},
                timeout=30.0,
            )
        except httpx.TimeoutException:
            return {"error": "Request timed out"}
        except httpx.RequestError as e:
            return {"error": f"Network error: {e}"}
        return self._handle_response(response)

    def _handle_response(self, response: httpx.Response) -> dict[str, Any]:
        """Map an HTTP response to the first annotate result or an error dict.

        Only ``status_code`` is read for non-200 responses; the body is parsed
        only on HTTP 200.
        """
        known_errors = {
            400: "Invalid request. Check image format and size.",
            401: "Invalid API key",
            403: "API key not authorized. Enable Vision API in Google Cloud Console.",
            429: "Rate limit exceeded. Try again later.",
        }
        status = response.status_code
        if status in known_errors:
            return {"error": known_errors[status]}
        if status != 200:
            return {"error": f"Vision API error (HTTP {status})"}

        # A 200 with a non-JSON body (e.g. from an intermediary proxy) must
        # become an error dict, not an unhandled exception in the tool.
        try:
            data = response.json()
        except ValueError:
            return {"error": "Invalid JSON in API response"}

        responses = data.get("responses", []) if isinstance(data, dict) else []
        if not responses:
            return {"error": "Empty response from API"}

        result = responses[0]
        if "error" in result:
            # Per-image failures are reported inside the 200 payload.
            api_error = result["error"]
            if isinstance(api_error, dict):
                return {"error": api_error.get("message", "Unknown API error")}
            return {"error": "Unknown API error"}
        return result

    def _annotate(self, image_source: str, features: list[dict[str, Any]]) -> dict[str, Any]:
        """Load the image and request ``features``; return the result or an error dict."""
        image_data = self._load_image(image_source)
        if "error" in image_data:
            return image_data
        return self._call_api(image_data, features)

    def detect_labels(self, image_source: str, max_results: int = 10) -> dict[str, Any]:
        """Detect labels; returns ``{"labels": [{"description", "score"}, ...]}``."""
        result = self._annotate(
            image_source, [{"type": "LABEL_DETECTION", "maxResults": max_results}]
        )
        if "error" in result:
            return result

        # .get with defaults, like every other feature method, so a partial
        # annotation cannot raise KeyError.
        labels = [
            {
                "description": label.get("description", ""),
                "score": round(label.get("score", 0), 3),
            }
            for label in result.get("labelAnnotations", [])
        ]
        return {"labels": labels}

    def detect_text(self, image_source: str) -> dict[str, Any]:
        """Run OCR; returns ``{"text": <full text>, "blocks": [{"text", "bounds"}, ...]}``."""
        result = self._annotate(image_source, [{"type": "TEXT_DETECTION"}])
        if "error" in result:
            return result

        annotations = result.get("textAnnotations", [])
        if not annotations:
            return {"text": "", "blocks": []}

        # The first annotation carries the full text; the rest are the pieces.
        full_text = annotations[0].get("description", "")
        blocks = [
            {
                "text": ann.get("description", ""),
                "bounds": ann.get("boundingPoly", {}).get("vertices", []),
            }
            for ann in annotations[1:]
        ]
        return {"text": full_text, "blocks": blocks}

    def detect_faces(self, image_source: str, max_results: int = 10) -> dict[str, Any]:
        """Detect faces; returns ``{"faces": [...]}`` with emotion likelihoods and bounds."""
        result = self._annotate(
            image_source, [{"type": "FACE_DETECTION", "maxResults": max_results}]
        )
        if "error" in result:
            return result

        faces = [
            {
                "joy": face.get("joyLikelihood", "UNKNOWN"),
                "sorrow": face.get("sorrowLikelihood", "UNKNOWN"),
                "anger": face.get("angerLikelihood", "UNKNOWN"),
                "surprise": face.get("surpriseLikelihood", "UNKNOWN"),
                "confidence": round(face.get("detectionConfidence", 0), 3),
                "bounds": face.get("boundingPoly", {}).get("vertices", []),
            }
            for face in result.get("faceAnnotations", [])
        ]
        return {"faces": faces}

    def localize_objects(self, image_source: str, max_results: int = 10) -> dict[str, Any]:
        """Locate objects; returns ``{"objects": [{"name", "score", "bounds"}, ...]}``."""
        result = self._annotate(
            image_source, [{"type": "OBJECT_LOCALIZATION", "maxResults": max_results}]
        )
        if "error" in result:
            return result

        objects = [
            {
                "name": obj.get("name", ""),
                "score": round(obj.get("score", 0), 3),
                "bounds": obj.get("boundingPoly", {}).get("normalizedVertices", []),
            }
            for obj in result.get("localizedObjectAnnotations", [])
        ]
        return {"objects": objects}

    def detect_logos(self, image_source: str, max_results: int = 5) -> dict[str, Any]:
        """Detect logos; returns ``{"logos": [{"description", "score"}, ...]}``."""
        result = self._annotate(
            image_source, [{"type": "LOGO_DETECTION", "maxResults": max_results}]
        )
        if "error" in result:
            return result

        logos = [
            {
                "description": logo.get("description", ""),
                "score": round(logo.get("score", 0), 3),
            }
            for logo in result.get("logoAnnotations", [])
        ]
        return {"logos": logos}

    def detect_landmarks(self, image_source: str, max_results: int = 5) -> dict[str, Any]:
        """Detect landmarks; returns ``{"landmarks": [{"description", "score", "location"}]}``.

        ``location`` is ``{}`` when the annotation has no locations, otherwise
        the latitude/longitude of the first one.
        """
        result = self._annotate(
            image_source, [{"type": "LANDMARK_DETECTION", "maxResults": max_results}]
        )
        if "error" in result:
            return result

        landmarks = []
        for lm in result.get("landmarkAnnotations", []):
            location = {}
            locations = lm.get("locations", [])
            if locations:
                lat_lng = locations[0].get("latLng", {})
                location = {
                    "latitude": lat_lng.get("latitude"),
                    "longitude": lat_lng.get("longitude"),
                }
            landmarks.append(
                {
                    "description": lm.get("description", ""),
                    "score": round(lm.get("score", 0), 3),
                    "location": location,
                }
            )
        return {"landmarks": landmarks}

    def get_image_properties(self, image_source: str) -> dict[str, Any]:
        """Return up to five dominant colors and all crop hints for the image."""
        result = self._annotate(
            image_source,
            [{"type": "IMAGE_PROPERTIES"}, {"type": "CROP_HINTS"}],
        )
        if "error" in result:
            return result

        color_info = result.get("imagePropertiesAnnotation", {})
        dominant_colors = color_info.get("dominantColors", {}).get("colors", [])
        colors = []
        for color in dominant_colors[:5]:
            rgb = color.get("color", {})
            colors.append(
                {
                    "red": int(rgb.get("red", 0)),
                    "green": int(rgb.get("green", 0)),
                    "blue": int(rgb.get("blue", 0)),
                    "score": round(color.get("score", 0), 3),
                    "pixel_fraction": round(color.get("pixelFraction", 0), 3),
                }
            )

        hints_annotation = result.get("cropHintsAnnotation", {})
        crop_hints = [
            {
                "bounds": hint.get("boundingPoly", {}).get("vertices", []),
                "confidence": round(hint.get("confidence", 0), 3),
            }
            for hint in hints_annotation.get("cropHints", [])
        ]
        return {"colors": colors, "crop_hints": crop_hints}

    def web_detection(self, image_source: str) -> dict[str, Any]:
        """Return up to 10 web entities, 5 similar image URLs and 5 matching pages."""
        result = self._annotate(image_source, [{"type": "WEB_DETECTION"}])
        if "error" in result:
            return result

        web = result.get("webDetection", {})
        web_entities = [
            {
                "description": entity.get("description", ""),
                "score": round(entity.get("score", 0), 3),
            }
            for entity in web.get("webEntities", [])[:10]
        ]
        similar_images = [img.get("url", "") for img in web.get("visuallySimilarImages", [])[:5]]
        pages_with_image = [
            {"url": page.get("url", ""), "title": page.get("pageTitle", "")}
            for page in web.get("pagesWithMatchingImages", [])[:5]
        ]
        return {
            "web_entities": web_entities,
            "similar_images": similar_images,
            "pages_with_image": pages_with_image,
        }

    def safe_search(self, image_source: str) -> dict[str, Any]:
        """Return likelihood ratings for adult, spoof, medical, violence and racy content."""
        result = self._annotate(image_source, [{"type": "SAFE_SEARCH_DETECTION"}])
        if "error" in result:
            return result

        safe = result.get("safeSearchAnnotation", {})
        return {
            "adult": safe.get("adult", "UNKNOWN"),
            "spoof": safe.get("spoof", "UNKNOWN"),
            "medical": safe.get("medical", "UNKNOWN"),
            "violence": safe.get("violence", "UNKNOWN"),
            "racy": safe.get("racy", "UNKNOWN"),
        }
def register_tools(
    mcp: FastMCP,
    credentials: CredentialStoreAdapter | None = None,
) -> None:
    """Register the nine ``vision_*`` tools on ``mcp``.

    The API key is looked up on every tool call: from ``credentials`` when an
    adapter is supplied, otherwise from ``GOOGLE_CLOUD_VISION_API_KEY``.
    """

    def _get_api_key() -> str | None:
        """Return the API key from the adapter if one was given, else from the environment."""
        if credentials is None:
            return os.getenv("GOOGLE_CLOUD_VISION_API_KEY")
        return credentials.get("google_vision")

    def _get_client() -> _VisionClient | dict[str, str]:
        """Build a client, or return an error dict when no key is configured."""
        key = _get_api_key()
        if key:
            return _VisionClient(key)
        return {
            "error": "GOOGLE_CLOUD_VISION_API_KEY not configured",
            "help": "Get an API key at https://console.cloud.google.com/apis/credentials",
        }

    def _clamp(requested: int, ceiling: int) -> int:
        """Force ``requested`` into the range 1..ceiling."""
        return min(max(1, requested), ceiling)

    @mcp.tool()
    def vision_detect_labels(
        image_source: str,
        max_labels: int = 10,
    ) -> dict:
        """
        Detect labels (objects, scenes, activities) in an image.

        Args:
            image_source: URL or local file path to the image
            max_labels: Maximum number of labels to return (1-100, default 10)

        Returns:
            Dict with labels and confidence scores, or error dict
        """
        vision = _get_client()
        return (
            vision
            if isinstance(vision, dict)
            else vision.detect_labels(image_source, _clamp(max_labels, 100))
        )

    @mcp.tool()
    def vision_detect_text(image_source: str) -> dict:
        """
        Extract text from an image (OCR).

        Args:
            image_source: URL or local file path to the image

        Returns:
            Dict with extracted text and text blocks with positions, or error dict
        """
        vision = _get_client()
        return vision if isinstance(vision, dict) else vision.detect_text(image_source)

    @mcp.tool()
    def vision_detect_faces(
        image_source: str,
        max_faces: int = 10,
    ) -> dict:
        """
        Detect faces and emotions in an image.

        Args:
            image_source: URL or local file path to the image
            max_faces: Maximum number of faces to detect (1-100, default 10)

        Returns:
            Dict with faces including emotions (joy, sorrow, anger, surprise), or error dict
        """
        vision = _get_client()
        return (
            vision
            if isinstance(vision, dict)
            else vision.detect_faces(image_source, _clamp(max_faces, 100))
        )

    @mcp.tool()
    def vision_localize_objects(
        image_source: str,
        max_objects: int = 10,
    ) -> dict:
        """
        Detect objects with bounding box coordinates in an image.

        Args:
            image_source: URL or local file path to the image
            max_objects: Maximum number of objects to detect (1-100, default 10)

        Returns:
            Dict with objects including names, scores, and normalized bounding boxes, or error dict
        """
        vision = _get_client()
        return (
            vision
            if isinstance(vision, dict)
            else vision.localize_objects(image_source, _clamp(max_objects, 100))
        )

    @mcp.tool()
    def vision_detect_logos(
        image_source: str,
        max_logos: int = 5,
    ) -> dict:
        """
        Detect brand logos in an image.

        Args:
            image_source: URL or local file path to the image
            max_logos: Maximum number of logos to detect (1-20, default 5)

        Returns:
            Dict with detected logos and confidence scores, or error dict
        """
        vision = _get_client()
        return (
            vision
            if isinstance(vision, dict)
            else vision.detect_logos(image_source, _clamp(max_logos, 20))
        )

    @mcp.tool()
    def vision_detect_landmarks(
        image_source: str,
        max_landmarks: int = 5,
    ) -> dict:
        """
        Detect famous landmarks in an image.

        Args:
            image_source: URL or local file path to the image
            max_landmarks: Maximum number of landmarks to detect (1-20, default 5)

        Returns:
            Dict with landmarks including names, scores, and GPS coordinates, or error dict
        """
        vision = _get_client()
        return (
            vision
            if isinstance(vision, dict)
            else vision.detect_landmarks(image_source, _clamp(max_landmarks, 20))
        )

    @mcp.tool()
    def vision_image_properties(image_source: str) -> dict:
        """
        Get image properties including dominant colors and crop hints.

        Args:
            image_source: URL or local file path to the image

        Returns:
            Dict with dominant colors (RGB, score) and crop hints, or error dict
        """
        vision = _get_client()
        return vision if isinstance(vision, dict) else vision.get_image_properties(image_source)

    @mcp.tool()
    def vision_web_detection(image_source: str) -> dict:
        """
        Find similar images and web references for an image.

        Args:
            image_source: URL or local file path to the image

        Returns:
            Dict with web entities, similar images, and pages containing the image
        """
        vision = _get_client()
        return vision if isinstance(vision, dict) else vision.web_detection(image_source)

    @mcp.tool()
    def vision_safe_search(image_source: str) -> dict:
        """
        Detect inappropriate content in an image.

        Checks for: adult, spoof, medical, violence, racy content.
        Each category returns a likelihood: VERY_UNLIKELY, UNLIKELY, POSSIBLE, LIKELY, VERY_LIKELY.

        Args:
            image_source: URL or local file path to the image

        Returns:
            Dict with likelihood ratings for each category, or error dict
        """
        vision = _get_client()
        return vision if isinstance(vision, dict) else vision.safe_search(image_source)
+548
View File
@@ -0,0 +1,548 @@
"""Tests for Google Cloud Vision tool."""
import base64
import os
from pathlib import Path
from unittest.mock import patch
import httpx
import pytest
from fastmcp import FastMCP
from aden_tools.tools.vision_tool import register_tools
@pytest.fixture
def mcp() -> FastMCP:
    """Provide a new FastMCP server per test so registrations never leak between tests."""
    server = FastMCP("test-server")
    return server
@pytest.fixture
def sample_image(tmp_path: Path) -> Path:
    """Write a minimal 1x1 PNG into the test's temp directory and return its path."""
    one_pixel_png_b64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
    )
    target = tmp_path / "test.png"
    target.write_bytes(base64.b64decode(one_pixel_png_b64))
    return target
@pytest.fixture
def large_file(tmp_path: Path) -> Path:
    """Create an 11MB file, which is over the tool's 10MB upload limit."""
    eleven_mb = 11 * 1024 * 1024
    oversized = tmp_path / "large.png"
    oversized.write_bytes(b"x" * eleven_mb)
    return oversized
# --- Credential Tests ---
def test_missing_credentials(mcp: FastMCP):
    """With no adapter and an empty environment the tool reports the missing key."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn

    with patch.dict(os.environ, {}, clear=True):
        outcome = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in outcome
    assert "help" in outcome
    assert "GOOGLE_CLOUD_VISION_API_KEY" in outcome["error"]
def test_credentials_from_env(mcp: FastMCP):
    """The key set in the environment is the one sent with the API request."""
    register_tools(mcp, credentials=None)
    tool_fn = mcp._tool_manager._tools["vision_detect_labels"].fn
    mock_response = {"responses": [{"labelAnnotations": []}]}

    with patch.dict(os.environ, {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}):
        with patch("httpx.post") as mock_post:
            mock_post.return_value = httpx.Response(200, json=mock_response)
            result = tool_fn(image_source="https://example.com/image.jpg")

    assert "labels" in result
    # A successful result alone does not prove the env value was used; check
    # that the request actually carried it.
    mock_post.assert_called_once()
    assert mock_post.call_args.kwargs["params"] == {"key": "test-api-key"}
# --- Image Loading Tests ---
def test_file_not_found(mcp: FastMCP):
    """A local path that does not exist yields a "File not found" error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env):
        outcome = detect_labels(image_source="/nonexistent/path/image.jpg")

    assert "error" in outcome
    assert "File not found" in outcome["error"]
def test_file_too_large(mcp: FastMCP, large_file: Path):
    """A local file above the size limit is rejected with a message naming 10MB."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env):
        outcome = detect_labels(image_source=str(large_file))

    assert "error" in outcome
    assert "10MB" in outcome["error"]
def test_directory_not_file(mcp: FastMCP, tmp_path: Path):
    """Passing a directory path yields a "Not a file" error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env):
        outcome = detect_labels(image_source=str(tmp_path))

    assert "error" in outcome
    assert "Not a file" in outcome["error"]
# --- API Response Tests ---
def test_detect_labels_success(mcp: FastMCP):
    """Labels are returned in order and ``max_labels`` is forwarded as ``maxResults``."""
    register_tools(mcp, credentials=None)
    tool_fn = mcp._tool_manager._tools["vision_detect_labels"].fn
    mock_response = {
        "responses": [
            {
                "labelAnnotations": [
                    {"description": "Dog", "score": 0.97},
                    {"description": "Animal", "score": 0.95},
                ]
            }
        ]
    }

    with patch.dict(os.environ, {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}):
        with patch("httpx.post") as mock_post:
            mock_post.return_value = httpx.Response(200, json=mock_response)
            result = tool_fn(image_source="https://example.com/dog.jpg", max_labels=5)

    assert "labels" in result
    assert len(result["labels"]) == 2
    assert result["labels"][0]["description"] == "Dog"
    assert result["labels"][0]["score"] == 0.97

    # max_labels=5 was passed in; make sure it reached the request instead of
    # being silently dropped or replaced by the default.
    sent_request = mock_post.call_args.kwargs["json"]["requests"][0]
    assert sent_request["features"] == [{"type": "LABEL_DETECTION", "maxResults": 5}]
    assert sent_request["image"] == {"source": {"imageUri": "https://example.com/dog.jpg"}}
def test_detect_text_success(mcp: FastMCP):
    """OCR: the first annotation becomes the full text, the rest become blocks."""
    register_tools(mcp, credentials=None)
    detect_text = mcp._tool_manager._tools["vision_detect_text"].fn

    word_annotations = [
        {"description": "Hello", "boundingPoly": {"vertices": [{"x": 0, "y": 0}]}},
        {"description": "World", "boundingPoly": {"vertices": [{"x": 50, "y": 0}]}},
    ]
    full_text_annotation = {"description": "Hello World\nLine 2"}
    payload = {"responses": [{"textAnnotations": [full_text_annotation, *word_annotations]}]}
    reply = httpx.Response(200, json=payload)
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = detect_text(image_source="https://example.com/text.jpg")

    assert "text" in outcome
    assert outcome["text"] == "Hello World\nLine 2"
    assert "blocks" in outcome
    assert len(outcome["blocks"]) == 2
def test_detect_faces_success(mcp: FastMCP):
    """Face annotations are mapped to emotion likelihoods and a confidence value."""
    register_tools(mcp, credentials=None)
    detect_faces = mcp._tool_manager._tools["vision_detect_faces"].fn

    face_annotation = {
        "joyLikelihood": "VERY_LIKELY",
        "sorrowLikelihood": "VERY_UNLIKELY",
        "angerLikelihood": "VERY_UNLIKELY",
        "surpriseLikelihood": "UNLIKELY",
        "detectionConfidence": 0.98,
        "boundingPoly": {"vertices": [{"x": 10, "y": 10}]},
    }
    reply = httpx.Response(200, json={"responses": [{"faceAnnotations": [face_annotation]}]})
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = detect_faces(image_source="https://example.com/face.jpg")

    assert "faces" in outcome
    assert len(outcome["faces"]) == 1
    only_face = outcome["faces"][0]
    assert only_face["joy"] == "VERY_LIKELY"
    assert only_face["confidence"] == 0.98
def test_localize_objects_success(mcp: FastMCP):
    """Localized object annotations come back under the "objects" key."""
    register_tools(mcp, credentials=None)
    localize = mcp._tool_manager._tools["vision_localize_objects"].fn

    cat_annotation = {
        "name": "Cat",
        "score": 0.92,
        "boundingPoly": {
            "normalizedVertices": [
                {"x": 0.1, "y": 0.2},
                {"x": 0.9, "y": 0.8},
            ]
        },
    }
    payload = {"responses": [{"localizedObjectAnnotations": [cat_annotation]}]}
    reply = httpx.Response(200, json=payload)
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = localize(image_source="https://example.com/cat.jpg")

    assert "objects" in outcome
    assert len(outcome["objects"]) == 1
    assert outcome["objects"][0]["name"] == "Cat"
def test_detect_logos_success(mcp: FastMCP):
    """Logo annotations are returned in API order under the "logos" key."""
    register_tools(mcp, credentials=None)
    detect_logos = mcp._tool_manager._tools["vision_detect_logos"].fn

    logo_annotations = [
        {"description": "Apple", "score": 0.95},
        {"description": "Nike", "score": 0.88},
    ]
    reply = httpx.Response(200, json={"responses": [{"logoAnnotations": logo_annotations}]})
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = detect_logos(image_source="https://example.com/logos.jpg")

    assert "logos" in outcome
    assert len(outcome["logos"]) == 2
    assert outcome["logos"][0]["description"] == "Apple"
def test_detect_landmarks_success(mcp: FastMCP):
    """Landmark annotations include the first location's latitude and longitude."""
    register_tools(mcp, credentials=None)
    detect_landmarks = mcp._tool_manager._tools["vision_detect_landmarks"].fn

    eiffel = {
        "description": "Eiffel Tower",
        "score": 0.96,
        "locations": [{"latLng": {"latitude": 48.8584, "longitude": 2.2945}}],
    }
    reply = httpx.Response(200, json={"responses": [{"landmarkAnnotations": [eiffel]}]})
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = detect_landmarks(image_source="https://example.com/paris.jpg")

    assert "landmarks" in outcome
    assert len(outcome["landmarks"]) == 1
    found = outcome["landmarks"][0]
    assert found["description"] == "Eiffel Tower"
    assert found["location"]["latitude"] == 48.8584
def test_image_properties_success(mcp: FastMCP):
    """Dominant colors and crop hints are both extracted from one response."""
    register_tools(mcp, credentials=None)
    image_properties = mcp._tool_manager._tools["vision_image_properties"].fn

    red_swatch = {
        "color": {"red": 255, "green": 0, "blue": 0},
        "score": 0.5,
        "pixelFraction": 0.3,
    }
    annotation = {
        "imagePropertiesAnnotation": {"dominantColors": {"colors": [red_swatch]}},
        "cropHintsAnnotation": {
            "cropHints": [{"boundingPoly": {"vertices": []}, "confidence": 0.8}]
        },
    }
    reply = httpx.Response(200, json={"responses": [annotation]})
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = image_properties(image_source="https://example.com/colorful.jpg")

    assert "colors" in outcome
    assert len(outcome["colors"]) == 1
    assert outcome["colors"][0]["red"] == 255
    assert "crop_hints" in outcome
def test_web_detection_success(mcp: FastMCP):
    """Web detection returns entities, similar images and matching pages."""
    register_tools(mcp, credentials=None)
    web_detection = mcp._tool_manager._tools["vision_web_detection"].fn

    detection = {
        "webEntities": [{"description": "Sunset", "score": 0.9}],
        "visuallySimilarImages": [{"url": "https://similar.com/1.jpg"}],
        "pagesWithMatchingImages": [{"url": "https://page.com", "pageTitle": "Sunset Photos"}],
    }
    reply = httpx.Response(200, json={"responses": [{"webDetection": detection}]})
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = web_detection(image_source="https://example.com/sunset.jpg")

    for expected_key in ("web_entities", "similar_images", "pages_with_image"):
        assert expected_key in outcome
    assert outcome["web_entities"][0]["description"] == "Sunset"
def test_safe_search_success(mcp: FastMCP):
    """Safe-search likelihoods are passed through per category."""
    register_tools(mcp, credentials=None)
    safe_search = mcp._tool_manager._tools["vision_safe_search"].fn

    ratings = {
        "adult": "VERY_UNLIKELY",
        "spoof": "UNLIKELY",
        "medical": "VERY_UNLIKELY",
        "violence": "VERY_UNLIKELY",
        "racy": "POSSIBLE",
    }
    reply = httpx.Response(200, json={"responses": [{"safeSearchAnnotation": ratings}]})
    env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, env), patch("httpx.post", return_value=reply):
        outcome = safe_search(image_source="https://example.com/photo.jpg")

    assert outcome["adult"] == "VERY_UNLIKELY"
    assert outcome["violence"] == "VERY_UNLIKELY"
    assert outcome["racy"] == "POSSIBLE"
# --- Local File Tests ---
def test_local_file_success(mcp: FastMCP, sample_image: Path):
    """A local file path is accepted and sent as inline image content.

    ``httpx.post`` is patched, and the JSON body passed to it is inspected
    after the call.
    """
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn

    label = {"description": "Image", "score": 0.9}
    api_payload = {"responses": [{"labelAnnotations": [label]}]}
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(200, json=api_payload)
        result = detect_labels(image_source=str(sample_image))

    assert "labels" in result

    # The request body must carry the image under "content".
    sent_body = post_mock.call_args.kwargs["json"]
    first_request = sent_body["requests"][0]
    assert "content" in first_request["image"]
# --- Error Handling Tests ---
def test_api_error_401(mcp: FastMCP):
    """An HTTP 401 reply is reported as an invalid-API-key error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(401)
        result = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in result
    error_text = result["error"]
    assert "Invalid API key" in error_text
def test_api_error_403(mcp: FastMCP):
    """An HTTP 403 reply is reported as a not-authorized error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(403)
        result = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in result
    error_text = result["error"]
    assert "not authorized" in error_text
def test_api_error_429(mcp: FastMCP):
    """An HTTP 429 reply is reported as a rate-limit error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(429)
        result = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in result
    error_text = result["error"]
    assert "Rate limit" in error_text
def test_timeout_error(mcp: FastMCP):
    """A timeout raised by ``httpx.post`` is returned as a timed-out error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        # Make the patched call raise instead of returning a response.
        post_mock.side_effect = httpx.TimeoutException("Timeout")
        result = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in result
    error_text = result["error"]
    assert "timed out" in error_text
def test_network_error(mcp: FastMCP):
    """A request error raised by ``httpx.post`` is returned as a network error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        # Make the patched call raise instead of returning a response.
        post_mock.side_effect = httpx.RequestError("Network error")
        result = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in result
    error_text = result["error"]
    assert "Network error" in error_text
def test_empty_response(mcp: FastMCP):
    """A 200 reply with an empty ``responses`` list yields an empty-response error."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(200, json={"responses": []})
        result = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in result
    error_text = result["error"]
    assert "Empty response" in error_text
def test_api_error_in_response(mcp: FastMCP):
    """An ``error`` object inside a 200 reply has its message passed through."""
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn

    embedded_error = {"error": {"message": "Image too small"}}
    api_payload = {"responses": [embedded_error]}
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(200, json=api_payload)
        result = detect_labels(image_source="https://example.com/image.jpg")

    assert "error" in result
    error_text = result["error"]
    assert "Image too small" in error_text
# --- Parameter Validation Tests ---
def test_max_labels_clamped(mcp: FastMCP):
    """An oversized ``max_labels`` is sent to the API as ``maxResults`` of 100.

    Only the upper bound is exercised here: the tool is called with 200 and
    the JSON body handed to the patched ``httpx.post`` is inspected.
    """
    register_tools(mcp, credentials=None)
    detect_labels = mcp._tool_manager._tools["vision_detect_labels"].fn

    api_payload = {"responses": [{"labelAnnotations": []}]}
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(200, json=api_payload)
        # Request more labels than the 100 expected in the outgoing body.
        detect_labels(image_source="https://example.com/image.jpg", max_labels=200)

    # Walk down to the first feature of the first request in the sent body.
    sent_body = post_mock.call_args.kwargs["json"]
    first_request = sent_body["requests"][0]
    first_feature = first_request["features"][0]
    assert first_feature["maxResults"] == 100
def test_detect_text_no_text_found(mcp: FastMCP):
    """An empty ``textAnnotations`` list yields empty text and no blocks."""
    register_tools(mcp, credentials=None)
    detect_text = mcp._tool_manager._tools["vision_detect_text"].fn

    api_payload = {"responses": [{"textAnnotations": []}]}
    fake_env = {"GOOGLE_CLOUD_VISION_API_KEY": "test-api-key"}

    with patch.dict(os.environ, fake_env), patch("httpx.post") as post_mock:
        post_mock.return_value = httpx.Response(200, json=api_payload)
        result = detect_text(image_source="https://example.com/image.jpg")

    assert result["text"] == ""
    assert result["blocks"] == []