feat(tools): add youtube transcript integration via youtube-transcript-api
This commit is contained in:
@@ -28,6 +28,7 @@ dependencies = [
|
||||
"fastmcp>=2.0.0",
|
||||
"diff-match-patch>=20230430",
|
||||
"python-dotenv>=1.0.0",
|
||||
"youtube-transcript-api>=0.6.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -38,6 +38,7 @@ from .file_system_toolkits.write_to_file import register_tools as register_write
|
||||
from .pdf_read_tool import register_tools as register_pdf_read
|
||||
from .web_scrape_tool import register_tools as register_web_scrape
|
||||
from .web_search_tool import register_tools as register_web_search
|
||||
from .youtube_transcript_tool.tool import register as register_youtube
|
||||
|
||||
|
||||
def register_all_tools(
|
||||
@@ -59,6 +60,7 @@ def register_all_tools(
|
||||
register_example(mcp)
|
||||
register_web_scrape(mcp)
|
||||
register_pdf_read(mcp)
|
||||
register_youtube(mcp)
|
||||
|
||||
# Tools that need credentials (pass credentials if provided)
|
||||
# web_search supports multiple providers (Google, Brave) with auto-detection
|
||||
@@ -80,6 +82,7 @@ def register_all_tools(
|
||||
"web_search",
|
||||
"web_scrape",
|
||||
"pdf_read",
|
||||
"get_youtube_transcript",
|
||||
"view_file",
|
||||
"write_to_file",
|
||||
"list_dir",
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
from fastmcp import FastMCP
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
|
||||
import re
|
||||
|
||||
|
||||
def get_youtube_transcript(video_url: str) -> str:
    """
    Fetch the transcript for a YouTube video as a single string.

    Args:
        video_url: The URL of the YouTube video.

    Returns:
        The transcript text with all entries joined by single spaces, or a
        message starting with "Error:" when the URL is not recognised or the
        transcript cannot be retrieved. Failures are reported through the
        return value rather than raised.
    """
    try:
        video_id = extract_video_id(video_url)
        if not video_id:
            return "Error: Invalid YouTube URL"

        # Older youtube-transcript-api releases expose a class-level
        # ``get_transcript`` that returns a list of dicts. Newer releases
        # dropped it in favour of ``YouTubeTranscriptApi().fetch``, which
        # yields snippet objects with a ``.text`` attribute. Support both so
        # the tool works across the whole ``>=0.6.0`` dependency range.
        legacy_get = getattr(YouTubeTranscriptApi, "get_transcript", None)
        if legacy_get is not None:
            entries = legacy_get(video_id)
        else:
            entries = YouTubeTranscriptApi().fetch(video_id)

        return " ".join(
            entry["text"] if isinstance(entry, dict) else entry.text
            for entry in entries
        )

    except TranscriptsDisabled:
        return "Error: Transcripts are disabled for this video"
    except NoTranscriptFound:
        return "Error: No transcript available"
    except Exception as e:
        # Tool boundary: surface any other failure (network, parsing, ...)
        # to the caller as text instead of raising.
        return f"Error: {str(e)}"
|
||||
|
||||
|
||||
# A single alternation so that the left-most video reference in the string
# wins. The look-behind rejects look-alike hosts such as "not-youtube.com".
# For watch URLs, ``v`` must be a real query parameter (directly after "?" or
# after "&"), and the first such parameter is preferred.
_VIDEO_ID_RE = re.compile(
    r'(?<![\w-])'
    r'(?:youtu\.be/'
    r'|youtube\.com/(?:embed/|v/|shorts/|live/|watch\?(?:[^#\s]*?&)??v=))'
    r'([a-zA-Z0-9_-]{11})'
)


def extract_video_id(url: str) -> str:
    """
    Extract the 11-character video ID from a YouTube URL.

    Supports:
    - https://www.youtube.com/watch?v=VIDEO_ID (``v`` may appear anywhere
      in the query string, e.g. ``watch?list=...&v=VIDEO_ID``)
    - https://youtu.be/VIDEO_ID
    - https://www.youtube.com/embed/VIDEO_ID
    - https://www.youtube.com/v/VIDEO_ID
    - https://www.youtube.com/shorts/VIDEO_ID
    - https://www.youtube.com/live/VIDEO_ID

    Args:
        url: YouTube video URL.

    Returns:
        The video ID, or an empty string if ``url`` is not a string or no
        supported YouTube URL form is found in it.
    """
    # Guard against None / non-string input instead of letting ``re`` raise.
    if not isinstance(url, str):
        return ""

    match = _VIDEO_ID_RE.search(url)
    return match.group(1) if match else ""
|
||||
|
||||
|
||||
def register(mcp: FastMCP) -> None:
    """Register ``get_youtube_transcript`` as a tool on the given FastMCP instance."""
    as_tool = mcp.tool()
    as_tool(get_youtube_transcript)
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from aden_tools.tools.youtube_transcript_tool.tool import get_youtube_transcript
|
||||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
|
||||
|
||||
|
||||
def test_get_transcript_success():
    """Happy path: transcript entries are joined into one space-separated string."""
    fake_entries = [
        {'text': 'Hello', 'start': 0},
        {'text': 'World', 'start': 1},
    ]

    # create=True lets the patch apply even if the attribute is absent on the class.
    patcher = patch.object(
        YouTubeTranscriptApi,
        'get_transcript',
        return_value=fake_entries,
        create=True,
    )
    with patcher as api_call:
        result = get_youtube_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')

        assert result == "Hello World"
        # The URL must have been reduced to the bare video ID before the API call.
        api_call.assert_called_once_with('dQw4w9WgXcQ')
|
||||
|
||||
|
||||
def test_get_transcript_transcripts_disabled():
    """TranscriptsDisabled is reported as an error string, not raised."""
    failure = TranscriptsDisabled('video_id')

    with patch.object(
        YouTubeTranscriptApi,
        'get_transcript',
        side_effect=failure,
        create=True,
    ):
        result = get_youtube_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')

        assert result.startswith('Error:')
        assert 'disabled' in result.lower()
|
||||
|
||||
|
||||
def test_get_transcript_no_transcript_found():
    """NoTranscriptFound is reported as an error string, not raised."""
    failure = NoTranscriptFound('video_id', [], 'message')

    with patch.object(
        YouTubeTranscriptApi,
        'get_transcript',
        side_effect=failure,
        create=True,
    ):
        result = get_youtube_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')

        assert result.startswith('Error:')
        assert 'No transcript available' in result
|
||||
|
||||
|
||||
|
||||
def test_invalid_url():
    """A URL that is not a YouTube link yields an 'invalid' error string."""
    result = get_youtube_transcript('https://not-youtube.com/video')

    assert result.startswith('Error:')
    # Accept either capitalisation of the word in the message.
    assert any(word in result for word in ('Invalid', 'invalid'))
|
||||
|
||||
|
||||
def test_generic_exception():
    """Unexpected exceptions are returned as an error string carrying their message."""
    with patch.object(
        YouTubeTranscriptApi,
        'get_transcript',
        side_effect=Exception('Some unexpected error'),
        create=True,
    ):
        result = get_youtube_transcript('https://www.youtube.com/watch?v=dQw4w9WgXcQ')

        assert result.startswith('Error:')
        # Checking the message proves the generic handler produced this result;
        # the prefix alone would also match the invalid-URL branch.
        assert 'Some unexpected error' in result
|
||||
Reference in New Issue
Block a user