from io import BytesIO

import pandas as pd
import requests
from langchain_core.tools import tool
from markitdown import MarkItDown

# Upper bound (seconds) for the Excel download so an unresponsive server
# cannot block the calling agent forever.
_REQUEST_TIMEOUT_SECONDS = 30

# NOTE: the docstrings of the @tool functions below are kept verbatim on
# purpose -- LangChain's @tool decorator uses the docstring as the tool
# description, so editing them changes what the model is told.


@tool
def extract_transcript_from_youtube(url: str) -> str:
    """
    Extracts the transcript from a YouTube video given its URL.

    Args:
        url (str): The YouTube video URL.
    Returns:
        transcript (str): The transcript of the video, or an error message if extraction fails.
    """
    transcript_str = "### Transcript"
    md = MarkItDown(enable_plugins=True)

    # Tool boundary: report any conversion failure back to the caller as text
    # instead of raising.
    try:
        result = md.convert(url)
    except Exception as e:
        return f"Failed to extract transcript from YouTube video: {e}"

    text = result.text_content
    # partition() splits on the FIRST marker only, so a transcript that itself
    # contains the marker text again is not truncated.
    _, marker, tail = text.partition(transcript_str)
    if not marker:
        # No transcript section found: return the converted page unchanged.
        return text

    return (transcript_str + "\n" + tail).strip()


@tool
def extract_data_from_excel(url: str) -> str:
    """
    Downloads and extracts data from an Excel file at the given URL.

    Args:
        url (str): The URL of the Excel file.

    Returns:
        str: A string representation of the data in the first sheet of the Excel file.
    """
    # Tool boundary: download, parse and formatting errors are all reported
    # back to the caller as text instead of raising.
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()

        excel_file = BytesIO(response.content)
        df = pd.read_excel(excel_file)

        # Drop the "Unnamed: N" placeholder columns pandas creates for blank
        # header cells. Labels are compared via str() so that non-string
        # headers (e.g. numeric column titles) do not break the filter.
        keep = [not str(col).startswith("Unnamed") for col in df.columns]
        # Explicit copy so the column assignments below never write into a
        # view of the original frame.
        df = df.loc[:, keep].copy()

        # Convert all numeric columns to float
        for col in df.select_dtypes(include=["number"]).columns:
            df[col] = df[col].astype(float)

        return df.to_string(index=False)

    except Exception as e:
        return f"Failed to process Excel file from URL: {e}"


@tool
def extract_transcript_from_audio(url: str) -> str:
    """
    Extracts the transcript from an audio file given its URL.
    Supported formats: mp3, wav.

    Args:
        url (str): The URL of the audio file.
    Returns:
        str: The transcript of the audio file, or an error message if extraction fails.
    """
    md = MarkItDown(enable_plugins=True)

    # Tool boundary: report any conversion failure back to the caller as text
    # instead of raising.
    try:
        result = md.convert(url)
    except Exception as e:
        return f"Failed to extract transcript from audio: {e}"

    return result.text_content