Enhance audio tools
audio_tools.py  CHANGED  (+25 -7)
@@ -11,18 +11,36 @@ class TranscribeAudioTool(Tool):
     name = "transcribe_audio"
     description = "Transcribe an audio file"
     inputs = {
-        "audio": {"type": "
+        "audio": {"type": "any", "description": "The audio file in base64 format or as an AudioSegment object"}
     }
     output_type = "string"

     def setup(self):
         self.model = InferenceClient(model="openai/whisper-large-v3", provider="hf-inference", token=os.getenv("HUGGINGFACE_API_KEY"))

-    def forward(self, audio:
-
-
-
-
+    def forward(self, audio: any) -> str:
+        try:
+            # Handle AudioSegment object
+            if isinstance(audio, AudioSegment):
+                # Convert AudioSegment to base64
+                buffer = BytesIO()
+                audio.export(buffer, format="wav")
+                audio_data = buffer.getvalue()
+            # Handle base64 string
+            elif isinstance(audio, str):
+                audio_data = base64.b64decode(audio)
+            else:
+                raise ValueError(f"Unsupported audio type: {type(audio)}. Expected base64 string or AudioSegment object.")
+
+            # Create audio segment from the data
+            audio_segment = AudioSegment.from_file(BytesIO(audio_data))
+
+            # Transcribe using the model
+            result = self.model.automatic_speech_recognition(audio_segment)
+            return result["text"]
+
+        except Exception as e:
+            raise RuntimeError(f"Error in transcription: {str(e)}")

 transcribe_audio_tool = TranscribeAudioTool()

@@ -31,7 +49,7 @@ def audio_to_base64(file_path: str) -> str:
     """
     Convert an audio file to base64 format
     Args:
-        file_path: Path to the audio file
+        file_path: Path to the audio file (should be in mp3 format)
     Returns:
         The audio file in base64 format
     """
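For reference, a minimal usage sketch (not part of the commit) showing both input types the new forward() accepts. It assumes audio_tools.py is importable as a module, that AudioSegment comes from pydub, that HUGGINGFACE_API_KEY is set in the environment for the InferenceClient created in setup(), and that "sample.mp3" stands in for a real audio file:

import os
from pydub import AudioSegment                       # assumed source of AudioSegment
from audio_tools import audio_to_base64, transcribe_audio_tool

assert os.getenv("HUGGINGFACE_API_KEY"), "token required by setup()"

transcribe_audio_tool.setup()                        # builds the whisper-large-v3 InferenceClient

# Option 1: pass a base64 string produced by the helper in the same file
encoded = audio_to_base64("sample.mp3")              # "sample.mp3" is a placeholder path
print(transcribe_audio_tool.forward(encoded))

# Option 2: pass a pydub AudioSegment, which forward() now exports to WAV internally
segment = AudioSegment.from_file("sample.mp3")
print(transcribe_audio_tool.forward(segment))

Accepting either representation lets a caller that already holds a decoded AudioSegment skip the round trip through base64, at the cost of the looser "any" declaration in inputs.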