Issamohammed committed on
Commit
f64cacf
·
verified ·
1 Parent(s): 37f7d1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -14
app.py CHANGED
@@ -2,13 +2,13 @@ import os
2
  import torch
3
  import gradio as gr
4
  import logging
 
5
  from pydub import AudioSegment
6
  from pydub.exceptions import CouldntDecodeError
7
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
8
  from pathlib import Path
9
  from tempfile import NamedTemporaryFile
10
  from datetime import timedelta
11
- import time
12
 
13
  # Setup logging
14
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -21,6 +21,16 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
21
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
22
  SUPPORTED_FORMATS = {".wav", ".mp3", ".m4a"}
23
 
 
 
 
 
 
 
 
 
 
 
24
  # Initialize model and pipeline
25
  def initialize_pipeline():
26
  try:
@@ -46,38 +56,43 @@ def initialize_pipeline():
46
  # Convert audio if needed
47
  def convert_to_wav(audio_path: str) -> str:
48
  try:
 
 
49
  ext = str(Path(audio_path).suffix).lower()
50
  if ext not in SUPPORTED_FORMATS:
51
  raise ValueError(f"Unsupported audio format: {ext}. Supported formats: {', '.join(SUPPORTED_FORMATS)}")
52
  if ext != ".wav":
 
53
  audio = AudioSegment.from_file(audio_path)
54
  wav_path = str(Path(audio_path).with_suffix(".converted.wav"))
55
  audio.export(wav_path, format="wav")
 
56
  return wav_path
57
  return audio_path
58
  except CouldntDecodeError:
59
- logger.error(f"Failed to decode audio file: {audio_path}")
60
- raise ValueError("Audio file is corrupted or in an unsupported format.")
61
  except OSError as e:
62
  logger.error(f"OS error during audio conversion: {str(e)}")
63
- raise ValueError("Failed to process audio file due to a system error.")
64
  except Exception as e:
65
- logger.error(f"Unexpected error during audio conversion: {str(e)}")
66
- raise ValueError("An unexpected error occurred while converting the audio.")
67
 
68
  # Split audio into chunks
69
  def split_audio(audio_path: str) -> list:
70
  try:
71
  audio = AudioSegment.from_file(audio_path)
72
  if len(audio) == 0:
73
- raise ValueError("Audio file is empty or invalid.")
 
74
  return [audio[i:i + CHUNK_DURATION_MS] for i in range(0, len(audio), CHUNK_DURATION_MS)]
75
  except CouldntDecodeError:
76
  logger.error(f"Failed to decode audio for splitting: {audio_path}")
77
- raise ValueError("Audio file is corrupted or in an unsupported format.")
78
  except Exception as e:
79
  logger.error(f"Failed to split audio: {str(e)}")
80
- raise ValueError(f"Failed to process audio: {str(e)}")
81
 
82
  # Helper to compute chunk start time
83
  def get_chunk_time(index: int, chunk_duration_ms: int) -> str:
@@ -89,7 +104,7 @@ def transcribe(audio_path: str, include_timestamps: bool = False, progress=gr.Pr
89
  try:
90
  if not audio_path or not os.path.exists(audio_path):
91
  logger.warning("Invalid or missing audio file path.")
92
- return "Please upload a valid audio file.", None
93
 
94
  # Convert to WAV if needed
95
  wav_path = convert_to_wav(audio_path)
@@ -110,7 +125,7 @@ def transcribe(audio_path: str, include_timestamps: bool = False, progress=gr.Pr
110
  result = PIPELINE(temp_file.name,
111
  generate_kwargs={"task": "transcribe", "language": "sv"})
112
  text = result["text"].strip()
113
- if text: # Only append non-empty transcriptions
114
  transcript.append(text)
115
  if include_timestamps:
116
  timestamp = get_chunk_time(i, CHUNK_DURATION_MS)
@@ -168,7 +183,7 @@ def transcribe(audio_path: str, include_timestamps: bool = False, progress=gr.Pr
168
  return str(e), None
169
  except Exception as e:
170
  logger.error(f"Unexpected error during transcription: {str(e)}")
171
- return f"An unexpected error occurred: {str(e)}. Please try again or contact support.", None
172
 
173
  # Initialize pipeline globally
174
  try:
@@ -181,11 +196,11 @@ except RuntimeError as e:
181
  def create_interface():
182
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
183
  gr.Markdown("# Swedish Whisper Transcriber")
184
- gr.Markdown("Upload audio (.wav, .mp3, .m4a) for real-time Swedish speech transcription.")
185
 
186
  with gr.Row():
187
  with gr.Column():
188
- audio_input = gr.Audio(type="filepath", label="Upload Audio")
189
  timestamp_toggle = gr.Checkbox(label="Include Timestamps in Download", value=False)
190
  transcribe_btn = gr.Button("Transcribe")
191
 
@@ -203,6 +218,9 @@ def create_interface():
203
 
204
  if __name__ == "__main__":
205
  try:
 
 
 
206
  create_interface().launch()
207
  except Exception as e:
208
  logger.critical(f"Failed to launch Gradio interface: {str(e)}")
 
2
  import torch
3
  import gradio as gr
4
  import logging
5
+ import subprocess
6
  from pydub import AudioSegment
7
  from pydub.exceptions import CouldntDecodeError
8
  from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
9
  from pathlib import Path
10
  from tempfile import NamedTemporaryFile
11
  from datetime import timedelta
 
12
 
13
  # Setup logging
14
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
21
  TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
22
  SUPPORTED_FORMATS = {".wav", ".mp3", ".m4a"}
23
 
24
+ # Check for ffmpeg availability
25
+ def check_ffmpeg():
26
+ try:
27
+ subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
28
+ logger.info("ffmpeg is installed and accessible.")
29
+ return True
30
+ except (subprocess.CalledProcessError, FileNotFoundError):
31
+ logger.error("ffmpeg is not installed or not found in PATH.")
32
+ return False
33
+
34
  # Initialize model and pipeline
35
  def initialize_pipeline():
36
  try:
 
56
  # Convert audio if needed
57
  def convert_to_wav(audio_path: str) -> str:
58
  try:
59
+ if not check_ffmpeg():
60
+ raise RuntimeError("ffmpeg is required to process .m4a files. Please install ffmpeg and ensure it's in your PATH.")
61
  ext = str(Path(audio_path).suffix).lower()
62
  if ext not in SUPPORTED_FORMATS:
63
  raise ValueError(f"Unsupported audio format: {ext}. Supported formats: {', '.join(SUPPORTED_FORMATS)}")
64
  if ext != ".wav":
65
+ logger.info(f"Converting {ext} file to WAV: {audio_path}")
66
  audio = AudioSegment.from_file(audio_path)
67
  wav_path = str(Path(audio_path).with_suffix(".converted.wav"))
68
  audio.export(wav_path, format="wav")
69
+ logger.info(f"Conversion successful: {wav_path}")
70
  return wav_path
71
  return audio_path
72
  except CouldntDecodeError:
73
+ logger.error(f"Failed to decode .m4a file: {audio_path}")
74
+ raise ValueError("The .m4a file is corrupted or not supported. Ensure it's a valid iPhone recording and ffmpeg is installed.")
75
  except OSError as e:
76
  logger.error(f"OS error during audio conversion: {str(e)}")
77
+ raise ValueError("Failed to process the .m4a file due to a system error. Check file permissions or disk space.")
78
  except Exception as e:
79
+ logger.error(f"Unexpected error during .m4a conversion: {str(e)}")
80
+ raise ValueError(f"An unexpected error occurred while converting the .m4a file: {str(e)}")
81
 
82
  # Split audio into chunks
83
  def split_audio(audio_path: str) -> list:
84
  try:
85
  audio = AudioSegment.from_file(audio_path)
86
  if len(audio) == 0:
87
+ raise ValueError("The .m4a file is empty or invalid.")
88
+ logger.info(f"Splitting audio into {CHUNK_DURATION_MS/1000}-second chunks: {audio_path}")
89
  return [audio[i:i + CHUNK_DURATION_MS] for i in range(0, len(audio), CHUNK_DURATION_MS)]
90
  except CouldntDecodeError:
91
  logger.error(f"Failed to decode audio for splitting: {audio_path}")
92
+ raise ValueError("The .m4a file is corrupted or not supported. Ensure it's a valid iPhone recording.")
93
  except Exception as e:
94
  logger.error(f"Failed to split audio: {str(e)}")
95
+ raise ValueError(f"Failed to process the .m4a file: {str(e)}")
96
 
97
  # Helper to compute chunk start time
98
  def get_chunk_time(index: int, chunk_duration_ms: int) -> str:
 
104
  try:
105
  if not audio_path or not os.path.exists(audio_path):
106
  logger.warning("Invalid or missing audio file path.")
107
+ return "Please upload a valid .m4a file.", None
108
 
109
  # Convert to WAV if needed
110
  wav_path = convert_to_wav(audio_path)
 
125
  result = PIPELINE(temp_file.name,
126
  generate_kwargs={"task": "transcribe", "language": "sv"})
127
  text = result["text"].strip()
128
+ if text:
129
  transcript.append(text)
130
  if include_timestamps:
131
  timestamp = get_chunk_time(i, CHUNK_DURATION_MS)
 
183
  return str(e), None
184
  except Exception as e:
185
  logger.error(f"Unexpected error during transcription: {str(e)}")
186
+ return f"An unexpected error occurred while processing the .m4a file: {str(e)}. Please ensure the file is a valid iPhone recording and try again.", None
187
 
188
  # Initialize pipeline globally
189
  try:
 
196
  def create_interface():
197
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
198
  gr.Markdown("# Swedish Whisper Transcriber")
199
+ gr.Markdown("Upload an .m4a file from your iPhone for real-time Swedish speech transcription.")
200
 
201
  with gr.Row():
202
  with gr.Column():
203
+ audio_input = gr.Audio(type="filepath", label="Upload .m4a Audio")
204
  timestamp_toggle = gr.Checkbox(label="Include Timestamps in Download", value=False)
205
  transcribe_btn = gr.Button("Transcribe")
206
 
 
218
 
219
  if __name__ == "__main__":
220
  try:
221
+ if not check_ffmpeg():
222
+ print("Error: ffmpeg is required to process .m4a files. Please install ffmpeg and ensure it's in your PATH.")
223
+ exit(1)
224
  create_interface().launch()
225
  except Exception as e:
226
  logger.critical(f"Failed to launch Gradio interface: {str(e)}")