File size: 17,250 Bytes
00fa6cd
 
 
 
e0a3394
00fa6cd
f83e1a4
f71888e
e0a3394
f71888e
e0a3394
 
 
 
 
 
f71888e
e0a3394
 
 
 
 
71f837d
831de14
71f837d
 
 
 
 
e0a3394
71f837d
 
 
 
 
 
 
 
 
 
 
c99432d
c602576
 
 
f83e1a4
 
c602576
b8db3af
e60314b
c602576
e60314b
5c1dd7e
e60314b
c602576
7bcfa4d
c602576
 
 
 
7bcfa4d
 
 
 
 
 
 
 
 
16a2be1
 
7bcfa4d
 
 
16a2be1
c602576
16a2be1
c602576
 
16a2be1
c602576
e60314b
 
 
 
00fa6cd
 
 
 
 
 
 
 
 
16a2be1
 
 
 
 
 
 
 
 
 
c99432d
 
 
d285286
 
 
 
 
 
 
 
 
 
 
c99432d
 
 
d285286
 
 
 
 
 
c99432d
e0a3394
 
 
d285286
 
 
014a2de
 
 
 
 
e0a3394
014a2de
 
 
d285286
 
 
014a2de
d285286
 
014a2de
 
 
 
 
d285286
 
e0a3394
014a2de
e0a3394
d285286
 
 
 
 
 
 
014a2de
 
 
 
 
 
 
 
 
 
e0a3394
 
5c1dd7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7c5634
 
 
 
 
 
 
 
 
 
5c1dd7e
00fa6cd
 
5c1dd7e
00fa6cd
e0a3394
 
 
 
 
7bcfa4d
 
f51135e
7bcfa4d
e0a3394
7bcfa4d
 
e0a3394
00fa6cd
7bcfa4d
e0a3394
 
00fa6cd
e0a3394
00fa6cd
5c1dd7e
16a2be1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c99432d
014a2de
e0a3394
 
5c1dd7e
 
 
014a2de
5c1dd7e
 
014a2de
e0a3394
 
014a2de
 
 
 
 
e0a3394
 
 
 
2065f5c
e0a3394
 
 
00fa6cd
c99432d
e0a3394
 
e8353e2
f83e1a4
 
 
b7c5634
f83e1a4
 
 
 
 
 
 
 
 
 
e8353e2
4cd9b29
 
f83e1a4
 
 
 
 
 
 
 
e8353e2
f83e1a4
 
e8353e2
f83e1a4
 
 
 
4cd9b29
 
 
 
f83e1a4
 
 
 
 
4cd9b29
f83e1a4
 
 
 
 
 
4cd9b29
f83e1a4
 
 
 
4cd9b29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c99432d
4cd9b29
c99432d
71f837d
c99432d
 
 
 
 
 
 
00fa6cd
cf68d9c
 
 
2a547a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf68d9c
2a547a7
 
 
cf68d9c
 
 
00fa6cd
cf68d9c
 
 
e0a3394
f9fdf42
e0a3394
 
 
 
 
 
 
 
5c1dd7e
e0a3394
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
import os
import streamlit as st
import tempfile
import base64
import time
from datetime import datetime
from hf_transcriber import HFTranscriber
from huggingface_hub import login
from dotenv import load_dotenv, find_dotenv

# Set page config first — Streamlit requires this to be the first st.* call.
st.set_page_config(
    page_title="🎡 Audio to Sheet Music Transcriber",
    page_icon="🎡",
    layout="wide"
)

# Load environment variables from a .env file, if one is present
env_path = find_dotenv()
if env_path:
    load_dotenv(env_path)


def _find_hf_token():
    """Return the first Hugging Face token found in environment variables or
    Streamlit secrets, or None if no token is configured."""
    token = os.environ.get('HUGGINGFACE_TOKEN') or os.environ.get('HF_TOKEN')
    if token:
        return token
    # BUG FIX: the original called st.secrets.has_key(), which does not exist
    # (Python 3 mappings have no has_key method). Use the `in` operator, and
    # guard the whole lookup because accessing st.secrets raises when no
    # secrets file is configured at all.
    try:
        for key in ('HUGGINGFACE_TOKEN', 'HF_TOKEN'):
            if key in st.secrets:
                return st.secrets[key]
    except Exception:
        pass
    return None


# Hugging Face authentication
HUGGINGFACE_TOKEN = _find_hf_token()

# Check if we have a valid token (user access tokens start with 'hf_')
if HUGGINGFACE_TOKEN and HUGGINGFACE_TOKEN.startswith('hf_'):
    try:
        login(token=HUGGINGFACE_TOKEN, add_to_git_credential=False)
        st.sidebar.success("βœ… Authenticated with Hugging Face")
    except Exception as e:
        st.sidebar.warning(f"⚠️ Using public models (rate limited): {str(e)}")
        HUGGINGFACE_TOKEN = None  # Reset token if invalid
else:
    st.sidebar.warning("""
    ⚠️ No Hugging Face token found. Using public models (rate limited).
    
    To use your own token:
    1. Get your token from [Hugging Face Settings](https://huggingface.co/settings/tokens)
    2. Add it to your Space's secrets as `HUGGINGFACE_TOKEN`
    """)

# Configuration dictionary to store app settings
app_config = {
    'RECORDING_ENABLED': False,
    'AUDIO_DEVICES': [],
    'DEFAULT_MODEL': 'openai/whisper-small'  # Default model for transcription
}

def init_recording():
    """Probe for audio-recording support and record the result in app_config.

    Populates app_config['AudioRecorder'], app_config['list_audio_devices'],
    app_config['AUDIO_DEVICES'] and app_config['RECORDING_ENABLED'].

    Returns:
        bool: True if the recorder module loaded and device listing succeeded
        (even when no usable input device was found — mirroring the original
        contract); False if the module import or device listing failed.
    """
    try:
        # Try to import recording-related modules; may fail on hosts without
        # an audio backend (e.g. Streamlit Cloud / HF Spaces).
        from recorder import AudioRecorder, list_audio_devices
    except Exception:
        app_config['RECORDING_ENABLED'] = False
        return False

    # Update config with recording components
    app_config['AudioRecorder'] = AudioRecorder
    app_config['list_audio_devices'] = list_audio_devices

    # Try to list audio devices to verify everything works
    try:
        devices = list_audio_devices()
    except Exception:
        app_config['RECORDING_ENABLED'] = False
        app_config['AUDIO_DEVICES'] = []
        return False

    app_config['AUDIO_DEVICES'] = devices
    # Recording is only enabled when at least one device exposes input channels.
    app_config['RECORDING_ENABLED'] = bool(
        devices and any(d.get('max_input_channels', 0) > 0 for d in devices)
    )
    return True

# Initialize recording capability once at import time; this populates
# app_config['RECORDING_ENABLED'] and app_config['AUDIO_DEVICES'].
init_recording()

def get_binary_file_downloader_html(bin_file, file_label='File'):
    """Build an HTML anchor that downloads *bin_file* as a base64 data URI."""
    with open(bin_file, 'rb') as handle:
        payload = handle.read()
    encoded = base64.b64encode(payload).decode()
    filename = os.path.basename(bin_file)
    return (
        f'<a href="data:application/octet-stream;base64,{encoded}" '
        f'download="{filename}">Download {file_label}</a>'
    )

def render_file_uploader():
    """Show the upload hint and return a Streamlit audio-file uploader widget."""
    st.info("ℹ️ Please upload an audio file for transcription (WAV, MP3, or OGG format)")
    uploader = st.file_uploader(
        "Choose an audio file",
        type=["wav", "mp3", "ogg"],
        help="Select an audio file to transcribe (max 30MB)",
        key="file_uploader",
    )
    return uploader

def save_uploaded_file(uploaded_file):
    """Save an uploaded audio file to a temporary file and return the path.

    Args:
        uploaded_file: File-like object with a ``.name`` attribute and a
            ``.read()`` method (e.g. a Streamlit UploadedFile).

    Returns:
        str: Path of the temporary file containing the uploaded bytes.

    Raises:
        ValueError: If the file extension is not wav/mp3/ogg.
    """
    tmp_path = None
    try:
        # Validate file type. BUG FIX: splitext()[1] includes the leading dot
        # (".wav"), so the original membership test against ["wav", ...] always
        # failed; strip the dot before comparing.
        allowed_types = ["wav", "mp3", "ogg"]
        file_ext = os.path.splitext(uploaded_file.name)[1].lower().lstrip('.')
        if file_ext not in allowed_types:
            raise ValueError(f"Unsupported file type: {file_ext}. Allowed: {', '.join(allowed_types)}")

        # Create temp file; stream in chunks to handle large uploads.
        # BUG FIX: Streamlit's UploadedFile has no .chunks() (that's a Django
        # API) — read() in fixed-size chunks instead.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp_file:
            tmp_path = tmp_file.name
            while True:
                chunk = uploaded_file.read(4096)
                if not chunk:
                    break
                tmp_file.write(chunk)
        return tmp_path
    except Exception as e:
        st.error(f"Error saving file: {str(e)}")
        # Best-effort cleanup of a partially written temp file.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        raise

def transcribe_audio(file_path, model_name):
    """Transcribe an audio file with the given Hugging Face model.

    Args:
        file_path: Path to the audio file on disk.
        model_name: Hugging Face model identifier (e.g. 'openai/whisper-small').

    Returns:
        dict: Transcription result containing at least a 'text' key.

    Raises:
        Exception: Re-raised (after sidebar diagnostics) if loading the audio
            or running the transcription fails.
    """
    hf_token = None
    try:
        # BUG FIX: librosa was used below but never imported anywhere in the
        # file; import it lazily here (heavy dependency, only needed for this).
        import librosa

        # Debug info
        st.sidebar.info("ℹ️ Starting transcription...")

        # Debug: Show authentication status
        hf_token = os.getenv('HUGGINGFACE_TOKEN') or os.getenv('HF_TOKEN')
        st.sidebar.info(f"πŸ”‘ Using model: {model_name}")
        st.sidebar.info(f"πŸ”‘ Token present: {'Yes' if hf_token else 'No'}")

        transcriber = HFTranscriber(model_name=model_name)

        # Read the audio file, resampled to 16 kHz mono as ASR models expect
        try:
            audio_data, sample_rate = librosa.load(file_path, sr=16000, mono=True)
            st.sidebar.info(f"πŸ”Š Loaded audio: {len(audio_data)/sample_rate:.2f} seconds @ {sample_rate}Hz")
        except Exception as e:
            raise Exception(f"❌ Error loading audio file: {str(e)}")

        # Transcribe
        with st.spinner("πŸ”„ Transcribing audio..."):
            result = transcriber.transcribe_audio(audio_data, sample_rate)

        if not result or 'text' not in result:
            raise ValueError("❌ No transcription results returned. The model might not be accessible.")

        return result

    except Exception as e:
        # BUG FIX: the original message lacked the f-prefix, so the literal
        # text "{str(e)}" was shown instead of the error.
        st.sidebar.error(f"❌ Transcription failed: {str(e)}")
        st.sidebar.error("This might be due to:")
        st.sidebar.error("1. Invalid or missing Hugging Face token")
        st.sidebar.error("2. Insufficient permissions for the model")
        st.sidebar.error("3. Network connectivity issues")
        st.sidebar.error("4. Model not found or not accessible")

        # Debug info (was unreachable in the original: it sat after `raise`)
        st.sidebar.error("πŸ” Debug Info:")
        st.sidebar.json({
            "model": model_name,
            "token_present": bool(hf_token),
            "token_prefix": hf_token[:8] + '...' if hf_token else None,
            "error": str(e)
        })
        raise

def record_audio():
    """Render the recording UI; return the path of a finished recording, if any."""
    st.header("🎀 Record Audio")

    if not app_config['RECORDING_ENABLED']:
        st.warning("Audio recording is not available on this device.")
        return

    recorder_cls = app_config['AudioRecorder']

    # Keep one recorder instance alive across reruns.
    if 'recorder' not in st.session_state:
        st.session_state.recorder = recorder_cls()

    start_col, stop_col = st.columns(2)

    with start_col:
        if st.button("🎀 Start Recording"):
            st.session_state.recorder.start()
            st.session_state.recording = True
            # NOTE(review): st.experimental_rerun is deprecated in newer
            # Streamlit releases in favor of st.rerun — confirm the pinned
            # Streamlit version before migrating.
            st.experimental_rerun()

    with stop_col:
        stop_clicked = st.button("⏹️ Stop Recording")
        if stop_clicked and st.session_state.get('recording', False):
            captured = st.session_state.recorder.stop()
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            target = os.path.join("outputs", f"recording_{stamp}.wav")
            os.makedirs("outputs", exist_ok=True)
            captured.export(target, format="wav")
            st.session_state.recorded_file = target
            st.session_state.recording = False
            st.experimental_rerun()

    if st.session_state.get('recording', False):
        st.warning("Recording in progress... Click 'Stop Recording' when finished.")

    if 'recorded_file' in st.session_state and os.path.exists(st.session_state.recorded_file):
        st.audio(st.session_state.recorded_file)
        return st.session_state.recorded_file

    return None
# Connectivity smoke test, callable before main() for debugging.
def test_hf_connection():
    """Verify Hugging Face reachability by running a tiny sentiment pipeline."""
    from transformers import pipeline
    try:
        classifier = pipeline(
            "text-classification",
            model="distilbert-base-uncased-finetuned-sst-2-english",
        )
        output = classifier("This is a test")
        st.sidebar.success("βœ… Connection to Hugging Face successful!")
        st.sidebar.json(output[0])
    except Exception as e:
        st.sidebar.error(f"❌ Connection failed: {str(e)}")

def main():
    """Render the full app: title, sidebar settings, audio input, transcription.

    Flow: choose input (record or upload) -> save/inspect the audio -> on
    request, transcribe it with the selected Hugging Face model and offer the
    result for download.
    """
    st.title("🎡 Audio to Sheet Music Transcriber")
    st.markdown("### Record or upload audio for transcription")
    
    # Model selection in sidebar
    with st.sidebar:
        st.header("πŸ”§ Settings")
        
        # Model selection
        model_options = {
            "Whisper Small (Recommended)": "openai/whisper-small",
            "Whisper Tiny": "openai/whisper-tiny",
            "Whisper Base": "openai/whisper-base",
            "Wav2Vec2 Base": "facebook/wav2vec2-base-960h"
        }
        
        selected_model = st.selectbox(
            "Select Model",
            options=list(model_options.keys()),
            index=0,
            help="Choose the transcription model. Whisper models generally provide better accuracy."
        )
        model_name = model_options[selected_model]
    
    # Main content area - Tabs for different input methods
    if app_config['RECORDING_ENABLED']:
        tab1, tab2 = st.tabs(["🎀 Record Audio", "πŸ“ Upload File"])
        recorded_file = None
        uploaded_file = None
        
        with tab1:
            recorded_file = record_audio()
        
        with tab2:
            uploaded_file = render_file_uploader()
    else:
        # If recording is not available, just show the file uploader
        st.info("🎀 Audio recording is not available in this environment. Please upload an audio file instead.")
        uploaded_file = render_file_uploader()
        recorded_file = None
    
    # BUG FIX: temp_file_path could previously be referenced while unbound if
    # processing failed; track it explicitly from the start.
    temp_file_path = None
    if uploaded_file is not None or recorded_file:
        with st.spinner("Processing audio..."):
            try:
                # Get the file path (either recorded or uploaded)
                if recorded_file:
                    temp_file_path = recorded_file
                    file_ext = os.path.splitext(temp_file_path)[1][1:]
                else:
                    temp_file_path = save_uploaded_file(uploaded_file)
                    file_ext = os.path.splitext(uploaded_file.name)[1][1:]
                
                # Display the audio player
                st.audio(temp_file_path, format=f'audio/{file_ext}')
                
                # Show file info
                file_size = os.path.getsize(temp_file_path) / (1024 * 1024)  # in MB
                st.info(f"πŸ“‚ Processing: {os.path.basename(temp_file_path)} ({file_size:.2f} MB)")
                
            except Exception as e:
                st.error(f"Error processing uploaded file: {str(e)}")
                if temp_file_path and os.path.exists(temp_file_path):
                    try:
                        os.remove(temp_file_path)
                    except OSError:
                        pass
                # Don't offer transcription for a file that failed processing.
                temp_file_path = None
    
    # Transcription Section
    # BUG FIX: previously gated on `uploaded_file is not None`, so recordings
    # could never be transcribed; gate on the resolved audio path instead.
    if temp_file_path:
        audio_file = temp_file_path
        
        # Add model selection
        model_options = {
            "Whisper Small": "openai/whisper-small",
            "Whisper Tiny": "openai/whisper-tiny",
            "Whisper Base": "openai/whisper-base",
            "Wav2Vec2 Base": "facebook/wav2vec2-base-960h",
            "SpeechT5": "microsoft/speecht5_asr"
        }
        selected_model = st.selectbox(
            "Select Transcription Model",
            options=list(model_options.keys()),
            index=0
        )
        
        if st.button("🎡 Transcribe Audio"):
            try:
                with st.spinner("Transcribing audio..."):
                    # Initialize the transcriber with the selected model
                    model_name = model_options[selected_model]
                    transcriber = HFTranscriber(model_name=model_name)
                    
                    # Create output directory
                    os.makedirs("outputs", exist_ok=True)
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    
                    try:
                        # Transcribe the audio
                        result = transcriber.transcribe_audio(audio_file)
                        
                        # Save transcription to file
                        txt_output = os.path.join("outputs", f"transcription_{timestamp}.txt")
                        with open(txt_output, 'w', encoding='utf-8') as f:
                            f.write(result['transcription'])
                        
                        # Show success message
                        st.success("🎡 Transcription completed successfully!")
                        
                        # Display the transcription
                        st.markdown("### Transcription Result")
                        st.text_area("Transcription", result['transcription'], height=200)
                        
                        # Show download link
                        st.markdown("### Download Results")
                        st.download_button(
                            label="Download Transcription",
                            data=result['transcription'],
                            file_name=f"transcription_{timestamp}.txt",
                            mime="text/plain"
                        )
                        
                        # If we have timing information, show it.
                        # BUG FIX: the original also referenced the undefined
                        # variables midi_output / musicxml_output here, raising
                        # NameError whenever timestamps were present; those
                        # leftover references have been removed.
                        if 'word_timestamps' in result and result['word_timestamps']:
                            st.markdown("### Word-level Timestamps")
                            st.json(result['word_timestamps'])
                        
                        # Display a preview of the transcription if possible
                        # (the unused IPython.display import was dropped).
                        try:
                            st.audio(audio_file, format='audio/wav')
                        except Exception as e:
                            st.warning(f"Could not display audio preview: {str(e)}")
                            
                    except Exception as e:
                        st.error(f"❌ Error during transcription: {str(e)}")
                        st.exception(e)  # Show full traceback for debugging
            
            except Exception as e:
                st.error(f"An error occurred during transcription setup: {str(e)}")
                st.exception(e)  # Show full traceback for debugging
    
    # Clean up temporary files left over from earlier runs
    if os.path.exists("temp_uploads"):
        for file in os.listdir("temp_uploads"):
            try:
                os.remove(os.path.join("temp_uploads", file))
            except OSError:
                pass

def clean_up_recordings(keep_last=5):
    """Delete old .wav recordings, keeping only the most recent ones.

    Args:
        keep_last: Number of most-recently-modified recordings to retain.
    """
    try:
        # Use absolute path for the recordings directory
        recordings_dir = os.path.abspath("recordings")
        
        # Ensure the recordings directory exists
        os.makedirs(recordings_dir, exist_ok=True)
        
        # BUG FIX: the original called glob.glob() without importing glob,
        # raising NameError (silently swallowed by the outer except, so
        # cleanup never actually ran). os.listdir needs no new import.
        wav_files = [
            os.path.join(recordings_dir, name)
            for name in os.listdir(recordings_dir)
            if name.lower().endswith(".wav")
        ]
        # Newest first, by modification time
        wav_files.sort(key=os.path.getmtime, reverse=True)
        
        # Remove old recordings, keeping the specified number
        for old_recording in wav_files[keep_last:]:
            try:
                if os.path.exists(old_recording):
                    os.remove(old_recording)
                    print(f"Removed old recording: {old_recording}")
            except Exception as e:
                print(f"Error removing {old_recording}: {e}")
    except Exception as e:
        print(f"Error in clean_up_recordings: {e}")

if __name__ == "__main__":
    # Make sure the output directory exists before the app writes to it.
    os.makedirs("outputs", exist_ok=True)
    
    # Launch the Streamlit app.
    main()
    
    # Footer, rendered after the main UI.
    st.markdown("---")
    st.markdown("### About")
    st.markdown("""
    This app uses Hugging Face's Transformers library for speech-to-text transcription.
    Models are loaded on-demand and require an internet connection.
    
    **Note:** This version supports both file uploads and live recording (if your device supports it).
    """)