import streamlit as st
from PIL import Image
import io
import soundfile as sf
import numpy as np
import torch
from transformers import pipeline
from diffusers import StableAudioPipeline

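# Assumed environment (nothing is pinned in this file): streamlit, pillow,
# soundfile, numpy, torch, transformers, and diffusers must be installed, e.g.:
#   pip install streamlit pillow soundfile numpy torch transformers diffusers
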
# --- Configuration ---
# Determine the optimal device for model inference
# Prioritize CUDA (NVIDIA GPUs), then MPS (Apple Silicon), fallback to CPU
DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")

# Use float16 for reduced memory and faster inference on compatible hardware (GPU/MPS)
# Fallback to float32 for CPU for better stability
TORCH_DTYPE = torch.float16 if DEVICE in ["cuda", "mps"] else torch.float32
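# Note (assumption): float16 on MPS can be flaky with some diffusers ops;
# if generation produces NaNs on Apple Silicon, forcing torch.float32 is a
# safe fallback.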

# --- Cached Model Loading Functions ---
@st.cache_resource(show_spinner="Loading Image Captioning Model (BLIP)...")
def load_blip_model():
    """
    Loads the BLIP image captioning model using Hugging Face transformers pipeline.
    The model is cached to prevent reloading on every Streamlit rerun.
    """
    try:
        captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-base",
            torch_dtype=TORCH_DTYPE,
            device=DEVICE
        )
        return captioner
    except Exception as e:
        st.error(f"Failed to load BLIP model: {e}")
        return None

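# For reference: the "image-to-text" pipeline returns a list of dicts such as
# [{'generated_text': 'a lake surrounded by mountains'}] (the caption text here
# is illustrative only); the app reads results[0]['generated_text'] below.
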
@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open Small)...")
def load_stable_audio_model():
    """
    Loads the Stable Audio Open Small pipeline using Hugging Face diffusers.
    The pipeline is cached to prevent reloading on every Streamlit rerun.
    """
    try:
        # Changed model to stabilityai/stable-audio-open-small
        audio_pipeline = StableAudioPipeline.from_pretrained(
            "stabilityai/stable-audio-open-1.0",
            torch_dtype=TORCH_DTYPE
        ).to(DEVICE)
        return audio_pipeline
    except Exception as e:
        st.error(f"Failed to load Stable Audio model: {e}")
        return None

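# Note: at the time of writing, stabilityai/stable-audio-open-1.0 is a gated
# checkpoint on the Hugging Face Hub; the first download requires accepting the
# model license and authenticating (e.g. `huggingface-cli login` or HF_TOKEN).
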
# --- Audio Conversion Utility ---
def convert_numpy_to_wav_bytes(audio_array: np.ndarray, sample_rate: int) -> bytes:
    """
    Converts a NumPy audio array to an in-memory WAV byte stream.
    This avoids writing temporary files to disk, which is efficient and
    suitable for ephemeral environments like Hugging Face Spaces.
    """
    byte_io = io.BytesIO()
    
    # Stable Audio Open's diffusers output has shape (channels, frames), but
    # soundfile expects (frames, channels) for multi-channel audio.
    # Transpose 2-D (stereo) arrays to match soundfile's expectation.
    if audio_array.ndim == 2 and audio_array.shape[0] == 2:  # stereo, channels first
        audio_array = audio_array.T  # transpose to (frames, channels)

    # Write the NumPy array to the in-memory BytesIO object as a WAV file
    sf.write(byte_io, audio_array, sample_rate, format='WAV', subtype='FLOAT')

    # IMPORTANT: Reset the stream position to the beginning before reading
    byte_io.seek(0)
    return byte_io.read()

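# Example usage with hypothetical data (kept commented out so it never runs as
# part of the app): two seconds of stereo noise at 44.1 kHz, channels-first as
# described above.
#   demo = np.random.randn(2, 2 * 44100).astype(np.float32)  # (channels, frames)
#   wav_bytes = convert_numpy_to_wav_bytes(demo, 44100)
#   open("demo.wav", "wb").write(wav_bytes)
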
# --- Streamlit App Layout ---
st.set_page_config(layout="centered", page_title="Image-to-Soundscape Generator")
st.title("๐Ÿž๏ธ Image-to-Soundscape Generator ๐ŸŽถ")
st.markdown("Upload a landscape image, and let AI transform it into a unique soundscape!")

# Initialize session state for persistence across reruns
if "audio_bytes" not in st.session_state:
    st.session_state.audio_bytes = None
if "image_uploaded" not in st.session_state:
    st.session_state.image_uploaded = False

# --- UI Components ---
uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    st.session_state.image_uploaded = True
    image = Image.open(uploaded_file).convert("RGB") # Ensure image is in RGB format
    st.image(image, caption="Uploaded Image", use_container_width=True)

    # Button to trigger the generation pipeline
    if st.button("Generate Soundscape"):
        st.session_state.audio_bytes = None # Clear previous audio
        
        with st.spinner("Generating soundscape... This may take a moment."):
            try:
                # 1. Load BLIP model and generate caption (hidden from user)
                captioner = load_blip_model()
                if captioner is None:
                    st.error("Image captioning model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False # Reset to allow re-upload
                    st.stop()

                # Generate caption
                # The BLIP pipeline expects a PIL Image object directly
                caption_results = captioner(image)
                # Extract the generated text from the pipeline's output 
                generated_caption = caption_results[0]['generated_text']

                # Optional: Enhance prompt for soundscape generation
                # This helps guide the audio model towards environmental sounds
                soundscape_prompt = f"A soundscape of {generated_caption}"

                # 2. Load Stable Audio model and generate audio
                audio_pipeline = load_stable_audio_model()
                if audio_pipeline is None:
                    st.error("Audio generation model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False # Reset to allow re-upload
                    st.stop()

                # Generate audio with parameters tuned for speed over quality.
                # num_inference_steps: lower is faster; raise it for higher quality
                # audio_end_in_s: shorter clips generate faster
                # negative_prompt: steers generation away from low-quality output
                audio_output = audio_pipeline(
                    prompt=soundscape_prompt,
                    num_inference_steps=10,  # tuned for faster generation
                    audio_end_in_s=5,        # 5-second clip
                    negative_prompt="low quality, average quality, distorted"
                )
                
                # Extract the audio as a NumPy array with shape (channels, frames);
                # .audios is a torch tensor of shape (batch, channels, frames)
                audio_numpy_array = audio_output.audios[0].float().cpu().numpy()
                sample_rate = audio_pipeline.vae.sampling_rate

                # 3. Convert NumPy array to WAV bytes and store in session state
                st.session_state.audio_bytes = convert_numpy_to_wav_bytes(audio_numpy_array, sample_rate)
                
                st.success("Soundscape generated successfully!")

            except Exception as e:
                st.error(f"An error occurred during generation: {e}") # 
                st.session_state.audio_bytes = None # Clear any partial audio
                st.session_state.image_uploaded = False # Reset to allow re-upload
                st.exception(e) # Display full traceback for debugging 

# Display generated soundscape if available in session state
if st.session_state.audio_bytes:
    st.subheader("Generated Soundscape:")
    st.audio(st.session_state.audio_bytes, format='audio/wav')
    st.markdown("You can download the audio using the controls above.")

# Reset button for new image upload
if st.session_state.image_uploaded and st.button("Upload New Image"):
    st.session_state.audio_bytes = None
    st.session_state.image_uploaded = False
    st.rerun() # Rerun the app to clear the file uploader
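
# To run the app locally (the filename "app.py" is an assumption):
#   streamlit run app.py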