import io

import numpy as np
import soundfile as sf
import streamlit as st
import torch
from PIL import Image
from transformers import pipeline
from diffusers import StableAudioPipeline

# --- Configuration ---
# Determine the optimal device for model inference:
# prioritize CUDA (NVIDIA GPUs), then MPS (Apple Silicon), and fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")

# Use float16 for reduced memory use and faster inference on GPU/MPS;
# fall back to float32 on CPU for better numerical stability.
TORCH_DTYPE = torch.float16 if DEVICE in ["cuda", "mps"] else torch.float32

# --- Cached Model Loading Functions ---
@st.cache_resource(show_spinner="Loading Image Captioning Model (BLIP)...")
def load_blip_model():
    """
    Loads the BLIP image captioning model via the Hugging Face transformers pipeline.
    The model is cached to prevent reloading on every Streamlit rerun.
    """
    try:
        captioner = pipeline(
            "image-to-text",
            model="Salesforce/blip-image-captioning-base",
            torch_dtype=TORCH_DTYPE,
            device=DEVICE,
        )
        return captioner
    except Exception as e:
        st.error(f"Failed to load BLIP model: {e}")
        return None


@st.cache_resource(show_spinner="Loading Audio Generation Model (Stable Audio Open)...")
def load_stable_audio_model():
    """
    Loads the Stable Audio Open 1.0 pipeline via Hugging Face diffusers.
    The pipeline is cached to prevent reloading on every Streamlit rerun.
    """
    try:
        audio_pipeline = StableAudioPipeline.from_pretrained(
            "stabilityai/stable-audio-open-1.0",
            torch_dtype=TORCH_DTYPE,
        ).to(DEVICE)
        return audio_pipeline
    except Exception as e:
        st.error(f"Failed to load Stable Audio model: {e}")
        return None
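
# Note: @st.cache_resource keeps one model instance per server process, shared
# across reruns and sessions. A minimal sketch for freeing memory if you ever
# need to swap models at runtime (assumes a reasonably recent Streamlit; the
# exact memory savings depend on your hardware):
#
#   load_blip_model.clear()          # evict the cached captioner
#   load_stable_audio_model.clear()  # evict the cached audio pipeline
#   if torch.cuda.is_available():
#       torch.cuda.empty_cache()     # release cached CUDA memory
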
# --- Audio Conversion Utility ---
def convert_numpy_to_wav_bytes(audio_array: np.ndarray, sample_rate: int) -> bytes:
    """
    Converts a NumPy audio array to an in-memory WAV byte stream.
    This avoids writing temporary files to disk, which is efficient and
    suitable for ephemeral environments such as Hugging Face Spaces.
    """
    byte_io = io.BytesIO()
    # Stable Audio Open's diffusers output is (channels, frames), while
    # soundfile expects (frames, channels) for multi-channel audio, so
    # transpose 2-D (stereo) arrays to match soundfile's convention.
    if audio_array.ndim == 2 and audio_array.shape[0] == 2:  # stereo: 2 channels
        audio_array = audio_array.T  # (channels, frames) -> (frames, channels)
    # Write the array to the in-memory BytesIO object as a 32-bit float WAV file.
    sf.write(byte_io, audio_array, sample_rate, format="WAV", subtype="FLOAT")
    # IMPORTANT: reset the stream position to the beginning before reading.
    byte_io.seek(0)
    return byte_io.read()
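
# Example usage (hypothetical values): two seconds of stereo silence at 44.1 kHz.
#   silence = np.zeros((2, 88_200), dtype=np.float32)  # (channels, frames)
#   wav_bytes = convert_numpy_to_wav_bytes(silence, 44_100)
#   st.audio(wav_bytes, format="audio/wav")
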
# --- Streamlit App Layout ---
st.set_page_config(layout="centered", page_title="Image-to-Soundscape Generator")

st.title("🏞️ Image-to-Soundscape Generator 🎶")
st.markdown("Upload a landscape image, and let AI transform it into a unique soundscape!")

# Initialize session state so results persist across reruns.
if "audio_bytes" not in st.session_state:
    st.session_state.audio_bytes = None
if "image_uploaded" not in st.session_state:
    st.session_state.image_uploaded = False

# --- UI Components ---
uploaded_file = st.file_uploader("Choose a landscape image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    st.session_state.image_uploaded = True
    image = Image.open(uploaded_file).convert("RGB")  # Ensure the image is in RGB format
    st.image(image, caption="Uploaded Image", use_container_width=True)  # replaces deprecated use_column_width

    # Button to trigger the generation pipeline
    if st.button("Generate Soundscape"):
        st.session_state.audio_bytes = None  # Clear previous audio
        with st.spinner("Generating soundscape... This may take a moment."):
            try:
                # 1. Load the BLIP model and generate a caption (hidden from the user).
                captioner = load_blip_model()
                if captioner is None:
                    st.error("Image captioning model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False  # Reset to allow re-upload
                    st.stop()

                # The BLIP pipeline accepts a PIL Image object directly.
                caption_results = captioner(image)
                # Extract the generated text from the pipeline's output.
                generated_caption = caption_results[0]["generated_text"]

                # Optional: enhance the prompt to guide the audio model
                # towards environmental sounds.
                soundscape_prompt = f"A soundscape of {generated_caption}"

                # 2. Load the Stable Audio model and generate audio.
                audio_pipeline = load_stable_audio_model()
                if audio_pipeline is None:
                    st.error("Audio generation model could not be loaded. Please try again.")
                    st.session_state.image_uploaded = False  # Reset to allow re-upload
                    st.stop()

                # Generate audio with parameters tuned for speed:
                # - num_inference_steps: lower is faster, higher gives better quality
                # - audio_end_in_s: shorter clips generate faster
                # - negative_prompt: helps improve perceived quality
                audio_output = audio_pipeline(
                    prompt=soundscape_prompt,
                    num_inference_steps=10,  # Tuned for faster generation
                    audio_end_in_s=5.0,  # 5 seconds of audio
                    negative_prompt="low quality, average quality, distorted",
                )

                # Extract the first waveform as a float32 NumPy array of shape
                # (channels, frames), along with the VAE's sampling rate.
                audio_numpy_array = audio_output.audios[0].float().cpu().numpy()
                sample_rate = audio_pipeline.vae.sampling_rate

                # 3. Convert the NumPy array to WAV bytes and store it in session state.
                st.session_state.audio_bytes = convert_numpy_to_wav_bytes(audio_numpy_array, sample_rate)
                st.success("Soundscape generated successfully!")
            except Exception as e:
                st.error(f"An error occurred during generation: {e}")
                st.session_state.image_uploaded = False  # Reset to allow re-upload
                st.exception(e)  # Display the full traceback for debugging

# Display the generated soundscape if it is available in session state.
if st.session_state.audio_bytes:
    st.subheader("Generated Soundscape:")
    st.audio(st.session_state.audio_bytes, format="audio/wav")

# Reset button for a new image upload
if st.session_state.image_uploaded and st.button("Upload New Image"):
    st.session_state.audio_bytes = None
    st.session_state.image_uploaded = False
    st.rerun()  # Rerun the app to clear the file uploader
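
# --- Running locally (assumed setup, inferred from the imports above) ---
# A minimal sketch, assuming this file is saved as app.py; package versions
# are not pinned by this script and accelerate is a common but unverified
# companion for diffusers model loading:
#   pip install streamlit pillow numpy soundfile torch transformers diffusers accelerate
#   streamlit run app.py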