# visox / app.py
# adil9858's picture
# Update app.py
# bfa5acf verified
import streamlit as st
from together import Together
import base64
from PIL import Image
from io import BytesIO
import os
from elevenlabs.client import ElevenLabs
# Publish the API keys held in Streamlit secrets as environment variables,
# in the same order as before: Together first, then ElevenLabs.
for _env_var, _secret_key in (
    ("TOGETHER_API_KEY", "together_api"),
    ("ELEVENLABS_API_KEY", "elevenlabs_api"),
):
    os.environ[_env_var] = st.secrets[_secret_key]

# Together client, used for the image-description chat completion
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])
# ElevenLabs client, used for text-to-speech
elevenlabs_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
# Function to encode image to base64
def encode_image(image):
    """Return *image* serialized as JPEG and base64-encoded.

    Args:
        image: A PIL image (any object exposing ``mode``, ``convert`` and
            ``save`` in the PIL style).

    Returns:
        The JPEG bytes of the image as a base64 ``str``.
    """
    # Pillow's JPEG writer rejects modes carrying alpha or a palette
    # (e.g. RGBA, LA, P) with OSError. Convert only those; modes the writer
    # already accepts are passed through unchanged.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")
# Function to get image description from Together API
def get_image_description(
    image,
    prompt="Describe the given image in detail in only 20 words max.",
    model="meta-llama/Llama-Vision-Free",
):
    """Ask the Together chat-completions API to describe *image*.

    Args:
        image: Image accepted by ``encode_image`` (a PIL image in this app).
        prompt: Instruction sent alongside the image. Defaults to the
            20-word description prompt this app has always used.
        model: Together model identifier. Defaults to the vision model this
            app has always used.

    Returns:
        The message content of the first choice in the API response.
    """
    # The image travels inline as a base64 JPEG data URL
    base64_image = encode_image(image)
    # Single user message carrying both the text prompt and the image
    response = together_client.chat.completions.create(
        model=model,
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        }],
        stream=False,
    )
    # Return the result from the API
    return response.choices[0].message.content
# Function to convert text to speech using ElevenLabs
def tts(text):
    """Synthesize *text* with ElevenLabs and autoplay it in the page.

    Args:
        text: The text to speak. Empty, whitespace-only or non-string input
            produces a warning instead of an API call.

    Returns:
        None. The audio player (or a warning/error message) is rendered
        through Streamlit.
    """
    if not isinstance(text, str) or not text.strip():
        st.warning("No text available to convert to speech.")
        return
    try:
        # Generate the audio (returns an iterable of byte chunks)
        audio_chunks = elevenlabs_client.text_to_speech.convert(
            text=text,
            voice_id="JBFqnCBsd6RMkjVDRZzb",  # Replace with your preferred voice ID
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )
        # Keep the audio in memory: a fixed file name in the working
        # directory would be shared by every session running this script.
        audio_bytes = b"".join(audio_chunks)
        # Play the audio in Streamlit
        st.audio(audio_bytes, format="audio/mp3", autoplay=True)
    except Exception as e:
        # UI boundary: show the failure to the user rather than aborting
        # the script run.
        st.error(f"Error generating speech: {e}")
# Custom CSS for a futuristic look: dark gradient background, gradient
# buttons, rounded image cards and coloured headings.
_FUTURISTIC_CSS = """
<style>
.stApp {
background: linear-gradient(135deg, #1e1e2f, #2a2a40);
color: #ffffff;
font-family: 'Arial', sans-serif;
}
.stButton>button {
background: linear-gradient(135deg, #6a11cb, #2575fc);
color: white;
border: none;
border-radius: 12px;
padding: 10px 20px;
font-size: 16px;
font-weight: bold;
}
.stButton>button:hover {
background: linear-gradient(135deg, #2575fc, #6a11cb);
}
.stImage {
border-radius: 12px;
box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}
.stMarkdown h1 {
color: #6a11cb;
text-align: center;
font-size: 36px;
font-weight: bold;
}
.stMarkdown h2 {
color: #2575fc;
font-size: 24px;
font-weight: bold;
}
.stSpinner>div {
color: #6a11cb;
}
</style>
"""
# The stylesheet is passed as raw HTML, hence unsafe_allow_html
st.markdown(_FUTURISTIC_CSS, unsafe_allow_html=True)
# Streamlit app layout
# The title emoji was mis-decoded UTF-8 (mojibake); restored to U+1F52E.
st.title("🔮 Visox | Koshur AI")
st.markdown("### See the world through AI's eyes!")
# Sidebar for additional info
st.sidebar.markdown("## About")
st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real-time.")
st.sidebar.markdown("Powered by [Together AI](https://together.ai) and Streamlit.")
# Access the camera
img_file_buffer = st.camera_input("Take a picture")
if img_file_buffer is not None:
    # Convert the image file buffer to a PIL Image
    img = Image.open(img_file_buffer)
    # Display the captured image
    st.image(img, caption='Captured Image', width=300)
    # Get the description; an API failure is reported in the page instead of
    # aborting the script run, matching how tts() reports its own errors.
    try:
        with st.spinner('🔍 Analyzing the image...'):
            description = get_image_description(img)
    except Exception as e:
        st.error(f"Error analyzing the image: {e}")
    else:
        st.success('✅ Analysis complete!')
        st.markdown("### AI Description:")
        st.write(description)
        # Convert description to speech and play it
        tts(description)