Update app.py
Browse files
app.py
CHANGED
@@ -4,16 +4,18 @@ import base64
|
|
4 |
from PIL import Image
|
5 |
from io import BytesIO
|
6 |
import os
|
|
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
# Set the API key
|
12 |
os.environ["TOGETHER_API_KEY"] = "tgp_v1_ci8Tlva09oBrdDV89ULFNcyPgnR9NwNTQyvQ_4XBw3M"
|
|
|
13 |
|
14 |
# Initialize the Together client
|
15 |
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])
|
16 |
|
|
|
|
|
|
|
17 |
# Function to encode image to base64
|
18 |
def encode_image(image):
|
19 |
buffered = BytesIO()
|
@@ -23,7 +25,7 @@ def encode_image(image):
|
|
23 |
|
24 |
# Function to get image description from Together API
|
25 |
def get_image_description(image):
|
26 |
-
get_description_prompt = "Describe the given image in detail in only 20 words max"
|
27 |
|
28 |
# Encode the image to base64
|
29 |
base64_image = encode_image(image)
|
@@ -49,6 +51,27 @@ def get_image_description(image):
|
|
49 |
# Return the result from the API
|
50 |
return response.choices[0].message.content
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
# Custom CSS for a futuristic look
|
53 |
st.markdown(
|
54 |
"""
|
@@ -93,34 +116,10 @@ st.markdown(
|
|
93 |
unsafe_allow_html=True,
|
94 |
)
|
95 |
|
96 |
-
def tts(text):
|
97 |
-
key = 'sk_8db078175c4eba3e2d4b500ffb167c46326e520bdf001f0e'
|
98 |
-
|
99 |
-
|
100 |
-
from elevenlabs.client import ElevenLabs
|
101 |
-
from elevenlabs import play
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
client = ElevenLabs(api_key=key)
|
106 |
-
|
107 |
-
|
108 |
-
audio = client.text_to_speech.convert(
|
109 |
-
text=text,
|
110 |
-
voice_id="JBFqnCBsd6RMkjVDRZzb",
|
111 |
-
model_id="eleven_multilingual_v2",
|
112 |
-
output_format="mp3_44100_128",
|
113 |
-
)
|
114 |
-
|
115 |
-
|
116 |
-
play(audio)
|
117 |
-
|
118 |
# Streamlit app layout
|
119 |
st.title("๐ฎ Visox | Koshur AI")
|
120 |
st.markdown("### See the world through AI's eyes!")
|
121 |
|
122 |
-
|
123 |
-
|
124 |
# Sidebar for additional info
|
125 |
st.sidebar.markdown("## About")
|
126 |
st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real-time.")
|
@@ -130,20 +129,22 @@ st.sidebar.markdown("Powered by [Together AI](https://together.ai) and Streamlit
|
|
130 |
img_file_buffer = st.camera_input("Take a picture")
|
131 |
|
132 |
if img_file_buffer is not None:
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
|
|
|
|
|
4 |
from PIL import Image
from io import BytesIO
import os

from elevenlabs.client import ElevenLabs

# --- API credentials --------------------------------------------------------
# SECURITY: these keys were previously hard-coded and committed to source,
# which (a) leaks the secrets — both keys must be considered compromised and
# rotated — and (b) the unconditional `os.environ[...] = ...` assignment
# clobbered any key supplied by the real environment. `setdefault` keeps the
# app runnable exactly as before while letting a deployment override via
# environment variables (or st.secrets). Do NOT ship secrets in code.
os.environ.setdefault("TOGETHER_API_KEY", "tgp_v1_ci8Tlva09oBrdDV89ULFNcyPgnR9NwNTQyvQ_4XBw3M")
os.environ.setdefault("ELEVENLABS_API_KEY", "sk_8db078175c4eba3e2d4b500ffb167c46326e520bdf001f0e")

# Initialize the Together client (`Together` is imported earlier in the file).
together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Initialize the ElevenLabs client used by tts().
elevenlabs_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
19 |
# Function to encode image to base64
|
20 |
def encode_image(image):
|
21 |
buffered = BytesIO()
|
|
|
25 |
|
26 |
# Function to get image description from Together API
|
27 |
def get_image_description(image):
|
28 |
+
get_description_prompt = "Describe the given image in detail in only 20 words max."
|
29 |
|
30 |
# Encode the image to base64
|
31 |
base64_image = encode_image(image)
|
|
|
51 |
# Return the result from the API
|
52 |
return response.choices[0].message.content
|
53 |
|
54 |
+
# Function to convert text to speech using ElevenLabs
def tts(text):
    """Convert *text* to speech with the ElevenLabs API and play it in Streamlit.

    The generated MP3 is written to a temporary file and rendered with
    ``st.audio``. Any API or I/O failure is reported via ``st.error`` rather
    than crashing the app.
    """
    try:
        # In the ElevenLabs v1+ SDK, convert() streams the audio back as an
        # iterator of byte chunks (NOT a single bytes object).
        audio = elevenlabs_client.text_to_speech.convert(
            text=text,
            voice_id="JBFqnCBsd6RMkjVDRZzb",  # Replace with your preferred voice ID
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # BUG FIX: writing the raw return value failed with TypeError when the
        # SDK returned a chunk generator (the broad except below masked it as
        # "Error generating speech"). Join chunks into bytes; tolerate older
        # SDKs that already return bytes.
        if isinstance(audio, (bytes, bytearray)):
            audio_bytes = bytes(audio)
        else:
            audio_bytes = b"".join(audio)

        # Save the audio to a temporary file so st.audio can serve it.
        audio_path = "temp_audio.mp3"
        with open(audio_path, "wb") as f:
            f.write(audio_bytes)

        # Play the audio in Streamlit.
        st.audio(audio_path, format="audio/mp3")
    except Exception as e:
        # Surface the failure in the UI; keep the app alive.
        st.error(f"Error generating speech: {e}")
|
74 |
+
|
75 |
# Custom CSS for a futuristic look
|
76 |
st.markdown(
|
77 |
"""
|
|
|
116 |
unsafe_allow_html=True,
|
117 |
)
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
# Streamlit app layout
# NOTE: the emoji below were mojibake in the committed file (UTF-8 emoji bytes
# decoded as a legacy codepage, e.g. "๐ฎ"); restored to the intended glyphs.
st.title("🔮 Visox | Koshur AI")
st.markdown("### See the world through AI's eyes!")

# Sidebar for additional info
st.sidebar.markdown("## About")
st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real-time.")

# Camera widget: returns None until the user actually takes a picture.
img_file_buffer = st.camera_input("Take a picture")

if img_file_buffer is not None:
    try:
        # Convert the image file buffer to a PIL Image
        img = Image.open(img_file_buffer)

        # Display the captured image
        st.image(img, caption='Captured Image', use_column_width=True)

        # Get and display the description
        with st.spinner('🔍 Analyzing the image...'):
            description = get_image_description(img)
        st.success('✅ Analysis complete!')
        st.markdown("### AI Description:")
        st.write(description)

        # Convert description to speech and play it.
        # NOTE(review): pressing the button reruns the script, which re-calls
        # get_image_description (an extra paid API call) before speaking —
        # consider caching the description in st.session_state.
        if st.button("🔊 Read Aloud"):
            tts(description)
    except Exception as e:
        # Catch-all UI boundary: report any capture/API failure to the user.
        st.error(f"An error occurred: {e}")
|