adil9858 committed on
Commit 27e9a0a · verified · 1 Parent(s): 3a448ef

Update app.py

Files changed (1): app.py +47 -46
app.py CHANGED
@@ -4,16 +4,18 @@ import base64
 from PIL import Image
 from io import BytesIO
 import os
+from elevenlabs.client import ElevenLabs
 
-
-
-
-# Set the API key
+# Set API keys as environment variables
 os.environ["TOGETHER_API_KEY"] = "tgp_v1_ci8Tlva09oBrdDV89ULFNcyPgnR9NwNTQyvQ_4XBw3M"
+os.environ["ELEVENLABS_API_KEY"] = "sk_8db078175c4eba3e2d4b500ffb167c46326e520bdf001f0e"
 
 # Initialize the Together client
 together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])
 
+# Initialize ElevenLabs client
+elevenlabs_client = ElevenLabs(api_key=os.environ["ELEVENLABS_API_KEY"])
+
 # Function to encode image to base64
 def encode_image(image):
     buffered = BytesIO()
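Note on the hunk above: both API keys are committed as plain string literals. A minimal sketch of the same setup reading the keys from the Space's environment or secrets instead; the variable and key names mirror the diff, but the environment-based configuration and the error handling are assumptions, not part of the committed code:

    import os
    import streamlit as st
    from together import Together
    from elevenlabs.client import ElevenLabs

    # Assumption: the keys are provided as Space secrets / environment variables
    # rather than hard-coded in app.py.
    together_key = os.environ.get("TOGETHER_API_KEY")
    elevenlabs_key = os.environ.get("ELEVENLABS_API_KEY")

    if not together_key or not elevenlabs_key:
        st.error("Set TOGETHER_API_KEY and ELEVENLABS_API_KEY in the environment.")
        st.stop()

    together_client = Together(api_key=together_key)
    elevenlabs_client = ElevenLabs(api_key=elevenlabs_key)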
@@ -23,7 +25,7 @@ def encode_image(image):
 
 # Function to get image description from Together API
 def get_image_description(image):
-    get_description_prompt = "Describe the given image in detail in only 20 words max"
+    get_description_prompt = "Describe the given image in detail in only 20 words max."
 
     # Encode the image to base64
     base64_image = encode_image(image)
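The body of get_image_description between the prompt and the return statement is elided by this diff. For context, a request to a Together-hosted vision model typically looks like the sketch below; the model name, parameter names, and the helper name get_image_description_sketch are illustrative assumptions, not taken from app.py:

    import os
    from together import Together

    together_client = Together(api_key=os.environ["TOGETHER_API_KEY"])

    def get_image_description_sketch(base64_image: str, prompt: str) -> str:
        # Assumed model name; the model actually used by app.py is not shown in this diff.
        response = together_client.chat.completions.create(
            model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"},
                        },
                    ],
                }
            ],
        )
        # Mirrors the return statement visible in the next hunk.
        return response.choices[0].message.content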
@@ -49,6 +51,27 @@ def get_image_description(image):
     # Return the result from the API
     return response.choices[0].message.content
 
+# Function to convert text to speech using ElevenLabs
+def tts(text):
+    try:
+        # Generate audio using ElevenLabs
+        audio = elevenlabs_client.text_to_speech.convert(
+            text=text,
+            voice_id="JBFqnCBsd6RMkjVDRZzb",  # Replace with your preferred voice ID
+            model_id="eleven_multilingual_v2",
+            output_format="mp3_44100_128",
+        )
+
+        # Save the audio to a temporary file
+        audio_path = "temp_audio.mp3"
+        with open(audio_path, "wb") as f:
+            f.write(audio)
+
+        # Play the audio in Streamlit
+        st.audio(audio_path, format="audio/mp3")
+    except Exception as e:
+        st.error(f"Error generating speech: {e}")
+
 # Custom CSS for a futuristic look
 st.markdown(
     """
@@ -93,34 +116,10 @@ st.markdown(
     unsafe_allow_html=True,
 )
 
-def tts(text):
-    key = 'sk_8db078175c4eba3e2d4b500ffb167c46326e520bdf001f0e'
-
-
-    from elevenlabs.client import ElevenLabs
-    from elevenlabs import play
-
-
-
-    client = ElevenLabs(api_key=key)
-
-
-    audio = client.text_to_speech.convert(
-        text=text,
-        voice_id="JBFqnCBsd6RMkjVDRZzb",
-        model_id="eleven_multilingual_v2",
-        output_format="mp3_44100_128",
-    )
-
-
-    play(audio)
-
 # Streamlit app layout
 st.title("🔮 Visox | Koshur AI")
 st.markdown("### See the world through AI's eyes!")
 
-
-
 # Sidebar for additional info
 st.sidebar.markdown("## About")
 st.sidebar.markdown("This app uses advanced AI to describe what it sees through your camera in real-time.")
@@ -130,20 +129,22 @@ st.sidebar.markdown("Powered by [Together AI](https://together.ai) and Streamlit
 img_file_buffer = st.camera_input("Take a picture")
 
 if img_file_buffer is not None:
-    # Convert the image file buffer to a PIL Image
-    img = Image.open(img_file_buffer)
-
-    # Display the captured image
-    st.image(img, caption='Captured Image',width=300)
-
-    # Get and display the description
-    with st.spinner('🔍 Analyzing the image...'):
-        description = get_image_description(img)
-        st.success('✅ Analysis complete!')
-        st.markdown("### AI Description:")
-        st.write(description)
-
-        tts(description)
-
-
-
+    try:
+        # Convert the image file buffer to a PIL Image
+        img = Image.open(img_file_buffer)
+
+        # Display the captured image
+        st.image(img, caption='Captured Image', use_column_width=True)
+
+        # Get and display the description
+        with st.spinner('🔍 Analyzing the image...'):
+            description = get_image_description(img)
+            st.success('✅ Analysis complete!')
+            st.markdown("### AI Description:")
+            st.write(description)
+
+        # Convert description to speech and play it
+        if st.button("🔊 Read Aloud"):
+            tts(description)
+    except Exception as e:
+        st.error(f"An error occurred: {e}")