SameerArz committed
Commit 9f5c19b · verified · Parent: 6e809e4

Update app.py
Files changed (1): app.py (+120 -205)
app.py CHANGED
@@ -1,16 +1,13 @@
 #!/usr/bin/env python3
 import streamlit as st
 from gradio_client import Client
+from groq import Groq
 from PIL import Image
 import moviepy.editor as mp
 from natsort import natsorted
-from pydantic import BaseModel, Field
-from typing import List, Dict, Type, Optional, TypedDict
-from langgraph.graph import StateGraph, START, END
-from langchain_groq import ChatGroq
-from langchain_core.messages import SystemMessage
 import os
 from dotenv import load_dotenv
+import json
 
 # Load environment variables
 load_dotenv()
@@ -18,227 +15,145 @@ load_dotenv()
 # Constants
 HF_TOKEN = os.getenv("HF_TOKEN")
 GROQ_API_KEY = os.getenv("GROQ_API_KEY")
-IMAGE_GENERATION_SPACE_NAME = "stabilityai/stable-diffusion-3.5-large-turbo"  # Updated to correct Space
-SUPPORTED_FORMATS = ["mp3", "wav", "ogg", "flac", "aac", "m4a"]
+IMAGE_GENERATION_SPACE_NAME = "stabilityai/stable-diffusion-3.5-large-turbo"
 
-# Pydantic Models
-class SingleScene(BaseModel):
-    text: str = Field(description="Actual Segment of text(a scene) from the complete story")
-    image_prompts: List[str] = Field(
-        description="""List of detailed and descriptive image prompts for the segment
-        prompt format: [theme: {atmosphere/mood}] [style: {artistic/photorealistic}] [focus: {main subject}] [details: {specific elements}] [lighting: {day/night/mystic}] [perspective: {close-up/wide-angle}]
-        Example: "theme: eerie forest | style: cinematic realism | focus: abandoned cabin | details: broken windows, overgrown vines | lighting: moonlit fog | perspective: wide-angle shot"
-        """
-    )
+# Initialize Groq client
+groq_client = Groq(api_key=GROQ_API_KEY)
 
-class ScenesResponseSchema(BaseModel):
-    scenes: List[SingleScene]
-
-# Structured Output Extractor
-class State(TypedDict):
-    messages: list
-    output: Optional[BaseModel]
-
-class StructuredOutputExtractor:
-    def __init__(self, response_schema: Type[BaseModel]):
-        self.response_schema = response_schema
-        self.llm = ChatGroq(model="deepseek-r1-distill-llama-70b", api_key=GROQ_API_KEY)
-        self.structured_llm = self.llm.with_structured_output(response_schema)
-        self._build_graph()
-
-    def _build_graph(self):
-        graph_builder = StateGraph(State)
-        graph_builder.add_node("extract", self._extract_structured_info)
-        graph_builder.add_edge(START, "extract")
-        graph_builder.add_edge("extract", END)
-        self.graph = graph_builder.compile()
-
-    def _extract_structured_info(self, state: dict):
-        query = state['messages'][-1].content
-        try:
-            output = self.structured_llm.invoke(query)
-            return {"output": output}
-        except Exception as e:
-            st.error(f"Error during extraction: {e}")
-            return {"output": None}
-
-    def extract(self, query: str) -> Optional[BaseModel]:
-        result = self.graph.invoke({"messages": [SystemMessage(content=query)]})
-        return result.get('output')
+# LLM Models (free options)
+LLM_MODELS = {
+    "Mixtral 8x7B (Groq)": "mixtral-8x7b-32768",
+    "Mistral 7B (HF)": "mistralai/Mistral-7B-Instruct-v0.1",
+    "LLaMA 13B (HF)": "meta-llama/Llama-2-13b-hf"  # Note: may require access approval; replace if needed
+}
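+# Only the Groq entry is called natively below; the two HF entries go through the Inference API path
+# in generate_tutor_output and fall back to the Groq model if that call fails.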
 
 # Utility Functions
-def calculate_read_time(text: str, words_per_minute: int = 155) -> str:
-    try:
-        if not text or not isinstance(text, str):
-            return "Invalid input: Text must be a non-empty string."
-        words = text.split()
-        word_count = len(words)
-        total_seconds = (word_count / words_per_minute) * 60
-        hours = int(total_seconds // 3600)
-        minutes = int((total_seconds % 3600) // 60)
-        seconds = int(total_seconds % 60)
-        if hours > 0:
-            return f"Reading time: {hours} hour(s), {minutes} minute(s), and {seconds} second(s)."
-        elif minutes > 0:
-            return f"Reading time: {minutes} minute(s) and {seconds} second(s)."
-        else:
-            return f"Reading time: {seconds} second(s)."
-    except Exception as e:
-        return f"An error occurred: {e}"
-
-def get_scenes(text_script: str):
-    read_time = calculate_read_time(text_script)
+def generate_tutor_output(subject, difficulty, student_input, model):
     prompt = f"""
-    ROLE: Story to Scene Generator
-    Tasks: For the given story
-    1. Read it Completely and Understand the Complete Context
-    2. Rewrite the story in tiny scenes(but without even changing a word) with highly detailed and context aware list of image prompts to visualize each scene
-    3. Never Describe complete scene in a single image prompt use multiple prompts
-    RULE OF THUMB: 12 image prompts / 1 min audio
-
-    Estimated Read Time: {read_time}\n\n
-    Complete Story: {text_script}
+    You are an expert tutor in {subject} at the {difficulty} level.
+    The student has provided the following input: "{student_input}"
+
+    Please generate:
+    1. A brief, engaging lesson on the topic (2-3 paragraphs)
+    2. A thought-provoking question to check understanding
+    3. Constructive feedback on the student's input
+
+    Format your response as a JSON object with keys: "lesson", "question", "feedback"
     """
-    extractor = StructuredOutputExtractor(response_schema=ScenesResponseSchema)
-    result = extractor.extract(prompt)
-    return result.model_dump() if result else {}
-
-def generate_audio(text, language_code, speaker, path='test_audio.mp3'):
-    try:
-        client = Client("habib926653/Multilingual-TTS")
-        result = client.predict(
-            text=text,
-            language_code=language_code,
-            speaker=speaker,
-            api_name="/text_to_speech_edge"
+
+    if model.startswith("mixtral"):  # Groq model
+        completion = groq_client.chat.completions.create(
+            messages=[{
+                "role": "system",
+                "content": f"You are the world's best AI tutor for {subject}, renowned for clear, engaging explanations."
+            }, {
+                "role": "user",
+                "content": prompt
+            }],
+            model=model,
+            max_tokens=1000,
+            response_format={"type": "json_object"}  # request Groq's JSON mode so the reply stays parseable
         )
-        audio_file_path = result[1]
-        with open(audio_file_path, 'rb') as f:
-            audio_bytes = f.read()
-        with open(path, 'wb') as f:
-            f.write(audio_bytes)
-        return {"audio_file": path}
-    except Exception as e:
-        st.error(f"Error during audio generation: {e}")
-        return {"error": str(e)}
-
-def generate_image(prompt, path='test_image.png'):
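+        # The reply must be a bare JSON object for json.loads to succeed; the JSON-mode flag above
+        # makes that likely, but a noncompliant reply will still raise json.JSONDecodeError.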
+        return json.loads(completion.choices[0].message.content)
+    else:  # Hugging Face models
+        try:
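+            # Best-effort call: gradio_client is built for Gradio apps, so the raw Inference API URL
+            # may not expose a "/generate" route; any failure lands in the fallback below.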
+            client = Client("https://api-inference.huggingface.co/models/" + model, hf_token=HF_TOKEN)
+            response = client.predict(prompt, api_name="/generate")
+            return json.loads(response)
+        except Exception:
+            st.warning(f"HF model {model} failed, falling back to Mixtral.")
+            return generate_tutor_output(subject, difficulty, student_input, "mixtral-8x7b-32768")
+
+def generate_image(prompt, path='temp_image.png'):
     try:
-        client = Client("stabilityai/stable-diffusion-3.5-large-turbo", hf_token=HF_TOKEN)
+        client = Client(IMAGE_GENERATION_SPACE_NAME, hf_token=HF_TOKEN)
         result = client.predict(
-            prompt=prompt,  # Text prompt for image generation
-            width=1280,
-            height=720,
-            api_name="/generate_image"
+            prompt=prompt,
+            width=512,  # Reduced for speed
+            height=512,
+            api_name="/predict"  # Correct endpoint
         )
         image = Image.open(result)
         image.save(path)
-        return result
+        return path
     except Exception as e:
-        st.error(f"Error during image generation: {e}")
-        return {"error": str(e)}
-
-def generate_video_assets(scenes: Dict, language: str, speaker: str, base_path: str = "media") -> str:
-    try:
-        if not os.path.exists(base_path):
-            os.makedirs(base_path)
-        scenes_list = scenes.get("scenes", [])
-        video_folder = os.path.join(base_path, f"video_{len(os.listdir(base_path)) + 1}")
-        os.makedirs(video_folder, exist_ok=True)
-        images_folder = os.path.join(video_folder, "images")
-        audio_folder = os.path.join(video_folder, "audio")
-        os.makedirs(images_folder, exist_ok=True)
-        os.makedirs(audio_folder, exist_ok=True)
-
-        for scene_count, scene in enumerate(scenes_list):
-            text = scene.get("text", "")
-            image_prompts = scene.get("image_prompts", [])
-            audio_path = os.path.join(audio_folder, f"scene_{scene_count + 1}.mp3")
-            audio_result = generate_audio(text, language, speaker, path=audio_path)
-            if "error" in audio_result:
-                continue
-            scene_images_folder = os.path.join(images_folder, f"scene_{scene_count + 1}")
-            os.makedirs(scene_images_folder, exist_ok=True)
-            for count, prompt in enumerate(image_prompts):
-                image_path = os.path.join(scene_images_folder, f"scene_{scene_count + 1}_image_{count + 1}.png")
-                generate_image(prompt=prompt, path=image_path)
-
-        return video_folder
-    except Exception as e:
-        st.error(f"Error during video asset generation: {e}")
-        return ""
+        st.error(f"Error generating image: {e}")
+        return None
 
-def generate_video(video_folder: str, output_filename: str = "final_video.mp4"):
+def generate_video(images, audio_text, language, speaker, path='temp_video.mp4'):
     try:
-        audio_folder = os.path.join(video_folder, "audio")
-        images_folder = os.path.join(video_folder, "images")
-        final_clips = []
-        scene_folders = [
-            os.path.join(images_folder, scene)
-            for scene in natsorted(os.listdir(images_folder))
-            if os.path.isdir(os.path.join(images_folder, scene))
-        ]
-        for scene_path in scene_folders:
-            scene_name = os.path.basename(scene_path)
-            audio_path = os.path.join(audio_folder, f"{scene_name}.mp3")
-            if not os.path.exists(audio_path):
-                continue
-            image_files = natsorted([
-                os.path.join(scene_path, img)
-                for img in os.listdir(scene_path)
-                if img.lower().endswith(('.png', '.jpg', '.jpeg'))
-            ])
-            if not image_files:
-                continue
-            audio_clip = mp.AudioFileClip(audio_path)
-            duration_per_image = audio_clip.duration / len(image_files)
-            image_clips = [mp.ImageClip(img).set_duration(duration_per_image) for img in image_files]
-            scene_video = mp.concatenate_videoclips(image_clips, method="compose").set_audio(audio_clip)
-            final_clips.append(scene_video)
-        if not final_clips:
-            st.error("No valid scenes processed.")
-            return None
-        final_video = mp.concatenate_videoclips(final_clips, method="compose")
-        output_path = os.path.join(video_folder, output_filename)
-        final_video.write_videofile(output_path, fps=24, codec='libx264')
-        return output_path
+        audio_client = Client("habib926653/Multilingual-TTS")
+        audio_result = audio_client.predict(
+            text=audio_text,
+            language_code=language,
+            speaker=speaker,
+            api_name="/text_to_speech_edge"
+        )
+        audio_file = audio_result[1]
+        with open(audio_file, 'rb') as f:
+            audio_bytes = f.read()
+        audio_path = "temp_audio.mp3"
+        with open(audio_path, 'wb') as f:
+            f.write(audio_bytes)
+
+        valid_images = [img for img in images if img]  # drop failed generations before timing the slideshow
+        if not valid_images:
+            st.error("No images available for the video.")
+            return None
+        audio_clip = mp.AudioFileClip(audio_path)
+        duration_per_image = audio_clip.duration / len(valid_images)
+        image_clips = [mp.ImageClip(img).set_duration(duration_per_image) for img in valid_images]
+        video = mp.concatenate_videoclips(image_clips, method="compose").set_audio(audio_clip)
+        video.write_videofile(path, fps=24, codec='libx264')
+        return path
     except Exception as e:
-        st.error(f"Error during video generation: {e}")
+        st.error(f"Error generating video: {e}")
         return None
 
 # Streamlit App
 def main():
-    st.markdown("<h1 style='text-align: center;'>Text to Video Generator</h1>", unsafe_allow_html=True)
-    st.markdown("<p style='text-align: center;'>Leave a Like if it works for you! ❤️</p>", unsafe_allow_html=True)
-
-    text_script = st.text_area("Enter your script/story (max 1500 characters):", max_chars=1500)
-    language = st.selectbox("Choose Language:", ["Urdu", "English"])
-    client = Client("habib926653/Multilingual-TTS")
-    speakers_response = client.predict(language=language, api_name="/get_speakers")
-    speakers = [choice[0] for choice in speakers_response["choices"]]
-    selected_speaker = st.selectbox("Choose Speaker:", speakers)
-
-    if st.button("Generate Video"):
-        if text_script:
-            with st.spinner("Generating video... This may take a few minutes."):
-                scenes = get_scenes(text_script)
-                if not scenes:
-                    st.error("Failed to generate scenes.")
-                else:
-                    video_assets_folder = generate_video_assets(scenes, language, selected_speaker)
-                    if video_assets_folder:
-                        generated_video_path = generate_video(video_assets_folder)
-                        if generated_video_path:
-                            st.success("Video generated successfully!")
-                            st.video(generated_video_path)
+    st.markdown("<h1 style='text-align: center;'>EduAI: Your Interactive Tutor</h1>", unsafe_allow_html=True)
+    st.markdown("<p style='text-align: center;'>Learn, Ask, Visualize! ❤️</p>", unsafe_allow_html=True)
+
+    subject = st.selectbox("Choose Subject:", ["Math", "Science", "History", "Literature", "Code", "AI"])
+    difficulty = st.selectbox("Difficulty Level:", ["Beginner", "Intermediate", "Advanced"])
+    model = st.selectbox("Choose LLM Model:", list(LLM_MODELS.keys()))
+    student_input = st.text_area("Your Question/Input (max 1500 chars):", max_chars=1500)
+
+    if 'tutor_response' not in st.session_state:
+        st.session_state.tutor_response = None
+
+    if st.button("Generate Answer & Question"):
+        if student_input:
+            with st.spinner("Generating your lesson..."):
+                response = generate_tutor_output(subject, difficulty, student_input, LLM_MODELS[model])
+                st.session_state.tutor_response = response
         else:
-            st.warning("Please enter some text to generate a video.")
-
-    st.markdown("### 🔥 See How It Works (Example)")
-    example_script = """
-    One hot summer day, a thirsty crow was flying in search of water. He looked everywhere, but he couldn't find a single drop. Tired and exhausted, he finally spotted a clay pot with a little water at the bottom.
-    """
-    st.markdown(f"**Example Script:** {example_script}")
+            st.warning("Please provide an input!")
+
+    if st.session_state.tutor_response:
+        st.markdown("### Lesson")
+        st.write(st.session_state.tutor_response["lesson"])
+        st.markdown("### Comprehension Question")
+        st.write(st.session_state.tutor_response["question"])
+        st.markdown("### Feedback")
+        st.write(st.session_state.tutor_response["feedback"])
+
+        col1, col2 = st.columns(2)
+        with col1:
+            if st.button("Generate Image"):
+                with st.spinner("Creating image..."):
+                    image_path = generate_image(st.session_state.tutor_response["lesson"])
+                    if image_path:
+                        st.image(image_path, caption="Visual of your lesson")
+        with col2:
+            if st.button("Generate Video"):
+                with st.spinner("Creating video..."):
+                    audio_client = Client("habib926653/Multilingual-TTS")
+                    speakers_response = audio_client.predict(language="English", api_name="/get_speakers")
+                    speaker = speakers_response["choices"][0][0]
+                    images = [generate_image(st.session_state.tutor_response["lesson"])]
+                    video_path = generate_video(images, st.session_state.tutor_response["lesson"], "English", speaker)
+                    if video_path:
+                        st.video(video_path)
+
+    st.markdown("---")
+    st.markdown("<p style='text-align: center;'>Built for learning, powered by AI!</p>", unsafe_allow_html=True)
 
 if __name__ == "__main__":
     main()
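
For reference, generate_tutor_output is prompted to return one JSON object that main() indexes by key, so a successful reply looks roughly like this (a sketch: the key names come from the prompt above, the values are placeholders):

    {
        "lesson": "Two to three short paragraphs teaching the topic at the chosen difficulty...",
        "question": "One thought-provoking comprehension question about the lesson...",
        "feedback": "Constructive comments on the student's original input..."
    }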