|
import re |
|
from google import genai |
|
from google.genai import types as genai_types |
|
from dotenv import load_dotenv |
|
import os |
|
import pathlib |
|
import logging |
|
from pydantic import BaseModel |
|
|
|
load_dotenv() |
|
|
|
logging.basicConfig( |
|
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" |
|
) |
|
|
|
|
|
class ManimOutput(BaseModel): |
|
manim_code: str |
|
narration: str |
|
|
|
|
|
SYSTEM_PROMPT = """You are an expert Manim programmer specializing in creating visually striking 60-second animations based on user prompts or documents, strictly following Manim Community v0.19.0 standards. Your output MUST be a JSON object conforming to the provided schema. |
|
|
|
CRITICAL TIMING REQUIREMENTS: |
|
- **Total Duration:** Exactly 60 seconds (1 minute) |
|
- **Narration:** Exactly 150-160 words (average speaking pace: 2.5 words per second) |
|
- **Animation Structure:** Use this timing framework: |
|
* Introduction: 8-10 seconds |
|
* Main content: 40-45 seconds (3-4 major segments) |
|
* Conclusion/summary: 7-10 seconds |
|
- **Synchronization:** Each narration sentence should correspond to 3-5 seconds of animation |
|
|
|
Core Requirements: |
|
- **API Version:** Use only Manim Community v0.19.0 API |
|
- **Vectors & Math:** Use 3D vectors (np.array([x, y, 0])) and ensure correct math operations |
|
- **Matrix Visualization:** Use MathTex for matrices: r'\\begin{bmatrix} a & b \\\\ c & d \\end{bmatrix}' |
|
- **Star Usage:** Use Star(n=5, ...) not n_points |
|
- **Error Prevention:** Always validate Scene class exists; avoid 3D scenes |
|
- **Visual Style:** Create vibrant, dynamic animations with smooth transitions |
|
- **Output Format:** JSON with "manim_code" and "narration" keys |
|
""" |
|
|
|
base_prompt_instructions = ( |
|
"\nSTRICT TIMING REQUIREMENTS:" |
|
"\n1. **Video Duration:** Exactly 60 seconds total" |
|
"\n2. **Narration Constraints:**" |
|
"\n - Exactly 150-160 words (no more, no less)" |
|
"\n - Speaking pace: 2.5 words per second" |
|
"\n - Use short, clear sentences (8-12 words each)" |
|
"\n - Include natural pauses between major concepts" |
|
"\n3. **Animation Timing Structure:**" |
|
"\n - Use self.wait() to match narration pauses" |
|
"\n - run_time in self.play() should match sentence duration" |
|
"\n - Fade out elements after 3-5 seconds to avoid clutter" |
|
"\n - Example timing: self.play(Create(obj), run_time=3), self.wait(1)" |
|
"\nTECHNICAL REQUIREMENTS:" |
|
"\n4. Use only Manim Community v0.19.0 API" |
|
"\n5. Vector operations (3D vectors): np.array([x, y, 0])" |
|
"\n6. Matrix display: MathTex(r'\\begin{bmatrix} a & b \\\\ c & d \\end{bmatrix}')" |
|
"\n7. Verified methods only: Create(), Write(), Transform(), FadeIn(), FadeOut(), " |
|
"\n Add(), Remove(), MoveAlongPath(), Rotating(), Circumscribe(), Indicate(), " |
|
"\n FocusOn(), Shift(), Scale(), MoveTo(), NextTo(), Axes(), Plot(), LineGraph(), " |
|
"\n BarChart(), Dot(), Line(), Arrow(), Text(), Tex(), MathTex(), VGroup()" |
|
"\n8. Star shapes: Star(n=5, ...) not n_points" |
|
"\n9. NO image imports or 3D scenes" |
|
"\n10. There is no .to_center() method so please don't use that" |
|
"\nVISUAL & CONTENT GUIDELINES:" |
|
"\n10. Create 4-5 distinct visual segments matching narration flow" |
|
"\n11. Use vibrant colors and smooth transitions" |
|
"\n12. Fade out text/objects when no longer needed" |
|
"\n13. Include interactive elements: arrows, labels, highlights" |
|
"\n14. Validate all objects before animation calls" |
|
"\n15. Use longer run_times (4-6s) for complex animations, shorter (2-3s) for simple ones" |
|
"\nCODE STRUCTURE TEMPLATE:" |
|
"\n16. Always follow this timing pattern:" |
|
"\n ```python" |
|
"\n class VideoScene(Scene):" |
|
"\n def construct(self):" |
|
"\n # Intro (8-10s): Title + brief setup" |
|
"\n title = Text('Title')" |
|
"\n self.play(Write(title), run_time=3)" |
|
"\n self.wait(2) # Pause for narration" |
|
"\n self.play(FadeOut(title), run_time=2)" |
|
"\n " |
|
"\n # Main content (40-45s): 3-4 segments" |
|
"\n # Segment 1 (10-12s)" |
|
"\n # Segment 2 (10-12s) " |
|
"\n # Segment 3 (10-12s)" |
|
"\n # Segment 4 (8-10s)" |
|
"\n " |
|
"\n # Conclusion (7-10s): Summary + fade out" |
|
"\n ```" |
|
"\nNARRATION STRUCTURE:" |
|
"\n17. Follow this word count breakdown:" |
|
"\n - Introduction: 15-25 words (8-10 seconds)" |
|
"\n - Main content: 70-85 words (36-40 seconds)" |
|
"\n - Conclusion: 20-25 words (8-10 seconds)" |
|
"\n - Natural pauses: 3-5 seconds total" |
|
"\n18. Use active voice, present tense" |
|
"\n19. Include transition phrases: 'Now let's see...', 'Next, we'll explore...'" |
|
"\n20. End with a strong concluding statement" |
|
"\nQUALITY ASSURANCE:" |
|
"\n21. Count words in narration before finalizing (must be 120-150)" |
|
"\n22. Calculate total animation time (self.play + self.wait = 60s)" |
|
"\n23. Ensure Scene class exists and imports are correct" |
|
"\n24. Test that all animation objects are valid before use" |
|
"\n25. No broadcasting errors in vector operations" |
|
"\n26. Distinct start/end points for arrows to prevent normalization errors" |
|
) |
|
|
|
|
|
def load_manim_examples(): |
|
guide_path = pathlib.Path(__file__).parent / "guide.md" |
|
if not guide_path.exists(): |
|
logging.warning(f"Manim examples guide not found at {guide_path}") |
|
return "" |
|
logging.info(f"Loading Manim examples from {guide_path}") |
|
return guide_path.read_text(encoding="utf-8") |
|
|
|
|
|
def generate_video(idea: str | None = None, pdf_path: str | None = None): |
|
api_key = os.getenv("GEMINI_API_KEY") |
|
if not api_key: |
|
logging.error("GEMINI_API_KEY not found in environment variables") |
|
raise Exception("GEMINI_API_KEY not found in environment variables") |
|
if not idea and not pdf_path: |
|
raise ValueError("Either an idea or a pdf_path must be provided.") |
|
if idea and pdf_path: |
|
logging.warning("Both idea and pdf_path provided. Using pdf_path.") |
|
idea = None |
|
|
|
client = genai.Client(api_key=api_key) |
|
contents = [] |
|
|
|
manim_examples = load_manim_examples() |
|
if manim_examples: |
|
examples_prompt = ( |
|
"Below are examples of Manim code that demonstrate proper usage patterns. Use these as reference when generating your animation:\n\n" |
|
+ manim_examples |
|
) |
|
contents.append(examples_prompt) |
|
logging.info("Added Manim examples from guide.md to prime the model") |
|
else: |
|
logging.warning("No Manim examples were loaded from guide.md") |
|
|
|
user_prompt_text = "" |
|
|
|
if pdf_path: |
|
pdf_file_path = pathlib.Path(pdf_path) |
|
if not pdf_file_path.exists(): |
|
logging.error(f"PDF file not found at: {pdf_path}") |
|
raise FileNotFoundError(f"PDF file not found at: {pdf_path}") |
|
|
|
logging.info(f"Reading PDF: {pdf_path}") |
|
pdf_data = pdf_file_path.read_bytes() |
|
pdf_part = genai_types.Part.from_bytes( |
|
data=pdf_data, mime_type="application/pdf" |
|
) |
|
contents.append(pdf_part) |
|
|
|
user_prompt_text = f"Create a 30-second Manim video script summarizing the key points or illustrating a core concept from the provided PDF document. {base_prompt_instructions}" |
|
contents.append(user_prompt_text) |
|
|
|
elif idea: |
|
logging.info(f"Generating video based on idea: {idea[:50]}...") |
|
user_prompt_text = f"Create a 30-second Manim video script about '{idea}'. {base_prompt_instructions}" |
|
contents.append(user_prompt_text) |
|
|
|
logging.info("Sending request to Gemini API...") |
|
try: |
|
generation_config = genai_types.GenerateContentConfig( |
|
response_mime_type="application/json", |
|
response_schema=ManimOutput, |
|
system_instruction=SYSTEM_PROMPT, |
|
) |
|
|
|
response = client.models.generate_content( |
|
model="gemini-2.5-pro", contents=contents, config=generation_config |
|
) |
|
except Exception as e: |
|
logging.exception(f"Error calling Gemini API: {e}") |
|
raise Exception(f"Error calling Gemini API: {e}") |
|
|
|
if response: |
|
try: |
|
parsed_output = response.parsed |
|
if not parsed_output or not isinstance(parsed_output, ManimOutput): |
|
logging.error("Failed to parse structured output from Gemini.") |
|
raise Exception("Failed to parse structured output from Gemini.") |
|
|
|
manim_code = parsed_output.manim_code |
|
narration = parsed_output.narration |
|
logging.info("Successfully parsed structured output from Gemini.") |
|
|
|
if "from manim import *" not in manim_code: |
|
logging.warning("Adding missing 'from manim import *'.") |
|
manim_code = "from manim import *\nimport numpy as np\n" + manim_code |
|
elif "import numpy as np" not in manim_code: |
|
logging.warning("Adding missing 'import numpy as np'.") |
|
lines = manim_code.splitlines() |
|
for i, line in enumerate(lines): |
|
if "from manim import *" in line: |
|
lines.insert(i + 1, "import numpy as np") |
|
manim_code = "\n".join(lines) |
|
break |
|
|
|
return {"manim_code": manim_code, "output_file": "output.mp4"}, narration |
|
except (ValueError, AttributeError) as e: |
|
logging.warning( |
|
f"Could not parse the response. Error: {e}. Response details:" |
|
) |
|
logging.warning(response) |
|
if response.prompt_feedback and response.prompt_feedback.block_reason: |
|
logging.error( |
|
f"Content generation blocked. Reason: {response.prompt_feedback.block_reason.name}" |
|
) |
|
raise Exception( |
|
f"Content generation blocked. Reason: {response.prompt_feedback.block_reason.name}" |
|
) |
|
else: |
|
logging.error( |
|
"Failed to generate content. The response was empty or malformed." |
|
) |
|
raise Exception( |
|
"Failed to generate content. The response was empty or malformed." |
|
) |
|
else: |
|
logging.error( |
|
"Error generating video content. No response received from Gemini." |
|
) |
|
raise Exception("Error generating video content. No response received.") |
|
|