Manimator / src /api /gemini.py
MostlyKIGuess's picture
updated model to 2.5 pro
c4a80a5
import re
from google import genai
from google.genai import types as genai_types
from dotenv import load_dotenv
import os
import pathlib
import logging
from pydantic import BaseModel
# Load variables from a local .env file into the process environment so that
# later os.getenv() lookups (e.g. GEMINI_API_KEY) can see them.
load_dotenv()

# Configure the root logger once at import time: INFO and above, formatted as
# "timestamp - level - message".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Structured-output schema handed to Gemini as `response_schema`; the parsed
# response is expected to be an instance of this model.
# NOTE: documented with comments rather than a class docstring on purpose,
# since pydantic would copy a docstring into the generated JSON schema.
class ManimOutput(BaseModel):
    # Generated Manim scene source code.
    manim_code: str
    # Voice-over text that accompanies the animation.
    narration: str
# System instruction sent with every Gemini request. It fixes the role, the
# 60-second timing budget, the core Manim constraints and the JSON output
# shape. Written as one literal per prompt line so individual rules are easy
# to diff; every line ends with an explicit "\n".
SYSTEM_PROMPT = (
    "You are an expert Manim programmer specializing in creating visually striking 60-second animations based on user prompts or documents, strictly following Manim Community v0.19.0 standards. Your output MUST be a JSON object conforming to the provided schema.\n"
    "CRITICAL TIMING REQUIREMENTS:\n"
    "- **Total Duration:** Exactly 60 seconds (1 minute)\n"
    "- **Narration:** Exactly 150-160 words (average speaking pace: 2.5 words per second)\n"
    "- **Animation Structure:** Use this timing framework:\n"
    "* Introduction: 8-10 seconds\n"
    "* Main content: 40-45 seconds (3-4 major segments)\n"
    "* Conclusion/summary: 7-10 seconds\n"
    "- **Synchronization:** Each narration sentence should correspond to 3-5 seconds of animation\n"
    "Core Requirements:\n"
    "- **API Version:** Use only Manim Community v0.19.0 API\n"
    "- **Vectors & Math:** Use 3D vectors (np.array([x, y, 0])) and ensure correct math operations\n"
    # Backslashes are doubled so the model sees a valid raw-string LaTeX matrix.
    "- **Matrix Visualization:** Use MathTex for matrices: r'\\begin{bmatrix} a & b \\\\ c & d \\end{bmatrix}'\n"
    "- **Star Usage:** Use Star(n=5, ...) not n_points\n"
    "- **Error Prevention:** Always validate Scene class exists; avoid 3D scenes\n"
    "- **Visual Style:** Create vibrant, dynamic animations with smooth transitions\n"
    '- **Output Format:** JSON with "manim_code" and "narration" keys\n'
)
# Detailed instructions appended to every user prompt (PDF- or idea-based).
# Each literal below is one line of the final prompt and starts with "\n".
#
# Fixes relative to the previous revision:
#   * the QA word-count check now matches the 150-160 word requirement stated
#     in the narration constraints (it used to say 120-150);
#   * item numbers are unique and sequential (10 used to appear twice);
#   * the code template is indented like real Python so the model is shown a
#     valid class/method nesting.
# NOTE(review): the per-section word counts under NARRATION STRUCTURE still sum
# to fewer than 150 words; confirm the intended split before changing them.
base_prompt_instructions = (
    "\nSTRICT TIMING REQUIREMENTS:"
    "\n1. **Video Duration:** Exactly 60 seconds total"
    "\n2. **Narration Constraints:**"
    "\n - Exactly 150-160 words (no more, no less)"
    "\n - Speaking pace: 2.5 words per second"
    "\n - Use short, clear sentences (8-12 words each)"
    "\n - Include natural pauses between major concepts"
    "\n3. **Animation Timing Structure:**"
    "\n - Use self.wait() to match narration pauses"
    "\n - run_time in self.play() should match sentence duration"
    "\n - Fade out elements after 3-5 seconds to avoid clutter"
    "\n - Example timing: self.play(Create(obj), run_time=3), self.wait(1)"
    "\nTECHNICAL REQUIREMENTS:"
    "\n4. Use only Manim Community v0.19.0 API"
    "\n5. Vector operations (3D vectors): np.array([x, y, 0])"
    # Backslashes are doubled so the model sees a valid raw-string LaTeX matrix.
    "\n6. Matrix display: MathTex(r'\\begin{bmatrix} a & b \\\\ c & d \\end{bmatrix}')"
    "\n7. Verified methods only: Create(), Write(), Transform(), FadeIn(), FadeOut(), "
    "\n Add(), Remove(), MoveAlongPath(), Rotating(), Circumscribe(), Indicate(), "
    "\n FocusOn(), Shift(), Scale(), MoveTo(), NextTo(), Axes(), Plot(), LineGraph(), "
    "\n BarChart(), Dot(), Line(), Arrow(), Text(), Tex(), MathTex(), VGroup()"
    "\n8. Star shapes: Star(n=5, ...) not n_points"
    "\n9. NO image imports or 3D scenes"
    "\n10. There is no .to_center() method so please don't use that"
    "\nVISUAL & CONTENT GUIDELINES:"
    "\n11. Create 4-5 distinct visual segments matching narration flow"
    "\n12. Use vibrant colors and smooth transitions"
    "\n13. Fade out text/objects when no longer needed"
    "\n14. Include interactive elements: arrows, labels, highlights"
    "\n15. Validate all objects before animation calls"
    "\n16. Use longer run_times (4-6s) for complex animations, shorter (2-3s) for simple ones"
    "\nCODE STRUCTURE TEMPLATE:"
    "\n17. Always follow this timing pattern:"
    "\n ```python"
    "\n class VideoScene(Scene):"
    "\n     def construct(self):"
    "\n         # Intro (8-10s): Title + brief setup"
    "\n         title = Text('Title')"
    "\n         self.play(Write(title), run_time=3)"
    "\n         self.wait(2) # Pause for narration"
    "\n         self.play(FadeOut(title), run_time=2)"
    "\n "
    "\n         # Main content (40-45s): 3-4 segments"
    "\n         # Segment 1 (10-12s)"
    "\n         # Segment 2 (10-12s) "
    "\n         # Segment 3 (10-12s)"
    "\n         # Segment 4 (8-10s)"
    "\n "
    "\n         # Conclusion (7-10s): Summary + fade out"
    "\n ```"
    "\nNARRATION STRUCTURE:"
    "\n18. Follow this word count breakdown:"
    "\n - Introduction: 15-25 words (8-10 seconds)"
    "\n - Main content: 70-85 words (36-40 seconds)"
    "\n - Conclusion: 20-25 words (8-10 seconds)"
    "\n - Natural pauses: 3-5 seconds total"
    "\n19. Use active voice, present tense"
    "\n20. Include transition phrases: 'Now let's see...', 'Next, we'll explore...'"
    "\n21. End with a strong concluding statement"
    "\nQUALITY ASSURANCE:"
    "\n22. Count words in narration before finalizing (must be 150-160)"
    "\n23. Calculate total animation time (self.play + self.wait = 60s)"
    "\n24. Ensure Scene class exists and imports are correct"
    "\n25. Test that all animation objects are valid before use"
    "\n26. No broadcasting errors in vector operations"
    "\n27. Distinct start/end points for arrows to prevent normalization errors"
)
def load_manim_examples():
    """Return the text of ``guide.md`` located next to this module.

    Returns:
        The file's contents decoded as UTF-8, or an empty string (after
        logging a warning) when the file does not exist.
    """
    examples_file = pathlib.Path(__file__).parent.joinpath("guide.md")
    if examples_file.exists():
        logging.info(f"Loading Manim examples from {examples_file}")
        return examples_file.read_text(encoding="utf-8")

    # Missing guide is not fatal: callers treat "" as "no examples available".
    logging.warning(f"Manim examples guide not found at {examples_file}")
    return ""
def generate_video(idea: str | None = None, pdf_path: str | None = None):
api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
logging.error("GEMINI_API_KEY not found in environment variables")
raise Exception("GEMINI_API_KEY not found in environment variables")
if not idea and not pdf_path:
raise ValueError("Either an idea or a pdf_path must be provided.")
if idea and pdf_path:
logging.warning("Both idea and pdf_path provided. Using pdf_path.")
idea = None
client = genai.Client(api_key=api_key)
contents = []
manim_examples = load_manim_examples()
if manim_examples:
examples_prompt = (
"Below are examples of Manim code that demonstrate proper usage patterns. Use these as reference when generating your animation:\n\n"
+ manim_examples
)
contents.append(examples_prompt)
logging.info("Added Manim examples from guide.md to prime the model")
else:
logging.warning("No Manim examples were loaded from guide.md")
user_prompt_text = ""
if pdf_path:
pdf_file_path = pathlib.Path(pdf_path)
if not pdf_file_path.exists():
logging.error(f"PDF file not found at: {pdf_path}")
raise FileNotFoundError(f"PDF file not found at: {pdf_path}")
logging.info(f"Reading PDF: {pdf_path}")
pdf_data = pdf_file_path.read_bytes()
pdf_part = genai_types.Part.from_bytes(
data=pdf_data, mime_type="application/pdf"
)
contents.append(pdf_part)
user_prompt_text = f"Create a 30-second Manim video script summarizing the key points or illustrating a core concept from the provided PDF document. {base_prompt_instructions}"
contents.append(user_prompt_text)
elif idea:
logging.info(f"Generating video based on idea: {idea[:50]}...")
user_prompt_text = f"Create a 30-second Manim video script about '{idea}'. {base_prompt_instructions}"
contents.append(user_prompt_text)
logging.info("Sending request to Gemini API...")
try:
generation_config = genai_types.GenerateContentConfig(
response_mime_type="application/json",
response_schema=ManimOutput,
system_instruction=SYSTEM_PROMPT,
)
response = client.models.generate_content(
model="gemini-2.5-pro", contents=contents, config=generation_config
)
except Exception as e:
logging.exception(f"Error calling Gemini API: {e}")
raise Exception(f"Error calling Gemini API: {e}")
if response:
try:
parsed_output = response.parsed
if not parsed_output or not isinstance(parsed_output, ManimOutput):
logging.error("Failed to parse structured output from Gemini.")
raise Exception("Failed to parse structured output from Gemini.")
manim_code = parsed_output.manim_code
narration = parsed_output.narration
logging.info("Successfully parsed structured output from Gemini.")
if "from manim import *" not in manim_code:
logging.warning("Adding missing 'from manim import *'.")
manim_code = "from manim import *\nimport numpy as np\n" + manim_code
elif "import numpy as np" not in manim_code:
logging.warning("Adding missing 'import numpy as np'.")
lines = manim_code.splitlines()
for i, line in enumerate(lines):
if "from manim import *" in line:
lines.insert(i + 1, "import numpy as np")
manim_code = "\n".join(lines)
break
return {"manim_code": manim_code, "output_file": "output.mp4"}, narration
except (ValueError, AttributeError) as e:
logging.warning(
f"Could not parse the response. Error: {e}. Response details:"
)
logging.warning(response)
if response.prompt_feedback and response.prompt_feedback.block_reason:
logging.error(
f"Content generation blocked. Reason: {response.prompt_feedback.block_reason.name}"
)
raise Exception(
f"Content generation blocked. Reason: {response.prompt_feedback.block_reason.name}"
)
else:
logging.error(
"Failed to generate content. The response was empty or malformed."
)
raise Exception(
"Failed to generate content. The response was empty or malformed."
)
else:
logging.error(
"Error generating video content. No response received from Gemini."
)
raise Exception("Error generating video content. No response received.")