dfdfdsfgs committed on
Commit d9486d1 · 1 Parent(s): 929083d

Upload project files

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .env.template +33 -0
  2. .github/ISSUE_TEMPLATE/bug_report.md +32 -0
  3. .github/ISSUE_TEMPLATE/feature_request.md +20 -0
  4. .gitignore +3 -0
  5. .specstory/history/.what-is-this.md +65 -0
  6. Dockerfile +39 -0
  7. LICENSE +21 -0
  8. README.md +359 -14
  9. app.py +164 -4
  10. data/thb_easy/chemistry.json +142 -0
  11. data/thb_easy/comp_sci.json +142 -0
  12. data/thb_easy/math.json +142 -0
  13. data/thb_easy/physics.json +142 -0
  14. data/thb_hard/chemistry.json +142 -0
  15. data/thb_hard/comp_sci.json +142 -0
  16. data/thb_hard/math.json +142 -0
  17. data/thb_hard/physics.json +142 -0
  18. data/thb_medium/chemistry.json +142 -0
  19. data/thb_medium/comp_sci.json +142 -0
  20. data/thb_medium/math.json +142 -0
  21. data/thb_medium/physics.json +142 -0
  22. eval_suite/__init__.py +0 -0
  23. eval_suite/image_utils.py +104 -0
  24. eval_suite/parse_prompt.py +54 -0
  25. eval_suite/prompts_raw/__init__.py +145 -0
  26. eval_suite/prompts_raw/fix_transcript.txt +8 -0
  27. eval_suite/prompts_raw/image_eval.txt +45 -0
  28. eval_suite/prompts_raw/text_eval_new.txt +47 -0
  29. eval_suite/prompts_raw/video_eval_new.txt +37 -0
  30. eval_suite/text_utils.py +80 -0
  31. eval_suite/utils.py +81 -0
  32. eval_suite/video_utils.py +167 -0
  33. evaluate.py +474 -0
  34. generate_video.py +954 -0
  35. mllm_tools/__init__.py +1 -0
  36. mllm_tools/gemini.py +176 -0
  37. mllm_tools/litellm.py +193 -0
  38. mllm_tools/utils.py +174 -0
  39. mllm_tools/vertex_ai.py +86 -0
  40. requirements.txt +101 -0
  41. src/__init__.py +1 -0
  42. src/config/__init__.py +0 -0
  43. src/config/config.py +20 -0
  44. src/core/__init__.py +0 -0
  45. src/core/code_generator.py +454 -0
  46. src/core/parse_video.py +227 -0
  47. src/core/video_planner.py +417 -0
  48. src/core/video_renderer.py +448 -0
  49. src/rag/__init__.py +0 -0
  50. src/rag/rag_integration.py +390 -0
.env.template ADDED
@@ -0,0 +1,33 @@
+ # OpenAI
+ OPENAI_API_KEY=""
+
+ # Azure OpenAI
+ AZURE_API_KEY=""
+ AZURE_API_BASE=""
+ AZURE_API_VERSION=""
+
+ # Google Vertex AI
+ VERTEXAI_PROJECT=""
+ VERTEXAI_LOCATION=""
+ GOOGLE_APPLICATION_CREDENTIALS=""
+
+ # Google Gemini
+ GEMINI_API_KEY=""
+
+ # AWS Bedrock / S3
+ AWS_ACCESS_KEY_ID=""
+ AWS_SECRET_ACCESS_KEY=""
+ AWS_REGION_NAME=""
+ AWS_S3_BUCKET=""
+
+ # Langfuse
+ LANGFUSE_PUBLIC_KEY=""
+ LANGFUSE_SECRET_KEY=""
+ LANGFUSE_HOST=""
+
+ # Kokoro TTS Settings
+ KOKORO_MODEL_PATH="models/kokoro-v0_19.onnx"
+ KOKORO_VOICES_PATH="models/voices.bin"
+ KOKORO_DEFAULT_VOICE="af"
+ KOKORO_DEFAULT_SPEED="1.0"
+ KOKORO_DEFAULT_LANG="en-us"
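These variables are read from the process environment at runtime. As a minimal sketch of how they can be loaded for local experiments (assuming `python-dotenv` is available; the project's actual loading mechanism may differ):

```python
# Illustrative only: load the .env file into the process environment.
import os
from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv()  # reads .env from the current working directory
print(os.getenv("KOKORO_MODEL_PATH"))  # e.g. "models/kokoro-v0_19.onnx"
```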
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,32 @@
+ ---
+ name: Bug report
+ about: Create a report to help us improve
+ title: ''
+ labels: ''
+ assignees: ''
+
+ ---
+
+ **Describe the bug**
+ A clear and concise description of what the bug is.
+
+ **To Reproduce**
+ Steps to reproduce the behavior:
+ 1. Go to '...'
+ 2. Click on '....'
+ 3. Scroll down to '....'
+ 4. See error
+
+ **Expected behavior**
+ A clear and concise description of what you expected to happen.
+
+ **Screenshots**
+ If applicable, add screenshots to help explain your problem.
+
+ **Desktop (please complete the following information):**
+ - OS: [e.g. iOS]
+ - Browser [e.g. chrome, safari]
+ - Version [e.g. 22]
+
+ **Additional context**
+ Add any other context about the problem here.
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,20 @@
+ ---
+ name: Feature request
+ about: Suggest an idea for this project
+ title: ''
+ labels: ''
+ assignees: ''
+
+ ---
+
+ **Is your feature request related to a problem? Please describe.**
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+ **Describe the solution you'd like**
+ A clear and concise description of what you want to happen.
+
+ **Describe alternatives you've considered**
+ A clear and concise description of any alternative solutions or features you've considered.
+
+ **Additional context**
+ Add any other context or screenshots about the feature request here.
.gitignore ADDED
@@ -0,0 +1,3 @@
+ **/__pycache__/
+
+ .env
.specstory/history/.what-is-this.md ADDED
@@ -0,0 +1,65 @@
+
+ # SpecStory Artifacts Directory
+
+ This directory is automatically created and maintained by the SpecStory extension to preserve your Cursor composer and chat history.
+
+ ## What's Here?
+
+ - `.specstory/history`: Contains markdown files of your AI coding sessions
+     - Each file represents a separate chat or composer session
+     - Files are automatically updated as you work
+ - `.specstory/cursor_rules_backups`: Contains backups of the `.cursor/rules/derived-cursor-rules.mdc` file
+     - Backups are automatically created each time the `.cursor/rules/derived-cursor-rules.mdc` file is updated
+     - You can enable/disable the Cursor Rules feature in the SpecStory settings
+
+ ## Valuable Uses
+
+ - Capture: Keep your context window up-to-date when starting new Chat/Composer sessions via @ references
+ - Search: For previous prompts and code snippets
+ - Learn: Meta-analyze your patterns and learn from your past experiences
+ - Derive: Keep Cursor on course with your past decisions by automatically deriving Cursor rules from your AI interactions
+
+ ## Version Control
+
+ We recommend keeping this directory under version control to maintain a history of your AI interactions. However, if you prefer not to version these files, you can exclude them by adding this to your `.gitignore`:
+
+ ```
+ .specstory
+ ```
+
+ We recommend not keeping the `.specstory/cursor_rules_backups` directory under version control if you are already using git to version the `.cursor/rules` directory, and committing regularly. You can exclude it by adding this to your `.gitignore`:
+
+ ```
+ .specstory/cursor_rules_backups
+ ```
+
+ ## Searching Your Codebase
+
+ When searching your codebase in Cursor, search results may include your previous AI coding interactions. To focus solely on your actual code files, you can exclude the AI interaction history from search results.
+
+ To exclude AI interaction history:
+
+ 1. Open the "Find in Files" search in Cursor (Cmd/Ctrl + Shift + F)
+ 2. Navigate to the "files to exclude" section
+ 3. Add the following pattern:
+
+ ```
+ .specstory/*
+ ```
+
+ This will ensure your searches only return results from your working codebase files.
+
+ ## Notes
+
+ - Auto-save only works when Cursor/sqlite flushes data to disk. This results in a small delay after the AI response is complete before SpecStory can save the history.
+ - Auto-save does not yet work on remote WSL workspaces.
+
+ ## Settings
+
+ You can control auto-saving behavior in Cursor:
+
+ 1. Open Cursor → Settings → VS Code Settings (Cmd/Ctrl + ,)
+ 2. Search for "SpecStory"
+ 3. Find "Auto Save" setting to enable/disable
+
+ Auto-save occurs when changes are detected in Cursor's sqlite database, or every 2 minutes as a safety net.
Dockerfile ADDED
@@ -0,0 +1,39 @@
+ # Start with a Python base image
+ FROM python:3.11-slim
+
+ # Set working directory
+ WORKDIR /app
+
+ # Install system dependencies for Manim
+ # This is a large installation and will take time
+ # wget is required for the model download step below (not present in the slim base image)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     ffmpeg \
+     texlive-full \
+     pango1.0-tools \
+     libcairo2-dev \
+     libjpeg-dev \
+     libgif-dev \
+     libpango1.0-dev \
+     libsdl-pango-dev \
+     portaudio19-dev \
+     wget \
+     git \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Copy the entire project into the container
+ COPY . .
+
+ # Install Python requirements
+ # Manim is included in requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Download Kokoro TTS models during the build process
+ RUN mkdir -p models && \
+     wget -P models https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx && \
+     wget -P models https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin
+
+ # Expose the port the API will run on (e.g., 7860 for Gradio/FastAPI)
+ EXPOSE 7860
+
+ # Command to run the application
+ # We will use Gradio to create the UI endpoint
+ CMD ["python", "app.py"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 TIGER Lab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,14 +1,359 @@
- ---
- title: TheoremExplainAgent
- emoji: 😻
- colorFrom: pink
- colorTo: red
- sdk: gradio
- sdk_version: 5.33.2
- app_file: app.py
- pinned: false
- license: mit
- short_description: TheoremExplainAgent
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # TheoremExplainAgent (TEA) 🍵
+ [![arXiv](https://img.shields.io/badge/arXiv-2502.19400-b31b1b.svg)](https://arxiv.org/abs/2502.19400)
+ <a href='https://huggingface.co/papers/2502.19400'><img src='https://img.shields.io/static/v1?label=Paper&message=Huggingface&color=orange'></a>
+
+ [**🌐 Homepage**](https://tiger-ai-lab.github.io/TheoremExplainAgent/) | [**📖 arXiv**](https://arxiv.org/abs/2502.19400) | [**🤗 HuggingFace Dataset**](https://huggingface.co/datasets/TIGER-Lab/TheoremExplainBench) | [🎥 Video Data](https://drive.google.com/file/d/18kmzXvbxaFGyJw-g51jnq9m93v_ez4aJ/view)
+
+ [![contributors](https://img.shields.io/github/contributors/TIGER-AI-Lab/TheoremExplainAgent)](https://github.com/TIGER-AI-Lab/TheoremExplainAgent/graphs/contributors)
+ [![license](https://img.shields.io/github/license/TIGER-AI-Lab/TheoremExplainAgent.svg)](https://github.com/TIGER-AI-Lab/TheoremExplainAgent/blob/main/LICENSE)
+ [![GitHub](https://img.shields.io/github/stars/TIGER-AI-Lab/TheoremExplainAgent?style=social)](https://github.com/TIGER-AI-Lab/TheoremExplainAgent)
+ [![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2FTIGER-AI-Lab%2FTheoremExplainAgent&count_bg=%23C83DB9&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=visitors&edge_flat=false)](https://hits.seeyoufarm.com)
+
+ This repo contains the codebase for our paper [TheoremExplainAgent: Towards Multimodal Explanations for LLM Theorem Understanding](https://arxiv.org/abs/2502.19400).
+
+ **ACL 2025 main**
+
+ ## Introduction
+ TheoremExplainAgent is an AI system that generates long-form Manim videos to visually explain theorems, proving its deep understanding while uncovering reasoning flaws that text alone often hides.
+
+
+
+ https://github.com/user-attachments/assets/17f2f4f2-8f2c-4abc-b377-ac92ebda69f3
+
+
+ ## 📰 News
+ * 2025 Jun 8: We released our generated video data for researchers to use as baselines.
+ * 2025 May 15: Paper accepted to the ACL 2025 main conference.
+ * 2025 Mar 3: Generation code and evaluation code released. Thanks for the wait!
+ <!--* 2025 Mar 3: Reach 404 stars without code.-->
+ * 2025 Feb 27: Paper available on [arXiv](https://arxiv.org/abs/2502.19400). Thanks AK for putting our paper on [HF Daily](https://huggingface.co/papers/2502.19400).
+
+ ## Downloading Generated Video Data
+ Skip this section if you just want to try out the code.
+ If you are a researcher who just needs the generated videos as a baseline for comparison, download them here:
+ ```shell
+ wget --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=18kmzXvbxaFGyJw-g51jnq9m93v_ez4aJ' -O /tmp/gdrive.html && wget --load-cookies /tmp/cookies.txt -O baseline_videos.zip "https://drive.usercontent.google.com/download?id=18kmzXvbxaFGyJw-g51jnq9m93v_ez4aJ&export=download&confirm=$(sed -rn 's/.*name="confirm" value="([^"]+)".*/\\1/p' /tmp/gdrive.html)&uuid=$(sed -rn 's/.*name="uuid" value="([^"]+)".*/\\1/p' /tmp/gdrive.html)" && rm /tmp/gdrive.html /tmp/cookies.txt
+ ```
+
+ ## Installation
+
+ > **Look at the [FAQ section in this README](https://github.com/TIGER-AI-Lab/TheoremExplainAgent?tab=readme-ov-file#-faq) if you encounter any errors. If that doesn't help, create an issue.**<br>
+
+ 1. Set up the conda environment:
+ ```shell
+ conda create --name tea python=3.12.8
+ conda activate tea
+ pip install -r requirements.txt
+ ```
+
+ 2. You may also need to install LaTeX and other dependencies for Manim Community. See the [Manim Installation Docs](https://docs.manim.community/en/stable/installation.html) for more details.
+ ```shell
+ # You might need these dependencies if you are using Linux Ubuntu:
+ sudo apt-get install portaudio19-dev
+ sudo apt-get install libsdl-pango-dev
+ ```
+
+ 3. Download the Kokoro model and voices with the following commands to enable the TTS service.
+
+ ```shell
+ mkdir -p models && wget -P models https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/kokoro-v0_19.onnx && wget -P models https://github.com/thewh1teagle/kokoro-onnx/releases/download/model-files/voices.bin
+ ```
+
+ 4. Create `.env` based on `.env.template`, filling in the environment variables according to the models you choose to use.
+ See [LiteLLM](https://docs.litellm.ai/docs/providers) for reference.
+
+ ```shell
+ touch .env
+ ```
+ Then open the `.env` file and edit it with whatever text editor you like.
+
+ Your `.env` file should look like the following:
+ ```shell
+ # OpenAI
+ OPENAI_API_KEY=""
+
+ # Azure OpenAI
+ AZURE_API_KEY=""
+ AZURE_API_BASE=""
+ AZURE_API_VERSION=""
+
+ # Google Vertex AI
+ VERTEXAI_PROJECT=""
+ VERTEXAI_LOCATION=""
+ GOOGLE_APPLICATION_CREDENTIALS=""
+
+ # Google Gemini
+ GEMINI_API_KEY=""
+
+ ...
+
+ # Kokoro TTS Settings
+ KOKORO_MODEL_PATH="models/kokoro-v0_19.onnx"
+ KOKORO_VOICES_PATH="models/voices.bin"
+ KOKORO_DEFAULT_VOICE="af"
+ KOKORO_DEFAULT_SPEED="1.0"
+ KOKORO_DEFAULT_LANG="en-us"
+ ```
+ Fill in the API keys according to the models you want to use.
+
+ 5. Configure the Python path. This step is required; otherwise you may run into import errors (e.g., `src` cannot be imported). A quick way to verify it is shown after the command below.
+ ```shell
+ export PYTHONPATH=$(pwd):$PYTHONPATH
+ ```
+
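To confirm the path is set correctly, a quick check (run from the repository root, where the `src` package lives) is:

```shell
python -c "import src; print('PYTHONPATH is configured correctly')"
```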
+ 6. (Optional) To set up RAG, see [https://github.com/TIGER-AI-Lab/TheoremExplainAgent?tab=readme-ov-file#generation-with-rag](https://github.com/TIGER-AI-Lab/TheoremExplainAgent?tab=readme-ov-file#generation-with-rag).
+
+ > **Look at the [FAQ section in this README](https://github.com/TIGER-AI-Lab/TheoremExplainAgent?tab=readme-ov-file#-faq) if you encounter any errors. If that doesn't help, create an issue.**<br>
+
+ ## Generation
+
+ ### Supported Models
+ <!--You can customize the allowed models by editing the `src/utils/allowed_models.json` file. This file specifies which `model` and `helper_model` the system is permitted to use.-->
+ The model naming follows the LiteLLM convention. For details on how models should be named, please refer to the [LiteLLM documentation](https://docs.litellm.ai/docs/providers).
+
+ ### Generation (Single topic)
+ ```shell
+ python generate_video.py \
+     --model "openai/o3-mini" \
+     --helper_model "openai/o3-mini" \
+     --output_dir "output/your_exp_name" \
+     --topic "your_topic" \
+     --context "description of your topic, e.g. 'This is a topic about the properties of a triangle'" \
+ ```
+
+ Example:
+ ```shell
+ python generate_video.py \
+     --model "openai/o3-mini" \
+     --helper_model "openai/o3-mini" \
+     --output_dir "output/my_exp_name" \
+     --topic "Big O notation" \
+     --context "most common type of asymptotic notation in computer science used to measure worst case complexity" \
+ ```
+
+ ### Generation (in batch)
+ ```shell
+ python generate_video.py \
+     --model "openai/o3-mini" \
+     --helper_model "openai/o3-mini" \
+     --output_dir "output/my_exp_name" \
+     --theorems_path data/thb_easy/math.json \
+     --max_scene_concurrency 7 \
+     --max_topic_concurrency 20 \
+ ```
+
+ ### Generation with RAG
+ Before using RAG, download the RAG documentation from this [Google Drive link](https://drive.google.com/file/d/1Tn6J_JKVefFZRgZbjns93KLBtI9ullRv/view?usp=sharing). After downloading, unzip the file. For example, if you unzip it to `data/rag/manim_docs`, then you should set `--manim_docs_path` to `data/rag/manim_docs`. The vector database will be created the first time you run with RAG.
+
+ ```shell
+ python generate_video.py \
+     --model "openai/o3-mini" \
+     --helper_model "openai/o3-mini" \
+     --output_dir "output/with_rag/o3-mini/vtutorbench_easy/math" \
+     --topic "Big O notation" \
+     --context "most common type of asymptotic notation in computer science used to measure worst case complexity" \
+     --use_rag \
+     --chroma_db_path "data/rag/chroma_db" \
+     --manim_docs_path "data/rag/manim_docs" \
+     --embedding_model "vertex_ai/text-embedding-005"
+ ```
+
+ We support more options for generation; see below for details:
+ ```shell
+ usage: generate_video.py [-h]
+                          [--model]
+                          [--topic TOPIC] [--context CONTEXT]
+                          [--helper_model]
+                          [--only_gen_vid] [--only_combine] [--peek_existing_videos] [--output_dir OUTPUT_DIR] [--theorems_path THEOREMS_PATH]
+                          [--sample_size SAMPLE_SIZE] [--verbose] [--max_retries MAX_RETRIES] [--use_rag] [--use_visual_fix_code]
+                          [--chroma_db_path CHROMA_DB_PATH] [--manim_docs_path MANIM_DOCS_PATH]
+                          [--embedding_model {azure/text-embedding-3-large,vertex_ai/text-embedding-005}] [--use_context_learning]
+                          [--context_learning_path CONTEXT_LEARNING_PATH] [--use_langfuse] [--max_scene_concurrency MAX_SCENE_CONCURRENCY]
+                          [--max_topic_concurrency MAX_TOPIC_CONCURRENCY] [--debug_combine_topic DEBUG_COMBINE_TOPIC] [--only_plan] [--check_status]
+                          [--only_render] [--scenes SCENES [SCENES ...]]
+
+ Generate Manim videos using AI
+
+ options:
+   -h, --help            show this help message and exit
+   --model               Select the AI model to use
+   --topic TOPIC         Topic to generate videos for
+   --context CONTEXT     Context of the topic
+   --helper_model        Select the helper model to use
+   --only_gen_vid        Only generate videos to existing plans
+   --only_combine        Only combine videos
+   --peek_existing_videos, --peek
+                         Peek at existing videos
+   --output_dir OUTPUT_DIR
+                         Output directory
+   --theorems_path THEOREMS_PATH
+                         Path to theorems json file
+   --sample_size SAMPLE_SIZE, --sample SAMPLE_SIZE
+                         Number of theorems to sample
+   --verbose             Print verbose output
+   --max_retries MAX_RETRIES
+                         Maximum number of retries for code generation
+   --use_rag, --rag      Use Retrieval Augmented Generation
+   --use_visual_fix_code, --visual_fix_code
+                         Use VLM to fix code with rendered visuals
+   --chroma_db_path CHROMA_DB_PATH
+                         Path to Chroma DB
+   --manim_docs_path MANIM_DOCS_PATH
+                         Path to manim docs
+   --embedding_model {azure/text-embedding-3-large,vertex_ai/text-embedding-005}
+                         Select the embedding model to use
+   --use_context_learning
+                         Use context learning with example Manim code
+   --context_learning_path CONTEXT_LEARNING_PATH
+                         Path to context learning examples
+   --use_langfuse        Enable Langfuse logging
+   --max_scene_concurrency MAX_SCENE_CONCURRENCY
+                         Maximum number of scenes to process concurrently
+   --max_topic_concurrency MAX_TOPIC_CONCURRENCY
+                         Maximum number of topics to process concurrently
+   --debug_combine_topic DEBUG_COMBINE_TOPIC
+                         Debug combine videos
+   --only_plan           Only generate scene outline and implementation plans
+   --check_status        Check planning and code status for all theorems
+   --only_render         Only render scenes without combining videos
+   --scenes SCENES [SCENES ...]
+                         Specific scenes to process (if theorems_path is provided)
+ ```
+
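For batch runs it can be useful to split planning and rendering into separate passes. An illustrative two-step workflow assembled from the flags above (paths and model are examples, not prescribed settings):

```shell
# 1. Generate scene outlines and implementation plans only
python generate_video.py \
    --model "openai/o3-mini" \
    --helper_model "openai/o3-mini" \
    --output_dir "output/my_exp_name" \
    --theorems_path data/thb_easy/math.json \
    --only_plan

# 2. Check planning and code status for all theorems before rendering
python generate_video.py \
    --output_dir "output/my_exp_name" \
    --theorems_path data/thb_easy/math.json \
    --check_status
```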
+ ## Evaluation
+ Note that Gemini and GPT-4o are required for evaluation.
+
+ Currently, evaluation requires a video file and a subtitle file (SRT format).
+
+ Video evaluation:
+ ```shell
+ usage: evaluate.py [-h]
+                    [--model_text {gemini/gemini-1.5-pro-002,gemini/gemini-1.5-flash-002,gemini/gemini-2.0-flash-001,vertex_ai/gemini-1.5-flash-002,vertex_ai/gemini-1.5-pro-002,vertex_ai/gemini-2.0-flash-001,openai/o3-mini,gpt-4o,azure/gpt-4o,azure/gpt-4o-mini,bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0,bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0,bedrock/anthropic.claude-3-5-haiku-20241022-v1:0,bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0}]
+                    [--model_video {gemini/gemini-1.5-pro-002,gemini/gemini-2.0-flash-exp,gemini/gemini-2.0-pro-exp-02-05}]
+                    [--model_image {gemini/gemini-1.5-pro-002,gemini/gemini-1.5-flash-002,gemini/gemini-2.0-flash-001,vertex_ai/gemini-1.5-flash-002,vertex_ai/gemini-1.5-pro-002,vertex_ai/gemini-2.0-flash-001,openai/o3-mini,gpt-4o,azure/gpt-4o,azure/gpt-4o-mini,bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0,bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0,bedrock/anthropic.claude-3-5-haiku-20241022-v1:0,bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0}]
+                    [--eval_type {text,video,image,all}] --file_path FILE_PATH --output_folder OUTPUT_FOLDER [--retry_limit RETRY_LIMIT] [--combine] [--bulk_evaluate] [--target_fps TARGET_FPS]
+                    [--use_parent_folder_as_topic] [--max_workers MAX_WORKERS]
+
+ Automatic evaluation of theorem explanation videos with LLMs
+
+ options:
+   -h, --help            show this help message and exit
+   --model_text {gemini/gemini-1.5-pro-002,gemini/gemini-1.5-flash-002,gemini/gemini-2.0-flash-001,vertex_ai/gemini-1.5-flash-002,vertex_ai/gemini-1.5-pro-002,vertex_ai/gemini-2.0-flash-001,openai/o3-mini,gpt-4o,azure/gpt-4o,azure/gpt-4o-mini,bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0,bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0,bedrock/anthropic.claude-3-5-haiku-20241022-v1:0,bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0}
+                         Select the AI model to use for text evaluation
+   --model_video {gemini/gemini-1.5-pro-002,gemini/gemini-2.0-flash-exp,gemini/gemini-2.0-pro-exp-02-05}
+                         Select the AI model to use for video evaluation
+   --model_image {gemini/gemini-1.5-pro-002,gemini/gemini-1.5-flash-002,gemini/gemini-2.0-flash-001,vertex_ai/gemini-1.5-flash-002,vertex_ai/gemini-1.5-pro-002,vertex_ai/gemini-2.0-flash-001,openai/o3-mini,gpt-4o,azure/gpt-4o,azure/gpt-4o-mini,bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0,bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0,bedrock/anthropic.claude-3-5-haiku-20241022-v1:0,bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0}
+                         Select the AI model to use for image evaluation
+   --eval_type {text,video,image,all}
+                         Type of evaluation to perform
+   --file_path FILE_PATH
+                         Path to a file or a theorem folder
+   --output_folder OUTPUT_FOLDER
+                         Directory to store the evaluation files
+   --retry_limit RETRY_LIMIT
+                         Number of retry attempts for each inference
+   --combine             Combine all results into a single JSON file
+   --bulk_evaluate       Evaluate a folder of theorems together
+   --target_fps TARGET_FPS
+                         Target FPS for video processing. If not set, original video FPS will be used
+   --use_parent_folder_as_topic
+                         Use parent folder name as topic name for single file evaluation
+   --max_workers MAX_WORKERS
+                         Maximum number of concurrent workers for parallel processing
+ ```
+ * For `file_path`, it is recommended to pass a folder containing both an MP4 file and an SRT file.
+
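As an illustration, evaluating a single theorem folder that contains the rendered MP4 and its SRT might look like this (the models and paths are examples assembled from the options above; adjust them to your setup):

```shell
python evaluate.py \
    --model_text "gemini/gemini-2.0-flash-001" \
    --model_video "gemini/gemini-2.0-flash-exp" \
    --model_image "gemini/gemini-2.0-flash-001" \
    --eval_type all \
    --file_path "output/my_exp_name/big_o_notation" \
    --output_folder "eval_results" \
    --use_parent_folder_as_topic
```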
+ ## Misc: Modify the system prompt in TheoremExplainAgent
+
+ If you want to modify the system prompt, you need to:
+
+ 1. Modify the files in the `task_generator/prompts_raw` folder.
+ 2. Run `task_generator/parse_prompt.py` to rebuild the `__init__.py` file.
+
+ ```shell
+ cd task_generator
+ python parse_prompt.py
+ cd ..
+ ```
+
+ ## TheoremExplainBench (TEB)
+
+ TheoremExplainBench can be found at https://huggingface.co/datasets/TIGER-Lab/TheoremExplainBench.
+
+ How to use:
+ ```python
+ import datasets
+ dataset = datasets.load_dataset("TIGER-Lab/TheoremExplainBench")
+ ```
+
+ Dataset info:
+ ```shell
+ DatasetDict({
+     train: Dataset({
+         features: ['uid', 'subject', 'difficulty', 'theorem', 'description', 'subfield'],
+         num_rows: 240
+     })
+ })
+ ```
+
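For example, to look at a single entry or filter by difficulty, a small sketch using the fields listed above:

```python
import datasets

dataset = datasets.load_dataset("TIGER-Lab/TheoremExplainBench")

# Inspect one row; each row has: uid, subject, difficulty, theorem, description, subfield
example = dataset["train"][0]
print(example["theorem"], "-", example["subfield"])

# Keep only the easy items
easy = dataset["train"].filter(lambda row: row["difficulty"] == "Easy")
print(len(easy))
```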
+ ## ❓ FAQ
+
+ The FAQ covers the most common errors you could encounter. If you see something new, report it in the issues.
+
+ Q: Error `src.utils.kokoro_voiceover import KokoroService # You MUST import like this as this is our custom voiceover service. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ModuleNotFoundError: No module named 'src'`. <br>
+ A: Please run `export PYTHONPATH=$(pwd):$PYTHONPATH` when you start a new terminal. <br>
+
+ Q: Error `Files not found` <br>
+ A: Check your Manim installation. <br>
+
+ Q: Error `latex ...` <br>
+ A: Check your LaTeX installation. <br>
+
+ Q: The output log is not showing a response? <br>
+ A: It could be an API-related issue. Make sure your `.env` file is properly configured (fill in your API keys), or enable LiteLLM debug mode to track down the issue. <br>
+
+ Q: Plans / Scenes are missing? <br>
+ A: It could be an API-related issue. Make sure your `.env` file is properly configured (fill in your API keys), or enable LiteLLM debug mode to track down the issue. <br>
+
+
+ ## 🖊️ Citation
+
+ Please kindly cite our paper if you use our code, data, models or results:
+ ```bibtex
+ @misc{ku2025theoremexplainagentmultimodalexplanationsllm,
+       title={TheoremExplainAgent: Towards Multimodal Explanations for LLM Theorem Understanding},
+       author={Max Ku and Thomas Chong and Jonathan Leung and Krish Shah and Alvin Yu and Wenhu Chen},
+       year={2025},
+       eprint={2502.19400},
+       archivePrefix={arXiv},
+       primaryClass={cs.AI},
+       url={https://arxiv.org/abs/2502.19400},
+ }
+ ```
+
+ ## 🎫 License
+
+ This project is released under the [MIT License](LICENSE).
+
+ ## ⭐ Star History
+
+ [![Star History Chart](https://api.star-history.com/svg?repos=TIGER-AI-Lab/TheoremExplainAgent&type=Date)](https://star-history.com/#TIGER-AI-Lab/TheoremExplainAgent&Date)
+
+ ## 💞 Acknowledgements
+
+ We want to thank [Votee AI](https://votee.ai/) for sponsoring API keys to access the closed-source models.
+
+ The code is built upon the repositories below; we thank all the contributors for open-sourcing their work.
+ * [Manim Community](https://www.manim.community/)
+ * [kokoro-manim-voiceover](https://github.com/xposed73/kokoro-manim-voiceover)
+ * [manim-physics](https://github.com/Matheart/manim-physics)
+ * [manim-Chemistry](https://github.com/UnMolDeQuimica/manim-Chemistry)
+ * [ManimML](https://github.com/helblazer811/ManimML)
+ * [manim-dsa](https://github.com/F4bbi/manim-dsa)
+ * [manim-circuit](https://github.com/Mr-FuzzyPenguin/manim-circuit)
+
+ ## 🚨 Disclaimer
+
+ **This work is intended for research purposes only. The authors do not encourage or endorse the use of this codebase for commercial applications. The code is provided "as is" without any warranties, and users assume all responsibility for its use.**
+
+ Tested environments: macOS, Linux
app.py CHANGED
@@ -1,7 +1,167 @@
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import uuid
+ import subprocess
+ import threading
+ import os
+ import time
+ from fastapi import FastAPI
+ from fastapi.responses import FileResponse
+ import asyncio
+
+
+ # A simple in-memory dictionary to track task status.
+ # For a production system, you'd use a database or Redis.
+ tasks = {}
+
+ def run_video_generation(task_id: str, topic: str, context: str):
+     """
+     This function runs the main generation script in a separate process.
+     """
+     tasks[task_id]['status'] = 'running'
+
+     # Sanitize topic to create a valid directory name
+     file_prefix = "".join(c if c.isalnum() else "_" for c in topic.lower())
+     output_dir = os.path.join("output", file_prefix)
+
+     command = [
+         "python", "generate_video.py",
+         "--model", "openai/o3-mini",  # Or get from request
+         "--topic", topic,
+         "--context", context,
+         "--output_dir", "output",
+         "--use_langfuse"  # Assuming you have secrets set
+     ]
+
+     try:
+         # Using subprocess to run the existing script
+         process = subprocess.run(command, check=True, capture_output=True, text=True)
+
+         # Assume the final video is named based on the topic
+         # Note: The actual video path might differ. This is an assumption.
+         # You may need to parse the stdout from generate_video.py to get the exact path.
+         video_path = None
+         for file in os.listdir(output_dir):
+             if file.endswith("_combined.mp4"):
+                 video_path = os.path.join(output_dir, file)
+                 break
+
+         if video_path and os.path.exists(video_path):
+             tasks[task_id]['status'] = 'completed'
+             tasks[task_id]['video_path'] = video_path
+         else:
+             tasks[task_id]['status'] = 'failed'
+             tasks[task_id]['error'] = "Video file not found after generation."
+             tasks[task_id]['stdout'] = process.stdout
+             tasks[task_id]['stderr'] = process.stderr
+
+     except subprocess.CalledProcessError as e:
+         tasks[task_id]['status'] = 'failed'
+         tasks[task_id]['error'] = str(e)
+         tasks[task_id]['stdout'] = e.stdout
+         tasks[task_id]['stderr'] = e.stderr
+     except Exception as e:
+         tasks[task_id]['status'] = 'failed'
+         tasks[task_id]['error'] = str(e)
+
+ def start_generation_thread(topic: str, context: str):
+     if not topic or not context:
+         return "Topic and Context cannot be empty.", "", None
+
+     task_id = str(uuid.uuid4())
+     tasks[task_id] = {'status': 'queued'}
+
+     # Use a background thread to run the time-consuming task
+     thread = threading.Thread(
+         target=run_video_generation,
+         args=(task_id, topic, context)
+     )
+     thread.start()
+
+     return f"Task started. Your Task ID is: {task_id}", task_id, None
+
+
+ def check_status(task_id: str):
+     if not task_id:
+         return "Please provide a Task ID.", None
+
+     task = tasks.get(task_id)
+     if not task:
+         return "Task not found.", None
+
+     status = task.get('status')
+     if status == 'completed':
+         video_path = task.get('video_path')
+         return f"Status: {status}", video_path
+     elif status == 'failed':
+         error = task.get('error', 'Unknown error')
+         stdout = task.get('stdout', '')
+         stderr = task.get('stderr', '')
+         return f"Status: {status}\nError: {error}\nOutput: {stdout}\nStderr: {stderr}", None
+
+     return f"Status: {status}", None
+
+ # We need a lightweight FastAPI app in the background to serve the video files.
+ # Gradio can't serve files directly from arbitrary paths in a secure way.
+ fastapi_app = FastAPI()
+
+ @fastapi_app.get("/videos/{task_id}")
+ def get_video(task_id: str):
+     """
+     Serves the final generated video file.
+     """
+     task = tasks.get(task_id)
+     if not task or task.get('status') != 'completed':
+         return {"error": "Task not completed or not found"}
+
+     video_path = task.get('video_path')
+     if not os.path.exists(video_path):
+         return {"error": "Video file not found."}
+
+     return FileResponse(video_path, media_type="video/mp4", filename=os.path.basename(video_path))
+
+
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Theorem-Explain-Agent Video Generation")
+     gr.Markdown("Start a video generation task and check its status.")
+
+     with gr.Tab("Start Generation"):
+         topic_input = gr.Textbox(label="Topic", placeholder="e.g., The Pythagorean Theorem")
+         context_input = gr.Textbox(label="Context", placeholder="A short explanation of the theorem.")
+         start_button = gr.Button("Generate Video")
+
+         with gr.Column():
+             task_id_output = gr.Textbox(label="Task ID", interactive=False)
+             status_output_start = gr.Textbox(label="Status", interactive=False)
+
+     with gr.Tab("Check Status"):
+         task_id_input = gr.Textbox(label="Task ID", placeholder="Enter the Task ID you received.")
+         check_button = gr.Button("Check Status")
+
+         with gr.Column():
+             status_output_check = gr.Textbox(label="Status", interactive=False)
+             video_output = gr.Video(label="Generated Video")
+
+     # Actions
+     start_button.click(
+         fn=start_generation_thread,
+         inputs=[topic_input, context_input],
+         outputs=[status_output_start, task_id_output, video_output]  # Clear video on new task
+     )
+
+     check_button.click(
+         fn=check_status,
+         inputs=[task_id_input],
+         outputs=[status_output_check, video_output]
+     )
+
+     gr.Markdown("### How to Use")
+     gr.Markdown(
+         "1. Enter a `Topic` and `Context` in the 'Start Generation' tab and click 'Generate Video'.\n"
+         "2. Copy the `Task ID` that appears.\n"
+         "3. Go to the 'Check Status' tab, paste the `Task ID`, and click 'Check Status' periodically.\n"
+         "4. When the generation is complete, the video will appear."
+     )
+
+ # To run both Gradio and FastAPI together, we mount the Gradio Blocks app onto the FastAPI app.
+ app = gr.mount_gradio_app(fastapi_app, demo, path="/")
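Since the Dockerfile starts the Space with `python app.py`, the mounted app still needs an ASGI server to actually serve requests on port 7860. A minimal sketch of how it could be launched (assuming `uvicorn`, which ships alongside FastAPI installations, is available):

```python
# Hypothetical launch block: serve the combined FastAPI + Gradio app with uvicorn
# so that `python app.py` (as used in the Dockerfile CMD) binds port 7860.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
```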
data/thb_easy/chemistry.json ADDED
@@ -0,0 +1,142 @@
+ [
+   {
+     "theorem": "The Aufbau Principle",
+     "description": "Electrons fill atomic orbitals in order of increasing energy levels. This means the lowest energy orbitals are filled first, followed by higher energy orbitals. This helps in predicting electronic configuration and understanding the properties of elements.",
+     "difficulty": "Easy",
+     "remark": "Fundamental principle for building the electron configurations of atoms and understanding the periodic table.",
+     "subfield": "Atomic Structure"
+   },
+   {
+     "theorem": "The Law of Conservation of Mass",
+     "description": "In a closed system, the total mass of the reactants is equal to the total mass of the products. This implies that matter is neither created nor destroyed during a chemical reaction, only transformed. This principle is fundamental for understanding stoichiometry.",
+     "difficulty": "Easy",
+     "remark": "A cornerstone of chemistry, this principle allows us to balance chemical equations and make quantitative predictions.",
+     "subfield": "Chemical Reactions and Stoichiometry"
+   },
+   {
+     "theorem": "The Octet Rule",
+     "description": "Atoms tend to gain, lose, or share electrons in order to achieve a full outer shell of eight electrons (or two in the case of hydrogen and some other exceptions). This explains the bonding behaviour of most main group elements, guiding the formations of compounds.",
+     "difficulty": "Easy",
+     "remark": "Simple and powerful rule to understand the formations of chemical bonds and predict molecules' structures.",
+     "subfield": "Chemical Bonding"
+   },
+   {
+     "theorem": "Alkali metals",
+     "description": "The alkali metals consist of the chemical elements lithium (Li), sodium (Na), potassium (K), rubidium (Rb), caesium (Cs), and francium (Fr).",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Periodic Table and Elements"
+   },
+   {
+     "theorem": "Distillation",
+     "description": "In chemistry, Distillation is among the most useful methods available to chemists for separating the parts of a liquid. A process that relies on a cycle of heating, vaporization, condensing and cooling. A liquid of a lower boiling point will vaporize before a liquid of higher boiling point.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Separation Techniques"
+   },
+   {
+     "theorem": "Crystallization",
+     "description": "In chemistry, Crystallization, or crystallisation, is the process of atoms or molecules arranging into a well-defined, rigid crystal lattice in order to minimize their energetic state. The smallest entity of a crystal lattice is called a unit cell, which can accept atoms or molecules to grow a macroscopic crystal.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Solid State Chemistry"
+   },
+   {
+     "theorem": "Titration",
+     "description": "Titration is a common laboratory method of quantitative chemical analysis to determine the concentration of an identified analyte. A reagent, termed the titrant or titrator, is prepared as a standard solution of known concentration and volume.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Analytical Chemistry"
+   },
+   {
+     "theorem": "Ionic Compound",
+     "description": "An ionic compound is a chemical compound composed of ions. Ionic compounds are formed by the electrostatic attraction between positively charged cations and negatively charged anions.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Chemical Bonding"
+   },
+   {
+     "theorem": "Noble gas",
+     "description": "The noble gases are so named because they rarely react with other elements. Helium, neon, argon, krypton, xenon and radon atoms all have a full outer valence shell of electrons, which makes them quite unreactive.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Periodic Table and Elements"
+   },
+   {
+     "theorem": "Transition Metal",
+     "description": "Transition metal, any of various chemical elements that have valence electrons—i.e., electrons that can participate in the formation of chemical bonds—in two shells instead of only one.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Periodic Table and Elements"
+   },
+   {
+     "theorem": "Balance Chemical Equation",
+     "description": "A balanced equation is an equation for a chemical reaction in which the number of atoms for each element in the reaction and the total charge are the same for both the reactants and the products.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Chemical Reactions and Stoichiometry"
+   },
+   {
+     "theorem": "Combustion analysis",
+     "description": "Combustion analysis is a method used in both organic chemistry and analytical chemistry to determine the elemental composition (more precisely empirical formula) of a pure organic compound by combusting the sample under conditions where the resulting combustion products can be quantitatively analyzed.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Analytical Chemistry"
+   },
+   {
+     "theorem": "Oxidation",
+     "description": "In chemistry, the oxidation state, or oxidation number, is the hypothetical charge of an atom if all of its bonds to other atoms were fully ionic. It describes the degree of oxidation of an atom in a chemical compound. Conceptually, the oxidation state may be positive, negative or zero.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Redox Chemistry"
+   },
+   {
+     "theorem": "First law of thermodynamics",
+     "description": "The first law of thermodynamics is a formulation of the law of conservation of energy in the context of thermodynamic processes. The law distinguishes two principal forms of energy transfer, heat and thermodynamic work, that modify a thermodynamic system containing a constant amount of matter.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Thermodynamics"
+   },
+   {
+     "theorem": "Hess's Law",
+     "description": "The enthalpy change of a reaction is independent of the path taken from reactants to products. This allows the calculation of enthalpy changes for reactions that cannot be easily measured directly by using a series of reactions with known enthalpy changes. The overall enthalpy change is the sum of enthalpy changes of individual steps.",
+     "difficulty": "Easy",
+     "remark": "Useful for calculating enthalpy changes of complex reactions. It's based on the state function of enthalpy.",
+     "subfield": "Thermodynamics"
+   },
+   {
+     "theorem": "The Ideal Gas Law",
+     "description": "The product of the pressure and volume of an ideal gas is proportional to the product of the amount of gas and its absolute temperature: PV = nRT. This law describes the behavior of ideal gases and helps predict their volume, pressure, temperature, or amount under given conditions.",
+     "difficulty": "Easy",
+     "remark": "Ideal for understanding the behaviour of gases, often used in stoichiometry related to gases. Assumes no intermolecular forces or particle volume.",
+     "subfield": "Gas Laws"
+   },
+   {
+     "theorem": "Charles's Law",
+     "description": "Charles's law (also known as the law of volumes) is an experimental gas law that describes how gases tend to expand when heated.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Gas Laws"
+   },
+   {
+     "theorem": "Gay-Lussac's Law",
+     "description": "Gay-Lussac's law usually refers to Joseph-Louis Gay-Lussac's law of combining volumes of gases, discovered in 1808 and published in 1809.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Gas Laws"
+   },
+   {
+     "theorem": "pH Scale Definition",
+     "description": "pH is a measure of the hydrogen ion concentration in a solution.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Acid-Base Chemistry"
+   },
+   {
+     "theorem": "Van't Hoff Equation",
+     "description": "The Van 't Hoff equation has been widely utilized to explore the changes in state functions in a thermodynamic system.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Chemical Kinetics"
+   }
+ ]
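Each entry in these benchmark files shares the same schema (`theorem`, `description`, `difficulty`, `remark`, `subfield`), which is what `--theorems_path` consumes in batch mode. A quick way to inspect a file (a small sketch; run from the repository root):

```python
import json

# Load one of the benchmark files committed above
with open("data/thb_easy/chemistry.json") as f:
    theorems = json.load(f)

print(len(theorems))               # number of theorem entries in this file
print(theorems[0]["theorem"])      # "The Aufbau Principle"
print(theorems[0]["subfield"])     # "Atomic Structure"
```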
data/thb_easy/comp_sci.json ADDED
@@ -0,0 +1,142 @@
+ [
+   {
+     "theorem": "The Pigeonhole Principle",
+     "description": "If you have more pigeons than pigeonholes, then at least one pigeonhole must contain more than one pigeon. More formally, if *n* items are put into *m* containers, with *n > m*, then at least one container must contain more than one item.",
+     "difficulty": "Easy",
+     "remark": "A fundamental principle in combinatorics with surprising applications in various areas of computer science, like proving existence in hashing or data compression. Simple to understand, powerful in use.",
+     "subfield": "Discrete Mathematics"
+   },
+   {
+     "theorem": "De Morgan's Laws",
+     "description": "De Morgan's Laws provide a way to simplify or transform logical statements involving AND, OR, and NOT. Specifically: 1) NOT (A AND B) is equivalent to (NOT A) OR (NOT B). 2) NOT (A OR B) is equivalent to (NOT A) AND (NOT B).",
+     "difficulty": "Easy",
+     "remark": "Crucial for boolean algebra and digital logic design. Helps with simplifying complex logic expressions and is widely used in programming.",
+     "subfield": "Boolean Algebra"
+   },
+   {
+     "theorem": "The Time Complexity of Linear Search",
+     "description": "In the worst-case scenario, searching for an element in an unsorted array using linear search requires O(n) time, where 'n' is the number of elements in the array. This is because the algorithm may need to examine every element in the array to find or conclude the non-existence of the target.",
+     "difficulty": "Easy",
+     "remark": "A foundational concept in algorithm analysis. Illustrates how the running time of an algorithm scales with the input size.",
+     "subfield": "Algorithm Analysis"
+   },
+   {
+     "theorem": "The Properties of a Binary Tree",
+     "description": "For a complete or full binary tree: 1) The maximum number of nodes at level *l* is 2^l (where the root is at level 0). 2) The total number of nodes in a complete binary tree of *h* depth is 2^(h+1) - 1.",
+     "difficulty": "Easy",
+     "remark": "Fundamental for understanding and analyzing tree data structures. Used in many algorithmic designs.",
+     "subfield": "Data Structures"
+   },
+   {
+     "theorem": "The Triangle Inequality Theorem",
+     "description": "The triangle inequality states that for any three points A, B, and C in a metric space (e.g., the Euclidean plane), the sum of the lengths of any two sides of a triangle must be greater than or equal to the length of the third side. |AB| + |BC| >= |AC|",
+     "difficulty": "Easy",
+     "remark": "Often used in graph algorithms (e.g. proving properties of shortest path) . The principle is used as basis of many distance metrics.",
+     "subfield": "Computational Geometry"
+   },
+   {
+     "theorem": "Hamming distance",
+     "description": "In information theory, the Hamming distance between two strings or vectors of equal length is the number of positions at which the corresponding symbols are different.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Information Theory"
+   },
+   {
+     "theorem": "Big O notation",
+     "description": "most common type of asymptotic notation in computer science used to measure worst case complexity",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Algorithm Analysis"
+   },
+   {
+     "theorem": "Deadlock",
+     "description": "A deadlock is a situation where two or more processes are blocked waiting for each other to release resources, resulting in a circular wait condition.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Operating Systems"
+   },
+   {
+     "theorem": "Bubble Sort",
+     "description": "Bubble sort is a simple sorting algorithm that repeatedly steps through the list, compares adjacent elements and swaps them if they are in the wrong order.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Algorithms"
+   },
+   {
+     "theorem": "Karnaugh Map",
+     "description": "A Karnaugh map (K-map) is a graphical method for simplifying Boolean algebra expressions.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Digital Logic Design"
+   },
+   {
+     "theorem": "Hash table",
+     "description": "A hash table uses a hash function to compute an index, also called a hash code, into an array of buckets or slots, from which the desired value can be found.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Data Structures"
+   },
+   {
+     "theorem": "Linked list",
+     "description": "data structure that does not necessarily store elements next to each other and instead works by maintaining, for each element, a link to the next element in the list",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Data Structures"
+   },
+   {
+     "theorem": "Chain Code",
+     "description": "A chain code is a lossless compression based image segmentation method for binary images based upon tracing image contours. The basic principle of chain coding, like other contour codings, is to separately encode each connected component, or blob in the image.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Image Processing"
+   },
+   {
+     "theorem": "Signal-to-noise ratio",
+     "description": "The signal-to-noise ratio (SNR) is a measure of the ratio between the power of a signal and the power of background noise.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Signal Processing"
+   },
+   {
+     "theorem": "Run-length encoding",
+     "description": "Run-length encoding (RLE) is a form of data compression that encodes consecutive data elements by a single data value and count, rather than by the original data values.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Data Compression"
+   },
+   {
+     "theorem": "Elbow method",
+     "description": "The elbow method is a graphical method for finding the optimal K value in a k-means clustering algorithm.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Machine Learning"
+   },
+   {
+     "theorem": "Huffman coding",
+     "description": "In computer science and information theory, a Huffman code is a particular type of optimal prefix code that is commonly used for lossless data compression.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Data Compression"
+   },
+   {
+     "theorem": "Paging",
+     "description": "Paging is a memory management technique used in operating systems to manage virtual memory. It involves dividing the virtual address space into fixed-size blocks called pages, and storing these pages in a secondary storage device called a paging file.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Operating Systems"
+   },
+   {
+     "theorem": "OSI model",
+     "description": "The Open Systems Interconnection (OSI) model is a conceptual framework that describes how data is sent over a network.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Computer Networks"
+   },
+   {
+     "theorem": "IEEE Convertion",
+     "description": "The IEEE-754 standard describes floating-point formats, a way to represent real numbers in hardware.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Computer Architecture"
+   }
+ ]
data/thb_easy/math.json ADDED
@@ -0,0 +1,142 @@
+ [
+   {
+     "theorem": "The Pythagorean Theorem",
+     "description": "In a right-angled triangle, the square of the length of the hypotenuse (the side opposite the right angle) is equal to the sum of the squares of the lengths of the other two sides. If a and b are the lengths of the legs and c is the length of the hypotenuse, then a\u00b2 + b\u00b2 = c\u00b2.",
+     "difficulty": "Easy",
+     "remark": "Fundamental theorem in geometry; widely used in various fields.",
+     "subfield": "Geometry"
+   },
+   {
+     "theorem": "Properties of Kites",
+     "description": "A kite is a quadrilateral with two pairs of adjacent, congruent sides. In geometry, kites have several unique properties that distinguish them from other quadrilaterals. Here are some of the key properties of kites:\n\n1. Two pairs of adjacent sides are congruent: In a kite, there are two distinct pairs of adjacent sides that have equal length. This means that if one pair of sides has a length of 'a', the other pair will also have a length of 'a', and if the other pair has a length of 'b', the first pair will also have a length of 'b'.\n\n2. Diagonals are perpendicular: The diagonals of a kite intersect at a 90-degree angle, meaning they are perpendicular to each other.\n\n3. One diagonal is bisected: In a kite, one of the diagonals is bisected by the other diagonal, meaning it is divided into two equal parts. This property is true for the diagonal connecting the vertices between the congruent sides.\n\n4. One pair of opposite angles is congruent: In a kite, the angles between the congruent sides (the angles formed by the two pairs of equal sides) are congruent, meaning they have the same degree measure.\n\n5. Area: The area of a kite can be calculated using the lengths of its diagonals. If 'd1' and 'd2' are the lengths of the diagonals, the area of the kite is given by the formula: Area = (1/2) * d1 * d2.\n\n6. Circumscribed circle: A kite can have a circumscribed circle only if it is a rhombus (all sides are congruent) or a square (all sides and angles are congruent).\n\n7. Inscribed circle: A kite can have an inscribed circle only if it is a square (all sides and angles are congruent).\n\nThese properties make kites an interesting and unique type of quadrilateral in geometry.",
+     "difficulty": "Easy",
+     "remark": "Properties of kites are useful for solving geometry problems involving kites.",
+     "subfield": "Geometry"
+   },
+   {
+     "theorem": "Euler's formula",
+     "description": "Euler's formula is a fundamental equation in complex analysis that establishes a deep connection between trigonometry and complex exponentials. It is named after the Swiss mathematician Leonhard Euler. The formula is given by:\n\ne^(ix) = cos(x) + i*sin(x)\n\nwhere e is the base of the natural logarithm (approximately 2.71828), i is the imaginary unit (i^2 = -1), x is a real number, and cos(x) and sin(x) are the trigonometric functions cosine and sine, respectively.\n\nEuler's formula demonstrates that complex exponentials can be expressed in terms of trigonometric functions, and vice versa. This relationship is particularly useful in various fields of mathematics, physics, and engineering, as it simplifies calculations involving complex numbers and trigonometric functions.\n\nOne of the most famous consequences of Euler's formula is Euler's identity, which is obtained by setting x = \u03c0 in the formula:\n\ne^(i\u03c0) + 1 = 0\n\nEuler's identity is considered one of the most beautiful equations in mathematics, as it combines five fundamental constants (e, i, \u03c0, 1, and 0) in a simple and elegant relationship.",
+     "difficulty": "Easy",
+     "remark": "Euler's formula is widely used in various fields, including engineering, physics, and computer science.",
+     "subfield": "Complex Analysis"
+   },
+   {
+     "theorem": "Laws of Exponents",
+     "description": "The laws of exponents simplify the multiplication and division operations.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Algebra"
+   },
+   {
+     "theorem": "One-to-one function",
+     "description": "a function for which each value of the output is associated with a unique input value",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Functions"
+   },
+   {
+     "theorem": "Inverse function",
+     "description": "For any one-to-one function f(x), the inverse is a function f^(-1)(x) such that f^(-1)(f(x))=x for all x in the domain of f; this also implies that f(f^(-1)(x))=x for all x in the domain of f^(-1)",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Functions"
+   },
+   {
+     "theorem": "Remainder theorem",
+     "description": "The remainder theorem states that when a polynomial p(x) is divided by a linear polynomial (x - a), then the remainder is equal to p(a).",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Algebra"
+   },
+   {
+     "theorem": "Rational Zero Theorem",
+     "description": "The rational root theorem is also known as the rational zero theorem (or) the rational zero test (or) rational test theorem and is used to determine the rational roots of a polynomial function.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Algebra"
+   },
+   {
+     "theorem": "Product-to-sum formula",
+     "description": "The product-to-sum formulas are a set of formulas from trigonometric formulas.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Geometry"
+   },
+   {
+     "theorem": "Heron's formula",
+     "description": "Heron's formula is a formula that is used to find the area of a triangle when the lengths of all three sides are known.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Geometry"
+   },
+   {
+     "theorem": "De Moivre's Theorem",
+     "description": "Formula used to find the nth power or nth roots of a complex number; states that, for a positive integer n, z^n is found by raising the modulus to the nth power and multiplying the angles by n",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Complex Analysis"
+   },
+   {
+     "theorem": "Cramer's Rule",
+     "description": "a method for solving systems of equations that have the same number of equations as variables using determinants",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Algebra"
+   },
+   {
+     "theorem": "Angle of rotation",
+     "description": "An angle of rotation is the measure of the amount that a figure is rotated about a fixed point called a point of rotation.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Geometry"
+   },
+   {
+     "theorem": "Similar Triangles Theorem",
+     "description": "Two triangles are similar if their corresponding angles are equal and their corresponding sides are proportional.",
+     "difficulty": "Easy",
+     "remark": "",
+     "subfield": "Geometry"
+   },
+   {
101
+ "theorem": "Congruent Triangles Theorem",
102
+ "description": "Two triangles are congruent if they satisfy any of these criteria: SSS (Side-Side-Side), SAS (Side-Angle-Side), ASA (Angle-Side-Angle), AAS (Angle-Angle-Side), or HL (Hypotenuse-Leg) for right triangles.",
103
+ "difficulty": "Easy",
104
+ "remark": "",
105
+ "subfield": "Geometry"
106
+ },
107
+ {
108
+ "theorem": "Geometric Sequence",
109
+ "description": "For a geometric sequence with the first term a, common ratio r, and n terms, the sum is: S_n = a * (1 - r^n) / (1 - r) for r != 1",
110
+ "difficulty": "Easy",
111
+ "remark": "",
112
+ "subfield": "Sequences and Series"
113
+ },
114
+ {
115
+ "theorem": "Arithmetic Sequence",
116
+ "description": "For an arithmetic sequence with the first term a, common difference d, and n terms, the sum is: S_n = (n/2) * (2a + (n-1)d)",
117
+ "difficulty": "Easy",
118
+ "remark": "",
119
+ "subfield": "Sequences and Series"
120
+ },
121
+ {
122
+ "theorem": "Permutation",
123
+ "description": "The term permutation refers to a mathematical calculation of the number of ways a particular set can be arranged.",
124
+ "difficulty": "Easy",
125
+ "remark": "",
126
+ "subfield": "Combinatorics"
127
+ },
128
+ {
129
+ "theorem": "Directrix",
130
+ "description": "a line perpendicular to the axis of symmetry of a parabola; a line such that the ratio of the distance between the points on the conic and the focus to the distance to the directrix is constant.",
131
+ "difficulty": "Easy",
132
+ "remark": "",
133
+ "subfield": "Conic Sections"
134
+ },
135
+ {
136
+ "theorem": "Eccentricity",
137
+ "description": "the eccentricity of a conic section is a non-negative real number that uniquely characterizes its shape.",
138
+ "difficulty": "Easy",
139
+ "remark": "",
140
+ "subfield": "Conic Sections"
141
+ }
142
+ ]
data/thb_easy/physics.json ADDED
@@ -0,0 +1,142 @@
1
+ [
2
+ {
3
+ "theorem": "Ohm's Law",
4
+ "description": "The voltage (V) across a conductor is directly proportional to the current (I) flowing through it, given the resistance (R) remains constant. The formula is V = IR. This law holds for many materials, particularly metals, and components like resistors.",
5
+ "difficulty": "Easy",
6
+ "remark": "A cornerstone of circuit analysis. While it is an approximation, it's incredibly useful in solving basic circuit problems. The 'resistance' is a macroscopic property representing the ease of electron movement.",
7
+ "subfield": "Electricity and Circuits"
8
+ },
9
+ {
10
+ "theorem": "Newton's First Law of Motion",
11
+ "description": "a body at rest remains at rest, or, if in motion, remains in motion at a constant velocity unless acted on by a net external force; also known as the law of inertia",
12
+ "difficulty": "Easy",
13
+ "remark": "This law is fundamental to understanding the relationship between force and motion. It establishes that forces cause acceleration which changes velocity. Applicable for solving motion problems where force and mass are known.",
14
+ "subfield": "Classical Mechanics"
15
+ },
16
+ {
17
+ "theorem": "Newton's Second Law of Motion",
18
+ "description": "The net force (F_net) acting on an object is equal to the mass (m) of the object multiplied by its acceleration (a). F_net = ma. This law is fundamental to understanding the relationship between force and motion.",
19
+ "difficulty": "Easy",
20
+ "remark": "This is one of the most important laws in classical mechanics. It establishes that forces cause acceleration which changes velocity. Applicable for solving motion problems where force and mass are known.",
21
+ "subfield": "Classical Mechanics"
22
+ },
23
+ {
24
+ "theorem": "Hooke's law",
25
+ "description": "In physics, Hooke's law is an empirical law which states that the force needed to extend or compress a spring by some distance scales linearly with respect to that distance.",
26
+ "difficulty": "Easy",
27
+ "remark": "This law is fundamental to understanding the relationship between force and motion. It establishes that forces cause acceleration which changes velocity. Applicable for solving motion problems where force and mass are known.",
28
+ "subfield": "Classical Mechanics"
29
+ },
30
+ {
31
+ "theorem": "Gravitational Force",
32
+ "description": "In physics, gravity is a fundamental interaction primarily observed as mutual attraction between all things that have mass.",
33
+ "difficulty": "Easy",
34
+ "remark": "",
35
+ "subfield": "Classical Mechanics"
36
+ },
37
+ {
38
+ "theorem": "Centrifugal force",
39
+ "description": "Centrifugal force is a fictitious force in Newtonian mechanics that appears to act on all objects when viewed in a rotating frame of reference. It appears to be directed radially away from the axis of rotation of the frame.",
40
+ "difficulty": "Easy",
41
+ "remark": "",
42
+ "subfield": "Classical Mechanics"
43
+ },
44
+ {
45
+ "theorem": "Kinetic energy",
46
+ "description": "In physics, the kinetic energy of an object is the form of energy that it possesses due to its motion. In classical mechanics, the kinetic energy of a non-rotating object of mass m traveling at a speed v is.",
47
+ "difficulty": "Easy",
48
+ "remark": "",
49
+ "subfield": "Classical Mechanics"
50
+ },
51
+ {
52
+ "theorem": "Torque",
53
+ "description": "Torque is a measure of the force that can cause an object to rotate about an axis. Just as force is what causes an object to accelerate in linear kinematics, torque is what causes an object to acquire angular acceleration. Torque is a vector quantity.",
54
+ "difficulty": "Easy",
55
+ "remark": "",
56
+ "subfield": "Classical Mechanics"
57
+ },
58
+ {
59
+ "theorem": "Right-hand rule",
60
+ "description": "The right hand rule is a hand mnemonic used in physics to identify the direction of axes or parameters that point in three dimensions.",
61
+ "difficulty": "Easy",
62
+ "remark": "",
63
+ "subfield": "Electromagnetism"
64
+ },
65
+ {
66
+ "theorem": "Snell's Law",
67
+ "description": "Relates the angles of incidence and refraction of light when passing between two different media. It states that n₁sin(θ₁) = n₂sin(θ₂), where n₁ and n₂ are the refractive indices of the two media, and θ₁ and θ₂ are the angles of incidence and refraction, respectively.",
68
+ "difficulty": "Easy",
69
+ "remark": "This theorem is fundamental to understanding how light bends when it travels through different materials, essential for studying optics (lenses, prisms). Its application involves using trigonometry.",
70
+ "subfield": "Optics"
71
+ },
72
+ {
73
+ "theorem": "The Ideal Gas Law",
74
+ "description": "Relates the pressure (P), volume (V), temperature (T), and the number of moles (n) of an ideal gas: PV = nRT, where R is the ideal gas constant. It serves as a good approximation for the behavior of real gases under certain conditions.",
75
+ "difficulty": "Easy",
76
+ "remark": "Connects macroscopic gas properties and allows calculations involving gas behavior under varied conditions. Applicable for thermodynamics problems and understanding gas pressure, volume and temperature relationship.",
77
+ "subfield": "Thermodynamics"
78
+ },
79
+ {
80
+ "theorem": "Pascal's Principle",
81
+ "description": "Pascal's law is a principle in fluid mechanics given by Blaise Pascal that states that a pressure change at any point in a confined incompressible fluid is transmitted throughout the fluid such that the same change occurs everywhere.",
82
+ "difficulty": "Easy",
83
+ "remark": "",
84
+ "subfield": "Fluid Mechanics"
85
+ },
86
+ {
87
+ "theorem": "Avogadro's number",
88
+ "description": "The concept of the mole can be used to convert between mass and number of particles.",
89
+ "difficulty": "Easy",
90
+ "remark": "",
91
+ "subfield": "Thermodynamics"
92
+ },
93
+ {
94
+ "theorem": "Dalton's law of partial pressures",
95
+ "description": "Dalton's law of partial pressures states that the total pressure of a mixture of gases is the sum of the partial pressures of its components.",
96
+ "difficulty": "Easy",
97
+ "remark": "",
98
+ "subfield": "Thermodynamics"
99
+ },
100
+ {
101
+ "theorem": "PV diagram",
102
+ "description": "a graph of pressure vs. volume",
103
+ "difficulty": "Easy",
104
+ "remark": "",
105
+ "subfield": "Thermodynamics"
106
+ },
107
+ {
108
+ "theorem": "Color wavelengths",
109
+ "description": "The wavelength of a color is the range of nanometers (nm) at which it appears in the visible light spectrum.",
110
+ "difficulty": "Easy",
111
+ "remark": "",
112
+ "subfield": "Optics"
113
+ },
114
+ {
115
+ "theorem": "Ultrasound",
116
+ "description": "Ultrasound refers to sound waves with frequencies higher than the audible range for humans.",
117
+ "difficulty": "Easy",
118
+ "remark": "",
119
+ "subfield": "Waves and Sound"
120
+ },
121
+ {
122
+ "theorem": "Coulomb's law",
123
+ "description": "Coulomb's inverse-square law, or simply Coulomb's law, is an experimental law of physics that calculates the amount of force between two electrically charged particles at rest. This electric force is conventionally called the electrostatic force or Coulomb force.",
124
+ "difficulty": "Easy",
125
+ "remark": "",
126
+ "subfield": "Electromagnetism"
127
+ },
128
+ {
129
+ "theorem": "Kirchhoff's voltage law",
130
+ "description": "The sum of all the voltages around a loop is equal to zero.",
131
+ "difficulty": "Easy",
132
+ "remark": "",
133
+ "subfield": "Electricity and Circuits"
134
+ },
135
+ {
136
+ "theorem": "Thévenin's theorem",
137
+ "description": "Thévenin's theorem states that any linear circuit containing several voltage sources and resistors can be simplified to a Thévenin-equivalent circuit with a single voltage source and resistance connected in series with a load.",
138
+ "difficulty": "Easy",
139
+ "remark": "",
140
+ "subfield": "Electricity and Circuits"
141
+ }
142
+ ]
data/thb_hard/chemistry.json ADDED
@@ -0,0 +1,142 @@
1
+ [
2
+ {
3
+ "theorem": "The Henderson-Hasselbalch Equation",
4
+ "description": "The pH of a buffer solution is equal to the pKa of the weak acid plus the logarithm of the ratio of the concentration of the conjugate base to the concentration of the weak acid: pH = pKa + log([A-]/[HA]). It allows for the calculation of buffer solutions pH and predicting how pH would change with addition of acid or base",
5
+ "difficulty": "Hard",
6
+ "remark": "Crucial in understanding buffer solutions and titrations. Used in biochemistry extensively.",
7
+ "subfield": "Acid-Base Chemistry"
8
+ },
9
+ {
10
+ "theorem": "Bragg's law",
11
+ "description": "Bragg's law in chemistry describes how X-rays reflect off of a crystal surface.",
12
+ "difficulty": "Hard",
13
+ "remark": "",
14
+ "subfield": "Crystallography"
15
+ },
16
+ {
17
+ "theorem": "Debye-Scherrer Equation",
18
+ "description": "The Debye-Scherrer equation is used in chemistry to calculate the size of crystalline nanoparticles. It is based on X-ray diffraction (XRD) measurements.",
19
+ "difficulty": "Hard",
20
+ "remark": "",
21
+ "subfield": "Crystallography"
22
+ },
23
+ {
24
+ "theorem": "Hückel's Rule",
25
+ "description": "In organic chemistry, Hückel's rule predicts that a planar ring molecule will have aromatic properties if it has 4n + 2 π-electrons, where n is a non-negative integer.",
26
+ "difficulty": "Hard",
27
+ "remark": "",
28
+ "subfield": "Organic Chemistry"
29
+ },
30
+ {
31
+ "theorem": "Hard Acid Soft Base Theory",
32
+ "description": "Hard Acid Soft Base Theory (HSAB): This theory works on the principle that soft acid reacts with the soft base while hard acid reacts with the hard base",
33
+ "difficulty": "Hard",
34
+ "remark": "",
35
+ "subfield": "Acid-Base Chemistry"
36
+ },
37
+ {
38
+ "theorem": "Pauli Exclusion Principle",
39
+ "description": "Pauli's Exclusion Principle states that no two electrons in the same atom can have identical values for all four of their quantum numbers.",
40
+ "difficulty": "Hard",
41
+ "remark": "",
42
+ "subfield": "Quantum Chemistry"
43
+ },
44
+ {
45
+ "theorem": "Crystal Field Theory",
46
+ "description": "Crystal field theory (CFT) describes the breaking of orbital degeneracy in transition metal complexes due to the presence of ligands.",
47
+ "difficulty": "Hard",
48
+ "remark": "",
49
+ "subfield": "Inorganic Chemistry"
50
+ },
51
+ {
52
+ "theorem": "Hohenberg-Kohn theorem",
53
+ "description": "The first Hohenberg–Kohn theorem states that 'the ground state of any interacting many particle system with a given fixed inter-particle interaction is a unique functional of the electron density n(r).",
54
+ "difficulty": "Hard",
55
+ "remark": "",
56
+ "subfield": "Quantum Chemistry"
57
+ },
58
+ {
59
+ "theorem": "Frost–Ebsworth diagram",
60
+ "description": "A Frost diagram or Frost–Ebsworth diagram is a type of graph used by inorganic chemists in electrochemistry to illustrate the relative stability of a number of different oxidation states of a particular substance. The graph illustrates the free energy vs oxidation state of a chemical species.",
61
+ "difficulty": "Hard",
62
+ "remark": "",
63
+ "subfield": "Electrochemistry"
64
+ },
65
+ {
66
+ "theorem": "Coulson-Fischer Theorem",
67
+ "description": "In theoretical chemistry and molecular physics, Coulson–Fischer theory provides a quantum mechanical description of the electronic structure of molecules.",
68
+ "difficulty": "Hard",
69
+ "remark": "",
70
+ "subfield": "Quantum Chemistry"
71
+ },
72
+ {
73
+ "theorem": "Frank-Condon Principle",
74
+ "description": "The Franck-Condon Principle describes the intensities of vibronic transitions, or the absorption or emission of a photon.",
75
+ "difficulty": "Hard",
76
+ "remark": "",
77
+ "subfield": "Spectroscopy"
78
+ },
79
+ {
80
+ "theorem": "Nernst Equation",
81
+ "description": "The Nernst Equation enables the determination of cell potential under non-standard conditions.",
82
+ "difficulty": "Hard",
83
+ "remark": "",
84
+ "subfield": "Electrochemistry"
85
+ },
86
+ {
87
+ "theorem": "Slater's Rules",
88
+ "description": "The general principle behind Slater's Rule is that the actual charge felt by an electron is equal to what you'd expect the charge to be from a certain number of protons, but minus a certain amount of charge from other electrons.",
89
+ "difficulty": "Hard",
90
+ "remark": "",
91
+ "subfield": "Quantum Chemistry"
92
+ },
93
+ {
94
+ "theorem": "Langmuir Adsorption Isotherm",
95
+ "description": "A continuous monolayer of adsorbate molecules surrounding a homogeneous solid surface is the conceptual basis for this adsorption model.",
96
+ "difficulty": "Hard",
97
+ "remark": "",
98
+ "subfield": "Physical Chemistry"
99
+ },
100
+ {
101
+ "theorem": "Marcus Theory",
102
+ "description": "Marcus theory is a theory originally developed by Rudolph A. Marcus, starting in 1956, to explain the rates of electron transfer reactions.",
103
+ "difficulty": "Hard",
104
+ "remark": "",
105
+ "subfield": "Physical Chemistry"
106
+ },
107
+ {
108
+ "theorem": "Eyring Equation",
109
+ "description": "The Eyring equation is an equation used in chemical kinetics to describe changes in the rate of a chemical reaction against temperature.",
110
+ "difficulty": "Hard",
111
+ "remark": "",
112
+ "subfield": "Chemical Kinetics"
113
+ },
114
+ {
115
+ "theorem": "Woodward-Hoffmann Rules",
116
+ "description": "Robert Burns Woodward and Roald Hoffmann devised these set of rules to explain the stereochemistry of pericyclic reactions based on the orbital symmetry.",
117
+ "difficulty": "Hard",
118
+ "remark": "",
119
+ "subfield": "Organic Chemistry"
120
+ },
121
+ {
122
+ "theorem": "Born-Haber Cycle",
123
+ "description": "A Born–Haber cycle applies Hess's law to calculate the lattice enthalpy by comparing the standard enthalpy change of formation of the ionic compound (from the elements) to the enthalpy required to make gaseous ions from the elements. This lattice calculation is complex.",
124
+ "difficulty": "Hard",
125
+ "remark": "",
126
+ "subfield": "Thermodynamics"
127
+ },
128
+ {
129
+ "theorem": "Molecular Orbital Theory",
130
+ "description": "In chemistry, molecular orbital theory is a method for describing the electronic structure of molecules using quantum mechanics.",
131
+ "difficulty": "Hard",
132
+ "remark": "",
133
+ "subfield": "Quantum Chemistry"
134
+ },
135
+ {
136
+ "theorem": "Hammond Postulate",
137
+ "description": "The postulate, which George Hammond first proposed in 1955, states that if two states, such as a transition state and an unstable intermediate, occur consecutively during a reaction process and have nearly the same energy content, their interconversion will result in only a minor reorganisation of molecular structures.",
138
+ "difficulty": "Hard",
139
+ "remark": "",
140
+ "subfield": "Physical Chemistry"
141
+ }
142
+ ]
data/thb_hard/comp_sci.json ADDED
@@ -0,0 +1,142 @@
1
+ [
2
+ {
3
+ "theorem": "Evidence lower bound",
4
+ "description": "The evidence lower bound (ELBO) is a lower bound on the log-evidence of a model, which is a measure of how well the model fits the data.",
5
+ "difficulty": "Hard",
6
+ "remark": "",
7
+ "subfield": "Machine Learning"
8
+ },
9
+ {
10
+ "theorem": "Viterbi Algorithm",
11
+ "description": "The Viterbi Algorithm is a dynamic programming algorithm used for finding the most likely sequence of hidden states, known as the Viterbi path, in a Hidden Markov Model (HMM). It is named after its inventor, Andrew Viterbi, and is widely used in various applications such as speech recognition, natural language processing, and bioinformatics.\n\nA Hidden Markov Model (HMM) is a statistical model that represents a stochastic process involving a sequence of observable events and hidden states. In an HMM, the observable events are generated by the hidden states, which follow a Markov chain. The Markov chain is characterized by the transition probabilities between hidden states, and the emission probabilities of observable events given the hidden states.\n\nThe Viterbi Algorithm works by finding the most probable path of hidden states that generates the observed sequence of events. It does this by iteratively computing the maximum probability of reaching each state at each time step, considering all possible paths that lead to that state. The algorithm uses dynamic programming to efficiently compute these probabilities and store them in a trellis structure.\n\nHere's a high-level description of the Viterbi Algorithm:\n\n1. Initialization: Set the initial probabilities for each hidden state, considering the initial state probabilities and the emission probabilities for the first observed event.\n\n2. Recursion: For each subsequent observed event, compute the maximum probability of reaching each hidden state, considering all possible previous states and their transition probabilities. Update the emission probabilities for the current observed event.\n\n3. Termination: Identify the hidden state with the highest probability at the last time step.\n\n4. Traceback: Starting from the identified state in the termination step, backtrack through the trellis to find the most probable path of hidden states that generated the observed sequence.\n\nThe Viterbi Algorithm is an efficient and widely used method for decoding the hidden states in a Hidden Markov Model, providing valuable insights into the underlying structure of the stochastic process.",
12
+ "difficulty": "Hard",
13
+ "remark": "",
14
+ "subfield": "Dynamic Programming"
15
+ },
16
+ {
17
+ "theorem": "Fano's inequality",
18
+ "description": "In information theory, Fano's inequality relates the average information lost in a noisy channel to the probability of the categorization error.",
19
+ "difficulty": "Hard",
20
+ "remark": "",
21
+ "subfield": "Information Theory"
22
+ },
23
+ {
24
+ "theorem": "Message Passing algorithm",
25
+ "description": "Message passing algorithm is an iterative decoding algorithm factorizes the global function of many variables into product of simpler local functions, whose arguments are the subset of variables.",
26
+ "difficulty": "Hard",
27
+ "remark": "",
28
+ "subfield": "Machine Learning"
29
+ },
30
+ {
31
+ "theorem": "Maximal Planar Graph",
32
+ "description": "A maximal planar graph is a graph which can be embedded in the plane such that every face of the graph is a triangle.",
33
+ "difficulty": "Hard",
34
+ "remark": "",
35
+ "subfield": "Graph Theory"
36
+ },
37
+ {
38
+ "theorem": "Cayley's formula",
39
+ "description": "This formula tells how many trees can be constructed with N vertices.",
40
+ "difficulty": "Hard",
41
+ "remark": "",
42
+ "subfield": "Graph Theory"
43
+ },
44
+ {
45
+ "theorem": "Floyd's Cycle Finding Algorithm",
46
+ "description": "Also known as the tortoise and the hare algorithm, it is a pointer algorithm that uses two pointers which move at different speeds to find a cycle in a sequence.",
47
+ "difficulty": "Hard",
48
+ "remark": "",
49
+ "subfield": "Algorithms"
50
+ },
51
+ {
52
+ "theorem": "Sigma-Delta Modulation",
53
+ "description": "A sigma delta modulator converts this shunt voltage across the resistor, into high-frequency one-bit digital bitstream using oversampling and noise shaping.",
54
+ "difficulty": "Hard",
55
+ "remark": "",
56
+ "subfield": "Digital Signal Processing"
57
+ },
58
+ {
59
+ "theorem": "Kruskal's algorithm",
60
+ "description": "greedy algorithm that sorts the list of edges in the graph by weight.",
61
+ "difficulty": "Hard",
62
+ "remark": "A fundamental algorithm in graph theory. It's used in network design, spanning tree construction, and various optimization problems. Requires understanding of graph theory and greedy algorithms.",
63
+ "subfield": "Graph Theory"
64
+ },
65
+ {
66
+ "theorem": "Prim's algorithm",
67
+ "description": "greedy algorithm that maintains a priority queue of vertices in the graph ordered by connecting edge weight",
68
+ "difficulty": "Hard",
69
+ "remark": "",
70
+ "subfield": "Graph Theory"
71
+ },
72
+ {
73
+ "theorem": "Region growing by pixel aggregation",
74
+ "description": "Region growing by pixel aggregation is a technique used in image processing to segment an image into regions based on the similarity of pixel values.",
75
+ "difficulty": "Hard",
76
+ "remark": "",
77
+ "subfield": "Image Processing"
78
+ },
79
+ {
80
+ "theorem": "Arithmetic coding",
81
+ "description": "Arithmetic coding is a lossless data compression technique that assigns a unique code to each symbol in a message based on its probability of occurrence.",
82
+ "difficulty": "Hard",
83
+ "remark": "",
84
+ "subfield": "Data Compression"
85
+ },
86
+ {
87
+ "theorem": "Expectation–maximization (EM) algorithm",
88
+ "description": "an expectation–maximization (EM) algorithm is an iterative method to find (local) maximum likelihood or maximum a posteriori (MAP) estimates of parameters in statistical models, where the model depends on unobserved latent variables.",
89
+ "difficulty": "Hard",
90
+ "remark": "",
91
+ "subfield": "Machine Learning"
92
+ },
93
+ {
94
+ "theorem": "Differential entropy",
95
+ "description": "Differential entropy, also known as continuous entropy, is a concept in information theory that extends the idea of entropy from discrete random variables to continuous random variables. Entropy, in general, is a measure of the uncertainty or randomness associated with a random variable. In the context of information theory, it quantifies the average amount of information required to describe the outcome of a random variable.\n\nFor discrete random variables, entropy is well-defined using the Shannon entropy formula, which sums the product of the probability of each outcome and the logarithm of its reciprocal probability. However, for continuous random variables, the probability of any specific outcome is zero, making the Shannon entropy formula inapplicable.\n\nDifferential entropy addresses this issue by considering the probability density function (pdf) of a continuous random variable instead of the probabilities of individual outcomes. The differential entropy H(X) of a continuous random variable X with a probability density function f(x) is defined as:\n\nH(X) = - \u222b f(x) * log(f(x)) dx\n\nwhere the integral is taken over the entire range of the random variable X, and log is the logarithm base 2 (or any other base, depending on the desired unit of measurement for entropy).\n\nDifferential entropy can be interpreted as the average amount of information required to describe the outcome of a continuous random variable with a given probability density function. However, unlike the entropy of discrete random variables, differential entropy can be negative, which occurs when the probability density function is highly concentrated around certain values.\n\nIt is important to note that differential entropy is not a direct extension of discrete entropy, and some properties of discrete entropy do not hold for differential entropy. For example, differential entropy is not invariant under changes of variables or coordinate transformations, whereas discrete entropy is invariant under permutations of the outcomes.",
96
+ "difficulty": "Hard",
97
+ "remark": "",
98
+ "subfield": "Information Theory"
99
+ },
100
+ {
101
+ "theorem": "Kullback–Leibler divergence",
102
+ "description": "a type of statistical distance: a measure of how much a model probability distribution Q is different from a true probability distribution P.",
103
+ "difficulty": "Hard",
104
+ "remark": "",
105
+ "subfield": "Information Theory"
106
+ },
107
+ {
108
+ "theorem": "Principal component analysis",
109
+ "description": "Principal component analysis (PCA) is a statistical method that reduces the dimensions of a dataset to a smaller set of components.",
110
+ "difficulty": "Hard",
111
+ "remark": "",
112
+ "subfield": "Machine Learning"
113
+ },
114
+ {
115
+ "theorem": "Self-attention",
116
+ "description": "Self-attention is a mechanism in neural networks that allows the model to focus on different parts of the input sequence when making predictions.",
117
+ "difficulty": "Hard",
118
+ "remark": "",
119
+ "subfield": "Machine Learning"
120
+ },
121
+ {
122
+ "theorem": "Adversarial training",
123
+ "description": "Adversarial Training is a machine learning technique that is primarily used for improving the robustness of models. It's a process where models are trained with malicious inputs (adversarial examples) alongside the genuine data.",
124
+ "difficulty": "Hard",
125
+ "remark": "",
126
+ "subfield": "Machine Learning"
127
+ },
128
+ {
129
+ "theorem": "Forward-Backward Algorithm",
130
+ "description": "The Forward-Backward Algorithm is a dynamic programming algorithm used in Hidden Markov Models (HMMs) to compute the posterior probabilities of hidden states given a sequence of observations. It is a stochastic process that combines both the forward and backward algorithms to efficiently compute these probabilities.\n\nThe algorithm consists of two main steps:\n\n1. Forward Algorithm:\nThe forward algorithm computes the probability of observing a particular sequence of observations up to a certain time step, given the hidden state at that time step. It calculates the forward probabilities, which are the joint probabilities of the observed sequence and the hidden state at each time step. The forward algorithm uses a recursive approach, where the forward probability at each time step is calculated based on the forward probabilities of the previous time step.\n\n2. Backward Algorithm:\nThe backward algorithm computes the probability of observing the remaining sequence of observations from a certain time step onwards, given the hidden state at that time step. It calculates the backward probabilities, which are the conditional probabilities of the future observations given the hidden state at each time step. Similar to the forward algorithm, the backward algorithm also uses a recursive approach, where the backward probability at each time step is calculated based on the backward probabilities of the next time step.\n\nAfter computing the forward and backward probabilities, the Forward-Backward Algorithm combines these probabilities to calculate the posterior probabilities of the hidden states at each time step. The posterior probability of a hidden state at a particular time step is the probability of that state given the entire sequence of observations. This is computed by multiplying the forward probability and the backward probability for that state at that time step and then normalizing the result.\n\nThe Forward-Backward Algorithm is widely used in various applications, such as speech recognition, natural language processing, and bioinformatics, where the goal is to infer the most likely sequence of hidden states given a sequence of observations.",
131
+ "difficulty": "Hard",
132
+ "remark": "",
133
+ "subfield": "Dynamic Programming"
134
+ },
135
+ {
136
+ "theorem": "Cook-Levin Theorem",
137
+ "description": "In computational complexity theory, the Cook–Levin theorem, also known as Cook's theorem, states that the Boolean satisfiability problem is NP-complete.",
138
+ "difficulty": "Hard",
139
+ "remark": "",
140
+ "subfield": "Computational Complexity"
141
+ }
142
+ ]
data/thb_hard/math.json ADDED
@@ -0,0 +1,142 @@
1
+ [
2
+ {
3
+ "theorem": "Taylor's theorem",
4
+ "description": "Taylor's theorem gives an approximation of a k-times differentiable function around a given point by a polynomial of degree k, called the k-th order Taylor polynomial.",
5
+ "difficulty": "Hard",
6
+ "remark": "",
7
+ "subfield": "Calculus"
8
+ },
9
+ {
10
+ "theorem": "Simpson's rule",
11
+ "description": "In numerical integration, Simpson's rules are several approximations for definite integrals, named after Thomas Simpson.",
12
+ "difficulty": "Hard",
13
+ "remark": "",
14
+ "subfield": "Numerical Analysis"
15
+ },
16
+ {
17
+ "theorem": "Velocity vector",
18
+ "description": "Velocity is the speed in combination with the direction of motion of an object.",
19
+ "difficulty": "Hard",
20
+ "remark": "",
21
+ "subfield": "Vector Calculus"
22
+ },
23
+ {
24
+ "theorem": "Double Riemann sum",
25
+ "description": "A double Riemann sum is a mathematical method used to approximate the value of a double integral over a two-dimensional region.",
26
+ "difficulty": "Hard",
27
+ "remark": "",
28
+ "subfield": "Multivariable Calculus"
29
+ },
30
+ {
31
+ "theorem": "Fubini's theorem",
32
+ "description": "Fubini's Theorem is a fundamental result in calculus that allows the evaluation of a double integral as an iterated integral, provided certain conditions are met. It simplifies the computation of double integrals over a rectangular or general region by breaking them into two single integrals.",
33
+ "difficulty": "Hard",
34
+ "remark": "",
35
+ "subfield": "Multivariable Calculus"
36
+ },
37
+ {
38
+ "theorem": "Jacobian matrix and determinant",
39
+ "description": "In vector calculus, the Jacobian matrix of a vector-valued function of several variables is the matrix of all its first-order partial derivatives.",
40
+ "difficulty": "Hard",
41
+ "remark": "",
42
+ "subfield": "Vector Calculus"
43
+ },
44
+ {
45
+ "theorem": "Green's theorem",
46
+ "description": "Green's theorem is used to integrate the derivatives in a particular plane.",
47
+ "difficulty": "Hard",
48
+ "remark": "",
49
+ "subfield": "Vector Calculus"
50
+ },
51
+ {
52
+ "theorem": "Stokes' theorem",
53
+ "description": "relates the flux integral over a surface S to a line integral around the boundary C of the surface S",
54
+ "difficulty": "Hard",
55
+ "remark": "",
56
+ "subfield": "Vector Calculus"
57
+ },
58
+ {
59
+ "theorem": "Burnside's Lemma",
60
+ "description": "Burnside's Lemma, also known as the Cauchy-Frobenius Lemma or the Orbit-Counting Theorem, is a fundamental result in combinatorics that deals with counting the number of distinct elements in a set under the action of a group. It is particularly useful in counting problems involving symmetries and permutations.\n\nThe lemma is named after the British mathematician William Burnside, who contributed significantly to the development of group theory.\n\nStatement of Burnside's Lemma:\n\nLet G be a finite group that acts on a finite set X. Then the number of distinct orbits of X under the action of G is given by:\n\n(1/|G|) * \u03a3 |Fix(g)|\n\nwhere |G| is the order of the group (i.e., the number of elements in G), the sum is taken over all elements g in G, and |Fix(g)| is the number of elements in X that are fixed by the action of g (i.e., the number of elements x in X such that g(x) = x).\n\nIn simpler terms, Burnside's Lemma states that the number of distinct orbits (or equivalence classes) in a set under the action of a group can be found by averaging the number of fixed points of each group element.\n\nBurnside's Lemma is often used in combinatorial problems where we need to count the number of distinct configurations of an object, taking into account its symmetries. By applying the lemma, we can avoid overcounting configurations that are equivalent under a given symmetry operation.",
61
+ "difficulty": "Hard",
62
+ "remark": "",
63
+ "subfield": "Group Theory"
64
+ },
65
+ {
66
+ "theorem": "Lah Number",
67
+ "description": "In mathematics, the (signed and unsigned) Lah numbers are coefficients expressing rising factorials in terms of falling factorials and vice versa.",
68
+ "difficulty": "Hard",
69
+ "remark": "",
70
+ "subfield": "Combinatorics"
71
+ },
72
+ {
73
+ "theorem": "Ramsey's theorem",
74
+ "description": "Ramsey's theorem essentially states that if a structure (such as a graph or a set of numbers) is large enough, then some kind of order or regularity will always emerge, no matter how it is arranged or colored.",
75
+ "difficulty": "Hard",
76
+ "remark": "",
77
+ "subfield": "Combinatorics"
78
+ },
79
+ {
80
+ "theorem": "Schwarz Lemma theorem",
81
+ "description": "Schwarz Lemma is a fundamental result in complex analysis that provides a bound on the behavior of holomorphic functions (i.e., complex-differentiable functions) in the unit disk. It is named after the German mathematician Hermann Schwarz.\n\nStatement of Schwarz Lemma:\n\nLet f be a holomorphic function on the open unit disk D = {z \u2208 \u2102 : |z| < 1} such that f(0) = 0 and |f(z)| \u2264 1 for all z \u2208 D. Then, for all z \u2208 D, the following inequalities hold:\n\n1. |f(z)| \u2264 |z|\n2. |f'(0)| \u2264 1\n\nMoreover, if equality holds for some z \u2260 0 (i.e., |f(z)| = |z|) or |f'(0)| = 1, then f is a rotation, i.e., f(z) = e^(i\u03b8)z for some real \u03b8.\n\nThe Schwarz Lemma has several important consequences and generalizations in complex analysis, such as the Riemann Mapping Theorem and the Pick's Lemma. It is a powerful tool for understanding the behavior of holomorphic functions in the unit disk and provides a way to compare the size of their derivatives at the origin.",
82
+ "difficulty": "Hard",
83
+ "remark": "",
84
+ "subfield": "Complex Analysis"
85
+ },
86
+ {
87
+ "theorem": "Cauchy Riemann Theorem",
88
+ "description": "The Cauchy-Riemann Theorem is a fundamental result in complex analysis, a branch of mathematics that studies functions of complex variables. It provides necessary and sufficient conditions for a complex function to be holomorphic (complex differentiable) in a given domain.",
89
+ "difficulty": "Hard",
90
+ "remark": "",
91
+ "subfield": "Complex Analysis"
92
+ },
93
+ {
94
+ "theorem": "Morera's Theorem",
95
+ "description": "Morera's theorem, named after Giacinto Morera, gives an important criterion for proving that a function is holomorphic.",
96
+ "difficulty": "Hard",
97
+ "remark": "",
98
+ "subfield": "Complex Analysis"
99
+ },
100
+ {
101
+ "theorem": "Catalan-Mingantu Number",
102
+ "description": "The Catalan numbers are a sequence of natural numbers that occur in various counting problems, often involving recursively defined objects. ",
103
+ "difficulty": "Hard",
104
+ "remark": "",
105
+ "subfield": "Combinatorics"
106
+ },
107
+ {
108
+ "theorem": "Liouville's theorem",
109
+ "description": "Liouville's theorem states that: The density of states in an ensemble of many identical states with different initial conditions is constant along every trajectory in phase space. It states that if one constructs an ensemble of paths, the probability density along the trajectory remains constant.",
110
+ "difficulty": "Hard",
111
+ "remark": "",
112
+ "subfield": "Complex Analysis"
113
+ },
114
+ {
115
+ "theorem": "Derangement Formula",
116
+ "description": "In combinatorial mathematics, a derangement is a permutation of the elements of a set in which no element appears in its original position.",
117
+ "difficulty": "Hard",
118
+ "remark": "",
119
+ "subfield": "Combinatorics"
120
+ },
121
+ {
122
+ "theorem": "Delian problem",
123
+ "description": "Doubling the cube, also known as the Delian problem, is an ancient geometric problem. Given the edge of a cube, the problem requires the construction of the edge of a second cube whose volume is double that of the first.",
124
+ "difficulty": "Hard",
125
+ "remark": "",
126
+ "subfield": "Geometry"
127
+ },
128
+ {
129
+ "theorem": "Polya's Enumeration Theorem",
130
+ "description": "Pólya's Enumeration Theorem, also known as Pólya's Counting Theorem, is a powerful result in combinatorics used to count distinct arrangements or configurations of objects that are invariant under a group of symmetries.",
131
+ "difficulty": "Hard",
132
+ "remark": "",
133
+ "subfield": "Combinatorics"
134
+ },
135
+ {
136
+ "theorem": "Cauchy's theorem",
137
+ "description": "Cauchy's Theorem is a fundamental result in group theory, a branch of abstract algebra. It provides a condition under which a finite group contains an element of a specific order. It is named after the French mathematician Augustin-Louis Cauchy.",
138
+ "difficulty": "Hard",
139
+ "remark": "",
140
+ "subfield": "Group Theory"
141
+ }
142
+ ]
data/thb_hard/physics.json ADDED
@@ -0,0 +1,142 @@
1
+ [
2
+ {
3
+ "theorem": "Boltzmann machine",
4
+ "description": "It is a statistical physics technique applied in the context of cognitive science. It is also classified as a Markov random field.",
5
+ "difficulty": "Hard",
6
+ "remark": "",
7
+ "subfield": "Statistical Physics"
8
+ },
9
+ {
10
+ "theorem": "Geometric Brownian Motion",
11
+ "description": "A geometric Brownian motion (GBM) (also known as exponential Brownian motion) is a continuous-time stochastic process in which the logarithm of the randomly varying quantity follows a Brownian motion (also called a Wiener process) with drift.",
12
+ "difficulty": "Hard",
13
+ "remark": "",
14
+ "subfield": "Statistical Physics"
15
+ },
16
+ {
17
+ "theorem": "Fermat's Principle",
18
+ "description": "Fermat's principle states that light travels between two points along the path that requires the least time, as compared to other nearby paths.",
19
+ "difficulty": "Hard",
20
+ "remark": "",
21
+ "subfield": "Optics"
22
+ },
23
+ {
24
+ "theorem": "Huygens's Principle",
25
+ "description": "The Huygens–Fresnel principle states that every point on a wavefront is itself the source of spherical wavelets, and the secondary wavelets emanating from different points mutually interfere. The sum of these spherical wavelets forms a new wavefront.",
26
+ "difficulty": "Hard",
27
+ "remark": "",
28
+ "subfield": "Optics"
29
+ },
30
+ {
31
+ "theorem": "Virial Theorem",
32
+ "description": "In mechanics, the virial theorem provides a general equation that relates the average over time of the total kinetic energy of a stable system of discrete particles, bound by a conservative force, with that of the total potential energy of the system.",
33
+ "difficulty": "Hard",
34
+ "remark": "",
35
+ "subfield": "Classical Mechanics"
36
+ },
37
+ {
38
+ "theorem": "Poynting Theorem",
39
+ "description": "It states that in a given volume, the stored energy changes at a rate given by the work done on the charges within the volume, minus the rate at which energy leaves the volume.",
40
+ "difficulty": "Hard",
41
+ "remark": "",
42
+ "subfield": "Electromagnetism"
43
+ },
44
+ {
45
+ "theorem": "Fresnel transmission equations",
46
+ "description": "Fresnel's equations describe the reflection and transmission of electromagnetic waves at an interface.",
47
+ "difficulty": "Hard",
48
+ "remark": "",
49
+ "subfield": "Optics"
50
+ },
51
+ {
52
+ "theorem": "Fourier Heat Conduction Law",
53
+ "description": "Fourier's law states that the negative gradient of temperature and the time rate of heat transfer is proportional to the area at right angles of that gradient through which the heat flows.",
54
+ "difficulty": "Hard",
55
+ "remark": "",
56
+ "subfield": "Thermodynamics"
57
+ },
58
+ {
59
+ "theorem": "Ampère's circuital law",
60
+ "description": "Ampere's circuital law states that the line integral of the magnetic field surrounding closed-loop equals to the number of times the algebraic sum of currents passing through the loop.",
61
+ "difficulty": "Hard",
62
+ "remark": "",
63
+ "subfield": "Electromagnetism"
64
+ },
65
+ {
66
+ "theorem": "Malus's Law",
67
+ "description": "Malus law states that the intensity of a plane-polarised light that passes through an analyser is directly proportional to the square of the cosine of the angle between the plane of the polariser and the transmission axis of the analyser.",
68
+ "difficulty": "Hard",
69
+ "remark": "",
70
+ "subfield": "Optics"
71
+ },
72
+ {
73
+ "theorem": "Van der Waals Equation",
74
+ "description": "The van der Waals equation is a mathematical formula that describes the behavior of real gases. It is an equation of state that relates the pressure, temperature, and molar volume in a fluid.",
75
+ "difficulty": "Hard",
76
+ "remark": "",
77
+ "subfield": "Thermodynamics"
78
+ },
79
+ {
80
+ "theorem": "Rayleigh Criterion",
81
+ "description": "The Rayleigh criterion is the generally accepted criterion for the minimum resolvable detail - the imaging process is said to be diffraction-limited when the first diffraction minimum of the image of one source point coincides with the maximum of another.",
82
+ "difficulty": "Hard",
83
+ "remark": "",
84
+ "subfield": "Optics"
85
+ },
86
+ {
87
+ "theorem": "Paschen Curve",
88
+ "description": "Paschen's law is an equation that gives the breakdown voltage, that is, the voltage necessary to start a discharge or electric arc, between two electrodes in a gas as a function of pressure and gap length.",
89
+ "difficulty": "Hard",
90
+ "remark": "",
91
+ "subfield": "Electromagnetism"
92
+ },
93
+ {
94
+ "theorem": "Chandrasekhar Limit",
95
+ "description": "The Chandrasekhar limit is the maximum mass that a star can have and still be a stable white dwarf.",
96
+ "difficulty": "Hard",
97
+ "remark": "",
98
+ "subfield": "Astrophysics"
99
+ },
100
+ {
101
+ "theorem": "Landau Damping",
102
+ "description": "Landau damping is a phenomena observed in plasma wherein there is an ex- ponential decay in the oscillations of the number density of electrons in a plasma (also referred to as Langmuir waves) and so stability is achieved in some area of the phase-space.",
103
+ "difficulty": "Hard",
104
+ "remark": "",
105
+ "subfield": "Plasma Physics"
106
+ },
107
+ {
108
+ "theorem": "Schwarzschild radius",
109
+ "description": "The Schwarzschild radius is the critical distance from the center of a massive body where the gravitational pull becomes so strong that not even light can escape, defining the boundary of a black hole.",
110
+ "difficulty": "Hard",
111
+ "remark": "",
112
+ "subfield": "Astrophysics"
113
+ },
114
+ {
115
+ "theorem": "Babinet's Principle",
116
+ "description": "In physics, Babinet's principle states that the diffraction pattern from an opaque body is identical to that from a hole of the same size and shape except for the overall forward beam intensity.",
117
+ "difficulty": "Hard",
118
+ "remark": "",
119
+ "subfield": "Optics"
120
+ },
121
+ {
122
+ "theorem": "Schrödinger's Cat",
123
+ "description": "Schrödinger's cat is a thought experiment in quantum mechanics that illustrates the paradoxical nature of quantum superposition and wave function collapse.",
124
+ "difficulty": "Hard",
125
+ "remark": "",
126
+ "subfield": "Quantum Mechanics"
127
+ },
128
+ {
129
+ "theorem": "Rayleigh Criterion for Resolution",
130
+ "description": "For a circular aperture, lens, or mirror, the Rayleigh criterion states that two images are just resolvable when the center of the diffraction pattern of one is directly over the first minimum of the diffraction pattern of the other.",
131
+ "difficulty": "Hard",
132
+ "remark": "",
133
+ "subfield": "Optics"
134
+ },
135
+ {
136
+ "theorem": "Navier-Stokes Equations",
137
+ "description": "In fluid mechanics, the Navier-Stokes equations are partial differential equations that express the flow of viscous fluids.",
138
+ "difficulty": "Hard",
139
+ "remark": "",
140
+ "subfield": "Fluid Mechanics"
141
+ }
142
+ ]
data/thb_medium/chemistry.json ADDED
@@ -0,0 +1,142 @@
1
+ [
2
+ {
3
+ "theorem": "Le Chatelier's Principle",
4
+ "description": "When a system at equilibrium is subjected to a change in condition (such as temperature, pressure, or concentration), the system will shift in a direction that relieves the stress and a new equilibrium will be established. This principle helps predict how equilibrium will shift in response to external changes.",
5
+ "difficulty": "Medium",
6
+ "remark": "Essential for understanding chemical equilibrium and its practical applications in industrial processes.",
7
+ "subfield": "Chemical Equilibrium"
8
+ },
9
+ {
10
+ "theorem": "The Pauli Exclusion Principle",
11
+ "description": "No two electrons in the same atom can have the same set of four quantum numbers (n, l, ml, ms). This limits the number of electrons that can occupy an orbital, which is max two electrons, with opposite spins (+1/2 and -1/2). This explains electronic configuration in atoms.",
12
+ "difficulty": "Medium",
13
+ "remark": "Essential for understanding electronic structure and the basis for chemical bonding.",
14
+ "subfield": "Quantum Chemistry"
15
+ },
16
+ {
17
+ "theorem": "Raoult's Law",
18
+ "description": "The partial vapor pressure of a component in an ideal solution is equal to the vapor pressure of the pure component multiplied by its mole fraction in the solution: P_A = P_A* X_A. This helps to predict vapor pressure of ideal solutions and is a basis for colligative properties",
19
+ "difficulty": "Medium",
20
+ "remark": "Describes vapor pressure of solutions, useful in understanding boiling point elevation and freezing point depression.",
21
+ "subfield": "Physical Chemistry"
22
+ },
23
+ {
24
+ "theorem": "Beer-Lambert Law",
25
+ "description": "The absorbance of a solution is directly proportional to the concentration of the analyte and the path length of the light beam through the solution: A = \u03b5bc, where \u03b5 is molar absorptivity, b is path length, and c is the concentration. Useful in analytical chemistry for determining the concentration of a substance by measuring the light it absorbs.",
26
+ "difficulty": "Medium",
27
+ "remark": "Important in spectrophotometry for quantitative analysis of solutions.",
28
+ "subfield": "Analytical Chemistry"
29
+ },
30
+ {
31
+ "theorem": "Phase diagram",
32
+ "description": "Phase diagram is a graphical representation of the physical states of a substance under different conditions of temperature and pressure.",
33
+ "difficulty": "Medium",
34
+ "remark": "Useful in understanding the phase transitions of substances.",
35
+ "subfield": "Physical Chemistry"
36
+ },
37
+ {
38
+ "theorem": "Boyle's Law",
39
+ "description": "Raoult's law is a relation of physical chemistry, with implications in thermodynamics.",
40
+ "difficulty": "Medium",
41
+ "remark": "",
42
+ "subfield": "Physical Chemistry"
43
+ },
44
+ {
45
+ "theorem": "Graham's Law of Effusion",
46
+ "description": "Graham's law of effusion was formulated by Scottish physical chemist Thomas Graham in 1848. Graham found experimentally that the rate of effusion of a gas is inversely proportional to the square root of the molar mass of its particles.",
47
+ "difficulty": "Medium",
48
+ "remark": "",
49
+ "subfield": "Physical Chemistry"
50
+ },
51
+ {
52
+ "theorem": "Arrhenius Equation",
53
+ "description": "In physical chemistry, the Arrhenius equation is a formula for the temperature dependence of reaction rates.",
54
+ "difficulty": "Medium",
55
+ "remark": "",
56
+ "subfield": "Chemical Kinetics"
57
+ },
58
+ {
59
+ "theorem": "Henry's law",
60
+ "description": "the proportional relationship between the concentration of dissolved gas in a solution and the partial pressure of the gas in contact with the solution",
61
+ "difficulty": "Medium",
62
+ "remark": "",
63
+ "subfield": "Physical Chemistry"
64
+ },
65
+ {
66
+ "theorem": "Lewis Acid-Base Theory",
67
+ "description": "In the Lewis theory of acid-base reactions, bases donate pairs of electrons and acids accept pairs of electrons.",
68
+ "difficulty": "Medium",
69
+ "remark": "",
70
+ "subfield": "Acid-Base Chemistry"
71
+ },
72
+ {
73
+ "theorem": "Clausius-Clapeyron Equation",
74
+ "description": "allows us to estimate the vapor pressure at another temperature.",
75
+ "difficulty": "Medium",
76
+ "remark": "",
77
+ "subfield": "Thermodynamics"
78
+ },
79
+ {
80
+ "theorem": "Michaelis-Menten Kinetics",
81
+ "description": "In biochemistry, Michaelis–Menten kinetics, named after Leonor Michaelis and Maud Menten, is the simplest case of enzyme kinetics, applied to enzyme-catalysed reactions of one substrate and one product.",
82
+ "difficulty": "Medium",
83
+ "remark": "",
84
+ "subfield": "Chemical Kinetics"
85
+ },
86
+ {
87
+ "theorem": "Gibbs Free Energy Equation",
88
+ "description": "The change in free energy, ΔG, is equal to the sum of the enthalpy plus the product of the temperature and entropy of the system.",
89
+ "difficulty": "Medium",
90
+ "remark": "",
91
+ "subfield": "Thermodynamics"
92
+ },
93
+ {
94
+ "theorem": "Transition State Theory",
95
+ "description": "In chemistry, transition state theory (TST) explains the reaction rates of elementary chemical reactions.",
96
+ "difficulty": "Medium",
97
+ "remark": "",
98
+ "subfield": "Chemical Kinetics"
99
+ },
100
+ {
101
+ "theorem": "Koopman's Theorem",
102
+ "description": "Koopmans' theorem states that the first ionization energy of a molecule is equal to the negative of the energy of the highest occupied molecular orbital (HOMO).",
103
+ "difficulty": "Medium",
104
+ "remark": "",
105
+ "subfield": "Quantum Chemistry"
106
+ },
107
+ {
108
+ "theorem": "Recrystallization",
109
+ "description": "Recrystallization, also known as fractional crystallization, is a procedure for purifying an impure compound in a solvent.",
110
+ "difficulty": "Medium",
111
+ "remark": "",
112
+ "subfield": "Analytical Chemistry"
113
+ },
114
+ {
115
+ "theorem": "Electrogravimetry",
116
+ "description": "Electrogravimetry is a method used to separate and quantify ions of a substance, usually a metal. In this process, the analyte solution is electrolyzed.",
117
+ "difficulty": "Medium",
118
+ "remark": "",
119
+ "subfield": "Analytical Chemistry"
120
+ },
121
+ {
122
+ "theorem": "Kjeldahl Method",
123
+ "description": "The Kjeldahl method is a laboratory technique used to measure the amount of nitrogen in a sample. ",
124
+ "difficulty": "Medium",
125
+ "remark": "",
126
+ "subfield": "Analytical Chemistry"
127
+ },
128
+ {
129
+ "theorem": "Liquid-Liquid Extraction",
130
+ "description": "Liquid–liquid extraction, also known as solvent extraction and partitioning, is a method to separate compounds or metal complexes, based on their relative solubilities in two different immiscible liquids, usually water (polar) and an organic solvent (non-polar).",
131
+ "difficulty": "Medium",
132
+ "remark": "",
133
+ "subfield": "Analytical Chemistry"
134
+ },
135
+ {
136
+ "theorem": "Reflux",
137
+ "description": "Reflux is a laboratory technique where a reaction mixture is heated to boil and the vapors are condensed back into the reaction flask, allowing continuous heating without loss of volatile components.",
138
+ "difficulty": "Medium",
139
+ "remark": "",
140
+ "subfield": "Laboratory Techniques"
141
+ }
142
+ ]
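Each of these data files is a flat JSON array of entries with the fields `theorem`, `description`, `difficulty`, `remark`, and `subfield`. A minimal loading sketch (illustrative only, not part of the uploaded files; the path follows the directory layout above):

```python
# Load one difficulty split of the theorem benchmark and list its entries.
import json

with open("data/thb_medium/chemistry.json", "r", encoding="utf-8") as f:
    theorems = json.load(f)

for entry in theorems:
    print(f"{entry['theorem']} ({entry['subfield']}, {entry['difficulty']})")
```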
data/thb_medium/comp_sci.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "theorem": "The Halting Problem (Undecidability)",
4
+ "description": "There is no general algorithm (or program) that can determine, for any arbitrary computer program and its input, whether the program will eventually halt (stop) or run forever.",
5
+ "difficulty": "Medium",
6
+ "remark": "A core concept in theoretical computer science. Introduces the idea of limits of computation. Understanding the proof (usually using diagonalization) is key to grasp the concept. Usually taught in discrete math or Theory of Computation.",
7
+ "subfield": "Theory of Computation"
8
+ },
9
+ {
10
+ "theorem": "The Time Complexity of Binary Search",
11
+ "description": "In the worst case, searching for an element in a sorted array using binary search requires O(log n) time, where n is the number of elements in the array. This efficiency arises from repeatedly dividing the search interval in half.",
12
+ "difficulty": "Medium",
13
+ "remark": "Highlights the power of divide-and-conquer algorithms. Illustrates why sorted data structures are often essential. Requires understanding of logarithms",
14
+ "subfield": "Algorithms"
15
+ },
16
+ {
17
+ "theorem": "The Correctness of Simple Sorting Algorithm (e.g. Bubble Sort)",
18
+ "description": "Bubble sort repeatedly compares adjacent elements and swaps them if they are in the wrong order. We can formally prove that after n-1 passes, the array will be sorted. Proving it involves demonstrating that the largest element is 'bubbled' to the end of the array in each pass, by using loop invariants.",
19
+ "difficulty": "Medium",
20
+ "remark": "Demonstrates how to formally analyze simple algorithms for their correctness and requires some understanding of loop invariants. Useful for introduction to proofs in algorithm.",
21
+ "subfield": "Algorithms"
22
+ },
23
+ {
24
+ "theorem": "The Church-Turing Thesis",
25
+ "description": "All models of computation that we know can compute what is Turing computable. In other words, if an effective method (algorithm) for solving a problem exists at all, then a Turing machine can also compute a solution, and vice versa.",
26
+ "difficulty": "Medium",
27
+ "remark": "A fundamental principle in theoretical computer science. It defines the limit of computability. It links different computational models to a single class. Requires an understanding of the Turing Machine.",
28
+ "subfield": "Theory of Computation"
29
+ },
30
+ {
31
+ "theorem": "The Relationship between Recursion and Induction",
32
+ "description": "Recursive functions can be proven correct and analyzed with mathematical induction. The base case of induction matches the base case in the recursive function. The induction step corresponds to the recursive step.",
33
+ "difficulty": "Medium",
34
+ "remark": "Connects two key concepts in Computer Science. Illustrates how induction can be used to prove correctness of recursive algorithms and mathematical induction can be used to define recursive functions. Important for formal analysis.",
35
+ "subfield": "Programming Fundamentals"
36
+ },
37
+ {
38
+ "theorem": "Chroma Subsampling",
39
+ "description": "Chroma subsampling is a technique used in digital image processing to reduce the amount of data required to represent an image. It involves reducing the number of color channels or samples per pixel in an image, typically by using fewer bits for chroma (color) information.",
40
+ "difficulty": "Medium",
41
+ "remark": "",
42
+ "subfield": "Image Processing"
43
+ },
44
+ {
45
+ "theorem": "Median filtering",
46
+ "description": "Median filtering is a non-linear digital filtering technique that is used to remove noise from an image or signal. It works by replacing each pixel with the median value of the pixels in its neighborhood.",
47
+ "difficulty": "Medium",
48
+ "remark": "",
49
+ "subfield": "Image Processing"
50
+ },
51
+ {
52
+ "theorem": "Shannon Lower bound",
53
+ "description": "The Shannon Lower Bound refers to a theoretical limit in information theory that represents the minimum entropy or information required to encode a random source. It is tied to the Shannon Entropy, which quantifies the average information content of a random variable. Here's a breakdown of what it means:",
54
+ "difficulty": "Medium",
55
+ "remark": "",
56
+ "subfield": "Information Theory"
57
+ },
58
+ {
59
+ "theorem": "Dijkstra's algorithm",
60
+ "description": "maintains a priority queue of vertices in the graph ordered by distance from the start and repeatedly selects the next shortest path to an unconnected part of the graph",
61
+ "difficulty": "Medium",
62
+ "remark": "",
63
+ "subfield": "Graph Theory"
64
+ },
65
+ {
66
+ "theorem": "K-means clustering",
67
+ "description": "K-means clustering is a method of clustering that partitions the dataset into K clusters, where each cluster is represented by its centroid or center point.",
68
+ "difficulty": "Medium",
69
+ "remark": "",
70
+ "subfield": "Machine Learning"
71
+ },
72
+ {
73
+ "theorem": "K-nearest neighbors",
74
+ "description": "K-nearest neighbors (KNN) is a simple and effective classification algorithm that works by finding the K closest data points in the training set to a new data point and then assigning the class label based on the majority class of these neighbors.",
75
+ "difficulty": "Medium",
76
+ "remark": "",
77
+ "subfield": "Machine Learning"
78
+ },
79
+ {
80
+ "theorem": "Gradient descent",
81
+ "description": "Common optimization algorithm used in machine learning to minimize a loss function.",
82
+ "difficulty": "Medium",
83
+ "remark": "",
84
+ "subfield": "Machine Learning"
85
+ },
86
+ {
87
+ "theorem": "Markov Decision Processes",
88
+ "description": "A Markov decision process (MDP) refers to a stochastic decision-making process that uses a mathematical framework to model the decision-making of a dynamic system.",
89
+ "difficulty": "Medium",
90
+ "remark": "",
91
+ "subfield": "Machine Learning"
92
+ },
93
+ {
94
+ "theorem": "ALOHA network",
95
+ "description": "ALOHA is basically a multiple access protocol which describes how all the terminals can access a medium without interfering at all with one another or even colliding. It operates at the data-link layer.",
96
+ "difficulty": "Medium",
97
+ "remark": "",
98
+ "subfield": "Computer Networks"
99
+ },
100
+ {
101
+ "theorem": "Discrete Cosine Transform",
102
+ "description": "A discrete cosine transform (DCT) expresses a finite sequence of data points in terms of a sum of cosine functions oscillating at different frequencies.",
103
+ "difficulty": "Medium",
104
+ "remark": "",
105
+ "subfield": "Digital Signal Processing"
106
+ },
107
+ {
108
+ "theorem": "Master Theorem",
109
+ "description": "The master theorem is used in calculating the time complexity of recurrence relations (divide and conquer algorithms) in a simple and quick way.",
110
+ "difficulty": "Medium",
111
+ "remark": "",
112
+ "subfield": "Algorithms"
113
+ },
114
+ {
115
+ "theorem": "Fast Fourier Transform",
116
+ "description": "A fast Fourier transform (FFT) is an algorithm that computes the Discrete Fourier Transform (DFT) of a sequence, or its inverse (IDFT).",
117
+ "difficulty": "Medium",
118
+ "remark": "",
119
+ "subfield": "Digital Signal Processing"
120
+ },
121
+ {
122
+ "theorem": "SR latch",
123
+ "description": "S-R latches i.e., Set-Reset latches are the simplest form of latches and are implemented using two inputs: S (Set) and R (Reset).",
124
+ "difficulty": "Medium",
125
+ "remark": "",
126
+ "subfield": "Digital Logic"
127
+ },
128
+ {
129
+ "theorem": "TCP Reno",
130
+ "description": "TCP Reno is a classic congestion control algorithm that was introduced in the early 1990s. It uses a mechanism called additive increase multiplicative decrease (AIMD) to adjust the TCP window size, which is the amount of data that can be sent without waiting for an acknowledgment.",
131
+ "difficulty": "Medium",
132
+ "remark": "",
133
+ "subfield": "Computer Networks"
134
+ },
135
+ {
136
+ "theorem": "Chord P2P Network and finger table",
137
+ "description": "Chord addresses peer addressability and peer findability and message routability challenges by organizing all peers in the P2P network into a single virtual ring.",
138
+ "difficulty": "Medium",
139
+ "remark": "",
140
+ "subfield": "Computer Networks"
141
+ }
142
+ ]
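As a companion to the binary search entry in this file, here is a minimal Python sketch of the O(log n) search it describes (illustrative only, not part of the dataset):

```python
# Iterative binary search over a sorted list; the interval is halved each step,
# giving the O(log n) worst case described in the entry above.
def binary_search(sorted_items, target):
    lo, hi = 0, len(sorted_items) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        if sorted_items[mid] == target:
            return mid
        if sorted_items[mid] < target:
            lo = mid + 1
        else:
            hi = mid - 1
    return -1  # not found

assert binary_search([1, 3, 5, 7, 9], 7) == 3
```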
data/thb_medium/math.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "theorem": "The Factor Theorem",
4
+ "description": "A polynomial f(x) has a factor (x - a) if and only if f(a) = 0. This theorem helps in finding roots and factors of polynomials.",
5
+ "difficulty": "Medium",
6
+ "remark": "Crucial for solving polynomial equations and understanding polynomial behavior.",
7
+ "subfield": "Algebra"
8
+ },
9
+ {
10
+ "theorem": "The Law of Sines",
11
+ "description": "In any triangle, the ratio of the length of a side to the sine of its opposite angle is constant. If a, b, and c are the side lengths, and A, B, and C are the opposite angles, then a/sin(A) = b/sin(B) = c/sin(C).",
12
+ "difficulty": "Medium",
13
+ "remark": "Useful for solving triangles when you have angle-side relationships.",
14
+ "subfield": "Trigonometry"
15
+ },
16
+ {
17
+ "theorem": "The Binomial Theorem",
18
+ "description": "For any non-negative integer n and real numbers a and b, (a + b)^n = Σ(k=0 to n) [n choose k] a^(n-k) b^k, where [n choose k] is the binomial coefficient, also written as nCk. It gives a formula for expanding powers of binomials.",
19
+ "difficulty": "Medium",
20
+ "remark": "Important in algebra, combinatorics, and probability.",
21
+ "subfield": "Algebra"
22
+ },
23
+ {
24
+ "theorem": "The Intermediate Value Theorem",
25
+ "description": "If f(x) is a continuous function on a closed interval [a, b] and k is any number between f(a) and f(b), then there exists at least one number c in the interval [a, b] such that f(c) = k. This theorem helps to find roots and demonstrate the behavior of continuous functions.",
26
+ "difficulty": "Medium",
27
+ "remark": "Fundamental for understand continuous functions in calculus",
28
+ "subfield": "Calculus"
29
+ },
30
+ {
31
+ "theorem": "The Cosine Rule",
32
+ "description": "In any triangle, the square of the length of one side is equal to the sum of the squares of the lengths of the other two sides minus twice the product of the lengths of those two sides multiplied by the cosine of the angle between them. For a triangle with side lengths a, b, c, and opposite angles A, B, C: a² = b² + c² - 2bc*cos(A). Similar formulas are valid for b² and c².",
33
+ "difficulty": "Medium",
34
+ "remark": "Used in any triangle to solve for sides and/or angles",
35
+ "subfield": "Trigonometry"
36
+ },
37
+ {
38
+ "theorem": "The Divergence Test",
39
+ "description": "If lim (n→∞) aₙ ≠ 0 or doesn't exist, then the series ∑aₙ diverges. It is a simple test to identify divergent series but will not be able to determine if the series is convergent.",
40
+ "difficulty": "Medium",
41
+ "remark": "An important initial check when examining series convergence.",
42
+ "subfield": "Calculus"
43
+ },
44
+ {
45
+ "theorem": "The Squeeze Theorem (or Sandwich Theorem)",
46
+ "description": "If g(x) ≤ f(x) ≤ h(x) for all x near a (except possibly at a), and if lim(x→a) g(x) = L and lim(x→a) h(x) = L, then lim(x→a) f(x) = L. Useful for evaluating limits when direct calculation is difficult, by bounding a function between two simpler functions.",
47
+ "difficulty": "Medium",
48
+ "remark": "Commonly used in calculus for finding challenging limits.",
49
+ "subfield": "Calculus"
50
+ },
51
+ {
52
+ "theorem": "The Chain Rule",
53
+ "description": "The chain rule is a formula for finding the derivative of a composite function. It states that the derivative of a function composed of two functions is the product of the derivative of the outer function and the derivative of the inner function.",
54
+ "difficulty": "Medium",
55
+ "remark": "Commonly used in calculus for finding the derivative of composite functions.",
56
+ "subfield": "Calculus"
57
+ },
58
+ {
59
+ "theorem": "Product Rule",
60
+ "description": "The product rule is a formula for finding the derivative of a product of two functions. It states that the derivative of a product of two functions is the sum of the product of the first function and the derivative of the second function, and the product of the second function and the derivative of the first function.",
61
+ "difficulty": "Medium",
62
+ "remark": "Commonly used in calculus for finding the derivative of products of functions.",
63
+ "subfield": "Calculus"
64
+ },
65
+ {
66
+ "theorem": "Quotient Rule",
67
+ "description": "The quotient rule is a formula for finding the derivative of a quotient of two functions. It states that the derivative of a quotient of two functions is the quotient of the derivative of the numerator and the denominator, minus the product of the numerator and the derivative of the denominator, all divided by the square of the denominator.",
68
+ "difficulty": "Medium",
69
+ "remark": "Commonly used in calculus for finding the derivative of quotients of functions.",
70
+ "subfield": "Calculus"
71
+ },
72
+ {
73
+ "theorem": "Power Rule",
74
+ "description": "The power rule is a formula for finding the derivative of a power of a function. It states that the derivative of a power of a function is the product of the power and the derivative of the function.",
75
+ "difficulty": "Medium",
76
+ "remark": "Commonly used in calculus for finding the derivative of powers of functions.",
77
+ "subfield": "Calculus"
78
+ },
79
+ {
80
+ "theorem": "Integration by Substitution",
81
+ "description": "Integration by substitution is a technique used to simplify the integration of a function by substituting a new variable for the original variable.",
82
+ "difficulty": "Medium",
83
+ "remark": "Commonly used in calculus for finding the integral of functions.",
84
+ "subfield": "Calculus"
85
+ },
86
+ {
87
+ "theorem": "Disk & Washer Method",
88
+ "description": "The washer method formula is used to find the volume of two functions that are rotated around the x-axis.",
89
+ "difficulty": "Medium",
90
+ "remark": "",
91
+ "subfield": "Calculus"
92
+ },
93
+ {
94
+ "theorem": "Extreme value theorem",
95
+ "description": "if 𝑓 is a continuous function over a finite, closed interval, then 𝑓 has an absolute maximum and an absolute minimum",
96
+ "difficulty": "Medium",
97
+ "remark": "",
98
+ "subfield": "Calculus"
99
+ },
100
+ {
101
+ "theorem": "Fermat's theorem",
102
+ "description": "if 𝑓 has a local extremum at 𝑐, then 𝑐 is a critical point of 𝑓",
103
+ "difficulty": "Medium",
104
+ "remark": "",
105
+ "subfield": "Calculus"
106
+ },
107
+ {
108
+ "theorem": "Mean Value Theorem",
109
+ "description": "Mean Value Theorem states that if a function f is continuous on the closed interval [a,b] and differentiable on the open interval (a,b), then there exists a point c in the interval (a,b) such that f'(c) is equal to the function's average rate of change over [a,b].",
110
+ "difficulty": "Medium",
111
+ "remark": "",
112
+ "subfield": "Calculus"
113
+ },
114
+ {
115
+ "theorem": "Newton-Raphson method",
116
+ "description": "The Newton-Raphson method, also known as the Newton's method, is a widely used iterative numerical technique for finding the approximate roots of a real-valued function. It is named after Sir Isaac Newton and Joseph Raphson, who independently developed the method in the 17th century.\n\nThe method is based on the idea of linear approximation, where a function is approximated by its tangent line at a given point. The intersection of this tangent line with the x-axis provides a better approximation of the root than the initial point. This process is then repeated iteratively until the desired level of accuracy is achieved.\n\nGiven a function f(x) and an initial guess x0 for the root, the Newton-Raphson method can be described by the following iterative formula:\n\nx1 = x0 - f(x0) / f'(x0)\n\nHere, f'(x0) is the derivative of the function f(x) evaluated at the point x0. The new approximation x1 is then used as the starting point for the next iteration, and the process is repeated until the difference between successive approximations is smaller than a predefined tolerance level or a maximum number of iterations is reached.\n\nThe Newton-Raphson method converges rapidly when the initial guess is close to the actual root and the function is well-behaved. However, the method may fail to converge or converge to a wrong root if the initial guess is not close enough to the actual root, or if the function has multiple roots, or if the derivative of the function is zero or nearly zero at the root.\n\nDespite these limitations, the Newton-Raphson method is widely used in various fields of science and engineering due to its simplicity and fast convergence properties when applied to well-behaved functions.",
117
+ "difficulty": "Medium",
118
+ "remark": "",
119
+ "subfield": "Numerical Analysis"
120
+ },
121
+ {
122
+ "theorem": "Rolle's theorem",
123
+ "description": "Rolle's theorem or Rolle's lemma essentially states that any real-valued differentiable function that attains equal values at two distinct points must have at least one point, somewhere between them, at which the slope of the tangent line is zero.",
124
+ "difficulty": "Medium",
125
+ "remark": "",
126
+ "subfield": "Calculus"
127
+ },
128
+ {
129
+ "theorem": "Second derivative test",
130
+ "description": "The second partial derivatives test classifies the point as a local maximum or local minimum.",
131
+ "difficulty": "Medium",
132
+ "remark": "",
133
+ "subfield": "Calculus"
134
+ },
135
+ {
136
+ "theorem": "Pappus's Theorem",
137
+ "description": "Pappus's centroid theorem is either of two related theorems dealing with the surface areas and volumes of surfaces and solids of revolution.",
138
+ "difficulty": "Medium",
139
+ "remark": "",
140
+ "subfield": "Geometry"
141
+ }
142
+ ]
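The Newton-Raphson entry above spells out the iteration x₁ = x₀ − f(x₀)/f'(x₀); a minimal Python sketch of that iteration (illustrative only, not part of the dataset):

```python
# Newton-Raphson iteration x_{n+1} = x_n - f(x_n)/f'(x_n), applied to
# f(x) = x^2 - 2 to approximate sqrt(2) from the initial guess x0 = 1.
def newton_raphson(f, f_prime, x0, tol=1e-10, max_iter=50):
    x = x0
    for _ in range(max_iter):
        step = f(x) / f_prime(x)
        x -= step
        if abs(step) < tol:
            break
    return x

root = newton_raphson(lambda x: x * x - 2, lambda x: 2 * x, x0=1.0)
print(root)  # ~1.41421356
```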
data/thb_medium/physics.json ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "theorem": "The Work-Energy Theorem",
4
+ "description": "The net work done on an object is equal to the change in its kinetic energy. Mathematically, this is expressed as W_net = \u0394KE, where W_net is the net work and \u0394KE is the change in kinetic energy.",
5
+ "difficulty": "Medium",
6
+ "remark": "This theorem connects force, displacement, and energy. It's crucial for analyzing motion when forces are not constant or when the detailed time evolution is not needed. It's often used to solve problems involving motion and energy transfer.",
7
+ "subfield": "Classical Mechanics"
8
+ },
9
+ {
10
+ "theorem": "The Law of Conservation of Energy",
11
+ "description": "In a closed system, the total energy remains constant; it can transform from one form to another (e.g., potential to kinetic) but cannot be created or destroyed. Mathematically, E_total_initial = E_total_final.",
12
+ "difficulty": "Medium",
13
+ "remark": "This is a fundamental principle in physics, applicable to a wide range of scenarios from mechanics to thermodynamics. It simplifies problem-solving by focusing on energy balance rather than detailed force interactions.",
14
+ "subfield": "Classical Mechanics"
15
+ },
16
+ {
17
+ "theorem": "The Law of Universal Gravitation",
18
+ "description": "Any two objects with mass attract each other with a force that is directly proportional to the product of their masses and inversely proportional to the square of the distance between their centers. F = G(m\u2081m\u2082)/r\u00b2, where G is the gravitational constant.",
19
+ "difficulty": "Medium",
20
+ "remark": "This law describes the gravitational force that governs the motions of celestial bodies and explains why things fall towards the earth. Its mathematical form shows the distance dependence of the gravitational force.",
21
+ "subfield": "Gravitation"
22
+ },
23
+ {
24
+ "theorem": "Archimedes' Principle",
25
+ "description": "An object immersed in a fluid experiences an upward buoyant force equal to the weight of the fluid displaced by the object. This principle explains buoyancy and is crucial for understanding why objects float or sink.",
26
+ "difficulty": "Medium",
27
+ "remark": "Connects the density of a fluid, the volume of displaced fluid, and the buoyant force. It's used to design boats and determine densities through buoyancy measurements.",
28
+ "subfield": "Fluid Mechanics"
29
+ },
30
+ {
31
+ "theorem": "The Doppler Effect",
32
+ "description": "Describes the change in frequency of a wave (sound or light) when the source and the observer are moving relative to each other. The perceived frequency shifts higher when the source and observer move closer and lower when they move apart. The mathematical formulation differs for sound and light.",
33
+ "difficulty": "Medium",
34
+ "remark": "Has applications in areas like radar speed guns, medical imaging, astronomy for finding the recession velocity of galaxies. It's crucial in understanding wave phenomena in a dynamic context.",
35
+ "subfield": "Wave Physics"
36
+ },
37
+ {
38
+ "theorem": "The Principle of Superposition of Waves",
39
+ "description": "When two or more waves overlap in a medium, the resultant displacement at any point is the vector sum of the displacements of the individual waves at that point. This principle governs wave interference and diffraction phenomena.",
40
+ "difficulty": "Medium",
41
+ "remark": "Explains how waves combine with each other. Its application can create both constructive and destructive interference effects. Essential in understanding the behavior of light and sound, diffraction gratings.",
42
+ "subfield": "Wave Physics"
43
+ },
44
+ {
45
+ "theorem": "Kepler's laws of planetary motion",
46
+ "description": "These laws describe the motion of planets around the sun. Kepler's First Law states that planets orbit in elliptical paths with the sun at one of the two foci. Kepler's Second Law states that a line drawn from the sun to a planet sweeps out equal areas in equal times. Kepler's Third Law relates the orbital period of a planet to its average distance from the sun.",
47
+ "difficulty": "Medium",
48
+ "remark": "These laws are crucial for understanding the motion of planets and are used in astronomy and space science.",
49
+ "subfield": "Astrophysics"
50
+ },
51
+ {
52
+ "theorem": "Gauss's law",
53
+ "description": "Gauss's law states that the electric flux through any closed surface is equal to the charge enclosed by the surface divided by the permittivity of free space.",
54
+ "difficulty": "Medium",
55
+ "remark": "This law is fundamental to understanding the relationship between electric fields and charges. It's used in electrostatics and electromagnetism to calculate electric fields around charged objects.",
56
+ "subfield": "Electromagnetism"
57
+ },
58
+ {
59
+ "theorem": "Stokes' law",
60
+ "description": "Stokes' Law describes the force of viscous drag on a small spherical object moving through a viscous fluid.",
61
+ "difficulty": "Medium",
62
+ "remark": "",
63
+ "subfield": "Fluid Mechanics"
64
+ },
65
+ {
66
+ "theorem": "Bernoulli's principle",
67
+ "description": "Bernoulli's principle is a key concept in fluid dynamics that relates pressure, density, speed and height. Bernoulli's principle states that an increase in the speed of a parcel of fluid occurs simultaneously with a decrease in either the pressure or the height above a datum.",
68
+ "difficulty": "Medium",
69
+ "remark": "",
70
+ "subfield": "Fluid Mechanics"
71
+ },
72
+ {
73
+ "theorem": "Poiseuille's law",
74
+ "description": "the rate of laminar flow of an incompressible fluid in a tube.",
75
+ "difficulty": "Medium",
76
+ "remark": "",
77
+ "subfield": "Fluid Mechanics"
78
+ },
79
+ {
80
+ "theorem": "Stefan-Boltzmann Law of Radiation",
81
+ "description": "The Stefan–Boltzmann law, also known as Stefan's law, describes the intensity of the thermal radiation emitted by matter in terms of that matter's temperature. It is named for Josef Stefan, who empirically derived the relationship, and Ludwig Boltzmann who derived the law theoretically.",
82
+ "difficulty": "Medium",
83
+ "remark": "",
84
+ "subfield": "Thermodynamics"
85
+ },
86
+ {
87
+ "theorem": "Carnot cycle",
88
+ "description": "A Carnot cycle is an ideal thermodynamic cycle proposed by French physicist Sadi Carnot in 1824.",
89
+ "difficulty": "Medium",
90
+ "remark": "",
91
+ "subfield": "Thermodynamics"
92
+ },
93
+ {
94
+ "theorem": "Electromagnetic spectrum",
95
+ "description": "The electromagnetic spectrum is the full range of electromagnetic radiation, organized by frequency or wavelength. The spectrum is divided into separate bands, with different names for the electromagnetic waves within each band.",
96
+ "difficulty": "Easy",
97
+ "remark": "",
98
+ "subfield": "Electromagnetism"
99
+ },
100
+ {
101
+ "theorem": "Ampere's law",
102
+ "description": "In classical electromagnetism, Ampère's circuital law relates the circulation of a magnetic field around a closed loop to the electric current passing through the loop.",
103
+ "difficulty": "Medium",
104
+ "remark": "",
105
+ "subfield": "Electromagnetism"
106
+ },
107
+ {
108
+ "theorem": "Brewster's law",
109
+ "description": "Brewster's law is a relationship of light waves at the maximum polarization angle of light.",
110
+ "difficulty": "Medium",
111
+ "remark": "",
112
+ "subfield": "Optics"
113
+ },
114
+ {
115
+ "theorem": "Brownian motion",
116
+ "description": "Brownian motion is the seemingly random motion of particles within a liquid or gas that emerges from constant collisions and redirection from impacting the atoms or molecules within the fluid. All matter is in constant motion which results in Brownian motion.",
117
+ "difficulty": "Medium",
118
+ "remark": "",
119
+ "subfield": "Statistical Physics"
120
+ },
121
+ {
122
+ "theorem": "Hubble's law",
123
+ "description": "Hubble's law, also known as the Hubble–Lemaître law, is the observation in physical cosmology that galaxies are moving away from Earth at speeds proportional to their distance. In other words, the farther a galaxy is from the Earth, the faster it moves away.",
124
+ "difficulty": "Medium",
125
+ "remark": "",
126
+ "subfield": "Astrophysics"
127
+ },
128
+ {
129
+ "theorem": "Tsiolkovsky rocket equation",
130
+ "description": "It is a mathematical equation that describes the motion of a rocket in a vacuum and is used to calculate the velocity, acceleration, and thrust of the rocket.",
131
+ "difficulty": "Medium",
132
+ "remark": "",
133
+ "subfield": "Classical Mechanics"
134
+ },
135
+ {
136
+ "theorem": "Hall Effect",
137
+ "description": "Hall effect is a process in which a transverse electric field is developed in a solid material when the material carrying an electric current is placed in a magnetic field that is perpendicular to the current.",
138
+ "difficulty": "Medium",
139
+ "remark": "",
140
+ "subfield": "Electromagnetism"
141
+ }
142
+ ]
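As a companion to the Tsiolkovsky rocket equation entry, a small worked example of Δv = vₑ·ln(m₀/m_f); the numbers below are assumptions chosen only for illustration:

```python
# Tsiolkovsky rocket equation: delta_v = v_e * ln(m0 / mf).
import math

v_exhaust = 3000.0    # effective exhaust velocity, m/s (assumed)
m_initial = 500000.0  # initial mass, kg (assumed)
m_final = 150000.0    # final mass after burn, kg (assumed)

delta_v = v_exhaust * math.log(m_initial / m_final)
print(f"delta-v = {delta_v:.0f} m/s")  # ~3612 m/s
```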
eval_suite/__init__.py ADDED
File without changes
eval_suite/image_utils.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+
4
+ import numpy as np
5
+ from PIL import Image, ImageOps
6
+ from moviepy import VideoFileClip
7
+
8
+ from eval_suite.prompts_raw import _image_eval
9
+ from eval_suite.utils import extract_json, convert_score_fields, calculate_geometric_mean
10
+ from mllm_tools.utils import _prepare_text_image_inputs
11
+ from src.core.parse_video import image_with_most_non_black_space
12
+
13
+ def extract_key_frames(video_path, output_dir, num_chunks):
14
+ """Extract key frames from a video by dividing it into chunks and selecting representative frames.
15
+
16
+ Args:
17
+ video_path (str): Path to the input video file
18
+ output_dir (str): Directory where extracted frames will be saved
19
+ num_chunks (int): Number of chunks to divide the video into
20
+
21
+ Returns:
22
+ list: List of paths to the extracted key frames
23
+ """
24
+ # Create output directory if it doesn't exist
25
+ os.makedirs(output_dir, exist_ok=True)
26
+
27
+ # Extract all frames from the video
28
+ clip = VideoFileClip(video_path)
29
+ frames = list(clip.iter_frames(fps=1)) # one frame every second
30
+
31
+ total_frames = len(frames)
32
+ if total_frames == 0:
33
+ print("No frames extracted from the video.")
34
+ return []
35
+
36
+ # Determine the number of frames per chunk
37
+ frames_per_chunk = max(1, total_frames // num_chunks)  # at least 1, to avoid division by zero when there are fewer frames than chunks
38
+ num_chunks = min(num_chunks, (total_frames + frames_per_chunk - 1) // frames_per_chunk)
39
+
40
+ key_frames = []
41
+
42
+ # Process each chunk of frames
43
+ for i in range(num_chunks):
44
+ start_idx = i * frames_per_chunk
45
+ end_idx = min((i + 1) * frames_per_chunk, total_frames)
46
+ chunk_frames = frames[start_idx:end_idx]
47
+
48
+ if chunk_frames:
49
+ # Save the frame with most non-black space
50
+ output_path = os.path.join(output_dir, f"key_frame_{i+1}.jpg")
51
+ result = image_with_most_non_black_space(chunk_frames, output_path)
52
+ else:
53
+ print(f"No frames in chunk {i+1}. Skipping.")
54
+ result = None
55
+
56
+ if result is not None:
57
+ key_frames.append(output_path)
58
+ clip.close()
59
+
60
+ return key_frames
61
+
62
+
63
+ def evaluate_sampled_images(model, video_path, description="No description provided", num_chunks=10, output_folder=None):
64
+ """Evaluate sampled frames from a video using an image evaluation model.
65
+
66
+ Args:
67
+ model: The image evaluation model to use
68
+ video_path (str): Path to the input video file
69
+ description (str, optional): Description of the video content. Defaults to "No description provided"
70
+ num_chunks (int, optional): Number of chunks to divide the video into. Defaults to 10
71
+ output_folder (str, optional): Directory for temporary files. Defaults to None
72
+
73
+ Returns:
74
+ dict: Dictionary containing evaluation scores and individual frame assessments with keys:
75
+ - evaluation: Dictionary of averaged scores for each criterion
76
+ - image_chunks: List of individual frame evaluation results
77
+ """
78
+ with tempfile.TemporaryDirectory(dir=output_folder) as temp_dir:
79
+ key_frames = extract_key_frames(video_path, temp_dir, num_chunks)
80
+
81
+ prompt = _image_eval.format(description=description)
82
+
83
+ responses = []
84
+ for key_frame in key_frames:
85
+ inputs = _prepare_text_image_inputs(prompt, key_frame)
86
+ response = model(inputs)
87
+ response_json = extract_json(response)
88
+ response_json = convert_score_fields(response_json)
89
+ responses.append(response_json)
90
+
91
+ criteria = list(responses[0]["evaluation"].keys())
92
+ scores_dict = {c: [] for c in criteria}
93
+ for response in responses:
94
+ for key, val in response["evaluation"].items():
95
+ scores_dict[key].append(val["score"])
96
+
97
+ res_score = {}
98
+ for key, scores in scores_dict.items():
99
+ res_score[key] = {"score": calculate_geometric_mean(scores)}
100
+
101
+ return {
102
+ "evaluation": res_score,
103
+ "image_chunks": responses
104
+ }
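A hypothetical usage sketch for this module (not part of the repository): the video path is made up and the `GeminiWrapper` constructor arguments are an assumption; only the `evaluate_sampled_images` signature and result keys are taken from the code above.

```python
from mllm_tools.gemini import GeminiWrapper
from eval_suite.image_utils import evaluate_sampled_images

model = GeminiWrapper()  # constructor arguments are an assumption
result = evaluate_sampled_images(
    model,
    video_path="output/pythagorean_theorem/video.mp4",  # hypothetical path
    description="Explains the Pythagorean theorem",
    num_chunks=10,
)
print(result["evaluation"])         # geometric-mean score per criterion
print(len(result["image_chunks"]))  # number of per-frame evaluations
```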
eval_suite/parse_prompt.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from tqdm import tqdm
3
+
4
+
5
+ def call_parse_prompt():
6
+ """
7
+ Locates the prompts_raw directory and generates an __init__.py file containing prompt texts.
8
+
9
+ Searches for prompts_raw directory in current and parent directories. Once found, calls
10
+ create_python_file_with_texts() to generate the __init__.py file.
11
+ """
12
+ current_file_path = os.path.abspath(__file__)
13
+ current_folder_path = os.path.dirname(current_file_path)
14
+ folder_path = os.path.join(current_folder_path, "prompts_raw")
15
+
16
+ # If prompts_raw not found in current directory, search parent directories
17
+ if not os.path.exists(folder_path):
18
+ parent_dir = current_folder_path
19
+ while parent_dir != os.path.dirname(parent_dir): # Stop at root directory
20
+ parent_dir = os.path.dirname(parent_dir)
21
+ test_path = os.path.join(parent_dir, "prompts_raw")
22
+ if os.path.exists(test_path):
23
+ folder_path = test_path
24
+ break
25
+
26
+ output_file = os.path.join(folder_path, "__init__.py")
27
+ create_python_file_with_texts(folder_path, output_file)
28
+
29
+
30
+ def create_python_file_with_texts(folder_path, output_file):
31
+ """
32
+ Creates a Python file containing prompt texts from .txt files.
33
+
34
+ Args:
35
+ folder_path (str): Path to directory containing prompt .txt files
36
+ output_file (str): Path where the output __init__.py file will be created
37
+
38
+ The function reads all .txt files in the given folder, converts their contents into
39
+ Python variables, and writes them to the output file. Variable names are derived from
40
+ file paths with special characters replaced.
41
+ """
42
+ with open(output_file, 'w', encoding='utf-8') as out_file:
43
+ out_file.write("# This file is generated automatically through parse_prompt.py\n\n")
44
+ txt_files = [file for root, dirs, files in os.walk(folder_path) for file in files if file.endswith(".txt")]
45
+ for file in tqdm(txt_files, desc="Processing files"):
46
+ file_path = os.path.join(folder_path, file)
47
+ var_name = "_" + file_path.replace(folder_path, "").replace(os.sep, "_").replace(".txt", "").strip("_")
48
+ with open(file_path, 'r', encoding='utf-8') as f:
49
+ content = f.read().replace('"""', '\\"\\"\\"')  # escape any triple quotes so the generated string literal stays valid
50
+ out_file.write(f'{var_name} = """{content}"""\n\n')
51
+
52
+
53
+ if __name__ == "__main__":
54
+ call_parse_prompt()
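Because the module is also runnable as a script, the prompt variables in `eval_suite/prompts_raw/__init__.py` can be regenerated after editing any `.txt` prompt file; a minimal sketch of the programmatic equivalent:

```python
# Regenerate eval_suite/prompts_raw/__init__.py from the .txt prompt files;
# equivalent to running the module as a script.
from eval_suite.parse_prompt import call_parse_prompt

call_parse_prompt()
```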
eval_suite/prompts_raw/__init__.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is generated automatically through parse_prompt.py
2
+
3
+ _video_eval_new = """# Task: Video Frame Quality Evaluation
4
+
5
+ You are tasked with analyzing and scoring a chunk of a theorem explanation video. Note that you may not have the full context of the video. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
6
+
7
+ ## Evaluation Criteria
8
+
9
+ 1. **Visual Consistency**
10
+ - Style Consistency: Does the visual style remain consistent across frames?
11
+ - Smoothness: Are the motions and transitions smooth?
12
+
13
+ ## Scoring Instructions
14
+ 1. Assign a score from **1 to 5** for each dimension:
15
+ - **1**: Very poor quality, completely fails to meet the criteria.
16
+ - **2**: Below average, significant issues present.
17
+ - **3**: Acceptable, meets the basic criteria with minor issues.
18
+ - **4**: Good, performs well with no major issues.
19
+ - **5**: Excellent, fully meets or exceeds expectations.
20
+ 2. Provide a comprehensive evaluation for each dimension.
21
+ 3. Format your output in **JSON**
22
+
23
+ ### JSON Output Format
24
+ ```json
25
+ {{
26
+ "overall_analysis": "[Provide a general assessment of the video's quality]",
27
+ "evaluation": {{
28
+ "visual_consistency": {{
29
+ "comprehensive_evaluation": "[Analysis of visual consistency]",
30
+ "score": [1-5]
31
+ }}
32
+ }}
33
+ }}
34
+ ```
35
+
36
+ Description of the theorem:
37
+ {description}
38
+
39
+ Video chunk:"""
40
+
41
+ _text_eval_new = """You are a specialist in evaluating theorem explanation videos, known for giving clear and objective feedback. You will be given the transcript of a video. Your task is to evaluate and score the content of the video in several dimensions.
42
+
43
+ ### Task Objective
44
+ 1. Perform an overall analysis of the video.
45
+ * Identify the topic of the video.
46
+ * Note your general thoughts and impression of the video, and any findings and observations.
47
+ 2. Conduct a comprehensive evaluation and score each criterion in the given dimensions.
48
+ * Analyze how well or poorly the video meets each criterion.
49
+ * Assign a score from **1 to 5** for each dimension:
50
+ - **1**: Very poor quality, completely fails to meet the criteria.
51
+ - **2**: Below average, significant issues present.
52
+ - **3**: Acceptable, meets the basic criteria with minor issues.
53
+ - **4**: Good, performs well with no major issues.
54
+ - **5**: Excellent, fully meets or exceeds expectations.
55
+ 3. Output the results in the specified JSON format.
56
+
57
+ ### Evaluation Criteria
58
+ 1. **Accuracy and Depth**
59
+ - Does the narration explain the theorem accurately?
60
+ - Does the video provide intuitive and/or rigorous explanations for why the theorem holds?
61
+ 2. **Logical Flow**
62
+ - Does the video follow a clear and logical structure?
63
+ - Does the video present a coherent buildup of ideas?
64
+
65
+ ### Notes
66
+ * You do not have access to the visual portion of the video as you are given only the textual portion. Do not reference or commentate on the visuals as they will be evaluated separately - just assume that there are reasonable visuals (e.g., geometric objects, graphs of functions, and calculations) to accompany the narration.
67
+ * The evaluation criteria are intended to be independent of each other. Do not restate the same violation in multiple criteria; only consider it in the most relevant criterion.
68
+
69
+ ### Output Format
70
+ ```json
71
+ {{
72
+ "overall_analysis": "[Overall analysis]",
73
+ "evaluation": {{
74
+ "accuracy_and_depth": {{
75
+ "comprehensive_evaluation": "[Analysis of accuracy and depth]",
76
+ "score": [1-5]
77
+ }},
78
+ "logical_flow": {{
79
+ "comprehensive_evaluation": "[Analysis of logical flow]",
80
+ "score": [1-5]
81
+ }}
82
+ }}
83
+ }}
84
+ ```
85
+
86
+ The transcript of the video is as follows:
87
+ {transcript}
88
+ """
89
+
90
+ _fix_transcript = """You are an expert in YouTube video transcripts. There is a transcript that was automatically generated through YouTube, so it lacks proper capitalization and punctuation. Your task is to fix the transcript so that there is proper punctuation, capitalization, and spacing. Do not make other modifications (e.g., keep the original word choice).
91
+
92
+ You should enclose the fixed transcript with a <SCRIPT></SCRIPT> block, i.e.:
93
+ <SCRIPT>
94
+ (Fixed transcript here)
95
+ </SCRIPT>
96
+
97
+ Original transcript: {transcript}
98
+ """
99
+
100
+ _image_eval = """# Task: Video Frame Quality Evaluation
101
+
102
+ You are tasked with analyzing and scoring a frame taken from a theorem explanation video. Note that you may not have the context of the video, so the captured frame may be a frame where some motion of visual elements is taking place. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
103
+
104
+ ## Evaluation Criteria
105
+
106
+ 1. **Visual Relevance**
107
+ - Does the video frame align with the theorem's concepts and derivations?
108
+
109
+ 2. **Element Layout**
110
+ - Placement and Size: Are the visual elements well-placed and appropriately sized within the frame?
111
+ - Overlap: Are the visual elements free of unintentional overlap?
112
+ - Clarity: Is the visual information conveyed in the frame clear and easy to understand?
113
+
114
+ ## Scoring Instructions
115
+ 1. Assign a score from **1 to 5** for each dimension:
116
+ - **1**: Very poor quality, completely fails to meet the criteria.
117
+ - **2**: Below average, significant issues present.
118
+ - **3**: Acceptable, meets the basic criteria with minor issues.
119
+ - **4**: Good, performs well with no major issues.
120
+ - **5**: Excellent, fully meets or exceeds expectations.
121
+ 2. Provide a comprehensive evaluation for each dimension.
122
+ 3. Format your output in **JSON**
123
+
124
+ ### JSON Output Format
125
+ ```json
126
+ {{
127
+ "overall_analysis": "[Provide a general assessment of the image's quality]",
128
+ "evaluation": {{
129
+ "visual_relevance": {{
130
+ "comprehensive_evaluation": "[Analysis of visual relevance]",
131
+ "score": [1-5]
132
+ }},
133
+ "element_layout": {{
134
+ "comprehensive_evaluation": "[Analysis of element layout]",
135
+ "score": [1-5]
136
+ }}
137
+ }}
138
+ }}
139
+ ```
140
+
141
+ Description of the theorem:
142
+ {description}
143
+
144
+ Image:"""
145
+
eval_suite/prompts_raw/fix_transcript.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ You are an expert in YouTube video transcripts. There is a transcript that was automatically generated through YouTube, so it lacks proper capitalization and punctuation. Your task is to fix the transcript so that there is proper punctuation, capitalization, and spacing. Do not make other modifications (e.g., keep the original word choice).
2
+
3
+ You should enclose the fixed transcript with a <SCRIPT></SCRIPT> block, i.e.:
4
+ <SCRIPT>
5
+ (Fixed transcript here)
6
+ </SCRIPT>
7
+
8
+ Original transcript: {transcript}
eval_suite/prompts_raw/image_eval.txt ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Task: Video Frame Quality Evaluation
2
+
3
+ You are tasked with analyzing and scoring a frame taken from a theorem explanation video. Note that you may not have the context of the video, so the captured frame may be a frame where some motion of visual elements is taking place. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
4
+
5
+ ## Evaluation Criteria
6
+
7
+ 1. **Visual Relevance**
8
+ - Does the video frame align with the theorem's concepts and derivations?
9
+
10
+ 2. **Element Layout**
11
+ - Placement and Size: Are the visual elements well-placed and appropriately sized within the frame?
12
+ - Overlap: Are the visual elements free of unintentional overlap?
13
+ - Clarity: Is the visual information conveyed in the frame clear and easy to understand?
14
+
15
+ ## Scoring Instructions
16
+ 1. Assign a score from **1 to 5** for each dimension:
17
+ - **1**: Very poor quality, completely fails to meet the criteria.
18
+ - **2**: Below average, significant issues present.
19
+ - **3**: Acceptable, meets the basic criteria with minor issues.
20
+ - **4**: Good, performs well with no major issues.
21
+ - **5**: Excellent, fully meets or exceeds expectations.
22
+ 2. Provide a comprehensive evaluation for each dimension.
23
+ 3. Format your output in **JSON**
24
+
25
+ ### JSON Output Format
26
+ ```json
27
+ {{
28
+ "overall_analysis": "[Provide a general assessment of the image's quality]",
29
+ "evaluation": {{
30
+ "visual_relevance": {{
31
+ "comprehensive_evaluation": "[Analysis of visual relevance]",
32
+ "score": [1-5]
33
+ }},
34
+ "element_layout": {{
35
+ "comprehensive_evaluation": "[Analysis of element layout]",
36
+ "score": [1-5]
37
+ }}
38
+ }}
39
+ }}
40
+ ```
41
+
42
+ Description of the theorem:
43
+ {description}
44
+
45
+ Image:
eval_suite/prompts_raw/text_eval_new.txt ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a specialist in evaluating theorem explanation videos, known for giving clear and objective feedback. You will be given the transcript of a video. Your task is to evaluate and score the content of the video in several dimensions.
2
+
3
+ ### Task Objective
4
+ 1. Perform an overall analysis of the video.
5
+ * Identify the topic of the video.
6
+ * Note your general thoughts and impression of the video, and any findings and observations.
7
+ 2. Conduct a comprehensive evaluation and score each criterion in the given dimensions.
8
+ * Analyze how well or poorly the video meets each criterion.
9
+ * Assign a score from **1 to 5** for each dimension:
10
+ - **1**: Very poor quality, completely fails to meet the criteria.
11
+ - **2**: Below average, significant issues present.
12
+ - **3**: Acceptable, meets the basic criteria with minor issues.
13
+ - **4**: Good, performs well with no major issues.
14
+ - **5**: Excellent, fully meets or exceeds expectations.
15
+ 3. Output the results in the specified JSON format.
16
+
17
+ ### Evaluation Criteria
18
+ 1. **Accuracy and Depth**
19
+ - Does the narration explain the theorem accurately?
20
+ - Does the video provide intuitive and/or rigorous explanations for why the theorem holds?
21
+ 2. **Logical Flow**
22
+ - Does the video follow a clear and logical structure?
23
+ - Does the video present a coherent buildup of ideas?
24
+
25
+ ### Notes
26
+ * You do not have access to the visual portion of the video as you are given only the textual portion. Do not reference or commentate on the visuals as they will be evaluated separately - just assume that there are reasonable visuals (e.g., geometric objects, graphs of functions, and calculations) to accompany the narration.
27
+ * The evaluation criteria are intended to be independent of each other. Do not restate the same violation in multiple criteria; only consider it in the most relevant criterion.
28
+
29
+ ### Output Format
30
+ ```json
31
+ {{
32
+ "overall_analysis": "[Overall analysis]",
33
+ "evaluation": {{
34
+ "accuracy_and_depth": {{
35
+ "comprehensive_evaluation": "[Analysis of accuracy and depth]",
36
+ "score": [1-5]
37
+ }},
38
+ "logical_flow": {{
39
+ "comprehensive_evaluation": "[Analysis of logical flow]",
40
+ "score": [1-5]
41
+ }}
42
+ }}
43
+ }}
44
+ ```
45
+
46
+ The transcript of the video is as follows:
47
+ {transcript}
eval_suite/prompts_raw/video_eval_new.txt ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Task: Video Frame Quality Evaluation
2
+
3
+ You are tasked with analyzing and scoring a chunk of a theorem explanation video. Note that you may not have the full context of the video. Your job is to assign a score from 1 to 5 for each criterion. Please provide a brief justification for your scores.
4
+
5
+ ## Evaluation Criteria
6
+
7
+ 1. **Visual Consistency**
8
+ - Style Consistency: Does the visual style remain consistent across frames?
9
+ - Smoothness: Are the motions and transitions smooth?
10
+
11
+ ## Scoring Instructions
12
+ 1. Assign a score from **1 to 5** for each dimension:
13
+ - **1**: Very poor quality, completely fails to meet the criteria.
14
+ - **2**: Below average, significant issues present.
15
+ - **3**: Acceptable, meets the basic criteria with minor issues.
16
+ - **4**: Good, performs well with no major issues.
17
+ - **5**: Excellent, fully meets or exceeds expectations.
18
+ 2. Provide a comprehensive evaluation for each dimension.
19
+ 3. Format your output in **JSON**
20
+
21
+ ### JSON Output Format
22
+ ```json
23
+ {{
24
+ "overall_analysis": "[Provide a general assessment of the video's quality]",
25
+ "evaluation": {{
26
+ "visual_consistency": {{
27
+ "comprehensive_evaluation": "[Analysis of visual consistency]",
28
+ "score": [1-5]
29
+ }}
30
+ }}
31
+ }}
32
+ ```
33
+
34
+ Description of the theorem:
35
+ {description}
36
+
37
+ Video chunk:
eval_suite/text_utils.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
+ import pysrt
4
+
5
+ from mllm_tools.litellm import LiteLLMWrapper
6
+ from mllm_tools.gemini import GeminiWrapper
7
+ from mllm_tools.utils import _prepare_text_inputs
8
+ from eval_suite.prompts_raw import _fix_transcript, _text_eval_new
9
+ from eval_suite.utils import extract_json, convert_score_fields
10
+
11
+
12
+ def parse_srt_to_text(srt_path) -> str:
13
+ """
14
+ Parse an SRT subtitle file into plain text.
15
+
16
+ Args:
17
+ srt_path: Path to the SRT subtitle file.
18
+
19
+ Returns:
20
+ str: The subtitle text with duplicates removed and ellipses replaced.
21
+ """
22
+ subs = pysrt.open(srt_path)
23
+ full_text = []
24
+ for sub in subs:
25
+ sub.text = sub.text.replace("...", ".")
26
+ for line in sub.text.splitlines():
27
+ # .srt can contain repeated lines
28
+ if full_text and full_text[-1] == line:
29
+ continue
30
+ full_text.append(line)
31
+ return "\n".join(full_text)
32
+
33
+
34
+ def fix_transcript(text_eval_model: Union[LiteLLMWrapper, GeminiWrapper], transcript: str) -> str:
35
+ """
36
+ Fix and clean up a transcript using an LLM model.
37
+
38
+ Args:
39
+ text_eval_model: The LLM model wrapper to use for fixing the transcript.
40
+ transcript: The input transcript text to fix.
41
+
42
+ Returns:
43
+ str: The fixed and cleaned transcript text.
44
+ """
45
+ print("Fixing transcript...")
46
+
47
+ prompt = _fix_transcript.format(transcript=transcript)
48
+ response = text_eval_model(_prepare_text_inputs(prompt))
49
+ fixed_script = response.split("<SCRIPT>", maxsplit=1)[1].split("</SCRIPT>")[0]
50
+
51
+ return fixed_script
52
+
53
+
54
+ def evaluate_text(text_eval_model: LiteLLMWrapper, transcript: str, retry_limit: int) -> dict:
55
+ """
56
+ Evaluate transcript text using an LLM model with retry logic.
57
+
58
+ Args:
59
+ text_eval_model: The LLM model wrapper to use for evaluation.
60
+ transcript: The transcript text to evaluate.
61
+ retry_limit: Maximum number of retry attempts on failure.
62
+
63
+ Returns:
64
+ dict: The evaluation results as a JSON object.
65
+
66
+ Raises:
67
+ ValueError: If all retry attempts fail.
68
+ """
69
+ # prompt = _text_eval.format(transcript=transcript)
70
+ prompt = _text_eval_new.format(transcript=transcript)
71
+ for attempt in range(retry_limit):
72
+ try:
73
+ evaluation = text_eval_model(_prepare_text_inputs(prompt))
74
+ evaluation_json = extract_json(evaluation)
75
+ evaluation_json = convert_score_fields(evaluation_json)
76
+ return evaluation_json
77
+ except Exception as e:
78
+ print(f"Attempt {attempt + 1} failed: {e.__class__.__name__}: {e}")
79
+ if attempt + 1 == retry_limit:
80
+ raise ValueError("Reached maximum retry limit. Evaluation failed.") from None
eval_suite/utils.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from math import prod
4
+ from typing import List
5
+
6
+ def extract_json(response: str) -> dict:
7
+ """
8
+ Extract JSON content from a string response.
9
+
10
+ Args:
11
+ response (str): String containing JSON content, possibly within code blocks.
12
+
13
+ Returns:
14
+ dict: Extracted and parsed JSON content.
15
+
16
+ Raises:
17
+ ValueError: If no valid JSON content could be extracted.
18
+ """
19
+ try:
20
+ evaluation_json = json.loads(response)
21
+ except json.JSONDecodeError:
22
+ # If JSON parsing fails, try to extract the content between ```json and ```
23
+ match = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
24
+ if not match:
25
+ # If no match for ```json, try to extract content between ``` and ```
26
+ match = re.search(r'```\n(.*?)\n```', response, re.DOTALL)
27
+
28
+ if match:
29
+ evaluation_content = match.group(1)
30
+ evaluation_json = json.loads(evaluation_content)
31
+ else:
32
+ raise ValueError("Failed to extract valid JSON content")
33
+ return evaluation_json
34
+
35
+
36
+ def convert_score_fields(data: dict) -> dict:
37
+ """
38
+ Convert score fields in a dictionary to integers recursively.
39
+
40
+ Args:
41
+ data (dict): Dictionary containing score fields to convert.
42
+
43
+ Returns:
44
+ dict: Dictionary with score fields converted to integers.
45
+
46
+ Raises:
47
+ ValueError: If a score value cannot be converted to integer.
48
+ """
49
+ # Create a new dictionary with the converted values
50
+ converted_data = {}
51
+ for key, value in data.items():
52
+ if key == "score":
53
+ if isinstance(value, int):
54
+ converted_data[key] = value
55
+ elif isinstance(value, str) and value.isdigit():
56
+ converted_data[key] = int(value)
57
+ else:
58
+ raise ValueError(f"Invalid score value: {value!r}")
59
+ elif isinstance(value, dict):
60
+ converted_data[key] = convert_score_fields(value)
61
+ else:
62
+ converted_data[key] = value
63
+ return converted_data
64
+
65
+
66
+ def calculate_geometric_mean(scores: List[int]) -> float:
67
+ """
68
+ Calculate the geometric mean of a list of scores.
69
+
70
+ Args:
71
+ scores (List[int]): List of integer scores, may contain None values.
72
+
73
+ Returns:
74
+ float: Geometric mean of non-None scores. Returns 0.0 if list is empty
75
+ or contains only None values.
76
+ """
77
+ scores = [s for s in scores if s is not None]
78
+ if not scores:
79
+ return 0.0
80
+ product = prod(scores)
81
+ return product ** (1 / len(scores))
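A small usage sketch for these helpers (illustrative only, not part of the repository), showing a fenced JSON response being parsed, a score field coerced to an integer, and a geometric mean computed:

```python
from eval_suite.utils import extract_json, convert_score_fields, calculate_geometric_mean

raw = '```json\n{"evaluation": {"visual_relevance": {"score": "4"}}}\n```'
parsed = convert_score_fields(extract_json(raw))
print(parsed["evaluation"]["visual_relevance"]["score"])  # 4 (as int)
print(calculate_geometric_mean([4, 5, 3]))                # ~3.91
```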
eval_suite/video_utils.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import cv2
3
+ import tempfile
4
+
5
+ from dotenv import load_dotenv
6
+
7
+ from mllm_tools.utils import _prepare_text_video_inputs
8
+ from eval_suite.prompts_raw import _video_eval_new
9
+ from eval_suite.utils import extract_json, convert_score_fields
10
+
11
+ load_dotenv()
12
+
13
+
14
+ def reduce_video_framerate(input_path, target_fps=1, output_path=None):
15
+ """
16
+ Reduces the frame rate of a video by only keeping frames at the target interval.
17
+
18
+ Args:
19
+ input_path (str): Path to the input video
20
+ target_fps (int): Target frames per second (default: 1)
21
+ output_path (str, optional): Path to save the processed video. If None, uses a temporary file.
22
+
23
+ Returns:
24
+ str: Path to the processed video
25
+
26
+ Raises:
27
+ ValueError: If input video cannot be opened or has invalid FPS
28
+ RuntimeError: If video writer initialization fails or output video creation fails
29
+ """
30
+ cap = cv2.VideoCapture(input_path)
31
+ if not cap.isOpened():
32
+ raise ValueError(f"Could not open input video: {input_path}")
33
+
34
+ original_fps = cap.get(cv2.CAP_PROP_FPS)
35
+ if original_fps <= 0:
36
+ raise ValueError(f"Invalid FPS ({original_fps}) detected in input video")
37
+
38
+ frame_interval = max(1, int(original_fps / target_fps))  # at least 1, to avoid modulo-by-zero when original_fps < target_fps
39
+
40
+ # Get video properties
41
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
42
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
43
+
44
+ # Use provided output path or create temporary file
45
+ if output_path is None:
46
+ temp_output = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
47
+ output_path = temp_output.name
48
+
49
+ # Ensure output directory exists
50
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
51
+
52
+ # Try different codecs in order of preference
53
+ codecs = [
54
+ ('avc1', '.mp4'), # H.264 codec
55
+ ('mp4v', '.mp4'), # MP4V codec
56
+ ('XVID', '.avi'), # XVID codec
57
+ ('MJPG', '.avi'), # Motion JPEG codec
58
+ ]
59
+
60
+ success = False
61
+ for codec, ext in codecs:
62
+ if output_path.endswith('.mp4') and not ext.endswith('.mp4'):
63
+ # If we're switching to AVI format, change the extension
64
+ output_path = output_path[:-4] + ext
65
+
66
+ fourcc = cv2.VideoWriter_fourcc(*codec)
67
+ out = cv2.VideoWriter(output_path, fourcc, target_fps, (width, height))
68
+
69
+ if out.isOpened():
70
+ success = True
71
+ print(f"Successfully initialized video writer with codec: {codec}")
72
+ break
73
+ else:
74
+ out.release()
75
+ if os.path.exists(output_path):
76
+ os.remove(output_path)
77
+
78
+ if not success:
79
+ raise RuntimeError("Could not initialize video writer with any available codec")
80
+
81
+ frame_count = 0
82
+ frames_written = 0
83
+ while cap.isOpened():
84
+ ret, frame = cap.read()
85
+ if not ret:
86
+ break
87
+
88
+ # Only write frames at the specified interval
89
+ if frame_count % frame_interval == 0:
90
+ out.write(frame)
91
+ frames_written += 1
92
+ frame_count += 1
93
+
94
+ cap.release()
95
+ out.release()
96
+
97
+ # Verify the output
98
+ verify_cap = cv2.VideoCapture(output_path)
99
+ if not verify_cap.isOpened():
100
+ raise RuntimeError(f"Failed to create output video at {output_path}")
101
+
102
+ actual_fps = verify_cap.get(cv2.CAP_PROP_FPS)
103
+ total_frames = verify_cap.get(cv2.CAP_PROP_FRAME_COUNT)
104
+ verify_cap.release()
105
+
106
+ if actual_fps <= 0:
107
+ print("Warning: Output video reports invalid FPS. This might be a codec issue.")
108
+ actual_fps = target_fps # Use target FPS for duration calculation
109
+
110
+ print(f"Created video with {frames_written} frames at {actual_fps} FPS")
111
+ print(f"Total duration: {total_frames/actual_fps:.2f} seconds")
112
+ print(f"Video saved to: {output_path}")
113
+
114
+ return output_path
115
+
116
+
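A hedged usage sketch of reduce_video_framerate (paths are placeholders): downsampling a rendered chunk to 1 FPS before sending it to the evaluator keeps the payload small.

    reduced = reduce_video_framerate(
        "output/scene1/chunk_1.mp4",                       # placeholder input path
        target_fps=1,
        output_path="output/processed/chunk_1_1fps.mp4",   # placeholder output path
    )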
117
+ def evaluate_video_chunk_new(model, video_path, transcript="No transcript provided", description="No description provided",
118
+ save_processed_video=None, target_fps=None, retry_limit=5):
119
+ """
120
+ Evaluate a single video chunk using a multimodal model.
121
+
122
+ Args:
123
+ model: The multimodal model to use for evaluation
124
+ video_path (str): Path to the video file to evaluate
125
+ transcript (str, optional): Video transcript text. Defaults to "No transcript provided"
126
+ description (str, optional): Video description text. Defaults to "No description provided"
127
+ save_processed_video (str, optional): Path to save processed video. If None, uses temporary file
128
+ target_fps (int, optional): Target frames per second for video processing. If None, no processing
129
+ retry_limit (int, optional): Maximum number of retry attempts. Defaults to 5
130
+
131
+ Returns:
132
+ dict: Evaluation results as a JSON object with scores converted to integers
133
+
134
+ Raises:
135
+ FileNotFoundError: If video file does not exist
136
+ Exception: If evaluation fails after all retry attempts
137
+ """
138
+ if not os.path.exists(video_path):
139
+ raise FileNotFoundError(f"Video file not found: {video_path}")
140
+
141
+ # Only process video if target_fps is specified
142
+ if target_fps is not None:
143
+ processed_video_path = reduce_video_framerate(video_path, target_fps=target_fps, output_path=save_processed_video)
144
+ video_to_use = processed_video_path
145
+ else:
146
+ video_to_use = video_path
147
+
148
+ prompt = _video_eval_new.format(description=description)
149
+ inputs = _prepare_text_video_inputs(prompt, video_to_use)
150
+
151
+ try:
152
+ for attempt in range(retry_limit):
153
+ try:
154
+ response = model(inputs)
155
+ response_json = extract_json(response)
156
+ response_json = convert_score_fields(response_json)
157
+
158
+ return response_json
159
+ except Exception as e:
160
+ print(f"Attempt {attempt + 1} failed: {e}")
161
+ if attempt + 1 == retry_limit:
162
+ print("Reached maximum retry limit. Evaluation failed.")
163
+ raise
164
+ finally:
165
+ # Clean up the temporary processed video if we created one
166
+ if target_fps is not None and save_processed_video is None and os.path.exists(processed_video_path):
167
+ os.unlink(processed_video_path)
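A sketch of how the chunk evaluator above might be invoked, assuming a GeminiWrapper-style model as used in evaluate.py and assuming the model returns the expected JSON schema (paths and the description are placeholders):

    from mllm_tools.gemini import GeminiWrapper

    model = GeminiWrapper(model_name="gemini/gemini-1.5-pro-002", temperature=0.0)
    result = evaluate_video_chunk_new(
        model,
        "output/chunks/chunk_1.mp4",          # placeholder chunk path
        description="Pythagorean Theorem",
        target_fps=1,
    )
    print(result["evaluation"])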
evaluate.py ADDED
@@ -0,0 +1,474 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import tempfile
5
+ from typing import Dict, List, Union
6
+ from datetime import datetime
7
+
8
+ from dotenv import load_dotenv
9
+ from moviepy import VideoFileClip
10
+
11
+ from mllm_tools.litellm import LiteLLMWrapper
12
+ from mllm_tools.gemini import GeminiWrapper
13
+ from eval_suite.utils import calculate_geometric_mean
14
+ from eval_suite.text_utils import parse_srt_to_text, fix_transcript, evaluate_text
15
+ from eval_suite.video_utils import evaluate_video_chunk_new
16
+ from eval_suite.image_utils import evaluate_sampled_images
17
+
18
+ load_dotenv()
19
+
20
+ with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "src", "utils", "allowed_models.json")) as f:
21
+ ALLOWED_MODELS = json.load(f)["allowed_models"]
22
+
23
+
24
+ def combine_results(output_folder: str, combined_file: str, results: Dict[str, Dict]) -> None:
25
+ """
26
+ Combine all evaluation results into a single file.
27
+
28
+ Args:
29
+ output_folder (str): Directory to store the combined file.
30
+ combined_file (str): Name of the combined file.
31
+ results (Dict[str, Dict]): Dictionary of evaluation results with file names as keys.
32
+
33
+ Returns:
34
+ None
35
+ """
36
+ combined_path = os.path.join(output_folder, combined_file)
37
+ with open(combined_path, 'w') as output_file:
38
+ json.dump(results, output_file, indent=4)
39
+
40
+
41
+ def save_individual_result(output_folder: str, file_name: str, result: Dict) -> None:
42
+ """
43
+ Save individual evaluation result to a file.
44
+
45
+ Args:
46
+ output_folder (str): Directory to store the evaluation file.
47
+ file_name (str): Name of the file.
48
+ result (Dict): Evaluation result.
49
+
50
+ Returns:
51
+ None
52
+ """
53
+ current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
54
+ result_file = f"evaluation_{file_name}_{current_time}.json"
55
+ os.makedirs(output_folder, exist_ok=True)
56
+ result_path = os.path.join(output_folder, result_file)
57
+ with open(result_path, 'w') as output_file:
58
+ json.dump(result, output_file, indent=4)
59
+
60
+
61
+ def evaluate_text_file(model, transcript_path, retry_limit):
62
+ """
63
+ Evaluate a text file using the provided model.
64
+
65
+ Args:
66
+ model: The model to use for evaluation.
67
+ transcript_path (str): Path to the transcript file (.srt or .txt).
68
+ retry_limit (int): Number of retry attempts for evaluation.
69
+
70
+ Returns:
71
+ Dict or None: Evaluation results if successful, None if file format unsupported.
72
+ """
73
+ if not transcript_path.endswith(('.srt', '.txt')):
74
+ print(f"Skipping {transcript_path}: Unsupported file format for text evaluation.")
75
+ return None
76
+
77
+ if transcript_path.endswith(".srt"):
78
+ transcript = parse_srt_to_text(transcript_path)
79
+ elif transcript_path.endswith(".txt"):
80
+ with open(transcript_path) as f:
81
+ transcript = f.read().strip()
82
+ else:
83
+ raise ValueError("Unrecognized transcript file format.")
84
+
85
+ alpha_count = sum(1 for c in transcript if c.isalpha())
+ capital_letter_proportion = (sum(1 for c in transcript if c.isupper()) / alpha_count) if alpha_count else 1.0  # avoid ZeroDivisionError on letter-free transcripts
86
+ if capital_letter_proportion < 0.01:
87
+ transcript = fix_transcript(model, transcript)
88
+
89
+ print(f"Performing text evaluation: {os.path.basename(transcript_path)}")
90
+ result = evaluate_text(model, transcript, retry_limit)
91
+ return result
92
+
93
+
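The capitalization check in evaluate_text_file only triggers fix_transcript when the transcript is essentially free of uppercase letters; a small sketch of the heuristic (the transcript string is illustrative):

    t = "the pythagorean theorem relates the sides of a right triangle"
    upper = sum(1 for c in t if c.isupper())
    alpha = sum(1 for c in t if c.isalpha())
    print(upper / alpha < 0.01)   # True, so this transcript would be re-cased by fix_transcript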
94
+ def evaluate_video_file(model, video_path, transcript_path, description_path, target_fps=None, output_folder=None):
95
+ """
96
+ Evaluate a video file using the provided model.
97
+
98
+ Args:
99
+ model: The model to use for evaluation.
100
+ video_path (str): Path to the video file.
101
+ transcript_path (str): Path to the transcript file.
102
+ description_path (str): Path to the description file.
103
+ target_fps (int, optional): Target frames per second for video processing.
104
+ output_folder (str, optional): Directory to store output files.
105
+
106
+ Returns:
107
+ Dict or None: Evaluation results if successful, None if file format unsupported.
108
+ """
109
+ if not video_path.endswith(('.mp4', '.mkv')):
110
+ print(f"Skipping {video_path}: Unsupported file format for video evaluation.")
111
+ return None
112
+
113
+ moviepy_temp_dir = os.path.join(output_folder or os.getcwd(), "moviepy_temp")
+ os.makedirs(moviepy_temp_dir, exist_ok=True)
114
+
115
+ # Chunking
116
+ num_chunks = 10
117
+ with VideoFileClip(video_path) as clip:
118
+ duration = clip.duration
119
+ chunk_duration = duration / num_chunks
120
+ results = []
121
+
122
+ # Create a temporary directory in the output_folder
123
+ temp_dir_parent = output_folder or os.getcwd()
124
+ with tempfile.TemporaryDirectory(dir=temp_dir_parent) as temp_dir:
125
+ for i in range(num_chunks):
126
+ start = i * chunk_duration
127
+ end = min(start + chunk_duration, duration)
128
+ chunk = clip.subclipped(start, end)
129
+ chunk_path = os.path.join(temp_dir, f"chunk_{i+1}.mp4")
130
+ # Explicitly set the temp_audiofile path with matching codec
131
+ temp_audiofile = os.path.join(moviepy_temp_dir, f"temp_audio_chunk_{i+1}.m4a")
132
+ chunk.write_videofile(
133
+ chunk_path,
134
+ codec="libx264",
135
+ audio_codec="aac",
136
+ temp_audiofile=temp_audiofile,
137
+ audio_bitrate="192k",
138
+ preset="ultrafast", # Speed up encoding
139
+ logger=None
140
+ )
141
+ # Create processed videos folder inside output_folder
142
+ processed_videos_dir = os.path.join(output_folder, "processed_videos")
143
+ save_path = os.path.join(processed_videos_dir, f"processed_chunk_{i+1}.mp4")
144
+ result = evaluate_video_chunk_new(
145
+ model,
146
+ chunk_path,
147
+ transcript_path,
148
+ description_path,
149
+ target_fps=target_fps,
150
+ save_processed_video=save_path
151
+ )
152
+ results.append(result)
153
+
154
+ score_dict = {}
155
+ for key in results[0]["evaluation"].keys():
156
+ score_dict[key] = []
157
+ for result in results:
158
+ score_dict[key].append(result["evaluation"][key]["score"])
159
+
160
+ evaluation = {}
161
+ for key, scores in score_dict.items():
162
+ evaluation[key] = {"score": calculate_geometric_mean(scores)}
163
+
164
+ result_json = {
165
+ "evaluation": evaluation,
166
+ "video_chunks": results
167
+ }
168
+ return result_json
169
+
170
+
171
+ def extract_scores(data: Union[Dict, List]) -> List[int]:
172
+ """
173
+ Extract all score values from a nested dictionary or list structure.
174
+
175
+ Args:
176
+ data (Union[Dict, List]): The data structure to extract scores from.
177
+
178
+ Returns:
179
+ List[int]: List of extracted score values.
180
+ """
181
+ scores = []
182
+ if isinstance(data, dict):
183
+ for key, value in data.items():
184
+ if "chunks" in key:
185
+ continue
186
+ elif isinstance(value, dict) or isinstance(value, list):
187
+ scores.extend(extract_scores(value))
188
+ elif key == 'score':
189
+ scores.append(value)
190
+ elif isinstance(data, list):
191
+ for item in data:
192
+ scores.extend(extract_scores(item))
193
+ return scores
194
+
195
+
196
+ def calculate_overall_score(result: Dict) -> float:
197
+ """
198
+ Calculate the overall score from evaluation results.
199
+
200
+ Args:
201
+ result (Dict): Dictionary containing evaluation results.
202
+
203
+ Returns:
204
+ float: The calculated overall score.
205
+ """
206
+ scores = extract_scores(result)
207
+ overall_score = calculate_geometric_mean(scores)
208
+ return overall_score
209
+
210
+
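An illustrative sketch of how scores are collected (keys are illustrative; anything under a key containing "chunks" is skipped, presumably so per-chunk scores are not double counted):

    result = {
        "evaluation": {"visual_consistency": {"score": 4}},
        "video_chunks": [{"evaluation": {"visual_consistency": {"score": 2}}}],
    }
    print(extract_scores(result))            # [4]
    print(calculate_overall_score(result))   # 4.0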
211
+ def process_topic_name(topic_name: str) -> str:
212
+ """
213
+ Process a topic name by capitalizing words and handling special characters.
214
+
215
+ Args:
216
+ topic_name (str): The topic name to process.
217
+
218
+ Returns:
219
+ str: The processed topic name.
220
+ """
221
+ words = topic_name.replace("_s_", "'s_").split("_")
222
+ return " ".join([word.capitalize() for word in words])
223
+
224
+
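For example (the topic string is illustrative):

    process_topic_name("pythagorean_s_theorem")   # -> "Pythagorean's Theorem"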
225
+ def merge_dicts(dict1: dict, dict2: dict) -> dict:
226
+ """
227
+ Recursively merge two dictionaries.
228
+
229
+ Args:
230
+ dict1 (dict): First dictionary.
231
+ dict2 (dict): Second dictionary.
232
+
233
+ Returns:
234
+ dict: Merged dictionary.
235
+ """
236
+ merged = dict1.copy()
237
+ for key, value in dict2.items():
238
+ if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
239
+ merged[key] = merge_dicts(merged[key], value)
240
+ else:
241
+ merged[key] = value
242
+ return merged
243
+
244
+
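A quick sketch of the recursive merge used below when combining text, video, and image results (keys are illustrative):

    a = {"evaluation": {"text_quality": {"score": 4}}}
    b = {"evaluation": {"visual_quality": {"score": 3}}}
    merge_dicts(a, b)   # {"evaluation": {"text_quality": {"score": 4}, "visual_quality": {"score": 3}}}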
245
+ def process_theorem(models, file_path: str, eval_type: str, retry_limit: int,
246
+ target_fps: int = None, use_parent_folder_as_topic: bool = False,
247
+ output_folder: str = None) -> tuple[str, dict]:
248
+ """
249
+ Process a theorem file or directory for evaluation.
250
+
251
+ Args:
252
+ models: Dictionary of models for different evaluation types.
253
+ file_path (str): Path to the file or directory to evaluate.
254
+ eval_type (str): Type of evaluation to perform.
255
+ retry_limit (int): Number of retry attempts.
256
+ target_fps (int, optional): Target frames per second for video processing.
257
+ use_parent_folder_as_topic (bool, optional): Use parent folder name as topic.
258
+ output_folder (str, optional): Directory to store output files.
259
+
260
+ Returns:
261
+ tuple[str, dict]: Tuple of file name and evaluation results.
262
+ """
263
+ ext_map = {
264
+ 'text': ('.txt', '.srt'),
265
+ 'video': ('.mp4', '.mkv')
266
+ }
267
+
268
+ # Handle single file evaluation
269
+ if os.path.isfile(file_path):
270
+ file_ext = os.path.splitext(file_path)[1].lower()
271
+ file_name = os.path.basename(file_path)
272
+
273
+ if eval_type == "text" and file_ext in ext_map['text']:
274
+ return file_name, evaluate_text_file(models['text'], file_path, retry_limit)
275
+ elif eval_type == "video" and file_ext in ext_map['video']:
276
+ if use_parent_folder_as_topic:
277
+ topic_name = os.path.basename(os.path.dirname(file_path))
278
+ else:
279
+ topic_name = None
280
+ topic_name = process_topic_name(topic_name) if topic_name else None
281
+ return file_name, evaluate_video_file(models['video'], file_path, None, topic_name, target_fps, output_folder)
282
+ elif eval_type == "image" and file_ext in ext_map['video']:
283
+ if use_parent_folder_as_topic:
284
+ topic_name = os.path.basename(os.path.dirname(file_path))
285
+ else:
286
+ topic_name = None
287
+ topic_name = process_topic_name(topic_name) if topic_name else None
288
+ return file_name, evaluate_sampled_images(models['image'], file_path, topic_name, num_chunks=10, output_folder=output_folder)
289
+ elif eval_type == "all":
290
+ raise ValueError("Evaluation type 'all' is not supported for a single file. Try passing a folder with both a video and a subtitle file.")
291
+ else:
292
+ raise ValueError(f"File type of {file_path} does not match evaluation type {eval_type!r}")
293
+
294
+ # Handle directory evaluation
295
+ theorem_dir = file_path
296
+ all_files = os.listdir(theorem_dir)
297
+
298
+ # Look for transcript files, prioritizing .srt over .txt if both exist
299
+ transcript_file_candidates = [f for f in all_files if f.endswith(ext_map['text']) and not f.endswith('_scene_outline.txt')]
300
+ srt_files = [f for f in transcript_file_candidates if f.endswith('.srt')]
301
+ txt_files = [f for f in transcript_file_candidates if f.endswith('.txt')]
302
+
303
+ transcript_path = None
304
+ if srt_files:
305
+ transcript_path = os.path.join(theorem_dir, srt_files[0])
306
+ elif txt_files:
307
+ transcript_path = os.path.join(theorem_dir, txt_files[0])
308
+
309
+ video_file_candidates = [f for f in all_files if f.endswith(ext_map['video'])]
310
+ video_path = os.path.join(theorem_dir, video_file_candidates[0]) if len(video_file_candidates) == 1 else None
311
+
312
+ topic_name = os.path.basename(theorem_dir)
313
+ topic_name = process_topic_name(topic_name)
314
+
315
+ if not video_path:
316
+ print(f"Skipping {theorem_dir}: No video file found")
317
+ return None, None
318
+
319
+ text_result = video_result = image_result = None
320
+ if eval_type == "text" or eval_type == "all":
321
+ if transcript_path is None:
322
+ print(f"Warning: No suitable transcript file found in {theorem_dir}")
323
+ else:
324
+ text_result = evaluate_text_file(models['text'], transcript_path, retry_limit)
325
+ if eval_type == "video" or eval_type == "all":
326
+ assert video_path is not None, f"Expected 1 video file, got {len(video_file_candidates)} for {theorem_dir}"
327
+ video_result = evaluate_video_file(models['video'], video_path, transcript_path, topic_name, target_fps, output_folder)
328
+ if eval_type == "image" or eval_type == "all":
329
+ assert video_path is not None, f"Expected 1 video file, got {len(video_file_candidates)} for {theorem_dir}"
330
+ image_result = evaluate_sampled_images(models['image'], video_path, topic_name, num_chunks=10, output_folder=output_folder)
331
+
332
+ if eval_type == "all":
333
+ result = {}
334
+ if text_result:
335
+ result = merge_dicts(result, text_result)
336
+ if video_result:
337
+ result = merge_dicts(result, video_result)
338
+ if image_result:
339
+ result = merge_dicts(result, image_result)
340
+ if result:
341
+ result["evaluation"]["overall_score"] = calculate_overall_score(result)
342
+ else:
343
+ result = text_result if eval_type == "text" else video_result if eval_type == "video" else image_result if eval_type == "image" else None
344
+
345
+ file_name = os.path.basename(theorem_dir)
346
+ return file_name, result
347
+
348
+
349
+ def main():
350
+ """
351
+ Main function to run the evaluation script.
352
+
353
+ Parses command line arguments and orchestrates the evaluation process
354
+ for text, video, and image content using specified AI models.
355
+ """
356
+ parser = argparse.ArgumentParser(description='Automatic evaluation of theorem explanation videos with LLMs')
357
+ parser.add_argument('--model_text', type=str,
358
+ choices=ALLOWED_MODELS,
359
+ default='azure/gpt-4o',
360
+ help='Select the AI model to use for text evaluation')
361
+ parser.add_argument('--model_video', type=str,
362
+ choices=['gemini/gemini-1.5-pro-002',
363
+ 'gemini/gemini-2.0-flash-exp',
364
+ 'gemini/gemini-2.0-pro-exp-02-05'],
365
+ default='gemini/gemini-1.5-pro-002',
366
+ help='Select the AI model to use for video evaluation')
367
+ parser.add_argument('--model_image', type=str,
368
+ choices=ALLOWED_MODELS,
369
+ default='azure/gpt-4o',
370
+ help='Select the AI model to use for image evaluation')
371
+ parser.add_argument('--eval_type', type=str, choices=['text', 'video', 'image', 'all'], default='all', help='Type of evaluation to perform')
372
+ parser.add_argument('--file_path', type=str, help='Path to a file or a theorem folder', required=True)
373
+ parser.add_argument('--output_folder', type=str, help='Directory to store the evaluation files', required=True)
374
+ parser.add_argument('--retry_limit', type=int, default=3, help='Number of retry attempts for each inference')
375
+ parser.add_argument('--combine', action='store_true', help='Combine all results into a single JSON file')
376
+ parser.add_argument('--bulk_evaluate', action='store_true', help='Evaluate a folder of theorems together', default=False)
377
+ parser.add_argument('--target_fps', type=int, help='Target FPS for video processing. If not set, original video FPS will be used', required=False)
378
+ parser.add_argument('--use_parent_folder_as_topic', action='store_true', help='Use parent folder name as topic name for single file evaluation', default=True)
379
+ parser.add_argument('--max_workers', type=int, default=4, help='Maximum number of concurrent workers for parallel processing')
380
+
381
+ args = parser.parse_args()
382
+
383
+ # Initialize separate models
384
+ text_model = LiteLLMWrapper(
385
+ model_name=args.model_text,
386
+ temperature=0.0,
387
+ )
388
+ video_model = GeminiWrapper(
389
+ model_name=args.model_video,
390
+ temperature=0.0,
391
+ )
392
+ image_model = LiteLLMWrapper(
393
+ model_name=args.model_image,
394
+ temperature=0.0,
395
+ )
396
+
397
+ models = {
398
+ 'text': text_model,
399
+ 'video': video_model,
400
+ 'image': image_model
401
+ }
402
+
403
+ theorem_dirs = []
404
+ if args.bulk_evaluate:
405
+ assert os.path.isdir(args.file_path), "File path must be a folder for --bulk_evaluate"
406
+ for root, dirnames, _ in os.walk(args.file_path):
407
+ if not any(f.endswith(".mp4") for f in os.listdir(root)):
408
+ continue
409
+
410
+ theorem_dirs.append(root)
411
+ elif os.path.isdir(args.file_path):
412
+ assert any(f.endswith(".mp4") for f in os.listdir(args.file_path)), "The provided folder must contain a video file"
413
+
414
+ theorem_dirs.append(args.file_path)
415
+
416
+ # Create output directory and its temp subdirectories if it doesn't exist
417
+ os.makedirs(args.output_folder, exist_ok=True)
418
+ moviepy_temp_dir = os.path.join(args.output_folder, "moviepy_temp")
419
+ os.makedirs(moviepy_temp_dir, exist_ok=True)
420
+ VideoFileClip.DEFAULT_TEMP_DIR = moviepy_temp_dir
421
+
422
+ processed_videos_dir = os.path.join(args.output_folder, "processed_videos")
423
+ os.makedirs(processed_videos_dir, exist_ok=True)
424
+
425
+ results = {}
426
+ if theorem_dirs:
427
+ for theorem_dir in theorem_dirs:
428
+ file_name, result = process_theorem(
429
+ models,
430
+ theorem_dir,
431
+ args.eval_type,
432
+ args.retry_limit,
433
+ args.target_fps,
434
+ args.use_parent_folder_as_topic,
435
+ args.output_folder
436
+ )
437
+
438
+ if result is not None:
439
+ results[file_name] = result
440
+
441
+ if not args.combine:
442
+ save_individual_result(args.output_folder, file_name, result)
443
+ else:
444
+ file_name, result = process_theorem(
445
+ models,
446
+ args.file_path,
447
+ args.eval_type,
448
+ args.retry_limit,
449
+ args.target_fps,
450
+ args.use_parent_folder_as_topic,
451
+ args.output_folder
452
+ )
453
+
454
+ if result is not None:
455
+ results[file_name] = result
456
+
457
+ if not args.combine:
458
+ save_individual_result(args.output_folder, file_name, result)
459
+
460
+ if args.combine:
461
+ if len(results) > 1:
462
+ current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
463
+ combined_file = f"evaluation_{current_time}.json"
464
+ combine_results(args.output_folder, combined_file, results)
465
+ print("Combining results completed.")
466
+ else:
467
+ for file_name, result in results.items():
468
+ save_individual_result(args.output_folder, file_name, result)
469
+
470
+ os.rmdir(moviepy_temp_dir)
471
+
472
+
473
+ if __name__ == "__main__":
474
+ main()
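A hedged command-line sketch using the flags defined above (paths are placeholders; the text and image models must appear in src/utils/allowed_models.json):

    python evaluate.py \
        --file_path output/pythagorean_theorem \
        --output_folder eval_results \
        --eval_type all \
        --model_text azure/gpt-4o \
        --model_video gemini/gemini-1.5-pro-002 \
        --target_fps 1 \
        --combine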
generate_video.py ADDED
@@ -0,0 +1,954 @@
1
+ import os
2
+ import json
3
+ import random
4
+ from typing import Union, List, Dict, Optional
5
+ import subprocess
6
+ import argparse
7
+ import glob
8
+ from PIL import Image
9
+ import re
10
+ from dotenv import load_dotenv
11
+ import asyncio
12
+ import uuid # Import uuid for generating trace_id
13
+
14
+ from mllm_tools.litellm import LiteLLMWrapper
15
+ from mllm_tools.utils import _prepare_text_inputs # Keep _prepare_text_inputs if still used directly in main
16
+
17
+ # Import new modules
18
+ from src.core.video_planner import VideoPlanner
19
+ from src.core.code_generator import CodeGenerator
20
+ from src.core.video_renderer import VideoRenderer
21
+ from src.utils.utils import _print_response, _extract_code, extract_xml # Import utility functions
22
+ from src.config.config import Config # Import Config class
23
+
24
+ # Video parsing
25
+ from src.core.parse_video import (
26
+ get_images_from_video,
27
+ image_with_most_non_black_space
28
+ )
29
+ from task_generator import get_banned_reasonings
30
+ from task_generator.prompts_raw import (_code_font_size, _code_disable, _code_limit, _prompt_manim_cheatsheet)
31
+
32
+ # Load allowed models list from JSON file
33
+ allowed_models_path = os.path.join(os.path.dirname(__file__), 'src', 'utils', 'allowed_models.json')
34
+ with open(allowed_models_path, 'r') as f:
35
+ allowed_models = json.load(f).get("allowed_models", [])
36
+
37
+ load_dotenv(override=True)
38
+
39
+ class VideoGenerator:
40
+ """
41
+ A class for generating manim videos using AI models.
42
+
43
+ This class coordinates the video generation pipeline by managing scene planning,
44
+ code generation, and video rendering. It supports concurrent scene processing,
45
+ visual code fixing, and RAG (Retrieval Augmented Generation).
46
+
47
+ Args:
48
+ planner_model: Model used for scene planning and high-level decisions
49
+ scene_model: Model used specifically for scene generation (defaults to planner_model)
50
+ helper_model: Helper model for additional tasks (defaults to planner_model)
51
+ output_dir (str): Directory to store generated files and videos
52
+ verbose (bool): Whether to print detailed output
53
+ use_rag (bool): Whether to use Retrieval Augmented Generation
54
+ use_context_learning (bool): Whether to use context learning with example code
55
+ context_learning_path (str): Path to context learning examples
56
+ chroma_db_path (str): Path to ChromaDB for RAG
57
+ manim_docs_path (str): Path to Manim documentation for RAG
58
+ embedding_model (str): Model to use for embeddings
59
+ use_visual_fix_code (bool): Whether to use visual feedback for code fixing
60
+ use_langfuse (bool): Whether to enable Langfuse logging
61
+ trace_id (str, optional): Trace ID for logging
62
+ max_scene_concurrency (int): Maximum number of scenes to process concurrently
63
+
64
+ Attributes:
65
+ output_dir (str): Directory for output files
66
+ verbose (bool): Verbosity flag
67
+ use_visual_fix_code (bool): Visual code fixing flag
68
+ session_id (str): Unique session identifier
69
+ scene_semaphore (asyncio.Semaphore): Controls concurrent scene processing
70
+ banned_reasonings (list): List of banned reasoning patterns
71
+ planner (VideoPlanner): Handles scene planning
72
+ code_generator (CodeGenerator): Handles code generation
73
+ video_renderer (VideoRenderer): Handles video rendering
74
+ """
75
+
76
+ def __init__(self,
77
+ planner_model,
78
+ scene_model=None,
79
+ helper_model=None,
80
+ output_dir="output",
81
+ verbose=False,
82
+ use_rag=False,
83
+ use_context_learning=False,
84
+ context_learning_path="data/context_learning",
85
+ chroma_db_path="data/rag/chroma_db",
86
+ manim_docs_path="data/rag/manim_docs",
87
+ embedding_model="azure/text-embedding-3-large",
88
+ use_visual_fix_code=False,
89
+ use_langfuse=True,
90
+ trace_id=None,
91
+ max_scene_concurrency: int = 5):
92
+ self.output_dir = output_dir
93
+ self.verbose = verbose
94
+ self.use_visual_fix_code = use_visual_fix_code
95
+ self.session_id = self._load_or_create_session_id() # Modified to load existing or create new
96
+ self.scene_semaphore = asyncio.Semaphore(max_scene_concurrency)
97
+ self.banned_reasonings = get_banned_reasonings()
98
+
99
+ # Initialize separate modules
100
+ self.planner = VideoPlanner(
101
+ planner_model=planner_model,
102
+ helper_model=helper_model,
103
+ output_dir=output_dir,
104
+ print_response=verbose,
105
+ use_context_learning=use_context_learning,
106
+ context_learning_path=context_learning_path,
107
+ use_rag=use_rag,
108
+ session_id=self.session_id,
109
+ chroma_db_path=chroma_db_path,
110
+ manim_docs_path=manim_docs_path,
111
+ embedding_model=embedding_model,
112
+ use_langfuse=use_langfuse
113
+ )
114
+ self.code_generator = CodeGenerator(
115
+ scene_model=scene_model if scene_model is not None else planner_model,
116
+ helper_model=helper_model if helper_model is not None else planner_model,
117
+ output_dir=output_dir,
118
+ print_response=verbose,
119
+ use_rag=use_rag,
120
+ use_context_learning=use_context_learning,
121
+ context_learning_path=context_learning_path,
122
+ chroma_db_path=chroma_db_path,
123
+ manim_docs_path=manim_docs_path,
124
+ embedding_model=embedding_model,
125
+ use_visual_fix_code=use_visual_fix_code,
126
+ use_langfuse=use_langfuse,
127
+ session_id=self.session_id
128
+ )
129
+ self.video_renderer = VideoRenderer(
130
+ output_dir=output_dir,
131
+ print_response=verbose,
132
+ use_visual_fix_code=use_visual_fix_code
133
+ )
134
+
135
+ def _load_or_create_session_id(self) -> str:
136
+ """
137
+ Load existing session ID from file or create a new one.
138
+
139
+ Returns:
140
+ str: The session ID either loaded from file or newly created.
141
+ """
142
+ session_file = os.path.join(self.output_dir, "session_id.txt")
143
+
144
+ if os.path.exists(session_file):
145
+ with open(session_file, 'r') as f:
146
+ session_id = f.read().strip()
147
+ print(f"Loaded existing session ID: {session_id}")
148
+ return session_id
149
+
150
+ # Create new session ID if none exists
151
+ session_id = str(uuid.uuid4())
152
+ os.makedirs(self.output_dir, exist_ok=True)
153
+ with open(session_file, 'w') as f:
154
+ f.write(session_id)
155
+ print(f"Created new session ID: {session_id}")
156
+ return session_id
157
+
158
+ def _save_topic_session_id(self, topic: str, session_id: str) -> None:
159
+ """
160
+ Save session ID for a specific topic.
161
+
162
+ Args:
163
+ topic (str): The topic to save the session ID for
164
+ session_id (str): The session ID to save
165
+ """
166
+ file_prefix = topic.lower()
167
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
168
+ topic_dir = os.path.join(self.output_dir, file_prefix)
169
+ os.makedirs(topic_dir, exist_ok=True)
170
+
171
+ session_file = os.path.join(topic_dir, "session_id.txt")
172
+ with open(session_file, 'w') as f:
173
+ f.write(session_id)
174
+
175
+ def _load_topic_session_id(self, topic: str) -> Optional[str]:
176
+ """
177
+ Load session ID for a specific topic if it exists.
178
+
179
+ Args:
180
+ topic (str): The topic to load the session ID for
181
+
182
+ Returns:
183
+ Optional[str]: The session ID if found, None otherwise
184
+ """
185
+ file_prefix = topic.lower()
186
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
187
+ session_file = os.path.join(self.output_dir, file_prefix, "session_id.txt")
188
+
189
+ if os.path.exists(session_file):
190
+ with open(session_file, 'r') as f:
191
+ return f.read().strip()
192
+ return None
193
+
194
+ def generate_scene_outline(self,
195
+ topic: str,
196
+ description: str,
197
+ session_id: str) -> str:
198
+ """
199
+ Generate scene outline using VideoPlanner.
200
+
201
+ Args:
202
+ topic (str): The topic of the video
203
+ description (str): Description of the video content
204
+ session_id (str): Session identifier for tracking
205
+
206
+ Returns:
207
+ str: Generated scene outline
208
+ """
209
+ return self.planner.generate_scene_outline(topic, description, session_id)
210
+
211
+ async def generate_scene_implementation(self,
212
+ topic: str,
213
+ description: str,
214
+ plan: str,
215
+ session_id: str) -> List[str]:
216
+ """
217
+ Generate scene implementations using VideoPlanner.
218
+
219
+ Args:
220
+ topic (str): The topic of the video
221
+ description (str): Description of the video content
222
+ plan (str): The scene plan to implement
223
+ session_id (str): Session identifier for tracking
224
+
225
+ Returns:
226
+ List[str]: List of generated scene implementations
227
+ """
228
+ return await self.planner.generate_scene_implementation(topic, description, plan, session_id)
229
+
230
+ async def generate_scene_implementation_concurrently(self,
231
+ topic: str,
232
+ description: str,
233
+ plan: str,
234
+ session_id: str) -> List[str]:
235
+ """
236
+ Generate scene implementations concurrently using VideoPlanner.
237
+
238
+ Args:
239
+ topic (str): The topic of the video
240
+ description (str): Description of the video content
241
+ plan (str): The scene plan to implement
242
+ session_id (str): Session identifier for tracking
243
+
244
+ Returns:
245
+ List[str]: List of generated scene implementations
246
+ """
247
+ return await self.planner.generate_scene_implementation_concurrently(topic, description, plan, session_id, self.scene_semaphore) # Pass semaphore
248
+
249
+ def load_implementation_plans(self, topic: str) -> Dict[int, Optional[str]]:
250
+ """
251
+ Load implementation plans for each scene.
252
+
253
+ Args:
254
+ topic (str): The topic to load implementation plans for
255
+
256
+ Returns:
257
+ Dict[int, Optional[str]]: Dictionary mapping scene numbers to their plans.
258
+ If a scene's plan is missing, its value will be None.
259
+ """
260
+ file_prefix = topic.lower()
261
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
262
+
263
+ # Load scene outline from file
264
+ scene_outline_path = os.path.join(self.output_dir, file_prefix, f"{file_prefix}_scene_outline.txt")
265
+ if not os.path.exists(scene_outline_path):
266
+ return {}
267
+
268
+ with open(scene_outline_path, "r") as f:
269
+ scene_outline = f.read()
270
+
271
+ # Extract scene outline to get number of scenes
272
+ scene_outline_content = extract_xml(scene_outline)
273
+ scene_number = len(re.findall(r'<SCENE_(\d+)>[^<]', scene_outline_content))
274
+ print(f"Number of scenes: {scene_number}")
275
+
276
+ implementation_plans = {}
277
+
278
+ # Check each scene's implementation plan
279
+ for i in range(1, scene_number + 1):
280
+ plan_path = os.path.join(self.output_dir, file_prefix, f"scene{i}", f"{file_prefix}_scene{i}_implementation_plan.txt")
281
+ if os.path.exists(plan_path):
282
+ with open(plan_path, "r") as f:
283
+ implementation_plans[i] = f.read()
284
+ print(f"Found existing implementation plan for scene {i}")
285
+ else:
286
+ implementation_plans[i] = None
287
+ print(f"Missing implementation plan for scene {i}")
288
+
289
+ return implementation_plans
290
+
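A small sketch of the scene-counting regex used above (the outline snippet is illustrative):

    import re
    outline = "<SCENE_1>\nIntro\n</SCENE_1>\n<SCENE_2>\nProof\n</SCENE_2>"
    len(re.findall(r'<SCENE_(\d+)>[^<]', outline))   # 2 (closing tags do not match)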
291
+ async def render_video_fix_code(self,
292
+ topic: str,
293
+ description: str,
294
+ scene_outline: str,
295
+ implementation_plans: List,
296
+ max_retries=3,
297
+ session_id: str = None) -> None:
298
+ """
299
+ Render the video for all scenes with code fixing capability.
300
+
301
+ Args:
302
+ topic (str): The topic of the video
303
+ description (str): Description of the video content
304
+ scene_outline (str): The overall scene outline
305
+ implementation_plans (List): List of implementation plans for each scene
306
+ max_retries (int, optional): Maximum number of code fix attempts. Defaults to 3.
307
+ session_id (str, optional): Session identifier for tracking
308
+ """
309
+ file_prefix = topic.lower()
310
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
311
+
312
+ # Create tasks for each scene
313
+ tasks = []
314
+ for i, implementation_plan in enumerate(implementation_plans):
315
+ # Try to load scene trace id, or generate new one if it doesn't exist
316
+ scene_dir = os.path.join(self.output_dir, file_prefix, f"scene{i+1}")
317
+ subplan_dir = os.path.join(scene_dir, "subplans")
318
+ os.makedirs(subplan_dir, exist_ok=True) # Create directories if they don't exist
319
+
320
+ scene_trace_id_path = os.path.join(subplan_dir, "scene_trace_id.txt")
321
+ try:
322
+ with open(scene_trace_id_path, 'r') as f:
323
+ scene_trace_id = f.read().strip()
324
+ except FileNotFoundError:
325
+ scene_trace_id = str(uuid.uuid4())
326
+ with open(scene_trace_id_path, 'w') as f:
327
+ f.write(scene_trace_id)
328
+
329
+ task = self.process_scene(i, scene_outline, implementation_plan, topic, description, max_retries, file_prefix, session_id, scene_trace_id)
330
+ tasks.append(task)
331
+
332
+ # Execute all tasks concurrently
333
+ await asyncio.gather(*tasks)
334
+
335
+ async def process_scene(self, i: int, scene_outline: str, scene_implementation: str, topic: str, description: str, max_retries: int, file_prefix: str, session_id: str, scene_trace_id: str): # added scene_trace_id
336
+ """
337
+ Process a single scene using CodeGenerator and VideoRenderer.
338
+
339
+ Args:
340
+ i (int): Scene index
341
+ scene_outline (str): Overall scene outline
342
+ scene_implementation (str): Implementation plan for this scene
343
+ topic (str): The topic of the video
344
+ description (str): Description of the video content
345
+ max_retries (int): Maximum number of code fix attempts
346
+ file_prefix (str): Prefix for file naming
347
+ session_id (str): Session identifier for tracking
348
+ scene_trace_id (str): Trace identifier for this scene
349
+ """
350
+ curr_scene = i + 1
351
+ curr_version = 0
352
+ # scene_trace_id = str(uuid.uuid4()) # Remove uuid generation
353
+ rag_queries_cache = {} # Initialize RAG queries cache
354
+
355
+ # Create necessary directories
356
+ code_dir = os.path.join(self.output_dir, file_prefix, f"scene{curr_scene}", "code")
357
+ os.makedirs(code_dir, exist_ok=True)
358
+ media_dir = os.path.join(self.output_dir, file_prefix, "media") # Define media_dir here
359
+
360
+ async with self.scene_semaphore:
361
+ # Step 3A: Generate initial manim code
362
+ code, log = self.code_generator.generate_manim_code(
363
+ topic=topic,
364
+ description=description,
365
+ scene_outline=scene_outline,
366
+ scene_implementation=scene_implementation,
367
+ scene_number=curr_scene,
368
+ additional_context=[_prompt_manim_cheatsheet, _code_font_size, _code_limit, _code_disable],
369
+ scene_trace_id=scene_trace_id, # Use passed scene_trace_id
370
+ session_id=session_id,
371
+ rag_queries_cache=rag_queries_cache # Pass the cache
372
+ )
373
+
374
+ # Save initial code and log (file operations can be offloaded if needed)
375
+ with open(os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}_init_log.txt"), "w") as f:
376
+ f.write(log)
377
+ with open(os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}.py"), "w") as f:
378
+ f.write(code)
379
+ print(f"Code saved to {code_dir}/{file_prefix}_scene{curr_scene}_v{curr_version}.py")
380
+
381
+ # Step 3B: Compile and fix code if needed
382
+ error_message = None
383
+ while True: # Retry loop controlled by break statements
384
+ code, error_message = await self.video_renderer.render_scene(
385
+ code=code,
386
+ file_prefix=file_prefix,
387
+ curr_scene=curr_scene,
388
+ curr_version=curr_version,
389
+ code_dir=code_dir,
390
+ media_dir=media_dir,
391
+ max_retries=max_retries, # Pass max_retries here if needed in render_scene
392
+ use_visual_fix_code=self.use_visual_fix_code,
393
+ visual_self_reflection_func=self.code_generator.visual_self_reflection, # Pass visual_self_reflection function
394
+ banned_reasonings=self.banned_reasonings, # Pass banned reasonings
395
+ scene_trace_id=scene_trace_id,
396
+ topic=topic,
397
+ session_id=session_id
398
+ )
399
+ if error_message is None: # Render success if error_message is None
400
+ break
401
+
402
+ if curr_version >= max_retries: # Max retries reached
403
+ print(f"Max retries reached for scene {curr_scene}, error: {error_message}")
404
+ break # Exit retry loop
405
+
406
+ curr_version += 1
407
+ # Reaching this point means the render failed, so ask the code generator to fix the errors
408
+ code, log = self.code_generator.fix_code_errors(
409
+ implementation_plan=scene_implementation,
410
+ code=code,
411
+ error=error_message,
412
+ scene_trace_id=scene_trace_id,
413
+ topic=topic,
414
+ scene_number=curr_scene,
415
+ session_id=session_id,
416
+ rag_queries_cache=rag_queries_cache
417
+ )
418
+
419
+ with open(os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}_fix_log.txt"), "w") as f:
420
+ f.write(log)
421
+ with open(os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}.py"), "w") as f:
422
+ f.write(code)
423
+
424
+ print(f"Code saved to {code_dir}/{file_prefix}_scene{curr_scene}_v{curr_version}.py")
425
+
426
+ def run_manim_process(self,
427
+ topic: str):
428
+ """
429
+ Run manim on all generated manim code for a specific topic using VideoRenderer.
430
+
431
+ Args:
432
+ topic (str): The topic to render videos for
433
+ """
434
+ return self.video_renderer.run_manim_process(topic)
435
+
436
+ def create_snapshot_scene(self, topic: str, scene_number: int, version_number: int, return_type: str = "image"):
437
+ """
438
+ Create a snapshot of the video for a specific topic and scene using VideoRenderer.
439
+
440
+ Args:
441
+ topic (str): The topic of the video
442
+ scene_number (int): Scene number to snapshot
443
+ version_number (int): Version number to snapshot
444
+ return_type (str, optional): Type of snapshot to return. Defaults to "image".
445
+
446
+ Returns:
447
+ The snapshot in the specified format
448
+ """
449
+ return self.video_renderer.create_snapshot_scene(topic, scene_number, version_number, return_type)
450
+
451
+ def combine_videos(self, topic: str):
452
+ """
453
+ Combine all videos and subtitle files for a specific topic using VideoRenderer.
454
+
455
+ Args:
456
+ topic (str): The topic to combine videos for
457
+ """
458
+ self.video_renderer.combine_videos(topic)
459
+
460
+ async def _generate_scene_implementation_single(self, topic: str, description: str, scene_outline_i: str, i: int, file_prefix: str, session_id: str, scene_trace_id: str) -> str:
461
+ """
462
+ Generate detailed implementation plan for a single scene using VideoPlanner.
463
+
464
+ Args:
465
+ topic (str): The topic of the video
466
+ description (str): Description of the video content
467
+ scene_outline_i (str): Outline for this specific scene
468
+ i (int): Scene index
469
+ file_prefix (str): Prefix for file naming
470
+ session_id (str): Session identifier for tracking
471
+ scene_trace_id (str): Trace identifier for this scene
472
+
473
+ Returns:
474
+ str: Generated implementation plan
475
+ """
476
+ return await self.planner._generate_scene_implementation_single(topic, description, scene_outline_i, i, file_prefix, session_id, scene_trace_id)
477
+
478
+ async def generate_video_pipeline(self, topic: str, description: str, max_retries: int, only_plan: bool = False, specific_scenes: List[int] = None):
479
+ """
480
+ Run the generation pipeline, handling partially completed scenes, with an option to only generate plans for specific scenes.
481
+
482
+ Args:
483
+ topic (str): The topic of the video
484
+ description (str): Description of the video content
485
+ max_retries (int): Maximum number of code fix attempts
486
+ only_plan (bool, optional): Whether to only generate plans without rendering. Defaults to False.
487
+ specific_scenes (List[int], optional): List of specific scenes to process. Defaults to None.
488
+ """
489
+ session_id = self._load_or_create_session_id()
490
+ self._save_topic_session_id(topic, session_id)
491
+
492
+ file_prefix = topic.lower()
493
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
494
+
495
+ # Load or generate scene outline
496
+ scene_outline_path = os.path.join(self.output_dir, file_prefix, f"{file_prefix}_scene_outline.txt")
497
+ if os.path.exists(scene_outline_path):
498
+ with open(scene_outline_path, "r") as f:
499
+ scene_outline = f.read()
500
+ print(f"Loaded existing scene outline for topic: {topic}")
501
+ if self.planner.use_rag:
502
+ self.planner.relevant_plugins = self.planner.rag_integration.detect_relevant_plugins(topic, description) or []
503
+ self.planner.rag_integration.set_relevant_plugins(self.planner.relevant_plugins)
504
+ print(f"Detected relevant plugins: {self.planner.relevant_plugins}")
505
+ else:
506
+ print(f"Generating new scene outline for topic: {topic}")
507
+ scene_outline = self.planner.generate_scene_outline(topic, description, session_id)
508
+ os.makedirs(os.path.join(self.output_dir, file_prefix), exist_ok=True)
509
+ with open(scene_outline_path, "w") as f:
510
+ f.write(scene_outline)
511
+
512
+ # Load or generate implementation plans
513
+ implementation_plans_dict = self.load_implementation_plans(topic)
514
+ if not implementation_plans_dict:
515
+ scene_outline_content = extract_xml(scene_outline)
516
+ scene_numbers = len(re.findall(r'<SCENE_(\d+)>[^<]', scene_outline_content))
517
+ implementation_plans_dict = {i: None for i in range(1, scene_numbers + 1)}
518
+
519
+ # Generate missing implementation plans for specified scenes or all missing scenes
520
+ missing_scenes = []
521
+ for scene_num, plan in implementation_plans_dict.items():
522
+ if plan is None and (specific_scenes is None or scene_num in specific_scenes):
523
+ missing_scenes.append(scene_num)
524
+
525
+ if missing_scenes:
526
+ print(f"Generating implementation plans for missing scenes: {missing_scenes}")
527
+ for scene_num in missing_scenes:
528
+ scene_outline_content = extract_xml(scene_outline)
529
+ scene_match = re.search(f'<SCENE_{scene_num}>(.*?)</SCENE_{scene_num}>', scene_outline_content, re.DOTALL)
530
+ if scene_match:
531
+ scene_outline_i = scene_match.group(1)
532
+ scene_trace_id = str(uuid.uuid4())
533
+ implementation_plan = await self._generate_scene_implementation_single(
534
+ topic, description, scene_outline_i, scene_num, file_prefix, session_id, scene_trace_id)
535
+ implementation_plans_dict[scene_num] = implementation_plan
536
+
537
+ if only_plan:
538
+ print(f"Only generating plans - skipping code generation and video rendering for topic: {topic}")
539
+ return
540
+
541
+ # Convert dictionary to list maintaining scene order
542
+ sorted_scene_numbers = sorted(implementation_plans_dict.keys())
543
+ implementation_plans = [implementation_plans_dict[i] for i in sorted_scene_numbers]
544
+
545
+ # Render scenes
546
+ print(f"Starting video rendering for topic: {topic}")
547
+
548
+ # Check which scenes need processing
549
+ scenes_to_process = []
550
+ for i, implementation_plan in enumerate(implementation_plans):
551
+ scene_dir = os.path.join(self.output_dir, file_prefix, f"scene{i+1}")
552
+ code_dir = os.path.join(scene_dir, "code")
553
+
554
+ # Check if scene has any code files
555
+ has_code = False
556
+ if os.path.exists(code_dir):
557
+ if any(f.endswith('.py') for f in os.listdir(code_dir)):
558
+ has_code = True
559
+
560
+ # For only_render mode, only process scenes without code
561
+ if args.only_render:  # note: relies on the module-level `args` parsed in the __main__ block
562
+ if not has_code:
563
+ scenes_to_process.append((i+1, implementation_plan))
564
+ print(f"Scene {i+1} has no code, will process")
565
+ else:
566
+ print(f"Scene {i+1} already has code, skipping")
567
+ # For normal mode, process scenes that haven't been successfully rendered
568
+ elif not os.path.exists(os.path.join(scene_dir, "succ_rendered.txt")):
569
+ scenes_to_process.append((i+1, implementation_plan))
570
+
571
+ if not scenes_to_process:
572
+ print(f"No scenes need processing for topic '{topic}'.")
573
+ else:
574
+ print(f"Rendering {len(scenes_to_process)} scenes that need processing...")
575
+ # Create a list of tuples with scene numbers and plans
576
+ scene_plans = [(scene_num, plan) for scene_num, plan in scenes_to_process]
577
+ # Sort by scene number to ensure correct order
578
+ scene_plans.sort(key=lambda x: x[0])
579
+ # Extract just the plans in the correct order
580
+ filtered_implementation_plans = [plan for _, plan in scene_plans]
581
+ await self.render_video_fix_code(topic, description, scene_outline, filtered_implementation_plans,
582
+ max_retries=max_retries, session_id=session_id)
583
+
584
+ if not args.only_render: # Skip video combination in only_render mode
585
+ print(f"Video rendering completed for topic '{topic}'.")
586
+
587
+ def check_theorem_status(self, theorem: Dict) -> Dict[str, bool]:
588
+ """
589
+ Check if a theorem has its plan, code files, and rendered videos with detailed scene status.
590
+
591
+ Args:
592
+ theorem (Dict): Dictionary containing theorem information
593
+
594
+ Returns:
595
+ Dict[str, bool]: Dictionary containing status information for the theorem
596
+ """
597
+ topic = theorem['theorem']
598
+ file_prefix = topic.lower()
599
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
600
+
601
+ # Check scene outline
602
+ scene_outline_path = os.path.join(self.output_dir, file_prefix, f"{file_prefix}_scene_outline.txt")
603
+ has_scene_outline = os.path.exists(scene_outline_path)
604
+
605
+ # Get number of scenes if outline exists
606
+ num_scenes = 0
607
+ if has_scene_outline:
608
+ with open(scene_outline_path, "r") as f:
609
+ scene_outline = f.read()
610
+ scene_outline_content = extract_xml(scene_outline)
611
+ num_scenes = len(re.findall(r'<SCENE_(\d+)>[^<]', scene_outline_content))
612
+
613
+ # Check implementation plans, code files, and rendered videos
614
+ implementation_plans = 0
615
+ code_files = 0
616
+ rendered_scenes = 0
617
+
618
+ # Track status of individual scenes
619
+ scene_status = []
620
+ for i in range(1, num_scenes + 1):
621
+ scene_dir = os.path.join(self.output_dir, file_prefix, f"scene{i}")
622
+
623
+ # Check implementation plan
624
+ plan_path = os.path.join(scene_dir, f"{file_prefix}_scene{i}_implementation_plan.txt")
625
+ has_plan = os.path.exists(plan_path)
626
+ if has_plan:
627
+ implementation_plans += 1
628
+
629
+ # Check code files
630
+ code_dir = os.path.join(scene_dir, "code")
631
+ has_code = False
632
+ if os.path.exists(code_dir):
633
+ if any(f.endswith('.py') for f in os.listdir(code_dir)):
634
+ has_code = True
635
+ code_files += 1
636
+
637
+ # Check rendered scene video
638
+ has_render = False
639
+ if os.path.exists(scene_dir):
640
+ succ_rendered_path = os.path.join(scene_dir, "succ_rendered.txt")
641
+ if os.path.exists(succ_rendered_path):
642
+ has_render = True
643
+ rendered_scenes += 1
644
+
645
+ scene_status.append({
646
+ 'scene_number': i,
647
+ 'has_plan': has_plan,
648
+ 'has_code': has_code,
649
+ 'has_render': has_render
650
+ })
651
+
652
+ # Check combined video
653
+ combined_video_path = os.path.join(self.output_dir, file_prefix, f"{file_prefix}_combined.mp4")
654
+ has_combined_video = os.path.exists(combined_video_path)
655
+
656
+ return {
657
+ 'topic': topic,
658
+ 'has_scene_outline': has_scene_outline,
659
+ 'total_scenes': num_scenes,
660
+ 'implementation_plans': implementation_plans,
661
+ 'code_files': code_files,
662
+ 'rendered_scenes': rendered_scenes,
663
+ 'has_combined_video': has_combined_video,
664
+ 'scene_status': scene_status
665
+ }
666
+
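A minimal usage sketch of the class above, assuming a LiteLLM-backed planner model as in the __main__ block below (the model name and topic are placeholders; check_theorem_status reads previously saved outputs under the output directory):

    planner = LiteLLMWrapper(model_name="gemini/gemini-1.5-pro-002", temperature=0.7)
    generator = VideoGenerator(planner_model=planner, output_dir="output")
    outline = generator.generate_scene_outline(
        "Pythagorean Theorem", "Explain the theorem visually", generator.session_id)
    status = generator.check_theorem_status({"theorem": "Pythagorean Theorem"})
    print(f"{status['rendered_scenes']}/{status['total_scenes']} scenes rendered")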
667
+ if __name__ == "__main__":
668
+ parser = argparse.ArgumentParser(description='Generate Manim videos using AI')
669
+ parser.add_argument('--model', type=str, choices=allowed_models,
670
+ default='gemini/gemini-1.5-pro-002', help='Select the AI model to use')
671
+ parser.add_argument('--topic', type=str, default=None, help='Topic to generate videos for')
672
+ parser.add_argument('--context', type=str, default=None, help='Context of the topic')
673
+ parser.add_argument('--helper_model', type=str, choices=allowed_models,
674
+ default=None, help='Select the helper model to use')
675
+ parser.add_argument('--only_gen_vid', action='store_true', help='Only generate videos to existing plans')
676
+ parser.add_argument('--only_combine', action='store_true', help='Only combine videos')
677
+ parser.add_argument('--peek_existing_videos', '--peek', action='store_true', help='Peek at existing videos')
678
+ parser.add_argument('--output_dir', type=str, default=Config.OUTPUT_DIR, help='Output directory') # Use Config
679
+ parser.add_argument('--theorems_path', type=str, default=None, help='Path to theorems json file')
680
+ parser.add_argument('--sample_size', '--sample', type=int, default=None, help='Number of theorems to sample')
681
+ parser.add_argument('--verbose', action='store_true', help='Print verbose output')
682
+ parser.add_argument('--max_retries', type=int, default=5, help='Maximum number of retries for code generation')
683
+ parser.add_argument('--use_rag', '--rag', action='store_true', help='Use Retrieval Augmented Generation')
684
+ parser.add_argument('--use_visual_fix_code','--visual_fix_code', action='store_true', help='Use VLM to fix code with rendered visuals')
685
+ parser.add_argument('--chroma_db_path', type=str, default=Config.CHROMA_DB_PATH, help="Path to Chroma DB") # Use Config
686
+ parser.add_argument('--manim_docs_path', type=str, default=Config.MANIM_DOCS_PATH, help="Path to manim docs") # Use Config
687
+ parser.add_argument('--embedding_model', type=str,
688
+ default=Config.EMBEDDING_MODEL, # Use Config
689
+ choices=["azure/text-embedding-3-large", "vertex_ai/text-embedding-005"],
690
+ help='Select the embedding model to use')
691
+ parser.add_argument('--use_context_learning', action='store_true',
692
+ help='Use context learning with example Manim code')
693
+ parser.add_argument('--context_learning_path', type=str,
694
+ default=Config.CONTEXT_LEARNING_PATH, # Use Config
695
+ help='Path to context learning examples')
696
+ parser.add_argument('--use_langfuse', action='store_true',
697
+ help='Enable Langfuse logging')
698
+ parser.add_argument('--max_scene_concurrency', type=int, default=1, help='Maximum number of scenes to process concurrently')
699
+ parser.add_argument('--max_topic_concurrency', type=int, default=1,
700
+ help='Maximum number of topics to process concurrently')
701
+ parser.add_argument('--debug_combine_topic', type=str, help='Debug combine videos', default=None)
702
+ parser.add_argument('--only_plan', action='store_true', help='Only generate scene outline and implementation plans')
703
+ parser.add_argument('--check_status', action='store_true',
704
+ help='Check planning and code status for all theorems')
705
+ parser.add_argument('--only_render', action='store_true', help='Only render scenes without combining videos')
706
+ parser.add_argument('--scenes', nargs='+', type=int, help='Specific scenes to process (if theorems_path is provided)')
707
+ args = parser.parse_args()
708
+
709
+ # Initialize planner model using LiteLLM
710
+ if args.verbose:
711
+ verbose = True
712
+ else:
713
+ verbose = False
714
+ planner_model = LiteLLMWrapper(
715
+ model_name=args.model,
716
+ temperature=0.7,
717
+ print_cost=True,
718
+ verbose=verbose,
719
+ use_langfuse=args.use_langfuse
720
+ )
721
+ helper_model = LiteLLMWrapper(
722
+ model_name=args.helper_model if args.helper_model else args.model, # Use helper_model if provided, else planner_model
723
+ temperature=0.7,
724
+ print_cost=True,
725
+ verbose=verbose,
726
+ use_langfuse=args.use_langfuse
727
+ )
728
+ scene_model = LiteLLMWrapper( # Initialize scene_model separately
729
+ model_name=args.model,
730
+ temperature=0.7,
731
+ print_cost=True,
732
+ verbose=verbose,
733
+ use_langfuse=args.use_langfuse
734
+ )
735
+ print(f"Planner model: {args.model}, Helper model: {args.helper_model if args.helper_model else args.model}, Scene model: {args.model}") # Print all models
736
+
737
+
738
+ if args.theorems_path:
739
+ # Load the sample theorems
740
+ with open(args.theorems_path, "r") as f:
741
+ theorems = json.load(f)
742
+
743
+ if args.sample_size:
744
+ theorems = theorems[:args.sample_size]
745
+
746
+ if args.peek_existing_videos:
747
+ print(f"Here's the results of checking whether videos are rendered successfully in {args.output_dir}:")
748
+ # In output_dir, count topic folders and how many of them contain a rendered {topic}_combined.mp4
749
+ successful_rendered_videos = 0
750
+ total_folders = 0
751
+ for item in os.listdir(args.output_dir):
752
+ if os.path.isdir(os.path.join(args.output_dir, item)):
753
+ total_folders += 1
754
+ if os.path.exists(os.path.join(args.output_dir, item, f"{item}_combined.mp4")):
755
+ successful_rendered_videos += 1
756
+ print(f"Number of successful rendered videos: {successful_rendered_videos}/{total_folders}")
757
+
758
+ # Also check for succ_rendered.txt in each scene folder and count how many scenes rendered successfully
759
+ successful_rendered_videos = 0
760
+ total_scenes = 0
761
+ for item in os.listdir(args.output_dir):
762
+ if os.path.isdir(os.path.join(args.output_dir, item)):
763
+ for scene_folder in os.listdir(os.path.join(args.output_dir, item)):
764
+ if "scene" in scene_folder and os.path.isdir(os.path.join(args.output_dir, item, scene_folder)):
765
+ total_scenes += 1
766
+ if os.path.exists(os.path.join(args.output_dir, item, scene_folder, "succ_rendered.txt")):
767
+ successful_rendered_videos += 1
768
+ print(f"Number of successful rendered scenes: {successful_rendered_videos}/{total_scenes}")
769
+ exit()
770
+
771
+ video_generator = VideoGenerator(
772
+ planner_model=planner_model,
773
+ scene_model=scene_model, # Pass scene_model
774
+ helper_model=helper_model, # Pass helper_model
775
+ output_dir=args.output_dir,
776
+ verbose=args.verbose,
777
+ use_rag=args.use_rag,
778
+ use_context_learning=args.use_context_learning,
779
+ context_learning_path=args.context_learning_path,
780
+ chroma_db_path=args.chroma_db_path,
781
+ manim_docs_path=args.manim_docs_path,
782
+ embedding_model=args.embedding_model,
783
+ use_visual_fix_code=args.use_visual_fix_code,
784
+ use_langfuse=args.use_langfuse,
785
+ max_scene_concurrency=args.max_scene_concurrency
786
+ )
787
+
788
+ if args.debug_combine_topic is not None:
789
+ video_generator.combine_videos(args.debug_combine_topic)
790
+ exit()
791
+
792
+ if args.only_gen_vid:
793
+ # Generate videos for existing plans
794
+ print("Generating videos for existing plans...")
795
+
796
+ async def process_theorem(theorem, topic_semaphore):
797
+ async with topic_semaphore:
798
+ topic = theorem['theorem']
799
+ print(f"Processing topic: {topic}")
800
+ await video_generator.render_video_fix_code(topic, theorem['description'], max_retries=args.max_retries)
801
+
802
+ async def main():
803
+ # Use the command-line argument for topic concurrency
804
+ topic_semaphore = asyncio.Semaphore(args.max_topic_concurrency)
805
+ tasks = [process_theorem(theorem, topic_semaphore) for theorem in theorems]
806
+ await asyncio.gather(*tasks)
807
+
808
+ asyncio.run(main())
809
+
810
+ elif args.check_status:
811
+ print("\nChecking theorem status...")
812
+ video_generator = VideoGenerator(
813
+ planner_model=planner_model,
814
+ scene_model=scene_model,
815
+ helper_model=helper_model,
816
+ output_dir=args.output_dir,
817
+ verbose=args.verbose,
818
+ use_rag=args.use_rag,
819
+ use_context_learning=args.use_context_learning,
820
+ context_learning_path=args.context_learning_path,
821
+ chroma_db_path=args.chroma_db_path,
822
+ manim_docs_path=args.manim_docs_path,
823
+ embedding_model=args.embedding_model,
824
+ use_visual_fix_code=args.use_visual_fix_code,
825
+ use_langfuse=args.use_langfuse,
826
+ max_scene_concurrency=args.max_scene_concurrency
827
+ )
828
+
829
+ all_statuses = [video_generator.check_theorem_status(theorem) for theorem in theorems]
830
+
831
+ # Print combined status table
832
+ print("\nTheorem Status:")
833
+ print("-" * 160)
834
+ print(f"{'Topic':<40} {'Outline':<8} {'Total':<8} {'Status (Plan/Code/Render)':<50} {'Combined':<10} {'Missing Components':<40}")
835
+ print("-" * 160)
836
+ for status in all_statuses:
837
+ # Create status string showing plan/code/render completion for each scene
838
+ scene_status_str = ""
839
+ for scene in status['scene_status']:
840
+ scene_str = (
841
+ ("P" if scene['has_plan'] else "-") +
842
+ ("C" if scene['has_code'] else "-") +
843
+ ("R" if scene['has_render'] else "-") + " "
844
+ )
845
+ scene_status_str += scene_str
846
+
847
+ # Collect missing components
848
+ missing_plans = []
849
+ missing_code = []
850
+ missing_renders = []
851
+ for scene in status['scene_status']:
852
+ if not scene['has_plan']:
853
+ missing_plans.append(str(scene['scene_number']))
854
+ if not scene['has_code']:
855
+ missing_code.append(str(scene['scene_number']))
856
+ if not scene['has_render']:
857
+ missing_renders.append(str(scene['scene_number']))
858
+
859
+ # Format missing components string
860
+ missing_str = []
861
+ if missing_plans:
862
+ missing_str.append(f"P:{','.join(missing_plans)}")
863
+ if missing_code:
864
+ missing_str.append(f"C:{','.join(missing_code)}")
865
+ if missing_renders:
866
+ missing_str.append(f"R:{','.join(missing_renders)}")
867
+ missing_str = ' '.join(missing_str)
868
+
869
+ print(f"{status['topic'][:37]+'...' if len(status['topic'])>37 else status['topic']:<40} "
870
+ f"{'✓' if status['has_scene_outline'] else '✗':<8} "
871
+ f"{status['total_scenes']:<8} "
872
+ f"{scene_status_str[:47]+'...' if len(scene_status_str)>47 else scene_status_str:<50} "
873
+ f"{'✓' if status['has_combined_video'] else '✗':<10} "
874
+ f"{missing_str[:37]+'...' if len(missing_str)>37 else missing_str:<40}")
875
+
876
+ # Print summary
877
+ print("\nSummary:")
878
+ print(f"Total theorems: {len(theorems)}")
879
+ print(f"Total scenes: {sum(status['total_scenes'] for status in all_statuses)}")
880
+ print(f"Scene completion status:")
881
+ print(f" Plans: {sum(status['implementation_plans'] for status in all_statuses)} scenes")
882
+ print(f" Code: {sum(status['code_files'] for status in all_statuses)} scenes")
883
+ print(f" Renders: {sum(status['rendered_scenes'] for status in all_statuses)} scenes")
884
+ print(f"Combined videos: {sum(1 for status in all_statuses if status['has_combined_video'])}/{len(theorems)}")
885
+ exit()
886
+
887
+ else:
888
+ # Generate video pipeline from scratch
889
+ print("Generating video pipeline from scratch...")
890
+
891
+ async def process_theorem(theorem, topic_semaphore):
892
+ async with topic_semaphore:
893
+ topic = theorem['theorem']
894
+ description = theorem['description']
895
+ print(f"Processing topic: {topic}")
896
+ if args.only_combine:
897
+ video_generator.combine_videos(topic)
898
+ else:
899
+ await video_generator.generate_video_pipeline(
900
+ topic,
901
+ description,
902
+ max_retries=args.max_retries,
903
+ only_plan=args.only_plan,
904
+ specific_scenes=args.scenes
905
+ )
906
+ if not args.only_plan and not args.only_render: # Add condition for only_render
907
+ video_generator.combine_videos(topic)
908
+
909
+ async def main():
910
+ # Use the command-line argument for topic concurrency
911
+ topic_semaphore = asyncio.Semaphore(args.max_topic_concurrency)
912
+ tasks = [process_theorem(theorem, topic_semaphore) for theorem in theorems]
913
+ await asyncio.gather(*tasks)
914
+
915
+ asyncio.run(main())
916
+
917
+ elif args.topic and args.context:
918
+ video_generator = VideoGenerator(
919
+ planner_model=planner_model,
920
+ scene_model=scene_model, # Pass scene_model
921
+ helper_model=helper_model, # Pass helper_model
922
+ output_dir=args.output_dir,
923
+ verbose=args.verbose,
924
+ use_rag=args.use_rag,
925
+ use_context_learning=args.use_context_learning,
926
+ context_learning_path=args.context_learning_path,
927
+ chroma_db_path=args.chroma_db_path,
928
+ manim_docs_path=args.manim_docs_path,
929
+ embedding_model=args.embedding_model,
930
+ use_visual_fix_code=args.use_visual_fix_code,
931
+ use_langfuse=args.use_langfuse,
932
+ max_scene_concurrency=args.max_scene_concurrency
933
+ )
934
+ # Process single topic with context
935
+ print(f"Processing topic: {args.topic}")
936
+
937
+ if args.only_gen_vid:
938
+ asyncio.run(video_generator.render_video_fix_code(args.topic, args.context, max_retries=args.max_retries))
939
+ exit()
940
+
941
+ if args.only_combine:
942
+ video_generator.combine_videos(args.topic)
943
+ else:
944
+ asyncio.run(video_generator.generate_video_pipeline(
945
+ args.topic,
946
+ args.context,
947
+ max_retries=args.max_retries,
948
+ only_plan=args.only_plan,
949
+ ))
950
+ if not args.only_plan and not args.only_render:
951
+ video_generator.combine_videos(args.topic)
952
+ else:
953
+ print("Please provide either (--theorems_path) or (--topic and --context)")
954
+ exit()
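For orientation, a few hedged example invocations of this entry point; the script name and the --model/--topic/--context flags are assumed from surrounding code rather than shown in this hunk, and all paths and model names are placeholders:

# Single topic end to end:
#   python generate_video.py --model "gemini/gemini-1.5-pro-002" \
#       --topic "Pythagorean theorem" --context "Visual proof of the theorem" --use_rag
# Batch mode over a theorems file, two topics at a time:
#   python generate_video.py --theorems_path data/thb_easy/math.json --sample_size 2 --max_topic_concurrency 2
# Check planning/code/render progress for a batch:
#   python generate_video.py --theorems_path data/thb_easy/math.json --check_status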
mllm_tools/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # Empty file to make this directory a Python package
mllm_tools/gemini.py ADDED
@@ -0,0 +1,176 @@
1
+ from typing import List, Dict, Any, Union, Optional
2
+ import io
3
+ import os
4
+ import base64
5
+ from PIL import Image
6
+ import mimetypes
7
+ import google.generativeai as genai
8
+ import tempfile
9
+ import time
10
+ from urllib.parse import urlparse
11
+ import requests
12
+ from io import BytesIO
13
+
14
+ class GeminiWrapper:
15
+ """Wrapper for Gemini to support multiple models and logging"""
16
+
17
+ def __init__(
18
+ self,
19
+ model_name: str = "gemini-1.5-pro-002",
20
+ temperature: float = 0.7,
21
+ print_cost: bool = False,
22
+ verbose: bool = False,
23
+ use_langfuse: bool = False
24
+ ):
25
+ """
26
+ Initialize the Gemini wrapper
27
+
28
+ Args:
29
+ model_name: Name of the model to use
30
+ temperature: Temperature for completion
31
+ print_cost: Whether to print the cost of the completion
32
+ verbose: Whether to print verbose output
33
+ use_langfuse: Whether to enable Langfuse logging
34
+ """
35
+ self.model_name = model_name.split('/')[-1] if '/' in model_name else model_name
36
+ self.temperature = temperature
37
+ self.print_cost = print_cost
38
+ self.verbose = verbose
39
+ self.accumulated_cost = 0
40
+
41
+ api_key = os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")
42
+ if not api_key:
43
+ raise ValueError("No API_KEY found. Please set the `GEMINI_API_KEY` or `GOOGLE_API_KEY` environment variable.")
44
+ genai.configure(api_key=api_key)
45
+
46
+ generation_config = {
47
+ "temperature": self.temperature,
48
+ "top_p": 0.95,
49
+ "response_mime_type": "text/plain",
50
+ }
51
+ safety_settings = [
52
+ {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
53
+ {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
54
+ {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
55
+ {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
56
+ ]
57
+ self.model = genai.GenerativeModel(
58
+ model_name=self.model_name,
59
+ safety_settings=safety_settings,
60
+ generation_config=generation_config,
61
+ )
62
+
63
+ def _get_mime_type(self, file_path: str) -> str:
64
+ """
65
+ Get the MIME type of a file based on its extension
66
+
67
+ Args:
68
+ file_path: Path to the file
69
+
70
+ Returns:
71
+ MIME type as a string (e.g., "image/jpeg", "audio/mp3")
72
+ """
73
+ mime_type, _ = mimetypes.guess_type(file_path)
74
+ if mime_type is None:
75
+ raise ValueError(f"Unsupported file type: {file_path}")
76
+ return mime_type
77
+
78
+ def _download_file(self, url: str) -> str:
79
+ """
80
+ Download a file from a URL and save it as a temporary file
81
+
82
+ Args:
83
+ url: URL of the file to download
84
+
85
+ Returns:
86
+ Path to the temporary file
87
+ """
88
+ response = requests.get(url)
89
+ if response.status_code == 200:
90
+ temp_file = tempfile.NamedTemporaryFile(delete=False)
91
+ temp_file.write(response.content)
92
+ temp_file.close()
93
+ return temp_file.name
94
+ else:
95
+ raise ValueError(f"Failed to download file from URL: {url}")
96
+
97
+ def _save_image_to_temp(self, image: Image.Image) -> str:
98
+ """
99
+ Save a PIL Image to a temporary file
100
+
101
+ Args:
102
+ image: PIL Image object
103
+
104
+ Returns:
105
+ Path to the temporary file
106
+ """
107
+ temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
108
+ image.save(temp_file, format="PNG")
109
+ temp_file.close()
110
+ return temp_file.name
111
+
112
+ def _upload_to_gemini(self, file_path: str, mime_type: Optional[str] = None):
113
+ """
114
+ Uploads the given file to Gemini.
115
+
116
+ Args:
117
+ file_path: Path to the file
118
+ mime_type: MIME type of the file
119
+
120
+ Returns:
121
+ Uploaded file object
122
+ """
123
+ return genai.upload_file(file_path, mime_type=mime_type)
124
+
125
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
126
+ """
127
+ Process messages and return completion
128
+
129
+ Args:
130
+ messages: List of message dictionaries with 'type' and 'content' keys
131
+ metadata: Optional metadata to pass to Gemini completion
132
+
133
+ Returns:
134
+ Generated text response
135
+ """
136
+ contents = []
137
+ for msg in messages:
138
+ if msg["type"] == "text":
139
+ contents.append(msg["content"])
140
+ elif msg["type"] in ["image", "audio", "video"]:
141
+ if isinstance(msg["content"], Image.Image):
142
+ file_path = self._save_image_to_temp(msg["content"])
143
+ mime_type = "image/png"
144
+ elif isinstance(msg["content"], str):
145
+ if msg["content"].startswith("http"):
146
+ file_path = self._download_file(msg["content"])
147
+ mime_type = self._get_mime_type(msg["content"])
148
+ else:
149
+ file_path = msg["content"]
150
+ mime_type = self._get_mime_type(file_path)
151
+ else:
152
+ raise ValueError("Unsupported content type")
153
+
154
+ uploaded_file = self._upload_to_gemini(file_path, mime_type)
155
+
156
+ while uploaded_file.state.name == "PROCESSING":
157
+ print('.', end='')
158
+ time.sleep(3)
159
+ uploaded_file = genai.get_file(uploaded_file.name)
160
+ if uploaded_file.state.name == "FAILED":
161
+ raise ValueError(uploaded_file.state.name)
162
+ print("Upload successfully")
163
+ contents.append(uploaded_file)
164
+ else:
165
+ raise ValueError("Unsupported message type")
166
+
167
+ response = self.model.generate_content(contents, request_options={"timeout": 600})
168
+ try:
169
+ return response.text
170
+ except Exception as e:
171
+ print(e)
172
+ print(response.prompt_feedback)
173
+ return str(response.prompt_feedback)
174
+
175
+ if __name__ == "__main__":
176
+ pass
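A minimal usage sketch for the wrapper above (hedged: it assumes GEMINI_API_KEY is set and that the referenced image file exists; the message dicts follow the 'type'/'content' convention handled in __call__):

wrapper = GeminiWrapper(model_name="gemini-1.5-pro-002", temperature=0.7)
reply = wrapper([
    {"type": "text", "content": "Describe this image in one sentence."},
    {"type": "image", "content": "snapshot.png"},  # hypothetical local file
])
print(reply)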
mllm_tools/litellm.py ADDED
@@ -0,0 +1,193 @@
1
+ import json
2
+ import re
3
+ from typing import List, Dict, Any, Union, Optional
4
+ import io
5
+ import os
6
+ import base64
7
+ from PIL import Image
8
+ import mimetypes
9
+ import litellm
10
+ from litellm import completion, completion_cost
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv()
14
+
15
+ class LiteLLMWrapper:
16
+ """Wrapper for LiteLLM to support multiple models and logging"""
17
+
18
+ def __init__(
19
+ self,
20
+ model_name: str = "gpt-4-vision-preview",
21
+ temperature: float = 0.7,
22
+ print_cost: bool = False,
23
+ verbose: bool = False,
24
+ use_langfuse: bool = True,
25
+ ):
26
+ """
27
+ Initialize the LiteLLM wrapper
28
+
29
+ Args:
30
+ model_name: Name of the model to use (e.g. "azure/gpt-4", "vertex_ai/gemini-pro")
31
+ temperature: Temperature for completion
32
+ print_cost: Whether to print the cost of the completion
33
+ verbose: Whether to print verbose output
34
+ use_langfuse: Whether to enable Langfuse logging
35
+ """
36
+ self.model_name = model_name
37
+ self.temperature = temperature
38
+ self.print_cost = print_cost
39
+ self.verbose = verbose
40
+ self.accumulated_cost = 0
41
+
42
+ if self.verbose:
43
+ os.environ['LITELLM_LOG'] = 'DEBUG'
44
+
45
+ # Set langfuse callback only if enabled
46
+ if use_langfuse:
47
+ litellm.success_callback = ["langfuse"]
48
+ litellm.failure_callback = ["langfuse"]
49
+
50
+ def _encode_file(self, file_path: Union[str, Image.Image]) -> str:
51
+ """
52
+ Encode local file or PIL Image to base64 string
53
+
54
+ Args:
55
+ file_path: Path to local file or PIL Image object
56
+
57
+ Returns:
58
+ Base64 encoded file string
59
+ """
60
+ if isinstance(file_path, Image.Image):
61
+ buffered = io.BytesIO()
62
+ file_path.save(buffered, format="PNG")
63
+ return base64.b64encode(buffered.getvalue()).decode("utf-8")
64
+ else:
65
+ with open(file_path, "rb") as file:
66
+ return base64.b64encode(file.read()).decode("utf-8")
67
+
68
+ def _get_mime_type(self, file_path: str) -> str:
69
+ """
70
+ Get the MIME type of a file based on its extension
71
+
72
+ Args:
73
+ file_path: Path to the file
74
+
75
+ Returns:
76
+ MIME type as a string (e.g., "image/jpeg", "audio/mp3")
77
+ """
78
+ mime_type, _ = mimetypes.guess_type(file_path)
79
+ if mime_type is None:
80
+ raise ValueError(f"Unsupported file type: {file_path}")
81
+ return mime_type
82
+
83
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
84
+ """
85
+ Process messages and return completion
86
+
87
+ Args:
88
+ messages: List of message dictionaries with 'type' and 'content' keys
89
+ metadata: Optional metadata to pass to litellm completion, e.g. for Langfuse tracking
90
+
91
+ Returns:
92
+ Generated text response
93
+ """
94
+ if metadata is None:
95
+ print("No metadata provided, using empty metadata")
96
+ metadata = {}
97
+ metadata["trace_name"] = f"litellm-completion-{self.model_name}"
98
+ # Convert messages to LiteLLM format
99
+ formatted_messages = []
100
+ for msg in messages:
101
+ if msg["type"] == "text":
102
+ formatted_messages.append({
103
+ "role": "user",
104
+ "content": [{"type": "text", "text": msg["content"]}]
105
+ })
106
+ elif msg["type"] in ["image", "audio", "video"]:
107
+ # Check if content is a local file path or PIL Image
108
+ if isinstance(msg["content"], Image.Image) or os.path.isfile(msg["content"]):
109
+ try:
110
+ if isinstance(msg["content"], Image.Image):
111
+ mime_type = "image/png"
112
+ else:
113
+ mime_type = self._get_mime_type(msg["content"])
114
+ base64_data = self._encode_file(msg["content"])
115
+ data_url = f"data:{mime_type};base64,{base64_data}"
116
+ except ValueError as e:
117
+ print(f"Error processing file {msg['content']}: {e}")
118
+ continue
119
+ else:
120
+ data_url = msg["content"]
121
+
122
+ # Append the formatted message based on the model
123
+ if "gemini" in self.model_name:
124
+ formatted_messages.append({
125
+ "role": "user",
126
+ "content": [
127
+ {
128
+ "type": "image_url",
129
+ "image_url": data_url
130
+ }
131
+ ]
132
+ })
133
+ elif "gpt" in self.model_name:
134
+ # GPT and other models expect a different format
135
+ if msg["type"] == "image":
136
+ # Default format for images and videos in GPT
137
+ formatted_messages.append({
138
+ "role": "user",
139
+ "content": [
140
+ {
141
+ "type": f"image_url",
142
+ f"{msg['type']}_url": {
143
+ "url": data_url,
144
+ "detail": "high"
145
+ }
146
+ }
147
+ ]
148
+ })
149
+ else:
150
+ raise ValueError("For GPT, only text and image inferencing are supported")
151
+ else:
152
+ raise ValueError("Only support Gemini and Gpt for Multimodal capability now")
153
+
154
+ try:
155
+ # if it's openai o series model, set temperature to None and reasoning_effort to "medium"
156
+ if (re.match(r"^o\d+.*$", self.model_name) or re.match(r"^openai/o.*$", self.model_name)):
157
+ self.temperature = None
158
+ self.reasoning_effort = "medium"
159
+ response = completion(
160
+ model=self.model_name,
161
+ messages=formatted_messages,
162
+ temperature=self.temperature,
163
+ reasoning_effort=self.reasoning_effort,
164
+ metadata=metadata,
165
+ max_retries=99
166
+ )
167
+ else:
168
+ response = completion(
169
+ model=self.model_name,
170
+ messages=formatted_messages,
171
+ temperature=self.temperature,
172
+ metadata=metadata,
173
+ max_retries=99
174
+ )
175
+ if self.print_cost:
176
+ # pass your response from completion to completion_cost
177
+ cost = completion_cost(completion_response=response)
178
+ formatted_string = f"Cost: ${float(cost):.10f}"
179
+ # print(formatted_string)
180
+ self.accumulated_cost += cost
181
+ print(f"Accumulated Cost: ${self.accumulated_cost:.10f}")
182
+
183
+ content = response.choices[0].message.content
184
+ if content is None:
185
+ print(f"Got null response from model. Full response: {response}")
186
+ return content
187
+
188
+ except Exception as e:
189
+ print(f"Error in model completion: {e}")
190
+ return str(e)
191
+
192
+ if __name__ == "__main__":
193
+ pass
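A minimal usage sketch (hedged: it assumes the relevant provider key, e.g. OPENAI_API_KEY, is present in the environment; the message format mirrors the 'type'/'content' dicts parsed in __call__, and the image path is hypothetical):

wrapper = LiteLLMWrapper(model_name="gpt-4o", print_cost=True, use_langfuse=False)
answer = wrapper([
    {"type": "text", "content": "What does this plot show?"},
    {"type": "image", "content": "plot.png"},  # hypothetical local file
])
print(answer)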
mllm_tools/utils.py ADDED
@@ -0,0 +1,174 @@
1
+ from typing import Union, List, Dict, Any, Optional
2
+ from PIL import Image
3
+ import google.generativeai as genai
4
+ import tempfile
5
+ import os
6
+ from .gemini import GeminiWrapper
7
+ from .vertex_ai import VertexAIWrapper
8
+
9
+
10
+ def _prepare_text_inputs(texts: List[str]) -> List[Dict[str, str]]:
11
+ """
12
+ Converts a list of text strings into the input format for the Agent model.
13
+
14
+ Args:
15
+ texts (List[str]): The list of text strings to be processed.
16
+
17
+ Returns:
18
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
19
+ """
20
+ inputs = []
21
+ # Add each text string to the inputs
22
+ if isinstance(texts, str):
23
+ texts = [texts]
24
+ for text in texts:
25
+ inputs.append({
26
+ "type": "text",
27
+ "content": text
28
+ })
29
+ return inputs
30
+
31
+ def _prepare_text_image_inputs(texts: Union[str, List[str]], images: Union[str, Image.Image, List[Union[str, Image.Image]]]) -> List[Dict[str, str]]:
32
+ """
33
+ Converts text strings and images into the input format for the Agent model.
34
+
35
+ Args:
36
+ texts (Union[str, List[str]]): Text string(s) to be processed.
37
+ images (Union[str, Image.Image, List[Union[str, Image.Image]]]): Image file path(s) or PIL Image object(s).
38
+ Returns:
39
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
40
+ """
41
+ inputs = []
42
+ # Add each text string to the inputs
43
+ if isinstance(texts, str):
44
+ texts = [texts]
45
+ for text in texts:
46
+ inputs.append({
47
+ "type": "text",
48
+ "content": text
49
+ })
50
+ if isinstance(images, (str, Image.Image)):
51
+ images = [images]
52
+ for image in images:
53
+ inputs.append({
54
+ "type": "image",
55
+ "content": image
56
+ })
57
+ return inputs
58
+
59
+ def _prepare_text_video_inputs(texts: Union[str, List[str]], videos: Union[str, List[str]]) -> List[Dict[str, str]]:
60
+ """
61
+ Converts text strings and video file paths into the input format for the Agent model.
62
+
63
+ Args:
64
+ texts (Union[str, List[str]]): Text string(s) to be processed.
65
+ videos (Union[str, List[str]]): Video file path(s).
66
+ Returns:
67
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
68
+ """
69
+ inputs = []
70
+ # Add each text string to the inputs
71
+ if isinstance(texts, str):
72
+ texts = [texts]
73
+ for text in texts:
74
+ inputs.append({
75
+ "type": "text",
76
+ "content": text
77
+ })
78
+ # Add each video file path to the inputs
79
+ if isinstance(videos, str):
80
+ videos = [videos]
81
+ for video in videos:
82
+ inputs.append({
83
+ "type": "video",
84
+ "content": video
85
+ })
86
+ return inputs
87
+
88
+ def _prepare_text_audio_inputs(texts: Union[str, List[str]], audios: Union[str, List[str]]) -> List[Dict[str, str]]:
89
+ """
90
+ Converts text strings and audio file paths into the input format for the Agent model.
91
+
92
+ Args:
93
+ texts (Union[str, List[str]]): Text string(s) to be processed.
94
+ audios (Union[str, List[str]]): Audio file path(s).
95
+ Returns:
96
+ List[Dict[str, str]]: A list of dictionaries formatted for the Agent model.
97
+ """
98
+ inputs = []
99
+ # Add each text string to the inputs
100
+ if isinstance(texts, str):
101
+ texts = [texts]
102
+ for text in texts:
103
+ inputs.append({
104
+ "type": "text",
105
+ "content": text
106
+ })
107
+ # Add each audio file path to the inputs
108
+ if isinstance(audios, str):
109
+ audios = [audios]
110
+ for audio in audios:
111
+ inputs.append({
112
+ "type": "audio",
113
+ "content": audio
114
+ })
115
+ return inputs
116
+
117
+ def _extract_code(text: str) -> str:
118
+ """Helper to extract code block from model response, support Gemini style and OpenAI style"""
119
+ try:
120
+ # Find code between ```python and ``` tags
121
+ start = text.split("```python\n")[-1]
122
+ end = start.split("```")[0]
123
+ return end.strip()
124
+ except IndexError:
125
+ return text
126
+
127
+ def _upload_to_gemini(input, mime_type=None):
128
+ """Uploads the given file or PIL image to Gemini.
129
+
130
+ See https://ai.google.dev/gemini-api/docs/prompting_with_media
131
+ """
132
+ if isinstance(input, str):
133
+ # Input is a file path
134
+ file = genai.upload_file(input, mime_type=mime_type)
135
+ elif isinstance(input, Image.Image):
136
+ # Input is a PIL image
137
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
138
+ input.save(tmp_file, format="JPEG")
139
+ tmp_file_path = tmp_file.name
140
+ file = genai.upload_file(tmp_file_path, mime_type=mime_type or "image/jpeg")
141
+ os.remove(tmp_file_path)
142
+ else:
143
+ raise ValueError("Unsupported input type. Must be a file path or PIL Image.")
144
+
145
+ #print(f"Uploaded file '{file.display_name}' as: {file.uri}")
146
+ return file
147
+
148
+ def get_media_wrapper(model_name: str) -> Optional[Union[GeminiWrapper, VertexAIWrapper]]:
149
+ """Get appropriate wrapper for media handling based on model name"""
150
+ if model_name.startswith('gemini/'):
151
+ return GeminiWrapper(model_name=model_name.split('/')[-1])
152
+ elif model_name.startswith('vertex_ai/'):
153
+ return VertexAIWrapper(model_name=model_name.split('/')[-1])
154
+ return None
155
+
156
+ def prepare_media_messages(prompt: str, media_path: Union[str, Image.Image], model_name: str) -> List[Dict[str, Any]]:
157
+ """Prepare messages for media input based on model type"""
158
+ is_video = isinstance(media_path, str) and media_path.endswith('.mp4')
159
+
160
+ if is_video and (model_name.startswith('gemini/') or model_name.startswith('vertex_ai/')):
161
+ return [
162
+ {"type": "text", "content": prompt},
163
+ {"type": "video", "content": media_path}
164
+ ]
165
+ else:
166
+ # For images or non-Gemini/Vertex models
167
+ if isinstance(media_path, str):
168
+ media = Image.open(media_path)
169
+ else:
170
+ media = media_path
171
+ return [
172
+ {"type": "text", "content": prompt},
173
+ {"type": "image", "content": media}
174
+ ]
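A short sketch of how these helpers compose (hedged; the file path is hypothetical):

msgs = _prepare_text_image_inputs("Summarize the key idea shown here.", "scene_snapshot.png")
# msgs == [{"type": "text", "content": "..."}, {"type": "image", "content": "scene_snapshot.png"}]
media_wrapper = get_media_wrapper("gemini/gemini-1.5-pro-002")  # GeminiWrapper instance, or None for other providers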
mllm_tools/vertex_ai.py ADDED
@@ -0,0 +1,86 @@
1
+ import os
2
+ from typing import List, Dict, Any, Optional
3
+ import vertexai
4
+ from vertexai.generative_models import GenerativeModel, Part
5
+ from google.auth import default
6
+ from google.auth.transport import requests
7
+
8
+
9
+ # TODO: check if this is the correct way to use Vertex AI
10
+ # TODO: add langfuse support
11
+ class VertexAIWrapper:
12
+ """Wrapper for Vertex AI to support Gemini models."""
13
+
14
+ def __init__(
15
+ self,
16
+ model_name: str = "gemini-1.5-pro",
17
+ temperature: float = 0.7,
18
+ print_cost: bool = False,
19
+ verbose: bool = False,
20
+ use_langfuse: bool = False
21
+ ):
22
+ """Initialize the Vertex AI wrapper.
23
+
24
+ Args:
25
+ model_name: Name of the model to use (e.g. "gemini-1.5-pro")
26
+ temperature: Temperature for generation between 0 and 1
27
+ print_cost: Whether to print the cost of the completion
28
+ verbose: Whether to print verbose output
29
+ use_langfuse: Whether to enable Langfuse logging
30
+ """
31
+ self.model_name = model_name
32
+ self.temperature = temperature
33
+ self.print_cost = print_cost
34
+ self.verbose = verbose
35
+
36
+ # Initialize Vertex AI
37
+ project_id = os.getenv("GOOGLE_CLOUD_PROJECT")
38
+ location = os.getenv("GOOGLE_CLOUD_LOCATION", "us-central1")
39
+ if not project_id:
40
+ raise ValueError("No GOOGLE_CLOUD_PROJECT found in environment variables")
41
+
42
+ vertexai.init(project=project_id, location=location)
43
+ self.model = GenerativeModel(model_name)
44
+
45
+ def __call__(self, messages: List[Dict[str, Any]], metadata: Optional[Dict[str, Any]] = None) -> str:
46
+ """Process messages and return completion.
47
+
48
+ Args:
49
+ messages: List of message dictionaries containing type and content
50
+ metadata: Optional metadata dictionary to pass to the model
51
+
52
+ Returns:
53
+ Generated text response from the model
54
+
55
+ Raises:
56
+ ValueError: If message type is not supported
57
+ """
58
+ parts = []
59
+
60
+ for msg in messages:
61
+ if msg["type"] == "text":
62
+ parts.append(Part.from_text(msg["content"]))
63
+ elif msg["type"] in ["image", "video"]:
64
+ mime_type = "video/mp4" if msg["type"] == "video" else "image/jpeg"
65
+ if isinstance(msg["content"], str):
66
+ # Handle GCS URI
67
+ parts.append(Part.from_uri(
68
+ msg["content"],
69
+ mime_type=mime_type
70
+ ))
71
+ else:
72
+ # Handle raw bytes (Part.from_data expects binary content)
73
+ parts.append(Part.from_data(
74
+ msg["content"],
75
+ mime_type=mime_type
76
+ ))
77
+
78
+ response = self.model.generate_content(
79
+ parts,
80
+ generation_config={
81
+ "temperature": self.temperature,
82
+ "top_p": 0.95,
83
+ }
84
+ )
85
+
86
+ return response.text
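A minimal usage sketch (hedged: it assumes GOOGLE_CLOUD_PROJECT is set and uses a placeholder GCS URI):

wrapper = VertexAIWrapper(model_name="gemini-1.5-pro", temperature=0.7)
text = wrapper([
    {"type": "text", "content": "Describe what happens in this clip."},
    {"type": "video", "content": "gs://my-bucket/scene1.mp4"},  # hypothetical GCS URI
])
print(text)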
requirements.txt ADDED
@@ -0,0 +1,101 @@
1
+ annotated-types~=0.7.0
2
+ azure-cognitiveservices-speech~=1.41.1
3
+ cachetools~=5.5.0
4
+ certifi~=2024.8.30
5
+ charset-normalizer~=3.4.0
6
+ click~=8.1.7
7
+ cloup~=3.0.5
8
+ Cython~=3.0.11
9
+ decorator~=5.1.1
10
+ glcontext~=3.0.0
11
+ google-ai-generativelanguage~=0.6.10
12
+ google-api-core~=2.22.0
13
+ google-api-python-client~=2.151.0
14
+ google-auth~=2.35.0
15
+ google-auth-httplib2~=0.2.0
16
+ google-generativeai~=0.8.3
17
+ googleapis-common-protos~=1.65.0
18
+ grpcio~=1.67.1
19
+ grpcio-status~=1.67.1
20
+ gTTS~=2.5.3
21
+ httplib2~=0.22.0
22
+ idna~=3.10
23
+ isosurfaces~=0.1.2
24
+ manim~=0.18.1
25
+ manim-voiceover~=0.3.7
26
+ ManimPango~=0.6.0 # run 'sudo apt-get install libsdl-pango-dev' if you don't have pangocairo
27
+ mapbox_earcut~=1.0.2
28
+ markdown-it-py~=3.0.0
29
+ mdurl~=0.1.2
30
+ moderngl~=5.12.0
31
+ multipledispatch~=1.0.0
32
+ mutagen~=1.47.0
33
+ networkx~=3.4.2
34
+ numpy~=2.2.2
35
+ pillow
36
+ proto-plus~=1.25.0
37
+ protobuf~=5.28.3
38
+ pyasn1~=0.6.1
39
+ pyasn1_modules~=0.4.1
40
+ PyAudio~=0.2.14 # requires 'brew install portaudio' on macOS
41
+ pycairo~=1.27.0
42
+ pydantic~=2.9.2
43
+ pydantic_core~=2.23.4
44
+ pydub~=0.25.1
45
+ pyglet~=2.0.18
46
+ Pygments~=2.18.0
47
+ #pyobjc-core~=10.3.1 # only for mac
48
+ #pyobjc-framework-Cocoa~=10.3.1 # only for mac
49
+ pyparsing~=3.2.0
50
+ pyrr~=0.10.3
51
+ python-dotenv~=0.21.1
52
+ python-slugify~=8.0.4
53
+ requests~=2.32.3
54
+ rich~=13.9.3
55
+ rsa~=4.9
56
+ scipy~=1.14.1
57
+ screeninfo~=0.8.1
58
+ skia-pathops~=0.8.0.post2
59
+ sox~=1.5.0
60
+ srt~=3.5.3
61
+ svgelements~=1.9.6
62
+ text-unidecode~=1.3
63
+ tqdm~=4.66.5
64
+ typing_extensions~=4.12.2
65
+ uritemplate~=4.1.1
66
+ urllib3~=2.2.3
67
+ watchdog~=5.0.3
68
+ inquirer
69
+ openai~=1.61.0
70
+ tiktoken~=0.8.0
71
+ timm
72
+ sentencepiece
73
+ transformers
74
+ litellm~=1.60.5
75
+ pysrt
76
+ moviepy~=2.1.2
77
+ yt-dlp
78
+ imageio_ffmpeg~=0.5.1
79
+ langchain~=0.3.14
80
+ langchain_community~=0.3.14
81
+ SpeechRecognition~=3.14.1
82
+ boto3~=1.36.9
83
+ manim-physics~=0.4.0
84
+ manim-ml~=0.0.24
85
+ manim-chemistry~=0.4.4
86
+ manim-dsa~=0.2.0
87
+ manim-circuit~=0.0.3
88
+ langfuse~=2.58.1
89
+ chromadb~=0.6.3
90
+ google-cloud-aiplatform~=1.79.0
91
+ cairosvg
92
+ pylatexenc~=2.10
93
+ ffmpeg-python~=0.2.0
94
+ kokoro-onnx[gpu] # if you have a GPU; otherwise use kokoro-onnx
95
+ soundfile~=0.13.1
96
+ krippendorff~=0.8.1
97
+ statsmodels~=0.14.4
98
+ opencv-python~=4.11.0
99
+ fastapi
100
+ uvicorn
101
+ gradio
src/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # This is essential for the release to work
src/config/__init__.py ADDED
File without changes
src/config/config.py ADDED
@@ -0,0 +1,20 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ # Load environment variables from .env file
5
+ load_dotenv()
6
+
7
+ class Config:
8
+ OUTPUT_DIR = "output"
9
+ THEOREMS_PATH = os.path.join("data", "easy_20.json")
10
+ CONTEXT_LEARNING_PATH = "data/context_learning"
11
+ CHROMA_DB_PATH = "data/rag/chroma_db"
12
+ MANIM_DOCS_PATH = "data/rag/manim_docs"
13
+ EMBEDDING_MODEL = "azure/text-embedding-3-large"
14
+
15
+ # Kokoro TTS configurations
16
+ KOKORO_MODEL_PATH = os.getenv('KOKORO_MODEL_PATH')
17
+ KOKORO_VOICES_PATH = os.getenv('KOKORO_VOICES_PATH')
18
+ KOKORO_DEFAULT_VOICE = os.getenv('KOKORO_DEFAULT_VOICE')
19
+ KOKORO_DEFAULT_SPEED = float(os.getenv('KOKORO_DEFAULT_SPEED', '1.0'))
20
+ KOKORO_DEFAULT_LANG = os.getenv('KOKORO_DEFAULT_LANG')
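A quick sketch of consuming these settings elsewhere in the codebase (hedged: it assumes the repository root is on PYTHONPATH and that a .env file provides the Kokoro paths):

from src.config.config import Config

print(Config.OUTPUT_DIR)            # "output"
print(Config.EMBEDDING_MODEL)       # "azure/text-embedding-3-large"
print(Config.KOKORO_DEFAULT_SPEED)  # falls back to 1.0 when the env var is unset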
src/core/__init__.py ADDED
File without changes
src/core/code_generator.py ADDED
@@ -0,0 +1,454 @@
1
+ import os
2
+ import re
3
+ import json
4
+ from typing import Union, List, Dict, Tuple
5
+ from PIL import Image
6
+ import glob
7
+
8
+ from src.utils.utils import extract_json
9
+ from mllm_tools.utils import _prepare_text_inputs, _extract_code, _prepare_text_image_inputs
10
+ from mllm_tools.gemini import GeminiWrapper
11
+ from mllm_tools.vertex_ai import VertexAIWrapper
12
+ from task_generator import (
13
+ get_prompt_code_generation,
14
+ get_prompt_fix_error,
15
+ get_prompt_visual_fix_error,
16
+ get_banned_reasonings,
17
+ get_prompt_rag_query_generation_fix_error,
18
+ get_prompt_context_learning_code,
19
+ get_prompt_rag_query_generation_code
20
+ )
21
+ from task_generator.prompts_raw import (
22
+ _code_font_size,
23
+ _code_disable,
24
+ _code_limit,
25
+ _prompt_manim_cheatsheet
26
+ )
27
+ from src.rag.vector_store import RAGVectorStore # Import RAGVectorStore
28
+
29
+ class CodeGenerator:
30
+ """A class for generating and managing Manim code."""
31
+
32
+ def __init__(self, scene_model, helper_model, output_dir="output", print_response=False, use_rag=False, use_context_learning=False, context_learning_path="data/context_learning", chroma_db_path="rag/chroma_db", manim_docs_path="rag/manim_docs", embedding_model="azure/text-embedding-3-large", use_visual_fix_code=False, use_langfuse=True, session_id=None):
33
+ """Initialize the CodeGenerator.
34
+
35
+ Args:
36
+ scene_model: The model used for scene generation
37
+ helper_model: The model used for helper tasks
38
+ output_dir (str, optional): Directory for output files. Defaults to "output".
39
+ print_response (bool, optional): Whether to print model responses. Defaults to False.
40
+ use_rag (bool, optional): Whether to use RAG. Defaults to False.
41
+ use_context_learning (bool, optional): Whether to use context learning. Defaults to False.
42
+ context_learning_path (str, optional): Path to context learning examples. Defaults to "data/context_learning".
43
+ chroma_db_path (str, optional): Path to ChromaDB. Defaults to "rag/chroma_db".
44
+ manim_docs_path (str, optional): Path to Manim docs. Defaults to "rag/manim_docs".
45
+ embedding_model (str, optional): Name of embedding model. Defaults to "azure/text-embedding-3-large".
46
+ use_visual_fix_code (bool, optional): Whether to use visual code fixing. Defaults to False.
47
+ use_langfuse (bool, optional): Whether to use Langfuse logging. Defaults to True.
48
+ session_id (str, optional): Session identifier. Defaults to None.
49
+ """
50
+ self.scene_model = scene_model
51
+ self.helper_model = helper_model
52
+ self.output_dir = output_dir
53
+ self.print_response = print_response
54
+ self.use_rag = use_rag
55
+ self.use_context_learning = use_context_learning
56
+ self.context_learning_path = context_learning_path
57
+ self.context_examples = self._load_context_examples() if use_context_learning else None
58
+ self.manim_docs_path = manim_docs_path
59
+
60
+ self.use_visual_fix_code = use_visual_fix_code
61
+ self.banned_reasonings = get_banned_reasonings()
62
+ self.session_id = session_id # Use session_id passed from VideoGenerator
63
+
64
+ if use_rag:
65
+ self.vector_store = RAGVectorStore(
66
+ chroma_db_path=chroma_db_path,
67
+ manim_docs_path=manim_docs_path,
68
+ embedding_model=embedding_model,
69
+ session_id=self.session_id,
70
+ use_langfuse=use_langfuse
71
+ )
72
+ else:
73
+ self.vector_store = None
74
+
75
+ def _load_context_examples(self) -> str:
76
+ """Load all context learning examples from the specified directory.
77
+
78
+ Returns:
79
+ str: Formatted context learning examples, or None if no examples found.
80
+ """
81
+ examples = []
82
+ for example_file in glob.glob(f"{self.context_learning_path}/**/*.py", recursive=True):
83
+ with open(example_file, 'r') as f:
84
+ examples.append(f"# Example from {os.path.basename(example_file)}\n{f.read()}\n")
85
+
86
+ # Format examples using get_prompt_context_learning_code instead of _prompt_context_learning
87
+ if examples:
88
+ formatted_examples = get_prompt_context_learning_code(
89
+ examples="\n".join(examples)
90
+ )
91
+ return formatted_examples
92
+ return None
93
+
94
+ def _generate_rag_queries_code(self, implementation: str, scene_trace_id: str = None, topic: str = None, scene_number: int = None, session_id: str = None, relevant_plugins: List[str] = []) -> List[str]:
95
+ """Generate RAG queries from the implementation plan.
96
+
97
+ Args:
98
+ implementation (str): The implementation plan text
99
+ scene_trace_id (str, optional): Trace ID for the scene. Defaults to None.
100
+ topic (str, optional): Topic of the scene. Defaults to None.
101
+ scene_number (int, optional): Scene number. Defaults to None.
102
+ session_id (str, optional): Session identifier. Defaults to None.
103
+ relevant_plugins (List[str], optional): List of relevant plugins. Defaults to empty list.
104
+
105
+ Returns:
106
+ List[str]: List of generated RAG queries
107
+ """
108
+ # Create a cache key for this scene
109
+ cache_key = f"{topic}_scene{scene_number}"
110
+
111
+ # Check if we already have a cache file for this scene
112
+ cache_dir = os.path.join(self.output_dir, re.sub(r'[^a-z0-9_]+', '_', topic.lower()), f"scene{scene_number}", "rag_cache")
113
+ os.makedirs(cache_dir, exist_ok=True)
114
+ cache_file = os.path.join(cache_dir, "rag_queries_code.json")
115
+
116
+ # If cache file exists, load and return cached queries
117
+ if os.path.exists(cache_file):
118
+ with open(cache_file, 'r') as f:
119
+ cached_queries = json.load(f)
120
+ print(f"Using cached RAG queries for {cache_key}")
121
+ return cached_queries
122
+
123
+ # Generate new queries if not cached
124
+ if relevant_plugins:
125
+ prompt = get_prompt_rag_query_generation_code(implementation, ", ".join(relevant_plugins))
126
+ else:
127
+ prompt = get_prompt_rag_query_generation_code(implementation, "No plugins are relevant.")
128
+
129
+ queries = self.helper_model(
130
+ _prepare_text_inputs(prompt),
131
+ metadata={"generation_name": "rag_query_generation", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
132
+ )
133
+
134
+ print(f"RAG queries: {queries}")
135
+ # Retrieve the JSON block wrapped in triple backticks
136
+
137
+ try: # add try-except block to handle potential json decode errors
138
+ queries = re.search(r'```json(.*)```', queries, re.DOTALL).group(1)
139
+ queries = json.loads(queries)
140
+ except json.JSONDecodeError as e:
141
+ print(f"JSONDecodeError when parsing RAG queries for storyboard: {e}")
142
+ print(f"Response text was: {queries}")
143
+ return [] # Return empty list in case of parsing error
144
+
145
+ # Cache the queries
146
+ with open(cache_file, 'w') as f:
147
+ json.dump(queries, f)
148
+
149
+ return queries
150
+
151
+ def _generate_rag_queries_error_fix(self, error: str, code: str, scene_trace_id: str = None, topic: str = None, scene_number: int = None, session_id: str = None, relevant_plugins: List[str] = []) -> List[str]:
152
+ """Generate RAG queries for fixing code errors.
153
+
154
+ Args:
155
+ error (str): The error message to fix
156
+ code (str): The code containing the error
157
+ scene_trace_id (str, optional): Trace ID for the scene. Defaults to None.
158
+ topic (str, optional): Topic of the scene. Defaults to None.
159
+ scene_number (int, optional): Scene number. Defaults to None.
160
+ session_id (str, optional): Session identifier. Defaults to None.
161
+ relevant_plugins (List[str], optional): List of relevant plugins. Defaults to empty list.
162
+
163
+ Returns:
164
+ List[str]: List of generated RAG queries for error fixing
165
+ """
166
+ # Create a cache key for this scene and error
167
+ cache_key = f"{topic}_scene{scene_number}_error_fix"
168
+
169
+ # Check if we already have a cache file for error fix queries
170
+ cache_dir = os.path.join(self.output_dir, re.sub(r'[^a-z0-9_]+', '_', topic.lower()), f"scene{scene_number}", "rag_cache")
171
+ os.makedirs(cache_dir, exist_ok=True)
172
+ cache_file = os.path.join(cache_dir, "rag_queries_error_fix.json")
173
+
174
+ # If cache file exists, load and return cached queries
175
+ if os.path.exists(cache_file):
176
+ with open(cache_file, 'r') as f:
177
+ cached_queries = json.load(f)
178
+ print(f"Using cached RAG queries for error fix in {cache_key}")
179
+ return cached_queries
180
+
181
+ # Generate new queries for error fix if not cached
182
+ prompt = get_prompt_rag_query_generation_fix_error(
183
+ error=error,
184
+ code=code,
185
+ relevant_plugins=", ".join(relevant_plugins) if relevant_plugins else "No plugins are relevant."
186
+ )
187
+
188
+ queries = self.helper_model(
189
+ _prepare_text_inputs(prompt),
190
+ metadata={"generation_name": "rag-query-generation-fix-error", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
191
+ )
192
+
193
+ # remove json triple backticks
194
+ queries = queries.replace("```json", "").replace("```", "")
195
+ try: # add try-except block to handle potential json decode errors
196
+ queries = json.loads(queries)
197
+ except json.JSONDecodeError as e:
198
+ print(f"JSONDecodeError when parsing RAG queries for error fix: {e}")
199
+ print(f"Response text was: {queries}")
200
+ return [] # Return empty list in case of parsing error
201
+
202
+ # Cache the queries
203
+ with open(cache_file, 'w') as f:
204
+ json.dump(queries, f)
205
+
206
+ return queries
207
+
208
+ def _extract_code_with_retries(self, response_text: str, pattern: str, generation_name: str = None, trace_id: str = None, session_id: str = None, max_retries: int = 10) -> str:
209
+ """Extract code from response text with retry logic.
210
+
211
+ Args:
212
+ response_text (str): The text containing code to extract
213
+ pattern (str): Regex pattern for extracting code
214
+ generation_name (str, optional): Name of generation step. Defaults to None.
215
+ trace_id (str, optional): Trace identifier. Defaults to None.
216
+ session_id (str, optional): Session identifier. Defaults to None.
217
+ max_retries (int, optional): Maximum number of retries. Defaults to 10.
218
+
219
+ Returns:
220
+ str: The extracted code
221
+
222
+ Raises:
223
+ ValueError: If code extraction fails after max retries
224
+ """
225
+ retry_prompt = """
226
+ Please extract the Python code in the correct format using the pattern: {pattern}.
227
+ You MUST NOT include any other text or comments.
228
+ You MUST return the exact same code as in the previous response, NO CONTENT EDITING is allowed.
229
+ Previous response:
230
+ {response_text}
231
+ """
232
+
233
+ for attempt in range(max_retries):
234
+ code_match = re.search(pattern, response_text, re.DOTALL)
235
+ if code_match:
236
+ return code_match.group(1)
237
+
238
+ if attempt < max_retries - 1:
239
+ print(f"Attempt {attempt + 1}: Failed to extract code pattern. Retrying...")
240
+ # Regenerate response with a more explicit prompt
241
+ response_text = self.scene_model(
242
+ _prepare_text_inputs(retry_prompt.format(pattern=pattern, response_text=response_text)),
243
+ metadata={
244
+ "generation_name": f"{generation_name}_format_retry_{attempt + 1}",
245
+ "trace_id": trace_id,
246
+ "session_id": session_id
247
+ }
248
+ )
249
+
250
+ raise ValueError(f"Failed to extract code pattern after {max_retries} attempts. Pattern: {pattern}")
251
+
252
+ def generate_manim_code(self,
253
+ topic: str,
254
+ description: str,
255
+ scene_outline: str,
256
+ scene_implementation: str,
257
+ scene_number: int,
258
+ additional_context: Union[str, List[str]] = None,
259
+ scene_trace_id: str = None,
260
+ session_id: str = None,
261
+ rag_queries_cache: Dict = None) -> Tuple[str, str]:
262
+ """Generate Manim code from video plan.
263
+
264
+ Args:
265
+ topic (str): Topic of the scene
266
+ description (str): Description of the scene
267
+ scene_outline (str): Outline of the scene
268
+ scene_implementation (str): Implementation details
269
+ scene_number (int): Scene number
270
+ additional_context (Union[str, List[str]], optional): Additional context. Defaults to None.
271
+ scene_trace_id (str, optional): Trace identifier. Defaults to None.
272
+ session_id (str, optional): Session identifier. Defaults to None.
273
+ rag_queries_cache (Dict, optional): Cache for RAG queries. Defaults to None.
274
+
275
+ Returns:
276
+ Tuple[str, str]: Generated code and response text
277
+ """
278
+ if self.use_context_learning:
279
+ # Add context examples to additional_context
280
+ if additional_context is None:
281
+ additional_context = []
282
+ elif isinstance(additional_context, str):
283
+ additional_context = [additional_context]
284
+
285
+ # Now using the properly formatted code examples
286
+ if self.context_examples:
287
+ additional_context.append(self.context_examples)
288
+
289
+ if self.use_rag:
290
+ # Generate RAG queries (will use cache if available)
291
+ rag_queries = self._generate_rag_queries_code(
292
+ implementation=scene_implementation,
293
+ scene_trace_id=scene_trace_id,
294
+ topic=topic,
295
+ scene_number=scene_number,
296
+ session_id=session_id
297
+ )
298
+
299
+ retrieved_docs = self.vector_store.find_relevant_docs(
300
+ queries=rag_queries,
301
+ k=2, # number of documents to retrieve
302
+ trace_id=scene_trace_id,
303
+ topic=topic,
304
+ scene_number=scene_number
305
+ )
306
+ # Format the retrieved documents into a string
307
+ if additional_context is None:
308
+ additional_context = []
309
+ additional_context.append(retrieved_docs)
310
+
311
+ # Format code generation prompt with plan and retrieved context
312
+ prompt = get_prompt_code_generation(
313
+ scene_outline=scene_outline,
314
+ scene_implementation=scene_implementation,
315
+ topic=topic,
316
+ description=description,
317
+ scene_number=scene_number,
318
+ additional_context=additional_context
319
+ )
320
+
321
+ # Generate code using model
322
+ response_text = self.scene_model(
323
+ _prepare_text_inputs(prompt),
324
+ metadata={"generation_name": "code_generation", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
325
+ )
326
+
327
+ # Extract code with retries
328
+ code = self._extract_code_with_retries(
329
+ response_text,
330
+ r"```python(.*)```",
331
+ generation_name="code_generation",
332
+ trace_id=scene_trace_id,
333
+ session_id=session_id
334
+ )
335
+ return code, response_text
336
+
337
+ def fix_code_errors(self, implementation_plan: str, code: str, error: str, scene_trace_id: str, topic: str, scene_number: int, session_id: str, rag_queries_cache: Dict = None) -> Tuple[str, str]:
338
+ """Fix errors in generated Manim code.
339
+
340
+ Args:
341
+ implementation_plan (str): Original implementation plan
342
+ code (str): Code containing errors
343
+ error (str): Error message to fix
344
+ scene_trace_id (str): Trace identifier
345
+ topic (str): Topic of the scene
346
+ scene_number (int): Scene number
347
+ session_id (str): Session identifier
348
+ rag_queries_cache (Dict, optional): Cache for RAG queries. Defaults to None.
349
+
350
+ Returns:
351
+ Tuple[str, str]: Fixed code and response text
352
+ """
353
+ # Format error fix prompt
354
+ prompt = get_prompt_fix_error(implementation_plan=implementation_plan, manim_code=code, error=error)
355
+
356
+ if self.use_rag:
357
+ # Generate RAG queries for error fixing
358
+ rag_queries = self._generate_rag_queries_error_fix(
359
+ error=error,
360
+ code=code,
361
+ scene_trace_id=scene_trace_id,
362
+ topic=topic,
363
+ scene_number=scene_number,
364
+ session_id=session_id
365
+ )
366
+ retrieved_docs = self.vector_store.find_relevant_docs(
367
+ queries=rag_queries,
368
+ k=2, # number of documents to retrieve for error fixing
369
+ trace_id=scene_trace_id,
370
+ topic=topic,
371
+ scene_number=scene_number
372
+ )
373
+ # Format the retrieved documents into a string
374
+ prompt = get_prompt_fix_error(implementation_plan=implementation_plan, manim_code=code, error=error, additional_context=retrieved_docs)
375
+
376
+ # Get fixed code from model
377
+ response_text = self.scene_model(
378
+ _prepare_text_inputs(prompt),
379
+ metadata={"generation_name": "code_fix_error", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
380
+ )
381
+
382
+ # Extract fixed code with retries
383
+ fixed_code = self._extract_code_with_retries(
384
+ response_text,
385
+ r"```python(.*)```",
386
+ generation_name="code_fix_error",
387
+ trace_id=scene_trace_id,
388
+ session_id=session_id
389
+ )
390
+ return fixed_code, response_text
391
+
392
+ def visual_self_reflection(self, code: str, media_path: Union[str, Image.Image], scene_trace_id: str, topic: str, scene_number: int, session_id: str) -> Tuple[str, str]:
393
+ """Use snapshot image or mp4 video to fix code.
394
+
395
+ Args:
396
+ code (str): Code to fix
397
+ media_path (Union[str, Image.Image]): Path to media file or PIL Image
398
+ scene_trace_id (str): Trace identifier
399
+ topic (str): Topic of the scene
400
+ scene_number (int): Scene number
401
+ session_id (str): Session identifier
402
+
403
+ Returns:
404
+ Tuple[str, str]: Fixed code and response text
405
+ """
406
+
407
+ # Determine if we're dealing with video or image
408
+ is_video = isinstance(media_path, str) and media_path.endswith('.mp4')
409
+
410
+ # Load prompt template
411
+ with open('task_generator/prompts_raw/prompt_visual_self_reflection.txt', 'r') as f:
412
+ prompt_template = f.read()
413
+
414
+ # Format prompt
415
+ prompt = prompt_template.format(code=code)
416
+
417
+ # Prepare input based on media type
418
+ if is_video and isinstance(self.scene_model, (GeminiWrapper, VertexAIWrapper)):
419
+ # For video with Gemini models
420
+ messages = [
421
+ {"type": "text", "content": prompt},
422
+ {"type": "video", "content": media_path}
423
+ ]
424
+ else:
425
+ # For images or non-Gemini models
426
+ if isinstance(media_path, str):
427
+ media = Image.open(media_path)
428
+ else:
429
+ media = media_path
430
+ messages = [
431
+ {"type": "text", "content": prompt},
432
+ {"type": "image", "content": media}
433
+ ]
434
+
435
+ # Get model response
436
+ response_text = self.scene_model(
437
+ messages,
438
+ metadata={
439
+ "generation_name": "visual_self_reflection",
440
+ "trace_id": scene_trace_id,
441
+ "tags": [topic, f"scene{scene_number}"],
442
+ "session_id": session_id
443
+ }
444
+ )
445
+
446
+ # Extract code with retries
447
+ fixed_code = self._extract_code_with_retries(
448
+ response_text,
449
+ r"```python(.*)```",
450
+ generation_name="visual_self_reflection",
451
+ trace_id=scene_trace_id,
452
+ session_id=session_id
453
+ )
454
+ return fixed_code, response_text
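A hedged usage sketch of the class above (scene_model and helper_model stand for any wrapper instances with the __call__ interface shown earlier; the planning texts and error message are placeholders):

generator = CodeGenerator(scene_model, helper_model, output_dir="output", use_rag=False)
code, raw_response = generator.generate_manim_code(
    topic="Pythagorean theorem",
    description="Visual proof of a^2 + b^2 = c^2",
    scene_outline=outline_text,          # hypothetical planner output
    scene_implementation=plan_text,      # hypothetical implementation plan
    scene_number=1,
)
# If rendering later fails, the captured stderr can be fed back in:
# code, raw_response = generator.fix_code_errors(plan_text, code, error_message,
#     scene_trace_id=None, topic="Pythagorean theorem", scene_number=1, session_id=None)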
src/core/parse_video.py ADDED
@@ -0,0 +1,227 @@
1
+ import os
2
+ import pysrt
3
+ from moviepy import VideoFileClip
4
+ import shutil
5
+ from PIL import Image, ImageOps
6
+ import numpy as np
7
+ import speech_recognition as sr
8
+
9
+ def get_images_from_video(video_path, fps=0.2):
10
+ """Extract frames from a video file at specified FPS.
11
+
12
+ Args:
13
+ video_path (str): Path to the video file.
14
+ fps (float, optional): Frames per second to extract. Defaults to 0.2.
15
+
16
+ Returns:
17
+ list: List of frames as numpy arrays.
18
+ """
19
+ clip = VideoFileClip(video_path)
20
+ images = clip.iter_frames(fps=fps)
21
+ return images
22
+
23
+ def image_with_most_non_black_space(images, output_path, return_type="path"):
24
+ """Find and save the image with the most non-black space from a list of images.
25
+
26
+ Args:
27
+ images (list): List of image file paths, PIL Image objects, or numpy arrays.
28
+ output_path (str): Path where the output image should be saved.
29
+ return_type (str, optional): Type of return value - "path" or "image". Defaults to "path".
30
+
31
+ Returns:
32
+ Union[str, PIL.Image, None]: Path to saved image, PIL Image object, or None if no valid image found.
33
+ """
34
+ max_non_black_area = 0
35
+ image_with_max_non_black_space = None
36
+
37
+ for img in images:
38
+ try:
39
+ # If img is a path, open the image
40
+ if isinstance(img, str):
41
+ image = Image.open(img)
42
+ elif isinstance(img, Image.Image):
43
+ image = img
44
+ elif isinstance(img, np.ndarray):
45
+ image = Image.fromarray(img)
46
+ else:
47
+ print(f"Unsupported type: {type(img)}. Skipping.")
48
+ continue
49
+
50
+ # Convert to grayscale
51
+ gray = ImageOps.grayscale(image)
52
+
53
+ # Convert to numpy array
54
+ gray_array = np.array(gray)
55
+
56
+ # Count non-black pixels (threshold to consider near-black as black)
57
+ non_black_pixels = np.sum(gray_array > 10) # Threshold 10 to account for slight variations in black
58
+
59
+ if non_black_pixels > max_non_black_area:
60
+ max_non_black_area = non_black_pixels
61
+ image_with_max_non_black_space = image
62
+
63
+ except Exception as e:
64
+ print(f"Warning: Unable to process image {img}: {e}")
65
+
66
+ if image_with_max_non_black_space is not None:
67
+ image_with_max_non_black_space.save(output_path)
68
+ print(f"Saved image with most non-black space to {output_path}")
69
+
70
+ if return_type == "path":
71
+ return output_path
72
+ else:
73
+ return image_with_max_non_black_space
74
+ return image_with_max_non_black_space
75
+
76
+ def parse_srt_to_text(output_dir, topic_name):
77
+ """Convert SRT subtitle file to plain text.
78
+
79
+ Args:
80
+ output_dir (str): Directory containing the topic folders.
81
+ topic_name (str): Name of the topic/video.
82
+ """
83
+ topic_name = topic_name.replace(" ", "_").lower()
84
+ srt_path = os.path.join(output_dir, topic_name, f"{topic_name}_combined.srt")
85
+ txt_path = os.path.join(output_dir, topic_name, f"{topic_name}_combined.txt")
86
+ subs = pysrt.open(srt_path)
87
+
88
+ with open(txt_path, 'w') as f:
89
+ full_text = ""
90
+ for sub in subs:
91
+ sub.text = sub.text.replace("...", ".")
92
+ full_text += sub.text + " "
93
+ f.write(full_text.strip())
94
+
95
+ def parse_srt_and_extract_frames(output_dir, topic_name):
96
+ """Extract frames from video at subtitle timestamps and save with corresponding text.
97
+
98
+ Args:
99
+ output_dir (str): Directory containing the topic folders.
100
+ topic_name (str): Name of the topic/video.
101
+ """
102
+ topic_name = topic_name.replace(" ", "_").lower()
103
+ video_path = os.path.join(output_dir, topic_name, f"{topic_name}_combined.mp4")
104
+ srt_path = os.path.join(output_dir, topic_name, f"{topic_name}_combined.srt")
105
+ subs = pysrt.open(srt_path)
106
+
107
+ # Create extract_images folder if it doesn't exist
108
+ images_dir = os.path.join(output_dir, topic_name, "extract_images")
109
+ if os.path.exists(images_dir):
110
+ shutil.rmtree(images_dir)
111
+ os.makedirs(images_dir)
112
+
113
+ # Load the video file
114
+ video = VideoFileClip(video_path)
115
+
116
+ # Dictionary to store image-text pairs
117
+ pairs = {}
118
+
119
+ i = 0
120
+ while i < len(subs):
121
+ sub = subs[i]
122
+ text = sub.text
123
+ sub_indexes = [sub.index]
124
+
125
+ # Check if we need to concatenate with next subtitle
126
+ while i < len(subs) - 1 and not text.strip().endswith('.'):
127
+ i += 1
128
+ next_sub = subs[i]
129
+ text += " " + next_sub.text
130
+ sub_indexes.append(next_sub.index)
131
+
132
+ # Get the end time of the last concatenated subtitle
133
+ end_time = sub.end.to_time()
134
+ # Convert end time to seconds
135
+ end_time_seconds = end_time.hour * 3600 + end_time.minute * 60 + end_time.second + end_time.microsecond / 1e6
136
+
137
+ # Save the frame as an image in extract_images folder
138
+ frame_path = os.path.join(images_dir, f"{sub.index}.jpg")
139
+ video.save_frame(frame_path, t=end_time_seconds)
140
+
141
+ # Save the subtitle text to a txt file
142
+ text_path = os.path.join(images_dir, f"{sub.index}.txt")
143
+ with open(text_path, 'w') as f:
144
+ f.write(text)
145
+
146
+ # Add pair to dictionary
147
+ pairs[str(sub.index)] = {
148
+ "image_path": f"{sub.index}.jpg",
149
+ "text": text,
150
+ "text_path": f"{sub.index}.txt",
151
+ "srt_index": sub_indexes,
152
+ }
153
+
154
+ i += 1
155
+
156
+ # Save pairs to json file
157
+ import json
158
+ json_path = os.path.join(images_dir, "pairs.json")
159
+ with open(json_path, 'w') as f:
160
+ json.dump(pairs, f, indent=4)
161
+
162
+ # Close the video file
163
+ video.close()
164
+
165
+ def extract_trasnscript(video_path):
166
+ """Extract transcript from video audio using Google Speech Recognition.
167
+
168
+ Args:
169
+ video_path (str): Path to the video file.
170
+
171
+ Returns:
172
+ str: Transcribed text from the video audio.
173
+
174
+ Raises:
175
+ FileNotFoundError: If video file does not exist.
176
+ """
177
+ if not os.path.exists(video_path):
178
+ raise FileNotFoundError(f"Video file not found: {video_path}")
179
+
180
+ clip = VideoFileClip(video_path)
181
+
182
+ # write the audio track to a temporary WAV file
183
+ audio_path = os.path.join(os.path.dirname(video_path), "audio.wav")
184
+ clip.audio.write_audiofile(audio_path)
185
+
186
+ try:
187
+ # extract the subtitles from the audio file
188
+ recognizer = sr.Recognizer()
189
+ with sr.AudioFile(audio_path) as source:
190
+ audio = recognizer.record(source)
191
+ return recognizer.recognize_google(audio)
192
+ finally:
193
+ # clean up the temporary audio file
194
+ if os.path.exists(audio_path):
195
+ os.remove(audio_path)
196
+
197
+ if __name__ == "__main__":
198
+ import argparse
199
+
200
+ def process_all_topics(output_folder):
201
+ """Process all topic folders in the output directory.
202
+
203
+ Args:
204
+ output_folder (str): Directory containing the topic folders.
205
+ """
206
+ # Only get immediate subdirectories
207
+ topics = [d for d in os.listdir(output_folder)
208
+ if os.path.isdir(os.path.join(output_folder, d))]
209
+
210
+ for topic in topics:
211
+ print(f"\nProcessing topic: {topic}")
212
+ try:
213
+ parse_srt_to_text(output_folder, topic)
214
+ parse_srt_and_extract_frames(output_folder, topic)
215
+ except Exception as e:
216
+ print(f"Error processing {topic}: {str(e)}")
217
+ continue
218
+
219
+ # Set up argument parser
220
+ parser = argparse.ArgumentParser(description='Process video files and extract frames with subtitles')
221
+ parser.add_argument('--output_dir', type=str, default="output",
222
+ help='Directory containing the topic folders')
223
+
224
+ args = parser.parse_args()
225
+
226
+ # Process topics using provided output directory
227
+ process_all_topics(args.output_dir)
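A brief usage sketch for the frame helpers added in this file. The paths below are placeholders, not outputs that exist in the repository.

```python
# Hypothetical usage of the helpers from src/core/parse_video.py.
# get_images_from_video yields numpy frames; image_with_most_non_black_space
# picks the frame with the most visible (non-black) content and saves it.
from src.core.parse_video import get_images_from_video, image_with_most_non_black_space

frames = get_images_from_video("output/my_topic/my_topic_combined.mp4", fps=0.2)
snapshot_path = image_with_most_non_black_space(
    frames, "output/my_topic/snapshot.png", return_type="path"
)
print(f"Representative frame saved to {snapshot_path}")
```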
src/core/video_planner.py ADDED
@@ -0,0 +1,417 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import glob
5
+ from typing import List, Optional
6
+ import uuid
7
+ import asyncio
8
+
9
+ from mllm_tools.utils import _prepare_text_inputs
10
+ from src.utils.utils import extract_xml
11
+ from task_generator import (
12
+ get_prompt_scene_plan,
13
+ get_prompt_scene_vision_storyboard,
14
+ get_prompt_scene_technical_implementation,
15
+ get_prompt_scene_animation_narration,
16
+ get_prompt_context_learning_scene_plan,
17
+ get_prompt_context_learning_vision_storyboard,
18
+ get_prompt_context_learning_technical_implementation,
19
+ get_prompt_context_learning_animation_narration,
20
+ get_prompt_context_learning_code
21
+ )
22
+ from src.rag.rag_integration import RAGIntegration
23
+
24
+ class VideoPlanner:
25
+ """A class for planning and generating video content.
26
+
27
+ This class handles the planning and generation of video content including scene outlines,
28
+ vision storyboards, technical implementations, and animation narrations.
29
+
30
+ Args:
31
+ planner_model: The model used for planning tasks
32
+ helper_model: Optional helper model, defaults to planner_model if None
33
+ output_dir (str): Directory for output files. Defaults to "output"
34
+ print_response (bool): Whether to print model responses. Defaults to False
35
+ use_context_learning (bool): Whether to use context learning. Defaults to False
36
+ context_learning_path (str): Path to context learning examples. Defaults to "data/context_learning"
37
+ use_rag (bool): Whether to use RAG. Defaults to False
38
+ session_id (str): Session identifier. Defaults to None
39
+ chroma_db_path (str): Path to ChromaDB. Defaults to "data/rag/chroma_db"
40
+ manim_docs_path (str): Path to Manim docs. Defaults to "data/rag/manim_docs"
41
+ embedding_model (str): Name of embedding model. Defaults to "text-embedding-ada-002"
42
+ use_langfuse (bool): Whether to use Langfuse logging. Defaults to True
43
+ """
44
+
45
+ def __init__(self, planner_model, helper_model=None, output_dir="output", print_response=False, use_context_learning=False, context_learning_path="data/context_learning", use_rag=False, session_id=None, chroma_db_path="data/rag/chroma_db", manim_docs_path="data/rag/manim_docs", embedding_model="text-embedding-ada-002", use_langfuse=True):
46
+ self.planner_model = planner_model
47
+ self.helper_model = helper_model if helper_model is not None else planner_model
48
+ self.output_dir = output_dir
49
+ self.print_response = print_response
50
+ self.use_context_learning = use_context_learning
51
+ self.context_learning_path = context_learning_path
52
+ # Initialize different types of context examples
53
+ self.scene_plan_examples = self._load_context_examples('scene_plan') if use_context_learning else None
54
+ self.vision_storyboard_examples = self._load_context_examples('scene_vision_storyboard') if use_context_learning else None
55
+ self.technical_implementation_examples = self._load_context_examples('technical_implementation') if use_context_learning else None
56
+ self.animation_narration_examples = self._load_context_examples('scene_animation_narration') if use_context_learning else None
57
+ self.code_examples = self._load_context_examples('code') if use_context_learning else None
58
+ self.use_rag = use_rag
59
+ self.rag_integration = None
60
+ if use_rag:
61
+ self.rag_integration = RAGIntegration(
62
+ helper_model=helper_model,
63
+ output_dir=output_dir,
64
+ chroma_db_path=chroma_db_path,
65
+ manim_docs_path=manim_docs_path,
66
+ embedding_model=embedding_model,
67
+ use_langfuse=use_langfuse,
68
+ session_id=session_id
69
+ )
70
+ self.relevant_plugins = [] # Initialize as an empty list
71
+
72
+ def _load_context_examples(self, example_type: str) -> str:
73
+ """Load context learning examples of a specific type from files.
74
+
75
+ Args:
76
+ example_type (str): Type of examples to load ('scene_plan', 'scene_vision_storyboard', etc.)
77
+
78
+ Returns:
79
+ str: Formatted string containing the loaded examples, or None if no examples found
80
+ """
81
+ examples = []
82
+
83
+ # Define file patterns for different types
84
+ file_patterns = {
85
+ 'scene_plan': '*_scene_plan.txt',
86
+ 'scene_vision_storyboard': '*_scene_vision_storyboard.txt',
87
+ 'technical_implementation': '*_technical_implementation.txt',
88
+ 'scene_animation_narration': '*_scene_animation_narration.txt',
89
+ 'code': '*.py'
90
+ }
91
+
92
+ pattern = file_patterns.get(example_type)
93
+ if not pattern:
94
+ return None
95
+
96
+ # Search in subdirectories of context_learning_path
97
+ for root, _, _ in os.walk(self.context_learning_path):
98
+ for example_file in glob.glob(os.path.join(root, pattern)):
99
+ with open(example_file, 'r') as f:
100
+ content = f.read()
101
+ if example_type == 'code':
102
+ examples.append(f"# Example from {os.path.basename(example_file)}\n{content}\n")
103
+ else:
104
+ examples.append(f"# Example from {os.path.basename(example_file)}\n{content}\n")
105
+
106
+ # Format examples using appropriate template
107
+ if examples:
108
+ formatted_examples = self._format_examples(example_type, examples)
109
+ return formatted_examples
110
+ return None
111
+
112
+ def _format_examples(self, example_type: str, examples: List[str]) -> str:
113
+ """Format examples using the appropriate template based on their type.
114
+
115
+ Args:
116
+ example_type (str): Type of examples to format
117
+ examples (List[str]): List of example strings to format
118
+
119
+ Returns:
120
+ str: Formatted examples string, or None if no template found
121
+ """
122
+ templates = {
123
+ 'scene_plan': get_prompt_context_learning_scene_plan,
124
+ 'scene_vision_storyboard': get_prompt_context_learning_vision_storyboard,
125
+ 'technical_implementation': get_prompt_context_learning_technical_implementation,
126
+ 'scene_animation_narration': get_prompt_context_learning_animation_narration,
127
+ 'code': get_prompt_context_learning_code
128
+ }
129
+
130
+ template = templates.get(example_type)
131
+ if template:
132
+ return template(examples="\n".join(examples))
133
+ return None
134
+
135
+ def generate_scene_outline(self,
136
+ topic: str,
137
+ description: str,
138
+ session_id: str) -> str:
139
+ """Generate a scene outline based on the topic and description.
140
+
141
+ Args:
142
+ topic (str): The topic of the video
143
+ description (str): Description of the video content
144
+ session_id (str): Session identifier
145
+
146
+ Returns:
147
+ str: Generated scene outline
148
+ """
149
+ # Detect relevant plugins upfront if RAG is enabled
150
+ if self.use_rag:
151
+ self.relevant_plugins = self.rag_integration.detect_relevant_plugins(topic, description) or []
152
+ self.rag_integration.set_relevant_plugins(self.relevant_plugins)
153
+ print(f"Detected relevant plugins: {self.relevant_plugins}")
154
+
155
+ prompt = get_prompt_scene_plan(topic, description)
156
+
157
+ if self.use_context_learning and self.scene_plan_examples:
158
+ prompt += f"\n\nHere are some example scene plans for reference:\n{self.scene_plan_examples}"
159
+
160
+ # Generate plan using planner model
161
+ response_text = self.planner_model(
162
+ _prepare_text_inputs(prompt),
163
+ metadata={"generation_name": "scene_outline", "tags": [topic, "scene-outline"], "session_id": session_id}
164
+ )
165
+ # extract scene outline <SCENE_OUTLINE> ... </SCENE_OUTLINE>
166
+ scene_outline_match = re.search(r'(<SCENE_OUTLINE>.*?</SCENE_OUTLINE>)', response_text, re.DOTALL)
167
+ scene_outline = scene_outline_match.group(1) if scene_outline_match else response_text
168
+
169
+ # replace all spaces and special characters with underscores for file path compatibility
170
+ file_prefix = topic.lower()
171
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
172
+ # save plan to file
173
+ os.makedirs(os.path.join(self.output_dir, file_prefix), exist_ok=True) # Ensure directory exists
174
+ with open(os.path.join(self.output_dir, file_prefix, f"{file_prefix}_scene_outline.txt"), "w") as f:
175
+ f.write(scene_outline)
176
+ print(f"Plan saved to {file_prefix}_scene_outline.txt")
177
+
178
+ return scene_outline
179
+
180
+ async def _generate_scene_implementation_single(self, topic: str, description: str, scene_outline_i: str, i: int, file_prefix: str, session_id: str, scene_trace_id: str) -> str:
181
+ """Generate implementation plan for a single scene.
182
+
183
+ Args:
184
+ topic (str): The topic of the video
185
+ description (str): Description of the video content
186
+ scene_outline_i (str): Outline for this specific scene
187
+ i (int): Scene number
188
+ file_prefix (str): Prefix for output files
189
+ session_id (str): Session identifier
190
+ scene_trace_id (str): Unique trace ID for this scene
191
+
192
+ Returns:
193
+ str: Generated implementation plan for the scene
194
+ """
195
+ # Initialize empty implementation plan
196
+ implementation_plan = ""
197
+ scene_dir = os.path.join(self.output_dir, file_prefix, f"scene{i}")
198
+ subplan_dir = os.path.join(scene_dir, "subplans")
199
+ os.makedirs(scene_dir, exist_ok=True)
200
+ os.makedirs(subplan_dir, exist_ok=True)
201
+
202
+ # Save scene_trace_id to file
203
+ trace_id_file = os.path.join(subplan_dir, "scene_trace_id.txt")
204
+ with open(trace_id_file, 'w') as f:
205
+ f.write(scene_trace_id)
206
+ print(f"Scene trace ID saved to {trace_id_file}")
207
+
208
+ # ===== Step 1: Generate Scene Vision and Storyboard =====
209
+ # ===================================================
210
+ prompt_vision_storyboard = get_prompt_scene_vision_storyboard(i, topic, description, scene_outline_i, self.relevant_plugins)
211
+
212
+ # Add vision storyboard examples only for this stage if available
213
+ if self.use_context_learning and self.vision_storyboard_examples:
214
+ prompt_vision_storyboard += f"\n\nHere are some example storyboards:\n{self.vision_storyboard_examples}"
215
+
216
+ if self.rag_integration:
217
+ # Use the already detected plugins instead of detecting again
218
+ # relevant_plugins = self.relevant_plugins # Removed redundant variable
219
+ # print(f"Using detected plugins: {relevant_plugins}") # Removed redundant print
220
+
221
+ # Generate RAG queries
222
+ rag_queries = self.rag_integration._generate_rag_queries_storyboard(
223
+ scene_plan=scene_outline_i,
224
+ scene_trace_id=scene_trace_id,
225
+ topic=topic,
226
+ scene_number=i,
227
+ session_id=session_id,
228
+ relevant_plugins=self.relevant_plugins # Use self.relevant_plugins directly
229
+ )
230
+
231
+ retrieved_docs = self.rag_integration.get_relevant_docs(
232
+ rag_queries=rag_queries,
233
+ scene_trace_id=scene_trace_id,
234
+ topic=topic,
235
+ scene_number=i
236
+ )
237
+
238
+ # Add documentation to prompt
239
+ prompt_vision_storyboard += f"\n\n{retrieved_docs}"
240
+
241
+ vision_storyboard_plan = self.planner_model(
242
+ _prepare_text_inputs(prompt_vision_storyboard),
243
+ metadata={"generation_name": "scene_vision_storyboard", "trace_id": scene_trace_id, "tags": [topic, f"scene{i}"], "session_id": session_id}
244
+ )
245
+ # extract vision storyboard plan <SCENE_VISION_STORYBOARD_PLAN> ... </SCENE_VISION_STORYBOARD_PLAN>
246
+ vision_match = re.search(r'(<SCENE_VISION_STORYBOARD_PLAN>.*?</SCENE_VISION_STORYBOARD_PLAN>)', vision_storyboard_plan, re.DOTALL)
247
+ vision_storyboard_plan = vision_match.group(1) if vision_match else vision_storyboard_plan
248
+ implementation_plan += vision_storyboard_plan + "\n\n"
249
+ file_path_vs = os.path.join(subplan_dir, f"{file_prefix}_scene{i}_vision_storyboard_plan.txt")
250
+ with open(file_path_vs, "w") as f:
251
+ f.write(vision_storyboard_plan)
252
+ print(f"Scene {i} Vision and Storyboard Plan saved to {file_path_vs}")
253
+
254
+ # ===== Step 2: Generate Technical Implementation Plan =====
255
+ # =========================================================
256
+ prompt_technical_implementation = get_prompt_scene_technical_implementation(i, topic, description, scene_outline_i, vision_storyboard_plan, self.relevant_plugins)
257
+
258
+ # Add technical implementation examples only for this stage if available
259
+ if self.use_context_learning and self.technical_implementation_examples:
260
+ prompt_technical_implementation += f"\n\nHere are some example technical implementations:\n{self.technical_implementation_examples}"
261
+
262
+ if self.rag_integration:
263
+ # Use the already detected plugins instead of detecting again
264
+ # relevant_plugins = self.relevant_plugins # Removed redundant variable
265
+ # print(f"Using detected plugins: {relevant_plugins}") # Removed redundant print
266
+
267
+ # Generate RAG queries
268
+ rag_queries = self.rag_integration._generate_rag_queries_technical(
269
+ storyboard=vision_storyboard_plan,
270
+ scene_trace_id=scene_trace_id,
271
+ topic=topic,
272
+ scene_number=i,
273
+ session_id=session_id,
274
+ relevant_plugins=self.relevant_plugins # Use self.relevant_plugins directly
275
+ )
276
+
277
+ retrieved_docs = self.rag_integration.get_relevant_docs(
278
+ rag_queries=rag_queries,
279
+ scene_trace_id=scene_trace_id,
280
+ topic=topic,
281
+ scene_number=i
282
+ )
283
+
284
+ # Add documentation to prompt
285
+ prompt_technical_implementation += f"\n\n{retrieved_docs}"
286
+
287
+ technical_implementation_plan = self.planner_model(
288
+ _prepare_text_inputs(prompt_technical_implementation),
289
+ metadata={"generation_name": "scene_technical_implementation", "trace_id": scene_trace_id, "tags": [topic, f"scene{i}"], "session_id": session_id}
290
+ )
291
+ # extract technical implementation plan <SCENE_TECHNICAL_IMPLEMENTATION_PLAN> ... </SCENE_TECHNICAL_IMPLEMENTATION_PLAN>
292
+ technical_match = re.search(r'(<SCENE_TECHNICAL_IMPLEMENTATION_PLAN>.*?</SCENE_TECHNICAL_IMPLEMENTATION_PLAN>)', technical_implementation_plan, re.DOTALL)
293
+ technical_implementation_plan = technical_match.group(1) if technical_match else technical_implementation_plan
294
+ implementation_plan += technical_implementation_plan + "\n\n"
295
+ file_path_ti = os.path.join(subplan_dir, f"{file_prefix}_scene{i}_technical_implementation_plan.txt")
296
+ with open(file_path_ti, "w") as f:
297
+ f.write(technical_implementation_plan)
298
+ print(f"Scene {i} Technical Implementation Plan saved to {file_path_ti}")
299
+
300
+ # ===== Step 3: Generate Animation and Narration Plan =====
301
+ # =========================================================
302
+ prompt_animation_narration = get_prompt_scene_animation_narration(i, topic, description, scene_outline_i, vision_storyboard_plan, technical_implementation_plan, self.relevant_plugins)
303
+
304
+ # Add animation narration examples only for this stage if available
305
+ if self.use_context_learning and self.animation_narration_examples:
306
+ prompt_animation_narration += f"\n\nHere are some example animation and narration plans:\n{self.animation_narration_examples}"
307
+
308
+ if self.rag_integration:
309
+ rag_queries = self.rag_integration._generate_rag_queries_narration(
310
+ storyboard=vision_storyboard_plan,
311
+ scene_trace_id=scene_trace_id,
312
+ topic=topic,
313
+ scene_number=i,
314
+ session_id=session_id,
315
+ relevant_plugins=self.relevant_plugins # Use self.relevant_plugins directly
316
+ )
317
+ retrieved_docs = self.rag_integration.get_relevant_docs(
318
+ rag_queries=rag_queries,
319
+ scene_trace_id=scene_trace_id,
320
+ topic=topic,
321
+ scene_number=i
322
+ )
323
+ prompt_animation_narration += f"\n\n{retrieved_docs}"
324
+
325
+ animation_narration_plan = self.planner_model(
326
+ _prepare_text_inputs(prompt_animation_narration),
327
+ metadata={"generation_name": "scene_animation_narration", "trace_id": scene_trace_id, "tags": [topic, f"scene{i}"], "session_id": session_id}
328
+ )
329
+ # extract animation narration plan <SCENE_ANIMATION_NARRATION_PLAN> ... </SCENE_ANIMATION_NARRATION_PLAN>
330
+ animation_match = re.search(r'(<SCENE_ANIMATION_NARRATION_PLAN>.*?</SCENE_ANIMATION_NARRATION_PLAN>)', animation_narration_plan, re.DOTALL)
331
+ animation_narration_plan = animation_match.group(1) if animation_match else animation_narration_plan
332
+ implementation_plan += animation_narration_plan + "\n\n"
333
+ file_path_an = os.path.join(subplan_dir, f"{file_prefix}_scene{i}_animation_narration_plan.txt")
334
+ with open(file_path_an, "w") as f:
335
+ f.write(animation_narration_plan)
336
+ print(f"Scene {i} Animation and Narration Plan saved to {file_path_an}")
337
+
338
+ # ===== Step 4: Save Implementation Plan =====
339
+ # ==========================================
340
+ # save the overall implementation plan to file
341
+ with open(os.path.join(self.output_dir, file_prefix, f"scene{i}", f"{file_prefix}_scene{i}_implementation_plan.txt"), "w") as f:
342
+ f.write(f"# Scene {i} Implementation Plan\n\n")
343
+ f.write(implementation_plan)
344
+ print(f"Scene {i} Implementation Plan saved to {file_prefix}_scene{i}_implementation_plan.txt")
345
+
346
+ return implementation_plan
347
+
348
+ async def generate_scene_implementation(self,
349
+ topic: str,
350
+ description: str,
351
+ plan: str,
352
+ session_id: str) -> List[str]:
353
+ """Generate detailed implementation plans for all scenes.
354
+
355
+ Args:
356
+ topic (str): The topic of the video
357
+ description (str): Description of the video content
358
+ plan (str): Overall scene plan
359
+ session_id (str): Session identifier
360
+
361
+ Returns:
362
+ List[str]: List of implementation plans for each scene
363
+ """
364
+ # extract scene outline <SCENE_OUTLINE> ... </SCENE_OUTLINE>
365
+ scene_outline = re.search(r'(<SCENE_OUTLINE>.*?</SCENE_OUTLINE>)', plan, re.DOTALL).group(1)
366
+ # check the number of scenes in the outline
367
+ scene_number = len(re.findall(r'<SCENE_(\d+)>[^<]', scene_outline))
368
+ # replace all spaces and special characters with underscores for file path compatibility
369
+ file_prefix = topic.lower()
370
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
371
+ # generate implementation plan for each scene
372
+ all_scene_implementation_plans = []
373
+
374
+ tasks = []
375
+ for i in range(1, scene_number + 1):
376
+ print(f"Generating implementation plan for scene {i} in topic {topic}")
377
+ scene_outline_i = re.search(r'(<SCENE_{i}>.*?</SCENE_{i}>)'.format(i=i), scene_outline, re.DOTALL).group(1)
378
+ scene_trace_id = str(uuid.uuid4())
379
+ task = asyncio.create_task(self._generate_scene_implementation_single(topic, description, scene_outline_i, i, file_prefix, session_id, scene_trace_id))
380
+ tasks.append(task)
381
+
382
+ all_scene_implementation_plans = await asyncio.gather(*tasks)
383
+ return all_scene_implementation_plans
384
+
385
+ async def generate_scene_implementation_concurrently(self,
386
+ topic: str,
387
+ description: str,
388
+ plan: str,
389
+ session_id: str,
390
+ scene_semaphore) -> List[str]:
391
+ """Generate detailed implementation plans for all scenes concurrently with controlled concurrency.
392
+
393
+ Args:
394
+ topic (str): The topic of the video
395
+ description (str): Description of the video content
396
+ plan (str): Overall scene plan
397
+ session_id (str): Session identifier
398
+ scene_semaphore: Semaphore to control concurrent scene generation
399
+
400
+ Returns:
401
+ List[str]: List of implementation plans for each scene
402
+ """
403
+ scene_outline = extract_xml(plan)
404
+ scene_number = len(re.findall(r'<SCENE_(\d+)>[^<]', scene_outline))
405
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', topic.lower())
406
+ all_scene_implementation_plans = []
407
+
408
+ async def generate_single_scene_implementation(i):
409
+ async with scene_semaphore: # controls parallelism
410
+ print(f"Generating implementation plan for scene {i} in topic {topic}")
411
+ scene_outline_i = re.search(r'(<SCENE_{i}>.*?</SCENE_{i}>)'.format(i=i), scene_outline, re.DOTALL).group(1)
412
+ scene_trace_id = str(uuid.uuid4()) # Generate UUID here
413
+ return await self._generate_scene_implementation_single(topic, description, scene_outline_i, i, file_prefix, session_id, scene_trace_id)
414
+
415
+ tasks = [generate_single_scene_implementation(i + 1) for i in range(scene_number)]
416
+ all_scene_implementation_plans = await asyncio.gather(*tasks)
417
+ return all_scene_implementation_plans
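To show how the planner is intended to be driven, a hedged end-to-end sketch follows. The model wrapper, topic, and concurrency limit are illustrative assumptions; only the `VideoPlanner` API is taken from the file above.

```python
import asyncio
import uuid

from src.core.video_planner import VideoPlanner

async def plan_video(planner_model):
    # planner_model stands in for one of the mllm_tools wrappers (assumption).
    planner = VideoPlanner(planner_model=planner_model, output_dir="output")
    session_id = str(uuid.uuid4())

    outline = planner.generate_scene_outline(
        topic="Pythagorean theorem",
        description="A short explainer with a visual proof.",
        session_id=session_id,
    )

    # Cap how many scenes are planned concurrently.
    scene_semaphore = asyncio.Semaphore(3)
    return await planner.generate_scene_implementation_concurrently(
        topic="Pythagorean theorem",
        description="A short explainer with a visual proof.",
        plan=outline,
        session_id=session_id,
        scene_semaphore=scene_semaphore,
    )

# plans = asyncio.run(plan_video(my_planner_model))
```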
src/core/video_renderer.py ADDED
@@ -0,0 +1,448 @@
1
+ import os
2
+ import re
3
+ import subprocess
4
+ import asyncio
5
+ from PIL import Image
6
+ from typing import Optional, List
7
+ import traceback
8
+ import sys
9
+
10
+ from src.core.parse_video import (
11
+ get_images_from_video,
12
+ image_with_most_non_black_space
13
+ )
14
+ from mllm_tools.vertex_ai import VertexAIWrapper
15
+ from mllm_tools.gemini import GeminiWrapper
16
+
17
+ class VideoRenderer:
18
+ """Class for rendering and combining Manim animation videos."""
19
+
20
+ def __init__(self, output_dir="output", print_response=False, use_visual_fix_code=False):
21
+ """Initialize the VideoRenderer.
22
+
23
+ Args:
24
+ output_dir (str, optional): Directory for output files. Defaults to "output".
25
+ print_response (bool, optional): Whether to print responses. Defaults to False.
26
+ use_visual_fix_code (bool, optional): Whether to use visual fix code. Defaults to False.
27
+ """
28
+ self.output_dir = output_dir
29
+ self.print_response = print_response
30
+ self.use_visual_fix_code = use_visual_fix_code
31
+
32
+ async def render_scene(self, code: str, file_prefix: str, curr_scene: int, curr_version: int, code_dir: str, media_dir: str, max_retries: int = 3, use_visual_fix_code=False, visual_self_reflection_func=None, banned_reasonings=None, scene_trace_id=None, topic=None, session_id=None):
33
+ """Render a single scene and handle error retries and visual fixes.
34
+
35
+ Args:
36
+ code (str): The Manim code to render
37
+ file_prefix (str): Prefix for output files
38
+ curr_scene (int): Current scene number
39
+ curr_version (int): Current version number
40
+ code_dir (str): Directory for code files
41
+ media_dir (str): Directory for media output
42
+ max_retries (int, optional): Maximum retry attempts. Defaults to 3.
43
+ use_visual_fix_code (bool, optional): Whether to use visual fix code. Defaults to False.
44
+ visual_self_reflection_func (callable, optional): Function for visual self-reflection. Defaults to None.
45
+ banned_reasonings (list, optional): List of banned reasoning strings. Defaults to None.
46
+ scene_trace_id (str, optional): Scene trace identifier. Defaults to None.
47
+ topic (str, optional): Topic name. Defaults to None.
48
+ session_id (str, optional): Session identifier. Defaults to None.
49
+
50
+ Returns:
51
+ tuple: (code, error_message) where error_message is None on success
52
+ """
53
+ retries = 0
54
+ while retries < max_retries:
55
+ try:
56
+ # Execute manim in a thread to prevent blocking
57
+ file_path = os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}.py")
58
+ result = await asyncio.to_thread(
59
+ subprocess.run,
60
+ ["manim", "-qh", file_path, "--media_dir", media_dir, "--progress_bar", "none"],
61
+ capture_output=True,
62
+ text=True
63
+ )
64
+
65
+ # if result.returncode != 0, it means that the code is not rendered successfully
66
+ # so we need to fix the code by returning the code and the error message
67
+ if result.returncode != 0:
68
+ raise Exception(result.stderr)
69
+
70
+ if use_visual_fix_code and visual_self_reflection_func and banned_reasonings:
71
+ # Get the rendered video path
72
+ video_path = os.path.join(
73
+ media_dir,
74
+ "videos",
75
+ f"{file_prefix}_scene{curr_scene}_v{curr_version}.mp4"
76
+ )
77
+
78
+ # For Gemini/Vertex AI models, pass the video directly
79
+ if self.scene_model.model_name.startswith(('gemini/', 'vertex_ai/')):
80
+ media_input = video_path
81
+ else:
82
+ # For other models, use image snapshot
83
+ media_input = self.create_snapshot_scene(
84
+ topic, curr_scene, curr_version, return_type="path"
85
+ )
86
+
87
+ new_code, log = visual_self_reflection_func(
88
+ code,
89
+ media_input,
90
+ scene_trace_id=scene_trace_id,
91
+ topic=topic,
92
+ scene_number=curr_scene,
93
+ session_id=session_id
94
+ )
95
+
96
+ with open(os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}_vfix_log.txt"), "w") as f:
97
+ f.write(log)
98
+
99
+ # Check for termination markers
100
+ if "<LGTM>" in new_code or any(word in new_code for word in banned_reasonings):
101
+ break
102
+
103
+ code = new_code
104
+ curr_version += 1
105
+ with open(os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}.py"), "w") as f:
106
+ f.write(code)
107
+ print(f"Code saved to scene{curr_scene}/code/{file_prefix}_scene{curr_scene}_v{curr_version}.py")
108
+ retries = 0
109
+ continue
110
+
111
+ break # Exit retry loop on success
112
+
113
+ except Exception as e:
114
+ print(f"Error: {e}")
115
+ print(f"Retrying {retries+1} of {max_retries}...")
116
+
117
+ with open(os.path.join(code_dir, f"{file_prefix}_scene{curr_scene}_v{curr_version}_error.log"), "a") as f:
118
+ f.write(f"\nError in attempt {retries}:\n{str(e)}\n")
119
+ retries += 1
120
+ return code, str(e) # Indicate failure and return error message
121
+
122
+ print(f"Successfully rendered {file_path}")
123
+ with open(os.path.join(self.output_dir, file_prefix, f"scene{curr_scene}", "succ_rendered.txt"), "w") as f:
124
+ f.write("")
125
+
126
+ return code, None # Indicate success
127
+
128
+ def run_manim_process(self,
129
+ topic: str):
130
+ """Run manim on all generated manim code for a specific topic.
131
+
132
+ Args:
133
+ topic (str): Topic name to process
134
+
135
+ Returns:
136
+ subprocess.CompletedProcess: Result of the final manim process
137
+ """
138
+ file_prefix = topic.lower()
139
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
140
+ search_path = os.path.join(self.output_dir, file_prefix)
141
+ # Iterate through scene folders
142
+ scene_folders = [f for f in os.listdir(search_path) if os.path.isdir(os.path.join(search_path, f))]
143
+ scene_folders.sort() # Sort to process scenes in order
144
+
145
+ for folder in scene_folders:
146
+ folder_path = os.path.join(search_path, folder)
147
+
148
+ # Get all Python files in version order
149
+ py_files = [f for f in os.listdir(folder_path) if f.endswith('.py')]
150
+ py_files.sort(key=lambda x: int(x.split('_v')[-1].split('.')[0])) # Sort by version number
151
+
152
+ for file in py_files:
153
+ file_path = os.path.join(folder_path, file)
154
+ try:
155
+ media_dir = os.path.join(self.output_dir, file_prefix, "media")
156
+ result = subprocess.run(
157
+ f"manim -qh {file_path} --media_dir {media_dir}",
158
+ shell=True,
159
+ capture_output=True,
160
+ text=True
161
+ )
162
+ if result.returncode != 0:
163
+ raise Exception(result.stderr)
164
+ print(f"Successfully rendered {file}")
165
+ break # Move to next scene folder if successful
166
+ except Exception as e:
167
+ print(f"Error rendering {file}: {e}")
168
+ error_log_path = os.path.join(folder_path, f"{file.split('.')[0]}_error.log")  # strip the .py extension
169
+ with open(error_log_path, "w") as f:
170
+ f.write(f"Error:\n{str(e)}\n")
171
+ print(f"Error log saved to {error_log_path}")
172
+ return result
173
+
174
+ def create_snapshot_scene(self, topic: str, scene_number: int, version_number: int, return_type: str = "image"):
175
+ """Create a snapshot of the video for a specific topic and scene.
176
+
177
+ Args:
178
+ topic (str): Topic name
179
+ scene_number (int): Scene number
180
+ version_number (int): Version number
181
+ return_type (str, optional): Type of return value - "path" or "image". Defaults to "image".
182
+
183
+ Returns:
184
+ Union[str, PIL.Image]: Path to saved image or PIL Image object
185
+
186
+ Raises:
187
+ FileNotFoundError: If no mp4 files found in video folder
188
+ """
189
+ file_prefix = topic.lower()
190
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
191
+ search_path = os.path.join(self.output_dir, file_prefix)
192
+ video_folder_path = os.path.join(search_path, "media", "videos", f"{file_prefix}_scene{scene_number}_v{version_number}", "1080p60")
193
+ os.makedirs(video_folder_path, exist_ok=True)
194
+ snapshot_path = os.path.join(video_folder_path, "snapshot.png")
195
+ # Get the mp4 video file from the video folder path
196
+ video_files = [f for f in os.listdir(video_folder_path) if f.endswith('.mp4')]
197
+ if not video_files:
198
+ raise FileNotFoundError(f"No mp4 files found in {video_folder_path}")
199
+ video_path = os.path.join(video_folder_path, video_files[0])
200
+ saved_image = image_with_most_non_black_space(get_images_from_video(video_path), snapshot_path, return_type=return_type)
201
+ return saved_image
202
+
203
+ def combine_videos(self, topic: str):
204
+ """Combine all videos and subtitle files for a specific topic using ffmpeg.
205
+
206
+ Args:
207
+ topic (str): Topic name to combine videos for
208
+
209
+ This function will:
210
+ - Find all scene videos and subtitles
211
+ - Combine videos with or without audio
212
+ - Merge subtitle files with correct timing
213
+ - Save combined video and subtitles to output directory
214
+ """
215
+ file_prefix = topic.lower()
216
+ file_prefix = re.sub(r'[^a-z0-9_]+', '_', file_prefix)
217
+ search_path = os.path.join(self.output_dir, file_prefix, "media", "videos")
218
+
219
+ # Create output directory if it doesn't exist
220
+ video_output_dir = os.path.join(self.output_dir, file_prefix)
221
+ os.makedirs(video_output_dir, exist_ok=True)
222
+
223
+ output_video_path = os.path.join(video_output_dir, f"{file_prefix}_combined.mp4")
224
+ output_srt_path = os.path.join(video_output_dir, f"{file_prefix}_combined.srt")
225
+
226
+ if os.path.exists(output_video_path) and os.path.exists(output_srt_path):
227
+ print(f"Combined video and subtitles already exist at {output_video_path}, not combining again.")
228
+ return
229
+
230
+ # Get scene count from outline
231
+ scene_outline_path = os.path.join(self.output_dir, file_prefix, f"{file_prefix}_scene_outline.txt")
232
+ if not os.path.exists(scene_outline_path):
233
+ print(f"Warning: Scene outline file not found at {scene_outline_path}. Cannot determine scene count.")
234
+ return
235
+ with open(scene_outline_path) as f:
236
+ plan = f.read()
237
+ scene_outline = re.search(r'(<SCENE_OUTLINE>.*?</SCENE_OUTLINE>)', plan, re.DOTALL).group(1)
238
+ scene_count = len(re.findall(r'<SCENE_(\d+)>[^<]', scene_outline))
239
+
240
+ # Find all scene folders and videos
241
+ scene_folders = []
242
+ for root, dirs, files in os.walk(search_path):
243
+ for dir in dirs:
244
+ if dir.startswith(file_prefix + "_scene"):
245
+ scene_folders.append(os.path.join(root, dir))
246
+
247
+ scene_videos = []
248
+ scene_subtitles = []
249
+
250
+ for scene_num in range(1, scene_count + 1):
251
+ folders = [f for f in scene_folders if int(f.split("scene")[-1].split("_")[0]) == scene_num]
252
+ if not folders:
253
+ print(f"Warning: Missing scene {scene_num}")
254
+ continue
255
+
256
+ folders.sort(key=lambda f: int(f.split("_v")[-1]))
257
+ folder = folders[-1]
258
+
259
+ video_found = False
260
+ subtitles_found = False
261
+ for filename in os.listdir(os.path.join(folder, "1080p60")):
262
+ if filename.endswith('.mp4'):
263
+ scene_videos.append(os.path.join(folder, "1080p60", filename))
264
+ video_found = True
265
+ elif filename.endswith('.srt'):
266
+ scene_subtitles.append(os.path.join(folder, "1080p60", filename))
267
+ subtitles_found = True
268
+
269
+ if not video_found:
270
+ print(f"Warning: Missing video for scene {scene_num}")
271
+ if not subtitles_found:
272
+ scene_subtitles.append(None)
273
+
274
+ if len(scene_videos) != scene_count:
275
+ print("Not all videos/subtitles are found, aborting video combination.")
276
+ return
277
+
278
+ try:
279
+ import ffmpeg # You might need to install ffmpeg-python package: pip install ffmpeg-python
280
+ from tqdm import tqdm
281
+
282
+ print("Analyzing video streams...")
283
+ # Check if videos have audio streams
284
+ has_audio = []
285
+ for video in tqdm(scene_videos, desc="Checking audio streams"):
286
+ probe = ffmpeg.probe(video)
287
+ audio_streams = [stream for stream in probe['streams'] if stream['codec_type'] == 'audio']
288
+ has_audio.append(len(audio_streams) > 0)
289
+
290
+ print("Preparing video combination...")
291
+ # If any video has audio, we need to ensure all videos have audio streams
292
+ if any(has_audio):
293
+ # Create list to store video and audio streams
294
+ streams = []
295
+ for video, has_aud in tqdm(list(zip(scene_videos, has_audio)), desc="Processing videos"):
296
+ if has_aud:
297
+ # Video has audio, use as is
298
+ input_vid = ffmpeg.input(video)
299
+ streams.extend([input_vid['v'], input_vid['a']])
300
+ else:
301
+ # Video lacks audio, add silent audio
302
+ input_vid = ffmpeg.input(video)
303
+ # Generate silent audio for the duration of the video
304
+ probe = ffmpeg.probe(video)
305
+ duration = float(probe['streams'][0]['duration'])
306
+ silent_audio = ffmpeg.input(f'anullsrc=channel_layout=stereo:sample_rate=44100',
307
+ f='lavfi', t=duration)['a']
308
+ streams.extend([input_vid['v'], silent_audio])
309
+
310
+ print("Combining videos with audio...")
311
+ try:
312
+ # Concatenate all streams using optimized CPU encoding settings
313
+ concat = ffmpeg.concat(*streams, v=1, a=1, unsafe=True)
314
+ process = (
315
+ concat
316
+ .output(output_video_path,
317
+ **{'c:v': 'libx264',
318
+ 'c:a': 'aac',
319
+ 'preset': 'veryfast', # Changed from ultrafast for better speed/quality balance
320
+ 'crf': '28', # Same quality setting
321
+ 'threads': '0', # Use all CPU threads
322
+ 'tune': 'fastdecode', # Optimize for decoding speed
323
+ 'profile:v': 'baseline', # Simpler profile for faster encoding
324
+ 'level': '4.0',
325
+ 'x264-params': 'aq-mode=0:no-deblock:no-cabac:ref=1:subme=0:trellis=0:weightp=0', # Added aggressive speed optimizations
326
+ 'movflags': '+faststart',
327
+ 'stats': None,
328
+ 'progress': 'pipe:1'})
329
+ .overwrite_output()
330
+ .run_async(pipe_stdout=True, pipe_stderr=True)
331
+ )
332
+
333
+ # Process progress output
334
+ while True:
335
+ line = process.stdout.readline().decode('utf-8')
336
+ if not line:
337
+ break
338
+ if 'frame=' in line:
339
+ sys.stdout.write('\rProcessing: ' + line.strip())
340
+ sys.stdout.flush()
341
+
342
+ # Wait for the process to complete and capture output
343
+ stdout, stderr = process.communicate()
344
+ print("\nEncoding complete!")
345
+
346
+ except ffmpeg.Error as e:
347
+ print(f"FFmpeg stdout:\n{e.stdout.decode('utf8')}")
348
+ print(f"FFmpeg stderr:\n{e.stderr.decode('utf8')}")
349
+ raise
350
+ else:
351
+ # No videos have audio, concatenate video streams only
352
+ streams = []
353
+ for video in tqdm(scene_videos, desc="Processing videos"):
354
+ streams.append(ffmpeg.input(video)['v'])
355
+
356
+ print("Combining videos without audio...")
357
+ try:
358
+ concat = ffmpeg.concat(*streams, v=1, unsafe=True)
359
+ process = (
360
+ concat
361
+ .output(output_video_path,
362
+ **{'c:v': 'libx264',
363
+ 'preset': 'medium',
364
+ 'crf': '23',
365
+ 'stats': None, # Enable progress stats
366
+ 'progress': 'pipe:1'}) # Output progress to pipe
367
+ .overwrite_output()
368
+ .run_async(pipe_stdout=True, pipe_stderr=True)
369
+ )
370
+
371
+ # Process progress output
372
+ while True:
373
+ line = process.stdout.readline().decode('utf-8')
374
+ if not line:
375
+ break
376
+ if 'frame=' in line:
377
+ sys.stdout.write('\rProcessing: ' + line.strip())
378
+ sys.stdout.flush()
379
+
380
+ # Wait for the process to complete and capture output
381
+ stdout, stderr = process.communicate()
382
+ print("\nEncoding complete!")
383
+
384
+ except ffmpeg.Error as e:
385
+ print(f"FFmpeg stdout:\n{e.stdout.decode('utf8')}")
386
+ print(f"FFmpeg stderr:\n{e.stderr.decode('utf8')}")
387
+ raise
388
+
389
+ print(f"Successfully combined videos into {output_video_path}")
390
+
391
+ # Handle subtitle combination (existing subtitle code remains the same)
392
+ if scene_subtitles:
393
+ with open(output_srt_path, 'w', encoding='utf-8') as outfile:
394
+ current_time_offset = 0
395
+ subtitle_index = 1
396
+
397
+ for srt_file, video_file in zip(scene_subtitles, scene_videos):
398
+ if srt_file is None:
399
+ continue
400
+
401
+ with open(srt_file, 'r', encoding='utf-8') as infile:
402
+ lines = infile.readlines()
403
+ i = 0
404
+ while i < len(lines):
405
+ line = lines[i].strip()
406
+ if line.isdigit(): # Subtitle index
407
+ outfile.write(f"{subtitle_index}\n")
408
+ subtitle_index += 1
409
+ i += 1
410
+
411
+ # Time codes line
412
+ time_line = lines[i].strip()
413
+ start_time, end_time = time_line.split(' --> ')
414
+
415
+ # Convert time codes and add offset
416
+ def adjust_time(time_str, offset):
417
+ h, m, s = time_str.replace(',', '.').split(':')
418
+ total_seconds = float(h) * 3600 + float(m) * 60 + float(s) + offset
419
+ h = int(total_seconds // 3600)
420
+ m = int((total_seconds % 3600) // 60)
421
+ s = total_seconds % 60
422
+ return f"{h:02d}:{m:02d}:{s:06.3f}".replace('.', ',')
423
+
424
+ new_start = adjust_time(start_time, current_time_offset)
425
+ new_end = adjust_time(end_time, current_time_offset)
426
+ outfile.write(f"{new_start} --> {new_end}\n")
427
+ i += 1
428
+
429
+ # Subtitle text (could be multiple lines)
430
+ while i < len(lines) and lines[i].strip():
431
+ outfile.write(lines[i])
432
+ i += 1
433
+ outfile.write('\n')
434
+ else:
435
+ i += 1
436
+
437
+ # Update time offset using ffprobe
438
+ probe = ffmpeg.probe(video_file)
439
+ duration = float(probe['streams'][0]['duration'])
440
+ current_time_offset += duration
441
+
442
+ print(f"Successfully combined videos into {output_video_path}")
443
+ if scene_subtitles:
444
+ print(f"Successfully combined subtitles into {output_srt_path}")
445
+
446
+ except Exception as e:
447
+ print(f"Error combining videos and subtitles: {e}")
448
+ traceback.print_exc()
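The subtitle merge in `combine_videos` shifts every SRT timestamp by the accumulated duration of the preceding scenes. A standalone sketch of that shift, mirroring the nested `adjust_time` helper above:

```python
# Standalone version of the timestamp shift used when concatenating SRT files.
# Offsets are in seconds; times use the SRT "HH:MM:SS,mmm" convention.
def adjust_srt_time(time_str: str, offset: float) -> str:
    h, m, s = time_str.replace(',', '.').split(':')
    total = float(h) * 3600 + float(m) * 60 + float(s) + offset
    hh = int(total // 3600)
    mm = int((total % 3600) // 60)
    ss = total % 60
    return f"{hh:02d}:{mm:02d}:{ss:06.3f}".replace('.', ',')

# Example: a cue at 00:00:01,500 in scene 2, preceded by 42.25 s of scene 1
print(adjust_srt_time("00:00:01,500", 42.25))  # -> 00:00:43,750
```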
src/rag/__init__.py ADDED
File without changes
src/rag/rag_integration.py ADDED
@@ -0,0 +1,390 @@
1
+ import os
2
+ import re
3
+ import json
4
+ from typing import List, Dict
5
+
6
+ from mllm_tools.utils import _prepare_text_inputs
7
+ from task_generator import (
8
+ get_prompt_rag_query_generation_fix_error,
9
+ get_prompt_detect_plugins,
10
+ get_prompt_rag_query_generation_technical,
11
+ get_prompt_rag_query_generation_vision_storyboard,
12
+ get_prompt_rag_query_generation_narration,
13
+ get_prompt_rag_query_generation_code
14
+ )
15
+ from src.rag.vector_store import RAGVectorStore
16
+
17
+ class RAGIntegration:
18
+ """Class for integrating RAG (Retrieval Augmented Generation) functionality.
19
+
20
+ This class handles RAG integration including plugin detection, query generation,
21
+ and document retrieval.
22
+
23
+ Args:
24
+ helper_model: Model used for generating queries and processing text
25
+ output_dir (str): Directory for output files
26
+ chroma_db_path (str): Path to ChromaDB
27
+ manim_docs_path (str): Path to Manim documentation
28
+ embedding_model (str): Name of embedding model to use
29
+ use_langfuse (bool, optional): Whether to use Langfuse logging. Defaults to True
30
+ session_id (str, optional): Session identifier. Defaults to None
31
+ """
32
+
33
+ def __init__(self, helper_model, output_dir, chroma_db_path, manim_docs_path, embedding_model, use_langfuse=True, session_id=None):
34
+ self.helper_model = helper_model
35
+ self.output_dir = output_dir
36
+ self.manim_docs_path = manim_docs_path
37
+ self.session_id = session_id
38
+ self.relevant_plugins = None
39
+
40
+ self.vector_store = RAGVectorStore(
41
+ chroma_db_path=chroma_db_path,
42
+ manim_docs_path=manim_docs_path,
43
+ embedding_model=embedding_model,
44
+ session_id=self.session_id,
45
+ use_langfuse=use_langfuse,
46
+ helper_model=helper_model
47
+ )
48
+
49
+ def set_relevant_plugins(self, plugins: List[str]) -> None:
50
+ """Set the relevant plugins for the current video.
51
+
52
+ Args:
53
+ plugins (List[str]): List of plugin names to set as relevant
54
+ """
55
+ self.relevant_plugins = plugins
56
+
57
+ def detect_relevant_plugins(self, topic: str, description: str) -> List[str]:
58
+ """Detect which plugins might be relevant based on topic and description.
59
+
60
+ Args:
61
+ topic (str): Topic of the video
62
+ description (str): Description of the video content
63
+
64
+ Returns:
65
+ List[str]: List of detected relevant plugin names
66
+ """
67
+ # Load plugin descriptions
68
+ plugins = self._load_plugin_descriptions()
69
+ if not plugins:
70
+ return []
71
+
72
+ # Get formatted prompt using the task_generator function
73
+ prompt = get_prompt_detect_plugins(
74
+ topic=topic,
75
+ description=description,
76
+ plugin_descriptions=json.dumps([{'name': p['name'], 'description': p['description']} for p in plugins], indent=2)
77
+ )
78
+
79
+ try:
80
+ response = self.helper_model(
81
+ _prepare_text_inputs(prompt),
82
+ metadata={"generation_name": "detect-relevant-plugins", "tags": [topic, "plugin-detection"], "session_id": self.session_id}
83
+ )
84
+ # Clean the response to ensure it only contains the JSON array
85
+ response = re.search(r'```json(.*)```', response, re.DOTALL).group(1)
86
+ try:
87
+ relevant_plugins = json.loads(response)
88
+ except json.JSONDecodeError as e:
89
+ print(f"JSONDecodeError when parsing relevant plugins: {e}")
90
+ print(f"Response text was: {response}")
91
+ return []
92
+
93
+ print(f"LLM detected relevant plugins: {relevant_plugins}")
94
+ return relevant_plugins
95
+ except Exception as e:
96
+ print(f"Error detecting plugins with LLM: {e}")
97
+ return []
98
+
99
+ def _load_plugin_descriptions(self) -> list:
100
+ """Load plugin descriptions from JSON file.
101
+
102
+ Returns:
103
+ list: List of plugin descriptions, empty list if loading fails
104
+ """
105
+ try:
106
+ plugin_config_path = os.path.join(
107
+ self.manim_docs_path,
108
+ "plugin_docs",
109
+ "plugins.json"
110
+ )
111
+ if os.path.exists(plugin_config_path):
112
+ with open(plugin_config_path, "r") as f:
113
+ return json.load(f)
114
+ else:
115
+ print(f"Plugin descriptions file not found at {plugin_config_path}")
116
+ return []
117
+ except Exception as e:
118
+ print(f"Error loading plugin descriptions: {e}")
119
+ return []
120
+
121
+ def _generate_rag_queries_storyboard(self, scene_plan: str, scene_trace_id: str = None, topic: str = None, scene_number: int = None, session_id: str = None, relevant_plugins: List[str] = []) -> List[str]:
122
+ """Generate RAG queries from the scene plan to help create storyboard.
123
+
124
+ Args:
125
+ scene_plan (str): Scene plan text to generate queries from
126
+ scene_trace_id (str, optional): Trace identifier for the scene. Defaults to None
127
+ topic (str, optional): Topic name. Defaults to None
128
+ scene_number (int, optional): Scene number. Defaults to None
129
+ session_id (str, optional): Session identifier. Defaults to None
130
+ relevant_plugins (List[str], optional): List of relevant plugins. Defaults to empty list
131
+
132
+ Returns:
133
+ List[str]: List of generated RAG queries
134
+ """
135
+ cache_key = f"{topic}_scene{scene_number}_storyboard_rag"
136
+ cache_dir = os.path.join(self.output_dir, re.sub(r'[^a-z0-9_]+', '_', topic.lower()), f"scene{scene_number}", "rag_cache")
137
+ os.makedirs(cache_dir, exist_ok=True)
138
+ cache_file = os.path.join(cache_dir, "rag_queries_storyboard.json")
139
+
140
+ if os.path.exists(cache_file):
141
+ with open(cache_file, 'r') as f:
142
+ return json.load(f)
143
+
144
+ # Format relevant plugins as a string
145
+ plugins_str = ", ".join(relevant_plugins) if relevant_plugins else "No plugins are relevant."
146
+
147
+ # Generate the prompt with only the required arguments
148
+ prompt = get_prompt_rag_query_generation_vision_storyboard(
149
+ scene_plan=scene_plan,
150
+ relevant_plugins=plugins_str
151
+ )
152
+
153
+ queries = self.helper_model(
154
+ _prepare_text_inputs(prompt),
155
+ metadata={"generation_name": "rag_query_generation_storyboard", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
156
+ )
157
+
158
+ # retrieve the JSON between triple backticks
159
+
160
+ try: # add try-except block to handle potential json decode errors
161
+ queries = re.search(r'```json(.*)```', queries, re.DOTALL).group(1)
162
+ queries = json.loads(queries)
163
+ except json.JSONDecodeError as e:
164
+ print(f"JSONDecodeError when parsing RAG queries for storyboard: {e}")
165
+ print(f"Response text was: {queries}")
166
+ return [] # Return empty list in case of parsing error
167
+
168
+ # Cache the queries
169
+ with open(cache_file, 'w') as f:
170
+ json.dump(queries, f)
171
+
172
+ return queries
173
+
174
+ def _generate_rag_queries_technical(self, storyboard: str, scene_trace_id: str = None, topic: str = None, scene_number: int = None, session_id: str = None, relevant_plugins: List[str] = []) -> List[str]:
175
+ """Generate RAG queries from the storyboard to help create technical implementation.
176
+
177
+ Args:
178
+ storyboard (str): Storyboard text to generate queries from
179
+ scene_trace_id (str, optional): Trace identifier for the scene. Defaults to None
180
+ topic (str, optional): Topic name. Defaults to None
181
+ scene_number (int, optional): Scene number. Defaults to None
182
+ session_id (str, optional): Session identifier. Defaults to None
183
+ relevant_plugins (List[str], optional): List of relevant plugins. Defaults to empty list
184
+
185
+ Returns:
186
+ List[str]: List of generated RAG queries
187
+ """
188
+ cache_key = f"{topic}_scene{scene_number}_technical_rag"
189
+ cache_dir = os.path.join(self.output_dir, re.sub(r'[^a-z0-9_]+', '_', topic.lower()), f"scene{scene_number}", "rag_cache")
190
+ os.makedirs(cache_dir, exist_ok=True)
191
+ cache_file = os.path.join(cache_dir, "rag_queries_technical.json")
192
+
193
+ if os.path.exists(cache_file):
194
+ with open(cache_file, 'r') as f:
195
+ return json.load(f)
196
+
197
+ prompt = get_prompt_rag_query_generation_technical(
198
+ storyboard=storyboard,
199
+ relevant_plugins=", ".join(relevant_plugins) if relevant_plugins else "No plugins are relevant."
200
+ )
201
+
202
+ queries = self.helper_model(
203
+ _prepare_text_inputs(prompt),
204
+ metadata={"generation_name": "rag_query_generation_technical", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
205
+ )
206
+
207
+ try: # add try-except block to handle potential json decode errors
208
+ queries = re.search(r'```json(.*)```', queries, re.DOTALL).group(1)
209
+ queries = json.loads(queries)
210
+ except json.JSONDecodeError as e:
211
+ print(f"JSONDecodeError when parsing RAG queries for technical implementation: {e}")
212
+ print(f"Response text was: {queries}")
213
+ return [] # Return empty list in case of parsing error
214
+
215
+ # Cache the queries
216
+ with open(cache_file, 'w') as f:
217
+ json.dump(queries, f)
218
+
219
+ return queries
220
+
221
+ def _generate_rag_queries_narration(self, storyboard: str, scene_trace_id: str = None, topic: str = None, scene_number: int = None, session_id: str = None, relevant_plugins: List[str] = []) -> List[str]:
222
+ """Generate RAG queries from the storyboard to help create narration plan.
223
+
224
+ Args:
225
+ storyboard (str): Storyboard text to generate queries from
226
+ scene_trace_id (str, optional): Trace identifier for the scene. Defaults to None
227
+ topic (str, optional): Topic name. Defaults to None
228
+ scene_number (int, optional): Scene number. Defaults to None
229
+ session_id (str, optional): Session identifier. Defaults to None
230
+ relevant_plugins (List[str], optional): List of relevant plugins. Defaults to empty list
231
+
232
+ Returns:
233
+ List[str]: List of generated RAG queries
234
+ """
235
+ cache_key = f"{topic}_scene{scene_number}_narration_rag"
236
+ cache_dir = os.path.join(self.output_dir, re.sub(r'[^a-z0-9_]+', '_', topic.lower()), f"scene{scene_number}", "rag_cache")
237
+ os.makedirs(cache_dir, exist_ok=True)
238
+ cache_file = os.path.join(cache_dir, "rag_queries_narration.json")
239
+
240
+ if os.path.exists(cache_file):
241
+ with open(cache_file, 'r') as f:
242
+ return json.load(f)
243
+
244
+ prompt = get_prompt_rag_query_generation_narration(
245
+ storyboard=storyboard,
246
+ relevant_plugins=", ".join(relevant_plugins) if relevant_plugins else "No plugins are relevant."
247
+ )
248
+
249
+ queries = self.helper_model(
250
+ _prepare_text_inputs(prompt),
251
+ metadata={"generation_name": "rag_query_generation_narration", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
252
+ )
253
+
254
+         try:  # Guard against responses missing the ```json fence or containing invalid JSON
+             queries = re.search(r'```json(.*)```', queries, re.DOTALL).group(1)
+             queries = json.loads(queries)
+         except (AttributeError, json.JSONDecodeError) as e:
+             print(f"Error when parsing narration RAG queries: {e}")
+             print(f"Response text was: {queries}")
+             return []  # Return an empty list if parsing fails
+
+         # Cache the queries
+         with open(cache_file, 'w') as f:
+             json.dump(queries, f)
+
+         return queries
+
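+     # Hypothetical call sequence (the storyboard variable, topic, and plugin list are
+     # made up for illustration): narration queries are generated once per scene and
+     # cached, so repeated runs reuse rag_queries_narration.json:
+     #   queries = self._generate_rag_queries_narration(
+     #       storyboard=scene_storyboard, topic="Pendulum Motion", scene_number=1,
+     #       session_id=self.session_id, relevant_plugins=["manim-physics"])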
+     def get_relevant_docs(self, rag_queries: List[Dict], scene_trace_id: str, topic: str, scene_number: int) -> List[str]:
+         """Get relevant documentation using the vector store.
+
+         Args:
+             rag_queries (List[Dict]): List of RAG queries to search for
+             scene_trace_id (str): Trace identifier for the scene
+             topic (str): Topic name
+             scene_number (int): Scene number
+
+         Returns:
+             List[str]: List of relevant documentation snippets
+         """
+         return self.vector_store.find_relevant_docs(
+             queries=rag_queries,
+             k=2,
+             trace_id=scene_trace_id,
+             topic=topic,
+             scene_number=scene_number
+         )
+
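+     # Usage sketch (hypothetical topic and variable names): the queries are matched
+     # against the vector store with k=2, presumably the top-2 snippets per lookup:
+     #   docs = self.get_relevant_docs(
+     #       rag_queries=queries, scene_trace_id=trace_id,
+     #       topic="Pendulum Motion", scene_number=1)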
+     def _generate_rag_queries_code(self, implementation_plan: str, scene_trace_id: str = None, topic: str = None, scene_number: int = None, relevant_plugins: List[str] = None) -> List[str]:
+         """Generate RAG queries from implementation plan.
+
+         Args:
+             implementation_plan (str): Implementation plan text to generate queries from
+             scene_trace_id (str, optional): Trace identifier for the scene. Defaults to None
+             topic (str, optional): Topic name. Defaults to None
+             scene_number (int, optional): Scene number. Defaults to None
+             relevant_plugins (List[str], optional): List of relevant plugins. Defaults to None
+
+         Returns:
+             List[str]: List of generated RAG queries
+         """
+         cache_key = f"{topic}_scene{scene_number}"
+         cache_dir = os.path.join(self.output_dir, re.sub(r'[^a-z0-9_]+', '_', topic.lower()), f"scene{scene_number}", "rag_cache")
+         os.makedirs(cache_dir, exist_ok=True)
+         cache_file = os.path.join(cache_dir, "rag_queries_code.json")
+
+         if os.path.exists(cache_file):
+             with open(cache_file, 'r') as f:
+                 return json.load(f)
+
+         prompt = get_prompt_rag_query_generation_code(
+             implementation_plan=implementation_plan,
+             relevant_plugins=", ".join(relevant_plugins) if relevant_plugins else "No plugins are relevant."
+         )
+
+         try:
+             response = self.helper_model(
+                 _prepare_text_inputs(prompt),
+                 metadata={"generation_name": "rag_query_generation_code", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": self.session_id}
+             )
+
+             # Clean and parse response
+             response = re.search(r'```json(.*)```', response, re.DOTALL).group(1)
+             queries = json.loads(response)
+
+             # Cache the queries
+             with open(cache_file, 'w') as f:
+                 json.dump(queries, f)
+
+             return queries
+         except Exception as e:
+             print(f"Error generating RAG queries: {e}")
+             return []
+
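+     # Sketch of a possible call from the code-generation step (variable names are
+     # illustrative): any model or parsing error is swallowed above, so callers should
+     # expect an empty list and still be able to generate code without RAG context:
+     #   code_queries = self._generate_rag_queries_code(
+     #       implementation_plan=plan_text, topic="Pendulum Motion", scene_number=1,
+     #       relevant_plugins=self.relevant_plugins)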
+     def _generate_rag_queries_error_fix(self, error: str, code: str, scene_trace_id: str = None, topic: str = None, scene_number: int = None, session_id: str = None) -> List[str]:
+         """Generate RAG queries for fixing code errors.
+
+         Args:
+             error (str): Error message to generate queries from
+             code (str): Code containing the error
+             scene_trace_id (str, optional): Trace identifier for the scene. Defaults to None
+             topic (str, optional): Topic name. Defaults to None
+             scene_number (int, optional): Scene number. Defaults to None
+             session_id (str, optional): Session identifier. Defaults to None
+
+         Returns:
+             List[str]: List of generated RAG queries
+         """
+         if self.relevant_plugins is None:
+             print("Warning: No plugins have been detected yet")
+             plugins_str = "No plugins are relevant."
+         else:
+             plugins_str = ", ".join(self.relevant_plugins) if self.relevant_plugins else "No plugins are relevant."
+
+         cache_key = f"{topic}_scene{scene_number}_error_fix"
+         cache_dir = os.path.join(self.output_dir, re.sub(r'[^a-z0-9_]+', '_', topic.lower()), f"scene{scene_number}", "rag_cache")
+         os.makedirs(cache_dir, exist_ok=True)
+         cache_file = os.path.join(cache_dir, "rag_queries_error_fix.json")
+
+         if os.path.exists(cache_file):
+             with open(cache_file, 'r') as f:
+                 cached_queries = json.load(f)
+             print(f"Using cached RAG queries for error fix in {cache_key}")
+             return cached_queries
+
+         prompt = get_prompt_rag_query_generation_fix_error(
+             error=error,
+             code=code,
+             relevant_plugins=plugins_str
+         )
+
+         queries = self.helper_model(
+             _prepare_text_inputs(prompt),
+             metadata={"generation_name": "rag-query-generation-fix-error", "trace_id": scene_trace_id, "tags": [topic, f"scene{scene_number}"], "session_id": session_id}
+         )
+
+         try:
+             # Extract the JSON payload from the ```json ... ``` fence in the response
+             queries = re.search(r'```json(.*)```', queries, re.DOTALL).group(1)
+             queries = json.loads(queries)
+         except (AttributeError, json.JSONDecodeError) as e:
+             print(f"Error when parsing RAG queries for error fix: {e}")
+             print(f"Response text was: {queries}")
+             return []
+
+         # Cache the queries
+         with open(cache_file, 'w') as f:
+             json.dump(queries, f)
+
+         return queries
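+     # Hypothetical example: for a rendering error string such as
+     # "AttributeError: 'Scene' object has no attribute 'play_animation'", the prompt
+     # is expected to yield targeted queries along the lines of
+     # ["manim Scene.play usage", "manim AttributeError play"]; these are cached and
+     # can then be passed to get_relevant_docs for retrieval.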