Add library name, link to paper and GitHub repository

#2
by nielsr HF Staff - opened
Files changed (1) hide show
  1. README.md +3 -161
README.md CHANGED
@@ -1,6 +1,7 @@
1
  ---
2
- pipeline_tag: text-generation
3
  license: apache-2.0
 
 
4
  ---
5
 
6
  <div align="center">
@@ -15,163 +16,4 @@ license: apache-2.0
15
  <path d="M123.782 31.2241L123.144 29.1424C123.116 29.0867 123.079 29.0572 123.038 29.0572H117.81C117.768 29.0572 117.732 29.085 117.704 29.1424L117.088 31.2241C117.046 31.3668 116.954 31.4363 116.812 31.4363H114.112C114.027 31.4363 113.963 31.412 113.921 31.3615C113.879 31.3128 113.871 31.2381 113.9 31.1389L118.49 16.7737C118.532 16.6328 118.624 16.5615 118.766 16.5615H122.102C122.243 16.5615 122.335 16.6328 122.379 16.7737L126.968 31.1389C126.982 31.1668 126.989 31.2033 126.989 31.245C126.989 31.372 126.911 31.4363 126.756 31.4363H124.057C123.916 31.4363 123.824 31.365 123.78 31.2241H123.782ZM118.554 26.7407H122.295C122.38 26.7407 122.408 26.6989 122.38 26.6137L120.467 20.3024C120.453 20.2467 120.432 20.2207 120.403 20.2276C120.375 20.2346 120.352 20.2589 120.339 20.3024L118.469 26.6137C118.455 26.6989 118.483 26.7407 118.554 26.7407Z" fill="currentColor"/>
16
  <path d="M128.222 31.353C128.18 31.2974 128.187 31.2261 128.243 31.1409L132.365 24.0643C132.393 24.0226 132.393 23.9791 132.365 23.9374L128.243 16.8609L128.201 16.7339C128.201 16.6209 128.28 16.5635 128.434 16.5635H131.133C131.274 16.5635 131.38 16.6209 131.452 16.7339L134.213 21.6C134.255 21.6852 134.299 21.6852 134.34 21.6L137.102 16.7339C137.173 16.6209 137.28 16.5635 137.42 16.5635H140.099C140.198 16.5635 140.269 16.5913 140.311 16.6487C140.353 16.7061 140.346 16.7756 140.29 16.8609L136.168 23.9374C136.154 23.9791 136.154 24.0226 136.168 24.0643L140.29 31.1409L140.332 31.2678C140.332 31.3809 140.253 31.4383 140.099 31.4383H137.42C137.278 31.4383 137.172 31.3826 137.102 31.2678L134.34 26.4226C134.299 26.3374 134.255 26.3374 134.213 26.4226L131.429 31.2678C131.358 31.3809 131.252 31.4383 131.111 31.4383H128.433C128.333 31.4383 128.262 31.4104 128.22 31.353H128.222Z" fill="currentColor"/>
17
  <defs>
18
- <linearGradient id="paint0_linear_17_483" x1="3.99826" y1="24" x2="51.6208" y2="24" gradientUnits="userSpaceOnUse">
19
- <stop stop-color="#E21680"/>
20
- <stop offset="1" stop-color="#FF633A"/>
21
- </linearGradient>
22
- </defs>
23
- </svg>
24
-
25
- </div>
26
- <hr>
27
-
28
- <div align="center" style="line-height: 1;">
29
- <a href="https://www.minimax.io" target="_blank" style="margin: 2px;">
30
- <img alt="Homepage" src="https://img.shields.io/badge/_Homepage-MiniMax-FF4040?style=flat-square&labelColor=2C3E50&logo=&logoWidth=20" style="display: inline-block; vertical-align: middle;"/>
31
- </a>
32
- <a href="https://arxiv.org/abs/2506.13585" target="_blank" style="margin: 2px;">
33
- <img alt="Paper" src="https://img.shields.io/badge/📖_Paper-MiniMax--M1-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
34
- </a>
35
- <a href="https://chat.minimax.io/" target="_blank" style="margin: 2px;">
36
- <img alt="Chat" src="https://img.shields.io/badge/_MiniMax_Chat-FF4040?style=flat-square&labelColor=2C3E50&logo=&logoWidth=20" style="display: inline-block; vertical-align: middle;"/>
37
- </a>
38
- <a href="https://www.minimax.io/platform" style="margin: 2px;">
39
- <img alt="API" src="https://img.shields.io/badge/⚡_API-Platform-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
40
- </a>
41
- <a href="https://github.com/MiniMax-AI/MiniMax-MCP" style="margin: 2px;">
42
- <img alt="MCP" src="https://img.shields.io/badge/🚀_MCP-MiniMax_MCP-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
43
- </a>
44
- </div>
45
- <div align="center" style="line-height: 1;">
46
- <a href="https://huggingface.co/MiniMaxAI" target="_blank" style="margin: 2px;">
47
- <img alt="Hugging Face" src="https://img.shields.io/badge/🤗_Hugging_Face-MiniMax-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
48
- </a>
49
- <a href="https://github.com/MiniMax-AI/MiniMax-M1" target="_blank" style="margin: 2px;">
50
- <img alt="GitHub" src="https://img.shields.io/badge/🐙_GitHub-MiniMax-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
51
- </a>
52
- <a href="https://www.modelscope.cn/organization/MiniMax" target="_blank" style="margin: 2px;">
53
- <img alt="ModelScope" src="https://img.shields.io/badge/🤖️_ModelScope-MiniMax-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
54
- </a>
55
- <a href="https://github.com/MiniMax-AI/MiniMax-M1/blob/main/LICENSE" style="margin: 2px;">
56
- <img alt="License" src="https://img.shields.io/badge/⚖️_License-Apache_2.0-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
57
- </a>
58
- <a href="https://github.com/MiniMax-AI/MiniMax-01/blob/main/figures/wechat-qrcode.jpeg" target="_blank" style="margin: 2px;">
59
- <img alt="WeChat" src="https://img.shields.io/badge/💬_WeChat-MiniMax-FF4040?style=flat-square&labelColor=2C3E50" style="display: inline-block; vertical-align: middle;"/>
60
- </a>
61
- </div>
62
-
63
- # MiniMax-M1
64
-
65
- ## 1. Model Overview
66
-
67
- We introduce MiniMax-M1, the world's first open-weight, large-scale hybrid-attention reasoning model.
68
- MiniMax-M1 is powered by a hybrid Mixture-of-Experts (MoE) architecture combined with a lightning
69
- attention mechanism. The model is developed based on our previous [MiniMax-Text-01 model](https://huggingface.co/MiniMaxAI/MiniMax-Text-01),
70
- which contains a total of 456 billion parameters with 45.9 billion parameters activated
71
- per token. Consistent with MiniMax-Text-01, the M1 model natively supports a context length of 1
72
- million tokens, 8x the context size of DeepSeek R1. Furthermore, the lightning attention mechanism
73
- in MiniMax-M1 enables efficient scaling of test-time compute. For example, compared to DeepSeek
74
- R1, M1 consumes 25% of the FLOPs at a generation length of 100K tokens. These properties make M1
75
- particularly suitable for complex tasks that require processing long inputs and thinking extensively.
76
- MiniMax-M1 is trained using large-scale reinforcement learning (RL) on diverse problems ranging from
77
- traditional mathematical reasoning to sandbox-based, real-world software engineering environments.
78
- We develop an efficient RL scaling framework for M1 highlighting two perspectives: (1) We propose
79
- CISPO, a novel algorithm that clips importance sampling weights instead of token updates, which
80
- outperforms other competitive RL variants; (2) Our hybrid-attention design naturally enhances the
81
- efficiency of RL, where we address unique challenges when scaling RL with the hybrid architecture. We
82
- train two versions of MiniMax-M1 models with [40K](https://huggingface.co/MiniMaxAI/MiniMax-M1-40k) and
83
- [80K](https://huggingface.co/MiniMaxAI/MiniMax-M1-80k) thinking budgets respectively. Experiments
84
- on standard benchmarks show that our models outperform other strong open-weight models such as
85
- the original DeepSeek-R1 and Qwen3-235B, particularly on complex software engineering, tool use,
86
- and long context tasks. With efficient scaling of test-time compute, MiniMax-M1 serves as a strong
87
- foundation for next-generation language model agents to reason and tackle real-world challenges.
88
-
89
- <p align="center">
90
- <img width="100%" src="figures/TextBench.png">
91
- <br>
92
- <small><em>Benchmark performance comparison of leading commercial and open-weight models across competition-level mathematics, coding, software engineering, agentic tool use, and long-context understanding tasks. We use the MiniMax-M1-80k model here for MiniMax-M1.</em></small>
93
- </p>
94
-
95
-
96
- ## 2. Evaluation
97
-
98
- **Performance of MiniMax-M1 on core benchmarks.**
99
-
100
-
101
- | **Category** | **Task** | **MiniMax-M1-80K** | **MiniMax-M1-40K** | **Qwen3-235B-A22B** | **DeepSeek-R1-0528** | **DeepSeek-R1** | **Seed-Thinking-v1.5** | **Claude 4 Opus** | **Gemini 2.5 Pro (06-05)** | **OpenAI-o3** |
102
- |:---|:---|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
103
- | | *Extended Thinking* | *80K* | *40K* | *32k* | *64k* | *32k* | *32k* | *64k* | *64k* | *100k* |
104
- | ***Mathematics*** | AIME 2024 | 86.0 | 83.3 | 85.7 | 91.4 | 79.8 | 86.7 | 76.0 | 92.0 | 91.6 |
105
- | | AIME 2025 | 76.9 | 74.6 | 81.5 | 87.5 | 70.0 | 74.0 | 75.5 | 88.0 | 88.9 |
106
- | | MATH-500 | 96.8 | 96.0 | 96.2 | 98.0 | 97.3 | 96.7 | 98.2 | 98.8 | 98.1 |
107
- | ***General Coding*** | LiveCodeBench *(24/8~25/5)* | 65.0 | 62.3 | 65.9 | 73.1 | 55.9 | 67.5 | 56.6 | 77.1 | 75.8 |
108
- | | FullStackBench | 68.3 | 67.6 | 62.9 | 69.4 | 70.1 | 69.9 | 70.3 | -- | 69.3 |
109
- | ***Reasoning & Knowledge***| GPQA Diamond | 70.0 | 69.2 | 71.1 | 81.0 | 71.5 | 77.3 | 79.6 | 86.4 | 83.3 |
110
- | | HLE *(no tools)* | 8.4\* | 7.2\* | 7.6\* | 17.7\* | 8.6\* | 8.2 | 10.7 | 21.6 | 20.3 |
111
- | | ZebraLogic | 86.8 | 80.1 | 80.3 | 95.1 | 78.7 | 84.4 | 95.1 | 91.6 | 95.8 |
112
- | | MMLU-Pro | 81.1 | 80.6 | 83.0 | 85.0 | 84.0 | 87.0 | 85.0 | 86.0 | 85.0 |
113
- | ***Software Engineering***| SWE-bench Verified| 56.0 | 55.6 | 34.4 | 57.6 | 49.2 | 47.0 | 72.5 | 67.2 | 69.1 |
114
- | ***Long Context*** | OpenAI-MRCR *(128k)* | 73.4 | 76.1 | 27.7 | 51.5 | 35.8 | 54.3 | 48.9 | 76.8 | 56.5 |
115
- | | OpenAI-MRCR *(1M)* | 56.2 | 58.6 | -- | -- | -- | -- | -- | 58.8 | -- |
116
- | | LongBench-v2 | 61.5 | 61.0 | 50.1 | 52.1 | 58.3 | 52.5 | 55.6 | 65.0 | 58.8 |
117
- | ***Agentic Tool Use***| TAU-bench *(airline)* | 62.0 | 60.0 | 34.7 | 53.5 | -- | 44.0 | 59.6 | 50.0 | 52.0 |
118
- | | TAU-bench *(retail)* | 63.5 | 67.8 | 58.6 | 63.9 | -- | 55.7 | 81.4 | 67.0 | 73.9 |
119
- | ***Factuality*** | SimpleQA | 18.5 | 17.9 | 11.0 | 27.8 | 30.1 | 12.9 | -- | 54.0 | 49.4 |
120
- | ***General Assistant***| MultiChallenge | 44.7 | 44.7 | 40.0 | 45.0 | 40.7 | 43.0 | 45.8 | 51.8 | 56.5 |
121
-
122
- \* conducted on the text-only HLE subset.
123
-
124
- Our models are evaluated with `temperature=1.0`, `top_p=0.95`.
125
-
126
- ### SWE-bench methodology
127
- We report results derived from the Agentless scaffold. Departing from the original pipeline, our methodology employs a two-stage localization process (without any embedding-based retrieval mechanisms): initial coarse-grained file localization followed by fine-grained localization to specific files and code elements. The values for our models are calculated on the subset of n=486 verified tasks which work on our infrastructure. The excluded 14 test cases that were incompatible with our internal infrastructure are:
128
- `"astropy__astropy-7606"`,
129
- `"astropy__astropy-8707"`,
130
- `"astropy__astropy-8872"`,
131
- `"django__django-10097"`,
132
- `"matplotlib__matplotlib-20488"`,
133
- `"psf__requests-2317"`,
134
- `"psf__requests-2931"`,
135
- `"psf__requests-5414"`,
136
- `"pylint-dev__pylint-6528"`,
137
- `"pylint-dev__pylint-7277"`,
138
- `"sphinx-doc__sphinx-10435"`,
139
- `"sphinx-doc__sphinx-7985"`,
140
- `"sphinx-doc__sphinx-8269"`,
141
- `"sphinx-doc__sphinx-8475"`
142
-
143
- ### TAU-bench methodology
144
- We evaluate TAU-Bench with GPT-4.1 as user model and without any custom tools. The maximum number of interaction steps is 40.
145
- Our general system prompt is:
146
- ```
147
- - In each round, you need to carefully examine the tools provided to you to determine if any can be used.
148
- - You must adhere to all of the policies. Pay attention to the details in the terms. Solutions for most situations can be found within these policies.
149
- ```
150
-
151
- ## 3. Deployment Guide
152
-
153
- Download the model from HuggingFace repository:
154
- - [MiniMax-M1-40k](https://huggingface.co/MiniMaxAI/MiniMax-M1-40k)
155
- - [MiniMax-M1-80k](https://huggingface.co/MiniMaxAI/MiniMax-M1-80k)
156
-
157
- For production deployment, we recommend using [vLLM](https://docs.vllm.ai/en/latest/) to serve MiniMax-M1. vLLM provides excellent performance for serving large language models with the following features:
158
- - 🔥 Outstanding service throughput performance
159
- - ⚡ Efficient and intelligent memory management
160
- - 📦 Powerful batch request processing capability
161
- - ⚙️ Deeply optimized underlying performance
162
-
163
- For detailed vLLM deployment instructions, please refer to our [vLLM Deployment Guide](./docs/vllm_deployment_guide.md).
164
- Alternatively, you can also deploy using Transformers directly. For detailed Transformers deployment instructions, you can see our [MiniMax-M1 Transformers Deployment Guide](./docs/transformers_deployment_guide.md).
165
-
166
-
167
- ## 4. Function Calling
168
-
169
- The MiniMax-M1 model supports function calling capabilities, enabling the model to identify when external functions need to be called and output function call parameters in a structured format. [MiniMax-M1 Function Call Guide](./docs/function_call_guide.md) provides detailed instructions on how to use the function calling feature of MiniMax-M1.
170
-
171
-
172
- ## 5. Chatbot & API
173
- For general use and evaluation, we provide a [Chatbot](https://chat.minimax.io/) with online search capabilities and the [online API](https://www.minimax.io/platform/) for developers. We also provide the [MiniMax MCP Server](https://github.com/MiniMax-AI/MiniMax-MCP) with video generation, image generation, speech synthesis, and voice cloning for developers.
174
-
175
-
176
- ## 6. Contact Us
177
- Contact us at [model@minimax.io](mailto:model@minimax.io).
 
1
  ---
 
2
  license: apache-2.0
3
+ pipeline_tag: text-generation
4
+ library_name: transformers
5
  ---
6
 
7
  <div align="center">
 
16
  <path d="M123.782 31.2241L123.144 29.1424C123.116 29.0867 123.079 29.0572 123.038 29.0572H117.81C117.768 29.0572 117.732 29.085 117.704 29.1424L117.088 31.2241C117.046 31.3668 116.954 31.4363 116.812 31.4363H114.112C114.027 31.4363 113.963 31.412 113.921 31.3615C113.879 31.3128 113.871 31.2381 113.9 31.1389L118.49 16.7737C118.532 16.6328 118.624 16.5615 118.766 16.5615H122.102C122.243 16.5615 122.335 16.6328 122.379 16.7737L126.968 31.1389C126.982 31.1668 126.989 31.2033 126.989 31.245C126.989 31.372 126.911 31.4363 126.756 31.4363H124.057C123.916 31.4363 123.824 31.365 123.78 31.2241H123.782ZM118.554 26.7407H122.295C122.38 26.7407 122.408 26.6989 122.38 26.6137L120.467 20.3024C120.453 20.2467 120.432 20.2207 120.403 20.2276C120.375 20.2346 120.352 20.2589 120.339 20.3024L118.469 26.6137C118.455 26.6989 118.483 26.7407 118.554 26.7407Z" fill="currentColor"/>
17
  <path d="M128.222 31.353C128.18 31.2974 128.187 31.2261 128.243 31.1409L132.365 24.0643C132.393 24.0226 132.393 23.9791 132.365 23.9374L128.243 16.8609L128.201 16.7339C128.201 16.6209 128.28 16.5635 128.434 16.5635H131.133C131.274 16.5635 131.38 16.6209 131.452 16.7339L134.213 21.6C134.255 21.6852 134.299 21.6852 134.34 21.6L137.102 16.7339C137.173 16.6209 137.28 16.5635 137.42 16.5635H140.099C140.198 16.5635 140.269 16.5913 140.311 16.6487C140.353 16.7061 140.346 16.7756 140.29 16.8609L136.168 23.9374C136.154 23.9791 136.154 24.0226 136.168 24.0643L140.29 31.1409L140.332 31.2678C140.332 31.3809 140.253 31.4383 140.099 31.4383H137.42C137.278 31.4383 137.172 31.3826 137.102 31.2678L134.34 26.4226C134.299 26.3374 134.255 26.3374 134.213 26.4226L131.429 31.2678C131.358 31.3809 131.252 31.4383 131.111 31.4383H128.433C128.333 31.4383 128.262 31.4104 128.22 31.353H128.222Z" fill="currentColor"/>
18
  <defs>
19
+ <linearGradient id="paint0_linear_