Commit 966e63c (verified) · ehartford · 1 Parent(s): 14ba292

Update README.md

Files changed (1):
  1. README.md +108 -29

README.md CHANGED
@@ -180,55 +180,134 @@ Output: "deoxyribonucleic acid, and it is the hereditary material in all living

  ## Usage

- ### Basic Usage
  ```python
  from transformers import AutoModelForCausalLM, AutoTokenizer

- # Load model
  model = AutoModelForCausalLM.from_pretrained(
-     "Qwen3-72B-Embiggened",
-     torch_dtype=torch.bfloat16,
-     device_map="auto",
-     trust_remote_code=True
  )
- tokenizer = AutoTokenizer.from_pretrained("Qwen3-72B-Embiggened")

- # Generate text
- inputs = tokenizer("The meaning of life is", return_tensors="pt")
- outputs = model.generate(**inputs, max_new_tokens=50, temperature=0.7)
- print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  ```

- ### Advanced Usage with Quantization
  ```python
- from transformers import BitsAndBytesConfig

- # 4-bit quantization for reduced memory usage
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.bfloat16,
-     bnb_4bit_use_double_quant=True,
  )

- model = AutoModelForCausalLM.from_pretrained(
-     "Qwen3-72B-Embiggened",
-     quantization_config=bnb_config,
-     device_map="auto",
-     trust_remote_code=True
  )
  ```

- ### vLLM Deployment
  ```python
- from vllm import LLM, SamplingParams

- llm = LLM(model="Qwen3-72B-Embiggened", tensor_parallel_size=4)
- sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=100)

- prompts = ["Tell me about quantum computing", "Write a poem about AI"]
- outputs = llm.generate(prompts, sampling_params)
  ```

  ## Hardware Requirements

  ### Minimum Requirements


  ## Usage

+ ### Basic Usage with Thinking Mode
  ```python
  from transformers import AutoModelForCausalLM, AutoTokenizer

+ model_name = "cognitivecomputations/Qwen3-72B-Embiggened"
+
+ # Load the tokenizer and the model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype="auto",
+     device_map="auto"
+ )
+
+ # Prepare the model input
+ prompt = "How many r's are in strawberry?"
+ messages = [
+     {"role": "user", "content": prompt}
+ ]
+
+ # Apply chat template with thinking mode enabled
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True,
+     enable_thinking=True  # Enable thinking mode (default)
+ )
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+ # Generate response
+ generated_ids = model.generate(
+     **model_inputs,
+     max_new_tokens=32768,
+     temperature=0.6,  # Recommended for thinking mode
+     top_p=0.95,
+     top_k=20,
+     min_p=0
  )

+ # Parse thinking content and final response
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+ try:
+     # Find the </think> token (151668)
+     index = len(output_ids) - output_ids[::-1].index(151668)
+ except ValueError:
+     index = 0
+
+ thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
+ content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
+
+ print("Thinking content:", thinking_content)
+ print("Final answer:", content)
  ```
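If you prefer not to hard-code the 151668 id, the token can be resolved from the tokenizer itself. A minimal sketch, assuming the chat template emits `</think>` as a single special token (as in the parsing code above):

```python
# Look up the id of the </think> token instead of hard-coding 151668
end_think_id = tokenizer.convert_tokens_to_ids("</think>")

try:
    # Split the generated ids at the last </think> occurrence
    index = len(output_ids) - output_ids[::-1].index(end_think_id)
except ValueError:
    index = 0  # No thinking block was produced
```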

+ ### Non-Thinking Mode (Efficient General Dialogue)
  ```python
+ # Same setup as above...

+ # Apply chat template with thinking mode disabled
+ text = tokenizer.apply_chat_template(
+     messages,
+     tokenize=False,
+     add_generation_prompt=True,
+     enable_thinking=False  # Disable thinking for efficiency
  )
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

+ # Generate with non-thinking parameters
+ outputs = model.generate(
+     **model_inputs,
+     max_new_tokens=2048,
+     temperature=0.7,  # Recommended for non-thinking mode
+     top_p=0.8,
+     top_k=20,
+     min_p=0
  )
  ```
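The non-thinking block above stops at generation; decoding works the same way as before. A short sketch reusing `outputs` and `model_inputs` from that block:

```python
# Decode only the newly generated tokens, skipping the prompt
response_ids = outputs[0][len(model_inputs.input_ids[0]):]
print(tokenizer.decode(response_ids, skip_special_tokens=True))
```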

+ ### Advanced: Dynamic Mode Switching
  ```python
+ # Use /think and /no_think tags to control behavior
+ messages = [
+     {"role": "user", "content": "Explain quantum computing /no_think"},  # Quick response
+     {"role": "assistant", "content": "Quantum computing uses quantum bits..."},
+     {"role": "user", "content": "How does superposition work mathematically? /think"}  # Detailed reasoning
+ ]
+ ```
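To run such a conversation, the messages go through the same chat template and generate call as in the basic example; under Qwen3's soft-switch convention the most recent `/think` or `/no_think` tag governs the next reply. A minimal sketch reusing `tokenizer` and `model` from above (generation settings are illustrative):

```python
# Render the multi-turn conversation; enable_thinking defaults to True,
# and the /think and /no_think tags act as per-turn soft switches
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=4096,
    temperature=0.6,
    top_p=0.95,
    top_k=20
)
print(tokenizer.decode(generated_ids[0][len(model_inputs.input_ids[0]):], skip_special_tokens=True))
```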

+ ### vLLM Deployment with Reasoning Support
+ ```python
+ # Start server with reasoning parser
+ # vllm serve cognitivecomputations/Qwen3-72B-Embiggened --enable-reasoning --reasoning-parser deepseek_r1
+
+ from openai import OpenAI
+ client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

+ # Use with thinking mode
+ response = client.chat.completions.create(
+     model="cognitivecomputations/Qwen3-72B-Embiggened",
+     messages=[{"role": "user", "content": "Solve: What is 15% of 250?"}],
+     extra_body={"enable_thinking": True}
+ )
  ```
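When the server runs with `--reasoning-parser deepseek_r1`, vLLM typically splits the thinking from the final answer in the response message; a hedged sketch of reading both, assuming the `reasoning_content` field exposed by vLLM's reasoning-outputs feature:

```python
message = response.choices[0].message
# reasoning_content holds the parsed thinking; content holds the final answer
print("Reasoning:", getattr(message, "reasoning_content", None))
print("Answer:", message.content)
```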

+ ### Example Outputs with Thinking
+
+ ```
+ Prompt: "How many r's are in strawberry?"
+ Thinking: Let me count the r's in "strawberry". S-t-r-a-w-b-e-r-r-y.
+ Going through each letter: s(no), t(no), r(yes, 1), a(no), w(no),
+ b(no), e(no), r(yes, 2), r(yes, 3), y(no).
+ Final answer: There are 3 r's in the word "strawberry".
+
+ Prompt: "What is the capital of France, and what is it famous for?"
+ Final answer (no thinking): Paris is the capital of France. It's famous for
+ the Eiffel Tower, the Louvre Museum, Notre-Dame Cathedral, and its rich
+ cultural heritage, fashion, and cuisine.
+ ```

  ## Hardware Requirements

  ### Minimum Requirements