Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,356 +1,24 @@
|
|
1 |
-
|
2 |
-
from fastapi import FastAPI, Query
|
3 |
from pydantic import BaseModel
|
4 |
-
import cloudscraper
|
5 |
-
from bs4 import BeautifulSoup
|
6 |
-
from transformers import pipeline
|
7 |
-
import torch
|
8 |
-
import re
|
9 |
-
import os
|
10 |
-
|
11 |
-
#os.environ["HF_HOME"] = "/home/user/huggingface"
|
12 |
-
#os.environ["TRANSFORMERS_CACHE"] = "/home/user/huggingface"
|
13 |
-
|
14 |
-
app = FastAPI()
|
15 |
-
|
16 |
-
# Response body of /scrape: the opening post of a thread plus every later post.
class ThreadResponse(BaseModel):
    # Cleaned text of the first post found on the page.
    question: str
    # Cleaned text of each remaining post, in page order.
    replies: list[str]
|
19 |
-
|
20 |
-
def clean_text(text: str) -> str:
    """Trim *text* and drop a trailing "<n> like(s), <m> replies" footer.

    The footer is only removed when it sits at the very end of the
    (already trimmed) text; matching is case-insensitive.
    """
    footer_pattern = r"\b\d+\s*likes?,?\s*\d*\s*replies?$"
    without_footer = re.sub(footer_pattern, "", text.strip(), flags=re.IGNORECASE)
    # Removing the footer can expose whitespace that preceded it.
    return without_footer.strip()
|
24 |
-
|
25 |
-
# Fetch `url` through cloudscraper and collect every <div class="post__content">.
# The first match becomes the question, the rest become replies. A non-200
# response or a page without such divs yields an empty ThreadResponse.
@app.get("/scrape", response_model=ThreadResponse)
def scrape(url: str = Query(...)):
    page = cloudscraper.create_scraper().get(url)
    if page.status_code != 200:
        return ThreadResponse(question="", replies=[])

    posts = BeautifulSoup(page.content, 'html.parser').find_all('div', class_='post__content')
    if not posts:
        return ThreadResponse(question="", replies=[])

    # Clean every post the same way, then split head from tail.
    texts = [clean_text(post.get_text(strip=True, separator="\n")) for post in posts]
    return ThreadResponse(question=texts[0], replies=texts[1:])
|
39 |
-
|
40 |
-
MODEL_NAME = "microsoft/phi-2"

# Build the text-generation pipeline once, at import time, so requests share it.
# Device index 0 is used when CUDA is available; -1 otherwise (CPU).
text_generator = pipeline(
    "text-generation",
    model=MODEL_NAME,
    trust_remote_code=True,
    device=-1 if not torch.cuda.is_available() else 0,
)
|
49 |
-
|
50 |
-
# Request body of /generate.
class PromptRequest(BaseModel):
    # Raw prompt string handed to the text generator unchanged.
    prompt: str
|
52 |
-
|
53 |
-
# Run the prompt through `text_generator` (sampling, up to 512 new tokens) and
# return {"reasoning_content", "generated_text"}. When the output contains a
# "</think>" tag, the text before the FIRST tag is the reasoning and everything
# after it is the answer; without a tag the reasoning is empty.
# NOTE(review): the pipeline call is synchronous inside an `async def`, so it
# presumably blocks the event loop while generating -- confirm this is intended.
@app.post("/generate")
async def generate_text(request: PromptRequest):
    # The pipeline expects a plain string prompt.
    outputs = text_generator(
        request.prompt,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        num_return_sequences=1,
    )
    generated_text = outputs[0]['generated_text']

    # partition() splits on the first tag only, so an answer that itself
    # contains "</think>" is no longer cut off at the second tag (the old
    # unbounded split(...)[1] silently discarded that tail).
    reasoning_content, tag, content = generated_text.partition("</think>")
    if not tag:
        reasoning_content, content = "", generated_text

    return {
        "reasoning_content": reasoning_content.strip(),
        "generated_text": content.strip(),
    }
|
79 |
-
|
80 |
-
'''
|
81 |
-
|
82 |
-
from fastapi import FastAPI, Query, Path
|
83 |
-
from pydantic import BaseModel
|
84 |
-
import cloudscraper
|
85 |
-
from bs4 import BeautifulSoup
|
86 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration, PegasusTokenizer, PegasusForConditionalGeneration
|
87 |
-
import torch
|
88 |
-
import re
|
89 |
-
from fastapi.responses import JSONResponse
|
90 |
-
from fastapi.requests import Request
|
91 |
-
from fastapi import status
|
92 |
-
from typing import List, Dict, Optional
|
93 |
from llama_cpp import Llama
|
94 |
|
95 |
app = FastAPI()
|
96 |
|
97 |
-
# --- Data Models ---
|
98 |
-
|
99 |
-
# Response body of /scrape: the opening post of a thread plus every later post.
class ThreadResponse(BaseModel):
    # Cleaned text of the first post found on the page.
    question: str
    # Cleaned text of each remaining post, in page order.
    replies: list[str]
|
102 |
-
|
103 |
-
# Request body shared by the generation endpoints.
class PromptRequest(BaseModel):
    # Raw prompt string forwarded to the selected model.
    prompt: str
|
105 |
-
|
106 |
-
# Response body of the generation endpoints.
class GenerateResponse(BaseModel):
    # Text that preceded a "</think>" tag in the model output ("" if none).
    reasoning_content: str
    # The remaining model output.
    generated_text: str
|
109 |
-
|
110 |
-
# New model for summarization request
|
111 |
-
# Request body of /summarize_thread.
class SummarizeRequest(BaseModel):
    # Thread replies to summarise, one string per reply.
    replies: List[str]
    # Task selector; expecting "summarisation".
    task: str
|
114 |
-
|
115 |
-
# New model for summarization response
|
116 |
-
# Response body of /summarize_thread.
class SummarizeResponse(BaseModel):
    # Per-reply results: {index: {"reasoning": str, "summary": str}}
    individual_summaries: Dict[int, Dict[str, str]]
    # All non-empty reasoning sections joined together.
    combined_reasoning: str
    # Summary produced from the concatenated per-reply summaries.
    combined_summary: str
|
120 |
-
|
121 |
-
# --- Utility Functions ---
|
122 |
-
|
123 |
-
def clean_text(text: str) -> str:
    """Trim *text* and drop a trailing "<n> like(s), <m> replies" footer.

    The footer is only removed when it sits at the very end of the
    (already trimmed) text; matching is case-insensitive.
    """
    footer_pattern = r"\b\d+\s*likes?,?\s*\d*\s*replies?$"
    without_footer = re.sub(footer_pattern, "", text.strip(), flags=re.IGNORECASE)
    # Removing the footer can expose whitespace that preceded it.
    return without_footer.strip()
|
127 |
-
|
128 |
-
# --- Scraping Endpoint ---
|
129 |
-
|
130 |
-
# Fetch `url` through cloudscraper and collect every <div class="post__content">.
# The first match becomes the question, the rest become replies. A non-200
# response or a page without such divs yields an empty ThreadResponse.
@app.get("/scrape", response_model=ThreadResponse)
def scrape(url: str):
    page = cloudscraper.create_scraper().get(url)
    if page.status_code != 200:
        return ThreadResponse(question="", replies=[])

    posts = BeautifulSoup(page.content, "html.parser").find_all("div", class_="post__content")
    if not posts:
        return ThreadResponse(question="", replies=[])

    # Clean every post the same way, then split head from tail.
    texts = [clean_text(post.get_text(strip=True, separator="\n")) for post in posts]
    return ThreadResponse(question=texts[0], replies=texts[1:])
|
144 |
-
|
145 |
-
# --- Load DeepSeek-R1-Distill-Qwen-1.5B Model & Tokenizer ---
|
146 |
-
|
147 |
-
# Every model below is placed on CUDA when available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- DeepSeek-R1-Distill-Qwen-1.5B ---
deepseek_model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
deepseek_tokenizer = AutoTokenizer.from_pretrained(deepseek_model_name)
deepseek_model = AutoModelForCausalLM.from_pretrained(deepseek_model_name).to(device)

# --- T5-Large ---
t5_model_name = "google-t5/t5-large"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name).to(device)

# --- Pegasus-Large ---
pegasus_model_name = "google/pegasus-large"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_name).to(device)

# --- Qwen3-0.6B (Hugging Face weights) ---
qwen3_model_name = "Qwen/Qwen3-0.6B"
qwen3_tokenizer = AutoTokenizer.from_pretrained(qwen3_model_name)
qwen3_model = AutoModelForCausalLM.from_pretrained(qwen3_model_name).to(device)
|
169 |
-
|
170 |
# Load the BF16 GGUF file of Qwen3-0.6B from the unsloth repo at import time.
qwen3_gguf_llm = Llama.from_pretrained(repo_id="unsloth/Qwen3-0.6B-GGUF", filename="Qwen3-0.6B-BF16.gguf")
|
174 |
|
|
|
|
|
175 |
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
inputs = deepseek_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024).to(device)
|
180 |
-
outputs = deepseek_model.generate(
|
181 |
-
**inputs,
|
182 |
-
max_new_tokens=512,
|
183 |
-
temperature=0.7,
|
184 |
-
top_p=0.9,
|
185 |
-
do_sample=True,
|
186 |
-
num_return_sequences=1,
|
187 |
-
pad_token_id=deepseek_tokenizer.eos_token_id,
|
188 |
-
)
|
189 |
-
generated_text = deepseek_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
190 |
-
|
191 |
-
if "</think>" in generated_text:
|
192 |
-
reasoning_content, content = generated_text.split("</think>", 1)
|
193 |
-
return reasoning_content.strip(), content.strip()
|
194 |
-
else:
|
195 |
-
return "", generated_text.strip()
|
196 |
-
|
197 |
-
def generate_t5(prompt: str) -> tuple[str, str]:
    """Generate text with the T5 model and return ``(reasoning, text)``.

    The prompt is tokenized with truncation at 512 tokens and decoded with
    4-beam search. If the decoded output contains ``</think>``, the part
    before the first tag is returned as reasoning; otherwise reasoning is
    the empty string.
    """
    # The return annotation was `(str, str)`, which is a tuple object rather
    # than a type; `tuple[str, str]` is the valid spelling.
    input_ids = t5_tokenizer.encode(
        prompt, return_tensors="pt", max_length=512, truncation=True
    ).to(device)
    outputs = t5_model.generate(
        input_ids,
        max_length=512,
        num_beams=4,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )
    generated_text = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "</think>" not in generated_text:
        return "", generated_text.strip()
    reasoning_content, content = generated_text.split("</think>", 1)
    return reasoning_content.strip(), content.strip()
|
214 |
-
|
215 |
-
# --- API Endpoints ---
|
216 |
-
|
217 |
-
def generate_pegasus(prompt: str) -> tuple[str, str]:
    """Generate text with the Pegasus model and return ``("", text)``.

    The prompt is passed as raw text (no task prefix), truncated at 1024
    tokens, and decoded with 4-beam search for up to 150 new tokens. The
    first element is always empty: no ``</think>`` extraction is attempted
    for this model.
    """
    # The return annotation was `(str, str)`, which is a tuple object rather
    # than a type; `tuple[str, str]` is the valid spelling.
    encoded = pegasus_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    outputs = pegasus_model.generate(
        **encoded,
        max_new_tokens=150,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    generated_text = pegasus_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return "", generated_text.strip()
|
237 |
-
|
238 |
-
def generate_qwen3(prompt: str) -> tuple[str, str]:
    """Generate text with the Hugging Face Qwen3 model; return ``(reasoning, text)``.

    The prompt is truncated at 1024 tokens and sampled (temperature 0.7,
    top-p 0.9) for up to 512 new tokens. If the decoded output contains
    ``</think>``, the part before the first tag is returned as reasoning;
    otherwise reasoning is the empty string.
    """
    # The return annotation was `(str, str)`, which is a tuple object rather
    # than a type; `tuple[str, str]` is the valid spelling.
    encoded = qwen3_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    outputs = qwen3_model.generate(
        **encoded,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        num_return_sequences=1,
        # The EOS token id is reused as the padding id.
        pad_token_id=qwen3_tokenizer.eos_token_id,
    )
    generated_text = qwen3_tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "</think>" not in generated_text:
        return "", generated_text.strip()
    reasoning_content, content = generated_text.split("</think>", 1)
    return reasoning_content.strip(), content.strip()
|
263 |
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
response = qwen3_gguf_llm.create_chat_completion(
|
269 |
-
messages=messages,
|
270 |
-
max_tokens=max_tokens,
|
271 |
-
)
|
272 |
generated_text = response['choices'][0]['message']['content']
|
273 |
-
|
274 |
-
reasoning_content, content = generated_text.split("</think>", 1)
|
275 |
-
return reasoning_content.strip() + "</think>", content.strip()
|
276 |
-
else:
|
277 |
-
return "", generated_text.strip()
|
278 |
-
|
279 |
-
# --- New summarization endpoint ---
|
280 |
-
|
281 |
-
# Summarise each reply with generate_qwen3_gguf, then summarise the
# space-joined per-reply summaries to obtain the final summary.
# - Returns HTTP 400 when `task` is not "summarisation" (case-insensitive).
# - Returns an empty SummarizeResponse, without calling the model, when
#   `replies` is empty.
# `combined_reasoning` joins every non-empty reasoning string (per-reply ones
# first, the final pass last) with blank lines.
@app.post("/summarize_thread", response_model=SummarizeResponse)
async def summarize_thread(request: SummarizeRequest):
    if request.task.lower() != "summarisation":
        return JSONResponse(
            status_code=400,
            content={"error": "Unsupported task. Only 'summarisation' is supported."}
        )

    # Nothing to summarise: previously the model was still prompted with an
    # empty string and whatever it produced was returned as the summary.
    if not request.replies:
        return SummarizeResponse(
            individual_summaries={},
            combined_reasoning="",
            combined_summary="",
        )

    individual_summaries = {}
    combined_reasonings = []
    combined_summaries = []

    # First pass: one summary per reply.
    for idx, reply in enumerate(request.replies):
        reasoning, summary = generate_qwen3_gguf(reply, max_tokens=256)
        individual_summaries[idx] = {"reasoning": reasoning, "summary": summary}
        if reasoning:
            combined_reasonings.append(reasoning)
        combined_summaries.append(summary)

    # Second pass: summarise the concatenated summaries.
    combined_summary_text = " ".join(combined_summaries)
    final_reasoning, final_summary = generate_qwen3_gguf(combined_summary_text, max_tokens=256)
    if final_reasoning:
        combined_reasonings.append(final_reasoning)

    return SummarizeResponse(
        individual_summaries=individual_summaries,
        combined_reasoning="\n\n".join(combined_reasonings).strip(),
        combined_summary=final_summary.strip()
    )
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
# Dispatch the prompt to the generator selected by the `model_name` path
# segment and wrap its (reasoning, text) result in a GenerateResponse.
# An unknown model name is reported inside `generated_text` rather than via an
# HTTP error status.
@app.post("/generate/{model_name}", response_model=GenerateResponse)
async def generate(
    request: PromptRequest,
    model_name: str = Path(..., description="Model to use: 'deepseekr1-qwen', 't5-large', 'pegasus-large', 'qwen3-0.6b-hf', or 'qwen3-0.6b-gguf'")
):
    if model_name == "deepseekr1-qwen":
        reasoning, text = generate_deepseek(request.prompt)
    elif model_name == "t5-large":
        reasoning, text = generate_t5(request.prompt)
    elif model_name == "pegasus-large":
        reasoning, text = generate_pegasus(request.prompt)
    elif model_name == "qwen3-0.6b-hf":
        # The Hugging Face Qwen3 generator is named `generate_qwen3`; the
        # previous call to `generate_qwen3_hf` referred to a name that is not
        # defined alongside the other generators.
        reasoning, text = generate_qwen3(request.prompt)
    elif model_name == "qwen3-0.6b-gguf":
        reasoning, text = generate_qwen3_gguf(request.prompt)
    else:
        return GenerateResponse(reasoning_content="", generated_text=f"Error: Unknown model '{model_name}'.")

    return GenerateResponse(reasoning_content=reasoning, generated_text=text)
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
# --- Global Exception Handler ---
|
346 |
-
|
347 |
-
# Catch-all handler: print the exception and answer with HTTP 200 and a body
# shaped like a generation response whose text carries the error message.
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    print(f"Exception: {exc}")
    error_body = {
        "reasoning_content": "",
        "generated_text": f"Error: {str(exc)}"
    }
    return JSONResponse(status_code=status.HTTP_200_OK, content=error_body)
|
|
|
1 |
+
from fastapi import FastAPI
|
|
|
2 |
from pydantic import BaseModel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from llama_cpp import Llama
|
4 |
|
5 |
app = FastAPI()
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
# Load the BF16 GGUF file of Qwen3-0.6B from the unsloth repo at import time.
qwen3_gguf_llm = Llama.from_pretrained(repo_id="unsloth/Qwen3-0.6B-GGUF", filename="Qwen3-0.6B-BF16.gguf")
|
11 |
|
12 |
+
# Request body of the generation endpoint.
class PromptRequest(BaseModel):
    # Raw prompt string, sent to the model as a single user message.
    prompt: str
|
14 |
|
15 |
+
# Response body of the generation endpoint.
class GenerateResponse(BaseModel):
    # Optional reasoning text; defaults to "" when not supplied.
    reasoning_content: str = ""
    # The model's reply.
    generated_text: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
+
# Send the prompt to the GGUF Qwen3 model as a single user message (at most
# 256 completion tokens) and return the first choice's message content.
# `reasoning_content` is left at its default.
@app.post("/generate/qwen3-0.6b-gguf", response_model=GenerateResponse)
async def generate_qwen3_gguf_endpoint(request: PromptRequest):
    chat = [{"role": "user", "content": request.prompt}]
    completion = qwen3_gguf_llm.create_chat_completion(messages=chat, max_tokens=256)
    first_choice = completion['choices'][0]
    return GenerateResponse(generated_text=first_choice['message']['content'])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|