Dan Walsh committed
Commit 6f0ac93
Parent(s): 59ea016

Updating URL extraction quality

Files changed:
- Dockerfile +9 -1
- app/api/__pycache__/routes.cpython-311.pyc +0 -0
- app/api/routes.py +11 -1
- app/services/__pycache__/summariser.cpython-311.pyc +0 -0
- app/services/__pycache__/url_extractor.cpython-311.pyc +0 -0
- app/services/summariser.py +42 -6
- app/services/url_extractor.py +76 -31
- requirements.txt +10 -9
Dockerfile
CHANGED
@@ -11,9 +11,17 @@ ENV TRANSFORMERS_CACHE=/tmp/huggingface_cache
 ENV HF_HOME=/tmp/huggingface_cache
 ENV HUGGINGFACE_HUB_CACHE=/tmp/huggingface_cache
 
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements first for better caching
 COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
 
 # Copy the rest of the application
 COPY . .
app/api/__pycache__/routes.cpython-311.pyc
CHANGED
Binary files a/app/api/__pycache__/routes.cpython-311.pyc and b/app/api/__pycache__/routes.cpython-311.pyc differ
app/api/routes.py
CHANGED
@@ -4,6 +4,9 @@ from typing import Optional, Union
 from app.services.summariser import SummariserService
 from app.services.url_extractor import URLExtractorService
 from app.services.cache import hash_text, get_cached_summary, cache_summary
+import logging
+
+logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/api")
 summariser_service = SummariserService()
@@ -62,12 +65,16 @@ async def summarise_text(request: TextSummaryRequest):
 async def summarise_url(request: URLSummaryRequest):
     try:
         # Extract content from URL
+        logger.info(f"Extracting content from URL: {request.url}")
         url_extractor = URLExtractorService()
         content = await url_extractor.extract_content(str(request.url))
 
         if not content or len(content) < 100:
+            logger.warning(f"Insufficient content extracted from URL: {request.url}")
             raise HTTPException(status_code=422, detail="Could not extract sufficient content from the URL")
 
+        logger.info(f"Extracted {len(content)} characters from {request.url}")
+
         # Summarise the extracted content
         result = summariser_service.summarise(
             text=content,
@@ -77,16 +84,19 @@ async def summarise_url(request: URLSummaryRequest):
             temperature=request.temperature
         )
 
+        # Create a more structured response
        return {
            "original_text_length": len(content),
            "summary": result["summary"],
            "summary_length": len(result["summary"]),
            "source_type": "url",
-           "source_url": str(request.url)
+           "source_url": str(request.url),
+           "metadata": result.get("metadata", {})
        }
    except HTTPException:
        raise
    except Exception as e:
+       logger.error(f"Error processing URL {request.url}: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
 
 @router.get("/status")
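Not part of the commit, but for context: a minimal sketch of exercising the updated endpoint once the app is running. The route path, host, and port are assumptions (the @router.post decorator and the other URLSummaryRequest fields sit outside these hunks); only the response fields follow the handler above.

import httpx  # already listed in requirements.txt

# Assumed route and port; adjust to the actual @router.post(...) path.
resp = httpx.post(
    "http://localhost:8000/api/summarise/url",
    json={"url": "https://example.com/some-article"},  # other request fields assumed to have defaults
    timeout=120.0,  # the first call may be slow while the model loads
)
resp.raise_for_status()
data = resp.json()
print(data["source_type"], data["summary_length"], data.get("metadata", {}))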
app/services/__pycache__/summariser.cpython-311.pyc
CHANGED
Binary files a/app/services/__pycache__/summariser.cpython-311.pyc and b/app/services/__pycache__/summariser.cpython-311.pyc differ
app/services/__pycache__/url_extractor.cpython-311.pyc
CHANGED
Binary files a/app/services/__pycache__/url_extractor.cpython-311.pyc and b/app/services/__pycache__/url_extractor.cpython-311.pyc differ
app/services/summariser.py
CHANGED
@@ -4,6 +4,11 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import time
 import os
 import re
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 class SummariserService:
     def __init__(self):
@@ -22,8 +27,8 @@ class SummariserService:
             "literary": "t5-large"
         }
 
-        # Choose the most appropriate model
-        model_name = model_options["
+        # Choose the most appropriate model - BART works better for web content
+        model_name = model_options["general"]  # Use BART for better web content summarization
 
         # Update loading status
         self.model_loading_status["is_loading"] = True
@@ -153,6 +158,8 @@ class SummariserService:
         Returns:
             dict: The generated summary and processing metadata
         """
+        logger.info(f"Starting summarization of text with {len(text)} characters")
+
         # Reset and start job tracking
         self.current_job = {
             "in_progress": True,
@@ -174,6 +181,10 @@ class SummariserService:
         }
 
         try:
+            # Preprocess the text to focus on main content
+            text = self.preprocess_text(text)
+            logger.info(f"After preprocessing: {len(text)} characters")
+
             # Tokenization step
             inputs = self.tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
             input_ids = inputs.input_ids.to(self.device)
@@ -186,17 +197,19 @@ class SummariserService:
             self.current_job["stage"] = "Generating summary"
             self.current_job["progress"] = 30
 
-            # Enhanced generation parameters
+            # Enhanced generation parameters for better web content summarization
             summary_ids = self.model.generate(
                 input_ids,
                 max_length=max_length,
                 min_length=min_length,
                 do_sample=do_sample,
                 temperature=temperature,
-                num_beams=4,
+                num_beams=5,  # Increased from 4 to 5
                 early_stopping=True,
                 no_repeat_ngram_size=3,
                 length_penalty=2.0,
+                top_k=50,  # Added for better quality
+                top_p=0.95,  # Added for better quality
             )
 
             # Update job status
@@ -212,9 +225,10 @@ class SummariserService:
             result["metadata"]["output_word_count"] = len(summary.split())
             result["metadata"]["compression_ratio"] = round(len(summary.split()) / self.current_job["input_word_count"] * 100, 1)
 
+            logger.info(f"Generated summary with {len(summary)} characters")
+
         except Exception as e:
-            print(f"Error during summarization: {str(e)}")
+            logger.error(f"Error during summarization: {str(e)}")
             result["summary"] = "An error occurred during summarization. Please try again with a shorter text or different parameters."
             result["error"] = str(e)
         finally:
@@ -224,3 +238,25 @@ class SummariserService:
             self.current_job["progress"] = 100
 
         return result
+
+    def preprocess_text(self, text):
+        """Preprocess text to improve summarization quality."""
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Remove common web page boilerplate text
+        text = re.sub(r'Skip to (content|main).*?»', '', text)
+        text = re.sub(r'Search for:.*?Search', '', text)
+        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)
+
+        # Remove comment sections (often start with phrases like "X responses to")
+        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)
+
+        # Remove form fields and subscription prompts
+        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)
+
+        # Focus on the first part of very long texts (likely the main content)
+        if len(text) > 10000:
+            text = text[:10000]
+
+        return text
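Not part of the commit, but for context: a self-contained sketch of the generation settings introduced above, outside the service class. The checkpoint name is an assumption (the value of model_options["general"] is not visible in this hunk), the length and temperature values are illustrative, and top_k / top_p only influence generation when do_sample=True.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/bart-large-cnn"  # assumed stand-in for model_options["general"]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

text = "Long article text extracted from a web page ..."
inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)

summary_ids = model.generate(
    inputs.input_ids,
    max_length=150,          # illustrative
    min_length=40,           # illustrative
    do_sample=True,          # sampling must be on for top_k / top_p to matter
    temperature=0.7,         # illustrative
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=3,
    length_penalty=2.0,
    top_k=50,
    top_p=0.95,
)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))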
app/services/url_extractor.py
CHANGED
@@ -1,43 +1,88 @@
+try:
+    import aiohttp
+    AIOHTTP_AVAILABLE = True
+except ImportError:
+    AIOHTTP_AVAILABLE = False
+    import requests
+
 from bs4 import BeautifulSoup
 import re
+import logging
+
+logger = logging.getLogger(__name__)
 
 class URLExtractorService:
-    def
+    async def extract_content(self, url: str) -> str:
+        """Extract the main content from a URL."""
+        try:
+            if AIOHTTP_AVAILABLE:
+                return await self._extract_with_aiohttp(url)
+            else:
+                return self._extract_with_requests(url)
+        except Exception as e:
+            logger.error(f"Error extracting content from URL {url}: {str(e)}")
+            return ""
 
-    async def
-        """
+    async def _extract_with_aiohttp(self, url: str) -> str:
+        """Extract content using aiohttp."""
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status != 200:
+                    return ""
 
+                html = await response.text()
+                return self._parse_html(html)
 
-            response.raise_for_status()
+    def _extract_with_requests(self, url: str) -> str:
+        """Extract content using requests as fallback."""
+        response = requests.get(url)
+        if response.status_code != 200:
+            return ""
 
+        html = response.text
+        return self._parse_html(html)
 
+    def _parse_html(self, html: str) -> str:
+        """Parse HTML and extract main content."""
+        soup = BeautifulSoup(html, 'html.parser')
 
+        # Remove elements that typically contain comments or irrelevant content
+        for element in soup.select('footer, .comments, #comments, .comment, .respond, .reply, .sidebar, nav, header, script, style, [id*=comment], [class*=comment]'):
+            element.decompose()
 
-        # Break multi-headlines into a line each
-        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-        # Remove blank lines
-        text = '\n'.join(chunk for chunk in chunks if chunk)
+        # Try to find the main content using common article containers
+        main_content = None
 
+        # Look for article tag first
+        if soup.find('article'):
+            main_content = soup.find('article')
+        # Then try common content div classes/ids
+        elif soup.find(class_=re.compile(r'(content|post|article|entry)(-body|-content|-text)?$', re.I)):
+            main_content = soup.find(class_=re.compile(r'(content|post|article|entry)(-body|-content|-text)?$', re.I))
+        # Then try main tag
+        elif soup.find('main'):
+            main_content = soup.find('main')
+
+        if main_content:
+            # Extract text from the main content
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            # Fallback to body if no main content container is found
+            text = soup.body.get_text(separator=' ', strip=True)
+
+        # Clean up the text
+        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
+        text = re.sub(r'(\.|\?|!)\s+', r'\1\n\n', text)  # Add paragraph breaks after sentences
+
+        # Remove common web page boilerplate text
+        text = re.sub(r'Skip to (content|main).*?»', '', text)
+        text = re.sub(r'Search for:.*?Search', '', text)
+        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)
+
+        # Remove comment sections (often start with phrases like "X responses to")
+        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)
+
+        # Remove form fields and subscription prompts
+        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)
+
+        return text
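Not part of the commit, but for context: a minimal usage sketch for the rewritten extractor, imported the same way routes.py imports it; the URL is illustrative. extract_content returns an empty string on failure instead of raising, which is why the route checks len(content) < 100 before summarising.

import asyncio

from app.services.url_extractor import URLExtractorService

async def main():
    extractor = URLExtractorService()
    # Illustrative URL; aiohttp is used when available, requests otherwise.
    text = await extractor.extract_content("https://example.com/blog-post")
    print(f"Extracted {len(text)} characters")

asyncio.run(main())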
requirements.txt
CHANGED
@@ -1,15 +1,16 @@
-numpy
-torch
-transformers
+numpy>=1.21.0
+torch>=1.9.0
+transformers>=4.11.3
 huggingface_hub==0.16.4
-fastapi
-uvicorn
-pydantic
-beautifulsoup4
-requests
+fastapi>=0.68.0,<0.69.0
+uvicorn>=0.15.0,<0.16.0
+pydantic>=1.8.0,<2.0.0
+beautifulsoup4>=4.10.0
+requests>=2.26.0
 sentencepiece==0.1.99
-python-dotenv
+python-dotenv>=0.19.0
 httpx==0.24.1
 accelerate==0.21.0
 pytest==7.3.1
 pytest-cov==4.1.0
+aiohttp>=3.8.1