Dan Walsh committed
Commit 6f0ac93 · 1 Parent(s): 59ea016

Updating URL extraction quality

Dockerfile CHANGED
@@ -11,9 +11,17 @@ ENV TRANSFORMERS_CACHE=/tmp/huggingface_cache
 ENV HF_HOME=/tmp/huggingface_cache
 ENV HUGGINGFACE_HUB_CACHE=/tmp/huggingface_cache
 
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements first for better caching
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
 
 # Copy the rest of the application
 COPY . .

app/api/__pycache__/routes.cpython-311.pyc CHANGED
Binary files a/app/api/__pycache__/routes.cpython-311.pyc and b/app/api/__pycache__/routes.cpython-311.pyc differ
 
app/api/routes.py CHANGED
@@ -4,6 +4,9 @@ from typing import Optional, Union
 from app.services.summariser import SummariserService
 from app.services.url_extractor import URLExtractorService
 from app.services.cache import hash_text, get_cached_summary, cache_summary
+import logging
+
+logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/api")
 summariser_service = SummariserService()
@@ -62,12 +65,16 @@ async def summarise_text(request: TextSummaryRequest):
 async def summarise_url(request: URLSummaryRequest):
     try:
         # Extract content from URL
+        logger.info(f"Extracting content from URL: {request.url}")
         url_extractor = URLExtractorService()
         content = await url_extractor.extract_content(str(request.url))
 
         if not content or len(content) < 100:
+            logger.warning(f"Insufficient content extracted from URL: {request.url}")
             raise HTTPException(status_code=422, detail="Could not extract sufficient content from the URL")
 
+        logger.info(f"Extracted {len(content)} characters from {request.url}")
+
         # Summarise the extracted content
         result = summariser_service.summarise(
             text=content,
@@ -77,16 +84,19 @@ async def summarise_url(request: URLSummaryRequest):
             temperature=request.temperature
         )
 
+        # Create a more structured response
         return {
             "original_text_length": len(content),
             "summary": result["summary"],
             "summary_length": len(result["summary"]),
             "source_type": "url",
-            "source_url": str(request.url)
+            "source_url": str(request.url),
+            "metadata": result.get("metadata", {})
         }
     except HTTPException:
         raise
     except Exception as e:
+        logger.error(f"Error processing URL {request.url}: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @router.get("/status")
app/services/__pycache__/summariser.cpython-311.pyc CHANGED
Binary files a/app/services/__pycache__/summariser.cpython-311.pyc and b/app/services/__pycache__/summariser.cpython-311.pyc differ
 
app/services/__pycache__/url_extractor.cpython-311.pyc CHANGED
Binary files a/app/services/__pycache__/url_extractor.cpython-311.pyc and b/app/services/__pycache__/url_extractor.cpython-311.pyc differ
 
app/services/summariser.py CHANGED
@@ -4,6 +4,11 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import time
 import os
 import re
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 class SummariserService:
     def __init__(self):
@@ -22,8 +27,8 @@ class SummariserService:
             "literary": "t5-large"
         }
 
-        # Choose the most appropriate model
-        model_name = model_options["literary"]  # Better for literary text
+        # Choose the most appropriate model - BART works better for web content
+        model_name = model_options["general"]  # Use BART for better web content summarization
 
         # Update loading status
         self.model_loading_status["is_loading"] = True
@@ -153,6 +158,8 @@ class SummariserService:
         Returns:
            dict: The generated summary and processing metadata
        """
+        logger.info(f"Starting summarization of text with {len(text)} characters")
+
        # Reset and start job tracking
        self.current_job = {
            "in_progress": True,
@@ -174,6 +181,10 @@ class SummariserService:
        }

        try:
+            # Preprocess the text to focus on main content
+            text = self.preprocess_text(text)
+            logger.info(f"After preprocessing: {len(text)} characters")
+
            # Tokenization step
            inputs = self.tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
            input_ids = inputs.input_ids.to(self.device)
@@ -186,17 +197,19 @@ class SummariserService:
            self.current_job["stage"] = "Generating summary"
            self.current_job["progress"] = 30

-            # Enhanced generation parameters
+            # Enhanced generation parameters for better web content summarization
            summary_ids = self.model.generate(
                input_ids,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                temperature=temperature,
-                num_beams=4,
+                num_beams=5,  # Increased from 4 to 5
                early_stopping=True,
                no_repeat_ngram_size=3,
                length_penalty=2.0,
+                top_k=50,  # Added for better quality
+                top_p=0.95,  # Added for better quality
            )

            # Update job status
@@ -212,9 +225,10 @@ class SummariserService:
            result["metadata"]["output_word_count"] = len(summary.split())
            result["metadata"]["compression_ratio"] = round(len(summary.split()) / self.current_job["input_word_count"] * 100, 1)

+            logger.info(f"Generated summary with {len(summary)} characters")
+
        except Exception as e:
-            # Handle errors gracefully
-            print(f"Error during summarization: {str(e)}")
+            logger.error(f"Error during summarization: {str(e)}")
            result["summary"] = "An error occurred during summarization. Please try again with a shorter text or different parameters."
            result["error"] = str(e)
        finally:
@@ -224,3 +238,25 @@ class SummariserService:
            self.current_job["progress"] = 100

        return result
+
+    def preprocess_text(self, text):
+        """Preprocess text to improve summarization quality."""
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Remove common web page boilerplate text
+        text = re.sub(r'Skip to (content|main).*?»', '', text)
+        text = re.sub(r'Search for:.*?Search', '', text)
+        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)
+
+        # Remove comment sections (often start with phrases like "X responses to")
+        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)
+
+        # Remove form fields and subscription prompts
+        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)
+
+        # Focus on the first part of very long texts (likely the main content)
+        if len(text) > 10000:
+            text = text[:10000]
+
+        return text
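
To illustrate what the new preprocess_text step removes, the same regex cleanup can be run standalone on a snippet of scraped page text. This is a sketch that reuses a subset of the patterns from the diff; it is not part of the committed module.

# Standalone sketch of the boilerplate stripping added in preprocess_text.
import re

def strip_boilerplate(text: str) -> str:
    text = re.sub(r'\s+', ' ', text)                                   # collapse whitespace
    text = re.sub(r'Skip to (content|main).*?»', '', text)             # nav "skip" links
    text = re.sub(r'Search for:.*?Search', '', text)                   # search widgets
    text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)  # comment sections
    return text.strip()

sample = "Skip to content » Main article text here. 12 responses to this post: great read!"
print(strip_boilerplate(sample))  # -> Main article text here.
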
app/services/url_extractor.py CHANGED
@@ -1,43 +1,88 @@
-import httpx
+try:
+    import aiohttp
+    AIOHTTP_AVAILABLE = True
+except ImportError:
+    AIOHTTP_AVAILABLE = False
+    import requests
+
 from bs4 import BeautifulSoup
 import re
+import logging
+
+logger = logging.getLogger(__name__)
 
 class URLExtractorService:
-    def __init__(self):
-        self.client = httpx.AsyncClient(timeout=30.0)
+    async def extract_content(self, url: str) -> str:
+        """Extract the main content from a URL."""
+        try:
+            if AIOHTTP_AVAILABLE:
+                return await self._extract_with_aiohttp(url)
+            else:
+                return self._extract_with_requests(url)
+        except Exception as e:
+            logger.error(f"Error extracting content from URL {url}: {str(e)}")
+            return ""
 
-    async def extract_content(self, url):
-        """
-        Extract the main content from a URL.
+    async def _extract_with_aiohttp(self, url: str) -> str:
+        """Extract content using aiohttp."""
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status != 200:
+                    return ""
 
-        Args:
-            url (str): The URL to extract content from
+                html = await response.text()
+                return self._parse_html(html)
 
-        Returns:
-            str: The extracted text content
-        """
-        try:
-            response = await self.client.get(url)
-            response.raise_for_status()
+    def _extract_with_requests(self, url: str) -> str:
+        """Extract content using requests as fallback."""
+        response = requests.get(url)
+        if response.status_code != 200:
+            return ""
 
-            soup = BeautifulSoup(response.text, 'html.parser')
+        html = response.text
+        return self._parse_html(html)
 
-            # Remove script and style elements
-            for script in soup(["script", "style", "header", "footer", "nav"]):
-                script.extract()
+    def _parse_html(self, html: str) -> str:
+        """Parse HTML and extract main content."""
+        soup = BeautifulSoup(html, 'html.parser')
 
-            # Get text and clean it
-            text = soup.get_text()
+        # Remove elements that typically contain comments or irrelevant content
+        for element in soup.select('footer, .comments, #comments, .comment, .respond, .reply, .sidebar, nav, header, script, style, [id*=comment], [class*=comment]'):
+            element.decompose()
 
-            # Break into lines and remove leading/trailing space
-            lines = (line.strip() for line in text.splitlines())
-            # Break multi-headlines into a line each
-            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            # Remove blank lines
-            text = '\n'.join(chunk for chunk in chunks if chunk)
+        # Try to find the main content using common article containers
+        main_content = None
 
-            return text
-        except Exception as e:
-            raise Exception(f"Failed to extract content from URL: {str(e)}")
-        finally:
-            await self.client.aclose()
+        # Look for article tag first
+        if soup.find('article'):
+            main_content = soup.find('article')
+        # Then try common content div classes/ids
+        elif soup.find(class_=re.compile(r'(content|post|article|entry)(-body|-content|-text)?$', re.I)):
+            main_content = soup.find(class_=re.compile(r'(content|post|article|entry)(-body|-content|-text)?$', re.I))
+        # Then try main tag
+        elif soup.find('main'):
+            main_content = soup.find('main')
+
+        if main_content:
+            # Extract text from the main content
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            # Fallback to body if no main content container is found
+            text = soup.body.get_text(separator=' ', strip=True)
+
+        # Clean up the text
+        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
+        text = re.sub(r'(\.|\?|!)\s+', r'\1\n\n', text)  # Add paragraph breaks after sentences
+
+        # Remove common web page boilerplate text
+        text = re.sub(r'Skip to (content|main).*?»', '', text)
+        text = re.sub(r'Search for:.*?Search', '', text)
+        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)
+
+        # Remove comment sections (often start with phrases like "X responses to")
+        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)
+
+        # Remove form fields and subscription prompts
+        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)
+
+        return text
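
A minimal way to exercise the rewritten extractor outside the API, assuming aiohttp is installed (it is added to requirements.txt in this commit). The URL is a placeholder.

# Sketch: call the updated URLExtractorService directly.
import asyncio
from app.services.url_extractor import URLExtractorService

async def main():
    extractor = URLExtractorService()
    text = await extractor.extract_content("https://example.com/blog/some-post")  # placeholder URL
    print(f"Extracted {len(text)} characters")
    print(text[:500])

if __name__ == "__main__":
    asyncio.run(main())
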
requirements.txt CHANGED
@@ -1,15 +1,16 @@
-numpy==1.24.3
-torch==2.0.1
-transformers==4.30.2
+numpy>=1.21.0
+torch>=1.9.0
+transformers>=4.11.3
 huggingface_hub==0.16.4
-fastapi==0.100.0
-uvicorn==0.22.0
-pydantic==1.10.8
-beautifulsoup4==4.12.2
-requests==2.31.0
+fastapi>=0.68.0,<0.69.0
+uvicorn>=0.15.0,<0.16.0
+pydantic>=1.8.0,<2.0.0
+beautifulsoup4>=4.10.0
+requests>=2.26.0
 sentencepiece==0.1.99
-python-dotenv==1.0.0
+python-dotenv>=0.19.0
 httpx==0.24.1
 accelerate==0.21.0
 pytest==7.3.1
 pytest-cov==4.1.0
+aiohttp>=3.8.1