Dan Walsh committed
Commit 6f0ac93 · 1 Parent(s): 59ea016

Updating URL extraction quality

Dockerfile CHANGED
@@ -11,9 +11,17 @@ ENV TRANSFORMERS_CACHE=/tmp/huggingface_cache
 ENV HF_HOME=/tmp/huggingface_cache
 ENV HUGGINGFACE_HUB_CACHE=/tmp/huggingface_cache
 
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
 # Copy requirements first for better caching
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
+
+# Install Python dependencies
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
 
 # Copy the rest of the application
 COPY . .

app/api/__pycache__/routes.cpython-311.pyc CHANGED
Binary files a/app/api/__pycache__/routes.cpython-311.pyc and b/app/api/__pycache__/routes.cpython-311.pyc differ
 
app/api/routes.py CHANGED
@@ -4,6 +4,9 @@ from typing import Optional, Union
 from app.services.summariser import SummariserService
 from app.services.url_extractor import URLExtractorService
 from app.services.cache import hash_text, get_cached_summary, cache_summary
+import logging
+
+logger = logging.getLogger(__name__)
 
 router = APIRouter(prefix="/api")
 summariser_service = SummariserService()
@@ -62,12 +65,16 @@ async def summarise_text(request: TextSummaryRequest):
 async def summarise_url(request: URLSummaryRequest):
     try:
         # Extract content from URL
+        logger.info(f"Extracting content from URL: {request.url}")
         url_extractor = URLExtractorService()
         content = await url_extractor.extract_content(str(request.url))
 
         if not content or len(content) < 100:
+            logger.warning(f"Insufficient content extracted from URL: {request.url}")
             raise HTTPException(status_code=422, detail="Could not extract sufficient content from the URL")
 
+        logger.info(f"Extracted {len(content)} characters from {request.url}")
+
         # Summarise the extracted content
         result = summariser_service.summarise(
             text=content,
@@ -77,16 +84,19 @@ async def summarise_url(request: URLSummaryRequest):
             temperature=request.temperature
         )
 
+        # Create a more structured response
         return {
             "original_text_length": len(content),
             "summary": result["summary"],
             "summary_length": len(result["summary"]),
             "source_type": "url",
-            "source_url": str(request.url)
+            "source_url": str(request.url),
+            "metadata": result.get("metadata", {})
         }
     except HTTPException:
         raise
     except Exception as e:
+        logger.error(f"Error processing URL {request.url}: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @router.get("/status")
app/services/__pycache__/summariser.cpython-311.pyc CHANGED
Binary files a/app/services/__pycache__/summariser.cpython-311.pyc and b/app/services/__pycache__/summariser.cpython-311.pyc differ
 
app/services/__pycache__/url_extractor.cpython-311.pyc CHANGED
Binary files a/app/services/__pycache__/url_extractor.cpython-311.pyc and b/app/services/__pycache__/url_extractor.cpython-311.pyc differ
 
app/services/summariser.py CHANGED
@@ -4,6 +4,11 @@ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import time
 import os
 import re
+import logging
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 class SummariserService:
     def __init__(self):
@@ -22,8 +27,8 @@ class SummariserService:
             "literary": "t5-large"
         }
 
-        # Choose the most appropriate model
-        model_name = model_options["literary"]  # Better for literary text
+        # Choose the most appropriate model - BART works better for web content
+        model_name = model_options["general"]  # Use BART for better web content summarization
 
         # Update loading status
         self.model_loading_status["is_loading"] = True
@@ -153,6 +158,8 @@ class SummariserService:
         Returns:
            dict: The generated summary and processing metadata
        """
+        logger.info(f"Starting summarization of text with {len(text)} characters")
+
        # Reset and start job tracking
        self.current_job = {
            "in_progress": True,
@@ -174,6 +181,10 @@ class SummariserService:
        }

        try:
+            # Preprocess the text to focus on main content
+            text = self.preprocess_text(text)
+            logger.info(f"After preprocessing: {len(text)} characters")
+
            # Tokenization step
            inputs = self.tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
            input_ids = inputs.input_ids.to(self.device)
@@ -186,17 +197,19 @@ class SummariserService:
            self.current_job["stage"] = "Generating summary"
            self.current_job["progress"] = 30

-            # Enhanced generation parameters
+            # Enhanced generation parameters for better web content summarization
            summary_ids = self.model.generate(
                input_ids,
                max_length=max_length,
                min_length=min_length,
                do_sample=do_sample,
                temperature=temperature,
-                num_beams=4,
+                num_beams=5,  # Increased from 4 to 5
                early_stopping=True,
                no_repeat_ngram_size=3,
                length_penalty=2.0,
+                top_k=50,  # Added for better quality
+                top_p=0.95,  # Added for better quality
            )

            # Update job status
@@ -212,9 +225,10 @@ class SummariserService:
            result["metadata"]["output_word_count"] = len(summary.split())
            result["metadata"]["compression_ratio"] = round(len(summary.split()) / self.current_job["input_word_count"] * 100, 1)

+            logger.info(f"Generated summary with {len(summary)} characters")
+
        except Exception as e:
-            # Handle errors gracefully
-            print(f"Error during summarization: {str(e)}")
+            logger.error(f"Error during summarization: {str(e)}")
            result["summary"] = "An error occurred during summarization. Please try again with a shorter text or different parameters."
            result["error"] = str(e)
        finally:
@@ -224,3 +238,25 @@ class SummariserService:
            self.current_job["progress"] = 100

        return result
+
+    def preprocess_text(self, text):
+        """Preprocess text to improve summarization quality."""
+        # Remove excessive whitespace
+        text = re.sub(r'\s+', ' ', text)
+
+        # Remove common web page boilerplate text
+        text = re.sub(r'Skip to (content|main).*?»', '', text)
+        text = re.sub(r'Search for:.*?Search', '', text)
+        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)
+
+        # Remove comment sections (often start with phrases like "X responses to")
+        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)
+
+        # Remove form fields and subscription prompts
+        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)
+
+        # Focus on the first part of very long texts (likely the main content)
+        if len(text) > 10000:
+            text = text[:10000]
+
+        return text
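
To illustrate what the new preprocess_text step removes, the same regex cleanup can be run standalone on a snippet of scraped page text. This is a sketch that reuses a subset of the patterns from the diff; it is not part of the committed module.

# Standalone sketch of the boilerplate stripping added in preprocess_text.
import re

def strip_boilerplate(text: str) -> str:
    text = re.sub(r'\s+', ' ', text)                                   # collapse whitespace
    text = re.sub(r'Skip to (content|main).*?»', '', text)             # nav "skip" links
    text = re.sub(r'Search for:.*?Search', '', text)                   # search widgets
    text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)  # comment sections
    return text.strip()

sample = "Skip to content » Main article text here. 12 responses to this post: great read!"
print(strip_boilerplate(sample))  # -> Main article text here.
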
app/services/url_extractor.py CHANGED
@@ -1,43 +1,88 @@
-import httpx
+try:
+    import aiohttp
+    AIOHTTP_AVAILABLE = True
+except ImportError:
+    AIOHTTP_AVAILABLE = False
+    import requests
+
 from bs4 import BeautifulSoup
 import re
+import logging
+
+logger = logging.getLogger(__name__)
 
 class URLExtractorService:
-    def __init__(self):
-        self.client = httpx.AsyncClient(timeout=30.0)
+    async def extract_content(self, url: str) -> str:
+        """Extract the main content from a URL."""
+        try:
+            if AIOHTTP_AVAILABLE:
+                return await self._extract_with_aiohttp(url)
+            else:
+                return self._extract_with_requests(url)
+        except Exception as e:
+            logger.error(f"Error extracting content from URL {url}: {str(e)}")
+            return ""
 
-    async def extract_content(self, url):
-        """
-        Extract the main content from a URL.
+    async def _extract_with_aiohttp(self, url: str) -> str:
+        """Extract content using aiohttp."""
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status != 200:
+                    return ""
 
-        Args:
-            url (str): The URL to extract content from
+                html = await response.text()
+                return self._parse_html(html)
 
-        Returns:
-            str: The extracted text content
-        """
-        try:
-            response = await self.client.get(url)
-            response.raise_for_status()
+    def _extract_with_requests(self, url: str) -> str:
+        """Extract content using requests as fallback."""
+        response = requests.get(url)
+        if response.status_code != 200:
+            return ""
 
-            soup = BeautifulSoup(response.text, 'html.parser')
+        html = response.text
+        return self._parse_html(html)
 
-            # Remove script and style elements
-            for script in soup(["script", "style", "header", "footer", "nav"]):
-                script.extract()
+    def _parse_html(self, html: str) -> str:
+        """Parse HTML and extract main content."""
+        soup = BeautifulSoup(html, 'html.parser')
 
-            # Get text and clean it
-            text = soup.get_text()
+        # Remove elements that typically contain comments or irrelevant content
+        for element in soup.select('footer, .comments, #comments, .comment, .respond, .reply, .sidebar, nav, header, script, style, [id*=comment], [class*=comment]'):
+            element.decompose()
 
-            # Break into lines and remove leading/trailing space
-            lines = (line.strip() for line in text.splitlines())
-            # Break multi-headlines into a line each
-            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-            # Remove blank lines
-            text = '\n'.join(chunk for chunk in chunks if chunk)
+        # Try to find the main content using common article containers
+        main_content = None
 
-            return text
-        except Exception as e:
-            raise Exception(f"Failed to extract content from URL: {str(e)}")
-        finally:
-            await self.client.aclose()
+        # Look for article tag first
+        if soup.find('article'):
+            main_content = soup.find('article')
+        # Then try common content div classes/ids
+        elif soup.find(class_=re.compile(r'(content|post|article|entry)(-body|-content|-text)?$', re.I)):
+            main_content = soup.find(class_=re.compile(r'(content|post|article|entry)(-body|-content|-text)?$', re.I))
+        # Then try main tag
+        elif soup.find('main'):
+            main_content = soup.find('main')
+
+        if main_content:
+            # Extract text from the main content
+            text = main_content.get_text(separator=' ', strip=True)
+        else:
+            # Fallback to body if no main content container is found
+            text = soup.body.get_text(separator=' ', strip=True)
+
+        # Clean up the text
+        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
+        text = re.sub(r'(\.|\?|!)\s+', r'\1\n\n', text)  # Add paragraph breaks after sentences
+
+        # Remove common web page boilerplate text
+        text = re.sub(r'Skip to (content|main).*?»', '', text)
+        text = re.sub(r'Search for:.*?Search', '', text)
+        text = re.sub(r'Menu.*?Resources', '', text, flags=re.DOTALL)
+
+        # Remove comment sections (often start with phrases like "X responses to")
+        text = re.sub(r'\d+ responses to.*?$', '', text, flags=re.DOTALL)
+
+        # Remove form fields and subscription prompts
+        text = re.sub(r'(Your email address will not be published|Required fields are marked).*?$', '', text, flags=re.DOTALL)
+
+        return text
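
A minimal way to exercise the rewritten extractor outside the API, assuming aiohttp is installed (it is added to requirements.txt in this commit). The URL is a placeholder.

# Sketch: call the updated URLExtractorService directly.
import asyncio
from app.services.url_extractor import URLExtractorService

async def main():
    extractor = URLExtractorService()
    text = await extractor.extract_content("https://example.com/blog/some-post")  # placeholder URL
    print(f"Extracted {len(text)} characters")
    print(text[:500])

if __name__ == "__main__":
    asyncio.run(main())
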
requirements.txt CHANGED
@@ -1,15 +1,16 @@
-numpy==1.24.3
-torch==2.0.1
-transformers==4.30.2
+numpy>=1.21.0
+torch>=1.9.0
+transformers>=4.11.3
 huggingface_hub==0.16.4
-fastapi==0.100.0
-uvicorn==0.22.0
-pydantic==1.10.8
-beautifulsoup4==4.12.2
-requests==2.31.0
+fastapi>=0.68.0,<0.69.0
+uvicorn>=0.15.0,<0.16.0
+pydantic>=1.8.0,<2.0.0
+beautifulsoup4>=4.10.0
+requests>=2.26.0
 sentencepiece==0.1.99
-python-dotenv==1.0.0
+python-dotenv>=0.19.0
 httpx==0.24.1
 accelerate==0.21.0
 pytest==7.3.1
 pytest-cov==4.1.0
+aiohttp>=3.8.1