Spaces:

entropy25
/

sentiment-multi

Sleeping

App Files Files Community

entropy25 committed on Aug 4

Commit

f823c9f

verified ·

1 Parent(s): ec43aa0

Create data_utils.py

Browse files

Files changed (1) hide show

data_utils.py +191 -0

data_utils.py ADDED Viewed

	@@ -0,0 +1,191 @@

+import csv
+import json
+import io
+import tempfile
+import re
+import numpy as np
+from datetime import datetime
+from functools import lru_cache
+from collections import Counter
+from typing import List, Dict, Optional, Tuple
+import nltk
+from nltk.corpus import stopwords
+from config import config
+from models import handle_errors
+# Initialize NLTK
# Build the module-level stop-word set once at import time.
# This is best-effort: if fetching or loading the NLTK resources fails for
# any ordinary reason, fall back to a tiny built-in English list so the module
# still imports. `except Exception` (rather than a bare `except`) lets
# KeyboardInterrupt / SystemExit propagate instead of being swallowed.
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)
    STOP_WORDS = set(stopwords.words('english'))
except Exception:
    STOP_WORDS = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'}
+# Simplified Text Processing
class TextProcessor:
    """Stateless text helpers: normalisation and batch-input splitting."""

    @staticmethod
    @lru_cache(maxsize=config.CACHE_SIZE)
    def clean_text(text: str, remove_punctuation: bool = True, remove_numbers: bool = False) -> str:
        """Return a normalised copy of *text*.

        Text containing any character in U+4E00..U+9FFF is only stripped of
        surrounding whitespace and returned as-is. Everything else is
        lower-cased, optionally has digit runs and punctuation removed, and
        then loses stop words and tokens shorter than
        ``config.MIN_WORD_LENGTH``. Results are memoised per argument tuple.
        """
        stripped = text.strip()

        # Leave Chinese text untouched apart from the outer strip.
        if re.search(r'[\u4e00-\u9fff]', stripped) is not None:
            return stripped

        normalized = stripped.lower()
        if remove_numbers:
            normalized = re.sub(r'\d+', '', normalized)
        if remove_punctuation:
            normalized = re.sub(r'[^\w\s]', '', normalized)

        kept = []
        for token in normalized.split():
            if token in STOP_WORDS:
                continue
            if len(token) < config.MIN_WORD_LENGTH:
                continue
            kept.append(token)
        return ' '.join(kept)

    @staticmethod
    def parse_batch_input(text: str) -> List[str]:
        """Split *text* on newlines and return the non-blank lines, stripped."""
        entries = []
        for raw_line in text.strip().split('\n'):
            item = raw_line.strip()
            if item:
                entries.append(item)
        return entries
+# Enhanced History Manager
class HistoryManager:
    """Bounded in-memory log of analysis results with filtering and statistics.

    Entries are plain dicts. ``filter_by`` and ``get_stats`` index
    ``'sentiment'`` and ``'confidence'`` directly, so every stored entry is
    expected to carry both keys; ``'language'`` is optional and is treated
    as ``'en'`` when missing.
    """

    def __init__(self):
        self._history: List[Dict] = []

    def add(self, entry: Dict):
        """Stamp *entry* with the current time and append it to the history.

        The dict is modified in place: its ``'timestamp'`` key is set to
        ``datetime.now().isoformat()``. When the history grows beyond
        ``config.MAX_HISTORY_SIZE`` the oldest entries are discarded.
        """
        entry['timestamp'] = datetime.now().isoformat()
        self._history.append(entry)
        # Trim by explicit overflow count: slicing with ``[-limit:]`` would
        # keep everything when the limit is 0, because ``[-0:]`` is ``[0:]``.
        overflow = len(self._history) - config.MAX_HISTORY_SIZE
        if overflow > 0:
            del self._history[:overflow]

    def add_batch(self, entries: List[Dict]):
        """Add each entry in *entries* in order via :meth:`add`."""
        for entry in entries:
            self.add(entry)

    def get_all(self) -> List[Dict]:
        """Return a shallow copy of the full history, oldest first."""
        return self._history.copy()

    def get_recent(self, n: int = 10) -> List[Dict]:
        """Return a new list with the last *n* entries, oldest first.

        A non-positive *n* yields an empty list (a plain ``[-n:]`` slice would
        return the whole history for ``n == 0``).
        """
        if n <= 0:
            return []
        return self._history[-n:]

    def filter_by(self, sentiment: Optional[str] = None, language: Optional[str] = None,
                  min_confidence: Optional[float] = None) -> List[Dict]:
        """Return the entries matching every criterion that was supplied.

        Args:
            sentiment: Keep only entries whose ``'sentiment'`` equals this
                value. ``None`` or an empty string disables the filter.
            language: Keep only entries whose ``'language'`` (default
                ``'en'``) equals this value. ``None`` or an empty string
                disables the filter.
            min_confidence: Keep only entries whose ``'confidence'`` is at
                least this value. Only ``None`` disables the filter, so
                ``0.0`` is honoured as a real threshold.

        Returns:
            A new list; the internal history is never handed out, so callers
            may mutate the result freely.
        """
        filtered = list(self._history)
        if sentiment:
            filtered = [h for h in filtered if h['sentiment'] == sentiment]
        if language:
            filtered = [h for h in filtered if h.get('language', 'en') == language]
        if min_confidence is not None:
            filtered = [h for h in filtered if h['confidence'] >= min_confidence]
        return filtered

    def clear(self) -> int:
        """Remove every entry and return how many were removed."""
        count = len(self._history)
        self._history.clear()
        return count

    def size(self) -> int:
        """Return the number of stored entries."""
        return len(self._history)

    def get_stats(self) -> Dict:
        """Summarise the stored history.

        Returns:
            ``{}`` when the history is empty; otherwise a dict with the total
            entry count, per-label counts for ``'Positive'`` / ``'Negative'``
            / ``'Neutral'``, the mean / max / min confidence as plain Python
            floats, the number of distinct languages, and the most frequent
            language.
        """
        if not self._history:
            return {}
        sentiments = [item['sentiment'] for item in self._history]
        confidences = [item['confidence'] for item in self._history]
        languages = [item.get('language', 'en') for item in self._history]
        return {
            'total_analyses': len(self._history),
            'positive_count': sentiments.count('Positive'),
            'negative_count': sentiments.count('Negative'),
            'neutral_count': sentiments.count('Neutral'),
            # Convert NumPy scalars to built-in floats so the result contains
            # only plain Python types.
            'avg_confidence': float(np.mean(confidences)),
            'max_confidence': float(np.max(confidences)),
            'min_confidence': float(np.min(confidences)),
            'languages_detected': len(set(languages)),
            # The history is non-empty here, so there is always a top entry.
            'most_common_language': Counter(languages).most_common(1)[0][0],
        }
+# Universal Data Handler
class DataHandler:
    """Static helpers for exporting analysis results and reading uploads.

    Both public methods are wrapped by ``handle_errors`` (imported from
    ``models``). NOTE(review): presumably that decorator turns an uncaught
    exception into the given ``default_return`` -- confirm against ``models``.
    """

    @staticmethod
    @handle_errors(default_return=(None, "Export failed"))
    def export_data(data: List[Dict], format_type: str) -> Tuple[Optional[str], str]:
        """Write *data* to a new temporary file as CSV or JSON.

        Args:
            data: Entries to export. For CSV, missing keys fall back to
                ``''`` / ``0`` / ``'en'`` as per the ``.get`` defaults below.
            format_type: ``'csv'`` or ``'json'``; also used as the file suffix.

        Returns:
            ``(path, message)`` on success. ``(None, message)`` when *data*
            is empty or *format_type* is not supported. The file is created
            with ``delete=False``, so the caller owns its removal.

        Raises:
            Any exception from writing is re-raised after the partially
            written temp file has been closed and deleted.
        """
        import os  # local import: only needed to clean up after a failed write

        if not data:
            return None, "No data to export"
        # Reject unknown formats up front instead of handing back an empty
        # file together with a success message.
        if format_type not in ('csv', 'json'):
            return None, f"Unsupported export format: {format_type}"

        # newline='' is what the csv module expects of its file object; it
        # prevents doubled line endings on platforms that translate '\n'.
        temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False,
                                                suffix=f'.{format_type}', encoding='utf-8',
                                                newline='')
        try:
            # The with-block guarantees the handle is closed even if a row
            # fails to format or an entry is not JSON-serialisable.
            with temp_file:
                if format_type == 'csv':
                    writer = csv.writer(temp_file)
                    writer.writerow(['Timestamp', 'Text', 'Sentiment', 'Confidence', 'Language',
                                     'Pos_Prob', 'Neg_Prob', 'Neu_Prob', 'Word_Count'])
                    for entry in data:
                        writer.writerow([
                            entry.get('timestamp', ''),
                            entry.get('text', ''),
                            entry.get('sentiment', ''),
                            f"{entry.get('confidence', 0):.4f}",
                            entry.get('language', 'en'),
                            f"{entry.get('pos_prob', 0):.4f}",
                            f"{entry.get('neg_prob', 0):.4f}",
                            f"{entry.get('neu_prob', 0):.4f}",
                            entry.get('word_count', 0),
                        ])
                else:
                    json.dump(data, temp_file, indent=2, ensure_ascii=False)
        except Exception:
            # Do not leave a half-written export behind on disk.
            try:
                os.unlink(temp_file.name)
            except OSError:
                pass
            raise
        return temp_file.name, f"Exported {len(data)} entries"

    @staticmethod
    def _clean_cells(cells) -> List[str]:
        """Strip whitespace and surrounding double quotes; drop empty results."""
        stripped = (cell.strip().strip('"') for cell in cells)
        return [text for text in stripped if text]

    @staticmethod
    @handle_errors(default_return="")
    def process_file(file) -> str:
        """Return the text content of an uploaded file.

        Args:
            file: Object exposing ``read()`` and a ``name`` attribute. Bytes
                returned by ``read()`` are decoded as UTF-8 (a leading BOM is
                dropped); a ``str`` is used as-is.

        Returns:
            ``""`` for a falsy *file*. For a name ending in ``.csv`` (any
            letter case): the first column of every row after the header,
            cleaned and joined with newlines. Otherwise the decoded content
            unchanged.
        """
        if not file:
            return ""
        raw = file.read()
        content = raw.decode('utf-8-sig') if isinstance(raw, (bytes, bytearray)) else raw
        if not file.name.lower().endswith('.csv'):
            return content

        try:
            reader = csv.reader(io.StringIO(content))
            next(reader, None)  # Skip header; tolerate a completely empty file
            cells = [row[0] for row in reader if row]
        except csv.Error:
            # Malformed CSV: fall back to treating each line after the header
            # as a single text entry.
            cells = content.strip().split('\n')[1:]
        return '\n'.join(DataHandler._clean_cells(cells))