Spaces:

entropy25
/

sentiment-multi

Sleeping

App Files Files Community

entropy25 commited on Aug 4

Commit

c185c85

verified ·

1 Parent(s): 33fba47

Create analysis.py

Browse files

Files changed (1) hide show

analysis.py +301 -0

analysis.py ADDED Viewed

	@@ -0,0 +1,301 @@

+import torch
+import numpy as np
+import logging
+import plotly.graph_objects as go
+from typing import Tuple, Dict
+# Advanced analysis imports
+import shap
+import lime
+from lime.lime_text import LimeTextExplainer
+from config import config
+from models import ModelManager, handle_errors
+logger = logging.getLogger(__name__)
+class AdvancedAnalysisEngine:
+    """Advanced analysis using SHAP and LIME with FIXED implementation"""
+    def __init__(self):
+        self.model_manager = ModelManager()
+    def create_prediction_function(self, model, tokenizer, device):
+        """Create FIXED prediction function for SHAP/LIME"""
+        def predict_proba(texts):
+            # Ensure texts is a list
+            if isinstance(texts, str):
+                texts = [texts]
+            elif isinstance(texts, np.ndarray):
+                texts = texts.tolist()
+            # Convert all elements to strings
+            texts = [str(text) for text in texts]
+            results = []
+            batch_size = 16  # Process in smaller batches
+            for i in range(0, len(texts), batch_size):
+                batch_texts = texts[i:i + batch_size]
+                try:
+                    with torch.no_grad():
+                        # Tokenize batch
+                        inputs = tokenizer(
+                            batch_texts,
+                            return_tensors="pt",
+                            padding=True,
+                            truncation=True,
+                            max_length=config.MAX_TEXT_LENGTH
+                        ).to(device)
+                        # Batch inference
+                        outputs = model(**inputs)
+                        probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()
+                        results.extend(probs)
+                except Exception as e:
+                    logger.error(f"Prediction batch failed: {e}")
+                    # Return neutral predictions for failed batch
+                    batch_size_actual = len(batch_texts)
+                    if hasattr(model.config, 'num_labels') and model.config.num_labels == 3:
+                        neutral_probs = np.array([[0.33, 0.34, 0.33]] * batch_size_actual)
+                    else:
+                        neutral_probs = np.array([[0.5, 0.5]] * batch_size_actual)
+                    results.extend(neutral_probs)
+            return np.array(results)
+        return predict_proba
+    @handle_errors(default_return=("Analysis failed", None, None))
+    def analyze_with_shap(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
+        """FIXED SHAP analysis implementation"""
+        if not text.strip():
+            return "Please enter text for analysis", None, {}
+        # Detect language and get model
+        if language == 'auto':
+            detected_lang = self.model_manager.detect_language(text)
+        else:
+            detected_lang = language
+        model, tokenizer = self.model_manager.get_model(detected_lang)
+        try:
+            # Create FIXED prediction function
+            predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
+            # Test the prediction function first
+            test_pred = predict_fn([text])
+            if test_pred is None or len(test_pred) == 0:
+                return "Prediction function test failed", None, {}
+            # Use SHAP Text Explainer instead of generic Explainer
+            explainer = shap.Explainer(predict_fn, masker=shap.maskers.Text(tokenizer))
+            # Get SHAP values with proper text input
+            shap_values = explainer([text], max_evals=num_samples)
+            # Extract data safely
+            if hasattr(shap_values, 'data') and hasattr(shap_values, 'values'):
+                tokens = shap_values.data[0] if len(shap_values.data) > 0 else []
+                values = shap_values.values[0] if len(shap_values.values) > 0 else []
+            else:
+                return "SHAP values extraction failed", None, {}
+            if len(tokens) == 0 or len(values) == 0:
+                return "No tokens or values extracted from SHAP", None, {}
+            # Handle multi-dimensional values
+            if len(values.shape) > 1:
+                # Use positive class values (last column for 3-class, second for 2-class)
+                pos_values = values[:, -1] if values.shape[1] >= 2 else values[:, 0]
+            else:
+                pos_values = values
+            # Ensure we have matching lengths
+            min_len = min(len(tokens), len(pos_values))
+            tokens = tokens[:min_len]
+            pos_values = pos_values[:min_len]
+            # Create visualization
+            fig = go.Figure()
+            colors = ['red' if v < 0 else 'green' for v in pos_values]
+            fig.add_trace(go.Bar(
+                x=list(range(len(tokens))),
+                y=pos_values,
+                text=tokens,
+                textposition='outside',
+                marker_color=colors,
+                name='SHAP Values',
+                hovertemplate='<b>%{text}</b><br>SHAP Value: %{y:.4f}<extra></extra>'
+            ))
+            fig.update_layout(
+                title=f"SHAP Analysis - Token Importance (Samples: {num_samples})",
+                xaxis_title="Token Index",
+                yaxis_title="SHAP Value",
+                height=500,
+                xaxis=dict(tickmode='array', tickvals=list(range(len(tokens))), ticktext=tokens)
+            )
+            # Create analysis summary
+            analysis_data = {
+                'method': 'SHAP',
+                'language': detected_lang,
+                'total_tokens': len(tokens),
+                'samples_used': num_samples,
+                'positive_influence': sum(1 for v in pos_values if v > 0),
+                'negative_influence': sum(1 for v in pos_values if v < 0),
+                'most_important_tokens': [(str(tokens[i]), float(pos_values[i]))
+                                        for i in np.argsort(np.abs(pos_values))[-5:]]
+            }
+            summary_text = f"""
+**SHAP Analysis Results:**
+- **Language:** {detected_lang.upper()}
+- **Total Tokens:** {analysis_data['total_tokens']}
+- **Samples Used:** {num_samples}
+- **Positive Influence Tokens:** {analysis_data['positive_influence']}
+- **Negative Influence Tokens:** {analysis_data['negative_influence']}
+- **Most Important Tokens:** {', '.join([f"{token}({score:.3f})" for token, score in analysis_data['most_important_tokens']])}
+- **Status:** SHAP analysis completed successfully
+            """
+            return summary_text, fig, analysis_data
+        except Exception as e:
+            logger.error(f"SHAP analysis failed: {e}")
+            error_msg = f"""
+**SHAP Analysis Failed:**
+- **Error:** {str(e)}
+- **Language:** {detected_lang.upper()}
+- **Suggestion:** Try with a shorter text or reduce number of samples
+**Common fixes:**
+- Reduce sample size to 50-100
+- Use shorter input text (< 200 words)
+- Check if model supports the text language
+            """
+            return error_msg, None, {}
+    @handle_errors(default_return=("Analysis failed", None, None))
+    def analyze_with_lime(self, text: str, language: str = 'auto', num_samples: int = 100) -> Tuple[str, go.Figure, Dict]:
+        """FIXED LIME analysis implementation - Bug Fix for mode parameter"""
+        if not text.strip():
+            return "Please enter text for analysis", None, {}
+        # Detect language and get model
+        if language == 'auto':
+            detected_lang = self.model_manager.detect_language(text)
+        else:
+            detected_lang = language
+        model, tokenizer = self.model_manager.get_model(detected_lang)
+        try:
+            # Create FIXED prediction function
+            predict_fn = self.create_prediction_function(model, tokenizer, self.model_manager.device)
+            # Test the prediction function first
+            test_pred = predict_fn([text])
+            if test_pred is None or len(test_pred) == 0:
+                return "Prediction function test failed", None, {}
+            # Determine class names based on model output
+            num_classes = test_pred.shape[1] if len(test_pred.shape) > 1 else 2
+            if num_classes == 3:
+                class_names = ['Negative', 'Neutral', 'Positive']
+            else:
+                class_names = ['Negative', 'Positive']
+            # Initialize LIME explainer - FIXED: Remove 'mode' parameter
+            explainer = LimeTextExplainer(class_names=class_names)
+            # Get LIME explanation
+            exp = explainer.explain_instance(
+                text,
+                predict_fn,
+                num_features=min(20, len(text.split())),  # Limit features
+                num_samples=num_samples
+            )
+            # Extract feature importance
+            lime_data = exp.as_list()
+            if not lime_data:
+                return "No LIME features extracted", None, {}
+            # Create visualization
+            words = [item[0] for item in lime_data]
+            scores = [item[1] for item in lime_data]
+            fig = go.Figure()
+            colors = ['red' if s < 0 else 'green' for s in scores]
+            fig.add_trace(go.Bar(
+                y=words,
+                x=scores,
+                orientation='h',
+                marker_color=colors,
+                text=[f'{s:.3f}' for s in scores],
+                textposition='auto',
+                name='LIME Importance',
+                hovertemplate='<b>%{y}</b><br>Importance: %{x:.4f}<extra></extra>'
+            ))
+            fig.update_layout(
+                title=f"LIME Analysis - Feature Importance (Samples: {num_samples})",
+                xaxis_title="Importance Score",
+                yaxis_title="Words/Phrases",
+                height=500
+            )
+            # Create analysis summary
+            analysis_data = {
+                'method': 'LIME',
+                'language': detected_lang,
+                'features_analyzed': len(lime_data),
+                'samples_used': num_samples,
+                'positive_features': sum(1 for _, score in lime_data if score > 0),
+                'negative_features': sum(1 for _, score in lime_data if score < 0),
+                'feature_importance': lime_data
+            }
+            summary_text = f"""
+**LIME Analysis Results:**
+- **Language:** {detected_lang.upper()}
+- **Features Analyzed:** {analysis_data['features_analyzed']}
+- **Classes:** {', '.join(class_names)}
+- **Samples Used:** {num_samples}
+- **Positive Features:** {analysis_data['positive_features']}
+- **Negative Features:** {analysis_data['negative_features']}
+- **Top Features:** {', '.join([f"{word}({score:.3f})" for word, score in lime_data[:5]])}
+- **Status:** LIME analysis completed successfully
+            """
+            return summary_text, fig, analysis_data
+        except Exception as e:
+            logger.error(f"LIME analysis failed: {e}")
+            error_msg = f"""
+**LIME Analysis Failed:**
+- **Error:** {str(e)}
+- **Language:** {detected_lang.upper()}
+- **Suggestion:** Try with a shorter text or reduce number of samples
+**Bug Fix Applied:**
+- ✅ Removed 'mode' parameter from LimeTextExplainer initialization
+- ✅ This should resolve the "unexpected keyword argument 'mode'" error
+**Common fixes:**
+- Reduce sample size to 50-100
+- Use shorter input text (< 200 words)
+- Check if model supports the text language
+            """
+            return error_msg, None, {}