fedec65 committed on
Commit 6e492e1 · verified · 1 Parent(s): 4ff500a

Create app.py

Files changed (1)
  1. app.py +488 -0
app.py ADDED
@@ -0,0 +1,488 @@
import gradio as gr
import pandas as pd
import numpy as np
import os
import traceback
from typing import Tuple, Dict, Any, Optional
import tempfile


class FeedbackTransformer:
    """
    A class to transform feedback data with topic and sentiment columns
    into a binary format where each topic is a separate column.
    """

    def __init__(self,
                 topic_prefix="TOPIC_",
                 sentiment_prefix="SENTIMENT_",
                 category_prefix="Categories:",
                 text_column="TEXT",
                 recommendation_column="Q4_Weiterempfehlung"):
        """
        Initialize the FeedbackTransformer with column specifications.
        """
        self.topic_prefix = topic_prefix
        self.sentiment_prefix = sentiment_prefix
        self.category_prefix = category_prefix
        self.text_column = text_column
        self.recommendation_column = recommendation_column
        self.data = None
        self.transformed_data = None
        self.topic_cols = []
        self.sentiment_cols = []
        self.category_cols = []
        self.unique_topics = set()

    def load_data(self, file_obj):
        """
        Load data from the uploaded file object or file path.
        """
        if file_obj is None:
            raise ValueError("No file uploaded")

        # Gradio's File component may hand over a path string or a file-like object
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        _, file_ext = os.path.splitext(file_path)

        # Read the data based on file type
        try:
            if file_ext.lower() in ['.xlsx', '.xls']:
                self.data = pd.read_excel(file_path)
            elif file_ext.lower() == '.csv':
                # Try comma delimiter first
                try:
                    self.data = pd.read_csv(file_path, encoding='utf-8')
                except Exception:
                    # If comma fails, try tab delimiter
                    self.data = pd.read_csv(file_path, sep='\t', encoding='utf-8')
            else:
                # Default to tab-delimited
                self.data = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        except Exception as e:
            raise ValueError(f"Error reading file: {str(e)}")

        return len(self.data), len(self.data.columns)

    def identify_columns(self):
        """
        Identify topic, category, and sentiment columns in the data.
        """
        if self.data is None:
            raise ValueError("Data not loaded")

        # Extract columns based on prefixes
        self.topic_cols = [col for col in self.data.columns if self.topic_prefix in col]
        self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
        self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]

        # If no columns match the prefixes, all columns are still returned for manual selection
        all_cols = list(self.data.columns)

        return {
            'topic_cols': self.topic_cols,
            'sentiment_cols': self.sentiment_cols,
            'category_cols': self.category_cols,
            'all_columns': all_cols
        }

    def extract_unique_topics(self):
        """
        Extract all unique topics from the topic columns.
        """
        self.unique_topics = set()

        # Extract from topic columns
        for col in self.topic_cols:
            self.unique_topics.update(self.data[col].dropna().unique())

        # Also extract from category columns if they exist
        for col in self.category_cols:
            self.unique_topics.update(self.data[col].dropna().unique())

        # Remove empty topics
        self.unique_topics = {t for t in self.unique_topics if isinstance(t, str) and t.strip()}

        return len(self.unique_topics)

    @staticmethod
    def create_column_name(topic):
        """
        Create a standardized column name from a topic string.
        """
        # Remove special characters and standardize
        topic_clean = str(topic).strip()
        # Remove brackets and special characters
        topic_clean = topic_clean.replace('[', '').replace(']', '').replace('(', '').replace(')', '')
        topic_clean = topic_clean.replace('**', '').replace('*', '')
        topic_clean = topic_clean.replace('.', '_').replace(' ', '_').replace('&', 'and')
        topic_clean = topic_clean.replace(':', '_').replace('-', '_').replace('/', '_')
        # Collapse multiple underscores
        while '__' in topic_clean:
            topic_clean = topic_clean.replace('__', '_')
        return topic_clean.lower().strip('_')
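
    # Illustrative example (hypothetical topic string, not taken from the data):
    #   create_column_name("[**WORKSHOP] Service & Support: Response-Time")
    #   -> "workshop_service_and_support_response_time"
    # Brackets and asterisks are stripped, "&" becomes "and", and the remaining
    # separators collapse into single underscores before lowercasing.
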
    def transform_data(self):
        """
        Transform the data into binary topic columns with sentiment values.
        """
        if not self.unique_topics:
            self.extract_unique_topics()

        # Create output dataframe with feedback_id
        self.transformed_data = pd.DataFrame({'feedback_id': range(1, len(self.data) + 1)})

        # Initialize all topic columns to 0
        for topic in sorted(self.unique_topics):
            topic_col = self.create_column_name(topic)
            self.transformed_data[topic_col] = 0
            self.transformed_data[f'{topic_col}_sentiment'] = None

        # Fill in the data from topic columns
        for idx, row in self.data.iterrows():
            # Process topic columns with sentiments
            for i, t_col in enumerate(self.topic_cols):
                topic = row.get(t_col)

                # Find corresponding sentiment column
                if i < len(self.sentiment_cols):
                    sentiment = row.get(self.sentiment_cols[i])
                else:
                    sentiment = None

                if pd.notna(topic) and isinstance(topic, str) and topic.strip():
                    topic_col = self.create_column_name(topic)
                    if topic_col in self.transformed_data.columns:
                        self.transformed_data.loc[idx, topic_col] = 1

                        # Convert sentiment to numeric value
                        if pd.notna(sentiment) and isinstance(sentiment, str):
                            sentiment_lower = sentiment.lower()
                            if 'positive' in sentiment_lower:
                                self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 1
                            elif 'negative' in sentiment_lower:
                                self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 0
                            elif 'neutral' in sentiment_lower:
                                self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 0.5

            # Process category columns (these typically don't have sentiments)
            for c_col in self.category_cols:
                category = row.get(c_col)
                if pd.notna(category) and isinstance(category, str) and category.strip():
                    category_col = self.create_column_name(category)
                    if category_col in self.transformed_data.columns:
                        self.transformed_data.loc[idx, category_col] = 1

        # Add original text if available
        if self.text_column in self.data.columns:
            self.transformed_data['original_text'] = self.data[self.text_column]

        # Add recommendation score if available
        if self.recommendation_column in self.data.columns:
            self.transformed_data['recommendation_score'] = self.data[self.recommendation_column]

        return self.transformed_data.shape
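
    # Sketch of the resulting layout for a single hypothetical feedback row that
    # mentions "Beratung" positively and "Preis" negatively:
    #   feedback_id | beratung | beratung_sentiment | preis | preis_sentiment | ...
    #        1      |    1     |         1          |   1   |        0        | ...
    # Topics that a row does not mention keep 0 and a sentiment of None.
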
    def analyze_data(self):
        """
        Analyze the transformed data to provide insights.
        """
        if self.transformed_data is None:
            raise ValueError("No transformed data to analyze")

        # Identify topic columns
        topic_cols = [col for col in self.transformed_data.columns
                      if col != 'feedback_id' and
                      col != 'original_text' and
                      col != 'recommendation_score' and
                      not col.endswith('_sentiment')]

        # Count occurrences of each topic
        topic_counts = {}
        for topic in topic_cols:
            topic_counts[topic] = self.transformed_data[topic].sum()

        # Sort topics by frequency
        sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)

        # Prepare analysis summary
        analysis_text = "**Analysis Results**\n\n"
        analysis_text += f"Total feedbacks: {len(self.transformed_data)}\n"
        analysis_text += f"Unique topics: {len(topic_cols)}\n\n"

        analysis_text += "**Top 10 Most Frequent Topics:**\n"
        for topic, count in sorted_topics[:10]:
            analysis_text += f"- {topic}: {count} occurrences\n"

        # Calculate sentiment distributions for top topics
        analysis_text += "\n**Sentiment Distributions for Top 5 Topics:**\n"
        for topic, _ in sorted_topics[:5]:
            sentiment_col = f"{topic}_sentiment"
            if sentiment_col in self.transformed_data.columns:
                # Filter rows where the topic is present
                topic_rows = self.transformed_data[self.transformed_data[topic] == 1]

                positive = (topic_rows[sentiment_col] == 1.0).sum()
                negative = (topic_rows[sentiment_col] == 0.0).sum()
                neutral = (topic_rows[sentiment_col] == 0.5).sum()

                total = positive + negative + neutral

                if total > 0:
                    analysis_text += f"\n{topic} ({total} occurrences):\n"
                    analysis_text += f"  - Positive: {positive} ({positive/total*100:.1f}%)\n"
                    analysis_text += f"  - Negative: {negative} ({negative/total*100:.1f}%)\n"
                    analysis_text += f"  - Neutral: {neutral} ({neutral/total*100:.1f}%)\n"

        # Calculate number of topics per feedback
        self.transformed_data['topic_count'] = self.transformed_data[topic_cols].sum(axis=1)
        avg_topics = self.transformed_data['topic_count'].mean()
        max_topics = self.transformed_data['topic_count'].max()

        analysis_text += "\n**Topics per Feedback:**\n"
        analysis_text += f"- Average: {avg_topics:.2f}\n"
        analysis_text += f"- Maximum: {max_topics}\n"

        return analysis_text
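
    # Example of the percentage arithmetic above (hypothetical counts): with
    # positive=6, negative=3 and neutral=1, total=10, so the summary would read
    # "Positive: 6 (60.0%)", "Negative: 3 (30.0%)" and "Neutral: 1 (10.0%)".
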
    def save_transformed_data(self, output_format='xlsx'):
        """
        Save the transformed data and return the file path.
        """
        if self.transformed_data is None:
            raise ValueError("No transformed data to save")

        # Create a temporary file
        if output_format == 'xlsx':
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
            self.transformed_data.to_excel(temp_file.name, index=False)
        else:  # csv
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
            self.transformed_data.to_csv(temp_file.name, index=False)

        return temp_file.name
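

# Minimal programmatic usage sketch (assumes a local file "feedback.xlsx";
# kept as comments so importing this module stays side-effect free):
#
#   transformer = FeedbackTransformer()
#   transformer.load_data("feedback.xlsx")
#   transformer.identify_columns()
#   transformer.extract_unique_topics()
#   transformer.transform_data()
#   output_path = transformer.save_transformed_data(output_format="xlsx")

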
# Gradio interface functions
def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
                 text_column, recommendation_column, output_format, analyze_data):
    """
    Main processing function for the Gradio interface.
    """
    try:
        # Initialize transformer
        transformer = FeedbackTransformer(
            topic_prefix=topic_prefix,
            sentiment_prefix=sentiment_prefix,
            category_prefix=category_prefix,
            text_column=text_column,
            recommendation_column=recommendation_column
        )

        # Load data
        rows, cols = transformer.load_data(file_obj)
        status_msg = f"✅ Loaded {rows} rows and {cols} columns\n"

        # Identify columns
        col_info = transformer.identify_columns()
        status_msg += "\n📊 Found columns:\n"
        status_msg += f"- Topic columns: {len(col_info['topic_cols'])}\n"
        status_msg += f"- Sentiment columns: {len(col_info['sentiment_cols'])}\n"
        status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"

        # Extract unique topics
        num_topics = transformer.extract_unique_topics()
        status_msg += f"\n🎯 Found {num_topics} unique topics\n"

        # Transform data
        shape = transformer.transform_data()
        status_msg += f"\n✨ Transformed data shape: {shape[0]} rows × {shape[1]} columns\n"

        # Analyze if requested
        analysis_result = ""
        if analyze_data:
            analysis_result = transformer.analyze_data()

        # Save transformed data
        output_file = transformer.save_transformed_data(output_format)

        return status_msg, analysis_result, output_file

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
        return error_msg, "", None


def get_column_preview(file_obj):
    """
    Get a preview of columns in the uploaded file.
    """
    try:
        if file_obj is None:
            return "Please upload a file first."

        # Read the first few rows to get column names
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        _, file_ext = os.path.splitext(file_path)

        if file_ext.lower() in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, nrows=5)
        elif file_ext.lower() == '.csv':
            try:
                df = pd.read_csv(file_path, nrows=5)
            except Exception:
                df = pd.read_csv(file_path, sep='\t', nrows=5)
        else:
            df = pd.read_csv(file_path, sep='\t', nrows=5)

        columns = list(df.columns)
        preview = "**Available columns:**\n"
        for i, col in enumerate(columns, 1):
            preview += f"{i}. {col}\n"

        return preview

    except Exception as e:
        return f"Error reading file: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Feedback Topic & Sentiment Transformer") as demo:
    gr.Markdown("""
    # 📊 Feedback Topic & Sentiment Transformer

    Transform feedback data with topic and sentiment columns into a binary matrix format.
    Each unique topic becomes a separate column with 0/1 values and associated sentiment scores.

    ### 📋 Instructions:
    1. Upload your Excel, CSV, or tab-delimited text file
    2. Configure the column prefixes (or use the defaults)
    3. Click "Transform Data" to process
    4. Download the transformed file
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # File upload
            input_file = gr.File(
                label="Upload Input File",
                file_types=[".xlsx", ".xls", ".csv", ".txt"],
                type="filepath"
            )

            # Column preview button
            preview_btn = gr.Button("Preview Columns", variant="secondary")
            column_preview = gr.Textbox(
                label="Column Preview",
                lines=10,
                interactive=False
            )

        with gr.Column(scale=1):
            # Configuration parameters
            gr.Markdown("### ⚙️ Configuration")

            topic_prefix = gr.Textbox(
                label="Topic Column Prefix",
                value="[**WORKSHOP] SwissLife Taxonomy",
                info="Prefix to identify topic columns"
            )

            sentiment_prefix = gr.Textbox(
                label="Sentiment Column Prefix",
                value="ABSA:",
                info="Prefix to identify sentiment columns"
            )

            category_prefix = gr.Textbox(
                label="Category Column Prefix",
                value="Categories:",
                info="Prefix to identify category columns"
            )

            text_column = gr.Textbox(
                label="Text Column Name",
                value="TEXT",
                info="Column containing original feedback text"
            )

            recommendation_column = gr.Textbox(
                label="Recommendation Column Name",
                value="Q4_Weiterempfehlung",
                info="Column containing recommendation scores"
            )

            output_format = gr.Radio(
                label="Output Format",
                choices=["xlsx", "csv"],
                value="xlsx"
            )

            analyze_checkbox = gr.Checkbox(
                label="Analyze transformed data",
                value=True
            )

    # Transform button
    transform_btn = gr.Button("🔄 Transform Data", variant="primary", size="lg")

    # Output sections
    with gr.Row():
        with gr.Column():
            status_output = gr.Textbox(
                label="Processing Status",
                lines=10,
                interactive=False
            )

        with gr.Column():
            analysis_output = gr.Markdown(
                label="Data Analysis"
            )

    # Download section
    output_file = gr.File(
        label="📥 Download Transformed File",
        interactive=False
    )

    # Event handlers
    preview_btn.click(
        fn=get_column_preview,
        inputs=[input_file],
        outputs=[column_preview]
    )

    transform_btn.click(
        fn=process_file,
        inputs=[
            input_file,
            topic_prefix,
            sentiment_prefix,
            category_prefix,
            text_column,
            recommendation_column,
            output_format,
            analyze_checkbox
        ],
        outputs=[status_output, analysis_output, output_file]
    )

    # Examples section
    gr.Markdown("""
    ### 📝 Example Column Formats:
    - **Topic columns**: `[**WORKSHOP] SwissLife Taxonomy(Kommentar) 1`, `[**WORKSHOP] SwissLife Taxonomy(Kommentar) 2`
    - **Category columns**: `Categories:Topic1`, `Categories:Topic2`
    - **Sentiment columns**: `ABSA:Sentiment1`, `ABSA:Sentiment2`

    ### 🎯 Output Format:
    - Each unique topic becomes a column with values 0 (absent) or 1 (present)
    - Each topic has an associated `_sentiment` column with values:
      - 1.0 = Positive
      - 0.5 = Neutral
      - 0.0 = Negative
    - Original text and recommendation scores are preserved if available
    """)


# Launch the app
if __name__ == "__main__":
    demo.launch()
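    # launch() serves the app locally by default; standard Gradio options such as
    # demo.launch(share=True) or demo.launch(server_name="0.0.0.0") could be
    # swapped in if remote access were needed (not enabled here).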