geekgirl3 committed on
Commit
20f8cb2
·
verified ·
1 Parent(s): 36c3a00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +359 -627
app.py CHANGED
@@ -2,363 +2,196 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import os
5
- import traceback
6
- from typing import Tuple, Dict, Any, Optional, List
7
- import tempfile
8
- import io
9
- import datetime
10
  import re
11
-
12
- class FeedbackTransformer:
13
- """
14
- A class to transform feedback data with delimited topic and sentiment columns
15
- into binary columns with prefixes T_, S_, and C_.
16
- """
17
-
18
- def __init__(self,
19
- topic_prefix="TOPIC_",
20
- sentiment_prefix="SENTIMENT_",
21
- category_prefix="Categories:",
22
- text_column="TEXT",
23
- recommendation_column="Q4_Weiterempfehlung"):
24
- """
25
- Initialize the FeedbackTransformer with column specifications.
26
- """
27
- self.topic_prefix = topic_prefix
28
- self.sentiment_prefix = sentiment_prefix
29
- self.category_prefix = category_prefix
30
- self.text_column = text_column
31
- self.recommendation_column = recommendation_column
32
- self.data = None
33
- self.transformed_data = None
34
- self.topic_cols = []
35
- self.sentiment_cols = []
36
- self.category_cols = []
37
- self.unique_topics = set()
38
- self.unique_categories = set()
39
- self.unique_sentiments = set()
40
- self.topic_sentiment_mapping = {} # Map topics to their sentiment values
41
- self.file_name = None
42
  self.original_filename = None
43
- self.selected_columns = []
44
- self.verbatim_column = None # Store the verbatim/text column
45
-
46
- def load_data(self, file_obj):
47
- """
48
- Load data from the uploaded file object.
49
- """
50
- if file_obj is None:
51
- raise ValueError("No file uploaded")
52
-
53
- # Get file extension and store original filename
54
- file_name = file_obj if isinstance(file_obj, str) else (file_obj.name if hasattr(file_obj, 'name') else 'unknown')
55
- self.original_filename = os.path.splitext(os.path.basename(file_name))[0]
56
- _, file_ext = os.path.splitext(file_name)
57
-
58
- # Read the data based on file type
59
  try:
60
- if file_ext.lower() in ['.xlsx', '.xls']:
61
- self.data = pd.read_excel(file_obj)
62
- elif file_ext.lower() == '.csv':
63
- # Try comma delimiter first
64
- try:
65
- self.data = pd.read_csv(file_obj, encoding='utf-8')
66
- except:
67
- # If comma fails, try tab delimiter
68
- self.data = pd.read_csv(file_obj, sep='\t', encoding='utf-8')
69
- else:
70
- # Default to tab-delimited
71
- self.data = pd.read_csv(file_obj, sep='\t', encoding='utf-8')
72
  except Exception as e:
73
- raise ValueError(f"Error reading file: {str(e)}")
74
-
75
- return len(self.data), len(self.data.columns)
76
-
77
- def identify_columns(self):
78
- """
79
- Identify topic, category, and sentiment columns in the data.
80
- """
81
- if self.data is None:
82
- raise ValueError("Data not loaded")
83
-
84
- # Extract columns based on prefixes
85
- self.topic_cols = [col for col in self.data.columns if "Topic:" in col]
86
- self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
87
- self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]
88
-
89
- # Try to identify verbatim/text column
90
- text_candidates = [col for col in self.data.columns if any(keyword in col.lower() for keyword in ['text', 'verbatim', 'comment', 'feedback'])]
91
- if text_candidates:
92
- self.verbatim_column = text_candidates[0] # Use the first match
93
- elif self.text_column in self.data.columns:
94
- self.verbatim_column = self.text_column
95
-
96
- # If no columns found with specified prefixes, return all columns for manual selection
97
- all_cols = list(self.data.columns)
98
-
99
- return {
100
- 'topic_cols': self.topic_cols,
101
- 'sentiment_cols': self.sentiment_cols,
102
- 'category_cols': self.category_cols,
103
- 'all_columns': all_cols,
104
- 'verbatim_column': self.verbatim_column
105
- }
106
-
107
- def extract_unique_topics_and_categories(self):
108
- """
109
- Extract all unique topics, categories, and sentiments from the respective columns.
110
- """
111
- self.unique_topics = set()
112
- self.unique_categories = set()
113
- self.unique_sentiments = set()
114
- self.topic_sentiment_mapping = {}
115
-
116
- # Extract from topic columns (delimited by |)
117
- for col in self.topic_cols:
118
- for value in self.data[col].dropna():
119
- if isinstance(value, str) and value.strip():
120
- # Split by | delimiter and clean each topic
121
- topics = [topic.strip() for topic in value.split('|') if topic.strip()]
122
- self.unique_topics.update(topics)
123
-
124
- # Extract from category columns (delimited by |)
125
- for col in self.category_cols:
126
- for value in self.data[col].dropna():
127
- if isinstance(value, str) and value.strip():
128
- # Split by | delimiter and clean each category
129
- categories = [cat.strip() for cat in value.split('|') if cat.strip()]
130
- self.unique_categories.update(categories)
131
-
132
- # Extract sentiments from sentiment columns and build topic-sentiment mapping
133
- for col in self.sentiment_cols:
134
- for idx, value in enumerate(self.data[col].dropna()):
135
- if isinstance(value, str) and value.strip():
136
- # Split by | delimiter to get individual topic::sentiment pairs
137
- pairs = [pair.strip() for pair in value.split('|') if pair.strip() and '::' in pair]
138
- for pair in pairs:
139
- if '::' in pair:
140
- topic_part, sentiment_part = pair.split('::', 1)
141
- topic = topic_part.strip()
142
- sentiment = sentiment_part.strip()
143
- if topic and sentiment:
144
- self.unique_topics.add(topic) # Add topic from sentiment data
145
- self.unique_sentiments.add(sentiment)
146
-
147
- # Store the mapping for later use
148
- if idx not in self.topic_sentiment_mapping:
149
- self.topic_sentiment_mapping[idx] = {}
150
- self.topic_sentiment_mapping[idx][topic] = sentiment
151
-
152
- return len(self.unique_topics), len(self.unique_categories), len(self.unique_sentiments)
153
-
154
- def set_selected_columns(self, selected_columns):
155
- """
156
- Set which original columns should be included in the output.
157
- """
158
- self.selected_columns = selected_columns if selected_columns else []
159
-
160
- def transform_data(self):
161
- """
162
- Transform the data into binary columns with T_, S_, and C_ prefixes.
163
- """
164
- if not self.unique_topics and not self.unique_categories:
165
- self.extract_unique_topics_and_categories()
166
-
167
- # Create output dataframe starting with feedback_id
168
- self.transformed_data = pd.DataFrame({'feedback_id': range(1, len(self.data) + 1)})
169
-
170
- # Add selected original columns first (right after feedback_id)
171
- for col in self.selected_columns:
172
- if col in self.data.columns:
173
- self.transformed_data[col] = self.data[col]
174
-
175
- # Add Verbatim sentiment columns
176
- self.transformed_data['Verbatim_Positive'] = 0
177
- self.transformed_data['Verbatim_Neutral'] = 0
178
- self.transformed_data['Verbatim_Negative'] = 0
179
-
180
- # Create binary topic columns with T_ prefix
181
- for topic in sorted(self.unique_topics):
182
- safe_topic_name = self._make_safe_column_name(topic)
183
- col_name = f"T_{safe_topic_name}"
184
- self.transformed_data[col_name] = 0
185
-
186
- # Create sentiment columns with S_ prefix (one per topic, containing actual sentiment values)
187
- for topic in sorted(self.unique_topics):
188
- safe_topic_name = self._make_safe_column_name(topic)
189
- col_name = f"S_{safe_topic_name}"
190
- self.transformed_data[col_name] = "" # Initialize with empty strings
191
-
192
- # Create binary category columns with C_ prefix
193
- for category in sorted(self.unique_categories):
194
- safe_category_name = self._make_safe_column_name(category)
195
- col_name = f"C_{safe_category_name}"
196
- self.transformed_data[col_name] = 0
197
-
198
- # Fill in the data
199
- for idx, row in self.data.iterrows():
200
- # Process sentiment columns to determine which topics exist in ABSA column
201
- topics_in_absa = set()
202
- all_sentiments_in_row = set() # Track all sentiments for verbatim columns
203
-
204
- for s_col in self.sentiment_cols:
205
- sentiment_value = row.get(s_col)
206
- if pd.notna(sentiment_value) and isinstance(sentiment_value, str) and sentiment_value.strip():
207
- pairs = [pair.strip() for pair in sentiment_value.split('|') if pair.strip()]
208
- for pair in pairs:
209
- if '::' in pair:
210
- topic_part, sentiment_part = pair.split('::', 1)
211
- topic = topic_part.strip()
212
- sentiment = sentiment_part.strip()
213
-
214
- if topic and sentiment:
215
- topics_in_absa.add(topic)
216
- all_sentiments_in_row.add(sentiment.lower()) # Store in lowercase for matching
217
-
218
- # Set the actual sentiment value (not 1/0)
219
- safe_topic_name = self._make_safe_column_name(topic)
220
- sentiment_col_name = f"S_{safe_topic_name}"
221
- if sentiment_col_name in self.transformed_data.columns:
222
- self.transformed_data.loc[idx, sentiment_col_name] = sentiment
223
-
224
- # Set Verbatim sentiment columns based on sentiments found in ABSA
225
- if any(sentiment in all_sentiments_in_row for sentiment in ['positive', 'positiv']):
226
- self.transformed_data.loc[idx, 'Verbatim_Positive'] = 1
227
- if any(sentiment in all_sentiments_in_row for sentiment in ['neutral']):
228
- self.transformed_data.loc[idx, 'Verbatim_Neutral'] = 1
229
- if any(sentiment in all_sentiments_in_row for sentiment in ['negative', 'negativ']):
230
- self.transformed_data.loc[idx, 'Verbatim_Negative'] = 1
231
-
232
- # Set T_ columns to 1 if topic exists in ABSA column, 0 otherwise
233
- for topic in topics_in_absa:
234
- safe_topic_name = self._make_safe_column_name(topic)
235
- topic_col_name = f"T_{safe_topic_name}"
236
- if topic_col_name in self.transformed_data.columns:
237
- self.transformed_data.loc[idx, topic_col_name] = 1
238
-
239
- # Process category columns
240
- categories_in_row = set()
241
- for c_col in self.category_cols:
242
- category_value = row.get(c_col)
243
- if pd.notna(category_value) and isinstance(category_value, str) and category_value.strip():
244
- categories = [cat.strip() for cat in category_value.split('|') if cat.strip()]
245
- categories_in_row.update(categories)
246
-
247
- # Set category binary values (always 1 if present in category column)
248
- for category in categories_in_row:
249
- safe_category_name = self._make_safe_column_name(category)
250
- category_col_name = f"C_{safe_category_name}"
251
- if category_col_name in self.transformed_data.columns:
252
- self.transformed_data.loc[idx, category_col_name] = 1
253
-
254
- return self.transformed_data.shape
255
-
256
- def _make_safe_column_name(self, name):
257
- """
258
- Convert a name to a safe column name by removing/replacing problematic characters.
259
- """
260
- # Replace spaces and special characters with underscores
261
- safe_name = re.sub(r'[^\w]', '_', str(name))
262
- # Remove multiple consecutive underscores
263
- safe_name = re.sub(r'_+', '_', safe_name)
264
- # Remove leading/trailing underscores
265
- safe_name = safe_name.strip('_')
266
- return safe_name
267
-
268
- def analyze_data(self):
269
- """
270
- Analyze the transformed data to provide insights.
271
- """
272
- if self.transformed_data is None:
273
- raise ValueError("No transformed data to analyze")
274
-
275
- # Count different types of columns
276
- topic_cols = [col for col in self.transformed_data.columns if col.startswith('T_')]
277
- sentiment_cols = [col for col in self.transformed_data.columns if col.startswith('S_')]
278
- category_cols = [col for col in self.transformed_data.columns if col.startswith('C_')]
279
- verbatim_cols = ['Verbatim_Positive', 'Verbatim_Neutral', 'Verbatim_Negative']
280
-
281
- # Calculate statistics
282
- topic_stats = {}
283
- for col in topic_cols:
284
- topic_stats[col] = self.transformed_data[col].sum()
285
-
286
- # For sentiment columns, count non-empty values
287
- sentiment_stats = {}
288
- for col in sentiment_cols:
289
- sentiment_stats[col] = (self.transformed_data[col] != "").sum()
290
-
291
- category_stats = {}
292
- for col in category_cols:
293
- category_stats[col] = self.transformed_data[col].sum()
294
-
295
- # Verbatim sentiment statistics
296
- verbatim_stats = {}
297
- for col in verbatim_cols:
298
- if col in self.transformed_data.columns:
299
- verbatim_stats[col] = self.transformed_data[col].sum()
300
-
301
- # Sort by frequency
302
- sorted_topics = sorted(topic_stats.items(), key=lambda x: x[1], reverse=True)
303
- sorted_sentiments = sorted(sentiment_stats.items(), key=lambda x: x[1], reverse=True)
304
- sorted_categories = sorted(category_stats.items(), key=lambda x: x[1], reverse=True)
305
- sorted_verbatim = sorted(verbatim_stats.items(), key=lambda x: x[1], reverse=True)
306
-
307
- # Prepare analysis summary
308
- analysis_text = f"**Analysis Results**\n\n"
309
- analysis_text += f"Total feedbacks: {len(self.transformed_data)}\n"
310
- analysis_text += f"Selected original columns: {len(self.selected_columns)}\n"
311
- analysis_text += f"Verbatim sentiment columns: 3 (Positive, Neutral, Negative)\n"
312
- analysis_text += f"Topic columns (T_): {len(topic_cols)}\n"
313
- analysis_text += f"Sentiment columns (S_): {len(sentiment_cols)}\n"
314
- analysis_text += f"Category columns (C_): {len(category_cols)}\n"
315
- analysis_text += f"Verbatim column used: {self.verbatim_column}\n\n"
316
-
317
- if self.selected_columns:
318
- analysis_text += f"**Included Original Columns:** {', '.join(self.selected_columns)}\n\n"
319
-
320
- # Verbatim sentiment analysis
321
- if sorted_verbatim:
322
- analysis_text += "**Verbatim Sentiment Distribution:**\n"
323
- for verbatim_col, count in sorted_verbatim:
324
- percentage = (count / len(self.transformed_data)) * 100
325
- analysis_text += f"- {verbatim_col}: {count} occurrences ({percentage:.1f}%)\n"
326
-
327
- # Topic analysis
328
- if sorted_topics:
329
- analysis_text += "\n**Top 10 Most Frequent Topics (T_):**\n"
330
- for topic_col, count in sorted_topics[:10]:
331
- analysis_text += f"- {topic_col}: {count} occurrences\n"
332
-
333
- # Category analysis
334
- if sorted_categories:
335
- analysis_text += "\n**Top 10 Most Frequent Categories (C_):**\n"
336
- for category_col, count in sorted_categories[:10]:
337
- analysis_text += f"- {category_col}: {count} occurrences\n"
338
-
339
- # Sentiment analysis
340
- if sorted_sentiments:
341
- analysis_text += "\n**Top 10 Most Frequent Sentiments (S_):**\n"
342
- for sentiment_col, count in sorted_sentiments[:10]:
343
- analysis_text += f"- {sentiment_col}: {count} sentiment values\n"
344
-
345
- return analysis_text
346
-
347
  def save_transformed_data(self, output_format='xlsx'):
348
- """
349
- Save the transformed data and return the file path.
350
- """
351
- if self.transformed_data is None:
352
  raise ValueError("No transformed data to save")
353
 
354
  # Create filename with original filename prefix and timestamp
355
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
356
- prefix = self.original_filename if self.original_filename else 'transformed_feedback'
357
 
358
  if output_format == 'xlsx':
359
- filename = f"{prefix}_transformed_topics_{timestamp}.xlsx"
360
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
361
- self.transformed_data.to_excel(temp_file.name, index=False)
362
  temp_file.close()
363
 
364
  final_path = os.path.join(tempfile.gettempdir(), filename)
@@ -367,9 +200,9 @@ class FeedbackTransformer:
367
  os.rename(temp_file.name, final_path)
368
 
369
  else: # csv
370
- filename = f"{prefix}_binary_matrix_{timestamp}.csv"
371
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
372
- self.transformed_data.to_csv(temp_file.name, index=False)
373
  temp_file.close()
374
 
375
  final_path = os.path.join(tempfile.gettempdir(), filename)
@@ -382,257 +215,175 @@ class FeedbackTransformer:
382
 
383
  return final_path
384
 
 
 
385
 
386
- # Gradio interface functions
387
- def get_column_selector(file_obj):
388
- """
389
- Get a combined column preview and selector interface.
390
- """
391
  try:
392
- if file_obj is None:
393
- return gr.CheckboxGroup(
394
- choices=[],
395
- value=[],
396
- label="πŸ“‹ Select Columns to Include",
397
- info="Upload a file first to see available columns"
398
- )
399
-
400
- # Read first few rows to get column names
401
- file_name = file_obj if isinstance(file_obj, str) else (file_obj.name if hasattr(file_obj, 'name') else 'unknown')
402
- _, file_ext = os.path.splitext(file_name)
403
-
404
- if file_ext.lower() in ['.xlsx', '.xls']:
405
- df = pd.read_excel(file_obj, nrows=5)
406
- elif file_ext.lower() == '.csv':
407
- try:
408
- df = pd.read_csv(file_obj, nrows=5)
409
- except:
410
- df = pd.read_csv(file_obj, sep='\t', nrows=5)
411
- else:
412
- df = pd.read_csv(file_obj, sep='\t', nrows=5)
413
-
414
- columns = list(df.columns)
415
- column_choices = [f"{i+1:2d}. {col}" for i, col in enumerate(columns)]
416
-
417
- return gr.CheckboxGroup(
418
- choices=column_choices,
419
- value=[],
420
- label=f"πŸ“‹ Select Columns to Include ({len(columns)} available)",
421
- info="Choose which original columns to include in the transformed file (in addition to feedback_id).",
422
- elem_classes=["column-selector"]
423
  )
424
-
425
  except Exception as e:
426
- return gr.CheckboxGroup(
427
- choices=[],
428
- value=[],
429
- label="πŸ“‹ Select Columns to Include",
430
- info=f"Error reading file: {str(e)}"
431
- )
432
-
433
-
434
- def extract_column_names(selected_display_names):
435
- """
436
- Extract actual column names from the numbered display format.
437
- """
438
- if not selected_display_names:
439
- return []
440
-
441
- actual_names = []
442
- for display_name in selected_display_names:
443
- if '. ' in display_name:
444
- actual_name = display_name.split('. ', 1)[1]
445
- actual_names.append(actual_name)
446
- else:
447
- actual_names.append(display_name)
448
 
449
- return actual_names
 
 
 
 
450
 
 
 
 
451
 
452
- def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
453
- text_column, recommendation_column, output_format, analyze_data, selected_columns):
454
- """
455
- Main processing function for Gradio interface.
456
- """
457
  try:
458
- # Extract actual column names from display format
459
- actual_column_names = extract_column_names(selected_columns)
460
-
461
- # Initialize transformer
462
- transformer = FeedbackTransformer(
463
- topic_prefix=topic_prefix,
464
- sentiment_prefix=sentiment_prefix,
465
- category_prefix=category_prefix,
466
- text_column=text_column,
467
- recommendation_column=recommendation_column
468
- )
469
-
470
- # Load data
471
- rows, cols = transformer.load_data(file_obj)
472
- status_msg = f"βœ… Loaded {rows} rows and {cols} columns\n"
473
-
474
- # Set selected columns for inclusion
475
- transformer.set_selected_columns(actual_column_names)
476
- status_msg += f"πŸ“‹ Selected {len(actual_column_names)} original columns for inclusion\n"
477
- if actual_column_names:
478
- status_msg += f" Selected columns: {', '.join(actual_column_names)}\n"
479
-
480
- # Identify columns
481
- col_info = transformer.identify_columns()
482
- status_msg += f"\nπŸ“Š Found columns:\n"
483
- status_msg += f"- Topic columns: {len(col_info['topic_cols'])}\n"
484
- status_msg += f"- Sentiment columns: {len(col_info['sentiment_cols'])}\n"
485
- status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"
486
- status_msg += f"- Verbatim column: {col_info['verbatim_column']}\n"
487
-
488
- # Extract unique topics, categories, and sentiments
489
- num_topics, num_categories, num_sentiments = transformer.extract_unique_topics_and_categories()
490
- status_msg += f"\n🎯 Found {num_topics} unique topics\n"
491
- status_msg += f"🏷️ Found {num_categories} unique categories\n"
492
- status_msg += f"πŸ’­ Found {num_sentiments} unique sentiments\n"
493
-
494
- # Transform data
495
- shape = transformer.transform_data()
496
- status_msg += f"\n✨ Transformed data shape: {shape[0]} rows Γ— {shape[1]} columns\n"
497
- status_msg += f"πŸ“Š Binary matrix created with T_, S_, C_ prefixes and Verbatim sentiment columns\n"
498
- status_msg += f"πŸ”§ T_ columns: 1 if topic present in ABSA column, 0 otherwise\n"
499
- status_msg += f"πŸ”§ S_ columns: contain actual sentiment values (not 1/0)\n"
500
- status_msg += f"πŸ”§ C_ columns: 1 if category assigned, 0 otherwise\n"
501
- status_msg += f"πŸ”§ Verbatim_Positive/Neutral/Negative: 1 if respective sentiment found in ABSA, 0 otherwise\n"
502
-
503
- # Analyze if requested
504
- analysis_result = ""
505
- if analyze_data:
506
- analysis_result = transformer.analyze_data()
507
-
508
- # Save transformed data
509
- output_file = transformer.save_transformed_data(output_format)
510
- status_msg += f"\nπŸ’Ύ File saved successfully: {os.path.basename(output_file)}\n"
511
- #status_msg += f"πŸ“₯ File download should start automatically\n"
512
-
513
- return status_msg, analysis_result, output_file
514
-
515
  except Exception as e:
516
- error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
517
- return error_msg, "", None
 
518
 
519
-
520
- # Create Gradio interface
521
- with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
522
- .column-selector .form-check {
523
- display: block !important;
524
- margin-bottom: 8px !important;
525
- }
526
- .column-selector .form-check-input {
527
- margin-right: 8px !important;
528
- }
529
- """) as demo:
530
  gr.Markdown("""
531
- # πŸ“Š Binary Matrix Feedback Transformer
532
- Transform feedback data with delimited topic and sentiment columns into binary matrix format.
533
-
534
- ### πŸ”§ Processing Logic:
535
- - **Verbatim_Positive/Neutral/Negative**: Set to 1 if respective sentiment is found in ABSA column, 0 otherwise
536
- - **T_ Columns**: Set to 1 if topic is present in ABSA column, 0 otherwise
537
- - **S_ Columns**: One column per topic (e.g., S_Allgemeine_Zufriedenheit) containing actual sentiment values
538
- - **C_ Columns**: Set to 1 if category is assigned, 0 otherwise
539
-
540
- ### πŸ“‹ Data Format Requirements:
541
- - **Topics**: Delimited by `|` (pipe) in "Topics:" columns (optional)
542
- - **Sentiments**: Format `Topic::Sentiment|Topic2::Sentiment2` in ABSA columns
543
- - **Categories**: Delimited by `|` (pipe) in "Categories:" columns
544
-
545
- ### πŸ†• Key Logic:
546
- - **Verbatim_** columns detect overall sentiment presence regardless of topic
547
- - **T_** columns based on ABSA column presence (topics that have sentiment data)
548
- - **S_** columns contain actual sentiment values (not binary 1/0)
549
- - No automatic column renaming for "Topic:" prefix
550
  """)
551
-
552
  with gr.Row():
553
  with gr.Column(scale=1):
554
- # File upload
555
- gr.Markdown("### πŸ“‹ 1. Source file upload")
556
- input_file = gr.File(
557
  label="Upload Input File",
558
- file_types=[".xlsx", ".xls", ".csv", ".txt"],
559
  type="filepath"
560
  )
561
-
562
- # Combined column selector
563
- gr.Markdown("### πŸ“‹ 2. Column Selection")
 
 
 
 
 
 
 
 
 
 
 
564
  column_selector = gr.CheckboxGroup(
 
565
  choices=[],
566
  value=[],
567
- label="Select Columns to Include",
568
- info="Upload a file first to see available columns"
 
569
  )
570
-
571
- with gr.Column(scale=1):
572
- # Configuration parameters
573
- gr.Markdown("### βš™οΈ 3. Configuration")
574
-
575
- topic_prefix = gr.Textbox(
576
- label="Topic Column Identifier",
577
- value="Topic:",
578
- info="Text to identify topic columns (for reference only)"
579
  )
580
-
581
- sentiment_prefix = gr.Textbox(
582
- label="Sentiment Column Prefix (ABSA)",
583
- value="ABSA:",
584
- info="Prefix to identify sentiment columns (format: Topic::Sentiment)"
585
- )
586
-
587
- category_prefix = gr.Textbox(
588
- label="Category Column Prefix",
589
- value="Categories:",
590
- info="Prefix to identify category columns"
591
- )
592
-
593
- text_column = gr.Textbox(
594
- label="Text/Verbatim Column Pattern",
595
- value="TEXT",
596
- info="Pattern to identify verbatim text column (for reference only)"
597
- )
598
-
599
- recommendation_column = gr.Textbox(
600
- label="Recommendation Column Name",
601
- value="Q4_Weiterempfehlung",
602
- info="Column containing recommendation scores (for reference only)"
603
- )
604
-
605
- output_format = gr.Radio(
606
  label="Output Format",
607
- choices=["xlsx", "csv"],
608
- value="xlsx"
609
  )
610
-
611
- analyze_checkbox = gr.Checkbox(
612
- label="Analyze transformed data",
613
- value=True
 
 
614
  )
615
-
616
- # Transform button
617
- transform_btn = gr.Button("πŸ”„ 4. Transform to Binary Matrix & Download", variant="primary", size="lg")
618
-
619
- # Output sections
620
- with gr.Row():
621
- with gr.Column():
622
- status_output = gr.Textbox(
 
 
 
 
 
 
 
 
 
 
 
 
623
  label="Processing Status",
624
- lines=12,
625
- interactive=False
626
- )
627
-
628
- with gr.Column():
629
- analysis_output = gr.Markdown(
630
- label="Data Analysis"
631
  )
632
-
633
- # Download section
634
- with gr.Row():
635
- with gr.Column():
636
  gr.Markdown("### πŸ“₯ Download Status")
637
  gr.Markdown("Please click on the link inside the output file size value to download the transformed file (the number value on the right hand side below). You may need to right click and select Save Link As (or something similar)")
638
  output_file = gr.File(
@@ -640,64 +391,45 @@ with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
640
  interactive=False,
641
  visible=True
642
  )
643
-
644
- # Event handlers
645
- input_file.change(
646
- fn=get_column_selector,
647
- inputs=[input_file],
 
 
 
 
 
648
  outputs=[column_selector]
649
  )
650
-
651
- transform_btn.click(
652
- fn=process_file,
653
- inputs=[
654
- input_file,
655
- topic_prefix,
656
- sentiment_prefix,
657
- category_prefix,
658
- text_column,
659
- recommendation_column,
660
- output_format,
661
- analyze_checkbox,
662
- column_selector
663
- ],
664
- outputs=[status_output, analysis_output, output_file]
665
  )
666
-
667
- # Examples section
668
- gr.Markdown("""
669
- ### πŸ“ Example Transformations:
670
-
671
- **Input Data:**
672
- ```
673
- | feedback_id | ABSA: Sentiments | Categories: Issues |
674
- | 1 | Service::Negative|Quality::Positive | Issues|Support |
675
- ```
676
-
677
- **Output Binary Matrix:**
678
- ```
679
- | feedback_id | Verbatim_Positive | Verbatim_Neutral | Verbatim_Negative | T_Service | T_Quality | S_Service | S_Quality | C_Issues | C_Support |
680
- | 1 | 1 | 0 | 1 | 1 | 1 | Negative | Positive | 1 | 1 |
681
- ```
682
-
683
- ### πŸ’‘ Column Logic:
684
- - **Verbatim_Positive**: 1 if any "Positive"/"Positiv" sentiment found in ABSA
685
- - **Verbatim_Neutral**: 1 if any "Neutral" sentiment found in ABSA
686
- - **Verbatim_Negative**: 1 if any "Negative"/"Negativ" sentiment found in ABSA
687
- - **T_[topic_name]**: 1 if topic exists in ABSA column, 0 otherwise
688
- - **S_[topic_name]**: Actual sentiment value for that topic (e.g., "Positive", "Negative")
689
- - **C_[category_name]**: 1 if category is assigned, 0 otherwise
690
- - Safe column names (special characters replaced with underscores)
691
-
692
- ### πŸ” Key Changes Made:
693
- - **NEW**: Added Verbatim_Positive, Verbatim_Neutral, Verbatim_Negative columns
694
- - These columns are set to 1 if the respective sentiment is found anywhere in the ABSA column
695
- - Supports both English (Positive/Negative/Neutral) and German (Positiv/Negativ) sentiment detection
696
- - Removed automatic "Topic:" column renaming logic
697
- - T_ columns are now binary (1/0) based on topic existence in ABSA column
698
- - Topics are extracted from ABSA sentiment data for T_ column creation
699
  """)
700
 
701
- # Launch the app
702
  if __name__ == "__main__":
703
- demo.launch()
 
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
  import os
 
 
 
 
 
5
  import re
6
+ import tempfile
7
+ import shutil
8
+ from datetime import datetime
9
+ from typing import List, Tuple, Dict, Any
10
+ import json
11
+ from io import BytesIO
12
+
13
class CSVBinaryTransformer:
    """Transforms an uploaded CSV of feedback into binary topic/category columns."""

    def __init__(self):
        # Raw dataframe from the uploaded CSV; populated by load_csv().
        self.df = None
        # Base name (without extension) of the uploaded file.
        self.original_filename = None
        # Holds the transformed dataframe once processing has run.
        self.processed_df = None
18
+
19
def load_csv(self, file_path: str) -> Tuple[pd.DataFrame, List[str]]:
    """Load a CSV file; return a 10-row preview and checkbox column choices.

    Side effects: stores the full dataframe on ``self.df`` and the file's
    base name (no extension) on ``self.original_filename``. Any failure is
    surfaced to the UI as a ``gr.Error``.
    """
    try:
        frame = pd.read_csv(file_path)
        self.df = frame
        self.original_filename = os.path.splitext(os.path.basename(file_path))[0]
        # (label, value) tuples for the Gradio CheckboxGroup.
        column_choices = [(name, name) for name in frame.columns.tolist()]
        return frame.head(10), column_choices
    except Exception as e:
        raise gr.Error(f"Error loading CSV: {str(e)}")
31
+
32
def select_all_columns(self) -> List[str]:
    """Return every loaded column name (backs the "select all" button)."""
    return list(self.df.columns) if self.df is not None else []
37
+
38
def deselect_all_columns(self) -> List[str]:
    """Return an empty selection (backs the "deselect all" button)."""
    return []
41
+
42
def process_absa_columns(self, df: pd.DataFrame) -> pd.DataFrame:
    """Process ABSA prefixed columns to create sentiment and topic columns.

    Each ABSA cell holds ``Topic::Sentiment`` pairs separated by ``|``.
    Adds three binary ``Verbatim_Positive/Neutral/Negative`` columns (1 when
    the sentiment word appears in that row's ABSA data) plus one
    ``S_<topic>`` column per topic holding the raw sentiment string.

    Mutates and returns *df*. Improvements over the previous version: the
    cells are parsed once instead of twice, and the S_ columns are created
    in sorted order so the output layout is deterministic.
    """
    absa_columns = [col for col in df.columns if col.startswith('ABSA')]
    if not absa_columns:
        return df

    df['Verbatim_Positive'] = 0
    df['Verbatim_Neutral'] = 0
    df['Verbatim_Negative'] = 0

    # Single parse pass: remember every (row, topic, sentiment) assignment
    # so the columns can be created once, without re-parsing every cell.
    assignments = []  # (row index, safe topic name, sentiment value)
    topics = set()
    for col in absa_columns:
        for idx, value in df[col].items():
            if pd.isna(value):
                continue
            pairs = [p.strip() for p in str(value).split('|') if p.strip()]
            for pair in pairs:
                if '::' not in pair:
                    continue
                topic_part, sentiment_part = pair.split('::', 1)
                topic = topic_part.strip()
                sentiment = sentiment_part.strip()
                if not (topic and sentiment):
                    continue
                # Clean topic name for column naming.
                safe_topic = re.sub(r'[^\w]', '_', topic).strip('_')
                topics.add(safe_topic)
                assignments.append((idx, safe_topic, sentiment))

                # Verbatim flags: first matching word wins, as before.
                sentiment_lower = sentiment.lower()
                if 'positive' in sentiment_lower:
                    df.at[idx, 'Verbatim_Positive'] = 1
                elif 'negative' in sentiment_lower:
                    df.at[idx, 'Verbatim_Negative'] = 1
                elif 'neutral' in sentiment_lower:
                    df.at[idx, 'Verbatim_Neutral'] = 1

    # Create per-topic sentiment columns deterministically, then fill them.
    for safe_topic in sorted(topics):
        col_name = f"S_{safe_topic}"
        if col_name not in df.columns:
            df[col_name] = ""
    for idx, safe_topic, sentiment in assignments:
        df.at[idx, f"S_{safe_topic}"] = sentiment

    return df
117
+
118
def process_categories_columns(self, df: pd.DataFrame) -> pd.DataFrame:
    """Process Categories prefixed columns to create binary category columns.

    Category cells hold names separated by ``,``, ``;`` or ``|``. Adds one
    ``C_<category>`` column per distinct name, set to 1 on rows whose cell
    contains that exact name (case-insensitive). Mutates and returns *df*.

    Bug fix: the previous substring test (``category.lower() in
    value.lower()``) wrongly flagged e.g. category "Art" on a row whose
    cell only contained "Artisan"; matching is now token-exact.
    """
    category_columns = [col for col in df.columns if col.startswith('Categories')]
    if not category_columns:
        return df

    def _tokens(cell):
        # Split a raw cell into trimmed, non-empty category names.
        return [t.strip() for t in re.split(r'[,;|]', cell) if t.strip()]

    # Tokenize every cell once; row_tokens[idx] is the set of lowercased
    # category names found anywhere in that row's category columns.
    all_categories = set()
    row_tokens = {}
    for col in category_columns:
        for idx, value in df[col].items():
            if pd.isna(value) or not isinstance(value, str):
                continue
            names = _tokens(value)
            all_categories.update(names)
            row_tokens.setdefault(idx, set()).update(n.lower() for n in names)

    # One binary column per category, created in sorted (deterministic) order.
    for category in sorted(all_categories):
        wanted = category.lower()
        df[f"C_{category}"] = [
            1 if wanted in row_tokens.get(idx, ()) else 0 for idx in df.index
        ]

    return df
151
+
152
+ def process_topics_column(self, df: pd.DataFrame, topics_column: str) -> pd.DataFrame:
153
+ """Process specified topics column to create binary topic columns"""
154
+ if not topics_column or topics_column not in df.columns:
155
+ return df
156
+
157
+ # Collect all unique topics
158
+ all_topics = set()
159
+
160
+ for value in df[topics_column].dropna():
161
+ if isinstance(value, str):
162
+ # Split by common delimiters
163
+ topics = re.split(r'[,;|]', value)
164
+ for topic in topics:
165
+ topic = topic.strip()
166
+ if topic:
167
+ all_topics.add(topic)
168
+
169
+ # Create binary columns for each topic
170
+ for topic in all_topics:
171
+ col_name = f"T_{topic}"
172
+ df[col_name] = 0
173
+
174
+ for idx, value in df[topics_column].items():
175
+ if pd.isna(value):
176
+ continue
177
+ if isinstance(value, str) and topic.lower() in value.lower():
178
+ df.at[idx, col_name] = 1
179
+
180
+ return df
181
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  def save_transformed_data(self, output_format='xlsx'):
183
+ """Save the transformed data and return the file path - using exact same method as working version"""
184
+ if self.processed_df is None:
 
 
185
  raise ValueError("No transformed data to save")
186
 
187
  # Create filename with original filename prefix and timestamp
188
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
189
+ prefix = self.original_filename if self.original_filename else 'transformed_data'
190
 
191
  if output_format == 'xlsx':
192
+ filename = f"{prefix}_BinaryTransformation_{timestamp}.xlsx"
193
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
194
+ self.processed_df.to_excel(temp_file.name, index=False)
195
  temp_file.close()
196
 
197
  final_path = os.path.join(tempfile.gettempdir(), filename)
 
200
  os.rename(temp_file.name, final_path)
201
 
202
  else: # csv
203
+ filename = f"{prefix}_BinaryTransformation_{timestamp}.csv"
204
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
205
+ self.processed_df.to_csv(temp_file.name, index=False)
206
  temp_file.close()
207
 
208
  final_path = os.path.join(tempfile.gettempdir(), filename)
 
215
 
216
  return final_path
217
 
218
# Single module-level transformer instance shared by every Gradio callback
# below. NOTE(review): this is global state — concurrent users of the app
# share one instance, so one user's upload can clobber another's; confirm
# this is acceptable for the intended deployment.
transformer = CSVBinaryTransformer()
220
 
221
def handle_file_upload(file):
    """Gradio callback: load the uploaded CSV and populate the column picker.

    Args:
        file: value from the gr.File input — a filesystem path string with
            type="filepath", or (on older Gradio versions) a tempfile
            wrapper exposing a .name attribute.

    Returns:
        (preview_html, checkbox_update, status_message) for the original
        preview HTML, the column CheckboxGroup, and the status textbox.
    """
    if file is None:
        return None, gr.update(choices=[], value=[]), "Please upload a CSV file"

    try:
        # gr.File(type="filepath") passes a plain string; accept both the
        # string and the legacy object form (same pattern as load_data in
        # the FeedbackTransformer class above).
        path = file if isinstance(file, str) else file.name
        preview_df, column_choices = transformer.load_csv(path)
        preview_html = preview_df.to_html(classes="table table-striped", table_id="upload-preview")

        # Reveal the checkbox group with the freshly-loaded columns and no
        # pre-selection.
        return (
            preview_html,
            gr.update(choices=column_choices, value=[], visible=True),
            f"βœ… Successfully loaded CSV with {len(transformer.df)} rows and {len(transformer.df.columns)} columns"
        )
    except Exception as e:
        # Surface the failure in the status box and hide the column picker.
        return None, gr.update(choices=[], value=[], visible=False), f"❌ Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
def select_all():
    """Return a CheckboxGroup update that ticks every loaded column."""
    if transformer.df is None:
        # Nothing loaded yet — clear instead of selecting.
        return gr.update(value=[])
    return gr.update(value=transformer.select_all_columns())
246
 
247
def deselect_all():
    """Return a CheckboxGroup update that clears every selection."""
    cleared = transformer.deselect_all_columns()
    return gr.update(value=cleared)
250
 
251
def process_transformation(selected_columns, topics_column, export_format):
    """Gradio callback: run all transformations and save the result file.

    Args:
        selected_columns: columns ticked in the CheckboxGroup.
        topics_column: name of the column holding topics (may be empty).
        export_format: radio label, "CSV (.csv)" or "Excel (.xlsx)".

    Returns:
        (preview_html, output_file_path, status_message); the first two are
        None when validation fails or an exception occurs.
    """
    try:
        if transformer.df is None:
            return None, None, "❌ Error: No CSV file loaded"

        if not selected_columns:
            return None, None, "❌ Error: Please select at least one column"

        # Work on a copy restricted to the user's column selection, then
        # apply each transformation stage in turn.
        processed_df = transformer.df[selected_columns].copy()
        processed_df = transformer.process_absa_columns(processed_df)
        processed_df = transformer.process_categories_columns(processed_df)
        processed_df = transformer.process_topics_column(processed_df, topics_column)

        # Store for save_transformed_data, which reads self.processed_df.
        transformer.processed_df = processed_df

        preview_html = processed_df.head(20).to_html(classes="table table-striped", table_id="preview-table")

        # BUG FIX: the old label mangling
        # export_format.lower().replace(' (.', '').replace(')', '')
        # turned "Excel (.xlsx)" into "excelxlsx", which never matched
        # 'xlsx' in save_transformed_data, so Excel exports silently fell
        # through to the CSV branch. Detect the format explicitly instead.
        fmt = 'xlsx' if 'xlsx' in export_format.lower() else 'csv'
        output_file = transformer.save_transformed_data(fmt)

        success_msg = f"βœ… Transformation completed! Generated file: {os.path.basename(output_file)}"
        success_msg += f"\nπŸ“Š Processed {len(transformer.processed_df)} rows with {len(transformer.processed_df.columns)} columns"
        success_msg += f"\nπŸ’Ύ File saved successfully"
        success_msg += f"\nπŸ“₯ File download should start automatically"

        return preview_html, output_file, success_msg

    except Exception as e:
        import traceback
        error_msg = f"❌ Error during transformation: {str(e)}\n\n{traceback.format_exc()}"
        return None, None, error_msg
292
 
293
+ # Create Gradio interface - using similar structure to working version
294
+ with gr.Blocks(title="CSV Binary Transformation Tool", theme=gr.themes.Soft()) as app:
 
 
 
 
 
 
 
 
 
295
  gr.Markdown("""
296
+ # πŸ“Š CSV Binary Transformation Tool
297
+
298
+ This tool transforms CSV files by creating binary columns for sentiment analysis, categories, and topics.
299
+
300
+ ## Features:
301
+ - **ABSA Processing**: Creates sentiment columns and topic-sentiment combinations
302
+ - **Category Processing**: Creates binary columns for each category
303
+ - **Topic Processing**: Creates binary columns for each topic
304
+ - **Flexible Export**: Support for CSV and Excel formats
 
 
 
 
 
 
 
 
 
 
305
  """)
306
+
307
  with gr.Row():
308
  with gr.Column(scale=1):
309
+ # File upload section
310
+ gr.Markdown("### 1. Upload CSV File")
311
+ file_input = gr.File(
312
  label="Upload Input File",
313
+ file_types=[".csv"],
314
  type="filepath"
315
  )
316
+ upload_status = gr.Textbox(
317
+ label="Upload Status",
318
+ interactive=False,
319
+ lines=2
320
+ )
321
+
322
+ # Column selection section
323
+ gr.Markdown("### 2. Select Columns")
324
+ gr.Markdown("*Choose which columns from your CSV to include in the output file*")
325
+
326
+ with gr.Row():
327
+ select_all_btn = gr.Button("βœ“ Select All", size="sm", variant="secondary")
328
+ deselect_all_btn = gr.Button("βœ— Deselect All", size="sm", variant="secondary")
329
+
330
  column_selector = gr.CheckboxGroup(
331
+ label="Choose columns to include in output",
332
  choices=[],
333
  value=[],
334
+ interactive=True,
335
+ visible=False,
336
+ info="Select the columns you want to include in the transformed output file"
337
  )
338
+
339
+ # Topics column input
340
+ gr.Markdown("### 3. Specify Topics Column")
341
+ topics_column_input = gr.Textbox(
342
+ label="Topics Column Name",
343
+ placeholder="Enter the name of the column containing topics",
344
+ info="This column will be used to create T_<topic> binary columns"
 
 
345
  )
346
+
347
+ # Export options
348
+ gr.Markdown("### 4. Export Settings")
349
+ export_format = gr.Radio(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  label="Output Format",
351
+ choices=["CSV (.csv)", "Excel (.xlsx)"],
352
+ value="Excel (.xlsx)"
353
  )
354
+
355
+ # Process button
356
+ process_btn = gr.Button(
357
+ "πŸš€ Transform Data",
358
+ variant="primary",
359
+ size="lg"
360
  )
361
+
362
+ with gr.Column(scale=2):
363
+ # Preview sections
364
+ gr.Markdown("### File Preview")
365
+
366
+ with gr.Tabs():
367
+ with gr.Tab("Original Data"):
368
+ original_preview = gr.HTML(
369
+ label="Original Data Preview (First 10 rows)",
370
+ value="<p>No file uploaded yet</p>"
371
+ )
372
+
373
+ with gr.Tab("Transformed Data"):
374
+ transformed_preview = gr.HTML(
375
+ label="Transformed Data Preview (First 20 rows)",
376
+ value="<p>No transformation performed yet</p>"
377
+ )
378
+
379
+ # Status and download
380
+ process_status = gr.Textbox(
381
  label="Processing Status",
382
+ interactive=False,
383
+ lines=6
 
 
 
 
 
384
  )
385
+
386
+ # Download section - using exact same setup as working version
 
 
387
  gr.Markdown("### πŸ“₯ Download Status")
388
  gr.Markdown("Please click on the link inside the output file size value to download the transformed file (the number value on the right hand side below). You may need to right click and select Save Link As (or something similar)")
389
  output_file = gr.File(
 
391
  interactive=False,
392
  visible=True
393
  )
394
+
395
+ # Event handlers - same pattern as working version
396
+ file_input.change(
397
+ fn=handle_file_upload,
398
+ inputs=[file_input],
399
+ outputs=[original_preview, column_selector, upload_status]
400
+ )
401
+
402
+ select_all_btn.click(
403
+ fn=select_all,
404
  outputs=[column_selector]
405
  )
406
+
407
+ deselect_all_btn.click(
408
+ fn=deselect_all,
409
+ outputs=[column_selector]
 
 
 
 
 
 
 
 
 
 
 
410
  )
411
+
412
+ process_btn.click(
413
+ fn=process_transformation,
414
+ inputs=[column_selector, topics_column_input, export_format],
415
+ outputs=[transformed_preview, output_file, process_status]
416
+ )
417
+
418
+ # Add custom CSS for better table styling
419
+ app.load(js="""
420
+ function() {
421
+ const style = document.createElement('style');
422
+ style.textContent = `
423
+ .table { font-size: 12px; }
424
+ .table th, .table td { padding: 4px 8px; }
425
+ #upload-preview, #preview-table { max-height: 400px; overflow-y: auto; }
426
+ `;
427
+ document.head.appendChild(style);
428
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  """)
430
 
 
431
if __name__ == "__main__":
    # share=True publishes a temporary public Gradio link in addition to the
    # local server; max_file_size caps uploads at 50 MB.
    app.launch(
        share=True,
        max_file_size="50mb"
    )