Spaces:

sandsiv
/

spss_formatting

Sleeping

App Files Files Community

geekgirl3 committed on Jul 4

Commit

19651ed

verified ·

1 Parent(s): 42fdecf

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -85

app.py CHANGED Viewed

@@ -42,7 +42,6 @@ class FeedbackTransformer:
         self.original_filename = None
         self.selected_columns = []
         self.verbatim_column = None  # Store the verbatim/text column
-        self.dynamic_topic_prefix = None  # Store dynamically extracted topic prefix
     def load_data(self, file_obj):
         """
@@ -75,37 +74,6 @@ class FeedbackTransformer:
         return len(self.data), len(self.data.columns)
-    def extract_topic_prefix_from_category(self):
-        """
-        Extract the topic prefix from a column containing "Category:"
-        by finding text between "Category:" and "("
-        """
-        # Look for columns containing "Category:"
-        category_pattern_cols = [col for col in self.data.columns if "Category:" in col]
-        if category_pattern_cols:
-            # Use the first matching column
-            category_col = category_pattern_cols[0]
-            # Try to extract from column name first
-            match = re.search(r'Category:\s*([^(]+)\s*\(', category_col)
-            if match:
-                extracted_prefix = match.group(1).strip() + ":"
-                self.dynamic_topic_prefix = extracted_prefix
-                return extracted_prefix
-            # If not found in column name, try to extract from column values
-            for value in self.data[category_col].dropna():
-                if isinstance(value, str):
-                    match = re.search(r'Category:\s*([^(]+)\s*\(', value)
-                    if match:
-                        extracted_prefix = match.group(1).strip() + ":"
-                        self.dynamic_topic_prefix = extracted_prefix
-                        return extracted_prefix
-        # If no match found, return None
-        return None
     def identify_columns(self):
         """
         Identify topic, category, and sentiment columns in the data.
@@ -113,22 +81,8 @@ class FeedbackTransformer:
         if self.data is None:
             raise ValueError("Data not loaded")
-        # First try to extract topic prefix dynamically
-        extracted_prefix = self.extract_topic_prefix_from_category()
-        # Use dynamic prefix if found, otherwise use the provided topic_prefix
-        topic_identifier = extracted_prefix if extracted_prefix else self.topic_prefix
-        # Log the prefix being used
-        print(f"Using topic prefix: '{topic_identifier}'")
         # Extract columns based on prefixes
-        # For topic columns, use the dynamic or provided prefix
-        if topic_identifier:
-            self.topic_cols = [col for col in self.data.columns if topic_identifier in col]
-        else:
-            self.topic_cols = [col for col in self.data.columns if "Topic:" in col]
         self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
         self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]
@@ -147,8 +101,7 @@ class FeedbackTransformer:
             'sentiment_cols': self.sentiment_cols,
             'category_cols': self.category_cols,
             'all_columns': all_cols,
-            'verbatim_column': self.verbatim_column,
-            'dynamic_topic_prefix': self.dynamic_topic_prefix
         }
     def extract_unique_topics_and_categories(self):
@@ -359,13 +312,7 @@ class FeedbackTransformer:
         analysis_text += f"Topic columns (T_): {len(topic_cols)}\n"
         analysis_text += f"Sentiment columns (S_): {len(sentiment_cols)}\n"
         analysis_text += f"Category columns (C_): {len(category_cols)}\n"
-        analysis_text += f"Verbatim column used: {self.verbatim_column}\n"
-        # Add dynamic topic prefix info
-        if self.dynamic_topic_prefix:
-            analysis_text += f"Dynamic topic prefix extracted: '{self.dynamic_topic_prefix}'\n\n"
-        else:
-            analysis_text += f"Topic prefix used: '{self.topic_prefix}'\n\n"
         if self.selected_columns:
             analysis_text += f"**Included Original Columns:** {', '.join(self.selected_columns)}\n\n"
@@ -538,10 +485,6 @@ def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
         status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"
         status_msg += f"- Verbatim column: {col_info['verbatim_column']}\n"
-        # Add dynamic topic prefix info
-        if col_info.get('dynamic_topic_prefix'):
-            status_msg += f"- Dynamic topic prefix extracted: '{col_info['dynamic_topic_prefix']}'\n"
         # Extract unique topics, categories, and sentiments
         num_topics, num_categories, num_sentiments = transformer.extract_unique_topics_and_categories()
         status_msg += f"\n🎯 Found {num_topics} unique topics\n"
@@ -589,22 +532,21 @@ with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
     Transform feedback data with delimited topic and sentiment columns into binary matrix format.
     ### 🔧 Processing Logic:
-    - **Automatic Topic Prefix Detection**: Extracts topic prefix from columns containing "Category:" by finding text between "Category:" and "("
     - **Verbatim_Positive/Neutral/Negative**: Set to 1 if respective sentiment is found in ABSA column, 0 otherwise
     - **T_ Columns**: Set to 1 if topic is present in ABSA column, 0 otherwise
     - **S_ Columns**: One column per topic (e.g., S_Allgemeine_Zufriedenheit) containing actual sentiment values
     - **C_ Columns**: Set to 1 if category is assigned, 0 otherwise
     ### 📋 Data Format Requirements:
-    - **Topics**: Delimited by `|` (pipe) in columns identified by dynamic or manual prefix
     - **Sentiments**: Format `Topic::Sentiment|Topic2::Sentiment2` in ABSA columns
     - **Categories**: Delimited by `|` (pipe) in "Categories:" columns
-    ### 🆕 Key Features:
-    - **Dynamic Topic Prefix Extraction**: Automatically extracts topic prefix from "Category:" columns
     - **Verbatim_** columns detect overall sentiment presence regardless of topic
     - **T_** columns based on ABSA column presence (topics that have sentiment data)
     - **S_** columns contain actual sentiment values (not binary 1/0)
     """)
     with gr.Row():
@@ -631,9 +573,9 @@ with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
             gr.Markdown("### ⚙️ 3. Configuration")
             topic_prefix = gr.Textbox(
-                label="Topic Column Identifier (Fallback)",
                 value="Topic:",
-                info="Fallback identifier if dynamic extraction from Category: column fails"
             )
             sentiment_prefix = gr.Textbox(
@@ -726,36 +668,37 @@ with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
     gr.Markdown("""
     ### 📝 Example Transformations:
-    **Input Data with Dynamic Topic Extraction:**
     ```
-    | Column: "Category: Service (ABC)" | ABSA: Sentiments | Categories: Issues |
     | 1 | Service::Negative|Quality::Positive | Issues|Support |
     ```
-    **System will:**
-    1. Extract "Service:" from "Category: Service (ABC)" column
-    2. Use "Service:" to identify topic columns instead of "Topic:"
     **Output Binary Matrix:**
     ```
     | feedback_id | Verbatim_Positive | Verbatim_Neutral | Verbatim_Negative | T_Service | T_Quality | S_Service | S_Quality | C_Issues | C_Support |
     | 1 | 1 | 0 | 1 | 1 | 1 | Negative | Positive | 1 | 1 |
     ```
-    ### 💡 Dynamic Topic Prefix Logic:
-    - Searches for columns containing "Category:"
-    - Extracts text between "Category:" and "(" (e.g., "Service" from "Category: Service (ABC)")
-    - Adds ":" to create the topic prefix (e.g., "Service:")
-    - Uses this prefix to identify topic columns
-    - Falls back to manual "Topic Column Identifier" if extraction fails
-    ### 🔍 Key Changes in This Version:
-    - **NEW**: Automatic extraction of topic prefix from Category columns
-    - Dynamically identifies topic columns based on extracted prefix
-    - Maintains all other functionality (Verbatim columns, T_, S_, C_ logic)
-    - Provides fallback to manual topic prefix if extraction fails
     """)
 # Launch the app
 if __name__ == "__main__":
-    demo.launch(share=True)

         self.original_filename = None
         self.selected_columns = []
         self.verbatim_column = None  # Store the verbatim/text column
     def load_data(self, file_obj):
         """
         return len(self.data), len(self.data.columns)
     def identify_columns(self):
         """
         Identify topic, category, and sentiment columns in the data.
         if self.data is None:
             raise ValueError("Data not loaded")
         # Extract columns based on prefixes
+        self.topic_cols = [col for col in self.data.columns if "Topic:" in col]
         self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
         self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]
             'sentiment_cols': self.sentiment_cols,
             'category_cols': self.category_cols,
             'all_columns': all_cols,
+            'verbatim_column': self.verbatim_column
         }
     def extract_unique_topics_and_categories(self):
         analysis_text += f"Topic columns (T_): {len(topic_cols)}\n"
         analysis_text += f"Sentiment columns (S_): {len(sentiment_cols)}\n"
         analysis_text += f"Category columns (C_): {len(category_cols)}\n"
+        analysis_text += f"Verbatim column used: {self.verbatim_column}\n\n"
         if self.selected_columns:
             analysis_text += f"**Included Original Columns:** {', '.join(self.selected_columns)}\n\n"
         status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"
         status_msg += f"- Verbatim column: {col_info['verbatim_column']}\n"
         # Extract unique topics, categories, and sentiments
         num_topics, num_categories, num_sentiments = transformer.extract_unique_topics_and_categories()
         status_msg += f"\n🎯 Found {num_topics} unique topics\n"
     Transform feedback data with delimited topic and sentiment columns into binary matrix format.
     ### 🔧 Processing Logic:
     - **Verbatim_Positive/Neutral/Negative**: Set to 1 if respective sentiment is found in ABSA column, 0 otherwise
     - **T_ Columns**: Set to 1 if topic is present in ABSA column, 0 otherwise
     - **S_ Columns**: One column per topic (e.g., S_Allgemeine_Zufriedenheit) containing actual sentiment values
     - **C_ Columns**: Set to 1 if category is assigned, 0 otherwise
     ### 📋 Data Format Requirements:
+    - **Topics**: Delimited by `|` (pipe) in "Topics:" columns (optional)
     - **Sentiments**: Format `Topic::Sentiment|Topic2::Sentiment2` in ABSA columns
     - **Categories**: Delimited by `|` (pipe) in "Categories:" columns
+    ### 🆕 Key Logic:
     - **Verbatim_** columns detect overall sentiment presence regardless of topic
     - **T_** columns based on ABSA column presence (topics that have sentiment data)
     - **S_** columns contain actual sentiment values (not binary 1/0)
+    - No automatic column renaming for "Topic:" prefix
     """)
     with gr.Row():
             gr.Markdown("### ⚙️ 3. Configuration")
             topic_prefix = gr.Textbox(
+                label="Topic Column Identifier",
                 value="Topic:",
+                info="Text to identify topic columns (for reference only)"
             )
             sentiment_prefix = gr.Textbox(
     gr.Markdown("""
     ### 📝 Example Transformations:
+    **Input Data:**
     ```
+    | feedback_id | ABSA: Sentiments | Categories: Issues |
     | 1 | Service::Negative|Quality::Positive | Issues|Support |
     ```
     **Output Binary Matrix:**
     ```
     | feedback_id | Verbatim_Positive | Verbatim_Neutral | Verbatim_Negative | T_Service | T_Quality | S_Service | S_Quality | C_Issues | C_Support |
     | 1 | 1 | 0 | 1 | 1 | 1 | Negative | Positive | 1 | 1 |
     ```
+    ### 💡 Column Logic:
+    - **Verbatim_Positive**: 1 if any "Positive"/"Positiv" sentiment found in ABSA
+    - **Verbatim_Neutral**: 1 if any "Neutral" sentiment found in ABSA
+    - **Verbatim_Negative**: 1 if any "Negative"/"Negativ" sentiment found in ABSA
+    - **T_[topic_name]**: 1 if topic exists in ABSA column, 0 otherwise
+    - **S_[topic_name]**: Actual sentiment value for that topic (e.g., "Positive", "Negative")
+    - **C_[category_name]**: 1 if category is assigned, 0 otherwise
+    - Safe column names (special characters replaced with underscores)
+    ### 🔍 Key Changes Made:
+    - **NEW**: Added Verbatim_Positive, Verbatim_Neutral, Verbatim_Negative columns
+    - These columns are set to 1 if the respective sentiment is found anywhere in the ABSA column
+    - Supports both English (Positive/Negative/Neutral) and German (Positiv/Negativ) sentiment detection
+    - Removed automatic "Topic:" column renaming logic
+    - T_ columns are now binary (1/0) based on topic existence in ABSA column
+    - Topics are extracted from ABSA sentiment data for T_ column creation
     """)
 # Launch the app
 if __name__ == "__main__":
+    demo.launch(share=True)