geekgirl3 committed on
Commit
42fdecf
·
verified ·
1 Parent(s): a29b781

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +354 -190
app.py CHANGED
@@ -3,15 +3,16 @@ import pandas as pd
3
  import numpy as np
4
  import os
5
  import traceback
6
- from typing import Tuple, Dict, Any, Optional
7
  import tempfile
8
  import io
9
  import datetime
 
10
 
11
  class FeedbackTransformer:
12
  """
13
- A class to transform feedback data with topic and sentiment columns
14
- into a binary format where each topic is a separate column.
15
  """
16
 
17
  def __init__(self,
@@ -34,9 +35,14 @@ class FeedbackTransformer:
34
  self.sentiment_cols = []
35
  self.category_cols = []
36
  self.unique_topics = set()
 
 
 
37
  self.file_name = None
38
  self.original_filename = None
39
- self.selected_columns = [] # Store columns selected for inclusion
 
 
40
 
41
  def load_data(self, file_obj):
42
  """
@@ -69,6 +75,37 @@ class FeedbackTransformer:
69
 
70
  return len(self.data), len(self.data.columns)
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  def identify_columns(self):
73
  """
74
  Identify topic, category, and sentiment columns in the data.
@@ -76,11 +113,32 @@ class FeedbackTransformer:
76
  if self.data is None:
77
  raise ValueError("Data not loaded")
78
 
 
 
 
 
 
 
 
 
 
79
  # Extract columns based on prefixes
80
- self.topic_cols = [col for col in self.data.columns if self.topic_prefix in col]
 
 
 
 
 
81
  self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
82
  self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]
83
 
 
 
 
 
 
 
 
84
  # If no columns found with specified prefixes, return all columns for manual selection
85
  all_cols = list(self.data.columns)
86
 
@@ -88,44 +146,57 @@ class FeedbackTransformer:
88
  'topic_cols': self.topic_cols,
89
  'sentiment_cols': self.sentiment_cols,
90
  'category_cols': self.category_cols,
91
- 'all_columns': all_cols
 
 
92
  }
93
 
94
- def extract_unique_topics(self):
95
  """
96
- Extract all unique topics from the topic columns.
97
  """
98
  self.unique_topics = set()
 
 
 
99
 
100
- # Extract from topic columns
101
  for col in self.topic_cols:
102
- self.unique_topics.update(self.data[col].dropna().unique())
 
 
 
 
103
 
104
- # Also extract from category columns if they exist
105
  for col in self.category_cols:
106
- self.unique_topics.update(self.data[col].dropna().unique())
107
-
108
- # Remove empty topics
109
- self.unique_topics = {t for t in self.unique_topics if isinstance(t, str) and t.strip()}
110
-
111
- return len(self.unique_topics)
112
-
113
- @staticmethod
114
- def create_column_name(topic):
115
- """
116
- Create a standardized column name from a topic string.
117
- """
118
- # Remove special characters and standardize
119
- topic_clean = str(topic).strip()
120
- # Remove brackets and special characters
121
- topic_clean = topic_clean.replace('[', '').replace(']', '').replace('(', '').replace(')', '')
122
- topic_clean = topic_clean.replace('**', '').replace('*', '')
123
- topic_clean = topic_clean.replace('.', '_').replace(' ', '_').replace('&', 'and')
124
- topic_clean = topic_clean.replace(':', '_').replace('-', '_').replace('/', '_')
125
- # Remove multiple underscores
126
- while '__' in topic_clean:
127
- topic_clean = topic_clean.replace('__', '_')
128
- return topic_clean.lower().strip('_')
 
 
 
 
129
 
130
  def set_selected_columns(self, selected_columns):
131
  """
@@ -135,10 +206,10 @@ class FeedbackTransformer:
135
 
136
  def transform_data(self):
137
  """
138
- Transform the data into binary topic columns with sentiment values.
139
  """
140
- if not self.unique_topics:
141
- self.extract_unique_topics()
142
 
143
  # Create output dataframe starting with feedback_id
144
  self.transformed_data = pd.DataFrame({'feedback_id': range(1, len(self.data) + 1)})
@@ -148,49 +219,99 @@ class FeedbackTransformer:
148
  if col in self.data.columns:
149
  self.transformed_data[col] = self.data[col]
150
 
151
- # Initialize all topic columns to 0
 
 
 
 
 
152
  for topic in sorted(self.unique_topics):
153
- topic_col = self.create_column_name(topic)
154
- self.transformed_data[topic_col] = 0
155
- self.transformed_data[f'{topic_col}_sentiment'] = None
156
 
157
- # Fill in the data from topic columns
 
 
 
 
 
 
 
 
 
 
 
 
158
  for idx, row in self.data.iterrows():
159
- # Process topic columns with sentiments
160
- for i, t_col in enumerate(self.topic_cols):
161
- topic = row.get(t_col)
162
-
163
- # Find corresponding sentiment column
164
- if i < len(self.sentiment_cols):
165
- sentiment = row.get(self.sentiment_cols[i])
166
- else:
167
- sentiment = None
168
-
169
- if pd.notna(topic) and isinstance(topic, str) and topic.strip():
170
- topic_col = self.create_column_name(topic)
171
- if topic_col in self.transformed_data.columns:
172
- self.transformed_data.loc[idx, topic_col] = 1
173
-
174
- # Convert sentiment to numeric value
175
- if pd.notna(sentiment) and isinstance(sentiment, str):
176
- sentiment_lower = sentiment.lower()
177
- if 'positive' in sentiment_lower:
178
- self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 1
179
- elif 'negative' in sentiment_lower:
180
- self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 0
181
- elif 'neutral' in sentiment_lower:
182
- self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 0.5
183
-
184
- # Process category columns (these typically don't have sentiments)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  for c_col in self.category_cols:
186
- category = row.get(c_col)
187
- if pd.notna(category) and isinstance(category, str) and category.strip():
188
- category_col = self.create_column_name(category)
189
- if category_col in self.transformed_data.columns:
190
- self.transformed_data.loc[idx, category_col] = 1
 
 
 
 
 
 
191
 
192
  return self.transformed_data.shape
193
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  def analyze_data(self):
195
  """
196
  Analyze the transformed data to provide insights.
@@ -198,107 +319,117 @@ class FeedbackTransformer:
198
  if self.transformed_data is None:
199
  raise ValueError("No transformed data to analyze")
200
 
201
- # Identify topic columns (exclude feedback_id, selected original columns, and sentiment columns)
202
- excluded_cols = ['feedback_id'] + self.selected_columns
203
- topic_cols = [col for col in self.transformed_data.columns
204
- if col not in excluded_cols and not col.endswith('_sentiment')]
205
-
206
- # Count occurrences of each topic
207
- topic_counts = {}
208
- for topic in topic_cols:
209
- topic_counts[topic] = self.transformed_data[topic].sum()
210
-
211
- # Sort topics by frequency
212
- sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
  # Prepare analysis summary
215
  analysis_text = f"**Analysis Results**\n\n"
216
  analysis_text += f"Total feedbacks: {len(self.transformed_data)}\n"
217
  analysis_text += f"Selected original columns: {len(self.selected_columns)}\n"
218
- analysis_text += f"Unique topics: {len(topic_cols)}\n\n"
 
 
 
 
 
 
 
 
 
 
219
 
220
  if self.selected_columns:
221
  analysis_text += f"**Included Original Columns:** {', '.join(self.selected_columns)}\n\n"
222
 
223
- analysis_text += "**Top 10 Most Frequent Topics:**\n"
224
- for topic, count in sorted_topics[:10]:
225
- analysis_text += f"- {topic}: {count} occurrences\n"
226
-
227
- # Calculate sentiment distributions for top topics
228
- analysis_text += "\n**Sentiment Distributions for Top 5 Topics:**\n"
229
- for topic, _ in sorted_topics[:5]:
230
- sentiment_col = f"{topic}_sentiment"
231
- if sentiment_col in self.transformed_data.columns:
232
- # Filter rows where the topic is present
233
- topic_rows = self.transformed_data[self.transformed_data[topic] == 1]
234
-
235
- positive = (topic_rows[sentiment_col] == 1.0).sum()
236
- negative = (topic_rows[sentiment_col] == 0.0).sum()
237
- neutral = (topic_rows[sentiment_col] == 0.5).sum()
238
-
239
- total = positive + negative + neutral
240
-
241
- if total > 0:
242
- analysis_text += f"\n{topic} ({total} occurrences):\n"
243
- analysis_text += f" - Positive: {positive} ({positive/total*100:.1f}%)\n"
244
- analysis_text += f" - Negative: {negative} ({negative/total*100:.1f}%)\n"
245
- analysis_text += f" - Neutral: {neutral} ({neutral/total*100:.1f}%)\n"
246
-
247
- # Calculate number of topics per feedback
248
- self.transformed_data['topic_count'] = self.transformed_data[topic_cols].sum(axis=1)
249
- avg_topics = self.transformed_data['topic_count'].mean()
250
- max_topics = self.transformed_data['topic_count'].max()
251
-
252
- analysis_text += f"\n**Topics per Feedback:**\n"
253
- analysis_text += f"- Average: {avg_topics:.2f}\n"
254
- analysis_text += f"- Maximum: {max_topics}\n"
255
-
256
- # Remove the temporary topic_count column
257
- self.transformed_data.drop('topic_count', axis=1, inplace=True)
258
 
259
  return analysis_text
260
 
261
  def save_transformed_data(self, output_format='xlsx'):
262
  """
263
  Save the transformed data and return the file path.
264
- Modified to work properly with Hugging Face Spaces downloads.
265
  """
266
  if self.transformed_data is None:
267
  raise ValueError("No transformed data to save")
268
 
269
  # Create filename with original filename prefix and timestamp
270
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
271
-
272
- # Use original filename as prefix, or fallback to 'transformed_feedback' if not available
273
  prefix = self.original_filename if self.original_filename else 'transformed_feedback'
274
 
275
  if output_format == 'xlsx':
276
- filename = f"{prefix}_transformed_{timestamp}.xlsx"
277
- # Create temporary file that Gradio can handle
278
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
279
  self.transformed_data.to_excel(temp_file.name, index=False)
280
  temp_file.close()
281
-
282
- # Rename the temporary file to have a meaningful name
283
  final_path = os.path.join(tempfile.gettempdir(), filename)
284
  if os.path.exists(final_path):
285
  os.remove(final_path)
286
  os.rename(temp_file.name, final_path)
287
-
288
  else: # csv
289
- filename = f"{prefix}_transformed_{timestamp}.csv"
290
- # Create temporary file that Gradio can handle
291
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
292
  self.transformed_data.to_csv(temp_file.name, index=False)
293
  temp_file.close()
294
-
295
- # Rename the temporary file to have a meaningful name
296
  final_path = os.path.join(tempfile.gettempdir(), filename)
297
  if os.path.exists(final_path):
298
  os.remove(final_path)
299
  os.rename(temp_file.name, final_path)
300
 
301
- # Verify file was created and is readable
302
  if not os.path.exists(final_path):
303
  raise ValueError(f"Failed to create output file: {final_path}")
304
 
@@ -334,17 +465,14 @@ def get_column_selector(file_obj):
334
  df = pd.read_csv(file_obj, sep='\t', nrows=5)
335
 
336
  columns = list(df.columns)
337
-
338
- # Create column display with indices for easier reference
339
  column_choices = [f"{i+1:2d}. {col}" for i, col in enumerate(columns)]
340
-
341
- # Return updated CheckboxGroup with numbered columns and individual rows
342
  return gr.CheckboxGroup(
343
  choices=column_choices,
344
- value=[], # No columns selected by default
345
  label=f"📋 Select Columns to Include ({len(columns)} available)",
346
- info="Choose which original columns to include in the transformed file (in addition to feedback_id). Columns are numbered for easy reference.",
347
- elem_classes=["column-selector"] # Add CSS class for styling
348
  )
349
 
350
  except Exception as e:
@@ -362,16 +490,15 @@ def extract_column_names(selected_display_names):
362
  """
363
  if not selected_display_names:
364
  return []
365
-
366
  actual_names = []
367
  for display_name in selected_display_names:
368
- # Remove the number prefix (e.g., "1. Column Name" -> "Column Name")
369
  if '. ' in display_name:
370
  actual_name = display_name.split('. ', 1)[1]
371
  actual_names.append(actual_name)
372
  else:
373
  actual_names.append(display_name)
374
-
375
  return actual_names
376
 
377
 
@@ -383,7 +510,7 @@ def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
383
  try:
384
  # Extract actual column names from display format
385
  actual_column_names = extract_column_names(selected_columns)
386
-
387
  # Initialize transformer
388
  transformer = FeedbackTransformer(
389
  topic_prefix=topic_prefix,
@@ -409,14 +536,26 @@ def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
409
  status_msg += f"- Topic columns: {len(col_info['topic_cols'])}\n"
410
  status_msg += f"- Sentiment columns: {len(col_info['sentiment_cols'])}\n"
411
  status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"
 
 
 
 
 
412
 
413
- # Extract unique topics
414
- num_topics = transformer.extract_unique_topics()
415
  status_msg += f"\n🎯 Found {num_topics} unique topics\n"
 
 
416
 
417
  # Transform data
418
  shape = transformer.transform_data()
419
  status_msg += f"\n✨ Transformed data shape: {shape[0]} rows × {shape[1]} columns\n"
 
 
 
 
 
420
 
421
  # Analyze if requested
422
  analysis_result = ""
@@ -426,6 +565,7 @@ def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
426
  # Save transformed data
427
  output_file = transformer.save_transformed_data(output_format)
428
  status_msg += f"\n💾 File saved successfully: {os.path.basename(output_file)}\n"
 
429
 
430
  return status_msg, analysis_result, output_file
431
 
@@ -435,7 +575,7 @@ def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
435
 
436
 
437
  # Create Gradio interface
438
- with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
439
  .column-selector .form-check {
440
  display: block !important;
441
  margin-bottom: 8px !important;
@@ -445,15 +585,26 @@ with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
445
  }
446
  """) as demo:
447
  gr.Markdown("""
448
- # 📊 Feedback Topic & Sentiment Transformer
449
- Transform feedback data with topic and sentiment columns into a binary matrix format.
450
- Each unique topic becomes a separate column with 0/1 values and associated sentiment scores.
451
- ### 📋 Instructions:
452
- 1. Upload your Excel, CSV, or tab-delimited text file
453
- 2. Select which original columns to include in the output
454
- 3. Configure column prefixes (or use defaults)
455
- 4. Click "Transform Data" to process
456
- 5. Download the transformed file
 
 
 
 
 
 
 
 
 
 
 
457
  """)
458
 
459
  with gr.Row():
@@ -466,7 +617,7 @@ with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
466
  type="filepath"
467
  )
468
 
469
- # Combined column selector (replaces both preview and checkboxes)
470
  gr.Markdown("### 📋 2. Column Selection")
471
  column_selector = gr.CheckboxGroup(
472
  choices=[],
@@ -477,18 +628,18 @@ with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
477
 
478
  with gr.Column(scale=1):
479
  # Configuration parameters
480
- gr.Markdown("### ⚙️ 3. Configuration of column prefixes ")
481
 
482
  topic_prefix = gr.Textbox(
483
- label="Topic Column Prefix",
484
- value="[**WORKSHOP] SwissLife Taxonomy",
485
- info="Prefix to identify topic columns"
486
  )
487
 
488
  sentiment_prefix = gr.Textbox(
489
- label="Sentiment Column Prefix",
490
  value="ABSA:",
491
- info="Prefix to identify sentiment columns"
492
  )
493
 
494
  category_prefix = gr.Textbox(
@@ -498,9 +649,9 @@ with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
498
  )
499
 
500
  text_column = gr.Textbox(
501
- label="Text Column Name",
502
  value="TEXT",
503
- info="Column containing original feedback text (for reference only)"
504
  )
505
 
506
  recommendation_column = gr.Textbox(
@@ -521,14 +672,14 @@ with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
521
  )
522
 
523
  # Transform button
524
- transform_btn = gr.Button("🔄 4. Transform Data", variant="primary", size="lg")
525
 
526
  # Output sections
527
  with gr.Row():
528
  with gr.Column():
529
  status_output = gr.Textbox(
530
  label="Processing Status",
531
- lines=10,
532
  interactive=False
533
  )
534
 
@@ -537,12 +688,13 @@ with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
537
  label="Data Analysis"
538
  )
539
 
540
- # Download section - Modified for better download functionality
541
  with gr.Row():
542
  with gr.Column():
543
- gr.Markdown("### 📥 5. Download Transformed File")
 
544
  output_file = gr.File(
545
- label="Transformed File",
546
  interactive=False,
547
  visible=True
548
  )
@@ -572,26 +724,38 @@ with gr.Blocks(title="Feedback Topic & Sentiment Transformer", css="""
572
 
573
  # Examples section
574
  gr.Markdown("""
575
- ### πŸ“ Example Column Formats:
576
- - **Topic columns**: `[**WORKSHOP] SwissLife Taxonomy(Kommentar) 1`, `[**WORKSHOP] SwissLife Taxonomy(Kommentar) 2`
577
- - **Category columns**: `Categories:Topic1`, `Categories:Topic2`
578
- - **Sentiment columns**: `ABSA:Sentiment1`, `ABSA:Sentiment2`
579
- ### 🎯 Output Format:
580
- - **feedback_id**: Unique identifier for each row
581
- - **Selected original columns**: Any columns you selected from the original file
582
- - **Topic columns**: Each unique topic becomes a column with values 0 (absent) or 1 (present)
583
- - **Sentiment columns**: Each topic has an associated `_sentiment` column with values:
584
- - 1.0 = Positive
585
- - 0.5 = Neutral
586
- - 0.0 = Negative
587
- - **Output filename**: `[original_filename]_transformed_[timestamp].[format]`
588
- ### 💡 Tips:
589
- - Use the numbered column list to easily identify and select columns
590
- - The text and recommendation column names in configuration are now for reference only
591
- - To include them in output, select them using the column checkboxes
592
- - Click on the download button that appears after processing to download the file
 
 
 
 
 
 
 
 
 
 
 
 
593
  """)
594
 
595
  # Launch the app
596
  if __name__ == "__main__":
597
- demo.launch()
 
3
  import numpy as np
4
  import os
5
  import traceback
6
+ from typing import Tuple, Dict, Any, Optional, List
7
  import tempfile
8
  import io
9
  import datetime
10
+ import re
11
 
12
  class FeedbackTransformer:
13
  """
14
+ A class to transform feedback data with delimited topic and sentiment columns
15
+ into binary columns with prefixes T_, S_, and C_.
16
  """
17
 
18
  def __init__(self,
 
35
  self.sentiment_cols = []
36
  self.category_cols = []
37
  self.unique_topics = set()
38
+ self.unique_categories = set()
39
+ self.unique_sentiments = set()
40
+ self.topic_sentiment_mapping = {} # Map topics to their sentiment values
41
  self.file_name = None
42
  self.original_filename = None
43
+ self.selected_columns = []
44
+ self.verbatim_column = None # Store the verbatim/text column
45
+ self.dynamic_topic_prefix = None # Store dynamically extracted topic prefix
46
 
47
  def load_data(self, file_obj):
48
  """
 
75
 
76
  return len(self.data), len(self.data.columns)
77
 
78
+ def extract_topic_prefix_from_category(self):
79
+ """
80
+ Extract the topic prefix from a column containing "Category:"
81
+ by finding text between "Category:" and "("
82
+ """
83
+ # Look for columns containing "Category:"
84
+ category_pattern_cols = [col for col in self.data.columns if "Category:" in col]
85
+
86
+ if category_pattern_cols:
87
+ # Use the first matching column
88
+ category_col = category_pattern_cols[0]
89
+
90
+ # Try to extract from column name first
91
+ match = re.search(r'Category:\s*([^(]+)\s*\(', category_col)
92
+ if match:
93
+ extracted_prefix = match.group(1).strip() + ":"
94
+ self.dynamic_topic_prefix = extracted_prefix
95
+ return extracted_prefix
96
+
97
+ # If not found in column name, try to extract from column values
98
+ for value in self.data[category_col].dropna():
99
+ if isinstance(value, str):
100
+ match = re.search(r'Category:\s*([^(]+)\s*\(', value)
101
+ if match:
102
+ extracted_prefix = match.group(1).strip() + ":"
103
+ self.dynamic_topic_prefix = extracted_prefix
104
+ return extracted_prefix
105
+
106
+ # If no match found, return None
107
+ return None
108
+
109
  def identify_columns(self):
110
  """
111
  Identify topic, category, and sentiment columns in the data.
 
113
  if self.data is None:
114
  raise ValueError("Data not loaded")
115
 
116
+ # First try to extract topic prefix dynamically
117
+ extracted_prefix = self.extract_topic_prefix_from_category()
118
+
119
+ # Use dynamic prefix if found, otherwise use the provided topic_prefix
120
+ topic_identifier = extracted_prefix if extracted_prefix else self.topic_prefix
121
+
122
+ # Log the prefix being used
123
+ print(f"Using topic prefix: '{topic_identifier}'")
124
+
125
  # Extract columns based on prefixes
126
+ # For topic columns, use the dynamic or provided prefix
127
+ if topic_identifier:
128
+ self.topic_cols = [col for col in self.data.columns if topic_identifier in col]
129
+ else:
130
+ self.topic_cols = [col for col in self.data.columns if "Topic:" in col]
131
+
132
  self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
133
  self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]
134
 
135
+ # Try to identify verbatim/text column
136
+ text_candidates = [col for col in self.data.columns if any(keyword in col.lower() for keyword in ['text', 'verbatim', 'comment', 'feedback'])]
137
+ if text_candidates:
138
+ self.verbatim_column = text_candidates[0] # Use the first match
139
+ elif self.text_column in self.data.columns:
140
+ self.verbatim_column = self.text_column
141
+
142
  # If no columns found with specified prefixes, return all columns for manual selection
143
  all_cols = list(self.data.columns)
144
 
 
146
  'topic_cols': self.topic_cols,
147
  'sentiment_cols': self.sentiment_cols,
148
  'category_cols': self.category_cols,
149
+ 'all_columns': all_cols,
150
+ 'verbatim_column': self.verbatim_column,
151
+ 'dynamic_topic_prefix': self.dynamic_topic_prefix
152
  }
153
 
154
+ def extract_unique_topics_and_categories(self):
155
  """
156
+ Extract all unique topics, categories, and sentiments from the respective columns.
157
  """
158
  self.unique_topics = set()
159
+ self.unique_categories = set()
160
+ self.unique_sentiments = set()
161
+ self.topic_sentiment_mapping = {}
162
 
163
+ # Extract from topic columns (delimited by |)
164
  for col in self.topic_cols:
165
+ for value in self.data[col].dropna():
166
+ if isinstance(value, str) and value.strip():
167
+ # Split by | delimiter and clean each topic
168
+ topics = [topic.strip() for topic in value.split('|') if topic.strip()]
169
+ self.unique_topics.update(topics)
170
 
171
+ # Extract from category columns (delimited by |)
172
  for col in self.category_cols:
173
+ for value in self.data[col].dropna():
174
+ if isinstance(value, str) and value.strip():
175
+ # Split by | delimiter and clean each category
176
+ categories = [cat.strip() for cat in value.split('|') if cat.strip()]
177
+ self.unique_categories.update(categories)
178
+
179
+ # Extract sentiments from sentiment columns and build topic-sentiment mapping
180
+ for col in self.sentiment_cols:
181
+ for idx, value in enumerate(self.data[col].dropna()):
182
+ if isinstance(value, str) and value.strip():
183
+ # Split by | delimiter to get individual topic::sentiment pairs
184
+ pairs = [pair.strip() for pair in value.split('|') if pair.strip() and '::' in pair]
185
+ for pair in pairs:
186
+ if '::' in pair:
187
+ topic_part, sentiment_part = pair.split('::', 1)
188
+ topic = topic_part.strip()
189
+ sentiment = sentiment_part.strip()
190
+ if topic and sentiment:
191
+ self.unique_topics.add(topic) # Add topic from sentiment data
192
+ self.unique_sentiments.add(sentiment)
193
+
194
+ # Store the mapping for later use
195
+ if idx not in self.topic_sentiment_mapping:
196
+ self.topic_sentiment_mapping[idx] = {}
197
+ self.topic_sentiment_mapping[idx][topic] = sentiment
198
+
199
+ return len(self.unique_topics), len(self.unique_categories), len(self.unique_sentiments)
200
 
201
  def set_selected_columns(self, selected_columns):
202
  """
 
206
 
207
  def transform_data(self):
208
  """
209
+ Transform the data into binary columns with T_, S_, and C_ prefixes.
210
  """
211
+ if not self.unique_topics and not self.unique_categories:
212
+ self.extract_unique_topics_and_categories()
213
 
214
  # Create output dataframe starting with feedback_id
215
  self.transformed_data = pd.DataFrame({'feedback_id': range(1, len(self.data) + 1)})
 
219
  if col in self.data.columns:
220
  self.transformed_data[col] = self.data[col]
221
 
222
+ # Add Verbatim sentiment columns
223
+ self.transformed_data['Verbatim_Positive'] = 0
224
+ self.transformed_data['Verbatim_Neutral'] = 0
225
+ self.transformed_data['Verbatim_Negative'] = 0
226
+
227
+ # Create binary topic columns with T_ prefix
228
  for topic in sorted(self.unique_topics):
229
+ safe_topic_name = self._make_safe_column_name(topic)
230
+ col_name = f"T_{safe_topic_name}"
231
+ self.transformed_data[col_name] = 0
232
 
233
+ # Create sentiment columns with S_ prefix (one per topic, containing actual sentiment values)
234
+ for topic in sorted(self.unique_topics):
235
+ safe_topic_name = self._make_safe_column_name(topic)
236
+ col_name = f"S_{safe_topic_name}"
237
+ self.transformed_data[col_name] = "" # Initialize with empty strings
238
+
239
+ # Create binary category columns with C_ prefix
240
+ for category in sorted(self.unique_categories):
241
+ safe_category_name = self._make_safe_column_name(category)
242
+ col_name = f"C_{safe_category_name}"
243
+ self.transformed_data[col_name] = 0
244
+
245
+ # Fill in the data
246
  for idx, row in self.data.iterrows():
247
+ # Process sentiment columns to determine which topics exist in ABSA column
248
+ topics_in_absa = set()
249
+ all_sentiments_in_row = set() # Track all sentiments for verbatim columns
250
+
251
+ for s_col in self.sentiment_cols:
252
+ sentiment_value = row.get(s_col)
253
+ if pd.notna(sentiment_value) and isinstance(sentiment_value, str) and sentiment_value.strip():
254
+ pairs = [pair.strip() for pair in sentiment_value.split('|') if pair.strip()]
255
+ for pair in pairs:
256
+ if '::' in pair:
257
+ topic_part, sentiment_part = pair.split('::', 1)
258
+ topic = topic_part.strip()
259
+ sentiment = sentiment_part.strip()
260
+
261
+ if topic and sentiment:
262
+ topics_in_absa.add(topic)
263
+ all_sentiments_in_row.add(sentiment.lower()) # Store in lowercase for matching
264
+
265
+ # Set the actual sentiment value (not 1/0)
266
+ safe_topic_name = self._make_safe_column_name(topic)
267
+ sentiment_col_name = f"S_{safe_topic_name}"
268
+ if sentiment_col_name in self.transformed_data.columns:
269
+ self.transformed_data.loc[idx, sentiment_col_name] = sentiment
270
+
271
+ # Set Verbatim sentiment columns based on sentiments found in ABSA
272
+ if any(sentiment in all_sentiments_in_row for sentiment in ['positive', 'positiv']):
273
+ self.transformed_data.loc[idx, 'Verbatim_Positive'] = 1
274
+ if any(sentiment in all_sentiments_in_row for sentiment in ['neutral']):
275
+ self.transformed_data.loc[idx, 'Verbatim_Neutral'] = 1
276
+ if any(sentiment in all_sentiments_in_row for sentiment in ['negative', 'negativ']):
277
+ self.transformed_data.loc[idx, 'Verbatim_Negative'] = 1
278
+
279
+ # Set T_ columns to 1 if topic exists in ABSA column, 0 otherwise
280
+ for topic in topics_in_absa:
281
+ safe_topic_name = self._make_safe_column_name(topic)
282
+ topic_col_name = f"T_{safe_topic_name}"
283
+ if topic_col_name in self.transformed_data.columns:
284
+ self.transformed_data.loc[idx, topic_col_name] = 1
285
+
286
+ # Process category columns
287
+ categories_in_row = set()
288
  for c_col in self.category_cols:
289
+ category_value = row.get(c_col)
290
+ if pd.notna(category_value) and isinstance(category_value, str) and category_value.strip():
291
+ categories = [cat.strip() for cat in category_value.split('|') if cat.strip()]
292
+ categories_in_row.update(categories)
293
+
294
+ # Set category binary values (always 1 if present in category column)
295
+ for category in categories_in_row:
296
+ safe_category_name = self._make_safe_column_name(category)
297
+ category_col_name = f"C_{safe_category_name}"
298
+ if category_col_name in self.transformed_data.columns:
299
+ self.transformed_data.loc[idx, category_col_name] = 1
300
 
301
  return self.transformed_data.shape
302
 
303
+ def _make_safe_column_name(self, name):
304
+ """
305
+ Convert a name to a safe column name by removing/replacing problematic characters.
306
+ """
307
+ # Replace spaces and special characters with underscores
308
+ safe_name = re.sub(r'[^\w]', '_', str(name))
309
+ # Remove multiple consecutive underscores
310
+ safe_name = re.sub(r'_+', '_', safe_name)
311
+ # Remove leading/trailing underscores
312
+ safe_name = safe_name.strip('_')
313
+ return safe_name
314
+
315
  def analyze_data(self):
316
  """
317
  Analyze the transformed data to provide insights.
 
319
  if self.transformed_data is None:
320
  raise ValueError("No transformed data to analyze")
321
 
322
+ # Count different types of columns
323
+ topic_cols = [col for col in self.transformed_data.columns if col.startswith('T_')]
324
+ sentiment_cols = [col for col in self.transformed_data.columns if col.startswith('S_')]
325
+ category_cols = [col for col in self.transformed_data.columns if col.startswith('C_')]
326
+ verbatim_cols = ['Verbatim_Positive', 'Verbatim_Neutral', 'Verbatim_Negative']
327
+
328
+ # Calculate statistics
329
+ topic_stats = {}
330
+ for col in topic_cols:
331
+ topic_stats[col] = self.transformed_data[col].sum()
332
+
333
+ # For sentiment columns, count non-empty values
334
+ sentiment_stats = {}
335
+ for col in sentiment_cols:
336
+ sentiment_stats[col] = (self.transformed_data[col] != "").sum()
337
+
338
+ category_stats = {}
339
+ for col in category_cols:
340
+ category_stats[col] = self.transformed_data[col].sum()
341
+
342
+ # Verbatim sentiment statistics
343
+ verbatim_stats = {}
344
+ for col in verbatim_cols:
345
+ if col in self.transformed_data.columns:
346
+ verbatim_stats[col] = self.transformed_data[col].sum()
347
+
348
+ # Sort by frequency
349
+ sorted_topics = sorted(topic_stats.items(), key=lambda x: x[1], reverse=True)
350
+ sorted_sentiments = sorted(sentiment_stats.items(), key=lambda x: x[1], reverse=True)
351
+ sorted_categories = sorted(category_stats.items(), key=lambda x: x[1], reverse=True)
352
+ sorted_verbatim = sorted(verbatim_stats.items(), key=lambda x: x[1], reverse=True)
353
 
354
  # Prepare analysis summary
355
  analysis_text = f"**Analysis Results**\n\n"
356
  analysis_text += f"Total feedbacks: {len(self.transformed_data)}\n"
357
  analysis_text += f"Selected original columns: {len(self.selected_columns)}\n"
358
+ analysis_text += f"Verbatim sentiment columns: 3 (Positive, Neutral, Negative)\n"
359
+ analysis_text += f"Topic columns (T_): {len(topic_cols)}\n"
360
+ analysis_text += f"Sentiment columns (S_): {len(sentiment_cols)}\n"
361
+ analysis_text += f"Category columns (C_): {len(category_cols)}\n"
362
+ analysis_text += f"Verbatim column used: {self.verbatim_column}\n"
363
+
364
+ # Add dynamic topic prefix info
365
+ if self.dynamic_topic_prefix:
366
+ analysis_text += f"Dynamic topic prefix extracted: '{self.dynamic_topic_prefix}'\n\n"
367
+ else:
368
+ analysis_text += f"Topic prefix used: '{self.topic_prefix}'\n\n"
369
 
370
  if self.selected_columns:
371
  analysis_text += f"**Included Original Columns:** {', '.join(self.selected_columns)}\n\n"
372
 
373
+ # Verbatim sentiment analysis
374
+ if sorted_verbatim:
375
+ analysis_text += "**Verbatim Sentiment Distribution:**\n"
376
+ for verbatim_col, count in sorted_verbatim:
377
+ percentage = (count / len(self.transformed_data)) * 100
378
+ analysis_text += f"- {verbatim_col}: {count} occurrences ({percentage:.1f}%)\n"
379
+
380
+ # Topic analysis
381
+ if sorted_topics:
382
+ analysis_text += "\n**Top 10 Most Frequent Topics (T_):**\n"
383
+ for topic_col, count in sorted_topics[:10]:
384
+ analysis_text += f"- {topic_col}: {count} occurrences\n"
385
+
386
+ # Category analysis
387
+ if sorted_categories:
388
+ analysis_text += "\n**Top 10 Most Frequent Categories (C_):**\n"
389
+ for category_col, count in sorted_categories[:10]:
390
+ analysis_text += f"- {category_col}: {count} occurrences\n"
391
+
392
+ # Sentiment analysis
393
+ if sorted_sentiments:
394
+ analysis_text += "\n**Top 10 Most Frequent Sentiments (S_):**\n"
395
+ for sentiment_col, count in sorted_sentiments[:10]:
396
+ analysis_text += f"- {sentiment_col}: {count} sentiment values\n"
 
 
 
 
 
 
 
 
 
 
 
397
 
398
  return analysis_text
399
 
400
  def save_transformed_data(self, output_format='xlsx'):
401
  """
402
  Save the transformed data and return the file path.
 
403
  """
404
  if self.transformed_data is None:
405
  raise ValueError("No transformed data to save")
406
 
407
  # Create filename with original filename prefix and timestamp
408
  timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
 
 
409
  prefix = self.original_filename if self.original_filename else 'transformed_feedback'
410
 
411
  if output_format == 'xlsx':
412
+ filename = f"{prefix}_transformed_topics_{timestamp}.xlsx"
 
413
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
414
  self.transformed_data.to_excel(temp_file.name, index=False)
415
  temp_file.close()
416
+
 
417
  final_path = os.path.join(tempfile.gettempdir(), filename)
418
  if os.path.exists(final_path):
419
  os.remove(final_path)
420
  os.rename(temp_file.name, final_path)
421
+
422
  else: # csv
423
+ filename = f"{prefix}_binary_matrix_{timestamp}.csv"
 
424
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
425
  self.transformed_data.to_csv(temp_file.name, index=False)
426
  temp_file.close()
427
+
 
428
  final_path = os.path.join(tempfile.gettempdir(), filename)
429
  if os.path.exists(final_path):
430
  os.remove(final_path)
431
  os.rename(temp_file.name, final_path)
432
 
 
433
  if not os.path.exists(final_path):
434
  raise ValueError(f"Failed to create output file: {final_path}")
435
 
 
465
  df = pd.read_csv(file_obj, sep='\t', nrows=5)
466
 
467
  columns = list(df.columns)
 
 
468
  column_choices = [f"{i+1:2d}. {col}" for i, col in enumerate(columns)]
469
+
 
470
  return gr.CheckboxGroup(
471
  choices=column_choices,
472
+ value=[],
473
  label=f"πŸ“‹ Select Columns to Include ({len(columns)} available)",
474
+ info="Choose which original columns to include in the transformed file (in addition to feedback_id).",
475
+ elem_classes=["column-selector"]
476
  )
477
 
478
  except Exception as e:
 
490
  """
491
  if not selected_display_names:
492
  return []
493
+
494
  actual_names = []
495
  for display_name in selected_display_names:
 
496
  if '. ' in display_name:
497
  actual_name = display_name.split('. ', 1)[1]
498
  actual_names.append(actual_name)
499
  else:
500
  actual_names.append(display_name)
501
+
502
  return actual_names
503
 
504
 
 
510
  try:
511
  # Extract actual column names from display format
512
  actual_column_names = extract_column_names(selected_columns)
513
+
514
  # Initialize transformer
515
  transformer = FeedbackTransformer(
516
  topic_prefix=topic_prefix,
 
536
  status_msg += f"- Topic columns: {len(col_info['topic_cols'])}\n"
537
  status_msg += f"- Sentiment columns: {len(col_info['sentiment_cols'])}\n"
538
  status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"
539
+ status_msg += f"- Verbatim column: {col_info['verbatim_column']}\n"
540
+
541
+ # Add dynamic topic prefix info
542
+ if col_info.get('dynamic_topic_prefix'):
543
+ status_msg += f"- Dynamic topic prefix extracted: '{col_info['dynamic_topic_prefix']}'\n"
544
 
545
+ # Extract unique topics, categories, and sentiments
546
+ num_topics, num_categories, num_sentiments = transformer.extract_unique_topics_and_categories()
547
  status_msg += f"\n🎯 Found {num_topics} unique topics\n"
548
+ status_msg += f"🏷️ Found {num_categories} unique categories\n"
549
+ status_msg += f"πŸ’­ Found {num_sentiments} unique sentiments\n"
550
 
551
  # Transform data
552
  shape = transformer.transform_data()
553
  status_msg += f"\n✨ Transformed data shape: {shape[0]} rows Γ— {shape[1]} columns\n"
554
+ status_msg += f"πŸ“Š Binary matrix created with T_, S_, C_ prefixes and Verbatim sentiment columns\n"
555
+ status_msg += f"πŸ”§ T_ columns: 1 if topic present in ABSA column, 0 otherwise\n"
556
+ status_msg += f"πŸ”§ S_ columns: contain actual sentiment values (not 1/0)\n"
557
+ status_msg += f"πŸ”§ C_ columns: 1 if category assigned, 0 otherwise\n"
558
+ status_msg += f"πŸ”§ Verbatim_Positive/Neutral/Negative: 1 if respective sentiment found in ABSA, 0 otherwise\n"
559
 
560
  # Analyze if requested
561
  analysis_result = ""
 
565
  # Save transformed data
566
  output_file = transformer.save_transformed_data(output_format)
567
  status_msg += f"\nπŸ’Ύ File saved successfully: {os.path.basename(output_file)}\n"
568
+ #status_msg += f"πŸ“₯ File download should start automatically\n"
569
 
570
  return status_msg, analysis_result, output_file
571
 
 
575
 
576
 
577
  # Create Gradio interface
578
+ with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
579
  .column-selector .form-check {
580
  display: block !important;
581
  margin-bottom: 8px !important;
 
585
  }
586
  """) as demo:
587
  gr.Markdown("""
588
+ # πŸ“Š Binary Matrix Feedback Transformer
589
+ Transform feedback data with delimited topic and sentiment columns into binary matrix format.
590
+
591
+ ### πŸ”§ Processing Logic:
592
+ - **Automatic Topic Prefix Detection**: Extracts topic prefix from columns containing "Category:" by finding text between "Category:" and "("
593
+ - **Verbatim_Positive/Neutral/Negative**: Set to 1 if respective sentiment is found in ABSA column, 0 otherwise
594
+ - **T_ Columns**: Set to 1 if topic is present in ABSA column, 0 otherwise
595
+ - **S_ Columns**: One column per topic (e.g., S_Allgemeine_Zufriedenheit) containing actual sentiment values
596
+ - **C_ Columns**: Set to 1 if category is assigned, 0 otherwise
597
+
598
+ ### πŸ“‹ Data Format Requirements:
599
+ - **Topics**: Delimited by `|` (pipe) in columns identified by dynamic or manual prefix
600
+ - **Sentiments**: Format `Topic::Sentiment|Topic2::Sentiment2` in ABSA columns
601
+ - **Categories**: Delimited by `|` (pipe) in "Categories:" columns
602
+
603
+ ### πŸ†• Key Features:
604
+ - **Dynamic Topic Prefix Extraction**: Automatically extracts topic prefix from "Category:" columns
605
+ - **Verbatim_** columns detect overall sentiment presence regardless of topic
606
+ - **T_** columns based on ABSA column presence (topics that have sentiment data)
607
+ - **S_** columns contain actual sentiment values (not binary 1/0)
608
  """)
609
 
610
  with gr.Row():
 
617
  type="filepath"
618
  )
619
 
620
+ # Combined column selector
621
  gr.Markdown("### πŸ“‹ 2. Column Selection")
622
  column_selector = gr.CheckboxGroup(
623
  choices=[],
 
628
 
629
  with gr.Column(scale=1):
630
  # Configuration parameters
631
+ gr.Markdown("### βš™οΈ 3. Configuration")
632
 
633
  topic_prefix = gr.Textbox(
634
+ label="Topic Column Identifier (Fallback)",
635
+ value="Topic:",
636
+ info="Fallback identifier if dynamic extraction from Category: column fails"
637
  )
638
 
639
  sentiment_prefix = gr.Textbox(
640
+ label="Sentiment Column Prefix (ABSA)",
641
  value="ABSA:",
642
+ info="Prefix to identify sentiment columns (format: Topic::Sentiment)"
643
  )
644
 
645
  category_prefix = gr.Textbox(
 
649
  )
650
 
651
  text_column = gr.Textbox(
652
+ label="Text/Verbatim Column Pattern",
653
  value="TEXT",
654
+ info="Pattern to identify verbatim text column (for reference only)"
655
  )
656
 
657
  recommendation_column = gr.Textbox(
 
672
  )
673
 
674
  # Transform button
675
+ transform_btn = gr.Button("πŸ”„ 4. Transform to Binary Matrix & Download", variant="primary", size="lg")
676
 
677
  # Output sections
678
  with gr.Row():
679
  with gr.Column():
680
  status_output = gr.Textbox(
681
  label="Processing Status",
682
+ lines=12,
683
  interactive=False
684
  )
685
 
 
688
  label="Data Analysis"
689
  )
690
 
691
+ # Download section
692
  with gr.Row():
693
  with gr.Column():
694
+ gr.Markdown("### πŸ“₯ Download Status")
695
+ gr.Markdown("Please click on the link inside the output file size value to download the transformed file (the number value on the right hand side below). You may need to right click and select Save Link As (or something similar)")
696
  output_file = gr.File(
697
+ label="Transformed Binary Matrix (Auto-Download)",
698
  interactive=False,
699
  visible=True
700
  )
 
724
 
725
  # Examples section
726
  gr.Markdown("""
727
+ ### πŸ“ Example Transformations:
728
+
729
+ **Input Data with Dynamic Topic Extraction:**
730
+ ```
731
+ | Column: "Category: Service (ABC)" | ABSA: Sentiments | Categories: Issues |
732
+ | 1 | Service::Negative|Quality::Positive | Issues|Support |
733
+ ```
734
+
735
+ **System will:**
736
+ 1. Extract "Service:" from "Category: Service (ABC)" column
737
+ 2. Use "Service:" to identify topic columns instead of "Topic:"
738
+
739
+ **Output Binary Matrix:**
740
+ ```
741
+ | feedback_id | Verbatim_Positive | Verbatim_Neutral | Verbatim_Negative | T_Service | T_Quality | S_Service | S_Quality | C_Issues | C_Support |
742
+ | 1 | 1 | 0 | 1 | 1 | 1 | Negative | Positive | 1 | 1 |
743
+ ```
744
+
745
+ ### πŸ’‘ Dynamic Topic Prefix Logic:
746
+ - Searches for columns containing "Category:"
747
+ - Extracts text between "Category:" and "(" (e.g., "Service" from "Category: Service (ABC)")
748
+ - Adds ":" to create the topic prefix (e.g., "Service:")
749
+ - Uses this prefix to identify topic columns
750
+ - Falls back to manual "Topic Column Identifier" if extraction fails
751
+
752
+ ### πŸ” Key Changes in This Version:
753
+ - **NEW**: Automatic extraction of topic prefix from Category columns
754
+ - Dynamically identifies topic columns based on extracted prefix
755
+ - Maintains all other functionality (Verbatim columns, T_, S_, C_ logic)
756
+ - Provides fallback to manual topic prefix if extraction fails
757
  """)
758
 
759
# Script entry point: only start the web UI when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    # share=True additionally exposes a public Gradio tunnel URL
    # alongside the local server address.
    demo.launch(share=True)