geekgirl3 committed on
Commit
20f8cb2
·
verified ·
1 Parent(s): 36c3a00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +359 -627
app.py CHANGED
@@ -2,363 +2,196 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import os
5
- import traceback
6
- from typing import Tuple, Dict, Any, Optional, List
7
- import tempfile
8
- import io
9
- import datetime
10
  import re
11
-
12
- class FeedbackTransformer:
13
- """
14
- A class to transform feedback data with delimited topic and sentiment columns
15
- into binary columns with prefixes T_, S_, and C_.
16
- """
17
-
18
- def __init__(self,
19
- topic_prefix="TOPIC_",
20
- sentiment_prefix="SENTIMENT_",
21
- category_prefix="Categories:",
22
- text_column="TEXT",
23
- recommendation_column="Q4_Weiterempfehlung"):
24
- """
25
- Initialize the FeedbackTransformer with column specifications.
26
- """
27
- self.topic_prefix = topic_prefix
28
- self.sentiment_prefix = sentiment_prefix
29
- self.category_prefix = category_prefix
30
- self.text_column = text_column
31
- self.recommendation_column = recommendation_column
32
- self.data = None
33
- self.transformed_data = None
34
- self.topic_cols = []
35
- self.sentiment_cols = []
36
- self.category_cols = []
37
- self.unique_topics = set()
38
- self.unique_categories = set()
39
- self.unique_sentiments = set()
40
- self.topic_sentiment_mapping = {} # Map topics to their sentiment values
41
- self.file_name = None
42
  self.original_filename = None
43
- self.selected_columns = []
44
- self.verbatim_column = None # Store the verbatim/text column
45
-
46
- def load_data(self, file_obj):
47
- """
48
- Load data from the uploaded file object.
49
- """
50
- if file_obj is None:
51
- raise ValueError("No file uploaded")
52
-
53
- # Get file extension and store original filename
54
- file_name = file_obj if isinstance(file_obj, str) else (file_obj.name if hasattr(file_obj, 'name') else 'unknown')
55
- self.original_filename = os.path.splitext(os.path.basename(file_name))[0]
56
- _, file_ext = os.path.splitext(file_name)
57
-
58
- # Read the data based on file type
59
  try:
60
- if file_ext.lower() in ['.xlsx', '.xls']:
61
- self.data = pd.read_excel(file_obj)
62
- elif file_ext.lower() == '.csv':
63
- # Try comma delimiter first
64
- try:
65
- self.data = pd.read_csv(file_obj, encoding='utf-8')
66
- except:
67
- # If comma fails, try tab delimiter
68
- self.data = pd.read_csv(file_obj, sep='\t', encoding='utf-8')
69
- else:
70
- # Default to tab-delimited
71
- self.data = pd.read_csv(file_obj, sep='\t', encoding='utf-8')
72
  except Exception as e:
73
- raise ValueError(f"Error reading file: {str(e)}")
74
-
75
- return len(self.data), len(self.data.columns)
76
-
77
- def identify_columns(self):
78
- """
79
- Identify topic, category, and sentiment columns in the data.
80
- """
81
- if self.data is None:
82
- raise ValueError("Data not loaded")
83
-
84
- # Extract columns based on prefixes
85
- self.topic_cols = [col for col in self.data.columns if "Topic:" in col]
86
- self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
87
- self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]
88
-
89
- # Try to identify verbatim/text column
90
- text_candidates = [col for col in self.data.columns if any(keyword in col.lower() for keyword in ['text', 'verbatim', 'comment', 'feedback'])]
91
- if text_candidates:
92
- self.verbatim_column = text_candidates[0] # Use the first match
93
- elif self.text_column in self.data.columns:
94
- self.verbatim_column = self.text_column
95
-
96
- # If no columns found with specified prefixes, return all columns for manual selection
97
- all_cols = list(self.data.columns)
98
-
99
- return {
100
- 'topic_cols': self.topic_cols,
101
- 'sentiment_cols': self.sentiment_cols,
102
- 'category_cols': self.category_cols,
103
- 'all_columns': all_cols,
104
- 'verbatim_column': self.verbatim_column
105
- }
106
-
107
- def extract_unique_topics_and_categories(self):
108
- """
109
- Extract all unique topics, categories, and sentiments from the respective columns.
110
- """
111
- self.unique_topics = set()
112
- self.unique_categories = set()
113
- self.unique_sentiments = set()
114
- self.topic_sentiment_mapping = {}
115
-
116
- # Extract from topic columns (delimited by |)
117
- for col in self.topic_cols:
118
- for value in self.data[col].dropna():
119
- if isinstance(value, str) and value.strip():
120
- # Split by | delimiter and clean each topic
121
- topics = [topic.strip() for topic in value.split('|') if topic.strip()]
122
- self.unique_topics.update(topics)
123
-
124
- # Extract from category columns (delimited by |)
125
- for col in self.category_cols:
126
- for value in self.data[col].dropna():
127
- if isinstance(value, str) and value.strip():
128
- # Split by | delimiter and clean each category
129
- categories = [cat.strip() for cat in value.split('|') if cat.strip()]
130
- self.unique_categories.update(categories)
131
-
132
- # Extract sentiments from sentiment columns and build topic-sentiment mapping
133
- for col in self.sentiment_cols:
134
- for idx, value in enumerate(self.data[col].dropna()):
135
- if isinstance(value, str) and value.strip():
136
- # Split by | delimiter to get individual topic::sentiment pairs
137
- pairs = [pair.strip() for pair in value.split('|') if pair.strip() and '::' in pair]
138
- for pair in pairs:
139
- if '::' in pair:
140
- topic_part, sentiment_part = pair.split('::', 1)
141
- topic = topic_part.strip()
142
- sentiment = sentiment_part.strip()
143
- if topic and sentiment:
144
- self.unique_topics.add(topic) # Add topic from sentiment data
145
- self.unique_sentiments.add(sentiment)
146
-
147
- # Store the mapping for later use
148
- if idx not in self.topic_sentiment_mapping:
149
- self.topic_sentiment_mapping[idx] = {}
150
- self.topic_sentiment_mapping[idx][topic] = sentiment
151
-
152
- return len(self.unique_topics), len(self.unique_categories), len(self.unique_sentiments)
153
-
154
- def set_selected_columns(self, selected_columns):
155
- """
156
- Set which original columns should be included in the output.
157
- """
158
- self.selected_columns = selected_columns if selected_columns else []
159
-
160
- def transform_data(self):
161
- """
162
- Transform the data into binary columns with T_, S_, and C_ prefixes.
163
- """
164
- if not self.unique_topics and not self.unique_categories:
165
- self.extract_unique_topics_and_categories()
166
-
167
- # Create output dataframe starting with feedback_id
168
- self.transformed_data = pd.DataFrame({'feedback_id': range(1, len(self.data) + 1)})
169
-
170
- # Add selected original columns first (right after feedback_id)
171
- for col in self.selected_columns:
172
- if col in self.data.columns:
173
- self.transformed_data[col] = self.data[col]
174
-
175
- # Add Verbatim sentiment columns
176
- self.transformed_data['Verbatim_Positive'] = 0
177
- self.transformed_data['Verbatim_Neutral'] = 0
178
- self.transformed_data['Verbatim_Negative'] = 0
179
-
180
- # Create binary topic columns with T_ prefix
181
- for topic in sorted(self.unique_topics):
182
- safe_topic_name = self._make_safe_column_name(topic)
183
- col_name = f"T_{safe_topic_name}"
184
- self.transformed_data[col_name] = 0
185
-
186
- # Create sentiment columns with S_ prefix (one per topic, containing actual sentiment values)
187
- for topic in sorted(self.unique_topics):
188
- safe_topic_name = self._make_safe_column_name(topic)
189
- col_name = f"S_{safe_topic_name}"
190
- self.transformed_data[col_name] = "" # Initialize with empty strings
191
-
192
- # Create binary category columns with C_ prefix
193
- for category in sorted(self.unique_categories):
194
- safe_category_name = self._make_safe_column_name(category)
195
- col_name = f"C_{safe_category_name}"
196
- self.transformed_data[col_name] = 0
197
-
198
- # Fill in the data
199
- for idx, row in self.data.iterrows():
200
- # Process sentiment columns to determine which topics exist in ABSA column
201
- topics_in_absa = set()
202
- all_sentiments_in_row = set() # Track all sentiments for verbatim columns
203
-
204
- for s_col in self.sentiment_cols:
205
- sentiment_value = row.get(s_col)
206
- if pd.notna(sentiment_value) and isinstance(sentiment_value, str) and sentiment_value.strip():
207
- pairs = [pair.strip() for pair in sentiment_value.split('|') if pair.strip()]
208
- for pair in pairs:
209
- if '::' in pair:
210
- topic_part, sentiment_part = pair.split('::', 1)
211
- topic = topic_part.strip()
212
- sentiment = sentiment_part.strip()
213
-
214
- if topic and sentiment:
215
- topics_in_absa.add(topic)
216
- all_sentiments_in_row.add(sentiment.lower()) # Store in lowercase for matching
217
-
218
- # Set the actual sentiment value (not 1/0)
219
- safe_topic_name = self._make_safe_column_name(topic)
220
- sentiment_col_name = f"S_{safe_topic_name}"
221
- if sentiment_col_name in self.transformed_data.columns:
222
- self.transformed_data.loc[idx, sentiment_col_name] = sentiment
223
-
224
- # Set Verbatim sentiment columns based on sentiments found in ABSA
225
- if any(sentiment in all_sentiments_in_row for sentiment in ['positive', 'positiv']):
226
- self.transformed_data.loc[idx, 'Verbatim_Positive'] = 1
227
- if any(sentiment in all_sentiments_in_row for sentiment in ['neutral']):
228
- self.transformed_data.loc[idx, 'Verbatim_Neutral'] = 1
229
- if any(sentiment in all_sentiments_in_row for sentiment in ['negative', 'negativ']):
230
- self.transformed_data.loc[idx, 'Verbatim_Negative'] = 1
231
-
232
- # Set T_ columns to 1 if topic exists in ABSA column, 0 otherwise
233
- for topic in topics_in_absa:
234
- safe_topic_name = self._make_safe_column_name(topic)
235
- topic_col_name = f"T_{safe_topic_name}"
236
- if topic_col_name in self.transformed_data.columns:
237
- self.transformed_data.loc[idx, topic_col_name] = 1
238
-
239
- # Process category columns
240
- categories_in_row = set()
241
- for c_col in self.category_cols:
242
- category_value = row.get(c_col)
243
- if pd.notna(category_value) and isinstance(category_value, str) and category_value.strip():
244
- categories = [cat.strip() for cat in category_value.split('|') if cat.strip()]
245
- categories_in_row.update(categories)
246
-
247
- # Set category binary values (always 1 if present in category column)
248
- for category in categories_in_row:
249
- safe_category_name = self._make_safe_column_name(category)
250
- category_col_name = f"C_{safe_category_name}"
251
- if category_col_name in self.transformed_data.columns:
252
- self.transformed_data.loc[idx, category_col_name] = 1
253
-
254
- return self.transformed_data.shape
255
-
256
- def _make_safe_column_name(self, name):
257
- """
258
- Convert a name to a safe column name by removing/replacing problematic characters.
259
- """
260
- # Replace spaces and special characters with underscores
261
- safe_name = re.sub(r'[^\w]', '_', str(name))
262
- # Remove multiple consecutive underscores
263
- safe_name = re.sub(r'_+', '_', safe_name)
264
- # Remove leading/trailing underscores
265
- safe_name = safe_name.strip('_')
266
- return safe_name
267
-
268
- def analyze_data(self):
269
- """
270
- Analyze the transformed data to provide insights.
271
- """
272
- if self.transformed_data is None:
273
- raise ValueError("No transformed data to analyze")
274
-
275
- # Count different types of columns
276
- topic_cols = [col for col in self.transformed_data.columns if col.startswith('T_')]
277
- sentiment_cols = [col for col in self.transformed_data.columns if col.startswith('S_')]
278
- category_cols = [col for col in self.transformed_data.columns if col.startswith('C_')]
279
- verbatim_cols = ['Verbatim_Positive', 'Verbatim_Neutral', 'Verbatim_Negative']
280
-
281
- # Calculate statistics
282
- topic_stats = {}
283
- for col in topic_cols:
284
- topic_stats[col] = self.transformed_data[col].sum()
285
-
286
- # For sentiment columns, count non-empty values
287
- sentiment_stats = {}
288
- for col in sentiment_cols:
289
- sentiment_stats[col] = (self.transformed_data[col] != "").sum()
290
-
291
- category_stats = {}
292
- for col in category_cols:
293
- category_stats[col] = self.transformed_data[col].sum()
294
-
295
- # Verbatim sentiment statistics
296
- verbatim_stats = {}
297
- for col in verbatim_cols:
298
- if col in self.transformed_data.columns:
299
- verbatim_stats[col] = self.transformed_data[col].sum()
300
-
301
- # Sort by frequency
302
- sorted_topics = sorted(topic_stats.items(), key=lambda x: x[1], reverse=True)
303
- sorted_sentiments = sorted(sentiment_stats.items(), key=lambda x: x[1], reverse=True)
304
- sorted_categories = sorted(category_stats.items(), key=lambda x: x[1], reverse=True)
305
- sorted_verbatim = sorted(verbatim_stats.items(), key=lambda x: x[1], reverse=True)
306
-
307
- # Prepare analysis summary
308
- analysis_text = f"**Analysis Results**\n\n"
309
- analysis_text += f"Total feedbacks: {len(self.transformed_data)}\n"
310
- analysis_text += f"Selected original columns: {len(self.selected_columns)}\n"
311
- analysis_text += f"Verbatim sentiment columns: 3 (Positive, Neutral, Negative)\n"
312
- analysis_text += f"Topic columns (T_): {len(topic_cols)}\n"
313
- analysis_text += f"Sentiment columns (S_): {len(sentiment_cols)}\n"
314
- analysis_text += f"Category columns (C_): {len(category_cols)}\n"
315
- analysis_text += f"Verbatim column used: {self.verbatim_column}\n\n"
316
-
317
- if self.selected_columns:
318
- analysis_text += f"**Included Original Columns:** {', '.join(self.selected_columns)}\n\n"
319
-
320
- # Verbatim sentiment analysis
321
- if sorted_verbatim:
322
- analysis_text += "**Verbatim Sentiment Distribution:**\n"
323
- for verbatim_col, count in sorted_verbatim:
324
- percentage = (count / len(self.transformed_data)) * 100
325
- analysis_text += f"- {verbatim_col}: {count} occurrences ({percentage:.1f}%)\n"
326
-
327
- # Topic analysis
328
- if sorted_topics:
329
- analysis_text += "\n**Top 10 Most Frequent Topics (T_):**\n"
330
- for topic_col, count in sorted_topics[:10]:
331
- analysis_text += f"- {topic_col}: {count} occurrences\n"
332
-
333
- # Category analysis
334
- if sorted_categories:
335
- analysis_text += "\n**Top 10 Most Frequent Categories (C_):**\n"
336
- for category_col, count in sorted_categories[:10]:
337
- analysis_text += f"- {category_col}: {count} occurrences\n"
338
-
339
- # Sentiment analysis
340
- if sorted_sentiments:
341
- analysis_text += "\n**Top 10 Most Frequent Sentiments (S_):**\n"
342
- for sentiment_col, count in sorted_sentiments[:10]:
343
- analysis_text += f"- {sentiment_col}: {count} sentiment values\n"
344
-
345
- return analysis_text
346
-
347
  def save_transformed_data(self, output_format='xlsx'):
348
- """
349
- Save the transformed data and return the file path.
350
- """
351
- if self.transformed_data is None:
352
  raise ValueError("No transformed data to save")
353
 
354
  # Create filename with original filename prefix and timestamp
355
- timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
356
- prefix = self.original_filename if self.original_filename else 'transformed_feedback'
357
 
358
  if output_format == 'xlsx':
359
- filename = f"{prefix}_transformed_topics_{timestamp}.xlsx"
360
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
361
- self.transformed_data.to_excel(temp_file.name, index=False)
362
  temp_file.close()
363
 
364
  final_path = os.path.join(tempfile.gettempdir(), filename)
@@ -367,9 +200,9 @@ class FeedbackTransformer:
367
  os.rename(temp_file.name, final_path)
368
 
369
  else: # csv
370
- filename = f"{prefix}_binary_matrix_{timestamp}.csv"
371
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
372
- self.transformed_data.to_csv(temp_file.name, index=False)
373
  temp_file.close()
374
 
375
  final_path = os.path.join(tempfile.gettempdir(), filename)
@@ -382,257 +215,175 @@ class FeedbackTransformer:
382
 
383
  return final_path
384
 
 
 
385
 
386
- # Gradio interface functions
387
- def get_column_selector(file_obj):
388
- """
389
- Get a combined column preview and selector interface.
390
- """
391
  try:
392
- if file_obj is None:
393
- return gr.CheckboxGroup(
394
- choices=[],
395
- value=[],
396
- label="πŸ“‹ Select Columns to Include",
397
- info="Upload a file first to see available columns"
398
- )
399
-
400
- # Read first few rows to get column names
401
- file_name = file_obj if isinstance(file_obj, str) else (file_obj.name if hasattr(file_obj, 'name') else 'unknown')
402
- _, file_ext = os.path.splitext(file_name)
403
-
404
- if file_ext.lower() in ['.xlsx', '.xls']:
405
- df = pd.read_excel(file_obj, nrows=5)
406
- elif file_ext.lower() == '.csv':
407
- try:
408
- df = pd.read_csv(file_obj, nrows=5)
409
- except:
410
- df = pd.read_csv(file_obj, sep='\t', nrows=5)
411
- else:
412
- df = pd.read_csv(file_obj, sep='\t', nrows=5)
413
-
414
- columns = list(df.columns)
415
- column_choices = [f"{i+1:2d}. {col}" for i, col in enumerate(columns)]
416
-
417
- return gr.CheckboxGroup(
418
- choices=column_choices,
419
- value=[],
420
- label=f"πŸ“‹ Select Columns to Include ({len(columns)} available)",
421
- info="Choose which original columns to include in the transformed file (in addition to feedback_id).",
422
- elem_classes=["column-selector"]
423
  )
424
-
425
  except Exception as e:
426
- return gr.CheckboxGroup(
427
- choices=[],
428
- value=[],
429
- label="πŸ“‹ Select Columns to Include",
430
- info=f"Error reading file: {str(e)}"
431
- )
432
-
433
-
434
- def extract_column_names(selected_display_names):
435
- """
436
- Extract actual column names from the numbered display format.
437
- """
438
- if not selected_display_names:
439
- return []
440
-
441
- actual_names = []
442
- for display_name in selected_display_names:
443
- if '. ' in display_name:
444
- actual_name = display_name.split('. ', 1)[1]
445
- actual_names.append(actual_name)
446
- else:
447
- actual_names.append(display_name)
448
 
449
- return actual_names
 
 
 
 
450
 
 
 
 
451
 
452
- def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
453
- text_column, recommendation_column, output_format, analyze_data, selected_columns):
454
- """
455
- Main processing function for Gradio interface.
456
- """
457
  try:
458
- # Extract actual column names from display format
459
- actual_column_names = extract_column_names(selected_columns)
460
-
461
- # Initialize transformer
462
- transformer = FeedbackTransformer(
463
- topic_prefix=topic_prefix,
464
- sentiment_prefix=sentiment_prefix,
465
- category_prefix=category_prefix,
466
- text_column=text_column,
467
- recommendation_column=recommendation_column
468
- )
469
-
470
- # Load data
471
- rows, cols = transformer.load_data(file_obj)
472
- status_msg = f"βœ… Loaded {rows} rows and {cols} columns\n"
473
-
474
- # Set selected columns for inclusion
475
- transformer.set_selected_columns(actual_column_names)
476
- status_msg += f"πŸ“‹ Selected {len(actual_column_names)} original columns for inclusion\n"
477
- if actual_column_names:
478
- status_msg += f" Selected columns: {', '.join(actual_column_names)}\n"
479
-
480
- # Identify columns
481
- col_info = transformer.identify_columns()
482
- status_msg += f"\nπŸ“Š Found columns:\n"
483
- status_msg += f"- Topic columns: {len(col_info['topic_cols'])}\n"
484
- status_msg += f"- Sentiment columns: {len(col_info['sentiment_cols'])}\n"
485
- status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"
486
- status_msg += f"- Verbatim column: {col_info['verbatim_column']}\n"
487
-
488
- # Extract unique topics, categories, and sentiments
489
- num_topics, num_categories, num_sentiments = transformer.extract_unique_topics_and_categories()
490
- status_msg += f"\n🎯 Found {num_topics} unique topics\n"
491
- status_msg += f"🏷️ Found {num_categories} unique categories\n"
492
- status_msg += f"πŸ’­ Found {num_sentiments} unique sentiments\n"
493
-
494
- # Transform data
495
- shape = transformer.transform_data()
496
- status_msg += f"\n✨ Transformed data shape: {shape[0]} rows Γ— {shape[1]} columns\n"
497
- status_msg += f"πŸ“Š Binary matrix created with T_, S_, C_ prefixes and Verbatim sentiment columns\n"
498
- status_msg += f"πŸ”§ T_ columns: 1 if topic present in ABSA column, 0 otherwise\n"
499
- status_msg += f"πŸ”§ S_ columns: contain actual sentiment values (not 1/0)\n"
500
- status_msg += f"πŸ”§ C_ columns: 1 if category assigned, 0 otherwise\n"
501
- status_msg += f"πŸ”§ Verbatim_Positive/Neutral/Negative: 1 if respective sentiment found in ABSA, 0 otherwise\n"
502
-
503
- # Analyze if requested
504
- analysis_result = ""
505
- if analyze_data:
506
- analysis_result = transformer.analyze_data()
507
-
508
- # Save transformed data
509
- output_file = transformer.save_transformed_data(output_format)
510
- status_msg += f"\nπŸ’Ύ File saved successfully: {os.path.basename(output_file)}\n"
511
- #status_msg += f"πŸ“₯ File download should start automatically\n"
512
-
513
- return status_msg, analysis_result, output_file
514
-
515
  except Exception as e:
516
- error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
517
- return error_msg, "", None
 
518
 
519
-
520
- # Create Gradio interface
521
- with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
522
- .column-selector .form-check {
523
- display: block !important;
524
- margin-bottom: 8px !important;
525
- }
526
- .column-selector .form-check-input {
527
- margin-right: 8px !important;
528
- }
529
- """) as demo:
530
  gr.Markdown("""
531
- # πŸ“Š Binary Matrix Feedback Transformer
532
- Transform feedback data with delimited topic and sentiment columns into binary matrix format.
533
-
534
- ### πŸ”§ Processing Logic:
535
- - **Verbatim_Positive/Neutral/Negative**: Set to 1 if respective sentiment is found in ABSA column, 0 otherwise
536
- - **T_ Columns**: Set to 1 if topic is present in ABSA column, 0 otherwise
537
- - **S_ Columns**: One column per topic (e.g., S_Allgemeine_Zufriedenheit) containing actual sentiment values
538
- - **C_ Columns**: Set to 1 if category is assigned, 0 otherwise
539
-
540
- ### πŸ“‹ Data Format Requirements:
541
- - **Topics**: Delimited by `|` (pipe) in "Topics:" columns (optional)
542
- - **Sentiments**: Format `Topic::Sentiment|Topic2::Sentiment2` in ABSA columns
543
- - **Categories**: Delimited by `|` (pipe) in "Categories:" columns
544
-
545
- ### πŸ†• Key Logic:
546
- - **Verbatim_** columns detect overall sentiment presence regardless of topic
547
- - **T_** columns based on ABSA column presence (topics that have sentiment data)
548
- - **S_** columns contain actual sentiment values (not binary 1/0)
549
- - No automatic column renaming for "Topic:" prefix
550
  """)
551
-
552
  with gr.Row():
553
  with gr.Column(scale=1):
554
- # File upload
555
- gr.Markdown("### πŸ“‹ 1. Source file upload")
556
- input_file = gr.File(
557
  label="Upload Input File",
558
- file_types=[".xlsx", ".xls", ".csv", ".txt"],
559
  type="filepath"
560
  )
561
-
562
- # Combined column selector
563
- gr.Markdown("### πŸ“‹ 2. Column Selection")
 
 
 
 
 
 
 
 
 
 
 
564
  column_selector = gr.CheckboxGroup(
 
565
  choices=[],
566
  value=[],
567
- label="Select Columns to Include",
568
- info="Upload a file first to see available columns"
 
569
  )
570
-
571
- with gr.Column(scale=1):
572
- # Configuration parameters
573
- gr.Markdown("### βš™οΈ 3. Configuration")
574
-
575
- topic_prefix = gr.Textbox(
576
- label="Topic Column Identifier",
577
- value="Topic:",
578
- info="Text to identify topic columns (for reference only)"
579
  )
580
-
581
- sentiment_prefix = gr.Textbox(
582
- label="Sentiment Column Prefix (ABSA)",
583
- value="ABSA:",
584
- info="Prefix to identify sentiment columns (format: Topic::Sentiment)"
585
- )
586
-
587
- category_prefix = gr.Textbox(
588
- label="Category Column Prefix",
589
- value="Categories:",
590
- info="Prefix to identify category columns"
591
- )
592
-
593
- text_column = gr.Textbox(
594
- label="Text/Verbatim Column Pattern",
595
- value="TEXT",
596
- info="Pattern to identify verbatim text column (for reference only)"
597
- )
598
-
599
- recommendation_column = gr.Textbox(
600
- label="Recommendation Column Name",
601
- value="Q4_Weiterempfehlung",
602
- info="Column containing recommendation scores (for reference only)"
603
- )
604
-
605
- output_format = gr.Radio(
606
  label="Output Format",
607
- choices=["xlsx", "csv"],
608
- value="xlsx"
609
  )
610
-
611
- analyze_checkbox = gr.Checkbox(
612
- label="Analyze transformed data",
613
- value=True
 
 
614
  )
615
-
616
- # Transform button
617
- transform_btn = gr.Button("πŸ”„ 4. Transform to Binary Matrix & Download", variant="primary", size="lg")
618
-
619
- # Output sections
620
- with gr.Row():
621
- with gr.Column():
622
- status_output = gr.Textbox(
 
 
 
 
 
 
 
 
 
 
 
 
623
  label="Processing Status",
624
- lines=12,
625
- interactive=False
626
- )
627
-
628
- with gr.Column():
629
- analysis_output = gr.Markdown(
630
- label="Data Analysis"
631
  )
632
-
633
- # Download section
634
- with gr.Row():
635
- with gr.Column():
636
  gr.Markdown("### πŸ“₯ Download Status")
637
  gr.Markdown("Please click on the link inside the output file size value to download the transformed file (the number value on the right hand side below). You may need to right click and select Save Link As (or something similar)")
638
  output_file = gr.File(
@@ -640,64 +391,45 @@ with gr.Blocks(title="Binary Matrix Feedback Transformer", css="""
640
  interactive=False,
641
  visible=True
642
  )
643
-
644
- # Event handlers
645
- input_file.change(
646
- fn=get_column_selector,
647
- inputs=[input_file],
 
 
 
 
 
648
  outputs=[column_selector]
649
  )
650
-
651
- transform_btn.click(
652
- fn=process_file,
653
- inputs=[
654
- input_file,
655
- topic_prefix,
656
- sentiment_prefix,
657
- category_prefix,
658
- text_column,
659
- recommendation_column,
660
- output_format,
661
- analyze_checkbox,
662
- column_selector
663
- ],
664
- outputs=[status_output, analysis_output, output_file]
665
  )
666
-
667
- # Examples section
668
- gr.Markdown("""
669
- ### πŸ“ Example Transformations:
670
-
671
- **Input Data:**
672
- ```
673
- | feedback_id | ABSA: Sentiments | Categories: Issues |
674
- | 1 | Service::Negative|Quality::Positive | Issues|Support |
675
- ```
676
-
677
- **Output Binary Matrix:**
678
- ```
679
- | feedback_id | Verbatim_Positive | Verbatim_Neutral | Verbatim_Negative | T_Service | T_Quality | S_Service | S_Quality | C_Issues | C_Support |
680
- | 1 | 1 | 0 | 1 | 1 | 1 | Negative | Positive | 1 | 1 |
681
- ```
682
-
683
- ### πŸ’‘ Column Logic:
684
- - **Verbatim_Positive**: 1 if any "Positive"/"Positiv" sentiment found in ABSA
685
- - **Verbatim_Neutral**: 1 if any "Neutral" sentiment found in ABSA
686
- - **Verbatim_Negative**: 1 if any "Negative"/"Negativ" sentiment found in ABSA
687
- - **T_[topic_name]**: 1 if topic exists in ABSA column, 0 otherwise
688
- - **S_[topic_name]**: Actual sentiment value for that topic (e.g., "Positive", "Negative")
689
- - **C_[category_name]**: 1 if category is assigned, 0 otherwise
690
- - Safe column names (special characters replaced with underscores)
691
-
692
- ### πŸ” Key Changes Made:
693
- - **NEW**: Added Verbatim_Positive, Verbatim_Neutral, Verbatim_Negative columns
694
- - These columns are set to 1 if the respective sentiment is found anywhere in the ABSA column
695
- - Supports both English (Positive/Negative/Neutral) and German (Positiv/Negativ) sentiment detection
696
- - Removed automatic "Topic:" column renaming logic
697
- - T_ columns are now binary (1/0) based on topic existence in ABSA column
698
- - Topics are extracted from ABSA sentiment data for T_ column creation
699
  """)
700
 
701
- # Launch the app
702
  if __name__ == "__main__":
703
- demo.launch()
 
 
 
 
2
  import pandas as pd
3
  import numpy as np
4
  import os
 
 
 
 
 
5
  import re
6
+ import tempfile
7
+ import shutil
8
+ from datetime import datetime
9
+ from typing import List, Tuple, Dict, Any
10
+ import json
11
+ from io import BytesIO
12
+
13
class CSVBinaryTransformer:
    """Transforms an uploaded CSV of feedback into binary topic/category columns."""

    def __init__(self):
        # Raw dataframe from the uploaded CSV; populated by load_csv().
        self.df = None
        # Base name (without extension) of the uploaded file.
        self.original_filename = None
        # Holds the transformed dataframe once processing has run.
        self.processed_df = None
18
+
19
def load_csv(self, file_path: str) -> Tuple[pd.DataFrame, List[str]]:
    """Load a CSV file; return a 10-row preview and checkbox column choices.

    Side effects: stores the full dataframe on ``self.df`` and the file's
    base name (no extension) on ``self.original_filename``. Any failure is
    surfaced to the UI as a ``gr.Error``.
    """
    try:
        frame = pd.read_csv(file_path)
        self.df = frame
        self.original_filename = os.path.splitext(os.path.basename(file_path))[0]
        # (label, value) tuples for the Gradio CheckboxGroup.
        column_choices = [(name, name) for name in frame.columns.tolist()]
        return frame.head(10), column_choices
    except Exception as e:
        raise gr.Error(f"Error loading CSV: {str(e)}")
31
+
32
def select_all_columns(self) -> List[str]:
    """Return every loaded column name (backs the "select all" button)."""
    return list(self.df.columns) if self.df is not None else []
37
+
38
def deselect_all_columns(self) -> List[str]:
    """Return an empty selection (backs the "deselect all" button)."""
    return []
41
+
42
def process_absa_columns(self, df: pd.DataFrame) -> pd.DataFrame:
    """Process ABSA prefixed columns to create sentiment and topic columns.

    Each ABSA cell holds ``Topic::Sentiment`` pairs separated by ``|``.
    Adds three binary ``Verbatim_Positive/Neutral/Negative`` columns (1 when
    the sentiment word appears in that row's ABSA data) plus one
    ``S_<topic>`` column per topic holding the raw sentiment string.

    Mutates and returns *df*. Improvements over the previous version: the
    cells are parsed once instead of twice, and the S_ columns are created
    in sorted order so the output layout is deterministic.
    """
    absa_columns = [col for col in df.columns if col.startswith('ABSA')]
    if not absa_columns:
        return df

    df['Verbatim_Positive'] = 0
    df['Verbatim_Neutral'] = 0
    df['Verbatim_Negative'] = 0

    # Single parse pass: remember every (row, topic, sentiment) assignment
    # so the columns can be created once, without re-parsing every cell.
    assignments = []  # (row index, safe topic name, sentiment value)
    topics = set()
    for col in absa_columns:
        for idx, value in df[col].items():
            if pd.isna(value):
                continue
            pairs = [p.strip() for p in str(value).split('|') if p.strip()]
            for pair in pairs:
                if '::' not in pair:
                    continue
                topic_part, sentiment_part = pair.split('::', 1)
                topic = topic_part.strip()
                sentiment = sentiment_part.strip()
                if not (topic and sentiment):
                    continue
                # Clean topic name for column naming.
                safe_topic = re.sub(r'[^\w]', '_', topic).strip('_')
                topics.add(safe_topic)
                assignments.append((idx, safe_topic, sentiment))

                # Verbatim flags: first matching word wins, as before.
                sentiment_lower = sentiment.lower()
                if 'positive' in sentiment_lower:
                    df.at[idx, 'Verbatim_Positive'] = 1
                elif 'negative' in sentiment_lower:
                    df.at[idx, 'Verbatim_Negative'] = 1
                elif 'neutral' in sentiment_lower:
                    df.at[idx, 'Verbatim_Neutral'] = 1

    # Create per-topic sentiment columns deterministically, then fill them.
    for safe_topic in sorted(topics):
        col_name = f"S_{safe_topic}"
        if col_name not in df.columns:
            df[col_name] = ""
    for idx, safe_topic, sentiment in assignments:
        df.at[idx, f"S_{safe_topic}"] = sentiment

    return df
117
+
118
def process_categories_columns(self, df: pd.DataFrame) -> pd.DataFrame:
    """Process Categories prefixed columns to create binary category columns.

    Category cells hold names separated by ``,``, ``;`` or ``|``. Adds one
    ``C_<category>`` column per distinct name, set to 1 on rows whose cell
    contains that exact name (case-insensitive). Mutates and returns *df*.

    Bug fix: the previous substring test (``category.lower() in
    value.lower()``) wrongly flagged e.g. category "Art" on a row whose
    cell only contained "Artisan"; matching is now token-exact.
    """
    category_columns = [col for col in df.columns if col.startswith('Categories')]
    if not category_columns:
        return df

    def _tokens(cell):
        # Split a raw cell into trimmed, non-empty category names.
        return [t.strip() for t in re.split(r'[,;|]', cell) if t.strip()]

    # Tokenize every cell once; row_tokens[idx] is the set of lowercased
    # category names found anywhere in that row's category columns.
    all_categories = set()
    row_tokens = {}
    for col in category_columns:
        for idx, value in df[col].items():
            if pd.isna(value) or not isinstance(value, str):
                continue
            names = _tokens(value)
            all_categories.update(names)
            row_tokens.setdefault(idx, set()).update(n.lower() for n in names)

    # One binary column per category, created in sorted (deterministic) order.
    for category in sorted(all_categories):
        wanted = category.lower()
        df[f"C_{category}"] = [
            1 if wanted in row_tokens.get(idx, ()) else 0 for idx in df.index
        ]

    return df
151
+
152
+ def process_topics_column(self, df: pd.DataFrame, topics_column: str) -> pd.DataFrame:
153
+ """Process specified topics column to create binary topic columns"""
154
+ if not topics_column or topics_column not in df.columns:
155
+ return df
156
+
157
+ # Collect all unique topics
158
+ all_topics = set()
159
+
160
+ for value in df[topics_column].dropna():
161
+ if isinstance(value, str):
162
+ # Split by common delimiters
163
+ topics = re.split(r'[,;|]', value)
164
+ for topic in topics:
165
+ topic = topic.strip()
166
+ if topic:
167
+ all_topics.add(topic)
168
+
169
+ # Create binary columns for each topic
170
+ for topic in all_topics:
171
+ col_name = f"T_{topic}"
172
+ df[col_name] = 0
173
+
174
+ for idx, value in df[topics_column].items():
175
+ if pd.isna(value):
176
+ continue
177
+ if isinstance(value, str) and topic.lower() in value.lower():
178
+ df.at[idx, col_name] = 1
179
+
180
+ return df
181
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  def save_transformed_data(self, output_format='xlsx'):
183
+ """Save the transformed data and return the file path - using exact same method as working version"""
184
+ if self.processed_df is None:
 
 
185
  raise ValueError("No transformed data to save")
186
 
187
  # Create filename with original filename prefix and timestamp
188
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
189
+ prefix = self.original_filename if self.original_filename else 'transformed_data'
190
 
191
  if output_format == 'xlsx':
192
+ filename = f"{prefix}_BinaryTransformation_{timestamp}.xlsx"
193
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
194
+ self.processed_df.to_excel(temp_file.name, index=False)
195
  temp_file.close()
196
 
197
  final_path = os.path.join(tempfile.gettempdir(), filename)
 
200
  os.rename(temp_file.name, final_path)
201
 
202
  else: # csv
203
+ filename = f"{prefix}_BinaryTransformation_{timestamp}.csv"
204
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
205
+ self.processed_df.to_csv(temp_file.name, index=False)
206
  temp_file.close()
207
 
208
  final_path = os.path.join(tempfile.gettempdir(), filename)
 
215
 
216
  return final_path
217
 
218
# Single module-level transformer instance shared by every Gradio callback
# below. NOTE(review): this is global state — concurrent users of the app
# share one instance, so one user's upload can clobber another's; confirm
# this is acceptable for the intended deployment.
transformer = CSVBinaryTransformer()
220
 
221
def handle_file_upload(file):
    """Gradio callback: load the uploaded CSV and populate the column picker.

    Args:
        file: value from the gr.File input — a filesystem path string with
            type="filepath", or (on older Gradio versions) a tempfile
            wrapper exposing a .name attribute.

    Returns:
        (preview_html, checkbox_update, status_message) for the original
        preview HTML, the column CheckboxGroup, and the status textbox.
    """
    if file is None:
        return None, gr.update(choices=[], value=[]), "Please upload a CSV file"

    try:
        # gr.File(type="filepath") passes a plain string; accept both the
        # string and the legacy object form (same pattern as load_data in
        # the FeedbackTransformer class above).
        path = file if isinstance(file, str) else file.name
        preview_df, column_choices = transformer.load_csv(path)
        preview_html = preview_df.to_html(classes="table table-striped", table_id="upload-preview")

        # Reveal the checkbox group with the freshly-loaded columns and no
        # pre-selection.
        return (
            preview_html,
            gr.update(choices=column_choices, value=[], visible=True),
            f"βœ… Successfully loaded CSV with {len(transformer.df)} rows and {len(transformer.df.columns)} columns"
        )
    except Exception as e:
        # Surface the failure in the status box and hide the column picker.
        return None, gr.update(choices=[], value=[], visible=False), f"❌ Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
def select_all():
    """Return a CheckboxGroup update that ticks every loaded column."""
    if transformer.df is None:
        # Nothing loaded yet — clear instead of selecting.
        return gr.update(value=[])
    return gr.update(value=transformer.select_all_columns())
246
 
247
def deselect_all():
    """Return a CheckboxGroup update that clears every selection."""
    cleared = transformer.deselect_all_columns()
    return gr.update(value=cleared)
250
 
251
def process_transformation(selected_columns, topics_column, export_format):
    """Gradio callback: run all transformations and save the result file.

    Args:
        selected_columns: columns ticked in the CheckboxGroup.
        topics_column: name of the column holding topics (may be empty).
        export_format: radio label, "CSV (.csv)" or "Excel (.xlsx)".

    Returns:
        (preview_html, output_file_path, status_message); the first two are
        None when validation fails or an exception occurs.
    """
    try:
        if transformer.df is None:
            return None, None, "❌ Error: No CSV file loaded"

        if not selected_columns:
            return None, None, "❌ Error: Please select at least one column"

        # Work on a copy restricted to the user's column selection, then
        # apply each transformation stage in turn.
        processed_df = transformer.df[selected_columns].copy()
        processed_df = transformer.process_absa_columns(processed_df)
        processed_df = transformer.process_categories_columns(processed_df)
        processed_df = transformer.process_topics_column(processed_df, topics_column)

        # Store for save_transformed_data, which reads self.processed_df.
        transformer.processed_df = processed_df

        preview_html = processed_df.head(20).to_html(classes="table table-striped", table_id="preview-table")

        # BUG FIX: the old label mangling
        # export_format.lower().replace(' (.', '').replace(')', '')
        # turned "Excel (.xlsx)" into "excelxlsx", which never matched
        # 'xlsx' in save_transformed_data, so Excel exports silently fell
        # through to the CSV branch. Detect the format explicitly instead.
        fmt = 'xlsx' if 'xlsx' in export_format.lower() else 'csv'
        output_file = transformer.save_transformed_data(fmt)

        success_msg = f"βœ… Transformation completed! Generated file: {os.path.basename(output_file)}"
        success_msg += f"\nπŸ“Š Processed {len(transformer.processed_df)} rows with {len(transformer.processed_df.columns)} columns"
        success_msg += f"\nπŸ’Ύ File saved successfully"
        success_msg += f"\nπŸ“₯ File download should start automatically"

        return preview_html, output_file, success_msg

    except Exception as e:
        import traceback
        error_msg = f"❌ Error during transformation: {str(e)}\n\n{traceback.format_exc()}"
        return None, None, error_msg
292
 
293
+ # Create Gradio interface - using similar structure to working version
294
+ with gr.Blocks(title="CSV Binary Transformation Tool", theme=gr.themes.Soft()) as app:
 
 
 
 
 
 
 
 
 
295
  gr.Markdown("""
296
+ # πŸ“Š CSV Binary Transformation Tool
297
+
298
+ This tool transforms CSV files by creating binary columns for sentiment analysis, categories, and topics.
299
+
300
+ ## Features:
301
+ - **ABSA Processing**: Creates sentiment columns and topic-sentiment combinations
302
+ - **Category Processing**: Creates binary columns for each category
303
+ - **Topic Processing**: Creates binary columns for each topic
304
+ - **Flexible Export**: Support for CSV and Excel formats
 
 
 
 
 
 
 
 
 
 
305
  """)
306
+
307
  with gr.Row():
308
  with gr.Column(scale=1):
309
+ # File upload section
310
+ gr.Markdown("### 1. Upload CSV File")
311
+ file_input = gr.File(
312
  label="Upload Input File",
313
+ file_types=[".csv"],
314
  type="filepath"
315
  )
316
+ upload_status = gr.Textbox(
317
+ label="Upload Status",
318
+ interactive=False,
319
+ lines=2
320
+ )
321
+
322
+ # Column selection section
323
+ gr.Markdown("### 2. Select Columns")
324
+ gr.Markdown("*Choose which columns from your CSV to include in the output file*")
325
+
326
+ with gr.Row():
327
+ select_all_btn = gr.Button("βœ“ Select All", size="sm", variant="secondary")
328
+ deselect_all_btn = gr.Button("βœ— Deselect All", size="sm", variant="secondary")
329
+
330
  column_selector = gr.CheckboxGroup(
331
+ label="Choose columns to include in output",
332
  choices=[],
333
  value=[],
334
+ interactive=True,
335
+ visible=False,
336
+ info="Select the columns you want to include in the transformed output file"
337
  )
338
+
339
+ # Topics column input
340
+ gr.Markdown("### 3. Specify Topics Column")
341
+ topics_column_input = gr.Textbox(
342
+ label="Topics Column Name",
343
+ placeholder="Enter the name of the column containing topics",
344
+ info="This column will be used to create T_<topic> binary columns"
 
 
345
  )
346
+
347
+ # Export options
348
+ gr.Markdown("### 4. Export Settings")
349
+ export_format = gr.Radio(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  label="Output Format",
351
+ choices=["CSV (.csv)", "Excel (.xlsx)"],
352
+ value="Excel (.xlsx)"
353
  )
354
+
355
+ # Process button
356
+ process_btn = gr.Button(
357
+ "πŸš€ Transform Data",
358
+ variant="primary",
359
+ size="lg"
360
  )
361
+
362
+ with gr.Column(scale=2):
363
+ # Preview sections
364
+ gr.Markdown("### File Preview")
365
+
366
+ with gr.Tabs():
367
+ with gr.Tab("Original Data"):
368
+ original_preview = gr.HTML(
369
+ label="Original Data Preview (First 10 rows)",
370
+ value="<p>No file uploaded yet</p>"
371
+ )
372
+
373
+ with gr.Tab("Transformed Data"):
374
+ transformed_preview = gr.HTML(
375
+ label="Transformed Data Preview (First 20 rows)",
376
+ value="<p>No transformation performed yet</p>"
377
+ )
378
+
379
+ # Status and download
380
+ process_status = gr.Textbox(
381
  label="Processing Status",
382
+ interactive=False,
383
+ lines=6
 
 
 
 
 
384
  )
385
+
386
+ # Download section - using exact same setup as working version
 
 
387
  gr.Markdown("### πŸ“₯ Download Status")
388
  gr.Markdown("Please click on the link inside the output file size value to download the transformed file (the number value on the right hand side below). You may need to right click and select Save Link As (or something similar)")
389
  output_file = gr.File(
 
391
  interactive=False,
392
  visible=True
393
  )
394
+
395
+ # Event handlers - same pattern as working version
396
+ file_input.change(
397
+ fn=handle_file_upload,
398
+ inputs=[file_input],
399
+ outputs=[original_preview, column_selector, upload_status]
400
+ )
401
+
402
+ select_all_btn.click(
403
+ fn=select_all,
404
  outputs=[column_selector]
405
  )
406
+
407
+ deselect_all_btn.click(
408
+ fn=deselect_all,
409
+ outputs=[column_selector]
 
 
 
 
 
 
 
 
 
 
 
410
  )
411
+
412
+ process_btn.click(
413
+ fn=process_transformation,
414
+ inputs=[column_selector, topics_column_input, export_format],
415
+ outputs=[transformed_preview, output_file, process_status]
416
+ )
417
+
418
+ # Add custom CSS for better table styling
419
+ app.load(js="""
420
+ function() {
421
+ const style = document.createElement('style');
422
+ style.textContent = `
423
+ .table { font-size: 12px; }
424
+ .table th, .table td { padding: 4px 8px; }
425
+ #upload-preview, #preview-table { max-height: 400px; overflow-y: auto; }
426
+ `;
427
+ document.head.appendChild(style);
428
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  """)
430
 
 
431
if __name__ == "__main__":
    # share=True publishes a temporary public Gradio link in addition to the
    # local server; max_file_size caps uploads at 50 MB.
    app.launch(
        share=True,
        max_file_size="50mb"
    )