fedec65 committed on
Commit 6e492e1 · verified · 1 Parent(s): 4ff500a

Create app.py

Files changed (1)
  1. app.py +488 -0
app.py ADDED
@@ -0,0 +1,488 @@
import gradio as gr
import pandas as pd
import numpy as np
import os
import traceback
from typing import Tuple, Dict, Any, Optional
import tempfile


class FeedbackTransformer:
    """
    A class to transform feedback data with topic and sentiment columns
    into a binary format where each topic is a separate column.
    """

    def __init__(self,
                 topic_prefix="TOPIC_",
                 sentiment_prefix="SENTIMENT_",
                 category_prefix="Categories:",
                 text_column="TEXT",
                 recommendation_column="Q4_Weiterempfehlung"):
        """
        Initialize the FeedbackTransformer with column specifications.
        """
        self.topic_prefix = topic_prefix
        self.sentiment_prefix = sentiment_prefix
        self.category_prefix = category_prefix
        self.text_column = text_column
        self.recommendation_column = recommendation_column
        self.data = None
        self.transformed_data = None
        self.topic_cols = []
        self.sentiment_cols = []
        self.category_cols = []
        self.unique_topics = set()

    def load_data(self, file_obj):
        """
        Load data from the uploaded file object or file path.
        """
        if file_obj is None:
            raise ValueError("No file uploaded")

        # Gradio's File component may hand over a path string or a file-like object
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        _, file_ext = os.path.splitext(file_path)

        # Read the data based on file type
        try:
            if file_ext.lower() in ['.xlsx', '.xls']:
                self.data = pd.read_excel(file_path)
            elif file_ext.lower() == '.csv':
                # Try comma delimiter first
                try:
                    self.data = pd.read_csv(file_path, encoding='utf-8')
                except Exception:
                    # If comma fails, try tab delimiter
                    self.data = pd.read_csv(file_path, sep='\t', encoding='utf-8')
            else:
                # Default to tab-delimited
                self.data = pd.read_csv(file_path, sep='\t', encoding='utf-8')
        except Exception as e:
            raise ValueError(f"Error reading file: {str(e)}")

        return len(self.data), len(self.data.columns)

    def identify_columns(self):
        """
        Identify topic, category, and sentiment columns in the data.
        """
        if self.data is None:
            raise ValueError("Data not loaded")

        # Extract columns based on prefixes
        self.topic_cols = [col for col in self.data.columns if self.topic_prefix in col]
        self.sentiment_cols = [col for col in self.data.columns if self.sentiment_prefix in col]
        self.category_cols = [col for col in self.data.columns if col.startswith(self.category_prefix)]

        # If no columns match the prefixes, all columns are still returned for manual selection
        all_cols = list(self.data.columns)

        return {
            'topic_cols': self.topic_cols,
            'sentiment_cols': self.sentiment_cols,
            'category_cols': self.category_cols,
            'all_columns': all_cols
        }

    def extract_unique_topics(self):
        """
        Extract all unique topics from the topic columns.
        """
        self.unique_topics = set()

        # Extract from topic columns
        for col in self.topic_cols:
            self.unique_topics.update(self.data[col].dropna().unique())

        # Also extract from category columns if they exist
        for col in self.category_cols:
            self.unique_topics.update(self.data[col].dropna().unique())

        # Remove empty topics
        self.unique_topics = {t for t in self.unique_topics if isinstance(t, str) and t.strip()}

        return len(self.unique_topics)

    @staticmethod
    def create_column_name(topic):
        """
        Create a standardized column name from a topic string.
        """
        # Remove special characters and standardize
        topic_clean = str(topic).strip()
        # Remove brackets and special characters
        topic_clean = topic_clean.replace('[', '').replace(']', '').replace('(', '').replace(')', '')
        topic_clean = topic_clean.replace('**', '').replace('*', '')
        topic_clean = topic_clean.replace('.', '_').replace(' ', '_').replace('&', 'and')
        topic_clean = topic_clean.replace(':', '_').replace('-', '_').replace('/', '_')
        # Collapse multiple underscores
        while '__' in topic_clean:
            topic_clean = topic_clean.replace('__', '_')
        return topic_clean.lower().strip('_')
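
    # Illustrative example (hypothetical topic string, not taken from the data):
    #   create_column_name("[**WORKSHOP] Service & Support: Response-Time")
    #   -> "workshop_service_and_support_response_time"
    # Brackets and asterisks are stripped, "&" becomes "and", and the remaining
    # separators collapse into single underscores before lowercasing.
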
    def transform_data(self):
        """
        Transform the data into binary topic columns with sentiment values.
        """
        if not self.unique_topics:
            self.extract_unique_topics()

        # Create output dataframe with feedback_id
        self.transformed_data = pd.DataFrame({'feedback_id': range(1, len(self.data) + 1)})

        # Initialize all topic columns to 0
        for topic in sorted(self.unique_topics):
            topic_col = self.create_column_name(topic)
            self.transformed_data[topic_col] = 0
            self.transformed_data[f'{topic_col}_sentiment'] = None

        # Fill in the data from topic columns
        for idx, row in self.data.iterrows():
            # Process topic columns with sentiments
            for i, t_col in enumerate(self.topic_cols):
                topic = row.get(t_col)

                # Find corresponding sentiment column
                if i < len(self.sentiment_cols):
                    sentiment = row.get(self.sentiment_cols[i])
                else:
                    sentiment = None

                if pd.notna(topic) and isinstance(topic, str) and topic.strip():
                    topic_col = self.create_column_name(topic)
                    if topic_col in self.transformed_data.columns:
                        self.transformed_data.loc[idx, topic_col] = 1

                        # Convert sentiment to numeric value
                        if pd.notna(sentiment) and isinstance(sentiment, str):
                            sentiment_lower = sentiment.lower()
                            if 'positive' in sentiment_lower:
                                self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 1
                            elif 'negative' in sentiment_lower:
                                self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 0
                            elif 'neutral' in sentiment_lower:
                                self.transformed_data.loc[idx, f'{topic_col}_sentiment'] = 0.5

            # Process category columns (these typically don't have sentiments)
            for c_col in self.category_cols:
                category = row.get(c_col)
                if pd.notna(category) and isinstance(category, str) and category.strip():
                    category_col = self.create_column_name(category)
                    if category_col in self.transformed_data.columns:
                        self.transformed_data.loc[idx, category_col] = 1

        # Add original text if available
        if self.text_column in self.data.columns:
            self.transformed_data['original_text'] = self.data[self.text_column]

        # Add recommendation score if available
        if self.recommendation_column in self.data.columns:
            self.transformed_data['recommendation_score'] = self.data[self.recommendation_column]

        return self.transformed_data.shape
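
    # Sketch of the resulting layout for a single hypothetical feedback row that
    # mentions "Beratung" positively and "Preis" negatively:
    #   feedback_id | beratung | beratung_sentiment | preis | preis_sentiment | ...
    #        1      |    1     |         1          |   1   |        0        | ...
    # Topics that a row does not mention keep 0 and a sentiment of None.
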
    def analyze_data(self):
        """
        Analyze the transformed data to provide insights.
        """
        if self.transformed_data is None:
            raise ValueError("No transformed data to analyze")

        # Identify topic columns
        topic_cols = [col for col in self.transformed_data.columns
                      if col != 'feedback_id' and
                      col != 'original_text' and
                      col != 'recommendation_score' and
                      not col.endswith('_sentiment')]

        # Count occurrences of each topic
        topic_counts = {}
        for topic in topic_cols:
            topic_counts[topic] = self.transformed_data[topic].sum()

        # Sort topics by frequency
        sorted_topics = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)

        # Prepare analysis summary
        analysis_text = "**Analysis Results**\n\n"
        analysis_text += f"Total feedbacks: {len(self.transformed_data)}\n"
        analysis_text += f"Unique topics: {len(topic_cols)}\n\n"

        analysis_text += "**Top 10 Most Frequent Topics:**\n"
        for topic, count in sorted_topics[:10]:
            analysis_text += f"- {topic}: {count} occurrences\n"

        # Calculate sentiment distributions for top topics
        analysis_text += "\n**Sentiment Distributions for Top 5 Topics:**\n"
        for topic, _ in sorted_topics[:5]:
            sentiment_col = f"{topic}_sentiment"
            if sentiment_col in self.transformed_data.columns:
                # Filter rows where the topic is present
                topic_rows = self.transformed_data[self.transformed_data[topic] == 1]

                positive = (topic_rows[sentiment_col] == 1.0).sum()
                negative = (topic_rows[sentiment_col] == 0.0).sum()
                neutral = (topic_rows[sentiment_col] == 0.5).sum()

                total = positive + negative + neutral

                if total > 0:
                    analysis_text += f"\n{topic} ({total} occurrences):\n"
                    analysis_text += f"  - Positive: {positive} ({positive/total*100:.1f}%)\n"
                    analysis_text += f"  - Negative: {negative} ({negative/total*100:.1f}%)\n"
                    analysis_text += f"  - Neutral: {neutral} ({neutral/total*100:.1f}%)\n"

        # Calculate number of topics per feedback
        self.transformed_data['topic_count'] = self.transformed_data[topic_cols].sum(axis=1)
        avg_topics = self.transformed_data['topic_count'].mean()
        max_topics = self.transformed_data['topic_count'].max()

        analysis_text += "\n**Topics per Feedback:**\n"
        analysis_text += f"- Average: {avg_topics:.2f}\n"
        analysis_text += f"- Maximum: {max_topics}\n"

        return analysis_text
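
    # Example of the percentage arithmetic above (hypothetical counts): with
    # positive=6, negative=3 and neutral=1, total=10, so the summary would read
    # "Positive: 6 (60.0%)", "Negative: 3 (30.0%)" and "Neutral: 1 (10.0%)".
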
    def save_transformed_data(self, output_format='xlsx'):
        """
        Save the transformed data and return the file path.
        """
        if self.transformed_data is None:
            raise ValueError("No transformed data to save")

        # Create a temporary file
        if output_format == 'xlsx':
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx')
            self.transformed_data.to_excel(temp_file.name, index=False)
        else:  # csv
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
            self.transformed_data.to_csv(temp_file.name, index=False)

        return temp_file.name
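

# Minimal programmatic usage sketch (assumes a local file "feedback.xlsx";
# kept as comments so importing this module stays side-effect free):
#
#   transformer = FeedbackTransformer()
#   transformer.load_data("feedback.xlsx")
#   transformer.identify_columns()
#   transformer.extract_unique_topics()
#   transformer.transform_data()
#   output_path = transformer.save_transformed_data(output_format="xlsx")

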
# Gradio interface functions
def process_file(file_obj, topic_prefix, sentiment_prefix, category_prefix,
                 text_column, recommendation_column, output_format, analyze_data):
    """
    Main processing function for the Gradio interface.
    """
    try:
        # Initialize transformer
        transformer = FeedbackTransformer(
            topic_prefix=topic_prefix,
            sentiment_prefix=sentiment_prefix,
            category_prefix=category_prefix,
            text_column=text_column,
            recommendation_column=recommendation_column
        )

        # Load data
        rows, cols = transformer.load_data(file_obj)
        status_msg = f"✅ Loaded {rows} rows and {cols} columns\n"

        # Identify columns
        col_info = transformer.identify_columns()
        status_msg += "\n📊 Found columns:\n"
        status_msg += f"- Topic columns: {len(col_info['topic_cols'])}\n"
        status_msg += f"- Sentiment columns: {len(col_info['sentiment_cols'])}\n"
        status_msg += f"- Category columns: {len(col_info['category_cols'])}\n"

        # Extract unique topics
        num_topics = transformer.extract_unique_topics()
        status_msg += f"\n🎯 Found {num_topics} unique topics\n"

        # Transform data
        shape = transformer.transform_data()
        status_msg += f"\n✨ Transformed data shape: {shape[0]} rows × {shape[1]} columns\n"

        # Analyze if requested
        analysis_result = ""
        if analyze_data:
            analysis_result = transformer.analyze_data()

        # Save transformed data
        output_file = transformer.save_transformed_data(output_format)

        return status_msg, analysis_result, output_file

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}\n\n{traceback.format_exc()}"
        return error_msg, "", None


def get_column_preview(file_obj):
    """
    Get a preview of columns in the uploaded file.
    """
    try:
        if file_obj is None:
            return "Please upload a file first."

        # Read the first few rows to get column names
        file_path = file_obj if isinstance(file_obj, str) else file_obj.name
        _, file_ext = os.path.splitext(file_path)

        if file_ext.lower() in ['.xlsx', '.xls']:
            df = pd.read_excel(file_path, nrows=5)
        elif file_ext.lower() == '.csv':
            try:
                df = pd.read_csv(file_path, nrows=5)
            except Exception:
                df = pd.read_csv(file_path, sep='\t', nrows=5)
        else:
            df = pd.read_csv(file_path, sep='\t', nrows=5)

        columns = list(df.columns)
        preview = "**Available columns:**\n"
        for i, col in enumerate(columns, 1):
            preview += f"{i}. {col}\n"

        return preview

    except Exception as e:
        return f"Error reading file: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="Feedback Topic & Sentiment Transformer") as demo:
    gr.Markdown("""
    # 📊 Feedback Topic & Sentiment Transformer

    Transform feedback data with topic and sentiment columns into a binary matrix format.
    Each unique topic becomes a separate column with 0/1 values and associated sentiment scores.

    ### 📋 Instructions:
    1. Upload your Excel, CSV, or tab-delimited text file
    2. Configure the column prefixes (or use the defaults)
    3. Click "Transform Data" to process
    4. Download the transformed file
    """)

    with gr.Row():
        with gr.Column(scale=1):
            # File upload
            input_file = gr.File(
                label="Upload Input File",
                file_types=[".xlsx", ".xls", ".csv", ".txt"],
                type="filepath"
            )

            # Column preview button
            preview_btn = gr.Button("Preview Columns", variant="secondary")
            column_preview = gr.Textbox(
                label="Column Preview",
                lines=10,
                interactive=False
            )

        with gr.Column(scale=1):
            # Configuration parameters
            gr.Markdown("### ⚙️ Configuration")

            topic_prefix = gr.Textbox(
                label="Topic Column Prefix",
                value="[**WORKSHOP] SwissLife Taxonomy",
                info="Prefix to identify topic columns"
            )

            sentiment_prefix = gr.Textbox(
                label="Sentiment Column Prefix",
                value="ABSA:",
                info="Prefix to identify sentiment columns"
            )

            category_prefix = gr.Textbox(
                label="Category Column Prefix",
                value="Categories:",
                info="Prefix to identify category columns"
            )

            text_column = gr.Textbox(
                label="Text Column Name",
                value="TEXT",
                info="Column containing original feedback text"
            )

            recommendation_column = gr.Textbox(
                label="Recommendation Column Name",
                value="Q4_Weiterempfehlung",
                info="Column containing recommendation scores"
            )

            output_format = gr.Radio(
                label="Output Format",
                choices=["xlsx", "csv"],
                value="xlsx"
            )

            analyze_checkbox = gr.Checkbox(
                label="Analyze transformed data",
                value=True
            )

    # Transform button
    transform_btn = gr.Button("🔄 Transform Data", variant="primary", size="lg")

    # Output sections
    with gr.Row():
        with gr.Column():
            status_output = gr.Textbox(
                label="Processing Status",
                lines=10,
                interactive=False
            )

        with gr.Column():
            analysis_output = gr.Markdown(
                label="Data Analysis"
            )

    # Download section
    output_file = gr.File(
        label="📥 Download Transformed File",
        interactive=False
    )

    # Event handlers
    preview_btn.click(
        fn=get_column_preview,
        inputs=[input_file],
        outputs=[column_preview]
    )

    transform_btn.click(
        fn=process_file,
        inputs=[
            input_file,
            topic_prefix,
            sentiment_prefix,
            category_prefix,
            text_column,
            recommendation_column,
            output_format,
            analyze_checkbox
        ],
        outputs=[status_output, analysis_output, output_file]
    )

    # Examples section
    gr.Markdown("""
    ### 📝 Example Column Formats:
    - **Topic columns**: `[**WORKSHOP] SwissLife Taxonomy(Kommentar) 1`, `[**WORKSHOP] SwissLife Taxonomy(Kommentar) 2`
    - **Category columns**: `Categories:Topic1`, `Categories:Topic2`
    - **Sentiment columns**: `ABSA:Sentiment1`, `ABSA:Sentiment2`

    ### 🎯 Output Format:
    - Each unique topic becomes a column with values 0 (absent) or 1 (present)
    - Each topic has an associated `_sentiment` column with values:
      - 1.0 = Positive
      - 0.5 = Neutral
      - 0.0 = Negative
    - Original text and recommendation scores are preserved if available
    """)


# Launch the app
if __name__ == "__main__":
    demo.launch()
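    # launch() serves the app locally by default; standard Gradio options such as
    # demo.launch(share=True) or demo.launch(server_name="0.0.0.0") could be
    # swapped in if remote access were needed (not enabled here).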