Dhruv Pawar committed on
Commit
1273036
·
0 Parent(s):

Initial commit: Neural Data Analyst v1.0

Files changed (10)
  1. .env.template +10 -0
  2. .gitignore +134 -0
  3. advanced_features.py +730 -0
  4. database_manager.py +285 -0
  5. eda_analyzer.py +593 -0
  6. main.py +0 -0
  7. requirements.txt +12 -0
  8. setup.bat +0 -0
  9. setup.sh +0 -0
  10. test.py +175 -0
.env.template ADDED
@@ -0,0 +1,10 @@
# Neural Data Analyst Environment Variables
# Copy this file to .env and add your actual API key

# Groq API Configuration
GROQ_API_KEY=your_groq_api_key_here

# Optional: Default model to use
DEFAULT_MODEL=llama-3.3-70b-versatile

# Get your Groq API key from: https://console.groq.com/keys
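For reference, a minimal sketch of how these variables might be consumed at startup. The python-dotenv import is an assumption for illustration (it is not listed in requirements.txt); plain os.environ works the same once the variables are exported in the shell. The variable names match .env.template, everything else here is hypothetical.

```python
# Illustrative only: read the keys defined in .env.template.
import os

from dotenv import load_dotenv  # assumed helper, not part of this commit

load_dotenv()  # picks up .env from the working directory, if present

groq_api_key = os.getenv("GROQ_API_KEY")
default_model = os.getenv("DEFAULT_MODEL", "llama-3.3-70b-versatile")

if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set - copy .env.template to .env first")
```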
.gitignore ADDED
@@ -0,0 +1,134 @@
# Neural Data Analyst - .gitignore

# Environment Variables (IMPORTANT: Never commit API keys!)
.env
.env.local
.env.development
.env.test
.env.production

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environments
venv/
env/
ENV/
env.bak/
venv.bak/
.venv/

# Streamlit
.streamlit/
streamlit_cache/

# Database files
*.db
*.sqlite
*.sqlite3
neural_analyst_db/
analysis_history.json

# Logs
*.log
logs/
neural_logs/

# Cache
.cache/
cache/
temp/
tmp/

# Data files (add your data files here)
data/
datasets/
uploads/
*.csv
*.json
*.xlsx
*.xls
sample_data.csv

# IDE and Editor files
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
Thumbs.db

# Jupyter Notebooks
.ipynb_checkpoints/
*.ipynb

# pytest
.pytest_cache/
.coverage
htmlcov/

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Documentation builds
docs/_build/

# PyInstaller
*.manifest
*.spec

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/

# Backup files
*.backup
*.bak
*.old

# Temporary files
*.tmp
*.temp

# API Keys and Secrets (double protection)
secrets.toml
.secrets.toml
api_keys.txt
config.json

# Local configuration
local_config.py
config_local.py
advanced_features.py ADDED
@@ -0,0 +1,730 @@
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
from datetime import datetime, timedelta
from typing import Dict, List, Any
import base64
from io import BytesIO

# Additional advanced features for Neural Data Analyst

class AdvancedFeatures:
    """Advanced features and utilities for the Neural Data Analyst"""

    def __init__(self, db_manager):
        self.db_manager = db_manager

    def render_advanced_analytics_dashboard(self, df: pd.DataFrame):
        """Render advanced analytics dashboard"""
        st.markdown("## 🔬 Advanced Analytics Dashboard")

        tabs = st.tabs([
            "📊 Interactive Plots",
            "🎯 Smart Recommendations",
            "📈 Trend Analysis",
            "🔍 Anomaly Detection",
            "📋 Report Generator"
        ])

        with tabs[0]:
            self.render_interactive_plots(df)

        with tabs[1]:
            self.render_smart_recommendations(df)

        with tabs[2]:
            self.render_trend_analysis(df)

        with tabs[3]:
            self.render_anomaly_detection(df)

        with tabs[4]:
            self.render_report_generator(df)

    def render_interactive_plots(self, df: pd.DataFrame):
        """Render interactive plotting interface"""
        st.markdown("### 📊 Interactive Plot Builder")

        col1, col2, col3 = st.columns(3)

        with col1:
            plot_type = st.selectbox(
                "Plot Type",
                ["Scatter", "Line", "Bar", "Histogram", "Box", "Violin", "Heatmap", "3D Scatter"]
            )

        with col2:
            x_column = st.selectbox("X-axis", df.columns)

        with col3:
            y_column = st.selectbox("Y-axis", df.columns)

        # Color and size options
        col1, col2 = st.columns(2)
        with col1:
            color_column = st.selectbox("Color by", ["None"] + list(df.columns))
        with col2:
            size_column = st.selectbox("Size by", ["None"] + list(df.select_dtypes(include=[np.number]).columns))

        # Generate plot based on selections
        if st.button("🎨 Generate Plot"):
            fig = self.create_dynamic_plot(df, plot_type, x_column, y_column, color_column, size_column)
            if fig:
                st.plotly_chart(fig, use_container_width=True)

        # Plot gallery
        with st.expander("🖼️ Quick Plot Gallery"):
            self.render_plot_gallery(df)

    def create_dynamic_plot(self, df: pd.DataFrame, plot_type: str, x_col: str, y_col: str,
                            color_col: str = None, size_col: str = None):
        """Create dynamic plot based on user selections"""
        try:
            kwargs = {
                'data_frame': df,
                'x': x_col,
                'title': f'{plot_type} Plot: {x_col} vs {y_col}'
            }

            if y_col and y_col != x_col:
                kwargs['y'] = y_col

            if color_col and color_col != "None":
                kwargs['color'] = color_col

            if size_col and size_col != "None" and plot_type in ["Scatter", "3D Scatter"]:
                kwargs['size'] = size_col

            if plot_type == "Scatter":
                fig = px.scatter(**kwargs)
            elif plot_type == "Line":
                fig = px.line(**kwargs)
            elif plot_type == "Bar":
                fig = px.bar(**kwargs)
            elif plot_type == "Histogram":
                fig = px.histogram(df, x=x_col, title=f'Histogram: {x_col}')
            elif plot_type == "Box":
                fig = px.box(**kwargs)
            elif plot_type == "Violin":
                fig = px.violin(**kwargs)
            elif plot_type == "3D Scatter":
                z_col = st.selectbox("Z-axis", df.select_dtypes(include=[np.number]).columns)
                kwargs['z'] = z_col
                fig = px.scatter_3d(**kwargs)
            elif plot_type == "Heatmap":
                numeric_df = df.select_dtypes(include=[np.number])
                corr_matrix = numeric_df.corr()
                fig = px.imshow(corr_matrix, text_auto=True, title="Correlation Heatmap")
            else:
                return None

            fig.update_layout(
                plot_bgcolor='rgba(0,0,0,0)',
                paper_bgcolor='rgba(0,0,0,0)',
                font=dict(color='white')
            )

            return fig

        except Exception as e:
            st.error(f"Error creating plot: {str(e)}")
            return None

    def render_plot_gallery(self, df: pd.DataFrame):
        """Render quick plot gallery"""
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) >= 2:
            col1, col2 = st.columns(2)

            with col1:
                # Quick correlation plot
                fig = px.scatter(df, x=numeric_cols[0], y=numeric_cols[1],
                                 title="Quick Correlation View")
                fig.update_layout(height=300)
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                # Quick distribution plot
                fig = px.histogram(df, x=numeric_cols[0], title="Quick Distribution")
                fig.update_layout(height=300)
                st.plotly_chart(fig, use_container_width=True)

    def render_smart_recommendations(self, df: pd.DataFrame):
        """Render smart analysis recommendations"""
        st.markdown("### 🎯 Smart Analysis Recommendations")

        recommendations = self.generate_analysis_recommendations(df)

        for i, rec in enumerate(recommendations):
            with st.expander(f"💡 {rec['title']}", expanded=i == 0):
                st.markdown(f"**Recommendation:** {rec['description']}")
                st.markdown(f"**Rationale:** {rec['rationale']}")

                if st.button(f"Apply Recommendation", key=f"apply_rec_{i}"):
                    self.apply_recommendation(df, rec)

    def generate_analysis_recommendations(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Generate smart analysis recommendations"""
        recommendations = []

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        # Missing data recommendation
        missing_data = df.isnull().sum()
        high_missing = missing_data[missing_data > len(df) * 0.1]

        if len(high_missing) > 0:
            recommendations.append({
                'title': 'Missing Data Analysis',
                'description': f'Analyze missing data patterns in {len(high_missing)} columns with >10% missing values',
                'rationale': 'Understanding missing data patterns can reveal data collection issues or systematic biases',
                'action': 'missing_analysis'
            })

        # Correlation analysis recommendation
        if len(numeric_cols) > 2:
            recommendations.append({
                'title': 'Correlation Deep Dive',
                'description': 'Perform comprehensive correlation analysis with feature selection recommendations',
                'rationale': 'Identifying highly correlated features can improve model performance and interpretability',
                'action': 'correlation_analysis'
            })

        # Outlier detection recommendation
        if len(numeric_cols) > 0:
            recommendations.append({
                'title': 'Outlier Detection & Treatment',
                'description': 'Identify and analyze outliers using multiple statistical methods',
                'rationale': 'Outliers can significantly impact analysis results and model performance',
                'action': 'outlier_analysis'
            })

        # Segmentation recommendation
        if len(categorical_cols) > 0 and len(numeric_cols) > 0:
            recommendations.append({
                'title': 'Customer/Data Segmentation',
                'description': 'Perform clustering analysis to identify natural data segments',
                'rationale': 'Segmentation can reveal hidden patterns and improve targeted strategies',
                'action': 'segmentation_analysis'
            })

        # Time series recommendation
        date_cols = df.select_dtypes(include=['datetime64']).columns
        if len(date_cols) > 0:
            recommendations.append({
                'title': 'Time Series Analysis',
                'description': 'Analyze temporal patterns and trends in your data',
                'rationale': 'Time-based analysis can reveal seasonality, trends, and forecasting opportunities',
                'action': 'time_series_analysis'
            })

        return recommendations

    def apply_recommendation(self, df: pd.DataFrame, recommendation: Dict[str, str]):
        """Apply a smart recommendation"""
        action = recommendation.get('action')

        if action == 'missing_analysis':
            self.perform_missing_analysis(df)
        elif action == 'correlation_analysis':
            self.perform_correlation_analysis(df)
        elif action == 'outlier_analysis':
            self.perform_outlier_analysis(df)
        elif action == 'segmentation_analysis':
            self.perform_segmentation_analysis(df)
        elif action == 'time_series_analysis':
            self.perform_time_series_analysis(df)

    def perform_missing_analysis(self, df: pd.DataFrame):
        """Perform detailed missing data analysis"""
        st.markdown("#### 🔍 Missing Data Analysis Results")

        missing_data = df.isnull().sum()
        missing_percent = (missing_data / len(df)) * 100

        missing_df = pd.DataFrame({
            'Column': missing_data.index,
            'Missing_Count': missing_data.values,
            'Missing_Percentage': missing_percent.values
        })

        missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)

        if len(missing_df) > 0:
            fig = px.bar(missing_df, x='Column', y='Missing_Percentage',
                         title='Missing Data by Column (%)')
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)

            st.dataframe(missing_df, use_container_width=True)
        else:
            st.success("✅ No missing data found in the dataset!")

    def perform_correlation_analysis(self, df: pd.DataFrame):
        """Perform detailed correlation analysis"""
        st.markdown("#### 🔗 Advanced Correlation Analysis")

        numeric_df = df.select_dtypes(include=[np.number])

        if len(numeric_df.columns) > 1:
            corr_matrix = numeric_df.corr()

            # Hierarchical clustering of correlations
            from scipy.cluster.hierarchy import linkage, dendrogram
            from scipy.spatial.distance import squareform

            distance_matrix = 1 - np.abs(corr_matrix)
            condensed_distances = squareform(distance_matrix, checks=False)
            linkage_matrix = linkage(condensed_distances, method='average')

            fig = go.Figure()
            dendro = dendrogram(linkage_matrix, labels=corr_matrix.columns, no_plot=True)

            # Create dendrogram plot
            for i in range(len(dendro['icoord'])):
                x = dendro['icoord'][i]
                y = dendro['dcoord'][i]
                fig.add_trace(go.Scatter(x=x, y=y, mode='lines',
                                         line=dict(color='gold', width=2),
                                         showlegend=False))

            fig.update_layout(
                title="Feature Clustering Dendrogram",
                xaxis_title="Features",
                yaxis_title="Distance",
                height=400
            )

            st.plotly_chart(fig, use_container_width=True)

    def render_trend_analysis(self, df: pd.DataFrame):
        """Render trend analysis interface"""
        st.markdown("### 📈 Trend Analysis")

        date_cols = df.select_dtypes(include=['datetime64']).columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(date_cols) == 0:
            st.warning("No datetime columns found. Try converting date columns to datetime format.")

            # Offer to convert columns
            potential_date_cols = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
            if potential_date_cols:
                date_col = st.selectbox("Select date column to convert:", potential_date_cols)
                if st.button("Convert to DateTime"):
                    try:
                        df[date_col] = pd.to_datetime(df[date_col])
                        st.success(f"Converted {date_col} to datetime!")
                        st.experimental_rerun()
                    except Exception as e:
                        st.error(f"Conversion failed: {str(e)}")
            return

        col1, col2 = st.columns(2)
        with col1:
            date_col = st.selectbox("Date Column", date_cols)
        with col2:
            value_col = st.selectbox("Value Column", numeric_cols)

        if st.button("🔍 Analyze Trends"):
            self.perform_trend_analysis(df, date_col, value_col)

    def perform_trend_analysis(self, df: pd.DataFrame, date_col: str, value_col: str):
        """Perform trend analysis"""
        st.markdown("#### 📊 Trend Analysis Results")

        # Time series plot
        fig = px.line(df.sort_values(date_col), x=date_col, y=value_col,
                      title=f'{value_col} Over Time')
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

        # Rolling statistics
        df_sorted = df.sort_values(date_col).copy()
        df_sorted['7_day_avg'] = df_sorted[value_col].rolling(window=7, min_periods=1).mean()
        df_sorted['30_day_avg'] = df_sorted[value_col].rolling(window=30, min_periods=1).mean()

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted[value_col],
                                 name='Original', mode='lines'))
        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted['7_day_avg'],
                                 name='7-Day Average', mode='lines'))
        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted['30_day_avg'],
                                 name='30-Day Average', mode='lines'))

        fig.update_layout(title="Trend with Moving Averages", height=400)
        st.plotly_chart(fig, use_container_width=True)

    def render_anomaly_detection(self, df: pd.DataFrame):
        """Render anomaly detection interface"""
        st.markdown("### 🔍 Anomaly Detection")

        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            st.warning("No numeric columns found for anomaly detection.")
            return

        col1, col2 = st.columns(2)
        with col1:
            target_col = st.selectbox("Target Column", numeric_cols)
        with col2:
            method = st.selectbox("Detection Method",
                                  ["IQR", "Z-Score", "Isolation Forest", "Local Outlier Factor"])

        if st.button("🎯 Detect Anomalies"):
            self.perform_anomaly_detection(df, target_col, method)

    def perform_anomaly_detection(self, df: pd.DataFrame, target_col: str, method: str):
        """Perform anomaly detection"""
        st.markdown("#### 🎯 Anomaly Detection Results")

        data = df[target_col].dropna()
        anomalies = []

        if method == "IQR":
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            anomalies = df[(df[target_col] < lower_bound) | (df[target_col] > upper_bound)]

        elif method == "Z-Score":
            z_scores = np.abs((data - data.mean()) / data.std())
            anomalies = df[z_scores > 3]

        elif method == "Isolation Forest":
            from sklearn.ensemble import IsolationForest
            iso_forest = IsolationForest(contamination=0.1, random_state=42)
            outlier_labels = iso_forest.fit_predict(data.values.reshape(-1, 1))
            anomalies = df[outlier_labels == -1]

        elif method == "Local Outlier Factor":
            from sklearn.neighbors import LocalOutlierFactor
            lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
            outlier_labels = lof.fit_predict(data.values.reshape(-1, 1))
            anomalies = df[outlier_labels == -1]

        # Visualization
        fig = go.Figure()

        # Normal data points
        normal_data = df[~df.index.isin(anomalies.index)]
        fig.add_trace(go.Scatter(
            x=normal_data.index,
            y=normal_data[target_col],
            mode='markers',
            name='Normal',
            marker=dict(color='blue', size=6)
        ))

        # Anomalies
        fig.add_trace(go.Scatter(
            x=anomalies.index,
            y=anomalies[target_col],
            mode='markers',
            name='Anomalies',
            marker=dict(color='red', size=10, symbol='x')
        ))

        fig.update_layout(
            title=f'Anomaly Detection: {target_col} ({method})',
            xaxis_title='Index',
            yaxis_title=target_col,
            height=500
        )

        st.plotly_chart(fig, use_container_width=True)

        # Summary
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Data Points", len(df))
        with col2:
            st.metric("Anomalies Found", len(anomalies))
        with col3:
            st.metric("Anomaly Rate", f"{len(anomalies)/len(df)*100:.2f}%")

        if len(anomalies) > 0:
            with st.expander("🔍 Anomaly Details"):
                st.dataframe(anomalies[[target_col]], use_container_width=True)

    def render_report_generator(self, df: pd.DataFrame):
        """Render automated report generator"""
        st.markdown("### 📋 Automated Report Generator")

        report_type = st.selectbox(
            "Report Type",
            ["Executive Summary", "Technical Analysis", "Data Quality Report", "Custom Report"]
        )

        col1, col2 = st.columns(2)
        with col1:
            include_charts = st.checkbox("Include Charts", value=True)
        with col2:
            include_recommendations = st.checkbox("Include Recommendations", value=True)

        if st.button("📄 Generate Report"):
            report_content = self.generate_report(df, report_type, include_charts, include_recommendations)

            # Display report
            st.markdown("#### 📊 Generated Report")
            st.markdown(report_content)

            # Download option
            self.create_download_link(report_content, f"neural_analyst_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md")

    def generate_report(self, df: pd.DataFrame, report_type: str, include_charts: bool, include_recommendations: bool) -> str:
        """Generate automated report"""
        report = f"""
# Neural Data Analyst Report
**Generated on:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Report Type:** {report_type}

## Dataset Overview
- **Total Rows:** {len(df):,}
- **Total Columns:** {len(df.columns)}
- **Memory Usage:** {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB
- **Missing Values:** {df.isnull().sum().sum():,} ({df.isnull().sum().sum() / df.size * 100:.1f}%)

## Column Information
"""

        # Column details
        for col in df.columns:
            dtype = str(df[col].dtype)
            null_count = df[col].isnull().sum()
            unique_count = df[col].nunique()

            report += f"- **{col}** ({dtype}): {null_count} missing, {unique_count} unique values\n"

        # Numeric summary
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            report += "\n## Numeric Summary\n"
            summary_stats = df[numeric_cols].describe()
            report += summary_stats.to_markdown()

        # Key insights
        if include_recommendations:
            report += "\n## Key Insights & Recommendations\n"
            recommendations = self.generate_analysis_recommendations(df)
            for i, rec in enumerate(recommendations[:5], 1):
                report += f"{i}. **{rec['title']}:** {rec['description']}\n"

        return report

    def create_download_link(self, content: str, filename: str):
        """Create download link for report"""
        b64 = base64.b64encode(content.encode()).decode()
        href = f'<a href="data:text/markdown;base64,{b64}" download="{filename}">📥 Download Report</a>'
        st.markdown(href, unsafe_allow_html=True)

    def render_data_comparison_tool(self):
        """Render data comparison tool for multiple datasets"""
        st.markdown("## ⚖️ Data Comparison Tool")

        st.markdown("Upload multiple datasets to compare their characteristics:")

        uploaded_files = st.file_uploader(
            "Choose CSV files for comparison",
            type=['csv'],
            accept_multiple_files=True
        )

        if len(uploaded_files) >= 2:
            datasets = {}

            for file in uploaded_files:
                try:
                    df = pd.read_csv(file)
                    datasets[file.name] = df
                except Exception as e:
                    st.error(f"Error loading {file.name}: {str(e)}")

            if len(datasets) >= 2:
                self.perform_dataset_comparison(datasets)

    def perform_dataset_comparison(self, datasets: Dict[str, pd.DataFrame]):
        """Perform comparison between multiple datasets"""
        st.markdown("### 📊 Dataset Comparison Results")

        # Basic comparison table
        comparison_data = []

        for name, df in datasets.items():
            comparison_data.append({
                'Dataset': name,
                'Rows': len(df),
                'Columns': len(df.columns),
                'Numeric Columns': len(df.select_dtypes(include=[np.number]).columns),
                'Text Columns': len(df.select_dtypes(include=['object']).columns),
                'Missing Values': df.isnull().sum().sum(),
                'Memory (MB)': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f}"
            })

        comparison_df = pd.DataFrame(comparison_data)
        st.dataframe(comparison_df, use_container_width=True)

        # Visual comparison
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=['Rows Comparison', 'Columns Comparison',
                            'Missing Values', 'Memory Usage'],
            specs=[[{"type": "bar"}, {"type": "bar"}],
                   [{"type": "bar"}, {"type": "bar"}]]
        )

        names = list(datasets.keys())

        # Rows comparison
        fig.add_trace(
            go.Bar(x=names, y=[len(datasets[name]) for name in names], name="Rows"),
            row=1, col=1
        )

        # Columns comparison
        fig.add_trace(
            go.Bar(x=names, y=[len(datasets[name].columns) for name in names], name="Columns"),
            row=1, col=2
        )

        # Missing values comparison
        fig.add_trace(
            go.Bar(x=names, y=[datasets[name].isnull().sum().sum() for name in names], name="Missing"),
            row=2, col=1
        )

        # Memory usage comparison
        fig.add_trace(
            go.Bar(x=names, y=[datasets[name].memory_usage(deep=True).sum() / 1024**2 for name in names], name="Memory"),
            row=2, col=2
        )

        fig.update_layout(height=600, showlegend=False, title_text="Dataset Comparison Dashboard")
        st.plotly_chart(fig, use_container_width=True)

    def render_data_profiling_tool(self, df: pd.DataFrame):
        """Render comprehensive data profiling tool"""
        st.markdown("## 🔬 Data Profiling Tool")

        if st.button("🚀 Generate Complete Data Profile"):
            with st.spinner("Generating comprehensive data profile..."):
                profile = self.generate_data_profile(df)
                self.display_data_profile(profile)

    def generate_data_profile(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate comprehensive data profile"""
        profile = {
            'overview': {},
            'column_profiles': {},
            'data_quality': {},
            'relationships': {},
            'recommendations': []
        }

        # Overview
        profile['overview'] = {
            'shape': df.shape,
            'memory_usage': df.memory_usage(deep=True).sum(),
            'dtypes': df.dtypes.value_counts().to_dict(),
            'missing_cells': df.isnull().sum().sum(),
            'duplicate_rows': df.duplicated().sum()
        }

        # Column profiles
        for col in df.columns:
            col_profile = {
                'dtype': str(df[col].dtype),
                'null_count': df[col].isnull().sum(),
                'null_percentage': df[col].isnull().sum() / len(df) * 100,
                'unique_count': df[col].nunique(),
                'unique_percentage': df[col].nunique() / len(df) * 100
            }

            if df[col].dtype in ['int64', 'float64']:
                col_profile.update({
                    'min': df[col].min(),
                    'max': df[col].max(),
                    'mean': df[col].mean(),
                    'std': df[col].std(),
                    'skewness': df[col].skew(),
                    'kurtosis': df[col].kurtosis()
                })
            else:
                col_profile.update({
                    'most_frequent': df[col].mode().iloc[0] if len(df[col].mode()) > 0 else None,
                    'most_frequent_count': df[col].value_counts().iloc[0] if len(df[col].value_counts()) > 0 else 0
                })

            profile['column_profiles'][col] = col_profile

        return profile

    def display_data_profile(self, profile: Dict[str, Any]):
        """Display data profile results"""
        st.markdown("### 📊 Complete Data Profile")

        # Overview metrics
        overview = profile['overview']

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Rows", f"{overview['shape'][0]:,}")
        with col2:
            st.metric("Columns", overview['shape'][1])
        with col3:
            st.metric("Missing Cells", f"{overview['missing_cells']:,}")
        with col4:
            st.metric("Duplicates", f"{overview['duplicate_rows']:,}")

        # Column details table
        st.markdown("#### 📋 Column Details")

        col_data = []
        for col, details in profile['column_profiles'].items():
            col_data.append({
                'Column': col,
                'Type': details['dtype'],
                'Missing %': f"{details['null_percentage']:.1f}%",
                'Unique %': f"{details['unique_percentage']:.1f}%",
                'Details': f"Min: {details.get('min', 'N/A')}, Max: {details.get('max', 'N/A')}" if 'min' in details else f"Most Frequent: {details.get('most_frequent', 'N/A')}"
            })

        col_df = pd.DataFrame(col_data)
        st.dataframe(col_df, use_container_width=True)

# Usage in main app
def integrate_advanced_features():
    """Integration function for advanced features"""
    return """
    # Add this to your main.py file:

    from advanced_features import AdvancedFeatures

    # In your NeuralDataAnalyst class:
    def __init__(self):
        # ... existing code ...
        self.advanced_features = AdvancedFeatures(self.db_manager)

    # Add this after your existing data upload section:
    if st.session_state.uploaded_data is not None:
        if st.button("🔬 Advanced Analytics", key="advanced_analytics"):
            self.advanced_features.render_advanced_analytics_dashboard(st.session_state.uploaded_data)

        if st.button("🔍 Data Profiling", key="data_profiling"):
            self.advanced_features.render_data_profiling_tool(st.session_state.uploaded_data)

    # Add dataset comparison in sidebar:
    with st.sidebar:
        st.markdown("---")
        if st.button("⚖️ Compare Datasets"):
            self.advanced_features.render_data_comparison_tool()
    """
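The integration notes above assume the NeuralDataAnalyst class in main.py (not rendered on this page). As a rough standalone sketch, the same features can also be driven from a bare Streamlit script; the file name app.py and the upload flow are illustrative, while the class names and method signatures come from the files in this commit.

```python
# app.py - hypothetical standalone page wiring up the committed modules
import pandas as pd
import streamlit as st

from advanced_features import AdvancedFeatures
from database_manager import DatabaseManager

db = DatabaseManager()            # defaults to analysis_history.json
features = AdvancedFeatures(db)

uploaded = st.file_uploader("Upload a CSV", type=["csv"])
if uploaded is not None:
    df = pd.read_csv(uploaded)
    # Tabbed dashboard defined in advanced_features.py
    features.render_advanced_analytics_dashboard(df)
    # Optional extras from the same class
    features.render_data_profiling_tool(df)
```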
database_manager.py ADDED
@@ -0,0 +1,285 @@
import json
import os
from datetime import datetime
from typing import Dict, List, Any

class DatabaseManager:
    """Simple file-based database manager for storing analysis history"""

    def __init__(self, db_file: str = "analysis_history.json"):
        """Initialize the database manager

        Args:
            db_file: Path to the JSON file to store analysis history
        """
        self.db_file = db_file
        self.ensure_db_file_exists()

    def ensure_db_file_exists(self):
        """Ensure the database file exists"""
        if not os.path.exists(self.db_file):
            with open(self.db_file, 'w') as f:
                json.dump([], f)

    def save_analysis(self, analysis_record: Dict[str, Any]) -> bool:
        """Save an analysis record to the database

        Args:
            analysis_record: Dictionary containing analysis data

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Read existing data
            existing_data = self.load_all_data()

            # Add timestamp if not present
            if 'timestamp' not in analysis_record:
                analysis_record['timestamp'] = datetime.now().isoformat()

            # Append new record
            existing_data.append(analysis_record)

            # Write back to file
            with open(self.db_file, 'w') as f:
                json.dump(existing_data, f, indent=2, default=str)

            return True

        except Exception as e:
            print(f"Error saving analysis: {e}")
            return False

    def get_history(self, session_id: str = None, limit: int = 100) -> List[Dict[str, Any]]:
        """Get analysis history

        Args:
            session_id: Optional session ID to filter by
            limit: Maximum number of records to return

        Returns:
            List of analysis records
        """
        try:
            data = self.load_all_data()

            # Filter by session_id if provided
            if session_id:
                data = [record for record in data if record.get('session_id') == session_id]

            # Sort by timestamp (newest first)
            data.sort(key=lambda x: x.get('timestamp', ''), reverse=True)

            # Apply limit
            return data[:limit]

        except Exception as e:
            print(f"Error getting history: {e}")
            return []

    def clear_history(self, session_id: str = None) -> bool:
        """Clear analysis history

        Args:
            session_id: Optional session ID to clear specific session data

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if session_id:
                # Clear only specific session data
                data = self.load_all_data()
                filtered_data = [record for record in data if record.get('session_id') != session_id]

                with open(self.db_file, 'w') as f:
                    json.dump(filtered_data, f, indent=2, default=str)
            else:
                # Clear all data
                with open(self.db_file, 'w') as f:
                    json.dump([], f)

            return True

        except Exception as e:
            print(f"Error clearing history: {e}")
            return False

    def load_all_data(self) -> List[Dict[str, Any]]:
        """Load all data from the database file

        Returns:
            List of all records
        """
        try:
            with open(self.db_file, 'r') as f:
                data = json.load(f)
            return data if isinstance(data, list) else []
        except (FileNotFoundError, json.JSONDecodeError):
            return []

    def get_analysis_by_type(self, analysis_type: str, session_id: str = None) -> List[Dict[str, Any]]:
        """Get analyses by type

        Args:
            analysis_type: Type of analysis (e.g., 'EDA', 'Single Query Analysis')
            session_id: Optional session ID to filter by

        Returns:
            List of matching analysis records
        """
        try:
            data = self.load_all_data()

            # Filter by type
            filtered_data = [record for record in data if record.get('type') == analysis_type]

            # Filter by session_id if provided
            if session_id:
                filtered_data = [record for record in filtered_data if record.get('session_id') == session_id]

            # Sort by timestamp (newest first)
            filtered_data.sort(key=lambda x: x.get('timestamp', ''), reverse=True)

            return filtered_data

        except Exception as e:
            print(f"Error getting analysis by type: {e}")
            return []

    def get_stats(self) -> Dict[str, Any]:
        """Get database statistics

        Returns:
            Dictionary with database statistics
        """
        try:
            data = self.load_all_data()

            stats = {
                'total_records': len(data),
                'unique_sessions': len(set(record.get('session_id', '') for record in data)),
                'analysis_types': {},
                'oldest_record': None,
                'newest_record': None
            }

            # Count analysis types
            for record in data:
                analysis_type = record.get('type', 'Unknown')
                stats['analysis_types'][analysis_type] = stats['analysis_types'].get(analysis_type, 0) + 1

            # Find oldest and newest records
            if data:
                timestamps = [record.get('timestamp', '') for record in data if record.get('timestamp')]
                if timestamps:
                    timestamps.sort()
                    stats['oldest_record'] = timestamps[0]
                    stats['newest_record'] = timestamps[-1]

            return stats

        except Exception as e:
            print(f"Error getting stats: {e}")
            return {
                'total_records': 0,
                'unique_sessions': 0,
                'analysis_types': {},
                'oldest_record': None,
                'newest_record': None,
                'error': str(e)
            }

    def backup_database(self, backup_file: str = None) -> bool:
        """Create a backup of the database

        Args:
            backup_file: Path for backup file. If None, uses timestamp-based name

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if backup_file is None:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                backup_file = f"analysis_history_backup_{timestamp}.json"

            data = self.load_all_data()

            with open(backup_file, 'w') as f:
                json.dump(data, f, indent=2, default=str)

            return True

        except Exception as e:
            print(f"Error creating backup: {e}")
            return False

    def restore_from_backup(self, backup_file: str) -> bool:
        """Restore database from backup

        Args:
            backup_file: Path to backup file

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if not os.path.exists(backup_file):
                print(f"Backup file not found: {backup_file}")
                return False

            with open(backup_file, 'r') as f:
                data = json.load(f)

            # Validate data format
            if not isinstance(data, list):
                print("Invalid backup file format")
                return False

            # Write to main database file
            with open(self.db_file, 'w') as f:
                json.dump(data, f, indent=2, default=str)

            return True

        except Exception as e:
            print(f"Error restoring from backup: {e}")
            return False

    def delete_old_records(self, days_old: int = 30) -> int:
        """Delete records older than specified days

        Args:
            days_old: Number of days to keep records

        Returns:
            int: Number of records deleted
        """
        try:
            from datetime import datetime, timedelta

            cutoff_date = datetime.now() - timedelta(days=days_old)
            cutoff_str = cutoff_date.isoformat()

            data = self.load_all_data()
            original_count = len(data)

            # Filter out old records
            filtered_data = []
            for record in data:
                record_time = record.get('timestamp', '')
                if record_time >= cutoff_str:
                    filtered_data.append(record)

            # Write filtered data back
            with open(self.db_file, 'w') as f:
                json.dump(filtered_data, f, indent=2, default=str)

            deleted_count = original_count - len(filtered_data)
            return deleted_count

        except Exception as e:
            print(f"Error deleting old records: {e}")
            return 0
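A short usage sketch of the manager outside Streamlit. The record fields below are only examples, since save_analysis accepts any JSON-serialisable dictionary; note that get_history and get_analysis_by_type filter on the 'session_id' and 'type' keys.

```python
from database_manager import DatabaseManager

db = DatabaseManager("analysis_history.json")

db.save_analysis({
    "session_id": "demo-session",   # illustrative values
    "type": "EDA",
    "summary": "First pass over the uploaded dataset",
})

recent = db.get_history(session_id="demo-session", limit=10)
print(len(recent), "records for this session")
print(db.get_stats())      # totals, per-type counts, oldest/newest timestamps

db.backup_database()       # writes analysis_history_backup_<timestamp>.json
```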
eda_analyzer.py ADDED
@@ -0,0 +1,593 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from plotly.subplots import make_subplots
6
+ import plotly.figure_factory as ff
7
+ from typing import Dict, List, Any, Tuple
8
+ import warnings
9
+ warnings.filterwarnings('ignore')
10
+
11
+ # Import scipy with error handling
12
+ try:
13
+ from scipy import stats
14
+ from scipy.stats import chi2_contingency
15
+ SCIPY_AVAILABLE = True
16
+ except ImportError:
17
+ SCIPY_AVAILABLE = False
18
+
19
+ class EDAAnalyzer:
20
+ """Comprehensive Exploratory Data Analysis with advanced visualizations"""
21
+
22
+ def __init__(self):
23
+ self.color_palette = [
24
+ '#FFD700', '#FF6B6B', '#4ECDC4', '#45B7D1',
25
+ '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8'
26
+ ]
27
+
28
+ def perform_complete_eda(self, df: pd.DataFrame) -> Dict[str, Any]:
29
+ """Perform comprehensive EDA analysis"""
30
+ try:
31
+ results = {
32
+ 'overview': self.generate_overview(df),
33
+ 'distributions': self.analyze_distributions(df),
34
+ 'correlations': self.analyze_correlations(df),
35
+ 'insights': self.generate_insights(df),
36
+ 'data_quality': self.assess_data_quality(df),
37
+ 'advanced_analysis': self.perform_advanced_analysis(df)
38
+ }
39
+
40
+ return results
41
+ except Exception as e:
42
+ # Return basic results if advanced analysis fails
43
+ return {
44
+ 'overview': self.generate_overview(df),
45
+ 'distributions': {},
46
+ 'correlations': {},
47
+ 'insights': [{'title': 'Analysis Error', 'description': f'Error during analysis: {str(e)}'}],
48
+ 'data_quality': {},
49
+ 'advanced_analysis': {}
50
+ }
51
+
52
+ def generate_overview(self, df: pd.DataFrame) -> Dict[str, Any]:
53
+ """Generate dataset overview"""
54
+ try:
55
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
56
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
57
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns
58
+
59
+ overview = {
60
+ 'total_rows': len(df),
61
+ 'total_columns': len(df.columns),
62
+ 'numeric_columns': len(numeric_cols),
63
+ 'categorical_columns': len(categorical_cols),
64
+ 'datetime_columns': len(datetime_cols),
65
+ 'memory_usage': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
66
+ 'duplicate_rows': df.duplicated().sum(),
67
+ 'missing_values_total': df.isnull().sum().sum()
68
+ }
69
+
70
+ if len(numeric_cols) > 0:
71
+ overview['summary_stats'] = df[numeric_cols].describe()
72
+
73
+ return overview
74
+ except Exception as e:
75
+ return {
76
+ 'total_rows': len(df) if df is not None else 0,
77
+ 'total_columns': len(df.columns) if df is not None else 0,
78
+ 'numeric_columns': 0,
79
+ 'categorical_columns': 0,
80
+ 'datetime_columns': 0,
81
+ 'memory_usage': '0 MB',
82
+ 'duplicate_rows': 0,
83
+ 'missing_values_total': 0,
84
+ 'error': str(e)
85
+ }
86
+
87
+ def analyze_distributions(self, df: pd.DataFrame) -> Dict[str, go.Figure]:
88
+ """Analyze data distributions with multiple chart types"""
89
+ distributions = {}
90
+
91
+ try:
92
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
93
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
94
+
95
+ # Numeric distributions
96
+ if len(numeric_cols) > 0:
97
+ distributions.update(self.create_numeric_distributions(df, numeric_cols))
98
+
99
+ # Categorical distributions
100
+ if len(categorical_cols) > 0:
101
+ distributions.update(self.create_categorical_distributions(df, categorical_cols))
102
+
103
+ except Exception as e:
104
+ distributions['error'] = self.create_error_plot(f"Distribution analysis failed: {str(e)}")
105
+
106
+ return distributions
107
+
108
+ def create_error_plot(self, error_message: str) -> go.Figure:
109
+ """Create an error plot when analysis fails"""
110
+ fig = go.Figure()
111
+ fig.add_annotation(
112
+ text=error_message,
113
+ xref="paper", yref="paper",
114
+ x=0.5, y=0.5, xanchor='center', yanchor='middle',
115
+ showarrow=False,
116
+ font=dict(size=16, color="red")
117
+ )
118
+ fig.update_layout(
119
+ title="Analysis Error",
120
+ showlegend=False,
121
+ plot_bgcolor='rgba(0,0,0,0)',
122
+ paper_bgcolor='rgba(0,0,0,0)',
123
+ font=dict(color='white')
124
+ )
125
+ return fig
126
+
127
+ def create_numeric_distributions(self, df: pd.DataFrame, numeric_cols: List[str]) -> Dict[str, go.Figure]:
128
+ """Create numeric distribution plots"""
129
+ plots = {}
130
+
131
+ try:
132
+ # Multi-histogram plot
133
+ if len(numeric_cols) <= 6:
134
+ rows = (len(numeric_cols) + 2) // 3
135
+ fig = make_subplots(
136
+ rows=rows, cols=3,
137
+ subplot_titles=list(numeric_cols),
138
+ vertical_spacing=0.08
139
+ )
140
+
141
+ for i, col in enumerate(numeric_cols):
142
+ row = (i // 3) + 1
143
+ col_pos = (i % 3) + 1
144
+
145
+ # Filter out non-finite values
146
+ data = df[col].dropna()
147
+ if len(data) > 0:
148
+ fig.add_trace(
149
+ go.Histogram(
150
+ x=data,
151
+ name=col,
152
+ marker_color=self.color_palette[i % len(self.color_palette)],
153
+ opacity=0.7,
154
+ showlegend=False
155
+ ),
156
+ row=row, col=col_pos
157
+ )
158
+
159
+ fig.update_layout(
160
+ title="πŸ“Š Numeric Distributions Overview",
161
+ height=300 * rows,
162
+ plot_bgcolor='rgba(0,0,0,0)',
163
+ paper_bgcolor='rgba(0,0,0,0)',
164
+ font=dict(color='white')
165
+ )
166
+ plots['numeric_histograms'] = fig
167
+
168
+ # Box plots for outlier detection
169
+ if len(numeric_cols) > 0:
170
+ fig = go.Figure()
171
+ for i, col in enumerate(numeric_cols[:8]): # Limit to 8 columns
172
+ data = df[col].dropna()
173
+ if len(data) > 0:
174
+ fig.add_trace(go.Box(
175
+ y=data,
176
+ name=col,
177
+ marker_color=self.color_palette[i % len(self.color_palette)]
178
+ ))
179
+
180
+ fig.update_layout(
181
+ title="πŸ“¦ Box Plots - Outlier Detection",
182
+ height=500,
183
+ plot_bgcolor='rgba(0,0,0,0)',
184
+ paper_bgcolor='rgba(0,0,0,0)',
185
+ font=dict(color='white')
186
+ )
187
+ plots['box_plots'] = fig
188
+
189
+ # Violin plots for distribution shapes
190
+ if len(numeric_cols) > 0:
191
+ fig = go.Figure()
192
+ for i, col in enumerate(numeric_cols[:6]):
193
+ data = df[col].dropna()
194
+ if len(data) > 1: # Need at least 2 points for violin plot
195
+ fig.add_trace(go.Violin(
196
+ y=data,
197
+ name=col,
198
+ box_visible=True,
199
+ meanline_visible=True,
200
+ fillcolor=self.color_palette[i % len(self.color_palette)],
201
+ opacity=0.6
202
+ ))
203
+
204
+ fig.update_layout(
205
+ title="🎻 Violin Plots - Distribution Shapes",
206
+ height=500,
207
+ plot_bgcolor='rgba(0,0,0,0)',
208
+ paper_bgcolor='rgba(0,0,0,0)',
209
+ font=dict(color='white')
210
+ )
211
+ plots['violin_plots'] = fig
212
+
213
+ except Exception as e:
214
+ plots['numeric_error'] = self.create_error_plot(f"Numeric distribution error: {str(e)}")
215
+
216
+ return plots
217
+
218
+ def create_categorical_distributions(self, df: pd.DataFrame, categorical_cols: List[str]) -> Dict[str, go.Figure]:
219
+ """Create categorical distribution plots"""
220
+ plots = {}
221
+
222
+ try:
223
+ # Bar charts for categorical variables
224
+ for i, col in enumerate(categorical_cols[:4]): # Limit to 4 columns
225
+ value_counts = df[col].value_counts().head(15) # Top 15 categories
226
+
227
+ if len(value_counts) > 0:
228
+ fig = go.Figure(data=[
229
+ go.Bar(
230
+ x=value_counts.index.astype(str),
231
+ y=value_counts.values,
232
+ marker_color=self.color_palette[i % len(self.color_palette)],
233
+ text=value_counts.values,
234
+ textposition='auto'
235
+ )
236
+ ])
237
+
238
+ fig.update_layout(
239
+ title=f"πŸ“Š {col} - Value Distribution",
240
+ xaxis_title=col,
241
+ yaxis_title="Count",
242
+ height=400,
243
+ plot_bgcolor='rgba(0,0,0,0)',
244
+ paper_bgcolor='rgba(0,0,0,0)',
245
+ font=dict(color='white')
246
+ )
247
+ plots[f'categorical_{col}'] = fig
248
+
249
+ # Pie chart for first categorical variable
250
+ if len(categorical_cols) > 0:
251
+ col = categorical_cols[0]
252
+ value_counts = df[col].value_counts().head(10)
253
+
254
+ if len(value_counts) > 0:
255
+ fig = go.Figure(data=[go.Pie(
256
+ labels=value_counts.index.astype(str),
257
+ values=value_counts.values,
258
+ hole=0.3,
259
+ marker_colors=self.color_palette
260
+ )])
261
+
262
+ fig.update_layout(
263
+ title=f"πŸ₯§ {col} - Proportion Analysis",
264
+ height=500,
265
+ plot_bgcolor='rgba(0,0,0,0)',
266
+ paper_bgcolor='rgba(0,0,0,0)',
267
+ font=dict(color='white')
268
+ )
269
+ plots['pie_chart'] = fig
270
+
271
+ except Exception as e:
272
+ plots['categorical_error'] = self.create_error_plot(f"Categorical distribution error: {str(e)}")
273
+
274
+ return plots
275
+
276
+ def analyze_correlations(self, df: pd.DataFrame) -> Dict[str, Any]:
277
+ """Analyze correlations between variables"""
278
+ correlations = {}
279
+
280
+ try:
281
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
282
+
283
+ if len(numeric_cols) > 1:
284
+ # Correlation matrix
285
+ corr_matrix = df[numeric_cols].corr()
286
+
287
+ # Heatmap
288
+ fig = go.Figure(data=go.Heatmap(
289
+ z=corr_matrix.values,
290
+ x=corr_matrix.columns,
291
+ y=corr_matrix.columns,
292
+ colorscale='RdYlBu',
293
+ zmid=0,
294
+ text=np.round(corr_matrix.values, 2),
295
+ texttemplate="%{text}",
296
+ textfont={"size": 10},
297
+ colorbar=dict(title="Correlation")
298
+ ))
299
+
300
+ fig.update_layout(
301
+ title="πŸ”₯ Correlation Heatmap",
302
+ height=max(400, len(numeric_cols) * 30),
303
+ plot_bgcolor='rgba(0,0,0,0)',
304
+ paper_bgcolor='rgba(0,0,0,0)',
305
+ font=dict(color='white')
306
+ )
307
+ correlations['heatmap'] = fig
308
+
309
+ # Top correlations
310
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
311
+ corr_matrix_masked = corr_matrix.mask(mask)
312
+
313
+ # Get top positive and negative correlations
314
+ corr_pairs = []
315
+ for i in range(len(corr_matrix_masked.columns)):
316
+ for j in range(len(corr_matrix_masked.columns)):
317
+ if pd.notna(corr_matrix_masked.iloc[i, j]):
318
+ corr_pairs.append({
319
+ 'Variable 1': corr_matrix_masked.columns[i],
320
+ 'Variable 2': corr_matrix_masked.columns[j],
321
+ 'Correlation': corr_matrix_masked.iloc[i, j]
322
+ })
323
+
324
+ if corr_pairs:
325
+ corr_df = pd.DataFrame(corr_pairs)
326
+ corr_df = corr_df.reindex(corr_df['Correlation'].abs().sort_values(ascending=False).index)
327
+ correlations['top_correlations'] = corr_df.head(10)
328
+
329
+ # Scatter plot matrix for top correlated variables
330
+ if len(numeric_cols) >= 2:
331
+ top_corr_cols = corr_df.head(3)[['Variable 1', 'Variable 2']].values.flatten()
332
+ unique_cols = list(set(top_corr_cols))[:4] # Max 4 variables
333
+
334
+ if len(unique_cols) >= 2:
335
+ try:
336
+ fig = px.scatter_matrix(
337
+ df[unique_cols].dropna(),
338
+ dimensions=unique_cols,
339
+ color_discrete_sequence=self.color_palette
340
+ )
341
+
342
+ fig.update_layout(
343
+ title="🎯 Scatter Plot Matrix - Top Correlated Variables",
344
+ height=600,
345
+ plot_bgcolor='rgba(0,0,0,0)',
346
+ paper_bgcolor='rgba(0,0,0,0)',
347
+ font=dict(color='white')
348
+ )
349
+ correlations['scatter_matrix'] = fig
350
+ except Exception:
351
+ pass # Skip if scatter matrix fails
352
+
353
+ except Exception as e:
354
+ correlations['error'] = f"Correlation analysis failed: {str(e)}"
355
+
356
+ return correlations
357
+
358
+ def generate_insights(self, df: pd.DataFrame) -> List[Dict[str, str]]:
359
+ """Generate AI-powered insights about the data"""
360
+ insights = []
361
+
362
+ try:
363
+ # Basic statistics insights
364
+ insights.append({
365
+ 'title': 'πŸ“Š Dataset Overview',
366
+ 'description': f"Dataset contains {len(df):,} rows and {len(df.columns)} columns. "
367
+ f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB. "
368
+ f"Missing values: {df.isnull().sum().sum():,} ({df.isnull().sum().sum() / df.size * 100:.1f}%)."
369
+ })
370
+
371
+ # Numeric columns insights
372
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
373
+ if len(numeric_cols) > 0:
374
+ try:
375
+ # Find columns with high variance
376
+ variances = df[numeric_cols].var().sort_values(ascending=False)
377
+ high_var_col = variances.index[0]
378
+
379
+ insights.append({
380
+ 'title': 'πŸ“ˆ Variance Analysis',
381
+ 'description': f"'{high_var_col}' shows the highest variance ({variances.iloc[0]:.2f}), "
382
+ f"indicating significant spread in values. This column might contain outliers "
383
+ f"or represent a key differentiating factor in your dataset."
384
+ })
385
+
386
+ # Skewness analysis
387
+ skewed_cols = []
388
+ for col in numeric_cols:
389
+ try:
390
+ skewness = df[col].skew()
391
+ if abs(skewness) > 1:
392
+ skewed_cols.append((col, skewness))
393
+ except:
394
+ continue
395
+
396
+ if skewed_cols:
397
+ insights.append({
398
+ 'title': 'πŸ“ Distribution Skewness',
399
+ 'description': f"Found {len(skewed_cols)} heavily skewed columns. "
400
+ f"Most skewed: '{skewed_cols[0][0]}' (skewness: {skewed_cols[0][1]:.2f}). "
401
+ f"Consider log transformation or outlier treatment for better modeling."
402
+ })
403
+ except Exception:
404
+ pass
405
+
406
+ # Categorical insights
407
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
408
+ if len(categorical_cols) > 0:
409
+ try:
410
+ cardinalities = []
411
+ for col in categorical_cols:
412
+ unique_count = df[col].nunique()
413
+ cardinalities.append((col, unique_count))
414
+
415
+ cardinalities.sort(key=lambda x: x[1], reverse=True)
416
+
417
+ insights.append({
418
+ 'title': '🏷️ Categorical Analysis',
419
+ 'description': f"'{cardinalities[0][0]}' has the highest cardinality ({cardinalities[0][1]} unique values). "
420
+ f"High cardinality columns might need encoding strategies for machine learning. "
421
+ f"Consider grouping rare categories or using embedding techniques."
422
+ })
423
+ except Exception:
424
+ pass
425
+
426
+ # Missing data patterns
427
+ try:
428
+ missing_data = df.isnull().sum()
429
+ missing_cols = missing_data[missing_data > 0].sort_values(ascending=False)
430
+
431
+ if len(missing_cols) > 0:
432
+ insights.append({
433
+ 'title': '❓ Missing Data Patterns',
434
+ 'description': f"'{missing_cols.index[0]}' has the most missing values ({missing_cols.iloc[0]:,} - "
435
+ f"{missing_cols.iloc[0] / len(df) * 100:.1f}%). "
436
+ f"Analyze if missing data is random or systematic. "
437
+ f"Consider imputation strategies or feature engineering."
438
+ })
439
+ except Exception:
440
+ pass
441
+
442
+ # Correlation insights
443
+ if len(numeric_cols) > 1:
444
+ try:
445
+ corr_matrix = df[numeric_cols].corr()
446
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
447
+ corr_matrix_masked = corr_matrix.mask(mask)
448
+
449
+ max_corr = 0
450
+ max_pair = None
451
+ for i in range(len(corr_matrix_masked.columns)):
452
+ for j in range(len(corr_matrix_masked.columns)):
453
+ if pd.notna(corr_matrix_masked.iloc[i, j]):
454
+ if abs(corr_matrix_masked.iloc[i, j]) > abs(max_corr):
455
+ max_corr = corr_matrix_masked.iloc[i, j]
456
+ max_pair = (corr_matrix_masked.columns[i], corr_matrix_masked.columns[j])
457
+
458
+ if max_pair and abs(max_corr) > 0.5:
459
+ insights.append({
460
+ 'title': 'πŸ”— Strong Correlations',
461
+ 'description': f"Strong correlation found between '{max_pair[0]}' and '{max_pair[1]}' "
462
+ f"(r = {max_corr:.3f}). This suggests potential multicollinearity. "
463
+ f"Consider feature selection or dimensionality reduction techniques."
464
+ })
465
+ except Exception:
466
+ pass
467
+
468
+ except Exception as e:
469
+ insights.append({
470
+ 'title': 'Analysis Error',
471
+ 'description': f"Error generating insights: {str(e)}"
472
+ })
473
+
474
+ return insights
475
+
476
+ def assess_data_quality(self, df: pd.DataFrame) -> Dict[str, Any]:
477
+ """Assess data quality with visualizations"""
478
+ quality = {}
479
+
480
+ try:
481
+ # Missing values heatmap
482
+ if df.isnull().sum().sum() > 0:
483
+ missing_data = df.isnull().sum().sort_values(ascending=False)
484
+ missing_data = missing_data[missing_data > 0]
485
+
486
+ if len(missing_data) > 0:
487
+ fig = go.Figure([go.Bar(
488
+ x=missing_data.index,
489
+ y=missing_data.values,
490
+ marker_color='#FF6B6B',
491
+ text=missing_data.values,
492
+ textposition='auto'
493
+ )])
494
+
495
+ fig.update_layout(
496
+ title="❓ Missing Values by Column",
497
+ xaxis_title="Columns",
498
+ yaxis_title="Missing Count",
499
+ height=400,
500
+ plot_bgcolor='rgba(0,0,0,0)',
501
+ paper_bgcolor='rgba(0,0,0,0)',
502
+ font=dict(color='white')
503
+ )
504
+ quality['missing_values'] = fig
505
+
506
+ # Data types distribution
507
+ dtype_counts = df.dtypes.value_counts()
508
+
509
+ if len(dtype_counts) > 0:
510
+ fig = go.Figure(data=[go.Pie(
511
+ labels=[str(dtype) for dtype in dtype_counts.index],
512
+ values=dtype_counts.values,
513
+ hole=0.3,
514
+ marker_colors=self.color_palette
515
+ )])
516
+
517
+ fig.update_layout(
518
+ title="πŸ”§ Data Types Distribution",
519
+ height=400,
520
+ plot_bgcolor='rgba(0,0,0,0)',
521
+ paper_bgcolor='rgba(0,0,0,0)',
522
+ font=dict(color='white')
523
+ )
524
+ quality['data_types'] = fig
525
+
526
+ # Duplicate analysis
527
+ duplicates = df.duplicated().sum()
528
+ if duplicates > 0:
529
+ quality['duplicates'] = {
530
+ 'count': duplicates,
531
+ 'percentage': duplicates / len(df) * 100
532
+ }
533
+
534
+ except Exception as e:
535
+ quality['error'] = f"Data quality assessment failed: {str(e)}"
536
+
537
+ return quality
538
+
539
+ def perform_advanced_analysis(self, df: pd.DataFrame) -> Dict[str, Any]:
540
+ """Perform advanced statistical analysis"""
541
+ advanced = {}
542
+
543
+ try:
544
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
545
+
546
+ # Outlier detection using IQR method
547
+ if len(numeric_cols) > 0:
548
+ outlier_counts = {}
549
+ for col in numeric_cols:
550
+ try:
551
+ data = df[col].dropna()
552
+ if len(data) > 0:
553
+ Q1 = data.quantile(0.25)
554
+ Q3 = data.quantile(0.75)
555
+ IQR = Q3 - Q1
556
+ lower_bound = Q1 - 1.5 * IQR
557
+ upper_bound = Q3 + 1.5 * IQR
558
+
559
+ outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
560
+ outlier_counts[col] = len(outliers)
561
+ except Exception:
562
+ outlier_counts[col] = 0
563
+
564
+ if outlier_counts:
565
+ outlier_df = pd.DataFrame(list(outlier_counts.items()),
566
+ columns=['Column', 'Outlier_Count'])
567
+ outlier_df = outlier_df.sort_values('Outlier_Count', ascending=False)
568
+ advanced['outliers'] = outlier_df
569
+
570
+ # Statistical tests
571
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
572
+
573
+ if len(categorical_cols) >= 2 and SCIPY_AVAILABLE:
574
+ try:
575
+ col1, col2 = categorical_cols[0], categorical_cols[1]
576
+ contingency_table = pd.crosstab(df[col1], df[col2])
577
+
578
+ if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
579
+ chi2, p_value, dof, expected = chi2_contingency(contingency_table)
580
+
581
+ advanced['chi_square_test'] = {
582
+ 'variables': [col1, col2],
583
+ 'chi2_statistic': chi2,
584
+ 'p_value': p_value,
585
+ 'interpretation': 'Dependent' if p_value < 0.05 else 'Independent'
586
+ }
587
+ except Exception:
588
+ pass # Skip if test fails
589
+
590
+ except Exception as e:
591
+ advanced['error'] = f"Advanced analysis failed: {str(e)}"
592
+
593
+ return advanced
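The outlier, skewness, and chi-square checks above follow textbook recipes: values outside the 1.5×IQR fence are flagged as outliers, |skewness| > 1 marks a heavily skewed distribution, and a chi-square test of independence on a contingency table decides whether two categorical columns are related (p < 0.05 is read as "dependent"). As a self-contained sketch of the same logic on a small synthetic DataFrame (column names borrowed from sample_data.csv, data generated purely for illustration, not part of this commit):

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

rng = np.random.default_rng(42)
toy = pd.DataFrame({
    "sales_amount": rng.lognormal(mean=7.5, sigma=0.6, size=500),   # right-skewed numeric column
    "region": rng.choice(["North", "South", "East", "West"], size=500),
    "product": rng.choice(["Widget A", "Widget B", "Gadget X"], size=500),
})

# 1.5*IQR fence, mirroring perform_advanced_analysis above
q1, q3 = toy["sales_amount"].quantile([0.25, 0.75])
iqr = q3 - q1
outliers = toy[(toy["sales_amount"] < q1 - 1.5 * iqr) | (toy["sales_amount"] > q3 + 1.5 * iqr)]
print(f"IQR outliers: {len(outliers)}")

# skewness before and after the log transform suggested in the insight text
print(f"skewness raw:   {toy['sales_amount'].skew():.2f}")
print(f"skewness log1p: {np.log1p(toy['sales_amount']).skew():.2f}")

# chi-square test of independence between two categorical columns
chi2, p_value, dof, expected = chi2_contingency(pd.crosstab(toy["region"], toy["product"]))
print("Dependent" if p_value < 0.05 else "Independent", f"(p = {p_value:.3f})")
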
main.py ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ streamlit>=1.28.0
2
+ pandas>=1.5.0
3
+ numpy>=1.24.0
4
+ plotly>=5.15.0
5
+ requests>=2.31.0
6
+ faiss-cpu>=1.7.4
7
+ scipy>=1.10.0
8
+ seaborn>=0.12.0
9
+ sentence-transformers>=2.2.0
10
+ scikit-learn>=1.3.0
11
+ pathlib2>=2.3.7
12
+ python-dotenv>=1.0.0
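
To check an existing environment against these pins, a small sketch using the standard-library importlib.metadata can be handy (the helper itself is not part of this commit; the distribution names are copied from the list above):

from importlib.metadata import PackageNotFoundError, version

pinned = ["streamlit", "pandas", "numpy", "plotly", "requests", "faiss-cpu",
          "scipy", "seaborn", "sentence-transformers", "scikit-learn",
          "pathlib2", "python-dotenv"]

for name in pinned:
    try:
        print(f"{name}=={version(name)}")        # installed version, compare against the pin
    except PackageNotFoundError:
        print(f"{name} is NOT installed")
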
setup.bat ADDED
File without changes
setup.sh ADDED
File without changes
test.py ADDED
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to verify the Neural Data Analyst application works correctly
4
+ Run this to check if all dependencies are available and create sample files
5
+ """
6
+
7
+ import sys
8
+ import os
9
+ from pathlib import Path
10
+
11
+ def check_dependencies():
12
+ """Check if all required dependencies are installed"""
13
+ print("πŸ” Checking dependencies...")
14
+
15
+ required_packages = [
16
+ 'streamlit',
17
+ 'pandas',
18
+ 'numpy',
19
+ 'plotly',
20
+ 'requests'
21
+ ]
22
+
23
+ missing_packages = []
24
+
25
+ for package in required_packages:
26
+ try:
27
+ __import__(package)
28
+ print(f"βœ… {package}")
29
+ except ImportError:
30
+ print(f"❌ {package} - MISSING")
31
+ missing_packages.append(package)
32
+
33
+ # Optional packages
34
+ optional_packages = [
35
+ 'scipy',
36
+ 'python-dotenv'
37
+ ]
38
+
39
+ print("\nπŸ” Checking optional dependencies...")
40
+ for package in optional_packages:
41
+ try:
42
+ # python-dotenv is imported as 'dotenv', not 'python_dotenv'
+ module_name = 'dotenv' if package == 'python-dotenv' else package
+ __import__(module_name)
43
+ print(f"βœ… {package} (optional)")
44
+ except ImportError:
45
+ print(f"⚠️ {package} (optional) - not installed")
46
+
47
+ if missing_packages:
48
+ print(f"\n❌ Missing required packages: {', '.join(missing_packages)}")
49
+ print("Install them with: pip install " + " ".join(missing_packages))
50
+ return False
51
+ else:
52
+ print("\nβœ… All required dependencies are installed!")
53
+ return True
54
+
55
+ def create_sample_files():
56
+ """Create sample configuration files"""
57
+ print("\nπŸ“ Creating sample files...")
58
+
59
+ # Create .env file template
60
+ env_content = """# Groq API Configuration
61
+ # Get your API key from: https://console.groq.com/keys
62
+ GROQ_API_KEY=your_groq_api_key_here
63
+
64
+ # Optional: Set other environment variables
65
+ # DEBUG=True
66
+ """
67
+
68
+ env_file = Path('.env.template')
69
+ if not env_file.exists():
70
+ with open(env_file, 'w') as f:
71
+ f.write(env_content)
72
+ print(f"βœ… Created {env_file}")
73
+ else:
74
+ print(f"ℹ️ {env_file} already exists")
75
+
76
+ # Create sample CSV data
77
+ sample_csv = Path('sample_data.csv')
78
+ if not sample_csv.exists():
79
+ csv_content = """customer_id,customer_name,product,sales_amount,order_date,region,sales_rep
80
+ 1,Customer_1,Widget A,2147.23,2023-01-01,North,John Smith
81
+ 2,Customer_2,Widget B,1823.45,2023-01-02,South,Jane Doe
82
+ 3,Customer_3,Widget C,2456.78,2023-01-03,East,Bob Johnson
83
+ 4,Customer_4,Gadget X,1934.56,2023-01-04,West,Alice Brown
84
+ 5,Customer_5,Widget A,2234.67,2023-01-05,North,John Smith
85
+ """
86
+ with open(sample_csv, 'w') as f:
87
+ f.write(csv_content)
88
+ print(f"βœ… Created {sample_csv}")
89
+ else:
90
+ print(f"ℹ️ {sample_csv} already exists")
91
+
92
+ def create_required_modules():
93
+ """Create the required module files if they don't exist"""
94
+ print("\nπŸ“ Checking required modules...")
95
+
96
+ # Check if eda_analyzer.py exists
97
+ if not Path('eda_analyzer.py').exists():
98
+ print("❌ eda_analyzer.py not found!")
99
+ print(" Please save the EDA Analyzer code as 'eda_analyzer.py'")
100
+ return False
101
+ else:
102
+ print("βœ… eda_analyzer.py found")
103
+
104
+ # Check if database_manager.py exists
105
+ if not Path('database_manager.py').exists():
106
+ print("❌ database_manager.py not found!")
107
+ print(" Please save the Database Manager code as 'database_manager.py'")
108
+ return False
109
+ else:
110
+ print("βœ… database_manager.py found")
111
+
112
+ return True
113
+
114
+ def test_imports():
115
+ """Test if the modules can be imported"""
116
+ print("\nπŸ§ͺ Testing module imports...")
117
+
118
+ try:
119
+ from eda_analyzer import EDAAnalyzer
120
+ print("βœ… EDAAnalyzer imported successfully")
121
+ except Exception as e:
122
+ print(f"❌ Failed to import EDAAnalyzer: {e}")
123
+ return False
124
+
125
+ try:
126
+ from database_manager import DatabaseManager
127
+ print("βœ… DatabaseManager imported successfully")
128
+ except Exception as e:
129
+ print(f"❌ Failed to import DatabaseManager: {e}")
130
+ return False
131
+
132
+ return True
133
+
134
+ def main():
135
+ """Main test function"""
136
+ print("πŸš€ Neural Data Analyst - Setup Test")
137
+ print("=" * 50)
138
+
139
+ # Check Python version
140
+ python_version = sys.version_info
141
+ print(f"🐍 Python version: {python_version.major}.{python_version.minor}.{python_version.micro}")
142
+
143
+ if python_version < (3, 8):  # streamlit>=1.28 and pandas>=1.5 require Python 3.8+
144
+ print("❌ Python 3.7+ required!")
145
+ return False
146
+ else:
147
+ print("βœ… Python version OK")
148
+
149
+ # Run all checks
150
+ deps_ok = check_dependencies()
151
+ if not deps_ok:
152
+ return False
153
+
154
+ create_sample_files()
155
+
156
+ modules_ok = create_required_modules()
157
+ if not modules_ok:
158
+ return False
159
+
160
+ imports_ok = test_imports()
161
+ if not imports_ok:
162
+ return False
163
+
164
+ print("\nπŸŽ‰ Setup test completed successfully!")
165
+ print("\nπŸ“‹ Next steps:")
166
+ print("1. Copy .env.template to .env and add your Groq API key (optional)")
167
+ print("2. Run: streamlit run app.py")
168
+ print("3. Upload sample_data.csv or your own data file")
169
+ print("4. Explore the analysis features!")
170
+
171
+ return True
172
+
173
+ if __name__ == "__main__":
174
+ success = main()
175
+ sys.exit(0 if success else 1)
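
Since main() exits with 0 on success and 1 on failure, the script can gate an automated setup step. A minimal illustrative wrapper (not part of this commit):

import subprocess
import sys

# run the setup test in a child interpreter and propagate failure
result = subprocess.run([sys.executable, "test.py"])
if result.returncode != 0:
    sys.exit("Setup test failed - fix the reported issues before launching the app.")
print("Setup test passed - ready to launch the app.")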