Spaces: Running

Dhruv Pawar committed · Commit 1273036 · 0 Parent(s)

Initial commit: Neural Data Analyst v1.0
Browse files
- .env.template +10 -0
- .gitignore +134 -0
- advanced_features.py +730 -0
- database_manager.py +285 -0
- eda_analyzer.py +593 -0
- main.py +0 -0
- requirements.txt +12 -0
- setup.bat +0 -0
- setup.sh +0 -0
- test.py +175 -0
.env.template
ADDED
@@ -0,0 +1,10 @@
+# Neural Data Analyst Environment Variables
+# Copy this file to .env and add your actual API key
+
+# Groq API Configuration
+GROQ_API_KEY=your_groq_api_key_here
+
+# Optional: Default model to use
+DEFAULT_MODEL=llama-3.3-70b-versatile
+
+# Get your Groq API key from: https://console.groq.com/keys
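How these variables are consumed isn't visible in this commit (main.py is added empty, +0 -0), so the following is only a minimal sketch of the usual loading pattern, assuming python-dotenv is available; the variable names come straight from the template above.

```python
# Sketch (not part of the commit): load the settings defined in .env.template.
# Assumes python-dotenv is installed and a .env file sits in the working directory.
import os
from dotenv import load_dotenv

load_dotenv()  # no-op if .env is missing
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
DEFAULT_MODEL = os.getenv("DEFAULT_MODEL", "llama-3.3-70b-versatile")

if not GROQ_API_KEY:
    raise RuntimeError("GROQ_API_KEY is not set; copy .env.template to .env and add your key.")
```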
.gitignore
ADDED
@@ -0,0 +1,134 @@
+# Neural Data Analyst - .gitignore
+
+# Environment Variables (IMPORTANT: Never commit API keys!)
+.env
+.env.local
+.env.development
+.env.test
+.env.production
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environments
+venv/
+env/
+ENV/
+env.bak/
+venv.bak/
+.venv/
+
+# Streamlit
+.streamlit/
+streamlit_cache/
+
+# Database files
+*.db
+*.sqlite
+*.sqlite3
+neural_analyst_db/
+analysis_history.json
+
+# Logs
+*.log
+logs/
+neural_logs/
+
+# Cache
+.cache/
+cache/
+temp/
+tmp/
+
+# Data files (add your data files here)
+data/
+datasets/
+uploads/
+*.csv
+*.json
+*.xlsx
+*.xls
+sample_data.csv
+
+# IDE and Editor files
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+Thumbs.db
+
+# Jupyter Notebooks
+.ipynb_checkpoints/
+*.ipynb
+
+# pytest
+.pytest_cache/
+.coverage
+htmlcov/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Documentation builds
+docs/_build/
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+
+# Backup files
+*.backup
+*.bak
+*.old
+
+# Temporary files
+*.tmp
+*.temp
+
+# API Keys and Secrets (double protection)
+secrets.toml
+.secrets.toml
+api_keys.txt
+config.json
+
+# Local configuration
+local_config.py
+config_local.py
advanced_features.py
ADDED
@@ -0,0 +1,730 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import json
+from datetime import datetime, timedelta
+from typing import Dict, List, Any
+import base64
+from io import BytesIO
+
+# Additional advanced features for Neural Data Analyst
+
+class AdvancedFeatures:
+    """Advanced features and utilities for the Neural Data Analyst"""
+
+    def __init__(self, db_manager):
+        self.db_manager = db_manager
+
+    def render_advanced_analytics_dashboard(self, df: pd.DataFrame):
+        """Render advanced analytics dashboard"""
+        st.markdown("## π¬ Advanced Analytics Dashboard")
+
+        tabs = st.tabs([
+            "π Interactive Plots",
+            "π― Smart Recommendations",
+            "π Trend Analysis",
+            "π Anomaly Detection",
+            "π Report Generator"
+        ])
+
+        with tabs[0]:
+            self.render_interactive_plots(df)
+
+        with tabs[1]:
+            self.render_smart_recommendations(df)
+
+        with tabs[2]:
+            self.render_trend_analysis(df)
+
+        with tabs[3]:
+            self.render_anomaly_detection(df)
+
+        with tabs[4]:
+            self.render_report_generator(df)
+
+    def render_interactive_plots(self, df: pd.DataFrame):
+        """Render interactive plotting interface"""
+        st.markdown("### π Interactive Plot Builder")
+
+        col1, col2, col3 = st.columns(3)
+
+        with col1:
+            plot_type = st.selectbox(
+                "Plot Type",
+                ["Scatter", "Line", "Bar", "Histogram", "Box", "Violin", "Heatmap", "3D Scatter"]
+            )
+
+        with col2:
+            x_column = st.selectbox("X-axis", df.columns)
+
+        with col3:
+            y_column = st.selectbox("Y-axis", df.columns)
+
+        # Color and size options
+        col1, col2 = st.columns(2)
+        with col1:
+            color_column = st.selectbox("Color by", ["None"] + list(df.columns))
+        with col2:
+            size_column = st.selectbox("Size by", ["None"] + list(df.select_dtypes(include=[np.number]).columns))
+
+        # Generate plot based on selections
+        if st.button("π¨ Generate Plot"):
+            fig = self.create_dynamic_plot(df, plot_type, x_column, y_column, color_column, size_column)
+            if fig:
+                st.plotly_chart(fig, use_container_width=True)
+
+        # Plot gallery
+        with st.expander("πΌοΈ Quick Plot Gallery"):
+            self.render_plot_gallery(df)
+
+    def create_dynamic_plot(self, df: pd.DataFrame, plot_type: str, x_col: str, y_col: str,
+                            color_col: str = None, size_col: str = None):
+        """Create dynamic plot based on user selections"""
+        try:
+            kwargs = {
+                'data_frame': df,
+                'x': x_col,
+                'title': f'{plot_type} Plot: {x_col} vs {y_col}'
+            }
+
+            if y_col and y_col != x_col:
+                kwargs['y'] = y_col
+
+            if color_col and color_col != "None":
+                kwargs['color'] = color_col
+
+            if size_col and size_col != "None" and plot_type in ["Scatter", "3D Scatter"]:
+                kwargs['size'] = size_col
+
+            if plot_type == "Scatter":
+                fig = px.scatter(**kwargs)
+            elif plot_type == "Line":
+                fig = px.line(**kwargs)
+            elif plot_type == "Bar":
+                fig = px.bar(**kwargs)
+            elif plot_type == "Histogram":
+                fig = px.histogram(df, x=x_col, title=f'Histogram: {x_col}')
+            elif plot_type == "Box":
+                fig = px.box(**kwargs)
+            elif plot_type == "Violin":
+                fig = px.violin(**kwargs)
+            elif plot_type == "3D Scatter":
+                z_col = st.selectbox("Z-axis", df.select_dtypes(include=[np.number]).columns)
+                kwargs['z'] = z_col
+                fig = px.scatter_3d(**kwargs)
+            elif plot_type == "Heatmap":
+                numeric_df = df.select_dtypes(include=[np.number])
+                corr_matrix = numeric_df.corr()
+                fig = px.imshow(corr_matrix, text_auto=True, title="Correlation Heatmap")
+            else:
+                return None
+
+            fig.update_layout(
+                plot_bgcolor='rgba(0,0,0,0)',
+                paper_bgcolor='rgba(0,0,0,0)',
+                font=dict(color='white')
+            )
+
+            return fig
+
+        except Exception as e:
+            st.error(f"Error creating plot: {str(e)}")
+            return None
+
+    def render_plot_gallery(self, df: pd.DataFrame):
+        """Render quick plot gallery"""
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+        if len(numeric_cols) >= 2:
+            col1, col2 = st.columns(2)
+
+            with col1:
+                # Quick correlation plot
+                fig = px.scatter(df, x=numeric_cols[0], y=numeric_cols[1],
+                                 title="Quick Correlation View")
+                fig.update_layout(height=300)
+                st.plotly_chart(fig, use_container_width=True)
+
+            with col2:
+                # Quick distribution plot
+                fig = px.histogram(df, x=numeric_cols[0], title="Quick Distribution")
+                fig.update_layout(height=300)
+                st.plotly_chart(fig, use_container_width=True)
+
+    def render_smart_recommendations(self, df: pd.DataFrame):
+        """Render smart analysis recommendations"""
+        st.markdown("### π― Smart Analysis Recommendations")
+
+        recommendations = self.generate_analysis_recommendations(df)
+
+        for i, rec in enumerate(recommendations):
+            with st.expander(f"π‘ {rec['title']}", expanded=i == 0):
+                st.markdown(f"**Recommendation:** {rec['description']}")
+                st.markdown(f"**Rationale:** {rec['rationale']}")
+
+                if st.button(f"Apply Recommendation", key=f"apply_rec_{i}"):
+                    self.apply_recommendation(df, rec)
+
+    def generate_analysis_recommendations(self, df: pd.DataFrame) -> List[Dict[str, str]]:
+        """Generate smart analysis recommendations"""
+        recommendations = []
+
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        categorical_cols = df.select_dtypes(include=['object', 'category']).columns
+
+        # Missing data recommendation
+        missing_data = df.isnull().sum()
+        high_missing = missing_data[missing_data > len(df) * 0.1]
+
+        if len(high_missing) > 0:
+            recommendations.append({
+                'title': 'Missing Data Analysis',
+                'description': f'Analyze missing data patterns in {len(high_missing)} columns with >10% missing values',
+                'rationale': 'Understanding missing data patterns can reveal data collection issues or systematic biases',
+                'action': 'missing_analysis'
+            })
+
+        # Correlation analysis recommendation
+        if len(numeric_cols) > 2:
+            recommendations.append({
+                'title': 'Correlation Deep Dive',
+                'description': 'Perform comprehensive correlation analysis with feature selection recommendations',
+                'rationale': 'Identifying highly correlated features can improve model performance and interpretability',
+                'action': 'correlation_analysis'
+            })
+
+        # Outlier detection recommendation
+        if len(numeric_cols) > 0:
+            recommendations.append({
+                'title': 'Outlier Detection & Treatment',
+                'description': 'Identify and analyze outliers using multiple statistical methods',
+                'rationale': 'Outliers can significantly impact analysis results and model performance',
+                'action': 'outlier_analysis'
+            })
+
+        # Segmentation recommendation
+        if len(categorical_cols) > 0 and len(numeric_cols) > 0:
+            recommendations.append({
+                'title': 'Customer/Data Segmentation',
+                'description': 'Perform clustering analysis to identify natural data segments',
+                'rationale': 'Segmentation can reveal hidden patterns and improve targeted strategies',
+                'action': 'segmentation_analysis'
+            })
+
+        # Time series recommendation
+        date_cols = df.select_dtypes(include=['datetime64']).columns
+        if len(date_cols) > 0:
+            recommendations.append({
+                'title': 'Time Series Analysis',
+                'description': 'Analyze temporal patterns and trends in your data',
+                'rationale': 'Time-based analysis can reveal seasonality, trends, and forecasting opportunities',
+                'action': 'time_series_analysis'
+            })
+
+        return recommendations
+
+    def apply_recommendation(self, df: pd.DataFrame, recommendation: Dict[str, str]):
+        """Apply a smart recommendation"""
+        action = recommendation.get('action')
+
+        if action == 'missing_analysis':
+            self.perform_missing_analysis(df)
+        elif action == 'correlation_analysis':
+            self.perform_correlation_analysis(df)
+        elif action == 'outlier_analysis':
+            self.perform_outlier_analysis(df)
+        elif action == 'segmentation_analysis':
+            self.perform_segmentation_analysis(df)
+        elif action == 'time_series_analysis':
+            self.perform_time_series_analysis(df)
+
+    def perform_missing_analysis(self, df: pd.DataFrame):
+        """Perform detailed missing data analysis"""
+        st.markdown("#### π Missing Data Analysis Results")
+
+        missing_data = df.isnull().sum()
+        missing_percent = (missing_data / len(df)) * 100
+
+        missing_df = pd.DataFrame({
+            'Column': missing_data.index,
+            'Missing_Count': missing_data.values,
+            'Missing_Percentage': missing_percent.values
+        })
+
+        missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)
+
+        if len(missing_df) > 0:
+            fig = px.bar(missing_df, x='Column', y='Missing_Percentage',
+                         title='Missing Data by Column (%)')
+            fig.update_layout(height=400)
+            st.plotly_chart(fig, use_container_width=True)
+
+            st.dataframe(missing_df, use_container_width=True)
+        else:
+            st.success("β No missing data found in the dataset!")
+
+    def perform_correlation_analysis(self, df: pd.DataFrame):
+        """Perform detailed correlation analysis"""
+        st.markdown("#### π Advanced Correlation Analysis")
+
+        numeric_df = df.select_dtypes(include=[np.number])
+
+        if len(numeric_df.columns) > 1:
+            corr_matrix = numeric_df.corr()
+
+            # Hierarchical clustering of correlations
+            from scipy.cluster.hierarchy import linkage, dendrogram
+            from scipy.spatial.distance import squareform
+
+            distance_matrix = 1 - np.abs(corr_matrix)
+            condensed_distances = squareform(distance_matrix, checks=False)
+            linkage_matrix = linkage(condensed_distances, method='average')
+
+            fig = go.Figure()
+            dendro = dendrogram(linkage_matrix, labels=corr_matrix.columns, no_plot=True)
+
+            # Create dendrogram plot
+            for i in range(len(dendro['icoord'])):
+                x = dendro['icoord'][i]
+                y = dendro['dcoord'][i]
+                fig.add_trace(go.Scatter(x=x, y=y, mode='lines',
+                                         line=dict(color='gold', width=2),
+                                         showlegend=False))
+
+            fig.update_layout(
+                title="Feature Clustering Dendrogram",
+                xaxis_title="Features",
+                yaxis_title="Distance",
+                height=400
+            )
+
+            st.plotly_chart(fig, use_container_width=True)
+
+    def render_trend_analysis(self, df: pd.DataFrame):
+        """Render trend analysis interface"""
+        st.markdown("### π Trend Analysis")
+
+        date_cols = df.select_dtypes(include=['datetime64']).columns
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+        if len(date_cols) == 0:
+            st.warning("No datetime columns found. Try converting date columns to datetime format.")
+
+            # Offer to convert columns
+            potential_date_cols = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
+            if potential_date_cols:
+                date_col = st.selectbox("Select date column to convert:", potential_date_cols)
+                if st.button("Convert to DateTime"):
+                    try:
+                        df[date_col] = pd.to_datetime(df[date_col])
+                        st.success(f"Converted {date_col} to datetime!")
+                        st.experimental_rerun()
+                    except Exception as e:
+                        st.error(f"Conversion failed: {str(e)}")
+            return
+
+        col1, col2 = st.columns(2)
+        with col1:
+            date_col = st.selectbox("Date Column", date_cols)
+        with col2:
+            value_col = st.selectbox("Value Column", numeric_cols)
+
+        if st.button("π Analyze Trends"):
+            self.perform_trend_analysis(df, date_col, value_col)
+
+    def perform_trend_analysis(self, df: pd.DataFrame, date_col: str, value_col: str):
+        """Perform trend analysis"""
+        st.markdown("#### π Trend Analysis Results")
+
+        # Time series plot
+        fig = px.line(df.sort_values(date_col), x=date_col, y=value_col,
+                      title=f'{value_col} Over Time')
+        fig.update_layout(height=400)
+        st.plotly_chart(fig, use_container_width=True)
+
+        # Rolling statistics
+        df_sorted = df.sort_values(date_col).copy()
+        df_sorted['7_day_avg'] = df_sorted[value_col].rolling(window=7, min_periods=1).mean()
+        df_sorted['30_day_avg'] = df_sorted[value_col].rolling(window=30, min_periods=1).mean()
+
+        fig = go.Figure()
+        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted[value_col],
+                                 name='Original', mode='lines'))
+        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted['7_day_avg'],
+                                 name='7-Day Average', mode='lines'))
+        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted['30_day_avg'],
+                                 name='30-Day Average', mode='lines'))
+
+        fig.update_layout(title="Trend with Moving Averages", height=400)
+        st.plotly_chart(fig, use_container_width=True)
+
+    def render_anomaly_detection(self, df: pd.DataFrame):
+        """Render anomaly detection interface"""
+        st.markdown("### π Anomaly Detection")
+
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+        if len(numeric_cols) == 0:
+            st.warning("No numeric columns found for anomaly detection.")
+            return
+
+        col1, col2 = st.columns(2)
+        with col1:
+            target_col = st.selectbox("Target Column", numeric_cols)
+        with col2:
+            method = st.selectbox("Detection Method",
+                                  ["IQR", "Z-Score", "Isolation Forest", "Local Outlier Factor"])
+
+        if st.button("π― Detect Anomalies"):
+            self.perform_anomaly_detection(df, target_col, method)
+
+    def perform_anomaly_detection(self, df: pd.DataFrame, target_col: str, method: str):
+        """Perform anomaly detection"""
+        st.markdown("#### π― Anomaly Detection Results")
+
+        data = df[target_col].dropna()
+        anomalies = []
+
+        if method == "IQR":
+            Q1 = data.quantile(0.25)
+            Q3 = data.quantile(0.75)
+            IQR = Q3 - Q1
+            lower_bound = Q1 - 1.5 * IQR
+            upper_bound = Q3 + 1.5 * IQR
+            anomalies = df[(df[target_col] < lower_bound) | (df[target_col] > upper_bound)]
+
+        elif method == "Z-Score":
+            z_scores = np.abs((data - data.mean()) / data.std())
+            anomalies = df[z_scores > 3]
+
+        elif method == "Isolation Forest":
+            from sklearn.ensemble import IsolationForest
+            iso_forest = IsolationForest(contamination=0.1, random_state=42)
+            outlier_labels = iso_forest.fit_predict(data.values.reshape(-1, 1))
+            anomalies = df[outlier_labels == -1]
+
+        elif method == "Local Outlier Factor":
+            from sklearn.neighbors import LocalOutlierFactor
+            lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
+            outlier_labels = lof.fit_predict(data.values.reshape(-1, 1))
+            anomalies = df[outlier_labels == -1]
+
+        # Visualization
+        fig = go.Figure()
+
+        # Normal data points
+        normal_data = df[~df.index.isin(anomalies.index)]
+        fig.add_trace(go.Scatter(
+            x=normal_data.index,
+            y=normal_data[target_col],
+            mode='markers',
+            name='Normal',
+            marker=dict(color='blue', size=6)
+        ))
+
+        # Anomalies
+        fig.add_trace(go.Scatter(
+            x=anomalies.index,
+            y=anomalies[target_col],
+            mode='markers',
+            name='Anomalies',
+            marker=dict(color='red', size=10, symbol='x')
+        ))
+
+        fig.update_layout(
+            title=f'Anomaly Detection: {target_col} ({method})',
+            xaxis_title='Index',
+            yaxis_title=target_col,
+            height=500
+        )
+
+        st.plotly_chart(fig, use_container_width=True)
+
+        # Summary
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Total Data Points", len(df))
+        with col2:
+            st.metric("Anomalies Found", len(anomalies))
+        with col3:
+            st.metric("Anomaly Rate", f"{len(anomalies)/len(df)*100:.2f}%")
+
+        if len(anomalies) > 0:
+            with st.expander("π Anomaly Details"):
+                st.dataframe(anomalies[[target_col]], use_container_width=True)
+
+    def render_report_generator(self, df: pd.DataFrame):
+        """Render automated report generator"""
+        st.markdown("### π Automated Report Generator")
+
+        report_type = st.selectbox(
+            "Report Type",
+            ["Executive Summary", "Technical Analysis", "Data Quality Report", "Custom Report"]
+        )
+
+        col1, col2 = st.columns(2)
+        with col1:
+            include_charts = st.checkbox("Include Charts", value=True)
+        with col2:
+            include_recommendations = st.checkbox("Include Recommendations", value=True)
+
+        if st.button("π Generate Report"):
+            report_content = self.generate_report(df, report_type, include_charts, include_recommendations)
+
+            # Display report
+            st.markdown("#### π Generated Report")
+            st.markdown(report_content)
+
+            # Download option
+            self.create_download_link(report_content, f"neural_analyst_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md")
+
+    def generate_report(self, df: pd.DataFrame, report_type: str, include_charts: bool, include_recommendations: bool) -> str:
+        """Generate automated report"""
+        report = f"""
+# Neural Data Analyst Report
+**Generated on:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
+**Report Type:** {report_type}
+
+## Dataset Overview
+- **Total Rows:** {len(df):,}
+- **Total Columns:** {len(df.columns)}
+- **Memory Usage:** {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB
+- **Missing Values:** {df.isnull().sum().sum():,} ({df.isnull().sum().sum() / df.size * 100:.1f}%)
+
+## Column Information
+"""
+
+        # Column details
+        for col in df.columns:
+            dtype = str(df[col].dtype)
+            null_count = df[col].isnull().sum()
+            unique_count = df[col].nunique()
+
+            report += f"- **{col}** ({dtype}): {null_count} missing, {unique_count} unique values\n"
+
+        # Numeric summary
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        if len(numeric_cols) > 0:
+            report += "\n## Numeric Summary\n"
+            summary_stats = df[numeric_cols].describe()
+            report += summary_stats.to_markdown()
+
+        # Key insights
+        if include_recommendations:
+            report += "\n## Key Insights & Recommendations\n"
+            recommendations = self.generate_analysis_recommendations(df)
+            for i, rec in enumerate(recommendations[:5], 1):
+                report += f"{i}. **{rec['title']}:** {rec['description']}\n"
+
+        return report
+
+    def create_download_link(self, content: str, filename: str):
+        """Create download link for report"""
+        b64 = base64.b64encode(content.encode()).decode()
+        href = f'<a href="data:text/markdown;base64,{b64}" download="{filename}">π₯ Download Report</a>'
+        st.markdown(href, unsafe_allow_html=True)
+
+    def render_data_comparison_tool(self):
+        """Render data comparison tool for multiple datasets"""
+        st.markdown("## βοΈ Data Comparison Tool")
+
+        st.markdown("Upload multiple datasets to compare their characteristics:")
+
+        uploaded_files = st.file_uploader(
+            "Choose CSV files for comparison",
+            type=['csv'],
+            accept_multiple_files=True
+        )
+
+        if len(uploaded_files) >= 2:
+            datasets = {}
+
+            for file in uploaded_files:
+                try:
+                    df = pd.read_csv(file)
+                    datasets[file.name] = df
+                except Exception as e:
+                    st.error(f"Error loading {file.name}: {str(e)}")
+
+            if len(datasets) >= 2:
+                self.perform_dataset_comparison(datasets)
+
+    def perform_dataset_comparison(self, datasets: Dict[str, pd.DataFrame]):
+        """Perform comparison between multiple datasets"""
+        st.markdown("### π Dataset Comparison Results")
+
+        # Basic comparison table
+        comparison_data = []
+
+        for name, df in datasets.items():
+            comparison_data.append({
+                'Dataset': name,
+                'Rows': len(df),
+                'Columns': len(df.columns),
+                'Numeric Columns': len(df.select_dtypes(include=[np.number]).columns),
+                'Text Columns': len(df.select_dtypes(include=['object']).columns),
+                'Missing Values': df.isnull().sum().sum(),
+                'Memory (MB)': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f}"
+            })
+
+        comparison_df = pd.DataFrame(comparison_data)
+        st.dataframe(comparison_df, use_container_width=True)
+
+        # Visual comparison
+        fig = make_subplots(
+            rows=2, cols=2,
+            subplot_titles=['Rows Comparison', 'Columns Comparison',
+                            'Missing Values', 'Memory Usage'],
+            specs=[[{"type": "bar"}, {"type": "bar"}],
+                   [{"type": "bar"}, {"type": "bar"}]]
+        )
+
+        names = list(datasets.keys())
+
+        # Rows comparison
+        fig.add_trace(
+            go.Bar(x=names, y=[len(datasets[name]) for name in names], name="Rows"),
+            row=1, col=1
+        )
+
+        # Columns comparison
+        fig.add_trace(
+            go.Bar(x=names, y=[len(datasets[name].columns) for name in names], name="Columns"),
+            row=1, col=2
+        )
+
+        # Missing values comparison
+        fig.add_trace(
+            go.Bar(x=names, y=[datasets[name].isnull().sum().sum() for name in names], name="Missing"),
+            row=2, col=1
+        )
+
+        # Memory usage comparison
+        fig.add_trace(
+            go.Bar(x=names, y=[datasets[name].memory_usage(deep=True).sum() / 1024**2 for name in names], name="Memory"),
+            row=2, col=2
+        )
+
+        fig.update_layout(height=600, showlegend=False, title_text="Dataset Comparison Dashboard")
+        st.plotly_chart(fig, use_container_width=True)
+
+    def render_data_profiling_tool(self, df: pd.DataFrame):
+        """Render comprehensive data profiling tool"""
+        st.markdown("## π¬ Data Profiling Tool")
+
+        if st.button("π Generate Complete Data Profile"):
+            with st.spinner("Generating comprehensive data profile..."):
+                profile = self.generate_data_profile(df)
+                self.display_data_profile(profile)
+
+    def generate_data_profile(self, df: pd.DataFrame) -> Dict[str, Any]:
+        """Generate comprehensive data profile"""
+        profile = {
+            'overview': {},
+            'column_profiles': {},
+            'data_quality': {},
+            'relationships': {},
+            'recommendations': []
+        }
+
+        # Overview
+        profile['overview'] = {
+            'shape': df.shape,
+            'memory_usage': df.memory_usage(deep=True).sum(),
+            'dtypes': df.dtypes.value_counts().to_dict(),
+            'missing_cells': df.isnull().sum().sum(),
+            'duplicate_rows': df.duplicated().sum()
+        }
+
+        # Column profiles
+        for col in df.columns:
+            col_profile = {
+                'dtype': str(df[col].dtype),
+                'null_count': df[col].isnull().sum(),
+                'null_percentage': df[col].isnull().sum() / len(df) * 100,
+                'unique_count': df[col].nunique(),
+                'unique_percentage': df[col].nunique() / len(df) * 100
+            }
+
+            if df[col].dtype in ['int64', 'float64']:
+                col_profile.update({
+                    'min': df[col].min(),
+                    'max': df[col].max(),
+                    'mean': df[col].mean(),
+                    'std': df[col].std(),
+                    'skewness': df[col].skew(),
+                    'kurtosis': df[col].kurtosis()
+                })
+            else:
+                col_profile.update({
+                    'most_frequent': df[col].mode().iloc[0] if len(df[col].mode()) > 0 else None,
+                    'most_frequent_count': df[col].value_counts().iloc[0] if len(df[col].value_counts()) > 0 else 0
+                })
+
+            profile['column_profiles'][col] = col_profile
+
+        return profile
+
+    def display_data_profile(self, profile: Dict[str, Any]):
+        """Display data profile results"""
+        st.markdown("### π Complete Data Profile")
+
+        # Overview metrics
+        overview = profile['overview']
+
+        col1, col2, col3, col4 = st.columns(4)
+        with col1:
+            st.metric("Rows", f"{overview['shape'][0]:,}")
+        with col2:
+            st.metric("Columns", overview['shape'][1])
+        with col3:
+            st.metric("Missing Cells", f"{overview['missing_cells']:,}")
+        with col4:
+            st.metric("Duplicates", f"{overview['duplicate_rows']:,}")
+
+        # Column details table
+        st.markdown("#### π Column Details")
+
+        col_data = []
+        for col, details in profile['column_profiles'].items():
+            col_data.append({
+                'Column': col,
+                'Type': details['dtype'],
+                'Missing %': f"{details['null_percentage']:.1f}%",
+                'Unique %': f"{details['unique_percentage']:.1f}%",
+                'Details': f"Min: {details.get('min', 'N/A')}, Max: {details.get('max', 'N/A')}" if 'min' in details else f"Most Frequent: {details.get('most_frequent', 'N/A')}"
+            })
+
+        col_df = pd.DataFrame(col_data)
+        st.dataframe(col_df, use_container_width=True)
+
+# Usage in main app
+def integrate_advanced_features():
+    """Integration function for advanced features"""
+    return """
+# Add this to your main.py file:
+
+from advanced_features import AdvancedFeatures
+
+# In your NeuralDataAnalyst class:
+def __init__(self):
+    # ... existing code ...
+    self.advanced_features = AdvancedFeatures(self.db_manager)
+
+# Add this after your existing data upload section:
+if st.session_state.uploaded_data is not None:
+    if st.button("π¬ Advanced Analytics", key="advanced_analytics"):
+        self.advanced_features.render_advanced_analytics_dashboard(st.session_state.uploaded_data)
+
+    if st.button("π Data Profiling", key="data_profiling"):
+        self.advanced_features.render_data_profiling_tool(st.session_state.uploaded_data)
+
+# Add dataset comparison in sidebar:
+with st.sidebar:
+    st.markdown("---")
+    if st.button("βοΈ Compare Datasets"):
+        self.advanced_features.render_data_comparison_tool()
+"""
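The integrate_advanced_features() helper above returns its wiring instructions as a string rather than executing them. A minimal, self-contained sketch of that wiring (not part of the commit; it assumes the DatabaseManager from database_manager.py below and a simple upload flow standing in for the real main.py) could look like:

```python
# Sketch: mount AdvancedFeatures inside a small Streamlit page.
import streamlit as st
import pandas as pd
from advanced_features import AdvancedFeatures
from database_manager import DatabaseManager

db_manager = DatabaseManager()            # file-based analysis history (next file)
features = AdvancedFeatures(db_manager)

uploaded = st.file_uploader("Upload a CSV", type=["csv"])
if uploaded is not None:
    df = pd.read_csv(uploaded)
    st.session_state.uploaded_data = df   # key used by the integration snippet above
    features.render_advanced_analytics_dashboard(df)
```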
database_manager.py
ADDED
@@ -0,0 +1,285 @@
+import json
+import os
+from datetime import datetime
+from typing import Dict, List, Any
+
+class DatabaseManager:
+    """Simple file-based database manager for storing analysis history"""
+
+    def __init__(self, db_file: str = "analysis_history.json"):
+        """Initialize the database manager
+
+        Args:
+            db_file: Path to the JSON file to store analysis history
+        """
+        self.db_file = db_file
+        self.ensure_db_file_exists()
+
+    def ensure_db_file_exists(self):
+        """Ensure the database file exists"""
+        if not os.path.exists(self.db_file):
+            with open(self.db_file, 'w') as f:
+                json.dump([], f)
+
+    def save_analysis(self, analysis_record: Dict[str, Any]) -> bool:
+        """Save an analysis record to the database
+
+        Args:
+            analysis_record: Dictionary containing analysis data
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            # Read existing data
+            existing_data = self.load_all_data()
+
+            # Add timestamp if not present
+            if 'timestamp' not in analysis_record:
+                analysis_record['timestamp'] = datetime.now().isoformat()
+
+            # Append new record
+            existing_data.append(analysis_record)
+
+            # Write back to file
+            with open(self.db_file, 'w') as f:
+                json.dump(existing_data, f, indent=2, default=str)
+
+            return True
+
+        except Exception as e:
+            print(f"Error saving analysis: {e}")
+            return False
+
+    def get_history(self, session_id: str = None, limit: int = 100) -> List[Dict[str, Any]]:
+        """Get analysis history
+
+        Args:
+            session_id: Optional session ID to filter by
+            limit: Maximum number of records to return
+
+        Returns:
+            List of analysis records
+        """
+        try:
+            data = self.load_all_data()
+
+            # Filter by session_id if provided
+            if session_id:
+                data = [record for record in data if record.get('session_id') == session_id]
+
+            # Sort by timestamp (newest first)
+            data.sort(key=lambda x: x.get('timestamp', ''), reverse=True)
+
+            # Apply limit
+            return data[:limit]
+
+        except Exception as e:
+            print(f"Error getting history: {e}")
+            return []
+
+    def clear_history(self, session_id: str = None) -> bool:
+        """Clear analysis history
+
+        Args:
+            session_id: Optional session ID to clear specific session data
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            if session_id:
+                # Clear only specific session data
+                data = self.load_all_data()
+                filtered_data = [record for record in data if record.get('session_id') != session_id]
+
+                with open(self.db_file, 'w') as f:
+                    json.dump(filtered_data, f, indent=2, default=str)
+            else:
+                # Clear all data
+                with open(self.db_file, 'w') as f:
+                    json.dump([], f)
+
+            return True
+
+        except Exception as e:
+            print(f"Error clearing history: {e}")
+            return False
+
+    def load_all_data(self) -> List[Dict[str, Any]]:
+        """Load all data from the database file
+
+        Returns:
+            List of all records
+        """
+        try:
+            with open(self.db_file, 'r') as f:
+                data = json.load(f)
+                return data if isinstance(data, list) else []
+        except (FileNotFoundError, json.JSONDecodeError):
+            return []
+
+    def get_analysis_by_type(self, analysis_type: str, session_id: str = None) -> List[Dict[str, Any]]:
+        """Get analyses by type
+
+        Args:
+            analysis_type: Type of analysis (e.g., 'EDA', 'Single Query Analysis')
+            session_id: Optional session ID to filter by
+
+        Returns:
+            List of matching analysis records
+        """
+        try:
+            data = self.load_all_data()
+
+            # Filter by type
+            filtered_data = [record for record in data if record.get('type') == analysis_type]
+
+            # Filter by session_id if provided
+            if session_id:
+                filtered_data = [record for record in filtered_data if record.get('session_id') == session_id]
+
+            # Sort by timestamp (newest first)
+            filtered_data.sort(key=lambda x: x.get('timestamp', ''), reverse=True)
+
+            return filtered_data
+
+        except Exception as e:
+            print(f"Error getting analysis by type: {e}")
+            return []
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get database statistics
+
+        Returns:
+            Dictionary with database statistics
+        """
+        try:
+            data = self.load_all_data()
+
+            stats = {
+                'total_records': len(data),
+                'unique_sessions': len(set(record.get('session_id', '') for record in data)),
+                'analysis_types': {},
+                'oldest_record': None,
+                'newest_record': None
+            }
+
+            # Count analysis types
+            for record in data:
+                analysis_type = record.get('type', 'Unknown')
+                stats['analysis_types'][analysis_type] = stats['analysis_types'].get(analysis_type, 0) + 1
+
+            # Find oldest and newest records
+            if data:
+                timestamps = [record.get('timestamp', '') for record in data if record.get('timestamp')]
+                if timestamps:
+                    timestamps.sort()
+                    stats['oldest_record'] = timestamps[0]
+                    stats['newest_record'] = timestamps[-1]
+
+            return stats
+
+        except Exception as e:
+            print(f"Error getting stats: {e}")
+            return {
+                'total_records': 0,
+                'unique_sessions': 0,
+                'analysis_types': {},
+                'oldest_record': None,
+                'newest_record': None,
+                'error': str(e)
+            }
+
+    def backup_database(self, backup_file: str = None) -> bool:
+        """Create a backup of the database
+
+        Args:
+            backup_file: Path for backup file. If None, uses timestamp-based name
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            if backup_file is None:
+                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+                backup_file = f"analysis_history_backup_{timestamp}.json"
+
+            data = self.load_all_data()
+
+            with open(backup_file, 'w') as f:
+                json.dump(data, f, indent=2, default=str)
+
+            return True
+
+        except Exception as e:
+            print(f"Error creating backup: {e}")
+            return False
+
+    def restore_from_backup(self, backup_file: str) -> bool:
+        """Restore database from backup
+
+        Args:
+            backup_file: Path to backup file
+
+        Returns:
+            bool: True if successful, False otherwise
+        """
+        try:
+            if not os.path.exists(backup_file):
+                print(f"Backup file not found: {backup_file}")
+                return False
+
+            with open(backup_file, 'r') as f:
+                data = json.load(f)
+
+            # Validate data format
+            if not isinstance(data, list):
+                print("Invalid backup file format")
+                return False
+
+            # Write to main database file
+            with open(self.db_file, 'w') as f:
+                json.dump(data, f, indent=2, default=str)
+
+            return True
+
+        except Exception as e:
+            print(f"Error restoring from backup: {e}")
+            return False
+
+    def delete_old_records(self, days_old: int = 30) -> int:
+        """Delete records older than specified days
+
+        Args:
+            days_old: Number of days to keep records
+
+        Returns:
+            int: Number of records deleted
+        """
+        try:
+            from datetime import datetime, timedelta
+
+            cutoff_date = datetime.now() - timedelta(days=days_old)
+            cutoff_str = cutoff_date.isoformat()
+
+            data = self.load_all_data()
+            original_count = len(data)
+
+            # Filter out old records
+            filtered_data = []
+            for record in data:
+                record_time = record.get('timestamp', '')
+                if record_time >= cutoff_str:
+                    filtered_data.append(record)
+
+            # Write filtered data back
+            with open(self.db_file, 'w') as f:
+                json.dump(filtered_data, f, indent=2, default=str)
+
+            deleted_count = original_count - len(filtered_data)
+            return deleted_count
+
+        except Exception as e:
+            print(f"Error deleting old records: {e}")
+            return 0
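Because DatabaseManager is a plain JSON-backed store, it can be exercised outside Streamlit as well. A minimal sketch (not part of the commit; the record fields shown are assumptions modelled on the keys the class filters on, such as 'session_id' and 'type'):

```python
# Sketch: write and read back analysis history with DatabaseManager.
from database_manager import DatabaseManager

db = DatabaseManager("analysis_history.json")

db.save_analysis({
    "session_id": "demo-session",
    "type": "EDA",
    "query": "Summarize the uploaded dataset",   # hypothetical extra field
})

recent = db.get_history(session_id="demo-session", limit=10)
print(f"{len(recent)} record(s) for demo-session")
print(db.get_stats())
```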
eda_analyzer.py
ADDED
@@ -0,0 +1,593 @@
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from typing import Dict, List, Any, Tuple
import warnings
warnings.filterwarnings('ignore')

# Import scipy with error handling
try:
    from scipy import stats
    from scipy.stats import chi2_contingency
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False

class EDAAnalyzer:
    """Comprehensive Exploratory Data Analysis with advanced visualizations"""

    def __init__(self):
        self.color_palette = [
            '#FFD700', '#FF6B6B', '#4ECDC4', '#45B7D1',
            '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8'
        ]

    def perform_complete_eda(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Perform comprehensive EDA analysis"""
        try:
            results = {
                'overview': self.generate_overview(df),
                'distributions': self.analyze_distributions(df),
                'correlations': self.analyze_correlations(df),
                'insights': self.generate_insights(df),
                'data_quality': self.assess_data_quality(df),
                'advanced_analysis': self.perform_advanced_analysis(df)
            }

            return results
        except Exception as e:
            # Return basic results if advanced analysis fails
            return {
                'overview': self.generate_overview(df),
                'distributions': {},
                'correlations': {},
                'insights': [{'title': 'Analysis Error', 'description': f'Error during analysis: {str(e)}'}],
                'data_quality': {},
                'advanced_analysis': {}
            }

    def generate_overview(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate dataset overview"""
        try:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            datetime_cols = df.select_dtypes(include=['datetime64']).columns

            overview = {
                'total_rows': len(df),
                'total_columns': len(df.columns),
                'numeric_columns': len(numeric_cols),
                'categorical_columns': len(categorical_cols),
                'datetime_columns': len(datetime_cols),
                'memory_usage': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
                'duplicate_rows': df.duplicated().sum(),
                'missing_values_total': df.isnull().sum().sum()
            }

            if len(numeric_cols) > 0:
                overview['summary_stats'] = df[numeric_cols].describe()

            return overview
        except Exception as e:
            return {
                'total_rows': len(df) if df is not None else 0,
                'total_columns': len(df.columns) if df is not None else 0,
                'numeric_columns': 0,
                'categorical_columns': 0,
                'datetime_columns': 0,
                'memory_usage': '0 MB',
                'duplicate_rows': 0,
                'missing_values_total': 0,
                'error': str(e)
            }

    def analyze_distributions(self, df: pd.DataFrame) -> Dict[str, go.Figure]:
        """Analyze data distributions with multiple chart types"""
        distributions = {}

        try:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns

            # Numeric distributions
            if len(numeric_cols) > 0:
                distributions.update(self.create_numeric_distributions(df, numeric_cols))

            # Categorical distributions
            if len(categorical_cols) > 0:
                distributions.update(self.create_categorical_distributions(df, categorical_cols))

        except Exception as e:
            distributions['error'] = self.create_error_plot(f"Distribution analysis failed: {str(e)}")

        return distributions

    def create_error_plot(self, error_message: str) -> go.Figure:
        """Create an error plot when analysis fails"""
        fig = go.Figure()
        fig.add_annotation(
            text=error_message,
            xref="paper", yref="paper",
            x=0.5, y=0.5, xanchor='center', yanchor='middle',
            showarrow=False,
            font=dict(size=16, color="red")
        )
        fig.update_layout(
            title="Analysis Error",
            showlegend=False,
            plot_bgcolor='rgba(0,0,0,0)',
            paper_bgcolor='rgba(0,0,0,0)',
            font=dict(color='white')
        )
        return fig

    def create_numeric_distributions(self, df: pd.DataFrame, numeric_cols: List[str]) -> Dict[str, go.Figure]:
        """Create numeric distribution plots"""
        plots = {}

        try:
            # Multi-histogram plot
            if len(numeric_cols) <= 6:
                rows = (len(numeric_cols) + 2) // 3
                fig = make_subplots(
                    rows=rows, cols=3,
                    subplot_titles=list(numeric_cols),
                    vertical_spacing=0.08
                )

                for i, col in enumerate(numeric_cols):
                    row = (i // 3) + 1
                    col_pos = (i % 3) + 1

                    # Filter out non-finite values
                    data = df[col].dropna()
                    if len(data) > 0:
                        fig.add_trace(
                            go.Histogram(
                                x=data,
                                name=col,
                                marker_color=self.color_palette[i % len(self.color_palette)],
                                opacity=0.7,
                                showlegend=False
                            ),
                            row=row, col=col_pos
                        )

                fig.update_layout(
                    title="📊 Numeric Distributions Overview",
                    height=300 * rows,
                    plot_bgcolor='rgba(0,0,0,0)',
                    paper_bgcolor='rgba(0,0,0,0)',
                    font=dict(color='white')
                )
                plots['numeric_histograms'] = fig

            # Box plots for outlier detection
            if len(numeric_cols) > 0:
                fig = go.Figure()
                for i, col in enumerate(numeric_cols[:8]):  # Limit to 8 columns
                    data = df[col].dropna()
                    if len(data) > 0:
                        fig.add_trace(go.Box(
                            y=data,
                            name=col,
                            marker_color=self.color_palette[i % len(self.color_palette)]
                        ))

                fig.update_layout(
                    title="📦 Box Plots - Outlier Detection",
                    height=500,
                    plot_bgcolor='rgba(0,0,0,0)',
                    paper_bgcolor='rgba(0,0,0,0)',
                    font=dict(color='white')
                )
                plots['box_plots'] = fig

            # Violin plots for distribution shapes
            if len(numeric_cols) > 0:
                fig = go.Figure()
                for i, col in enumerate(numeric_cols[:6]):
                    data = df[col].dropna()
                    if len(data) > 1:  # Need at least 2 points for violin plot
                        fig.add_trace(go.Violin(
                            y=data,
                            name=col,
                            box_visible=True,
                            meanline_visible=True,
                            fillcolor=self.color_palette[i % len(self.color_palette)],
                            opacity=0.6
                        ))

                fig.update_layout(
                    title="🎻 Violin Plots - Distribution Shapes",
                    height=500,
                    plot_bgcolor='rgba(0,0,0,0)',
                    paper_bgcolor='rgba(0,0,0,0)',
                    font=dict(color='white')
                )
                plots['violin_plots'] = fig

        except Exception as e:
            plots['numeric_error'] = self.create_error_plot(f"Numeric distribution error: {str(e)}")

        return plots

    def create_categorical_distributions(self, df: pd.DataFrame, categorical_cols: List[str]) -> Dict[str, go.Figure]:
        """Create categorical distribution plots"""
        plots = {}

        try:
            # Bar charts for categorical variables
            for i, col in enumerate(categorical_cols[:4]):  # Limit to 4 columns
                value_counts = df[col].value_counts().head(15)  # Top 15 categories

                if len(value_counts) > 0:
                    fig = go.Figure(data=[
                        go.Bar(
                            x=value_counts.index.astype(str),
                            y=value_counts.values,
                            marker_color=self.color_palette[i % len(self.color_palette)],
                            text=value_counts.values,
                            textposition='auto'
                        )
                    ])

                    fig.update_layout(
                        title=f"📊 {col} - Value Distribution",
                        xaxis_title=col,
                        yaxis_title="Count",
                        height=400,
                        plot_bgcolor='rgba(0,0,0,0)',
                        paper_bgcolor='rgba(0,0,0,0)',
                        font=dict(color='white')
                    )
                    plots[f'categorical_{col}'] = fig

            # Pie chart for first categorical variable
            if len(categorical_cols) > 0:
                col = categorical_cols[0]
                value_counts = df[col].value_counts().head(10)

                if len(value_counts) > 0:
                    fig = go.Figure(data=[go.Pie(
                        labels=value_counts.index.astype(str),
                        values=value_counts.values,
                        hole=0.3,
                        marker_colors=self.color_palette
                    )])

                    fig.update_layout(
                        title=f"🥧 {col} - Proportion Analysis",
                        height=500,
                        plot_bgcolor='rgba(0,0,0,0)',
                        paper_bgcolor='rgba(0,0,0,0)',
                        font=dict(color='white')
                    )
                    plots['pie_chart'] = fig

        except Exception as e:
            plots['categorical_error'] = self.create_error_plot(f"Categorical distribution error: {str(e)}")

        return plots

    def analyze_correlations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze correlations between variables"""
        correlations = {}

        try:
            numeric_cols = df.select_dtypes(include=[np.number]).columns

            if len(numeric_cols) > 1:
                # Correlation matrix
                corr_matrix = df[numeric_cols].corr()

                # Heatmap
                fig = go.Figure(data=go.Heatmap(
                    z=corr_matrix.values,
                    x=corr_matrix.columns,
                    y=corr_matrix.columns,
                    colorscale='RdYlBu',
                    zmid=0,
                    text=np.round(corr_matrix.values, 2),
                    texttemplate="%{text}",
                    textfont={"size": 10},
                    colorbar=dict(title="Correlation")
                ))

                fig.update_layout(
                    title="🔥 Correlation Heatmap",
                    height=max(400, len(numeric_cols) * 30),
                    plot_bgcolor='rgba(0,0,0,0)',
                    paper_bgcolor='rgba(0,0,0,0)',
                    font=dict(color='white')
                )
                correlations['heatmap'] = fig

                # Top correlations
                mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
                corr_matrix_masked = corr_matrix.mask(mask)

                # Get top positive and negative correlations
                corr_pairs = []
                for i in range(len(corr_matrix_masked.columns)):
                    for j in range(len(corr_matrix_masked.columns)):
                        if pd.notna(corr_matrix_masked.iloc[i, j]):
                            corr_pairs.append({
                                'Variable 1': corr_matrix_masked.columns[i],
                                'Variable 2': corr_matrix_masked.columns[j],
                                'Correlation': corr_matrix_masked.iloc[i, j]
                            })

                if corr_pairs:
                    corr_df = pd.DataFrame(corr_pairs)
                    corr_df = corr_df.reindex(corr_df['Correlation'].abs().sort_values(ascending=False).index)
                    correlations['top_correlations'] = corr_df.head(10)

                    # Scatter plot matrix for top correlated variables
                    if len(numeric_cols) >= 2:
                        top_corr_cols = corr_df.head(3)[['Variable 1', 'Variable 2']].values.flatten()
                        unique_cols = list(set(top_corr_cols))[:4]  # Max 4 variables

                        if len(unique_cols) >= 2:
                            try:
                                fig = px.scatter_matrix(
                                    df[unique_cols].dropna(),
                                    dimensions=unique_cols,
                                    color_discrete_sequence=self.color_palette
                                )

                                fig.update_layout(
                                    title="🎯 Scatter Plot Matrix - Top Correlated Variables",
                                    height=600,
                                    plot_bgcolor='rgba(0,0,0,0)',
                                    paper_bgcolor='rgba(0,0,0,0)',
                                    font=dict(color='white')
                                )
                                correlations['scatter_matrix'] = fig
                            except Exception:
                                pass  # Skip if scatter matrix fails

        except Exception as e:
            correlations['error'] = f"Correlation analysis failed: {str(e)}"

        return correlations

    def generate_insights(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Generate AI-powered insights about the data"""
        insights = []

        try:
            # Basic statistics insights
            insights.append({
                'title': '📊 Dataset Overview',
                'description': f"Dataset contains {len(df):,} rows and {len(df.columns)} columns. "
                               f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB. "
                               f"Missing values: {df.isnull().sum().sum():,} ({df.isnull().sum().sum() / df.size * 100:.1f}%)."
            })

            # Numeric columns insights
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 0:
                try:
                    # Find columns with high variance
                    variances = df[numeric_cols].var().sort_values(ascending=False)
                    high_var_col = variances.index[0]

                    insights.append({
                        'title': '📈 Variance Analysis',
                        'description': f"'{high_var_col}' shows the highest variance ({variances.iloc[0]:.2f}), "
                                       f"indicating significant spread in values. This column might contain outliers "
                                       f"or represent a key differentiating factor in your dataset."
                    })

                    # Skewness analysis
                    skewed_cols = []
                    for col in numeric_cols:
                        try:
                            skewness = df[col].skew()
                            if abs(skewness) > 1:
                                skewed_cols.append((col, skewness))
                        except:
                            continue

                    if skewed_cols:
                        insights.append({
                            'title': '📉 Distribution Skewness',
                            'description': f"Found {len(skewed_cols)} heavily skewed columns. "
                                           f"Most skewed: '{skewed_cols[0][0]}' (skewness: {skewed_cols[0][1]:.2f}). "
                                           f"Consider log transformation or outlier treatment for better modeling."
                        })
                except Exception:
                    pass

            # Categorical insights
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns
            if len(categorical_cols) > 0:
                try:
                    cardinalities = []
                    for col in categorical_cols:
                        unique_count = df[col].nunique()
                        cardinalities.append((col, unique_count))

                    cardinalities.sort(key=lambda x: x[1], reverse=True)

                    insights.append({
                        'title': '🏷️ Categorical Analysis',
                        'description': f"'{cardinalities[0][0]}' has the highest cardinality ({cardinalities[0][1]} unique values). "
                                       f"High cardinality columns might need encoding strategies for machine learning. "
                                       f"Consider grouping rare categories or using embedding techniques."
                    })
                except Exception:
                    pass

            # Missing data patterns
            try:
                missing_data = df.isnull().sum()
                missing_cols = missing_data[missing_data > 0].sort_values(ascending=False)

                if len(missing_cols) > 0:
                    insights.append({
                        'title': '❓ Missing Data Patterns',
                        'description': f"'{missing_cols.index[0]}' has the most missing values ({missing_cols.iloc[0]:,} - "
                                       f"{missing_cols.iloc[0] / len(df) * 100:.1f}%). "
                                       f"Analyze if missing data is random or systematic. "
                                       f"Consider imputation strategies or feature engineering."
                    })
            except Exception:
                pass

            # Correlation insights
            if len(numeric_cols) > 1:
                try:
                    corr_matrix = df[numeric_cols].corr()
                    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
                    corr_matrix_masked = corr_matrix.mask(mask)

                    max_corr = 0
                    max_pair = None
                    for i in range(len(corr_matrix_masked.columns)):
                        for j in range(len(corr_matrix_masked.columns)):
                            if pd.notna(corr_matrix_masked.iloc[i, j]):
                                if abs(corr_matrix_masked.iloc[i, j]) > abs(max_corr):
                                    max_corr = corr_matrix_masked.iloc[i, j]
                                    max_pair = (corr_matrix_masked.columns[i], corr_matrix_masked.columns[j])

                    if max_pair and abs(max_corr) > 0.5:
                        insights.append({
                            'title': '🔗 Strong Correlations',
                            'description': f"Strong correlation found between '{max_pair[0]}' and '{max_pair[1]}' "
                                           f"(r = {max_corr:.3f}). This suggests potential multicollinearity. "
                                           f"Consider feature selection or dimensionality reduction techniques."
                        })
                except Exception:
                    pass

        except Exception as e:
            insights.append({
                'title': 'Analysis Error',
                'description': f"Error generating insights: {str(e)}"
            })

        return insights

    def assess_data_quality(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Assess data quality with visualizations"""
        quality = {}

        try:
            # Missing values by column
            if df.isnull().sum().sum() > 0:
                missing_data = df.isnull().sum().sort_values(ascending=False)
                missing_data = missing_data[missing_data > 0]

                if len(missing_data) > 0:
                    fig = go.Figure([go.Bar(
                        x=missing_data.index,
                        y=missing_data.values,
                        marker_color='#FF6B6B',
                        text=missing_data.values,
                        textposition='auto'
                    )])

                    fig.update_layout(
                        title="❓ Missing Values by Column",
                        xaxis_title="Columns",
                        yaxis_title="Missing Count",
                        height=400,
                        plot_bgcolor='rgba(0,0,0,0)',
                        paper_bgcolor='rgba(0,0,0,0)',
                        font=dict(color='white')
                    )
                    quality['missing_values'] = fig

            # Data types distribution
            dtype_counts = df.dtypes.value_counts()

            if len(dtype_counts) > 0:
                fig = go.Figure(data=[go.Pie(
                    labels=[str(dtype) for dtype in dtype_counts.index],
                    values=dtype_counts.values,
                    hole=0.3,
                    marker_colors=self.color_palette
                )])

                fig.update_layout(
                    title="🔧 Data Types Distribution",
                    height=400,
                    plot_bgcolor='rgba(0,0,0,0)',
                    paper_bgcolor='rgba(0,0,0,0)',
                    font=dict(color='white')
                )
                quality['data_types'] = fig

            # Duplicate analysis
            duplicates = df.duplicated().sum()
            if duplicates > 0:
                quality['duplicates'] = {
                    'count': duplicates,
                    'percentage': duplicates / len(df) * 100
                }

        except Exception as e:
            quality['error'] = f"Data quality assessment failed: {str(e)}"

        return quality

    def perform_advanced_analysis(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Perform advanced statistical analysis"""
        advanced = {}

        try:
            numeric_cols = df.select_dtypes(include=[np.number]).columns

            # Outlier detection using IQR method
            if len(numeric_cols) > 0:
                outlier_counts = {}
                for col in numeric_cols:
                    try:
                        data = df[col].dropna()
                        if len(data) > 0:
                            Q1 = data.quantile(0.25)
                            Q3 = data.quantile(0.75)
                            IQR = Q3 - Q1
                            lower_bound = Q1 - 1.5 * IQR
                            upper_bound = Q3 + 1.5 * IQR

                            outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
                            outlier_counts[col] = len(outliers)
                    except Exception:
                        outlier_counts[col] = 0

                if outlier_counts:
                    outlier_df = pd.DataFrame(list(outlier_counts.items()),
                                              columns=['Column', 'Outlier_Count'])
                    outlier_df = outlier_df.sort_values('Outlier_Count', ascending=False)
                    advanced['outliers'] = outlier_df

            # Statistical tests
            categorical_cols = df.select_dtypes(include=['object', 'category']).columns

            if len(categorical_cols) >= 2 and SCIPY_AVAILABLE:
                try:
                    col1, col2 = categorical_cols[0], categorical_cols[1]
                    contingency_table = pd.crosstab(df[col1], df[col2])

                    if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
                        chi2, p_value, dof, expected = chi2_contingency(contingency_table)

                        advanced['chi_square_test'] = {
                            'variables': [col1, col2],
                            'chi2_statistic': chi2,
                            'p_value': p_value,
                            'interpretation': 'Dependent' if p_value < 0.05 else 'Independent'
                        }
                except Exception:
                    pass  # Skip if test fails

        except Exception as e:
            advanced['error'] = f"Advanced analysis failed: {str(e)}"

        return advanced
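
For reference, a minimal usage sketch of the EDAAnalyzer class above (not part of the commit; it assumes eda_analyzer.py is importable from the working directory and uses a small made-up DataFrame):

import pandas as pd
from eda_analyzer import EDAAnalyzer

# Hypothetical toy data, just to exercise the analyzer end to end.
df = pd.DataFrame({
    "sales_amount": [2147.23, 1823.45, 2456.78, 1934.56, 2234.67],
    "units": [12, 9, 14, 10, 13],
    "region": ["North", "South", "East", "West", "North"],
})

results = EDAAnalyzer().perform_complete_eda(df)

print(results["overview"]["total_rows"])   # 5
for insight in results["insights"]:
    print(insight["title"], "-", insight["description"])

# The Plotly figures collected under results["distributions"], results["correlations"]
# and results["data_quality"] can be rendered with fig.show() or st.plotly_chart(fig).

The returned dictionary mirrors the keys built in perform_complete_eda, so downstream code can pick out only the pieces it needs.
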
main.py
ADDED
The diff for this file is too large to render. See raw diff.
requirements.txt
ADDED
@@ -0,0 +1,12 @@
streamlit>=1.28.0
pandas>=1.5.0
numpy>=1.24.0
plotly>=5.15.0
requests>=2.31.0
faiss-cpu>=1.7.4
scipy>=1.10.0
seaborn>=0.12.0
sentence-transformers>=2.2.0
scikit-learn>=1.3.0
pathlib2>=2.3.7
python-dotenv>=1.0.0
setup.bat
ADDED
File without changes
setup.sh
ADDED
File without changes
test.py
ADDED
@@ -0,0 +1,175 @@
#!/usr/bin/env python3
"""
Test script to verify the Neural Data Analyst application works correctly
Run this to check if all dependencies are available and create sample files
"""

import sys
import os
from pathlib import Path

def check_dependencies():
    """Check if all required dependencies are installed"""
    print("🔍 Checking dependencies...")

    required_packages = [
        'streamlit',
        'pandas',
        'numpy',
        'plotly',
        'requests'
    ]

    missing_packages = []

    for package in required_packages:
        try:
            __import__(package)
            print(f"✅ {package}")
        except ImportError:
            print(f"❌ {package} - MISSING")
            missing_packages.append(package)

    # Optional packages (the importable module name can differ from the PyPI name,
    # e.g. python-dotenv is imported as "dotenv")
    optional_packages = {
        'scipy': 'scipy',
        'python-dotenv': 'dotenv'
    }

    print("\n🔍 Checking optional dependencies...")
    for package, module_name in optional_packages.items():
        try:
            __import__(module_name)
            print(f"✅ {package} (optional)")
        except ImportError:
            print(f"⚠️ {package} (optional) - not installed")

    if missing_packages:
        print(f"\n❌ Missing required packages: {', '.join(missing_packages)}")
        print("Install them with: pip install " + " ".join(missing_packages))
        return False
    else:
        print("\n✅ All required dependencies are installed!")
        return True

def create_sample_files():
    """Create sample configuration files"""
    print("\n📁 Creating sample files...")

    # Create .env file template
    env_content = """# Groq API Configuration
# Get your API key from: https://console.groq.com/keys
GROQ_API_KEY=your_groq_api_key_here

# Optional: Set other environment variables
# DEBUG=True
"""

    env_file = Path('.env.template')
    if not env_file.exists():
        with open(env_file, 'w') as f:
            f.write(env_content)
        print(f"✅ Created {env_file}")
    else:
        print(f"ℹ️ {env_file} already exists")

    # Create sample CSV data
    sample_csv = Path('sample_data.csv')
    if not sample_csv.exists():
        csv_content = """customer_id,customer_name,product,sales_amount,order_date,region,sales_rep
1,Customer_1,Widget A,2147.23,2023-01-01,North,John Smith
2,Customer_2,Widget B,1823.45,2023-01-02,South,Jane Doe
3,Customer_3,Widget C,2456.78,2023-01-03,East,Bob Johnson
4,Customer_4,Gadget X,1934.56,2023-01-04,West,Alice Brown
5,Customer_5,Widget A,2234.67,2023-01-05,North,John Smith
"""
        with open(sample_csv, 'w') as f:
            f.write(csv_content)
        print(f"✅ Created {sample_csv}")
    else:
        print(f"ℹ️ {sample_csv} already exists")

def create_required_modules():
    """Create the required module files if they don't exist"""
    print("\n📋 Checking required modules...")

    # Check if eda_analyzer.py exists
    if not Path('eda_analyzer.py').exists():
        print("❌ eda_analyzer.py not found!")
        print("   Please save the EDA Analyzer code as 'eda_analyzer.py'")
        return False
    else:
        print("✅ eda_analyzer.py found")

    # Check if database_manager.py exists
    if not Path('database_manager.py').exists():
        print("❌ database_manager.py not found!")
        print("   Please save the Database Manager code as 'database_manager.py'")
        return False
    else:
        print("✅ database_manager.py found")

    return True

def test_imports():
    """Test if the modules can be imported"""
    print("\n🧪 Testing module imports...")

    try:
        from eda_analyzer import EDAAnalyzer
        print("✅ EDAAnalyzer imported successfully")
    except Exception as e:
        print(f"❌ Failed to import EDAAnalyzer: {e}")
        return False

    try:
        from database_manager import DatabaseManager
        print("✅ DatabaseManager imported successfully")
    except Exception as e:
        print(f"❌ Failed to import DatabaseManager: {e}")
        return False

    return True

def main():
    """Main test function"""
    print("🚀 Neural Data Analyst - Setup Test")
    print("=" * 50)

    # Check Python version
    python_version = sys.version_info
    print(f"🐍 Python version: {python_version.major}.{python_version.minor}.{python_version.micro}")

    if python_version < (3, 7):
        print("❌ Python 3.7+ required!")
        return False
    else:
        print("✅ Python version OK")

    # Run all checks
    deps_ok = check_dependencies()
    if not deps_ok:
        return False

    create_sample_files()

    modules_ok = create_required_modules()
    if not modules_ok:
        return False

    imports_ok = test_imports()
    if not imports_ok:
        return False

    print("\n🎉 Setup test completed successfully!")
    print("\n📋 Next steps:")
    print("1. Copy .env.template to .env and add your Groq API key (optional)")
    print("2. Run: streamlit run app.py")
    print("3. Upload sample_data.csv or your own data file")
    print("4. Explore the analysis features!")

    return True

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
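
Beyond running it directly with python test.py, the helpers in test.py can also be reused programmatically, for example from a pre-launch or CI script. A minimal sketch under that assumption (not part of the commit; it presumes the snippet lives in the project root next to test.py):

# Hypothetical pre-launch check that reuses the helpers defined in test.py.
from test import check_dependencies, create_required_modules, test_imports, create_sample_files

if check_dependencies() and create_required_modules() and test_imports():
    create_sample_files()
    print("Environment looks ready - launch the UI with Streamlit (main.py is the entry point added in this commit).")
else:
    print("Setup incomplete - see the messages above.")
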