Dhruv Pawar committed on
Commit
1273036
·
0 Parent(s):

Initial commit: Neural Data Analyst v1.0

Files changed (10)
  1. .env.template +10 -0
  2. .gitignore +134 -0
  3. advanced_features.py +730 -0
  4. database_manager.py +285 -0
  5. eda_analyzer.py +593 -0
  6. main.py +0 -0
  7. requirements.txt +12 -0
  8. setup.bat +0 -0
  9. setup.sh +0 -0
  10. test.py +175 -0
.env.template ADDED
@@ -0,0 +1,10 @@
# Neural Data Analyst Environment Variables
# Copy this file to .env and add your actual API key

# Groq API Configuration
GROQ_API_KEY=your_groq_api_key_here

# Optional: Default model to use
DEFAULT_MODEL=llama-3.3-70b-versatile

# Get your Groq API key from: https://console.groq.com/keys
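For reference, a minimal sketch of how these variables might be consumed at startup. The python-dotenv import is an assumption for illustration (it is not listed in requirements.txt); plain os.environ works the same once the variables are exported in the shell. The variable names match .env.template, everything else here is hypothetical.

```python
# Illustrative only: read the keys defined in .env.template.
import os

from dotenv import load_dotenv  # assumed helper, not part of this commit

load_dotenv()  # picks up .env from the working directory, if present

groq_api_key = os.getenv("GROQ_API_KEY")
default_model = os.getenv("DEFAULT_MODEL", "llama-3.3-70b-versatile")

if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set - copy .env.template to .env first")
```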
.gitignore ADDED
@@ -0,0 +1,134 @@
# Neural Data Analyst - .gitignore

# Environment Variables (IMPORTANT: Never commit API keys!)
.env
.env.local
.env.development
.env.test
.env.production

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environments
venv/
env/
ENV/
env.bak/
venv.bak/
.venv/

# Streamlit
.streamlit/
streamlit_cache/

# Database files
*.db
*.sqlite
*.sqlite3
neural_analyst_db/
analysis_history.json

# Logs
*.log
logs/
neural_logs/

# Cache
.cache/
cache/
temp/
tmp/

# Data files (add your data files here)
data/
datasets/
uploads/
*.csv
*.json
*.xlsx
*.xls
sample_data.csv

# IDE and Editor files
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
Thumbs.db

# Jupyter Notebooks
.ipynb_checkpoints/
*.ipynb

# pytest
.pytest_cache/
.coverage
htmlcov/

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Documentation builds
docs/_build/

# PyInstaller
*.manifest
*.spec

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/

# Backup files
*.backup
*.bak
*.old

# Temporary files
*.tmp
*.temp

# API Keys and Secrets (double protection)
secrets.toml
.secrets.toml
api_keys.txt
config.json

# Local configuration
local_config.py
config_local.py
advanced_features.py ADDED
@@ -0,0 +1,730 @@
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
from datetime import datetime, timedelta
from typing import Dict, List, Any
import base64
from io import BytesIO

# Additional advanced features for Neural Data Analyst

class AdvancedFeatures:
    """Advanced features and utilities for the Neural Data Analyst"""

    def __init__(self, db_manager):
        self.db_manager = db_manager

    def render_advanced_analytics_dashboard(self, df: pd.DataFrame):
        """Render advanced analytics dashboard"""
        st.markdown("## 🔬 Advanced Analytics Dashboard")

        tabs = st.tabs([
            "📊 Interactive Plots",
            "🎯 Smart Recommendations",
            "📈 Trend Analysis",
            "🔍 Anomaly Detection",
            "📋 Report Generator"
        ])

        with tabs[0]:
            self.render_interactive_plots(df)

        with tabs[1]:
            self.render_smart_recommendations(df)

        with tabs[2]:
            self.render_trend_analysis(df)

        with tabs[3]:
            self.render_anomaly_detection(df)

        with tabs[4]:
            self.render_report_generator(df)

    def render_interactive_plots(self, df: pd.DataFrame):
        """Render interactive plotting interface"""
        st.markdown("### 📊 Interactive Plot Builder")

        col1, col2, col3 = st.columns(3)

        with col1:
            plot_type = st.selectbox(
                "Plot Type",
                ["Scatter", "Line", "Bar", "Histogram", "Box", "Violin", "Heatmap", "3D Scatter"]
            )

        with col2:
            x_column = st.selectbox("X-axis", df.columns)

        with col3:
            y_column = st.selectbox("Y-axis", df.columns)

        # Color and size options
        col1, col2 = st.columns(2)
        with col1:
            color_column = st.selectbox("Color by", ["None"] + list(df.columns))
        with col2:
            size_column = st.selectbox("Size by", ["None"] + list(df.select_dtypes(include=[np.number]).columns))

        # Generate plot based on selections
        if st.button("🎨 Generate Plot"):
            fig = self.create_dynamic_plot(df, plot_type, x_column, y_column, color_column, size_column)
            if fig:
                st.plotly_chart(fig, use_container_width=True)

        # Plot gallery
        with st.expander("🖼️ Quick Plot Gallery"):
            self.render_plot_gallery(df)

    def create_dynamic_plot(self, df: pd.DataFrame, plot_type: str, x_col: str, y_col: str,
                            color_col: str = None, size_col: str = None):
        """Create dynamic plot based on user selections"""
        try:
            kwargs = {
                'data_frame': df,
                'x': x_col,
                'title': f'{plot_type} Plot: {x_col} vs {y_col}'
            }

            if y_col and y_col != x_col:
                kwargs['y'] = y_col

            if color_col and color_col != "None":
                kwargs['color'] = color_col

            if size_col and size_col != "None" and plot_type in ["Scatter", "3D Scatter"]:
                kwargs['size'] = size_col

            if plot_type == "Scatter":
                fig = px.scatter(**kwargs)
            elif plot_type == "Line":
                fig = px.line(**kwargs)
            elif plot_type == "Bar":
                fig = px.bar(**kwargs)
            elif plot_type == "Histogram":
                fig = px.histogram(df, x=x_col, title=f'Histogram: {x_col}')
            elif plot_type == "Box":
                fig = px.box(**kwargs)
            elif plot_type == "Violin":
                fig = px.violin(**kwargs)
            elif plot_type == "3D Scatter":
                z_col = st.selectbox("Z-axis", df.select_dtypes(include=[np.number]).columns)
                kwargs['z'] = z_col
                fig = px.scatter_3d(**kwargs)
            elif plot_type == "Heatmap":
                numeric_df = df.select_dtypes(include=[np.number])
                corr_matrix = numeric_df.corr()
                fig = px.imshow(corr_matrix, text_auto=True, title="Correlation Heatmap")
            else:
                return None

            fig.update_layout(
                plot_bgcolor='rgba(0,0,0,0)',
                paper_bgcolor='rgba(0,0,0,0)',
                font=dict(color='white')
            )

            return fig

        except Exception as e:
            st.error(f"Error creating plot: {str(e)}")
            return None

    def render_plot_gallery(self, df: pd.DataFrame):
        """Render quick plot gallery"""
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) >= 2:
            col1, col2 = st.columns(2)

            with col1:
                # Quick correlation plot
                fig = px.scatter(df, x=numeric_cols[0], y=numeric_cols[1],
                                 title="Quick Correlation View")
                fig.update_layout(height=300)
                st.plotly_chart(fig, use_container_width=True)

            with col2:
                # Quick distribution plot
                fig = px.histogram(df, x=numeric_cols[0], title="Quick Distribution")
                fig.update_layout(height=300)
                st.plotly_chart(fig, use_container_width=True)

    def render_smart_recommendations(self, df: pd.DataFrame):
        """Render smart analysis recommendations"""
        st.markdown("### 🎯 Smart Analysis Recommendations")

        recommendations = self.generate_analysis_recommendations(df)

        for i, rec in enumerate(recommendations):
            with st.expander(f"💡 {rec['title']}", expanded=i == 0):
                st.markdown(f"**Recommendation:** {rec['description']}")
                st.markdown(f"**Rationale:** {rec['rationale']}")

                if st.button(f"Apply Recommendation", key=f"apply_rec_{i}"):
                    self.apply_recommendation(df, rec)

    def generate_analysis_recommendations(self, df: pd.DataFrame) -> List[Dict[str, str]]:
        """Generate smart analysis recommendations"""
        recommendations = []

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object', 'category']).columns

        # Missing data recommendation
        missing_data = df.isnull().sum()
        high_missing = missing_data[missing_data > len(df) * 0.1]

        if len(high_missing) > 0:
            recommendations.append({
                'title': 'Missing Data Analysis',
                'description': f'Analyze missing data patterns in {len(high_missing)} columns with >10% missing values',
                'rationale': 'Understanding missing data patterns can reveal data collection issues or systematic biases',
                'action': 'missing_analysis'
            })

        # Correlation analysis recommendation
        if len(numeric_cols) > 2:
            recommendations.append({
                'title': 'Correlation Deep Dive',
                'description': 'Perform comprehensive correlation analysis with feature selection recommendations',
                'rationale': 'Identifying highly correlated features can improve model performance and interpretability',
                'action': 'correlation_analysis'
            })

        # Outlier detection recommendation
        if len(numeric_cols) > 0:
            recommendations.append({
                'title': 'Outlier Detection & Treatment',
                'description': 'Identify and analyze outliers using multiple statistical methods',
                'rationale': 'Outliers can significantly impact analysis results and model performance',
                'action': 'outlier_analysis'
            })

        # Segmentation recommendation
        if len(categorical_cols) > 0 and len(numeric_cols) > 0:
            recommendations.append({
                'title': 'Customer/Data Segmentation',
                'description': 'Perform clustering analysis to identify natural data segments',
                'rationale': 'Segmentation can reveal hidden patterns and improve targeted strategies',
                'action': 'segmentation_analysis'
            })

        # Time series recommendation
        date_cols = df.select_dtypes(include=['datetime64']).columns
        if len(date_cols) > 0:
            recommendations.append({
                'title': 'Time Series Analysis',
                'description': 'Analyze temporal patterns and trends in your data',
                'rationale': 'Time-based analysis can reveal seasonality, trends, and forecasting opportunities',
                'action': 'time_series_analysis'
            })

        return recommendations

    def apply_recommendation(self, df: pd.DataFrame, recommendation: Dict[str, str]):
        """Apply a smart recommendation"""
        action = recommendation.get('action')

        if action == 'missing_analysis':
            self.perform_missing_analysis(df)
        elif action == 'correlation_analysis':
            self.perform_correlation_analysis(df)
        elif action == 'outlier_analysis':
            self.perform_outlier_analysis(df)
        elif action == 'segmentation_analysis':
            self.perform_segmentation_analysis(df)
        elif action == 'time_series_analysis':
            self.perform_time_series_analysis(df)

    def perform_missing_analysis(self, df: pd.DataFrame):
        """Perform detailed missing data analysis"""
        st.markdown("#### 🔍 Missing Data Analysis Results")

        missing_data = df.isnull().sum()
        missing_percent = (missing_data / len(df)) * 100

        missing_df = pd.DataFrame({
            'Column': missing_data.index,
            'Missing_Count': missing_data.values,
            'Missing_Percentage': missing_percent.values
        })

        missing_df = missing_df[missing_df['Missing_Count'] > 0].sort_values('Missing_Percentage', ascending=False)

        if len(missing_df) > 0:
            fig = px.bar(missing_df, x='Column', y='Missing_Percentage',
                         title='Missing Data by Column (%)')
            fig.update_layout(height=400)
            st.plotly_chart(fig, use_container_width=True)

            st.dataframe(missing_df, use_container_width=True)
        else:
            st.success("✅ No missing data found in the dataset!")

    def perform_correlation_analysis(self, df: pd.DataFrame):
        """Perform detailed correlation analysis"""
        st.markdown("#### 🔗 Advanced Correlation Analysis")

        numeric_df = df.select_dtypes(include=[np.number])

        if len(numeric_df.columns) > 1:
            corr_matrix = numeric_df.corr()

            # Hierarchical clustering of correlations
            from scipy.cluster.hierarchy import linkage, dendrogram
            from scipy.spatial.distance import squareform

            distance_matrix = 1 - np.abs(corr_matrix)
            condensed_distances = squareform(distance_matrix, checks=False)
            linkage_matrix = linkage(condensed_distances, method='average')

            fig = go.Figure()
            dendro = dendrogram(linkage_matrix, labels=corr_matrix.columns, no_plot=True)

            # Create dendrogram plot
            for i in range(len(dendro['icoord'])):
                x = dendro['icoord'][i]
                y = dendro['dcoord'][i]
                fig.add_trace(go.Scatter(x=x, y=y, mode='lines',
                                         line=dict(color='gold', width=2),
                                         showlegend=False))

            fig.update_layout(
                title="Feature Clustering Dendrogram",
                xaxis_title="Features",
                yaxis_title="Distance",
                height=400
            )

            st.plotly_chart(fig, use_container_width=True)

    def render_trend_analysis(self, df: pd.DataFrame):
        """Render trend analysis interface"""
        st.markdown("### 📈 Trend Analysis")

        date_cols = df.select_dtypes(include=['datetime64']).columns
        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(date_cols) == 0:
            st.warning("No datetime columns found. Try converting date columns to datetime format.")

            # Offer to convert columns
            potential_date_cols = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()]
            if potential_date_cols:
                date_col = st.selectbox("Select date column to convert:", potential_date_cols)
                if st.button("Convert to DateTime"):
                    try:
                        df[date_col] = pd.to_datetime(df[date_col])
                        st.success(f"Converted {date_col} to datetime!")
                        st.experimental_rerun()
                    except Exception as e:
                        st.error(f"Conversion failed: {str(e)}")
            return

        col1, col2 = st.columns(2)
        with col1:
            date_col = st.selectbox("Date Column", date_cols)
        with col2:
            value_col = st.selectbox("Value Column", numeric_cols)

        if st.button("🔍 Analyze Trends"):
            self.perform_trend_analysis(df, date_col, value_col)

    def perform_trend_analysis(self, df: pd.DataFrame, date_col: str, value_col: str):
        """Perform trend analysis"""
        st.markdown("#### 📊 Trend Analysis Results")

        # Time series plot
        fig = px.line(df.sort_values(date_col), x=date_col, y=value_col,
                      title=f'{value_col} Over Time')
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

        # Rolling statistics
        df_sorted = df.sort_values(date_col).copy()
        df_sorted['7_day_avg'] = df_sorted[value_col].rolling(window=7, min_periods=1).mean()
        df_sorted['30_day_avg'] = df_sorted[value_col].rolling(window=30, min_periods=1).mean()

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted[value_col],
                                 name='Original', mode='lines'))
        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted['7_day_avg'],
                                 name='7-Day Average', mode='lines'))
        fig.add_trace(go.Scatter(x=df_sorted[date_col], y=df_sorted['30_day_avg'],
                                 name='30-Day Average', mode='lines'))

        fig.update_layout(title="Trend with Moving Averages", height=400)
        st.plotly_chart(fig, use_container_width=True)

    def render_anomaly_detection(self, df: pd.DataFrame):
        """Render anomaly detection interface"""
        st.markdown("### 🔍 Anomaly Detection")

        numeric_cols = df.select_dtypes(include=[np.number]).columns

        if len(numeric_cols) == 0:
            st.warning("No numeric columns found for anomaly detection.")
            return

        col1, col2 = st.columns(2)
        with col1:
            target_col = st.selectbox("Target Column", numeric_cols)
        with col2:
            method = st.selectbox("Detection Method",
                                  ["IQR", "Z-Score", "Isolation Forest", "Local Outlier Factor"])

        if st.button("🎯 Detect Anomalies"):
            self.perform_anomaly_detection(df, target_col, method)

    def perform_anomaly_detection(self, df: pd.DataFrame, target_col: str, method: str):
        """Perform anomaly detection"""
        st.markdown("#### 🎯 Anomaly Detection Results")

        data = df[target_col].dropna()
        anomalies = []

        if method == "IQR":
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            anomalies = df[(df[target_col] < lower_bound) | (df[target_col] > upper_bound)]

        elif method == "Z-Score":
            z_scores = np.abs((data - data.mean()) / data.std())
            anomalies = df[z_scores > 3]

        elif method == "Isolation Forest":
            from sklearn.ensemble import IsolationForest
            iso_forest = IsolationForest(contamination=0.1, random_state=42)
            outlier_labels = iso_forest.fit_predict(data.values.reshape(-1, 1))
            anomalies = df[outlier_labels == -1]

        elif method == "Local Outlier Factor":
            from sklearn.neighbors import LocalOutlierFactor
            lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
            outlier_labels = lof.fit_predict(data.values.reshape(-1, 1))
            anomalies = df[outlier_labels == -1]

        # Visualization
        fig = go.Figure()

        # Normal data points
        normal_data = df[~df.index.isin(anomalies.index)]
        fig.add_trace(go.Scatter(
            x=normal_data.index,
            y=normal_data[target_col],
            mode='markers',
            name='Normal',
            marker=dict(color='blue', size=6)
        ))

        # Anomalies
        fig.add_trace(go.Scatter(
            x=anomalies.index,
            y=anomalies[target_col],
            mode='markers',
            name='Anomalies',
            marker=dict(color='red', size=10, symbol='x')
        ))

        fig.update_layout(
            title=f'Anomaly Detection: {target_col} ({method})',
            xaxis_title='Index',
            yaxis_title=target_col,
            height=500
        )

        st.plotly_chart(fig, use_container_width=True)

        # Summary
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Total Data Points", len(df))
        with col2:
            st.metric("Anomalies Found", len(anomalies))
        with col3:
            st.metric("Anomaly Rate", f"{len(anomalies)/len(df)*100:.2f}%")

        if len(anomalies) > 0:
            with st.expander("🔍 Anomaly Details"):
                st.dataframe(anomalies[[target_col]], use_container_width=True)

    def render_report_generator(self, df: pd.DataFrame):
        """Render automated report generator"""
        st.markdown("### 📋 Automated Report Generator")

        report_type = st.selectbox(
            "Report Type",
            ["Executive Summary", "Technical Analysis", "Data Quality Report", "Custom Report"]
        )

        col1, col2 = st.columns(2)
        with col1:
            include_charts = st.checkbox("Include Charts", value=True)
        with col2:
            include_recommendations = st.checkbox("Include Recommendations", value=True)

        if st.button("📄 Generate Report"):
            report_content = self.generate_report(df, report_type, include_charts, include_recommendations)

            # Display report
            st.markdown("#### 📊 Generated Report")
            st.markdown(report_content)

            # Download option
            self.create_download_link(report_content, f"neural_analyst_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.md")

    def generate_report(self, df: pd.DataFrame, report_type: str, include_charts: bool, include_recommendations: bool) -> str:
        """Generate automated report"""
        report = f"""
# Neural Data Analyst Report
**Generated on:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Report Type:** {report_type}

## Dataset Overview
- **Total Rows:** {len(df):,}
- **Total Columns:** {len(df.columns)}
- **Memory Usage:** {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB
- **Missing Values:** {df.isnull().sum().sum():,} ({df.isnull().sum().sum() / df.size * 100:.1f}%)

## Column Information
"""

        # Column details
        for col in df.columns:
            dtype = str(df[col].dtype)
            null_count = df[col].isnull().sum()
            unique_count = df[col].nunique()

            report += f"- **{col}** ({dtype}): {null_count} missing, {unique_count} unique values\n"

        # Numeric summary
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            report += "\n## Numeric Summary\n"
            summary_stats = df[numeric_cols].describe()
            report += summary_stats.to_markdown()

        # Key insights
        if include_recommendations:
            report += "\n## Key Insights & Recommendations\n"
            recommendations = self.generate_analysis_recommendations(df)
            for i, rec in enumerate(recommendations[:5], 1):
                report += f"{i}. **{rec['title']}:** {rec['description']}\n"

        return report

    def create_download_link(self, content: str, filename: str):
        """Create download link for report"""
        b64 = base64.b64encode(content.encode()).decode()
        href = f'<a href="data:text/markdown;base64,{b64}" download="{filename}">📥 Download Report</a>'
        st.markdown(href, unsafe_allow_html=True)

    def render_data_comparison_tool(self):
        """Render data comparison tool for multiple datasets"""
        st.markdown("## ⚖️ Data Comparison Tool")

        st.markdown("Upload multiple datasets to compare their characteristics:")

        uploaded_files = st.file_uploader(
            "Choose CSV files for comparison",
            type=['csv'],
            accept_multiple_files=True
        )

        if len(uploaded_files) >= 2:
            datasets = {}

            for file in uploaded_files:
                try:
                    df = pd.read_csv(file)
                    datasets[file.name] = df
                except Exception as e:
                    st.error(f"Error loading {file.name}: {str(e)}")

            if len(datasets) >= 2:
                self.perform_dataset_comparison(datasets)

    def perform_dataset_comparison(self, datasets: Dict[str, pd.DataFrame]):
        """Perform comparison between multiple datasets"""
        st.markdown("### 📊 Dataset Comparison Results")

        # Basic comparison table
        comparison_data = []

        for name, df in datasets.items():
            comparison_data.append({
                'Dataset': name,
                'Rows': len(df),
                'Columns': len(df.columns),
                'Numeric Columns': len(df.select_dtypes(include=[np.number]).columns),
                'Text Columns': len(df.select_dtypes(include=['object']).columns),
                'Missing Values': df.isnull().sum().sum(),
                'Memory (MB)': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f}"
            })

        comparison_df = pd.DataFrame(comparison_data)
        st.dataframe(comparison_df, use_container_width=True)

        # Visual comparison
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=['Rows Comparison', 'Columns Comparison',
                            'Missing Values', 'Memory Usage'],
            specs=[[{"type": "bar"}, {"type": "bar"}],
                   [{"type": "bar"}, {"type": "bar"}]]
        )

        names = list(datasets.keys())

        # Rows comparison
        fig.add_trace(
            go.Bar(x=names, y=[len(datasets[name]) for name in names], name="Rows"),
            row=1, col=1
        )

        # Columns comparison
        fig.add_trace(
            go.Bar(x=names, y=[len(datasets[name].columns) for name in names], name="Columns"),
            row=1, col=2
        )

        # Missing values comparison
        fig.add_trace(
            go.Bar(x=names, y=[datasets[name].isnull().sum().sum() for name in names], name="Missing"),
            row=2, col=1
        )

        # Memory usage comparison
        fig.add_trace(
            go.Bar(x=names, y=[datasets[name].memory_usage(deep=True).sum() / 1024**2 for name in names], name="Memory"),
            row=2, col=2
        )

        fig.update_layout(height=600, showlegend=False, title_text="Dataset Comparison Dashboard")
        st.plotly_chart(fig, use_container_width=True)

    def render_data_profiling_tool(self, df: pd.DataFrame):
        """Render comprehensive data profiling tool"""
        st.markdown("## 🔬 Data Profiling Tool")

        if st.button("🚀 Generate Complete Data Profile"):
            with st.spinner("Generating comprehensive data profile..."):
                profile = self.generate_data_profile(df)
                self.display_data_profile(profile)

    def generate_data_profile(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate comprehensive data profile"""
        profile = {
            'overview': {},
            'column_profiles': {},
            'data_quality': {},
            'relationships': {},
            'recommendations': []
        }

        # Overview
        profile['overview'] = {
            'shape': df.shape,
            'memory_usage': df.memory_usage(deep=True).sum(),
            'dtypes': df.dtypes.value_counts().to_dict(),
            'missing_cells': df.isnull().sum().sum(),
            'duplicate_rows': df.duplicated().sum()
        }

        # Column profiles
        for col in df.columns:
            col_profile = {
                'dtype': str(df[col].dtype),
                'null_count': df[col].isnull().sum(),
                'null_percentage': df[col].isnull().sum() / len(df) * 100,
                'unique_count': df[col].nunique(),
                'unique_percentage': df[col].nunique() / len(df) * 100
            }

            if df[col].dtype in ['int64', 'float64']:
                col_profile.update({
                    'min': df[col].min(),
                    'max': df[col].max(),
                    'mean': df[col].mean(),
                    'std': df[col].std(),
                    'skewness': df[col].skew(),
                    'kurtosis': df[col].kurtosis()
                })
            else:
                col_profile.update({
                    'most_frequent': df[col].mode().iloc[0] if len(df[col].mode()) > 0 else None,
                    'most_frequent_count': df[col].value_counts().iloc[0] if len(df[col].value_counts()) > 0 else 0
                })

            profile['column_profiles'][col] = col_profile

        return profile

    def display_data_profile(self, profile: Dict[str, Any]):
        """Display data profile results"""
        st.markdown("### 📊 Complete Data Profile")

        # Overview metrics
        overview = profile['overview']

        col1, col2, col3, col4 = st.columns(4)
        with col1:
            st.metric("Rows", f"{overview['shape'][0]:,}")
        with col2:
            st.metric("Columns", overview['shape'][1])
        with col3:
            st.metric("Missing Cells", f"{overview['missing_cells']:,}")
        with col4:
            st.metric("Duplicates", f"{overview['duplicate_rows']:,}")

        # Column details table
        st.markdown("#### 📋 Column Details")

        col_data = []
        for col, details in profile['column_profiles'].items():
            col_data.append({
                'Column': col,
                'Type': details['dtype'],
                'Missing %': f"{details['null_percentage']:.1f}%",
                'Unique %': f"{details['unique_percentage']:.1f}%",
                'Details': f"Min: {details.get('min', 'N/A')}, Max: {details.get('max', 'N/A')}" if 'min' in details else f"Most Frequent: {details.get('most_frequent', 'N/A')}"
            })

        col_df = pd.DataFrame(col_data)
        st.dataframe(col_df, use_container_width=True)

# Usage in main app
def integrate_advanced_features():
    """Integration function for advanced features"""
    return """
    # Add this to your main.py file:

    from advanced_features import AdvancedFeatures

    # In your NeuralDataAnalyst class:
    def __init__(self):
        # ... existing code ...
        self.advanced_features = AdvancedFeatures(self.db_manager)

    # Add this after your existing data upload section:
    if st.session_state.uploaded_data is not None:
        if st.button("🔬 Advanced Analytics", key="advanced_analytics"):
            self.advanced_features.render_advanced_analytics_dashboard(st.session_state.uploaded_data)

        if st.button("🔍 Data Profiling", key="data_profiling"):
            self.advanced_features.render_data_profiling_tool(st.session_state.uploaded_data)

    # Add dataset comparison in sidebar:
    with st.sidebar:
        st.markdown("---")
        if st.button("⚖️ Compare Datasets"):
            self.advanced_features.render_data_comparison_tool()
    """
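The integration notes above assume the NeuralDataAnalyst class in main.py (not rendered on this page). As a rough standalone sketch, the same features can also be driven from a bare Streamlit script; the file name app.py and the upload flow are illustrative, while the class names and method signatures come from the files in this commit.

```python
# app.py - hypothetical standalone page wiring up the committed modules
import pandas as pd
import streamlit as st

from advanced_features import AdvancedFeatures
from database_manager import DatabaseManager

db = DatabaseManager()            # defaults to analysis_history.json
features = AdvancedFeatures(db)

uploaded = st.file_uploader("Upload a CSV", type=["csv"])
if uploaded is not None:
    df = pd.read_csv(uploaded)
    # Tabbed dashboard defined in advanced_features.py
    features.render_advanced_analytics_dashboard(df)
    # Optional extras from the same class
    features.render_data_profiling_tool(df)
```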
database_manager.py ADDED
@@ -0,0 +1,285 @@
import json
import os
from datetime import datetime
from typing import Dict, List, Any

class DatabaseManager:
    """Simple file-based database manager for storing analysis history"""

    def __init__(self, db_file: str = "analysis_history.json"):
        """Initialize the database manager

        Args:
            db_file: Path to the JSON file to store analysis history
        """
        self.db_file = db_file
        self.ensure_db_file_exists()

    def ensure_db_file_exists(self):
        """Ensure the database file exists"""
        if not os.path.exists(self.db_file):
            with open(self.db_file, 'w') as f:
                json.dump([], f)

    def save_analysis(self, analysis_record: Dict[str, Any]) -> bool:
        """Save an analysis record to the database

        Args:
            analysis_record: Dictionary containing analysis data

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Read existing data
            existing_data = self.load_all_data()

            # Add timestamp if not present
            if 'timestamp' not in analysis_record:
                analysis_record['timestamp'] = datetime.now().isoformat()

            # Append new record
            existing_data.append(analysis_record)

            # Write back to file
            with open(self.db_file, 'w') as f:
                json.dump(existing_data, f, indent=2, default=str)

            return True

        except Exception as e:
            print(f"Error saving analysis: {e}")
            return False

    def get_history(self, session_id: str = None, limit: int = 100) -> List[Dict[str, Any]]:
        """Get analysis history

        Args:
            session_id: Optional session ID to filter by
            limit: Maximum number of records to return

        Returns:
            List of analysis records
        """
        try:
            data = self.load_all_data()

            # Filter by session_id if provided
            if session_id:
                data = [record for record in data if record.get('session_id') == session_id]

            # Sort by timestamp (newest first)
            data.sort(key=lambda x: x.get('timestamp', ''), reverse=True)

            # Apply limit
            return data[:limit]

        except Exception as e:
            print(f"Error getting history: {e}")
            return []

    def clear_history(self, session_id: str = None) -> bool:
        """Clear analysis history

        Args:
            session_id: Optional session ID to clear specific session data

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if session_id:
                # Clear only specific session data
                data = self.load_all_data()
                filtered_data = [record for record in data if record.get('session_id') != session_id]

                with open(self.db_file, 'w') as f:
                    json.dump(filtered_data, f, indent=2, default=str)
            else:
                # Clear all data
                with open(self.db_file, 'w') as f:
                    json.dump([], f)

            return True

        except Exception as e:
            print(f"Error clearing history: {e}")
            return False

    def load_all_data(self) -> List[Dict[str, Any]]:
        """Load all data from the database file

        Returns:
            List of all records
        """
        try:
            with open(self.db_file, 'r') as f:
                data = json.load(f)
            return data if isinstance(data, list) else []
        except (FileNotFoundError, json.JSONDecodeError):
            return []

    def get_analysis_by_type(self, analysis_type: str, session_id: str = None) -> List[Dict[str, Any]]:
        """Get analyses by type

        Args:
            analysis_type: Type of analysis (e.g., 'EDA', 'Single Query Analysis')
            session_id: Optional session ID to filter by

        Returns:
            List of matching analysis records
        """
        try:
            data = self.load_all_data()

            # Filter by type
            filtered_data = [record for record in data if record.get('type') == analysis_type]

            # Filter by session_id if provided
            if session_id:
                filtered_data = [record for record in filtered_data if record.get('session_id') == session_id]

            # Sort by timestamp (newest first)
            filtered_data.sort(key=lambda x: x.get('timestamp', ''), reverse=True)

            return filtered_data

        except Exception as e:
            print(f"Error getting analysis by type: {e}")
            return []

    def get_stats(self) -> Dict[str, Any]:
        """Get database statistics

        Returns:
            Dictionary with database statistics
        """
        try:
            data = self.load_all_data()

            stats = {
                'total_records': len(data),
                'unique_sessions': len(set(record.get('session_id', '') for record in data)),
                'analysis_types': {},
                'oldest_record': None,
                'newest_record': None
            }

            # Count analysis types
            for record in data:
                analysis_type = record.get('type', 'Unknown')
                stats['analysis_types'][analysis_type] = stats['analysis_types'].get(analysis_type, 0) + 1

            # Find oldest and newest records
            if data:
                timestamps = [record.get('timestamp', '') for record in data if record.get('timestamp')]
                if timestamps:
                    timestamps.sort()
                    stats['oldest_record'] = timestamps[0]
                    stats['newest_record'] = timestamps[-1]

            return stats

        except Exception as e:
            print(f"Error getting stats: {e}")
            return {
                'total_records': 0,
                'unique_sessions': 0,
                'analysis_types': {},
                'oldest_record': None,
                'newest_record': None,
                'error': str(e)
            }

    def backup_database(self, backup_file: str = None) -> bool:
        """Create a backup of the database

        Args:
            backup_file: Path for backup file. If None, uses timestamp-based name

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if backup_file is None:
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                backup_file = f"analysis_history_backup_{timestamp}.json"

            data = self.load_all_data()

            with open(backup_file, 'w') as f:
                json.dump(data, f, indent=2, default=str)

            return True

        except Exception as e:
            print(f"Error creating backup: {e}")
            return False

    def restore_from_backup(self, backup_file: str) -> bool:
        """Restore database from backup

        Args:
            backup_file: Path to backup file

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            if not os.path.exists(backup_file):
                print(f"Backup file not found: {backup_file}")
                return False

            with open(backup_file, 'r') as f:
                data = json.load(f)

            # Validate data format
            if not isinstance(data, list):
                print("Invalid backup file format")
                return False

            # Write to main database file
            with open(self.db_file, 'w') as f:
                json.dump(data, f, indent=2, default=str)

            return True

        except Exception as e:
            print(f"Error restoring from backup: {e}")
            return False

    def delete_old_records(self, days_old: int = 30) -> int:
        """Delete records older than specified days

        Args:
            days_old: Number of days to keep records

        Returns:
            int: Number of records deleted
        """
        try:
            from datetime import datetime, timedelta

            cutoff_date = datetime.now() - timedelta(days=days_old)
            cutoff_str = cutoff_date.isoformat()

            data = self.load_all_data()
            original_count = len(data)

            # Filter out old records
            filtered_data = []
            for record in data:
                record_time = record.get('timestamp', '')
                if record_time >= cutoff_str:
                    filtered_data.append(record)

            # Write filtered data back
            with open(self.db_file, 'w') as f:
                json.dump(filtered_data, f, indent=2, default=str)

            deleted_count = original_count - len(filtered_data)
            return deleted_count

        except Exception as e:
            print(f"Error deleting old records: {e}")
            return 0
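A short usage sketch of the manager outside Streamlit. The record fields below are only examples, since save_analysis accepts any JSON-serialisable dictionary; note that get_history and get_analysis_by_type filter on the 'session_id' and 'type' keys.

```python
from database_manager import DatabaseManager

db = DatabaseManager("analysis_history.json")

db.save_analysis({
    "session_id": "demo-session",   # illustrative values
    "type": "EDA",
    "summary": "First pass over the uploaded dataset",
})

recent = db.get_history(session_id="demo-session", limit=10)
print(len(recent), "records for this session")
print(db.get_stats())      # totals, per-type counts, oldest/newest timestamps

db.backup_database()       # writes analysis_history_backup_<timestamp>.json
```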
eda_analyzer.py ADDED
@@ -0,0 +1,593 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import plotly.express as px
4
+ import plotly.graph_objects as go
5
+ from plotly.subplots import make_subplots
6
+ import plotly.figure_factory as ff
7
+ from typing import Dict, List, Any, Tuple
8
+ import warnings
9
+ warnings.filterwarnings('ignore')
10
+
11
+ # Import scipy with error handling
12
+ try:
13
+ from scipy import stats
14
+ from scipy.stats import chi2_contingency
15
+ SCIPY_AVAILABLE = True
16
+ except ImportError:
17
+ SCIPY_AVAILABLE = False
18
+
19
+ class EDAAnalyzer:
20
+ """Comprehensive Exploratory Data Analysis with advanced visualizations"""
21
+
22
+ def __init__(self):
23
+ self.color_palette = [
24
+ '#FFD700', '#FF6B6B', '#4ECDC4', '#45B7D1',
25
+ '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8'
26
+ ]
27
+
28
+ def perform_complete_eda(self, df: pd.DataFrame) -> Dict[str, Any]:
29
+ """Perform comprehensive EDA analysis"""
30
+ try:
31
+ results = {
32
+ 'overview': self.generate_overview(df),
33
+ 'distributions': self.analyze_distributions(df),
34
+ 'correlations': self.analyze_correlations(df),
35
+ 'insights': self.generate_insights(df),
36
+ 'data_quality': self.assess_data_quality(df),
37
+ 'advanced_analysis': self.perform_advanced_analysis(df)
38
+ }
39
+
40
+ return results
41
+ except Exception as e:
42
+ # Return basic results if advanced analysis fails
43
+ return {
44
+ 'overview': self.generate_overview(df),
45
+ 'distributions': {},
46
+ 'correlations': {},
47
+ 'insights': [{'title': 'Analysis Error', 'description': f'Error during analysis: {str(e)}'}],
48
+ 'data_quality': {},
49
+ 'advanced_analysis': {}
50
+ }
51
+
52
+ def generate_overview(self, df: pd.DataFrame) -> Dict[str, Any]:
53
+ """Generate dataset overview"""
54
+ try:
55
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
56
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
57
+ datetime_cols = df.select_dtypes(include=['datetime64']).columns
58
+
59
+ overview = {
60
+ 'total_rows': len(df),
61
+ 'total_columns': len(df.columns),
62
+ 'numeric_columns': len(numeric_cols),
63
+ 'categorical_columns': len(categorical_cols),
64
+ 'datetime_columns': len(datetime_cols),
65
+ 'memory_usage': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
66
+ 'duplicate_rows': df.duplicated().sum(),
67
+ 'missing_values_total': df.isnull().sum().sum()
68
+ }
69
+
70
+ if len(numeric_cols) > 0:
71
+ overview['summary_stats'] = df[numeric_cols].describe()
72
+
73
+ return overview
74
+ except Exception as e:
75
+ return {
76
+ 'total_rows': len(df) if df is not None else 0,
77
+ 'total_columns': len(df.columns) if df is not None else 0,
78
+ 'numeric_columns': 0,
79
+ 'categorical_columns': 0,
80
+ 'datetime_columns': 0,
81
+ 'memory_usage': '0 MB',
82
+ 'duplicate_rows': 0,
83
+ 'missing_values_total': 0,
84
+ 'error': str(e)
85
+ }
86
+
87
+ def analyze_distributions(self, df: pd.DataFrame) -> Dict[str, go.Figure]:
88
+ """Analyze data distributions with multiple chart types"""
89
+ distributions = {}
90
+
91
+ try:
92
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
93
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
94
+
95
+ # Numeric distributions
96
+ if len(numeric_cols) > 0:
97
+ distributions.update(self.create_numeric_distributions(df, numeric_cols))
98
+
99
+ # Categorical distributions
100
+ if len(categorical_cols) > 0:
101
+ distributions.update(self.create_categorical_distributions(df, categorical_cols))
102
+
103
+ except Exception as e:
104
+ distributions['error'] = self.create_error_plot(f"Distribution analysis failed: {str(e)}")
105
+
106
+ return distributions
107
+
108
+ def create_error_plot(self, error_message: str) -> go.Figure:
109
+ """Create an error plot when analysis fails"""
110
+ fig = go.Figure()
111
+ fig.add_annotation(
112
+ text=error_message,
113
+ xref="paper", yref="paper",
114
+ x=0.5, y=0.5, xanchor='center', yanchor='middle',
115
+ showarrow=False,
116
+ font=dict(size=16, color="red")
117
+ )
118
+ fig.update_layout(
119
+ title="Analysis Error",
120
+ showlegend=False,
121
+ plot_bgcolor='rgba(0,0,0,0)',
122
+ paper_bgcolor='rgba(0,0,0,0)',
123
+ font=dict(color='white')
124
+ )
125
+ return fig
126
+
127
+ def create_numeric_distributions(self, df: pd.DataFrame, numeric_cols: List[str]) -> Dict[str, go.Figure]:
128
+ """Create numeric distribution plots"""
129
+ plots = {}
130
+
131
+ try:
132
+ # Multi-histogram plot
133
+ if len(numeric_cols) <= 6:
134
+ rows = (len(numeric_cols) + 2) // 3
135
+ fig = make_subplots(
136
+ rows=rows, cols=3,
137
+ subplot_titles=list(numeric_cols),
138
+ vertical_spacing=0.08
139
+ )
140
+
141
+ for i, col in enumerate(numeric_cols):
142
+ row = (i // 3) + 1
143
+ col_pos = (i % 3) + 1
144
+
145
+ # Filter out non-finite values
146
+ data = df[col].dropna()
147
+ if len(data) > 0:
148
+ fig.add_trace(
149
+ go.Histogram(
150
+ x=data,
151
+ name=col,
152
+ marker_color=self.color_palette[i % len(self.color_palette)],
153
+ opacity=0.7,
154
+ showlegend=False
155
+ ),
156
+ row=row, col=col_pos
157
+ )
158
+
159
+ fig.update_layout(
160
+ title="πŸ“Š Numeric Distributions Overview",
161
+ height=300 * rows,
162
+ plot_bgcolor='rgba(0,0,0,0)',
163
+ paper_bgcolor='rgba(0,0,0,0)',
164
+ font=dict(color='white')
165
+ )
166
+ plots['numeric_histograms'] = fig
167
+
168
+ # Box plots for outlier detection
169
+ if len(numeric_cols) > 0:
170
+ fig = go.Figure()
171
+ for i, col in enumerate(numeric_cols[:8]): # Limit to 8 columns
172
+ data = df[col].dropna()
173
+ if len(data) > 0:
174
+ fig.add_trace(go.Box(
175
+ y=data,
176
+ name=col,
177
+ marker_color=self.color_palette[i % len(self.color_palette)]
178
+ ))
179
+
180
+ fig.update_layout(
181
+ title="πŸ“¦ Box Plots - Outlier Detection",
182
+ height=500,
183
+ plot_bgcolor='rgba(0,0,0,0)',
184
+ paper_bgcolor='rgba(0,0,0,0)',
185
+ font=dict(color='white')
186
+ )
187
+ plots['box_plots'] = fig
188
+
189
+ # Violin plots for distribution shapes
190
+ if len(numeric_cols) > 0:
191
+ fig = go.Figure()
192
+ for i, col in enumerate(numeric_cols[:6]):
193
+ data = df[col].dropna()
194
+ if len(data) > 1: # Need at least 2 points for violin plot
195
+ fig.add_trace(go.Violin(
196
+ y=data,
197
+ name=col,
198
+ box_visible=True,
199
+ meanline_visible=True,
200
+ fillcolor=self.color_palette[i % len(self.color_palette)],
201
+ opacity=0.6
202
+ ))
203
+
204
+ fig.update_layout(
205
+ title="🎻 Violin Plots - Distribution Shapes",
206
+ height=500,
207
+ plot_bgcolor='rgba(0,0,0,0)',
208
+ paper_bgcolor='rgba(0,0,0,0)',
209
+ font=dict(color='white')
210
+ )
211
+ plots['violin_plots'] = fig
212
+
213
+ except Exception as e:
214
+ plots['numeric_error'] = self.create_error_plot(f"Numeric distribution error: {str(e)}")
215
+
216
+ return plots
217
+
218
+ def create_categorical_distributions(self, df: pd.DataFrame, categorical_cols: List[str]) -> Dict[str, go.Figure]:
219
+ """Create categorical distribution plots"""
220
+ plots = {}
221
+
222
+ try:
223
+ # Bar charts for categorical variables
224
+ for i, col in enumerate(categorical_cols[:4]): # Limit to 4 columns
225
+ value_counts = df[col].value_counts().head(15) # Top 15 categories
226
+
227
+ if len(value_counts) > 0:
228
+ fig = go.Figure(data=[
229
+ go.Bar(
230
+ x=value_counts.index.astype(str),
231
+ y=value_counts.values,
232
+ marker_color=self.color_palette[i % len(self.color_palette)],
233
+ text=value_counts.values,
234
+ textposition='auto'
235
+ )
236
+ ])
237
+
238
+ fig.update_layout(
239
+ title=f"πŸ“Š {col} - Value Distribution",
240
+ xaxis_title=col,
241
+ yaxis_title="Count",
242
+ height=400,
243
+ plot_bgcolor='rgba(0,0,0,0)',
244
+ paper_bgcolor='rgba(0,0,0,0)',
245
+ font=dict(color='white')
246
+ )
247
+ plots[f'categorical_{col}'] = fig
248
+
249
+ # Pie chart for first categorical variable
250
+ if len(categorical_cols) > 0:
251
+ col = categorical_cols[0]
252
+ value_counts = df[col].value_counts().head(10)
253
+
254
+ if len(value_counts) > 0:
255
+ fig = go.Figure(data=[go.Pie(
256
+ labels=value_counts.index.astype(str),
257
+ values=value_counts.values,
258
+ hole=0.3,
259
+ marker_colors=self.color_palette
260
+ )])
261
+
262
+ fig.update_layout(
263
+ title=f"πŸ₯§ {col} - Proportion Analysis",
264
+ height=500,
265
+ plot_bgcolor='rgba(0,0,0,0)',
266
+ paper_bgcolor='rgba(0,0,0,0)',
267
+ font=dict(color='white')
268
+ )
269
+ plots['pie_chart'] = fig
270
+
271
+ except Exception as e:
272
+ plots['categorical_error'] = self.create_error_plot(f"Categorical distribution error: {str(e)}")
273
+
274
+ return plots
275
+
276
+ def analyze_correlations(self, df: pd.DataFrame) -> Dict[str, Any]:
277
+ """Analyze correlations between variables"""
278
+ correlations = {}
279
+
280
+ try:
281
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
282
+
283
+ if len(numeric_cols) > 1:
284
+ # Correlation matrix
285
+ corr_matrix = df[numeric_cols].corr()
286
+
287
+ # Heatmap
288
+ fig = go.Figure(data=go.Heatmap(
289
+ z=corr_matrix.values,
290
+ x=corr_matrix.columns,
291
+ y=corr_matrix.columns,
292
+ colorscale='RdYlBu',
293
+ zmid=0,
294
+ text=np.round(corr_matrix.values, 2),
295
+ texttemplate="%{text}",
296
+ textfont={"size": 10},
297
+ colorbar=dict(title="Correlation")
298
+ ))
299
+
300
+ fig.update_layout(
301
+ title="πŸ”₯ Correlation Heatmap",
302
+ height=max(400, len(numeric_cols) * 30),
303
+ plot_bgcolor='rgba(0,0,0,0)',
304
+ paper_bgcolor='rgba(0,0,0,0)',
305
+ font=dict(color='white')
306
+ )
307
+ correlations['heatmap'] = fig
308
+
309
+ # Top correlations
310
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
311
+ corr_matrix_masked = corr_matrix.mask(mask)
312
+
313
+ # Get top positive and negative correlations
314
+ corr_pairs = []
315
+ for i in range(len(corr_matrix_masked.columns)):
316
+ for j in range(len(corr_matrix_masked.columns)):
317
+ if pd.notna(corr_matrix_masked.iloc[i, j]):
318
+ corr_pairs.append({
319
+ 'Variable 1': corr_matrix_masked.columns[i],
320
+ 'Variable 2': corr_matrix_masked.columns[j],
321
+ 'Correlation': corr_matrix_masked.iloc[i, j]
322
+ })
323
+
324
+ if corr_pairs:
325
+ corr_df = pd.DataFrame(corr_pairs)
326
+ corr_df = corr_df.reindex(corr_df['Correlation'].abs().sort_values(ascending=False).index)
327
+ correlations['top_correlations'] = corr_df.head(10)
328
+
329
+ # Scatter plot matrix for top correlated variables
330
+ if len(numeric_cols) >= 2:
331
+ top_corr_cols = corr_df.head(3)[['Variable 1', 'Variable 2']].values.flatten()
332
+ unique_cols = list(set(top_corr_cols))[:4] # Max 4 variables
333
+
334
+ if len(unique_cols) >= 2:
335
+ try:
336
+ fig = px.scatter_matrix(
337
+ df[unique_cols].dropna(),
338
+ dimensions=unique_cols,
339
+ color_discrete_sequence=self.color_palette
340
+ )
341
+
342
+ fig.update_layout(
343
+ title="🎯 Scatter Plot Matrix - Top Correlated Variables",
344
+ height=600,
345
+ plot_bgcolor='rgba(0,0,0,0)',
346
+ paper_bgcolor='rgba(0,0,0,0)',
347
+ font=dict(color='white')
348
+ )
349
+ correlations['scatter_matrix'] = fig
350
+ except Exception:
351
+ pass # Skip if scatter matrix fails
352
+
353
+ except Exception as e:
354
+ correlations['error'] = f"Correlation analysis failed: {str(e)}"
355
+
356
+ return correlations
357
+
358
+ def generate_insights(self, df: pd.DataFrame) -> List[Dict[str, str]]:
359
+ """Generate AI-powered insights about the data"""
360
+ insights = []
361
+
362
+ try:
363
+ # Basic statistics insights
364
+ insights.append({
365
+ 'title': 'πŸ“Š Dataset Overview',
366
+ 'description': f"Dataset contains {len(df):,} rows and {len(df.columns)} columns. "
367
+ f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB. "
368
+ f"Missing values: {df.isnull().sum().sum():,} ({df.isnull().sum().sum() / df.size * 100:.1f}%)."
369
+ })
370
+
371
+ # Numeric columns insights
372
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
373
+ if len(numeric_cols) > 0:
374
+ try:
375
+ # Find columns with high variance
376
+ variances = df[numeric_cols].var().sort_values(ascending=False)
377
+ high_var_col = variances.index[0]
378
+
379
+ insights.append({
380
+ 'title': 'πŸ“ˆ Variance Analysis',
381
+ 'description': f"'{high_var_col}' shows the highest variance ({variances.iloc[0]:.2f}), "
382
+ f"indicating significant spread in values. This column might contain outliers "
383
+ f"or represent a key differentiating factor in your dataset."
384
+ })
385
+
386
+ # Skewness analysis
387
+ skewed_cols = []
388
+ for col in numeric_cols:
389
+ try:
390
+ skewness = df[col].skew()
391
+ if abs(skewness) > 1:
392
+ skewed_cols.append((col, skewness))
393
+ except:
394
+ continue
395
+
396
+ if skewed_cols:
397
+ insights.append({
398
+ 'title': 'πŸ“ Distribution Skewness',
399
+ 'description': f"Found {len(skewed_cols)} heavily skewed columns. "
400
+ f"Most skewed: '{skewed_cols[0][0]}' (skewness: {skewed_cols[0][1]:.2f}). "
401
+ f"Consider log transformation or outlier treatment for better modeling."
402
+ })
403
+ except Exception:
404
+ pass
405
+
406
+ # Categorical insights
407
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
408
+ if len(categorical_cols) > 0:
409
+ try:
410
+ cardinalities = []
411
+ for col in categorical_cols:
412
+ unique_count = df[col].nunique()
413
+ cardinalities.append((col, unique_count))
414
+
415
+ cardinalities.sort(key=lambda x: x[1], reverse=True)
416
+
417
+ insights.append({
418
+ 'title': '🏷️ Categorical Analysis',
419
+ 'description': f"'{cardinalities[0][0]}' has the highest cardinality ({cardinalities[0][1]} unique values). "
420
+ f"High cardinality columns might need encoding strategies for machine learning. "
421
+ f"Consider grouping rare categories or using embedding techniques."
422
+ })
423
+ except Exception:
424
+ pass
425
+
426
+ # Missing data patterns
427
+ try:
428
+ missing_data = df.isnull().sum()
429
+ missing_cols = missing_data[missing_data > 0].sort_values(ascending=False)
430
+
431
+ if len(missing_cols) > 0:
432
+ insights.append({
433
+ 'title': '❓ Missing Data Patterns',
434
+ 'description': f"'{missing_cols.index[0]}' has the most missing values ({missing_cols.iloc[0]:,} - "
435
+ f"{missing_cols.iloc[0] / len(df) * 100:.1f}%). "
436
+ f"Analyze if missing data is random or systematic. "
437
+ f"Consider imputation strategies or feature engineering."
438
+ })
439
+ except Exception:
440
+ pass
441
+
442
+ # Correlation insights
443
+ if len(numeric_cols) > 1:
444
+ try:
445
+ corr_matrix = df[numeric_cols].corr()
446
+ mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
447
+ corr_matrix_masked = corr_matrix.mask(mask)
448
+
449
+ max_corr = 0
450
+ max_pair = None
451
+ for i in range(len(corr_matrix_masked.columns)):
452
+ for j in range(len(corr_matrix_masked.columns)):
453
+ if pd.notna(corr_matrix_masked.iloc[i, j]):
454
+ if abs(corr_matrix_masked.iloc[i, j]) > abs(max_corr):
455
+ max_corr = corr_matrix_masked.iloc[i, j]
456
+ max_pair = (corr_matrix_masked.columns[i], corr_matrix_masked.columns[j])
457
+
458
+ if max_pair and abs(max_corr) > 0.5:
459
+ insights.append({
460
+ 'title': 'πŸ”— Strong Correlations',
461
+ 'description': f"Strong correlation found between '{max_pair[0]}' and '{max_pair[1]}' "
462
+ f"(r = {max_corr:.3f}). This suggests potential multicollinearity. "
463
+ f"Consider feature selection or dimensionality reduction techniques."
464
+ })
465
+ except Exception:
466
+ pass
467
+
468
+ except Exception as e:
469
+ insights.append({
470
+ 'title': 'Analysis Error',
471
+ 'description': f"Error generating insights: {str(e)}"
472
+ })
473
+
474
+ return insights
475
+
476
+ def assess_data_quality(self, df: pd.DataFrame) -> Dict[str, Any]:
477
+ """Assess data quality with visualizations"""
478
+ quality = {}
479
+
480
+ try:
481
+ # Missing values heatmap
482
+ if df.isnull().sum().sum() > 0:
483
+ missing_data = df.isnull().sum().sort_values(ascending=False)
484
+ missing_data = missing_data[missing_data > 0]
485
+
486
+ if len(missing_data) > 0:
487
+ fig = go.Figure([go.Bar(
488
+ x=missing_data.index,
489
+ y=missing_data.values,
490
+ marker_color='#FF6B6B',
491
+ text=missing_data.values,
492
+ textposition='auto'
493
+ )])
494
+
495
+ fig.update_layout(
496
+ title="❓ Missing Values by Column",
497
+ xaxis_title="Columns",
498
+ yaxis_title="Missing Count",
499
+ height=400,
500
+ plot_bgcolor='rgba(0,0,0,0)',
501
+ paper_bgcolor='rgba(0,0,0,0)',
502
+ font=dict(color='white')
503
+ )
504
+ quality['missing_values'] = fig
505
+
506
+ # Data types distribution
507
+ dtype_counts = df.dtypes.value_counts()
508
+
509
+ if len(dtype_counts) > 0:
510
+ fig = go.Figure(data=[go.Pie(
511
+ labels=[str(dtype) for dtype in dtype_counts.index],
512
+ values=dtype_counts.values,
513
+ hole=0.3,
514
+ marker_colors=self.color_palette
515
+ )])
516
+
517
+ fig.update_layout(
518
+ title="πŸ”§ Data Types Distribution",
519
+ height=400,
520
+ plot_bgcolor='rgba(0,0,0,0)',
521
+ paper_bgcolor='rgba(0,0,0,0)',
522
+ font=dict(color='white')
523
+ )
524
+ quality['data_types'] = fig
525
+
526
+ # Duplicate analysis
527
+ duplicates = df.duplicated().sum()
528
+ if duplicates > 0:
529
+ quality['duplicates'] = {
530
+ 'count': duplicates,
531
+ 'percentage': duplicates / len(df) * 100
532
+ }
533
+
534
+ except Exception as e:
535
+ quality['error'] = f"Data quality assessment failed: {str(e)}"
536
+
537
+ return quality
538
+
539
+ def perform_advanced_analysis(self, df: pd.DataFrame) -> Dict[str, Any]:
540
+ """Perform advanced statistical analysis"""
541
+ advanced = {}
542
+
543
+ try:
544
+ numeric_cols = df.select_dtypes(include=[np.number]).columns
545
+
546
+ # Outlier detection using IQR method
547
+ if len(numeric_cols) > 0:
548
+ outlier_counts = {}
549
+ for col in numeric_cols:
550
+ try:
551
+ data = df[col].dropna()
552
+ if len(data) > 0:
553
+ Q1 = data.quantile(0.25)
554
+ Q3 = data.quantile(0.75)
555
+ IQR = Q3 - Q1
556
+ lower_bound = Q1 - 1.5 * IQR
557
+ upper_bound = Q3 + 1.5 * IQR
558
+
559
+ outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
560
+ outlier_counts[col] = len(outliers)
561
+ except Exception:
562
+ outlier_counts[col] = 0
563
+
564
+ if outlier_counts:
565
+ outlier_df = pd.DataFrame(list(outlier_counts.items()),
566
+ columns=['Column', 'Outlier_Count'])
567
+ outlier_df = outlier_df.sort_values('Outlier_Count', ascending=False)
568
+ advanced['outliers'] = outlier_df
569
+
570
+ # Statistical tests
571
+ categorical_cols = df.select_dtypes(include=['object', 'category']).columns
572
+
573
+ if len(categorical_cols) >= 2 and SCIPY_AVAILABLE:
574
+ try:
575
+ col1, col2 = categorical_cols[0], categorical_cols[1]
576
+ contingency_table = pd.crosstab(df[col1], df[col2])
577
+
578
+ if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
579
+ chi2, p_value, dof, expected = chi2_contingency(contingency_table)
580
+
581
+ advanced['chi_square_test'] = {
582
+ 'variables': [col1, col2],
583
+ 'chi2_statistic': chi2,
584
+ 'p_value': p_value,
585
+ 'interpretation': 'Dependent' if p_value < 0.05 else 'Independent'
586
+ }
587
+ except Exception:
588
+ pass # Skip if test fails
589
+
590
+ except Exception as e:
591
+ advanced['error'] = f"Advanced analysis failed: {str(e)}"
592
+
593
+ return advanced
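The outlier, skewness, and chi-square checks above follow textbook recipes: values outside the 1.5×IQR fence are flagged as outliers, |skewness| > 1 marks a heavily skewed distribution, and a chi-square test of independence on a contingency table decides whether two categorical columns are related (p < 0.05 is read as "dependent"). As a self-contained sketch of the same logic on a small synthetic DataFrame (column names borrowed from sample_data.csv, data generated purely for illustration, not part of this commit):

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

rng = np.random.default_rng(42)
toy = pd.DataFrame({
    "sales_amount": rng.lognormal(mean=7.5, sigma=0.6, size=500),   # right-skewed numeric column
    "region": rng.choice(["North", "South", "East", "West"], size=500),
    "product": rng.choice(["Widget A", "Widget B", "Gadget X"], size=500),
})

# 1.5*IQR fence, mirroring perform_advanced_analysis above
q1, q3 = toy["sales_amount"].quantile([0.25, 0.75])
iqr = q3 - q1
outliers = toy[(toy["sales_amount"] < q1 - 1.5 * iqr) | (toy["sales_amount"] > q3 + 1.5 * iqr)]
print(f"IQR outliers: {len(outliers)}")

# skewness before and after the log transform suggested in the insight text
print(f"skewness raw:   {toy['sales_amount'].skew():.2f}")
print(f"skewness log1p: {np.log1p(toy['sales_amount']).skew():.2f}")

# chi-square test of independence between two categorical columns
chi2, p_value, dof, expected = chi2_contingency(pd.crosstab(toy["region"], toy["product"]))
print("Dependent" if p_value < 0.05 else "Independent", f"(p = {p_value:.3f})")
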
main.py ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,12 @@
1
+ streamlit>=1.28.0
2
+ pandas>=1.5.0
3
+ numpy>=1.24.0
4
+ plotly>=5.15.0
5
+ requests>=2.31.0
6
+ faiss-cpu>=1.7.4
7
+ scipy>=1.10.0
8
+ seaborn>=0.12.0
9
+ sentence-transformers>=2.2.0
10
+ scikit-learn>=1.3.0
11
+ pathlib2>=2.3.7
12
+ python-dotenv>=1.0.0
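
To check an existing environment against these pins, a small sketch using the standard-library importlib.metadata can be handy (the helper itself is not part of this commit; the distribution names are copied from the list above):

from importlib.metadata import PackageNotFoundError, version

pinned = ["streamlit", "pandas", "numpy", "plotly", "requests", "faiss-cpu",
          "scipy", "seaborn", "sentence-transformers", "scikit-learn",
          "pathlib2", "python-dotenv"]

for name in pinned:
    try:
        print(f"{name}=={version(name)}")        # installed version, compare against the pin
    except PackageNotFoundError:
        print(f"{name} is NOT installed")
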
setup.bat ADDED
File without changes
setup.sh ADDED
File without changes
test.py ADDED
@@ -0,0 +1,175 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script to verify the Neural Data Analyst application works correctly
4
+ Run this to check if all dependencies are available and create sample files
5
+ """
6
+
7
+ import sys
8
+ import os
9
+ from pathlib import Path
10
+
11
+ def check_dependencies():
12
+ """Check if all required dependencies are installed"""
13
+ print("πŸ” Checking dependencies...")
14
+
15
+ required_packages = [
16
+ 'streamlit',
17
+ 'pandas',
18
+ 'numpy',
19
+ 'plotly',
20
+ 'requests'
21
+ ]
22
+
23
+ missing_packages = []
24
+
25
+ for package in required_packages:
26
+ try:
27
+ __import__(package)
28
+ print(f"βœ… {package}")
29
+ except ImportError:
30
+ print(f"❌ {package} - MISSING")
31
+ missing_packages.append(package)
32
+
33
+ # Optional packages
34
+ optional_packages = [
35
+ 'scipy',
36
+ 'python-dotenv'
37
+ ]
38
+
39
+ print("\nπŸ” Checking optional dependencies...")
40
+ for package in optional_packages:
41
+ try:
42
+ # python-dotenv is imported as 'dotenv', not 'python_dotenv'
+ module_name = 'dotenv' if package == 'python-dotenv' else package
+ __import__(module_name)
43
+ print(f"βœ… {package} (optional)")
44
+ except ImportError:
45
+ print(f"⚠️ {package} (optional) - not installed")
46
+
47
+ if missing_packages:
48
+ print(f"\n❌ Missing required packages: {', '.join(missing_packages)}")
49
+ print("Install them with: pip install " + " ".join(missing_packages))
50
+ return False
51
+ else:
52
+ print("\nβœ… All required dependencies are installed!")
53
+ return True
54
+
55
+ def create_sample_files():
56
+ """Create sample configuration files"""
57
+ print("\nπŸ“ Creating sample files...")
58
+
59
+ # Create .env file template
60
+ env_content = """# Groq API Configuration
61
+ # Get your API key from: https://console.groq.com/keys
62
+ GROQ_API_KEY=your_groq_api_key_here
63
+
64
+ # Optional: Set other environment variables
65
+ # DEBUG=True
66
+ """
67
+
68
+ env_file = Path('.env.template')
69
+ if not env_file.exists():
70
+ with open(env_file, 'w') as f:
71
+ f.write(env_content)
72
+ print(f"βœ… Created {env_file}")
73
+ else:
74
+ print(f"ℹ️ {env_file} already exists")
75
+
76
+ # Create sample CSV data
77
+ sample_csv = Path('sample_data.csv')
78
+ if not sample_csv.exists():
79
+ csv_content = """customer_id,customer_name,product,sales_amount,order_date,region,sales_rep
80
+ 1,Customer_1,Widget A,2147.23,2023-01-01,North,John Smith
81
+ 2,Customer_2,Widget B,1823.45,2023-01-02,South,Jane Doe
82
+ 3,Customer_3,Widget C,2456.78,2023-01-03,East,Bob Johnson
83
+ 4,Customer_4,Gadget X,1934.56,2023-01-04,West,Alice Brown
84
+ 5,Customer_5,Widget A,2234.67,2023-01-05,North,John Smith
85
+ """
86
+ with open(sample_csv, 'w') as f:
87
+ f.write(csv_content)
88
+ print(f"βœ… Created {sample_csv}")
89
+ else:
90
+ print(f"ℹ️ {sample_csv} already exists")
91
+
92
+ def create_required_modules():
93
+ """Create the required module files if they don't exist"""
94
+ print("\nπŸ“ Checking required modules...")
95
+
96
+ # Check if eda_analyzer.py exists
97
+ if not Path('eda_analyzer.py').exists():
98
+ print("❌ eda_analyzer.py not found!")
99
+ print(" Please save the EDA Analyzer code as 'eda_analyzer.py'")
100
+ return False
101
+ else:
102
+ print("βœ… eda_analyzer.py found")
103
+
104
+ # Check if database_manager.py exists
105
+ if not Path('database_manager.py').exists():
106
+ print("❌ database_manager.py not found!")
107
+ print(" Please save the Database Manager code as 'database_manager.py'")
108
+ return False
109
+ else:
110
+ print("βœ… database_manager.py found")
111
+
112
+ return True
113
+
114
+ def test_imports():
115
+ """Test if the modules can be imported"""
116
+ print("\nπŸ§ͺ Testing module imports...")
117
+
118
+ try:
119
+ from eda_analyzer import EDAAnalyzer
120
+ print("βœ… EDAAnalyzer imported successfully")
121
+ except Exception as e:
122
+ print(f"❌ Failed to import EDAAnalyzer: {e}")
123
+ return False
124
+
125
+ try:
126
+ from database_manager import DatabaseManager
127
+ print("βœ… DatabaseManager imported successfully")
128
+ except Exception as e:
129
+ print(f"❌ Failed to import DatabaseManager: {e}")
130
+ return False
131
+
132
+ return True
133
+
134
+ def main():
135
+ """Main test function"""
136
+ print("πŸš€ Neural Data Analyst - Setup Test")
137
+ print("=" * 50)
138
+
139
+ # Check Python version
140
+ python_version = sys.version_info
141
+ print(f"🐍 Python version: {python_version.major}.{python_version.minor}.{python_version.micro}")
142
+
143
+ if python_version < (3, 8):  # streamlit>=1.28 and pandas>=1.5 require Python 3.8+
144
+ print("❌ Python 3.7+ required!")
145
+ return False
146
+ else:
147
+ print("βœ… Python version OK")
148
+
149
+ # Run all checks
150
+ deps_ok = check_dependencies()
151
+ if not deps_ok:
152
+ return False
153
+
154
+ create_sample_files()
155
+
156
+ modules_ok = create_required_modules()
157
+ if not modules_ok:
158
+ return False
159
+
160
+ imports_ok = test_imports()
161
+ if not imports_ok:
162
+ return False
163
+
164
+ print("\nπŸŽ‰ Setup test completed successfully!")
165
+ print("\nπŸ“‹ Next steps:")
166
+ print("1. Copy .env.template to .env and add your Groq API key (optional)")
167
+ print("2. Run: streamlit run app.py")
168
+ print("3. Upload sample_data.csv or your own data file")
169
+ print("4. Explore the analysis features!")
170
+
171
+ return True
172
+
173
+ if __name__ == "__main__":
174
+ success = main()
175
+ sys.exit(0 if success else 1)
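
Since main() exits with 0 on success and 1 on failure, the script can gate an automated setup step. A minimal illustrative wrapper (not part of this commit):

import subprocess
import sys

# run the setup test in a child interpreter and propagate failure
result = subprocess.run([sys.executable, "test.py"])
if result.returncode != 0:
    sys.exit("Setup test failed - fix the reported issues before launching the app.")
print("Setup test passed - ready to launch the app.")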