Spaces:

Agents-MCP-Hackathon
/

Statistical-Analysis-MCP

Running

File size: 127,119 Bytes

import gradio as gr
import numpy as np
import pandas as pd
from scipy import stats
from typing import List, Dict, Any, Optional, Union

def parse_numeric_input(data: str) -> List[float]:
    """
    Parse comma-separated string of numbers into a list of floats.

    Tokens are split on commas and stripped of surrounding whitespace;
    empty tokens (e.g. from a trailing comma or ",,") are ignored.

    Args:
        data (str): Comma-separated string of numbers (e.g., "1.2,2.3,3.4,2.1")

    Returns:
        List[float]: Parsed numeric data, in input order

    Raises:
        ValueError: If any non-empty token cannot be converted to float, or
            if the string contains no numbers at all. In the first case the
            original conversion error is attached as ``__cause__``.

    Example:
        >>> parse_numeric_input("85.2,90.1,78.5,92.3")
        [85.2, 90.1, 78.5, 92.3]
    """
    tokens = [token.strip() for token in data.split(',')]

    # Only the float() conversion lives inside the try, so the friendly
    # message is tied to the failing operation rather than to the wording
    # of the interpreter's own error text.
    try:
        parsed = [float(token) for token in tokens if token]
    except ValueError as exc:
        raise ValueError(f"Cannot parse '{data}' as comma-separated numbers") from exc

    if not parsed:
        raise ValueError("No valid numbers found in input string")
    return parsed

def welch_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts two groups of numeric data as comma-separated strings or DataFrame columns and performs Welch's t-test. This test determines whether two independent groups have significantly different means.
    This test is valid even when populations have different variances. Default to this test instead of Student's t-test if you are unsure about population variance.
    This test calculates a t-statistic using Welch's formula that accounts for unequal variances. Given an alternative hypothesis (group1 ≠ group2, group1 < group2, or group1 > group2),
    it outputs the p-value: the probability of observing this result (or more extreme) if no true difference exists. Results are considered statistically significant
    when p-value < alpha (typically 0.05). Cohen's d measures practical effect size, calculated using pooled standard deviation for consistency with other t-tests, with interpretation:
    |d| < 0.2 = negligible, 0.2-0.5 = small, 0.5-0.8 = medium, >0.8 = large (custom thresholds may be used).
    EXAMPLE USE CASES: treatment vs control groups, before/after measurements with different participants,
    comparing performance between demographic groups.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing group data in first two columns.
                                           If provided, group1_str and group2_str will be ignored.
                                           Missing values are dropped from each column independently.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first group.
                                   Example: "12.1,15.3,18.7,14.2,16.8" (reaction times for Group A)
                                   Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second group.
                                   Example: "22.4,19.8,25.1,21.3" (reaction times for Group B)
                                   Only used if dataframe is None or empty.
        alternative (str): Direction of the alternative hypothesis:
                          - "two-sided": group1 mean ≠ group2 mean (different in either direction)
                          - "less": group1 mean < group2 mean (group1 is smaller)
                          - "greater": group1 mean > group2 mean (group1 is larger)
        alpha (float): Significance level for the test (probability of Type I error).
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large

    Returns:
        dict: Comprehensive test results with the following keys (all numeric values are
        plain Python float/int/bool so the result can be serialized directly). If the inputs
        are invalid or the test cannot be computed, a dict with a single "error" key (str)
        is returned instead.
            - test_type (str): Always "Welch's t-test"
            - t_statistic (float): The calculated t-value using Welch's formula
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (float): Welch's adjusted df (usually non-integer), accounts for unequal variances
            - cohens_d (float): Standardized effect size. Positive means group1 > group2, negative means group1 < group2
            - pooled_std (float): Pooled standard deviation used in effect size calculation
            - group1_stats (dict): Descriptive statistics for group1 (mean, std, n)
            - group2_stats (dict): Descriptive statistics for group2 (mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cohen's d magnitude
            - alternative_hypothesis (str): Echo of alternative parameter
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    try:
        # Parse effect size thresholds. Only conversion failures (bad number,
        # or a non-string argument without .split) are treated as a format error.
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Reject an unknown hypothesis direction up front with a clear message
        # instead of letting it surface as an "unexpected" SciPy error.
        if alternative not in ("two-sided", "less", "greater"):
            return {"error": f"Invalid alternative '{alternative}'. Use 'two-sided', 'less', or 'greater'."}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns automatically
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns. Found {len(dataframe.columns)} columns."}

            # Extract and validate data from first two columns
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Remove NaN values and convert to list (groups are independent,
                # so each column is cleaned on its own)
                group1 = col1_numeric.dropna().tolist()
                group2 = col2_numeric.dropna().tolist()

                # Check if we lost too much data due to non-numeric values
                original_count1 = len(dataframe.iloc[:, 0].dropna())
                original_count2 = len(dataframe.iloc[:, 1].dropna())

                if len(group1) < original_count1 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(group1)} out of {original_count1} values could be converted to numbers."}

                if len(group2) < original_count2 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 2 contains too many non-numeric values. Only {len(group2)} out of {original_count2} values could be converted to numbers."}

            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}

        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both groups. Do not leave inputs empty."}

        # Validate extracted data
        if len(group1) < 2:
            return {"error": f"Group 1 must have at least 2 observations. Found {len(group1)} values."}

        if len(group2) < 2:
            return {"error": f"Group 2 must have at least 2 observations. Found {len(group2)} values."}

        # Convert to numpy arrays for calculations
        data1 = np.array(group1)
        data2 = np.array(group2)
        n1, n2 = len(data1), len(data2)

        # Descriptive statistics (sample standard deviation, ddof=1), stored as
        # native Python numbers
        desc1 = {"mean": float(np.mean(data1)), "std": float(np.std(data1, ddof=1)), "n": n1}
        desc2 = {"mean": float(np.mean(data2)), "std": float(np.std(data2, ddof=1)), "n": n2}
        s1_sq, s2_sq = desc1["std"]**2, desc2["std"]**2

        # Effect size denominator (pooled SD, kept for consistency with the other t-tests)
        pooled_std = float(np.sqrt(((n1 - 1) * s1_sq + (n2 - 1) * s2_sq) / (n1 + n2 - 2)))

        # With two constant groups the standard error, Welch's df and Cohen's d
        # are all 0/0; report that instead of returning NaN labelled "large".
        if pooled_std == 0:
            return {"error": "Both groups have zero variance; the t-statistic and Cohen's d are undefined."}

        # Perform Welch's t-test (unequal variances)
        t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=False, alternative=alternative)
        t_stat, p_value = float(t_stat), float(p_value)

        # Welch-Satterthwaite degrees of freedom
        df = float((s1_sq/n1 + s2_sq/n2)**2 / ((s1_sq/n1)**2/(n1-1) + (s2_sq/n2)**2/(n2-1)))

        cohens_d = (desc1["mean"] - desc2["mean"]) / pooled_std

        # Interpretation using the supplied effect-size thresholds
        significant = bool(p_value < alpha)
        abs_d = abs(cohens_d)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": "Welch's t-test",
            "t_statistic": t_stat,
            "p_value": p_value,
            "degrees_of_freedom": df,
            "cohens_d": cohens_d,
            "pooled_std": pooled_std,
            "group1_stats": desc1,
            "group2_stats": desc2,
            "significant": significant,
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }

    except Exception as e:
        # Tool boundary: callers expect an error dict rather than an exception.
        return {"error": f"Unexpected error in Welch's t-test: {str(e)}"}

def student_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts two groups of numeric data as comma-separated strings or DataFrame columns and performs Student's t-test.
    This test determines whether two independent groups have significantly different means, assuming populations from which the groups were sampled have equal
    variances (if this assumption is violated, or if equal population variance cannot be verified, use Welch's t-test instead). The test calculates a t-statistic quantifying the mean
    difference as a multiple of pooled standard deviation. Given an alternative hypothesis (group1 ≠ group2, group1 < group2, or group1 > group2),
    it outputs the p-value: the probability of observing this result (or more extreme) if no true difference exists. Results are statistically significant
    when p-value < alpha (typically 0.05). Cohen's d measures practical effect size, standardized by pooled standard deviation, with interpretation:
    |d| < 0.2 = negligible, 0.2-0.5 = small, 0.5-0.8 = medium, >0.8 = large (custom thresholds may be used).
    EXAMPLE USE CASES: treatment vs control groups, before/after measurements with different participants,
    comparing performance between demographic groups.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing group data in first two columns.
                                           If provided, group1_str and group2_str will be ignored.
                                           Missing values are dropped from each column independently.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first group.
                                   Example: "85.2,90.1,78.5,92.3" (test scores for Group A)
                                   Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second group.
                                   Example: "88.1,85.7,91.2,87.4" (test scores for Group B)
                                   Only used if dataframe is None or empty.
        alternative (str): Direction of the alternative hypothesis:
                          - "two-sided": group1 mean ≠ group2 mean (different in either direction)
                          - "less": group1 mean < group2 mean (group1 is smaller)
                          - "greater": group1 mean > group2 mean (group1 is larger)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large
                               These are Cohen's canonical benchmarks for effect size interpretation.

    Returns:
        dict: Comprehensive test results with the following keys (all numeric values are
        plain Python float/int/bool so the result can be serialized directly). If the inputs
        are invalid or the test cannot be computed, a dict with a single "error" key (str)
        is returned instead.
            - test_type (str): Always "Student's t-test"
            - t_statistic (float): The calculated t-value, which measures how many standard errors the difference
                    between group means is away from zero (assuming the null hypothesis is true).
                    Larger absolute values indicate the observed difference is less likely under the null hypothesis.
            - p_value (float): Probability of observing this result (or more extreme) if null hypothesis is true.
                              Values < alpha indicate statistical significance.
            - degrees_of_freedom (int): df = n1 + n2 - 2, degrees of freedom for the pooled variance estimate, used for determining critical t-values.
            - cohens_d (float): Effect size measure. Positive means group1 > group2, negative means group1 < group2.
                               Interpreted using Cohen's canonical benchmarks: negligible (<0.2), small (0.2), medium (0.5), large (0.8).
            - pooled_std (float): Combined standard deviation used in Cohen's d calculation.
            - group1_stats (dict): Descriptive statistics for group1 (mean, std, n)
            - group2_stats (dict): Descriptive statistics for group2 (mean, std, n)
            - significant (bool): True if p_value < alpha, False otherwise
            - effect_size (str): Categorical interpretation ("negligible", "small", "medium", "large") based on |cohens_d| and effect_thresholds
            - alternative_hypothesis (str): Echo of the alternative parameter used
            - alpha (float): Echo of the significance level used
            - effect_thresholds (List[float]): Echo of the thresholds used
    """
    try:
        # Parse effect size thresholds. Only conversion failures (bad number,
        # or a non-string argument without .split) are treated as a format error.
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Reject an unknown hypothesis direction up front with a clear message
        # instead of letting it surface as an "unexpected" SciPy error.
        if alternative not in ("two-sided", "less", "greater"):
            return {"error": f"Invalid alternative '{alternative}'. Use 'two-sided', 'less', or 'greater'."}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns automatically
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns. Found {len(dataframe.columns)} columns."}

            # Extract and validate data from first two columns
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Remove NaN values and convert to list (groups are independent,
                # so each column is cleaned on its own)
                group1 = col1_numeric.dropna().tolist()
                group2 = col2_numeric.dropna().tolist()

                # Check if we lost too much data due to non-numeric values
                original_count1 = len(dataframe.iloc[:, 0].dropna())
                original_count2 = len(dataframe.iloc[:, 1].dropna())

                if len(group1) < original_count1 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(group1)} out of {original_count1} values could be converted to numbers."}

                if len(group2) < original_count2 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 2 contains too many non-numeric values. Only {len(group2)} out of {original_count2} values could be converted to numbers."}

            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}

        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both groups. Do not leave inputs empty."}

        # Validate extracted data
        if len(group1) < 2:
            return {"error": f"Group 1 must have at least 2 observations. Found {len(group1)} values."}

        if len(group2) < 2:
            return {"error": f"Group 2 must have at least 2 observations. Found {len(group2)} values."}

        # Convert to numpy arrays for calculations
        data1 = np.array(group1)
        data2 = np.array(group2)
        n1, n2 = len(data1), len(data2)

        # Descriptive statistics (sample standard deviation, ddof=1), stored as
        # native Python numbers
        desc1 = {"mean": float(np.mean(data1)), "std": float(np.std(data1, ddof=1)), "n": n1}
        desc2 = {"mean": float(np.mean(data2)), "std": float(np.std(data2, ddof=1)), "n": n2}

        # Degrees of freedom (pooled)
        df = n1 + n2 - 2

        # Pooled standard deviation: denominator of Cohen's d
        pooled_std = float(np.sqrt(((n1 - 1) * desc1["std"]**2 + (n2 - 1) * desc2["std"]**2) / df))

        # With two constant groups the standard error and Cohen's d are 0/0;
        # report that instead of returning NaN labelled "large".
        if pooled_std == 0:
            return {"error": "Both groups have zero variance; the t-statistic and Cohen's d are undefined."}

        # Perform Student's t-test (equal variances)
        t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=True, alternative=alternative)
        t_stat, p_value = float(t_stat), float(p_value)

        cohens_d = (desc1["mean"] - desc2["mean"]) / pooled_std

        # Interpretation using the supplied effect-size thresholds
        significant = bool(p_value < alpha)
        abs_d = abs(cohens_d)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": "Student's t-test",
            "t_statistic": t_stat,
            "p_value": p_value,
            "degrees_of_freedom": df,
            "cohens_d": cohens_d,
            "pooled_std": pooled_std,
            "group1_stats": desc1,
            "group2_stats": desc2,
            "significant": significant,
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }

    except Exception as e:
        # Tool boundary: callers expect an error dict rather than an exception.
        return {"error": f"Unexpected error in Student's t-test: {str(e)}"}

def paired_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts two groups of paired numeric data as comma-separated strings or DataFrame columns and performs a paired samples t-test.
    This test determines whether there is a significant difference between two related measurements (same subjects measured twice),
    such as before/after treatment measurements. Unlike independent samples t-tests, this test accounts for the correlation between
    paired observations, making it more powerful for detecting differences in repeated measures designs. The test calculates a t-statistic
    based on the mean of the differences between paired observations. Given an alternative hypothesis (group1 ≠ group2, group1 < group2,
    or group1 > group2), it outputs the p-value: the probability of observing this result (or more extreme) if no true difference exists.
    Results are statistically significant when p-value < alpha (typically 0.05). Cohen's d measures practical effect size, calculated
    as the mean difference divided by the standard deviation of differences, with interpretation: |d| < 0.2 = negligible, 0.2-0.5 = small,
    0.5-0.8 = medium, >0.8 = large (custom thresholds may be used).
    EXAMPLE USE CASES: before/after treatment measurements on same subjects, pre/post test scores, repeated measurements over time.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing paired data in first two columns.
                                           If provided, group1_str and group2_str will be ignored.
                                           Each row is one pair; rows where both values are missing are skipped,
                                           and a row with a usable value in only one column is reported as an error.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first measurement.
                                   Example: "85.2,90.1,78.5,92.3" (pre-test scores)
                                   Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second measurement.
                                   Example: "88.1,95.7,82.2,94.4" (post-test scores)
                                   Only used if dataframe is None or empty.
        alternative (str): Direction of the alternative hypothesis:
                          - "two-sided": group1 mean ≠ group2 mean (different in either direction)
                          - "less": group1 mean < group2 mean (group1 is smaller)
                          - "greater": group1 mean > group2 mean (group1 is larger)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large

    Returns:
        dict: Comprehensive test results with the following keys (all numeric values are
        plain Python float/int/bool so the result can be serialized directly). If the inputs
        are invalid or the test cannot be computed, a dict with a single "error" key (str)
        is returned instead.
            - test_type (str): Always "Paired samples t-test"
            - t_statistic (float): The calculated t-value based on mean difference and standard error of differences
                                  (computed on group1 - group2)
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (int): df = n - 1, where n is the number of paired observations
            - cohens_d (float): Effect size measure. Positive means group2 > group1, negative means group1 > group2
                               (computed on group2 - group1, so its sign is opposite to t_statistic)
            - pooled_std (float): Standard deviation of the differences (used in Cohen's d calculation)
            - group1_stats (dict): Descriptive statistics for group1 (mean, std, n)
            - group2_stats (dict): Descriptive statistics for group2 (mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cohen's d magnitude
            - alternative_hypothesis (str): Echo of alternative parameter
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    try:
        # Parse effect size thresholds. Only conversion failures (bad number,
        # or a non-string argument without .split) are treated as a format error.
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Reject an unknown hypothesis direction up front with a clear message
        # instead of letting it surface as an "unexpected" SciPy error.
        if alternative not in ("two-sided", "less", "greater"):
            return {"error": f"Invalid alternative '{alternative}'. Use 'two-sided', 'less', or 'greater'."}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns automatically
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns. Found {len(dataframe.columns)} columns."}

            # Extract and validate data from first two columns
            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Row-wise masks of usable values, kept so pair alignment can be verified
                usable1 = col1_numeric.notna().to_numpy()
                usable2 = col2_numeric.notna().to_numpy()

                group1 = col1_numeric[usable1].tolist()
                group2 = col2_numeric[usable2].tolist()

                # Check if we lost too much data due to non-numeric values
                original_count1 = len(dataframe.iloc[:, 0].dropna())
                original_count2 = len(dataframe.iloc[:, 1].dropna())

                if len(group1) < original_count1 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(group1)} out of {original_count1} values could be converted to numbers."}

                if len(group2) < original_count2 * 0.5:  # Lost more than 50% of data
                    return {"error": f"Column 2 contains too many non-numeric values. Only {len(group2)} out of {original_count2} values could be converted to numbers."}

                # Equal counts are not enough for paired data: if the gaps sit in
                # different rows, compacting each column separately would pair
                # values that belong to different subjects.
                if len(group1) == len(group2) and bool((usable1 != usable2).any()):
                    return {"error": "Paired samples are misaligned: some rows have a usable value in only one of the two columns. Remove or complete those rows so that every row is a full pair."}

            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}

        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both groups. Do not leave inputs empty."}

        # Validate extracted data - paired samples must have equal length
        if len(group1) != len(group2):
            return {"error": f"Paired samples must have equal length. Group1 has {len(group1)} observations, Group2 has {len(group2)} observations."}

        if len(group1) < 2:
            return {"error": f"Need at least 2 paired observations. Found {len(group1)} pairs."}

        # Convert to numpy arrays for calculations
        data1 = np.array(group1)
        data2 = np.array(group2)

        # Descriptive statistics (sample standard deviation, ddof=1), stored as
        # native Python numbers
        desc1 = {"mean": float(np.mean(data1)), "std": float(np.std(data1, ddof=1)), "n": len(data1)}
        desc2 = {"mean": float(np.mean(data2)), "std": float(np.std(data2, ddof=1)), "n": len(data2)}

        # Per-pair differences (group2 - group1) drive the effect size
        differences = data2 - data1
        mean_diff = float(np.mean(differences))
        std_diff = float(np.std(differences, ddof=1))

        # Identical differences for every pair give a zero standard error, so
        # the t-statistic and Cohen's d would be a division by zero.
        if std_diff == 0:
            return {"error": "The paired differences have zero variance; the t-statistic and Cohen's d are undefined."}

        # Perform paired t-test
        t_stat, p_value = stats.ttest_rel(data1, data2, alternative=alternative)
        t_stat, p_value = float(t_stat), float(p_value)

        # Degrees of freedom for paired t-test
        df = len(data1) - 1

        # Effect size (Cohen's d for paired samples: mean difference / std of differences)
        cohens_d = mean_diff / std_diff

        # Interpretation using the supplied effect-size thresholds
        significant = bool(p_value < alpha)
        abs_d = abs(cohens_d)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": "Paired samples t-test",
            "t_statistic": t_stat,
            "p_value": p_value,
            "degrees_of_freedom": df,
            "cohens_d": cohens_d,
            "pooled_std": std_diff,  # For paired t-test, this is std of differences
            "group1_stats": desc1,
            "group2_stats": desc2,
            "significant": significant,
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }

    except Exception as e:
        # Tool boundary: callers expect an error dict rather than an exception.
        return {"error": f"Unexpected error in paired t-test: {str(e)}"}

def one_sample_t_test(
    dataframe: Optional[pd.DataFrame] = None,
    group_str: Optional[str] = None,
    population_mean: float = 0.0,
    alternative: str = "two-sided",
    alpha: float = 0.05,
    effect_thresholds: str = "0.2,0.5,0.8"
) -> Dict[str, Any]:
    """
    Accepts a single group of numeric data as comma-separated string or DataFrame column and performs a one-sample t-test
    against a known or hypothesized population mean. This test determines whether the sample mean differs significantly
    from the specified population mean. The test calculates a t-statistic quantifying how many standard errors the sample
    mean is away from the hypothesized population mean. Given an alternative hypothesis (sample ≠ population, sample < population,
    or sample > population), it outputs the p-value: the probability of observing this result (or more extreme) if the true
    population mean equals the hypothesized value. Results are statistically significant when p-value < alpha (typically 0.05).
    Cohen's d measures practical effect size, calculated as the difference between sample and population means divided by the
    sample standard deviation, with interpretation: |d| < 0.2 = negligible, 0.2-0.5 = small, 0.5-0.8 = medium, >0.8 = large
    (custom thresholds may be used).
    EXAMPLE USE CASES: testing if sample mean differs from known standard, quality control against specification,
    comparing sample performance against established benchmark.

    Failures are never raised; they are reported as ``{"error": "<message>"}``. A sample whose values are all
    identical is rejected, because its standard deviation is zero and both the t-statistic and Cohen's d are undefined.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing sample data in first column.
                                           If provided, group_str will be ignored.
        group_str (Optional[str]): Comma-separated string of numeric values for the sample.
                                   Example: "85.2,90.1,78.5,92.3" (test scores)
                                   Only used if dataframe is None or empty.
        population_mean (float): Hypothesized or known population mean to test against.
        alternative (str): Direction of the alternative hypothesis:
                          - "two-sided": sample mean ≠ population mean (different in either direction)
                          - "less": sample mean < population mean (sample is smaller)
                          - "greater": sample mean > population mean (sample is larger)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cohen's d effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.2,0.5,0.8" means: <0.2=negligible, 0.2-0.5=small, 0.5-0.8=medium, >0.8=large

    Returns:
        dict: Comprehensive test results with the following keys (all numbers are plain Python
        ``float``/``int``/``bool`` so the result can be serialised to JSON):
            - test_type (str): Always "One-sample t-test"
            - t_statistic (float): The calculated t-value measuring sample mean deviation from population mean
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (int): df = n - 1, where n is the sample size
            - cohens_d (float): Effect size measure. Positive means sample > population, negative means sample < population
            - pooled_std (float): Sample standard deviation (used in Cohen's d calculation)
            - group_stats (dict): Descriptive statistics for the sample (mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cohen's d magnitude
            - alternative_hypothesis (str): Echo of alternative parameter
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    try:
        # Parse effect size thresholds. Only conversion problems are caught here
        # (non-numeric text, or a non-string argument without .split).
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.2,0.5,0.8')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines).
        # A non-empty DataFrame always has at least one column, so no column-count check is needed.
        if dataframe is not None and not dataframe.empty:
            try:
                first_column = dataframe.iloc[:, 0]

                # Non-numeric cells become NaN and are dropped together with real missing values
                group = pd.to_numeric(first_column, errors='coerce').dropna().tolist()

                # Refuse to continue if coercion discarded more than half of the non-missing cells
                original_count = len(first_column.dropna())
                if len(group) < original_count * 0.5:
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(group)} out of {original_count} values could be converted to numbers."}
            except Exception as e:
                return {"error": f"Error processing DataFrame column: {str(e)}. Ensure column contains numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group_str and group_str.strip():
            try:
                group = parse_numeric_input(group_str)
            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}

        else:
            return {"error": "Please provide either a DataFrame with data OR a comma-separated string for the sample. Do not leave input empty."}

        # Validate extracted data
        if len(group) < 2:
            return {"error": f"Sample must have at least 2 observations. Found {len(group)} values."}

        # Reject unknown hypotheses with a clear message instead of a generic scipy failure
        if alternative not in ("two-sided", "less", "greater"):
            return {"error": f"Invalid alternative '{alternative}'. Use 'two-sided', 'less', or 'greater'."}

        data = np.asarray(group, dtype=float)

        # max == min is an exact test for "all values identical"; comparing the computed
        # standard deviation with 0 could miss it because of floating-point rounding.
        if data.max() == data.min():
            return {"error": "Sample has zero variance (all values are identical). The t-statistic and Cohen's d are undefined."}

        # Perform one-sample t-test
        t_stat, p_value = stats.ttest_1samp(data, population_mean, alternative=alternative)

        # Descriptive statistics (ddof=1 gives the sample standard deviation)
        sample_mean = float(np.mean(data))
        sample_std = float(np.std(data, ddof=1))
        n_obs = int(data.size)
        group_stats = {"mean": sample_mean, "std": sample_std, "n": n_obs}

        # Effect size (Cohen's d for one-sample: (sample_mean - population_mean) / sample_std)
        cohens_d = (sample_mean - population_mean) / sample_std

        # Interpretation against the (possibly custom) thresholds
        small_threshold, medium_threshold, large_threshold = thresholds
        abs_d = abs(cohens_d)
        if abs_d < small_threshold:
            effect_size_interp = "negligible"
        elif abs_d < medium_threshold:
            effect_size_interp = "small"
        elif abs_d < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": "One-sample t-test",
            "t_statistic": float(t_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": n_obs - 1,
            "cohens_d": float(cohens_d),
            "pooled_std": sample_std,  # Key name kept for symmetry with the two-sample tests
            "group_stats": group_stats,
            "significant": bool(p_value < alpha),
            "effect_size": effect_size_interp,
            "alternative_hypothesis": alternative,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }

    except Exception as e:
        # Tool boundary: callers expect an error dict rather than an exception
        return {"error": f"Unexpected error in one-sample t-test: {str(e)}"}


def one_way_anova(
    dataframe: Optional[pd.DataFrame] = None,
    groups_str: Optional[str] = None,
    alpha: float = 0.05,
    effect_thresholds: str = "0.01,0.06,0.14"
) -> Dict[str, Any]:
    """
    Accepts multiple groups of numeric data as semicolon-separated groups or DataFrame columns and performs a one-way ANOVA
    (Analysis of Variance). This test determines whether there are statistically significant differences between the means
    of three or more independent groups (two groups are also accepted). ANOVA tests the null hypothesis that all group means
    are equal against the alternative that at least one group mean differs from the others. The test calculates an F-statistic
    by comparing the variance between groups to the variance within groups. A significant result (p-value < alpha) indicates
    that at least one group differs, but does not identify which specific groups differ (post-hoc tests needed for pairwise
    comparisons). Eta-squared (η²) measures effect size as the proportion of total variance explained by group membership,
    with interpretation: η² < 0.01 = negligible, 0.01-0.06 = small, 0.06-0.14 = medium, >0.14 = large (custom thresholds may be used).
    EXAMPLE USE CASES: comparing means across multiple treatment conditions, testing differences between multiple demographic groups,
    evaluating performance across several experimental conditions.

    Failures are never raised; they are reported as ``{"error": "<message>"}``.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing group data in columns. All columns will be treated as separate groups.
                                           If provided, groups_str will be ignored.
        groups_str (Optional[str]): Multiple groups separated by semicolons, with each group containing comma-separated values.
                                   Example: "85.2,90.1,78.5;88.1,85.7,91.2;82.3,87.4,89.1" (3 groups with their respective values)
                                   Only used if dataframe is None or empty.
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining eta-squared effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.01,0.06,0.14" means: <0.01=negligible, 0.01-0.06=small, 0.06-0.14=medium, >0.14=large

    Returns:
        dict: Comprehensive test results with the following keys (all numbers are plain Python
        ``float``/``int``/``bool`` so the result can be serialised to JSON):
            - test_type (str): Always "One-way ANOVA"
            - f_statistic (float): The calculated F-value comparing between-group to within-group variance
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (dict): Contains df_between (groups-1) and df_within (total_n - groups)
            - eta_squared (float): Effect size measure (proportion of variance explained by groups);
              reported as 0 when the total sum of squares is 0
            - group_stats (List[dict]): Descriptive statistics for each group (group number, mean, std, n)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of eta-squared magnitude
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    try:
        # Parse effect size thresholds. Only conversion problems are caught here
        # (non-numeric text, or a non-string argument without .split).
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.01,0.06,0.14')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        groups = []

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use all columns as separate groups
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns for ANOVA. Found {len(dataframe.columns)} columns."}

            try:
                # items() walks the columns positionally, so duplicate labels are handled too
                for col_number, (_, column) in enumerate(dataframe.items(), start=1):
                    # Non-numeric cells become NaN and are dropped together with real missing values
                    group_data = pd.to_numeric(column, errors='coerce').dropna().tolist()

                    # Refuse to continue if coercion discarded more than half of the non-missing cells
                    original_count = len(column.dropna())
                    if len(group_data) < original_count * 0.5:
                        return {"error": f"Column {col_number} contains too many non-numeric values. Only {len(group_data)} out of {original_count} values could be converted to numbers."}

                    if len(group_data) < 2:
                        return {"error": f"Column {col_number} must have at least 2 observations. Found {len(group_data)} values."}

                    groups.append(group_data)

            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif groups_str and groups_str.strip():
            try:
                # Split by semicolon to separate groups, ignoring empty segments
                group_strings = [chunk.strip() for chunk in groups_str.split(';') if chunk.strip()]

                if len(group_strings) < 2:
                    return {"error": "ANOVA requires at least 2 groups. Please provide groups separated by semicolons (;)."}

                for group_number, group_text in enumerate(group_strings, start=1):
                    try:
                        group_data = parse_numeric_input(group_text)
                    except ValueError as e:
                        return {"error": f"String parsing error for group {group_number}: {str(e)}"}
                    if len(group_data) < 2:
                        return {"error": f"Group {group_number} must have at least 2 observations. Found {len(group_data)} values."}
                    groups.append(group_data)

            except Exception as e:
                return {"error": f"Error parsing groups string: {str(e)}. Use format 'group1_values;group2_values;group3_values' where each group contains comma-separated numbers."}

        else:
            return {"error": "Please provide either a DataFrame with data OR a semicolon-separated string of groups. Do not leave input empty."}

        # Validate we have enough groups
        if len(groups) < 2:
            return {"error": "ANOVA requires at least 2 groups. Please provide data for at least 2 groups."}

        numpy_groups = [np.asarray(group, dtype=float) for group in groups]

        # Perform ANOVA
        f_stat, p_value = stats.f_oneway(*numpy_groups)

        # Descriptive statistics for each group (ddof=1 gives the sample standard deviation)
        group_stats = [
            {
                "group": group_number,
                "mean": float(np.mean(values)),
                "std": float(np.std(values, ddof=1)),
                "n": int(values.size)
            }
            for group_number, values in enumerate(numpy_groups, start=1)
        ]

        # Effect size (eta-squared) = between-group sum of squares / total sum of squares
        all_data = np.concatenate(numpy_groups)
        overall_mean = np.mean(all_data)
        ss_total = float(np.sum((all_data - overall_mean) ** 2))
        ss_between = float(sum(values.size * (np.mean(values) - overall_mean) ** 2 for values in numpy_groups))

        # All observations identical -> no variance to explain; report 0 rather than dividing by zero
        eta_squared = ss_between / ss_total if ss_total > 0 else 0.0

        # Degrees of freedom
        df_between = len(numpy_groups) - 1
        df_within = int(all_data.size) - len(numpy_groups)

        # Interpretation using effect size thresholds
        small_threshold, medium_threshold, large_threshold = thresholds
        if eta_squared < small_threshold:
            effect_size_interp = "negligible"
        elif eta_squared < medium_threshold:
            effect_size_interp = "small"
        elif eta_squared < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        return {
            "test_type": "One-way ANOVA",
            "f_statistic": float(f_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": {"df_between": df_between, "df_within": df_within},
            "eta_squared": eta_squared,
            "group_stats": group_stats,
            "significant": bool(p_value < alpha),
            "effect_size": effect_size_interp,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }

    except Exception as e:
        # Tool boundary: callers expect an error dict rather than an exception
        return {"error": f"Unexpected error in one-way ANOVA: {str(e)}"}

def multi_way_anova(
    dataframe: Optional[pd.DataFrame] = None,
    dependent_var: Optional[str] = None,
    factors: Optional[str] = None,
    alpha: float = 0.05,
    effect_thresholds: str = "0.01,0.06,0.14",
    include_interactions: bool = True,
    max_interaction_order: Optional[int] = None,
    sum_squares_type: int = 2
) -> Dict[str, Any]:
    """
    Accepts multiple categorical factors and performs Multi-Way ANOVA to determine whether there are
    statistically significant differences between group means when multiple factors are involved simultaneously.
    Multi-way ANOVA extends the one-way ANOVA framework to handle complex experimental designs with multiple
    categorical independent variables (factors), each with two or more levels. Unlike one-way ANOVA which tests
    a single factor, multi-way ANOVA can simultaneously test: (1) main effects of each individual factor,
    (2) interaction effects between factors, and (3) higher-order interactions. The test uses F-statistics to
    compare variance between groups to variance within groups for each effect. Eta-squared (η²) measures effect
    size as the proportion of total variance explained by each factor and interaction, with interpretation:
    η² < 0.01 = negligible, 0.01-0.06 = small, 0.06-0.14 = medium, >0.14 = large (custom thresholds may be used).
    EXAMPLE USE CASES: 2-way ANOVA for treatment × gender effects on blood pressure, 3-way ANOVA for teaching
    method × school type × student age on test scores, 4-way ANOVA for drug × dose × gender × age effects on recovery.

    Failures are never raised; they are reported as ``{"error": "<message>"}``. The design must be complete:
    every combination of factor levels needs at least 2 observations, and at least 20 usable rows are required.
    Column names are inserted into the model formula verbatim, so they must be valid formula identifiers
    (no spaces or operators).

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing the experimental data with factors as columns
                                           and the dependent variable. All factors must be categorical.
                                           If provided, dependent_var and factors parameters are required.
        dependent_var (Optional[str]): Name of the dependent (outcome) variable column in the DataFrame.
                                      Must be a continuous numeric variable.
                                      Example: "test_score", "recovery_time", "blood_pressure"
        factors (Optional[str]): Comma-separated string of factor column names from the DataFrame.
                                Format: "factor1,factor2,factor3"
                                Example: "treatment,gender,age_group" for a 3-way ANOVA
                                Each factor must be categorical with 2 or more levels (at most 20).
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining eta-squared effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.01,0.06,0.14" means: <0.01=negligible, 0.01-0.06=small, 0.06-0.14=medium, >0.14=large
                               These follow Cohen's conventions for eta-squared interpretation.
        include_interactions (bool): Whether to include interaction terms in the model.
                                   True (default): Tests main effects AND interactions
                                   False: Tests only main effects (additive model)
        max_interaction_order (Optional[int]): Maximum order of interactions to include in the model.
                                             If None, includes all possible interactions up to the number of factors.
                                             Values above the number of factors are capped; values below 2 yield
                                             a main-effects-only model.
                                             Example: For 4 factors, setting to 2 includes only 2-way interactions.
                                             Useful for simplifying complex models with many factors.
        sum_squares_type (int): Type of sum of squares calculation for the ANOVA table.
                              Type 1: Sequential (depends on order of factors)
                              Type 2: Marginal (recommended for balanced designs, default)
                              Type 3: Partial (recommended for unbalanced designs)

    Returns:
        dict: Comprehensive test results with the following keys:
            - test_type (str): Description of the multi-way ANOVA performed (e.g., "3-way ANOVA with interactions (up to 3-way)")
            - anova_table (pd.DataFrame): Complete ANOVA table with sum of squares, F-statistics, p-values, etc.
            - significant_effects (List[str]): List of statistically significant main effects and interactions
            - effect_sizes (Dict[str, float]): Eta-squared values for each effect measuring proportion of variance explained.
              The 'Residual' row and the 'Intercept' row (present with Type 3 sums of squares) are not effects and are excluded.
            - effect_interpretations (Dict[str, str]): Categorical interpretation of each effect size ("negligible", "small", "medium", "large")
            - factor_summaries (Dict[str, dict]): Descriptive statistics for each factor level
            - model_summary (dict): Overall model statistics (R², F-statistic, AIC, BIC, etc.)
            - formula_used (str): The statsmodels formula string used for the analysis
            - design_summary (dict): Information about the experimental design (balanced/unbalanced, sample sizes)
            - alpha (float): Echo of significance level used
            - factors_analyzed (List[str]): Echo of factors included in the analysis
            - sum_squares_type (int): Echo of sum of squares type used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    try:
        # Parse effect size thresholds. Only conversion problems are caught here
        # (non-numeric text, or a non-string argument without .split).
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (ValueError, AttributeError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.01,0.06,0.14')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}
        small_threshold, medium_threshold, large_threshold = thresholds

        # Validate inputs
        if dataframe is None or dataframe.empty:
            return {"error": "DataFrame cannot be None or empty"}

        if not dependent_var:
            return {"error": "Dependent variable name is required"}

        if dependent_var not in dataframe.columns:
            return {"error": f"Dependent variable '{dependent_var}' not found in DataFrame columns"}

        if not factors:
            return {"error": "Factor names are required. Provide as comma-separated string (e.g., 'factor1,factor2,factor3')"}

        # Parse factors (AttributeError covers a non-string argument)
        try:
            factor_list = [f.strip() for f in factors.split(',') if f.strip()]
        except AttributeError:
            return {"error": "Invalid factors format. Use comma-separated factor names (e.g., 'treatment,gender,age_group')"}
        if len(factor_list) < 2:
            return {"error": "At least 2 factors are required for multi-way ANOVA"}

        # Check factors exist in DataFrame
        missing_factors = [f for f in factor_list if f not in dataframe.columns]
        if missing_factors:
            return {"error": f"Factors not found in DataFrame: {missing_factors}"}

        # Validate sum of squares type
        if sum_squares_type not in [1, 2, 3]:
            return {"error": "sum_squares_type must be 1, 2, or 3"}

        # Clean and prepare the data
        analysis_df = dataframe[[dependent_var] + factor_list].copy()

        # Remove rows with missing values
        initial_rows = len(analysis_df)
        analysis_df = analysis_df.dropna()
        final_rows = len(analysis_df)

        if final_rows < initial_rows * 0.5:
            return {"error": f"Too much missing data: only {final_rows} out of {initial_rows} rows usable"}

        if final_rows < 20:
            return {"error": f"Insufficient data after removing missing values: {final_rows} rows remaining (minimum 20 required)"}

        # Validate dependent variable is numeric
        try:
            analysis_df[dependent_var] = pd.to_numeric(analysis_df[dependent_var])
        except (ValueError, TypeError):
            return {"error": f"Dependent variable '{dependent_var}' must be numeric"}

        # Ensure factors are categorical and check levels
        factor_level_counts = {}
        for factor in factor_list:
            analysis_df[factor] = analysis_df[factor].astype('category')
            unique_levels = len(analysis_df[factor].cat.categories)
            factor_level_counts[factor] = unique_levels

            if unique_levels < 2:
                return {"error": f"Factor '{factor}' must have at least 2 levels. Found {unique_levels} level(s)"}

            if unique_levels > 20:
                return {"error": f"Factor '{factor}' has too many levels ({unique_levels}). Consider combining levels or using a different analysis method"}

        # Check for sufficient observations per factor combination
        try:
            # observed=True lists only the combinations that actually occur, regardless of the
            # pandas default for categorical groupers; empty cells are then the difference
            # between the full crossing and what was observed.
            cell_counts = analysis_df.groupby(factor_list, observed=True).size()
            total_combinations = int(np.prod(list(factor_level_counts.values())))
            observed_combinations = len(cell_counts)
            empty_cells = total_combinations - observed_combinations
            min_cell_size = int(cell_counts.min())

            # Report empty cells first: they are the more specific diagnosis
            if empty_cells > 0:
                return {"error": f"Missing data: {empty_cells} factor combinations have no observations"}

            if min_cell_size < 2:
                return {"error": f"Some factor combinations have fewer than 2 observations. Minimum cell size: {min_cell_size}"}

        except Exception as e:
            return {"error": f"Error checking experimental design: {str(e)}"}

        n_factors = len(factor_list)

        # Main effects are always included
        formula_terms = [f"C({factor})" for factor in factor_list]

        # Highest interaction order that actually enters the model (1 = main effects only)
        effective_order = 1
        if include_interactions:
            requested_order = n_factors if max_interaction_order is None else max_interaction_order
            effective_order = max(1, min(requested_order, n_factors))

            # Generate all interaction combinations of order 2..effective_order
            for order in range(2, effective_order + 1):
                for combination in itertools.combinations(factor_list, order):
                    formula_terms.append(":".join(f"C({factor})" for factor in combination))

        # Build the complete formula
        formula = f"{dependent_var} ~ " + " + ".join(formula_terms)

        # Fit the model
        try:
            model = ols(formula, data=analysis_df).fit()
        except Exception as e:
            return {"error": f"Model fitting failed: {str(e)}. This may indicate perfect multicollinearity or insufficient data variation"}

        # Generate ANOVA table
        try:
            anova_table = sm.stats.anova_lm(model, typ=sum_squares_type)
        except Exception as e:
            return {"error": f"ANOVA table generation failed: {str(e)}"}

        # Effect sizes (eta-squared) and significance.
        # Type 3 tables contain an 'Intercept' row; it is not an effect and its sum of squares
        # must not inflate the total, so it is removed before anything is computed.
        ss_table = anova_table.drop(index='Intercept', errors='ignore')
        total_ss = float(ss_table['sum_sq'].sum())

        effect_sizes = {}
        effect_interpretations = {}
        significant_effects = []
        for term, row in ss_table.iterrows():
            if term == 'Residual':
                continue

            # No variance at all -> nothing to explain; report 0 rather than dividing by zero
            eta_squared = float(row['sum_sq'] / total_ss) if total_ss > 0 else 0.0
            effect_sizes[term] = eta_squared

            if eta_squared < small_threshold:
                effect_interpretations[term] = "negligible"
            elif eta_squared < medium_threshold:
                effect_interpretations[term] = "small"
            elif eta_squared < large_threshold:
                effect_interpretations[term] = "medium"
            else:
                effect_interpretations[term] = "large"

            if row['PR(>F)'] < alpha:
                significant_effects.append(term)

        # Calculate factor summaries (mean / std / count of the outcome per factor level)
        factor_summaries = {}
        for factor in factor_list:
            factor_stats = analysis_df.groupby(factor, observed=True)[dependent_var].agg(['mean', 'std', 'count']).round(4)
            factor_summaries[factor] = factor_stats.to_dict('index')

        # Model summary statistics
        model_summary = {
            "r_squared": model.rsquared,
            "adj_r_squared": model.rsquared_adj,
            "f_statistic": model.fvalue,
            "f_pvalue": model.f_pvalue,
            "aic": model.aic,
            "bic": model.bic,
            "df_model": model.df_model,
            "df_resid": model.df_resid,
            "n_observations": int(model.nobs),
            "mse_resid": model.mse_resid
        }

        # Design summary
        design_summary = {
            "n_factors": n_factors,
            "factor_levels": factor_level_counts,
            "total_possible_combinations": total_combinations,
            "observed_combinations": observed_combinations,
            "is_balanced": len(cell_counts.unique()) == 1,  # All cells have same count
            "min_cell_size": min_cell_size,
            "max_cell_size": int(cell_counts.max()),
            "mean_cell_size": round(float(cell_counts.mean()), 2)
        }

        # Describe the model that was actually fitted
        test_description = f"{n_factors}-way ANOVA"
        if include_interactions and effective_order >= 2:
            test_description += f" with interactions (up to {effective_order}-way)"
        else:
            test_description += " (main effects only)"

        return {
            "test_type": test_description,
            "anova_table": anova_table,
            "significant_effects": significant_effects,
            "effect_sizes": effect_sizes,
            "effect_interpretations": effect_interpretations,
            "factor_summaries": factor_summaries,
            "model_summary": model_summary,
            "formula_used": formula,
            "design_summary": design_summary,
            "alpha": alpha,
            "factors_analyzed": factor_list,
            "sum_squares_type": sum_squares_type,
            "effect_thresholds": thresholds
        }

    except Exception as e:
        # Tool boundary: callers expect an error dict rather than an exception
        return {"error": f"Unexpected error in multi-way ANOVA: {str(e)}"}

def chi_square_test(
    dataframe: Optional[pd.DataFrame] = None,
    observed_str: Optional[str] = None,
    expected_str: Optional[str] = None,
    alpha: float = 0.05,
    effect_thresholds: str = "0.1,0.3,0.5"
) -> Dict[str, Any]:
    """
    Accepts observed frequencies (and optionally expected frequencies) as comma-separated strings or DataFrame columns
    and performs a chi-square goodness of fit test. This test determines whether observed categorical data frequencies
    differ significantly from expected frequencies. If no expected frequencies are provided, the test assumes equal
    distribution across all categories. The test calculates a chi-square statistic measuring the discrepancy between
    observed and expected frequencies. A significant result (p-value < alpha) indicates that the observed distribution
    differs from the expected distribution. Cramér's V measures effect size as the strength of association, with
    interpretation: V < 0.1 = negligible, 0.1-0.3 = small, 0.3-0.5 = medium, >0.5 = large (custom thresholds may be used).
    EXAMPLE USE CASES: testing if dice rolls follow uniform distribution, comparing observed vs expected sales across
    categories, analyzing survey response distributions.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing frequency data in first column (observed) and
                                           optionally second column (expected). If provided, string parameters will be ignored.
                                           Rows where both columns are blank/non-numeric are skipped; a row where only
                                           one of the two columns is numeric is reported as an error, because the
                                           observed/expected pairing would otherwise be shifted.
        observed_str (Optional[str]): Comma-separated string of observed frequencies.
                                     Example: "25,30,20,15" (frequencies for 4 categories)
                                     Only used if dataframe is None or empty.
        expected_str (Optional[str]): Comma-separated string of expected frequencies (optional).
                                     Example: "22.5,22.5,22.5,22.5" (equal distribution)
                                     If not provided, assumes equal distribution. Only used if dataframe is None or empty.
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining Cramér's V effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.1,0.3,0.5" means: <0.1=negligible, 0.1-0.3=small, 0.3-0.5=medium, >0.5=large

    Returns:
        dict: Comprehensive test results with the following keys (numeric values are plain Python
        float/int/bool). On invalid input the dict instead contains a single "error" key with a message.
            - test_type (str): Always "Chi-square goodness of fit test"
            - chi_square_statistic (float): The calculated chi-square value measuring discrepancy from expected
            - p_value (float): Probability of observing this result if null hypothesis is true
            - degrees_of_freedom (int): df = categories - 1
            - cramers_v (float): Effect size measure (strength of association)
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of Cramér's V magnitude
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
    """
    try:
        # Parse effect size thresholds; only parsing failures are treated as a format error
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (AttributeError, TypeError, ValueError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.1,0.3,0.5')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first column for observed, second column for expected (if available)
            if len(dataframe.columns) < 1:
                return {"error": f"DataFrame must have at least 1 column. Found {len(dataframe.columns)} columns."}

            try:
                # Convert first column to numeric (observed frequencies)
                first_column = dataframe.iloc[:, 0]
                col1_numeric = pd.to_numeric(first_column, errors='coerce')
                observed_valid = col1_numeric.notna()
                observed = col1_numeric[observed_valid].tolist()

                # Check if we lost too much data
                original_count1 = int(first_column.notna().sum())
                if len(observed) < original_count1 * 0.5:
                    return {"error": f"Column 1 contains too many non-numeric values. Only {len(observed)} out of {original_count1} values could be converted to numbers."}

                # Check for second column (expected frequencies)
                if len(dataframe.columns) >= 2:
                    col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')
                    expected_valid = col2_numeric.notna()
                    expected = col2_numeric[expected_valid].tolist()

                    if len(expected) != len(observed):
                        return {"error": "Observed and expected columns must have the same number of valid entries."}

                    # Equal counts are not enough: the gaps must be in the same rows,
                    # otherwise each observed value is compared with another category's expectation.
                    if (observed_valid.to_numpy() != expected_valid.to_numpy()).any():
                        return {"error": "Observed and expected columns must have numeric values in the same rows."}
                else:
                    # Calculate equal distribution (empty data is reported by the category-count check below)
                    total = sum(observed)
                    expected = [total / len(observed)] * len(observed) if observed else []

            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif observed_str and observed_str.strip():
            try:
                observed = parse_numeric_input(observed_str)

                if expected_str and expected_str.strip():
                    expected = parse_numeric_input(expected_str)
                    if len(observed) != len(expected):
                        return {"error": "Observed and expected must have the same number of categories."}
                else:
                    # Calculate equal distribution
                    total = sum(observed)
                    expected = [total / len(observed)] * len(observed)

            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}

        else:
            return {"error": "Please provide either a DataFrame with data OR a comma-separated string for observed frequencies. Do not leave input empty."}

        # Validate extracted data
        if len(observed) < 2:
            return {"error": f"Need at least 2 categories for chi-square test. Found {len(observed)} categories."}

        # Check for non-negative frequencies
        if any(x < 0 for x in observed) or any(x < 0 for x in expected):
            return {"error": "Frequencies cannot be negative."}

        # Check for zero expected frequencies (would divide by zero in the statistic)
        if any(x <= 0 for x in expected):
            return {"error": "Expected frequencies must be greater than zero."}

        # Perform chi-square goodness of fit test
        observed_array = np.array(observed)
        expected_array = np.array(expected)
        chi2_stat, p_value = stats.chisquare(observed_array, expected_array)

        # Degrees of freedom
        df = len(observed) - 1

        # Effect size (Cramér's V for goodness of fit); guarded against a zero denominator
        n = sum(observed)
        cramers_v = float(np.sqrt(chi2_stat / (n * df))) if df > 0 and n > 0 else 0.0

        # Interpretation using effect size thresholds
        significant = bool(p_value < alpha)
        small_threshold, medium_threshold, large_threshold = thresholds
        if cramers_v < small_threshold:
            effect_size_interp = "negligible"
        elif cramers_v < medium_threshold:
            effect_size_interp = "small"
        elif cramers_v < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        # Convert NumPy scalars to built-in types so the result is a plain dict
        return {
            "test_type": "Chi-square goodness of fit test",
            "chi_square_statistic": float(chi2_stat),
            "p_value": float(p_value),
            "degrees_of_freedom": df,
            "cramers_v": cramers_v,
            "significant": significant,
            "effect_size": effect_size_interp,
            "alpha": alpha,
            "effect_thresholds": thresholds
        }

    except Exception as e:
        return {"error": f"Unexpected error in chi-square test: {str(e)}"}


def correlation_test(
    dataframe: Optional[pd.DataFrame] = None,
    group1_str: Optional[str] = None,
    group2_str: Optional[str] = None,
    method: str = "pearson",
    alpha: float = 0.05,
    effect_thresholds: str = "0.1,0.3,0.5"
) -> Dict[str, Any]:
    """
    Accepts two variables as comma-separated strings or DataFrame columns and performs correlation analysis.
    This test determines the strength and direction of the linear relationship between two continuous variables.
    Pearson correlation measures linear relationships, Spearman correlation measures monotonic relationships
    (rank-based), and Kendall's tau is robust to outliers and suitable for small samples. The test calculates
    a correlation coefficient ranging from -1 (perfect negative correlation) to +1 (perfect positive correlation),
    with 0 indicating no linear relationship. A significant result (p-value < alpha) indicates that the observed
    correlation is statistically different from zero. Effect size interpretation: |r| < 0.1 = negligible,
    0.1-0.3 = small, 0.3-0.5 = medium, >0.5 = large (custom thresholds may be used).
    EXAMPLE USE CASES: examining relationship between height and weight, analyzing correlation between study time
    and test scores, investigating association between variables in research.

    Args:
        dataframe (Optional[pd.DataFrame]): DataFrame containing two variables in first two columns.
                                           If provided, group1_str and group2_str will be ignored.
                                           Rows where either value is missing or non-numeric are dropped.
        group1_str (Optional[str]): Comma-separated string of numeric values for the first variable (X).
                                   Example: "5.2,6.1,4.8,7.3" (hours studied)
                                   Only used if dataframe is None or empty.
        group2_str (Optional[str]): Comma-separated string of numeric values for the second variable (Y).
                                   Example: "78,85,72,92" (test scores)
                                   Only used if dataframe is None or empty.
        method (str): Correlation method to use:
                     - "pearson": Pearson product-moment correlation (linear relationships)
                     - "spearman": Spearman rank correlation (monotonic relationships)
                     - "kendall": Kendall's tau (robust to outliers, good for small samples)
        alpha (float): Significance level for the test (probability of Type I error). Reject null hypothesis if p_value below this threshold.
                      Common values: 0.05 (5%), 0.01 (1%), 0.10 (10%)
        effect_thresholds (str): Three comma-separated values defining correlation effect size boundaries.
                               Format: "small_threshold,medium_threshold,large_threshold"
                               Default "0.1,0.3,0.5" means: <0.1=negligible, 0.1-0.3=small, 0.3-0.5=medium, >0.5=large

    Returns:
        dict: Comprehensive test results with the following keys (numeric values are plain Python
        float/int/bool). On invalid input, including a variable with zero variance for which the
        correlation is undefined, the dict instead contains a single "error" key with a message.
            - test_type (str): Type of correlation test performed
            - correlation_coefficient (float): The calculated correlation coefficient (-1 to +1)
            - p_value (float): Probability of observing this result if null hypothesis (no correlation) is true
            - sample_size (int): Number of paired observations
            - significant (bool): True if p_value < alpha
            - effect_size (str): Categorical interpretation of correlation magnitude
            - method (str): Echo of correlation method used
            - alpha (float): Echo of significance level used
            - effect_thresholds (List[float]): Echo of effect size thresholds used
            - group1_stats (dict): Descriptive statistics for first variable (mean, std, n)
            - group2_stats (dict): Descriptive statistics for second variable (mean, std, n)
    """
    undefined_error = {"error": "Correlation is undefined because at least one variable has zero variance (all values are identical)."}

    try:
        # Parse effect size thresholds; only parsing failures are treated as a format error
        try:
            thresholds = [float(x.strip()) for x in effect_thresholds.split(',')]
        except (AttributeError, TypeError, ValueError):
            return {"error": "Invalid effect thresholds format. Use 'small,medium,large' (e.g., '0.1,0.3,0.5')"}
        if len(thresholds) != 3:
            return {"error": "Effect thresholds must be three comma-separated numbers (small,medium,large)"}

        # Method 1: DataFrame input (preferred for LLMs and data pipelines)
        if dataframe is not None and not dataframe.empty:
            # Use first two columns
            if len(dataframe.columns) < 2:
                return {"error": f"DataFrame must have at least 2 columns for correlation. Found {len(dataframe.columns)} columns."}

            try:
                # Convert to numeric, coercing errors to NaN
                col1_numeric = pd.to_numeric(dataframe.iloc[:, 0], errors='coerce')
                col2_numeric = pd.to_numeric(dataframe.iloc[:, 1], errors='coerce')

                # Keep only rows where both values are present so pairs stay aligned
                valid_mask = ~(col1_numeric.isna() | col2_numeric.isna())
                group1 = col1_numeric[valid_mask].tolist()
                group2 = col2_numeric[valid_mask].tolist()

                # Check if we lost too much data
                original_count = len(dataframe)
                if len(group1) < original_count * 0.5:
                    return {"error": f"Too many non-numeric values in the data. Only {len(group1)} out of {original_count} rows could be used."}

            except Exception as e:
                return {"error": f"Error processing DataFrame columns: {str(e)}. Ensure columns contain numeric data."}

        # Method 2: String input (preferred for humans and simple use cases)
        elif group1_str and group2_str and group1_str.strip() and group2_str.strip():
            try:
                group1 = parse_numeric_input(group1_str)
                group2 = parse_numeric_input(group2_str)

                if len(group1) != len(group2):
                    return {"error": f"Variables must have the same number of observations. Variable 1 has {len(group1)}, Variable 2 has {len(group2)}."}

            except ValueError as e:
                return {"error": f"String parsing error: {str(e)}"}

        else:
            return {"error": "Please provide either a DataFrame with data OR comma-separated strings for both variables. Do not leave inputs empty."}

        # Validate extracted data
        if len(group1) < 3:
            return {"error": f"Need at least 3 paired observations for correlation. Found {len(group1)} pairs."}

        # Choose correlation method
        correlation_methods = {
            "pearson": (stats.pearsonr, "Pearson correlation"),
            "spearman": (stats.spearmanr, "Spearman rank correlation"),
            "kendall": (stats.kendalltau, "Kendall's tau correlation"),
        }
        method_lower = method.lower()
        if method_lower not in correlation_methods:
            return {"error": "Method must be 'pearson', 'spearman', or 'kendall'"}
        correlate, test_name = correlation_methods[method_lower]

        data1 = np.array(group1)
        data2 = np.array(group2)

        # A constant variable has no variance, so the coefficient would be NaN.
        # NaN fails every "<" comparison below and would be mislabelled as a "large" effect.
        if np.all(data1 == data1[0]) or np.all(data2 == data2[0]):
            return undefined_error

        corr_coef, p_value = correlate(data1, data2)
        if np.isnan(corr_coef):
            return undefined_error

        # Calculate descriptive statistics (sample standard deviation, ddof=1)
        desc1 = {"mean": float(np.mean(data1)), "std": float(np.std(data1, ddof=1)), "n": len(data1)}
        desc2 = {"mean": float(np.mean(data2)), "std": float(np.std(data2, ddof=1)), "n": len(data2)}

        # Interpretation using effect size thresholds
        significant = bool(p_value < alpha)
        abs_corr = abs(corr_coef)
        small_threshold, medium_threshold, large_threshold = thresholds
        if abs_corr < small_threshold:
            effect_size_interp = "negligible"
        elif abs_corr < medium_threshold:
            effect_size_interp = "small"
        elif abs_corr < large_threshold:
            effect_size_interp = "medium"
        else:
            effect_size_interp = "large"

        # Convert NumPy scalars to built-in types so the result is a plain dict
        return {
            "test_type": test_name,
            "correlation_coefficient": float(corr_coef),
            "p_value": float(p_value),
            "sample_size": len(data1),
            "significant": significant,
            "effect_size": effect_size_interp,
            "method": method_lower,
            "alpha": alpha,
            "effect_thresholds": thresholds,
            "group1_stats": desc1,
            "group2_stats": desc2
        }

    except Exception as e:
        return {"error": f"Unexpected error in correlation test: {str(e)}"}

# SHARED UTILITY FUNCTIONS (Hidden from MCP)
def load_uploaded_file(file_path, has_header_flag):
    """
    Load an uploaded CSV/Excel file and return its first two columns.

    Args:
        file_path: Path of the uploaded file, or None when nothing is uploaded.
                   The extension (.csv, .xlsx, .xls) is matched case-insensitively.
        has_header_flag: Truthy when the first row of the file holds column names.

    Returns:
        tuple: (dataframe, preview)
            - (None, None) when file_path is None
            - (None, error DataFrame with a single 'Error' column) when the file is
              unsupported, has fewer than two columns, or fails to load
            - otherwise the numeric two-column DataFrame and its first 10 rows
    """
    if file_path is None:
        return None, None

    try:
        # Determine header parameter for pandas
        header_param = 0 if has_header_flag else None

        # Match the extension case-insensitively so files such as "DATA.CSV" are accepted
        lowered_path = str(file_path).lower()
        if lowered_path.endswith('.csv'):
            df = pd.read_csv(file_path, header=header_param)
        elif lowered_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path, header=header_param)
        else:
            return None, pd.DataFrame({'Error': ['Unsupported file format']})

        if len(df.columns) < 2:
            return None, pd.DataFrame({'Error': ['File must have at least 2 columns']})

        # Take only first two columns
        df_subset = df.iloc[:, :2].copy()

        # Set column names based on whether headers were detected
        if has_header_flag and not str(df_subset.columns[0]).startswith('Unnamed'):
            # Keep original column names if they exist and aren't auto-generated
            df_subset.columns = [str(name) for name in df_subset.columns]
        else:
            # Use default names
            df_subset.columns = ['Group1', 'Group2']

        # Convert columns to numeric, replacing non-numeric with NaN.
        # Building new columns (instead of assigning through iloc) gives them a numeric dtype.
        df_subset = df_subset.apply(pd.to_numeric, errors='coerce')

        # Remove rows where both values are NaN
        df_subset = df_subset.dropna(how='all')

        # Return full dataframe for processing and preview for display
        return df_subset, df_subset.head(10)
    except Exception as e:
        error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
        return None, error_df

def toggle_input_method(method):
    """
    Show exactly one of the two input sections.

    Returns a pair of Gradio updates (file section, text section): the file
    section is visible only when ``method`` is "File Upload", the text section
    in every other case.
    """
    file_selected = method == "File Upload"
    file_section_update = gr.update(visible=True if file_selected else False)
    text_section_update = gr.update(visible=False if file_selected else True)
    return file_section_update, text_section_update

def clear_all():
    """
    Return the default value of every two-sample form field.

    The tuple is ordered to match the outputs of the "Clear All" handler:
    input method, loaded DataFrame state, data preview, both group text boxes,
    alternative hypothesis, alpha, effect thresholds and the result panel.
    """
    form_defaults = {
        "input_method": "File Upload",
        "loaded_dataframe": None,
        "data_preview": None,
        "group1_str": "",
        "group2_str": "",
        "alternative": "two-sided",
        "alpha": 0.05,
        "effect_thresholds": "0.2,0.5,0.8",
        "output": {},
    }
    # Dicts preserve insertion order, so the values come out in handler-output order
    return tuple(form_defaults.values())

def load_example():
    """
    Provide a small built-in two-group dataset for demonstration.

    Returns a 5-tuple: the input method to select ("File Upload"), the example
    DataFrame, its preview (first 10 rows), and two empty strings that blank
    out the group text boxes.
    """
    treatment_scores = [85.2, 90.1, 78.5, 92.3, 88.7, 86.4, 89.2]
    control_scores = [88.1, 85.7, 91.2, 87.4, 89.3, 90.8, 86.9]

    example_df = pd.DataFrame({'Treatment': treatment_scores, 'Control': control_scores})
    return "File Upload", example_df, example_df.head(10), "", ""

# COMPONENT FACTORY FUNCTIONS
def create_input_components():
    """
    Build the data-input widgets shared by the two-sample test tabs.

    Creates, in layout order: the input-method radio, the file-upload group
    (upload widget, header checkbox, preview table; visible initially) and the
    text-input group (two comma-separated text boxes; hidden initially).

    Returns:
        tuple: (input_method, file_section, text_section, file_upload,
                has_header, data_preview, group1_str, group2_str)
    """
    # Radio that switches between the two sections below
    method_radio = gr.Radio(
        label="Choose Input Method",
        info="Select how you want to provide your data",
        choices=["File Upload", "Text Input"],
        value="File Upload"
    )

    # Section 1: file upload (shown by default)
    with gr.Group(visible=True) as upload_group:
        gr.Markdown("### File Upload")
        gr.Markdown("*Upload CSV or Excel file - first two columns will be used as Group 1 and Group 2*")

        with gr.Row():
            upload_widget = gr.File(
                type="filepath",
                file_types=[".csv", ".xlsx", ".xls"],
                label="Upload CSV/Excel File"
            )
            header_checkbox = gr.Checkbox(
                value=True,
                label="File has header row",
                info="Check if first row contains column names"
            )

        # Read-only preview of what was loaded
        preview_table = gr.Dataframe(
            row_count=5,
            interactive=False,
            label="Data Preview (first two columns)"
        )

    # Section 2: manual text entry (hidden by default)
    textbox_specs = [
        ("85.2,90.1,78.5,92.3,88.7", "Group 1 Data", "Comma-separated numbers (e.g., test scores for condition A)"),
        ("88.1,85.7,91.2,87.4,89.3", "Group 2 Data", "Comma-separated numbers (e.g., test scores for condition B)"),
    ]
    with gr.Group(visible=False) as text_group:
        gr.Markdown("### Text Input")
        gr.Markdown("*Enter comma-separated numbers for each group*")

        first_box, second_box = [
            gr.Textbox(placeholder=hint, label=caption, info=help_text)
            for hint, caption, help_text in textbox_specs
        ]

    return (
        method_radio,
        upload_group,
        text_group,
        upload_widget,
        header_checkbox,
        preview_table,
        first_box,
        second_box,
    )

def create_parameter_components():
    """
    Build the test-parameter widgets shared by the two-sample test tabs.

    Lays out, in a single row under a "Test Parameters" heading, the
    alternative-hypothesis dropdown, the significance-level number field and
    the effect-size threshold text box.

    Returns:
        tuple: (alternative, alpha, effect_thresholds)
    """
    gr.Markdown("### Test Parameters")

    with gr.Row():
        hypothesis_dropdown = gr.Dropdown(
            label="Alternative Hypothesis",
            info="two-sided: groups differ; less: group1 < group2; greater: group1 > group2",
            choices=["two-sided", "less", "greater"],
            value="two-sided"
        )
        alpha_field = gr.Number(
            label="Significance Level (α)",
            info="Probability threshold for statistical significance (typically 0.05)",
            value=0.05,
            minimum=0,
            maximum=1,
            step=0.01
        )
        thresholds_box = gr.Textbox(
            label="Effect Size Thresholds",
            info="Cohen's d boundaries: small,medium,large (Cohen's canonical values)",
            value="0.2,0.5,0.8"
        )

    return hypothesis_dropdown, alpha_field, thresholds_box

def create_t_test_tab(test_function, test_name, description):
    """
    Build one complete two-sample t-test tab: widgets, state and event wiring.

    Args:
        test_function: The statistical function to call (student_t_test or welch_t_test)
        test_name: Display name for the tab (e.g., "Student's T-Test")
        description: Markdown description to show at the top of the tab

    Returns:
        dict: The created components and state, keyed by name, for external reference
    """

    with gr.TabItem(test_name):
        gr.Markdown(description)

        # Shared widget groups: data input first, then test parameters
        inputs_bundle = create_input_components()
        (input_method, file_section, text_section, file_upload,
         has_header, data_preview, group1_str, group2_str) = inputs_bundle
        alternative, alpha, effect_thresholds = create_parameter_components()

        # Run / clear buttons side by side
        with gr.Row():
            run_button = gr.Button(f"Run {test_name}", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Result panel
        output = gr.JSON(label="Statistical Test Results")

        # Example loader
        with gr.Row():
            gr.Markdown("### Quick Examples")
            example_button = gr.Button("Load Example Data", variant="outline")

        # Holds the DataFrame parsed from the uploaded file (or the example)
        loaded_dataframe = gr.State(value=None)

        # ---- Event wiring -------------------------------------------------
        # UI-only helpers are registered with show_api=False to hide them from MCP.

        # Switch the visible input section
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[file_section, text_section],
            show_api=False
        )

        # Re-parse the file whenever the upload or the header checkbox changes
        for trigger in (file_upload, has_header):
            trigger.change(
                fn=load_uploaded_file,
                inputs=[file_upload, has_header],
                outputs=[loaded_dataframe, data_preview],
                show_api=False
            )

        # Main statistical call: wired directly and left visible so it is exposed to MCP
        test_inputs = [
            loaded_dataframe,   # dataframe
            group1_str,         # group1_str
            group2_str,         # group2_str
            alternative,        # alternative
            alpha,              # alpha
            effect_thresholds,  # effect_thresholds
        ]
        run_button.click(fn=test_function, inputs=test_inputs, outputs=output)

        # Reset every field to its default
        reset_targets = [
            input_method, loaded_dataframe, data_preview,
            group1_str, group2_str, alternative,
            alpha, effect_thresholds, output,
        ]
        clear_button.click(fn=clear_all, outputs=reset_targets, show_api=False)

        # Populate the form with the built-in example dataset
        example_targets = [input_method, loaded_dataframe, data_preview, group1_str, group2_str]
        example_button.click(fn=load_example, outputs=example_targets, show_api=False)

    # Hand the components back for external reference if needed
    return dict(
        input_method=input_method,
        file_upload=file_upload,
        has_header=has_header,
        data_preview=data_preview,
        group1_str=group1_str,
        group2_str=group2_str,
        alternative=alternative,
        alpha=alpha,
        effect_thresholds=effect_thresholds,
        run_button=run_button,
        clear_button=clear_button,
        example_button=example_button,
        output=output,
        loaded_dataframe=loaded_dataframe,
    )

def create_one_sample_t_test_tab():
    """
    Create a complete one-sample t-test tab with all components and handlers.

    The tab offers file upload or text input for a single sample, the test
    parameters (population mean, alternative, alpha, effect thresholds), run /
    clear / example buttons and a JSON result panel. The run button calls
    one_sample_t_test directly so that it stays exposed to MCP; all UI helpers
    are registered with show_api=False.
    """

    def load_sample_file(file_path, has_header_flag):
        """
        Load the first column of an uploaded CSV/Excel file.

        The shared two-group loader rejects files with fewer than two columns,
        which contradicts this tab's "first column will be used" promise, so a
        single-column loader is used here instead.
        NOTE(review): assumes one_sample_t_test reads only the first DataFrame
        column, as the upload hint states - confirm against that function.

        Returns (dataframe, preview); on failure (None, DataFrame with an 'Error' column).
        """
        if file_path is None:
            return None, None

        try:
            header_param = 0 if has_header_flag else None

            # Match the extension case-insensitively
            lowered_path = str(file_path).lower()
            if lowered_path.endswith('.csv'):
                df = pd.read_csv(file_path, header=header_param)
            elif lowered_path.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file_path, header=header_param)
            else:
                return None, pd.DataFrame({'Error': ['Unsupported file format']})

            if len(df.columns) < 1:
                return None, pd.DataFrame({'Error': ['File must have at least 1 column']})

            sample_df = df.iloc[:, [0]].copy()

            # Keep a real header name; otherwise fall back to a default label
            if has_header_flag and not str(sample_df.columns[0]).startswith('Unnamed'):
                sample_df.columns = [str(sample_df.columns[0])]
            else:
                sample_df.columns = ['Sample']

            # Non-numeric cells become NaN and are dropped
            sample_df = sample_df.apply(pd.to_numeric, errors='coerce').dropna(how='all')

            return sample_df, sample_df.head(10)
        except Exception as e:
            return None, pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})

    def clear_one_sample():
        """Return the default value of every form field, in clear-handler output order."""
        return (
            "File Upload",      # input_method
            None,               # loaded_dataframe
            None,               # data_preview
            "",                 # group_str
            0.0,                # population_mean
            "two-sided",        # alternative
            0.05,               # alpha
            "0.2,0.5,0.8",      # effect_thresholds
            {}                  # output
        )

    def load_one_sample_example():
        """Return example text data (and matching population mean) and switch to text input."""
        example_data = "100,105,98,102,97,103,99,101,96,104"
        return "Text Input", None, None, example_data, 100.0

    with gr.TabItem("One-Sample T-Test"):
        gr.Markdown("**Test a sample against a known population mean**")

        # Input method selector
        input_method = gr.Radio(
            choices=["File Upload", "Text Input"],
            value="File Upload",
            label="Choose Input Method",
            info="Select how you want to provide your data"
        )

        # File upload input section
        with gr.Group(visible=True) as file_section:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - first column will be used as sample data*")

            with gr.Row():
                file_upload = gr.File(
                    label="Upload CSV/Excel File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath"
                )
                has_header = gr.Checkbox(
                    label="File has header row",
                    value=True,
                    info="Check if first row contains column names"
                )

            # Display loaded data preview
            data_preview = gr.Dataframe(
                label="Data Preview (first column)",
                interactive=False,
                row_count=5
            )

        # Text input section
        with gr.Group(visible=False) as text_section:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter comma-separated numbers for your sample*")

            group_str = gr.Textbox(
                placeholder="85.2,90.1,78.5,92.3,88.7",
                label="Sample Data",
                info="Comma-separated numbers (e.g., test scores, measurements)"
            )

        # Test parameters
        gr.Markdown("### Test Parameters")
        with gr.Row():
            population_mean = gr.Number(
                value=0.0,
                label="Population Mean (μ₀)",
                info="Known or hypothesized population mean to test against"
            )
            alternative = gr.Dropdown(
                choices=["two-sided", "less", "greater"],
                value="two-sided",
                label="Alternative Hypothesis",
                info="two-sided: sample ≠ population; less: sample < population; greater: sample > population"
            )

        with gr.Row():
            alpha = gr.Number(
                value=0.05,
                minimum=0,
                maximum=1,
                step=0.01,
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)"
            )
            effect_thresholds = gr.Textbox(
                value="0.2,0.5,0.8",
                label="Effect Size Thresholds",
                info="Cohen's d boundaries: small,medium,large"
            )

        # Action buttons
        with gr.Row():
            run_button = gr.Button("Run One-Sample T-Test", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Statistical Test Results")

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: holds the DataFrame parsed from the uploaded file
        loaded_dataframe = gr.State(value=None)

        # EVENT HANDLERS
        # Toggle between input methods
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[file_section, text_section],
            show_api=False
        )

        # File upload handlers: re-parse when either the file or the header flag changes
        file_upload.change(
            fn=load_sample_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )

        has_header.change(
            fn=load_sample_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview],
            show_api=False
        )

        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
        run_button.click(
            fn=one_sample_t_test,
            inputs=[
                loaded_dataframe,    # dataframe
                group_str,           # group_str
                population_mean,     # population_mean
                alternative,         # alternative
                alpha,               # alpha
                effect_thresholds    # effect_thresholds
            ],
            outputs=output
        )

        # Clear form handler
        clear_button.click(
            fn=clear_one_sample,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                group_str, population_mean, alternative,
                alpha, effect_thresholds, output
            ],
            show_api=False
        )

        # Example data handler
        example_button.click(
            fn=load_one_sample_example,
            outputs=[input_method, loaded_dataframe, data_preview, group_str, population_mean],
            show_api=False
        )


def create_anova_tab():
    """Assemble the "One-Way ANOVA" tab and register its event handlers.

    The tab provides two input panels (file upload and free text), the test
    parameters (alpha and eta-squared thresholds), a JSON results panel and
    "Run" / "Clear All" / "Load Example Data" buttons. Components are created
    in whatever Gradio layout context is active when this is called
    (``create_t_test_interface`` calls it inside ``gr.Tabs()``).

    Returns:
        None. The components are registered with Gradio as a side effect.
    """
    # Defaults shared by component construction and the "Clear All" handler.
    initial_method = "File Upload"
    initial_alpha = 0.05
    initial_thresholds = "0.01,0.06,0.14"

    def clear_anova():
        """Return reset values, ordered to match the clear button's outputs."""
        return (
            initial_method,      # input_method
            None,                # loaded_dataframe
            None,                # data_preview
            "",                  # groups_str
            initial_alpha,       # alpha
            initial_thresholds,  # effect_thresholds
            {},                  # output
        )

    def load_anova_example():
        """Return a three-group sample and select the text input method."""
        sample_groups = ";".join([
            "85.2,90.1,78.5,92.3",
            "88.1,85.7,91.2,87.4",
            "82.3,87.4,89.1,83.7",
        ])
        # Order: input_method, loaded_dataframe, data_preview, groups_str
        return "Text Input", None, None, sample_groups

    with gr.TabItem("One-Way ANOVA"):
        gr.Markdown("**Compare means across multiple independent groups**")

        # --- Input method -------------------------------------------------
        input_method = gr.Radio(
            label="Choose Input Method",
            info="Select how you want to provide your data",
            choices=["File Upload", "Text Input"],
            value=initial_method,
        )

        # --- Panel: file upload (shown initially) -------------------------
        with gr.Group(visible=True) as upload_panel:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - each column will be treated as a separate group*")

            with gr.Row():
                file_upload = gr.File(
                    type="filepath",
                    file_types=[".csv", ".xlsx", ".xls"],
                    label="Upload CSV/Excel File",
                )
                has_header = gr.Checkbox(
                    value=True,
                    label="File has header row",
                    info="Check if first row contains column names",
                )

            # Read-only preview filled in by the file-loading handler.
            data_preview = gr.Dataframe(
                row_count=5,
                interactive=False,
                label="Data Preview (all columns as groups)",
            )

        # --- Panel: text input (hidden initially) -------------------------
        with gr.Group(visible=False) as text_panel:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter groups separated by semicolons (;) with comma-separated values within each group*")

            groups_str = gr.Textbox(
                label="Groups Data",
                info="Format: group1_values;group2_values;group3_values (e.g., treatment A;treatment B;control)",
                placeholder="85.2,90.1,78.5;88.1,85.7,91.2;82.3,87.4,89.1",
                lines=3,
            )

            gr.Markdown("**Example**: `85.2,90.1,78.5;88.1,85.7,91.2;82.3,87.4,89.1` represents 3 groups with their respective measurements")

        # --- Test parameters ----------------------------------------------
        gr.Markdown("### Test Parameters")
        with gr.Row():
            alpha = gr.Number(
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)",
                value=initial_alpha,
                minimum=0,
                maximum=1,
                step=0.01,
            )
            effect_thresholds = gr.Textbox(
                label="Effect Size Thresholds",
                info="Eta-squared (η²) boundaries: small,medium,large",
                value=initial_thresholds,
            )

        # --- Actions and results ------------------------------------------
        with gr.Row():
            run_button = gr.Button("Run One-Way ANOVA", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        results_view = gr.JSON(label="Statistical Test Results")

        gr.Markdown("""
        ### Post-Hoc Note
        If ANOVA shows significant differences (p < α), consider running post-hoc tests to identify which specific groups differ from each other.
        """)

        with gr.Row():
            gr.Markdown("### Quick Examples")
            example_button = gr.Button("Load Example Data", variant="outline")

        # Holds whatever load_uploaded_file returns as its first output.
        loaded_dataframe = gr.State(value=None)

        # --- Event wiring (registration order is kept deliberately) -------
        # The selected method is handed to toggle_input_method, whose two
        # outputs are applied to the upload and text panels.
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[upload_panel, text_panel],
            show_api=False,
        )

        # Reload the file when either the file or the header flag changes.
        for reload_trigger in (file_upload, has_header):
            reload_trigger.change(
                fn=load_uploaded_file,
                inputs=[file_upload, has_header],
                outputs=[loaded_dataframe, data_preview],
                show_api=False,
            )

        # MAIN STATISTICAL FUNCTION CALL - the only handler here that is not
        # hidden with show_api=False. Inputs are positional:
        # dataframe, groups_str, alpha, effect_thresholds.
        run_button.click(
            fn=one_way_anova,
            inputs=[loaded_dataframe, groups_str, alpha, effect_thresholds],
            outputs=results_view,
        )

        clear_button.click(
            fn=clear_anova,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                groups_str, alpha, effect_thresholds, results_view,
            ],
            show_api=False,
        )

        example_button.click(
            fn=load_anova_example,
            outputs=[input_method, loaded_dataframe, data_preview, groups_str],
            show_api=False,
        )

def create_multi_way_anova_tab():
    """Create the "Multi-Way ANOVA" tab with all components and handlers.

    The tab lets the user upload a CSV/Excel file (or load a generated
    example), pick the dependent variable from a dropdown of the file's
    columns, type the factor column names, and choose the analysis options.
    The "Run" button forwards everything to ``multi_way_anova``.

    The dropdown of column names is refreshed with ``gr.update(choices=...)``
    whenever data is loaded or cleared; returning a bare list to a Dropdown
    would set its *value*, not its *choices*.

    Components are created in whatever Gradio layout context is active when
    this is called (``create_t_test_interface`` calls it inside ``gr.Tabs()``).

    Returns:
        None. The components are registered with Gradio as a side effect.
    """

    with gr.TabItem("Multi-Way ANOVA"):
        gr.Markdown("**Compare means across multiple categorical factors simultaneously**")

        # Input method selector (file upload is the only offered method)
        input_method = gr.Radio(
            choices=["File Upload"],
            value="File Upload",
            label="Input Method",
            info="Multi-way ANOVA requires structured data - file upload recommended"
        )

        # File upload input section
        with gr.Group(visible=True) as file_section:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file with dependent variable and multiple categorical factors*")

            with gr.Row():
                file_upload = gr.File(
                    label="Upload CSV/Excel File",
                    file_types=[".csv", ".xlsx", ".xls"],
                    type="filepath"
                )
                has_header = gr.Checkbox(
                    label="File has header row",
                    value=True,
                    info="Check if first row contains column names"
                )

            # Display loaded data preview
            data_preview = gr.Dataframe(
                label="Data Preview",
                interactive=False,
                row_count=10
            )

        # Variable specification
        gr.Markdown("### Variable Specification")
        with gr.Row():
            # Choices start empty and are filled from the loaded data's columns.
            dependent_var = gr.Dropdown(
                label="Dependent Variable",
                info="Select the continuous outcome variable",
                interactive=True
            )
            factors = gr.Textbox(
                label="Factors (comma-separated)",
                placeholder="treatment,gender,age_group",
                info="Enter factor column names separated by commas",
                lines=2
            )

        # Advanced options
        gr.Markdown("### Analysis Options")
        with gr.Row():
            include_interactions = gr.Checkbox(
                label="Include Interactions",
                value=True,
                info="Test for interaction effects between factors"
            )
            max_interaction_order = gr.Number(
                label="Max Interaction Order",
                value=None,
                minimum=2,
                step=1,
                info="Maximum interaction order (leave empty for all interactions)"
            )

        with gr.Row():
            sum_squares_type = gr.Dropdown(
                choices=[1, 2, 3],
                value=2,
                label="Sum of Squares Type",
                info="Type 2 for balanced, Type 3 for unbalanced designs"
            )
            alpha = gr.Number(
                value=0.05,
                minimum=0,
                maximum=1,
                step=0.01,
                label="Significance Level (α)",
                info="Probability threshold for statistical significance"
            )

        with gr.Row():
            effect_thresholds = gr.Textbox(
                value="0.01,0.06,0.14",
                label="Effect Size Thresholds",
                info="Eta-squared boundaries: small,medium,large"
            )

        # Action buttons
        with gr.Row():
            run_button = gr.Button("Run Multi-Way ANOVA", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        # Output display
        output = gr.JSON(label="Multi-Way ANOVA Results")

        # Information section
        with gr.Accordion("Multi-Way ANOVA Information", open=False):
            gr.Markdown("""
            ### What is Multi-Way ANOVA?
            
            Multi-way ANOVA extends one-way ANOVA to handle multiple categorical factors simultaneously:
            
            **Main Effects**: How each factor independently affects the outcome
            **Interaction Effects**: How factors work together (non-additively)
            
            ### Example Designs:
            - **2-way**: Treatment (A,B,C) × Gender (Male,Female) → 6 combinations
            - **3-way**: Drug (A,B) × Dose (Low,High) × Age (Young,Old) → 8 combinations  
            - **4-way**: Method (A,B) × School (Public,Private) × Gender (M,F) × Grade (1st,2nd) → 16 combinations
            
            ### Requirements:
            - All factors must be categorical (not continuous)
            - Dependent variable must be continuous
            - At least 2 observations per factor combination
            - Independence, normality, and equal variances assumptions
            """)

        # Example data section
        with gr.Row():
            gr.Markdown("### Quick Examples")
            example_button = gr.Button("Load Example Data", variant="outline")

        # State management: the full loaded DataFrame (the preview only shows
        # the first rows).
        loaded_dataframe = gr.State(value=None)

        # Helper function to load and preview file data
        def load_multi_way_file(file_path, has_header_flag):
            """Read the uploaded file and refresh state, preview and dropdown.

            Args:
                file_path: Path of the uploaded file, or None when the upload
                    widget is empty.
                has_header_flag: True if the first row holds column names.

            Returns:
                Tuple of (DataFrame or None for the state, DataFrame or None
                for the preview, dropdown update). On failure the preview is a
                one-cell DataFrame carrying the error message and the dropdown
                is emptied.
            """
            # Empties the dropdown and deselects any stale column name.
            no_columns = gr.update(choices=[], value=None)

            if file_path is None:
                return None, None, no_columns

            try:
                # Determine header parameter
                header_param = 0 if has_header_flag else None

                # Compare on a lowercased path so ".CSV" / ".XLSX" also work.
                lowered_path = file_path.lower()
                if lowered_path.endswith('.csv'):
                    df = pd.read_csv(file_path, header=header_param)
                elif lowered_path.endswith(('.xlsx', '.xls')):
                    df = pd.read_excel(file_path, header=header_param)
                else:
                    return None, pd.DataFrame({'Error': ['Unsupported file format']}), no_columns

                # Set column names if no header
                if not has_header_flag:
                    df.columns = [f'Column_{i+1}' for i in range(len(df.columns))]

                column_options = list(df.columns)
                preview = df.head(15)

                # value=None: a previously selected column may no longer exist
                # (new file, or header flag toggled), so force a fresh choice.
                return df, preview, gr.update(choices=column_options, value=None)

            except Exception as e:
                # UI boundary: surface any read/parse failure in the preview
                # instead of crashing the handler.
                error_df = pd.DataFrame({'Error': [f"Failed to load file: {str(e)}"]})
                return None, error_df, no_columns

        # Clear form function
        def clear_multi_way():
            """Return reset values, ordered to match the clear button's outputs."""
            return (
                None,               # file_upload (otherwise a stale file stays visible)
                None,               # loaded_dataframe
                None,               # data_preview
                gr.update(choices=[], value=None),  # dependent_var
                "",                 # factors
                True,               # include_interactions
                None,               # max_interaction_order
                2,                  # sum_squares_type
                0.05,               # alpha
                "0.01,0.06,0.14",   # effect_thresholds
                {}                  # output
            )

        # Example data function
        def load_multi_way_example():
            """Generate a reproducible 3-factor example dataset.

            Builds 3 treatments x 2 genders x 2 age groups with 15 normally
            distributed scores per cell (sd 6), additive main effects and one
            treatment-by-age interaction.

            Returns:
                Tuple of (DataFrame for the state, preview DataFrame, dropdown
                update selecting 'test_score', factor names string).
            """
            # Local generator: same stream as np.random.seed(42) followed by
            # np.random.normal, but without reseeding the global NumPy RNG.
            rng = np.random.RandomState(42)

            base_score = 50
            n_per_cell = 15
            treatment_effects = {'Control': 0, 'Treatment_A': 8, 'Treatment_B': 12}
            gender_effects = {'Male': 3, 'Female': -3}
            age_effects = {'Young': 5, 'Old': -5}

            data = []
            for treatment, treatment_effect in treatment_effects.items():
                for gender, gender_effect in gender_effects.items():
                    for age, age_effect in age_effects.items():
                        # Add interaction: Treatment_B works better for older patients
                        interaction_effect = 0
                        if treatment == 'Treatment_B' and age == 'Old':
                            interaction_effect = 6

                        mean_score = (
                            base_score + treatment_effect + gender_effect
                            + age_effect + interaction_effect
                        )
                        scores = rng.normal(mean_score, 6, n_per_cell)

                        data.extend(
                            {
                                'test_score': round(score, 2),
                                'treatment': treatment,
                                'gender': gender,
                                'age_group': age
                            }
                            for score in scores
                        )

            df = pd.DataFrame(data)
            preview = df.head(15)
            column_options = list(df.columns)

            return (
                df,
                preview,
                gr.update(choices=column_options, value='test_score'),
                'treatment,gender,age_group'
            )

        # EVENT HANDLERS

        # File upload handlers: reload when the file or the header flag changes
        file_upload.change(
            fn=load_multi_way_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview, dependent_var],
            show_api=False
        )

        has_header.change(
            fn=load_multi_way_file,
            inputs=[file_upload, has_header],
            outputs=[loaded_dataframe, data_preview, dependent_var],
            show_api=False
        )

        # MAIN STATISTICAL FUNCTION CALL - Exposed to MCP!
        run_button.click(
            fn=multi_way_anova,
            inputs=[
                loaded_dataframe,       # dataframe
                dependent_var,          # dependent_var
                factors,                # factors
                alpha,                  # alpha
                effect_thresholds,      # effect_thresholds
                include_interactions,   # include_interactions
                max_interaction_order,  # max_interaction_order
                sum_squares_type        # sum_squares_type
            ],
            outputs=output
        )

        # Clear form handler. dependent_var appears once; its choices and
        # value are reset together through a single gr.update.
        clear_button.click(
            fn=clear_multi_way,
            outputs=[
                file_upload, loaded_dataframe, data_preview, dependent_var,
                factors, include_interactions, max_interaction_order,
                sum_squares_type, alpha, effect_thresholds, output
            ],
            show_api=False
        )

        # Example data handler. file_upload is intentionally NOT reset here:
        # clearing it would fire its change handler and wipe the example.
        example_button.click(
            fn=load_multi_way_example,
            outputs=[loaded_dataframe, data_preview, dependent_var, factors],
            show_api=False
        )

def create_chi_square_tab():
    """Assemble the "Chi-Square Test" (goodness of fit) tab and its handlers.

    The tab provides two input panels (file upload and free text for observed
    and optional expected frequencies), the test parameters (alpha and
    Cramér's V thresholds), a JSON results panel and "Run" / "Clear All" /
    "Load Example Data" buttons. Components are created in whatever Gradio
    layout context is active when this is called (``create_t_test_interface``
    calls it inside ``gr.Tabs()``).

    Returns:
        None. The components are registered with Gradio as a side effect.
    """
    # Defaults shared by component construction and the "Clear All" handler.
    initial_method = "File Upload"
    initial_alpha = 0.05
    initial_thresholds = "0.1,0.3,0.5"

    def clear_chi_square():
        """Return reset values, ordered to match the clear button's outputs."""
        return (
            initial_method,      # input_method
            None,                # loaded_dataframe
            None,                # data_preview
            "",                  # observed_str
            "",                  # expected_str
            initial_alpha,       # alpha
            initial_thresholds,  # effect_thresholds
            {},                  # output
        )

    def load_chi_square_example():
        """Return sample observed/expected counts and select text input."""
        # Order: input_method, loaded_dataframe, data_preview,
        #        observed_str, expected_str
        return "Text Input", None, None, "25,30,20,15", "22.5,22.5,22.5,22.5"

    with gr.TabItem("Chi-Square Test"):
        gr.Markdown("**Test if observed frequencies differ from expected frequencies**")

        # --- Input method -------------------------------------------------
        input_method = gr.Radio(
            label="Choose Input Method",
            info="Select how you want to provide your data",
            choices=["File Upload", "Text Input"],
            value=initial_method,
        )

        # --- Panel: file upload (shown initially) -------------------------
        with gr.Group(visible=True) as upload_panel:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - first column: observed frequencies, second column: expected frequencies (optional)*")

            with gr.Row():
                file_upload = gr.File(
                    type="filepath",
                    file_types=[".csv", ".xlsx", ".xls"],
                    label="Upload CSV/Excel File",
                )
                has_header = gr.Checkbox(
                    value=True,
                    label="File has header row",
                    info="Check if first row contains column names",
                )

            # Read-only preview filled in by the file-loading handler.
            data_preview = gr.Dataframe(
                row_count=5,
                interactive=False,
                label="Data Preview (observed and expected frequencies)",
            )

        # --- Panel: text input (hidden initially) -------------------------
        with gr.Group(visible=False) as text_panel:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter comma-separated frequency values*")

            observed_str = gr.Textbox(
                label="Observed Frequencies",
                info="Comma-separated observed frequencies for each category",
                placeholder="25,30,20,15",
            )

            expected_str = gr.Textbox(
                label="Expected Frequencies (Optional)",
                info="Comma-separated expected frequencies. Leave empty for equal distribution",
                placeholder="22.5,22.5,22.5,22.5",
            )

        # --- Test parameters ----------------------------------------------
        gr.Markdown("### Test Parameters")
        with gr.Row():
            alpha = gr.Number(
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)",
                value=initial_alpha,
                minimum=0,
                maximum=1,
                step=0.01,
            )
            effect_thresholds = gr.Textbox(
                label="Effect Size Thresholds",
                info="Cramér's V boundaries: small,medium,large",
                value=initial_thresholds,
            )

        # --- Actions and results ------------------------------------------
        with gr.Row():
            run_button = gr.Button("Run Chi-Square Test", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        results_view = gr.JSON(label="Statistical Test Results")

        with gr.Row():
            gr.Markdown("### Quick Examples")
            example_button = gr.Button("Load Example Data", variant="outline")

        # Holds whatever load_uploaded_file returns as its first output.
        loaded_dataframe = gr.State(value=None)

        # --- Event wiring (registration order is kept deliberately) -------
        # The selected method is handed to toggle_input_method, whose two
        # outputs are applied to the upload and text panels.
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[upload_panel, text_panel],
            show_api=False,
        )

        # Reload the file when either the file or the header flag changes.
        for reload_trigger in (file_upload, has_header):
            reload_trigger.change(
                fn=load_uploaded_file,
                inputs=[file_upload, has_header],
                outputs=[loaded_dataframe, data_preview],
                show_api=False,
            )

        # MAIN STATISTICAL FUNCTION CALL - the only handler here that is not
        # hidden with show_api=False. Inputs are positional:
        # dataframe, observed_str, expected_str, alpha, effect_thresholds.
        run_button.click(
            fn=chi_square_test,
            inputs=[loaded_dataframe, observed_str, expected_str, alpha, effect_thresholds],
            outputs=results_view,
        )

        clear_button.click(
            fn=clear_chi_square,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                observed_str, expected_str, alpha, effect_thresholds, results_view,
            ],
            show_api=False,
        )

        example_button.click(
            fn=load_chi_square_example,
            outputs=[input_method, loaded_dataframe, data_preview, observed_str, expected_str],
            show_api=False,
        )


def create_correlation_tab():
    """Assemble the "Correlation Test" tab and register its event handlers.

    The tab provides two input panels (file upload and free text for the two
    variables), the test parameters (correlation method, alpha and effect-size
    thresholds), a JSON results panel and "Run" / "Clear All" /
    "Load Example Data" buttons. Components are created in whatever Gradio
    layout context is active when this is called (``create_t_test_interface``
    calls it inside ``gr.Tabs()``).

    Returns:
        None. The components are registered with Gradio as a side effect.
    """
    # Defaults shared by component construction and the "Clear All" handler.
    initial_input_method = "File Upload"
    initial_corr_method = "pearson"
    initial_alpha = 0.05
    initial_thresholds = "0.1,0.3,0.5"

    def clear_correlation():
        """Return reset values, ordered to match the clear button's outputs."""
        return (
            initial_input_method,  # input_method
            None,                  # loaded_dataframe
            None,                  # data_preview
            "",                    # group1_str
            "",                    # group2_str
            initial_corr_method,   # method
            initial_alpha,         # alpha
            initial_thresholds,    # effect_thresholds
            {},                    # output
        )

    def load_correlation_example():
        """Return eight paired sample values and select text input."""
        sample_x = "5.2,6.1,4.8,7.3,5.9,6.8,4.5,7.1"
        sample_y = "78,85,72,92,81,89,70,88"
        # Order: input_method, loaded_dataframe, data_preview,
        #        group1_str, group2_str
        return "Text Input", None, None, sample_x, sample_y

    with gr.TabItem("Correlation Test"):
        gr.Markdown("**Analyze the relationship between two continuous variables**")

        # --- Input method -------------------------------------------------
        input_method = gr.Radio(
            label="Choose Input Method",
            info="Select how you want to provide your data",
            choices=["File Upload", "Text Input"],
            value=initial_input_method,
        )

        # --- Panel: file upload (shown initially) -------------------------
        with gr.Group(visible=True) as upload_panel:
            gr.Markdown("### File Upload")
            gr.Markdown("*Upload CSV or Excel file - first two columns will be used as the two variables*")

            with gr.Row():
                file_upload = gr.File(
                    type="filepath",
                    file_types=[".csv", ".xlsx", ".xls"],
                    label="Upload CSV/Excel File",
                )
                has_header = gr.Checkbox(
                    value=True,
                    label="File has header row",
                    info="Check if first row contains column names",
                )

            # Read-only preview filled in by the file-loading handler.
            data_preview = gr.Dataframe(
                row_count=5,
                interactive=False,
                label="Data Preview (first two columns as variables)",
            )

        # --- Panel: text input (hidden initially) -------------------------
        with gr.Group(visible=False) as text_panel:
            gr.Markdown("### Text Input")
            gr.Markdown("*Enter comma-separated values for each variable*")

            group1_str = gr.Textbox(
                label="Variable 1 (X)",
                info="Comma-separated numbers (e.g., hours studied, height, age)",
                placeholder="5.2,6.1,4.8,7.3,5.9",
            )

            group2_str = gr.Textbox(
                label="Variable 2 (Y)",
                info="Comma-separated numbers (e.g., test scores, weight, income)",
                placeholder="78,85,72,92,81",
            )

        # --- Test parameters ----------------------------------------------
        gr.Markdown("### Test Parameters")
        with gr.Row():
            method = gr.Dropdown(
                label="Correlation Method",
                info="pearson: linear relationships; spearman: monotonic; kendall: robust to outliers",
                choices=["pearson", "spearman", "kendall"],
                value=initial_corr_method,
            )
            alpha = gr.Number(
                label="Significance Level (α)",
                info="Probability threshold for statistical significance (typically 0.05)",
                value=initial_alpha,
                minimum=0,
                maximum=1,
                step=0.01,
            )

        with gr.Row():
            effect_thresholds = gr.Textbox(
                label="Effect Size Thresholds",
                info="Correlation coefficient boundaries: small,medium,large",
                value=initial_thresholds,
            )

        # --- Actions and results ------------------------------------------
        with gr.Row():
            run_button = gr.Button("Run Correlation Test", variant="primary", scale=1)
            clear_button = gr.Button("Clear All", variant="secondary", scale=1)

        results_view = gr.JSON(label="Statistical Test Results")

        with gr.Row():
            gr.Markdown("### Quick Examples")
            example_button = gr.Button("Load Example Data", variant="outline")

        # Holds whatever load_uploaded_file returns as its first output.
        loaded_dataframe = gr.State(value=None)

        # --- Event wiring (registration order is kept deliberately) -------
        # The selected method is handed to toggle_input_method, whose two
        # outputs are applied to the upload and text panels.
        input_method.change(
            fn=toggle_input_method,
            inputs=input_method,
            outputs=[upload_panel, text_panel],
            show_api=False,
        )

        # Reload the file when either the file or the header flag changes.
        for reload_trigger in (file_upload, has_header):
            reload_trigger.change(
                fn=load_uploaded_file,
                inputs=[file_upload, has_header],
                outputs=[loaded_dataframe, data_preview],
                show_api=False,
            )

        # MAIN STATISTICAL FUNCTION CALL - the only handler here that is not
        # hidden with show_api=False. Inputs are positional:
        # dataframe, group1_str, group2_str, method, alpha, effect_thresholds.
        run_button.click(
            fn=correlation_test,
            inputs=[loaded_dataframe, group1_str, group2_str, method, alpha, effect_thresholds],
            outputs=results_view,
        )

        clear_button.click(
            fn=clear_correlation,
            outputs=[
                input_method, loaded_dataframe, data_preview,
                group1_str, group2_str, method, alpha, effect_thresholds, results_view,
            ],
            show_api=False,
        )

        example_button.click(
            fn=load_correlation_example,
            outputs=[input_method, loaded_dataframe, data_preview, group1_str, group2_str],
            show_api=False,
        )

def create_t_test_interface():
    """Build the top-level Gradio app with one tab per statistical test.

    Tabs are created in this order: Student's, Welch's and paired t-tests
    (each via ``create_t_test_tab``), followed by the one-sample t-test,
    one-way ANOVA, multi-way ANOVA, chi-square and correlation tabs.

    Returns:
        The assembled ``gr.Blocks`` application (not yet launched).
    """
    with gr.Blocks(title="T-Test Analysis", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # Statistical Analysis MCP
        """)

        with gr.Tabs():
            # (test function, tab title, tab description) for the tabs that
            # share the generic two-group t-test layout.
            two_group_tests = (
                (
                    student_t_test,
                    "Student's T-Test",
                    "**t-test between independent groups assuming equal population variances**",
                ),
                (
                    welch_t_test,
                    "Welch's T-Test",
                    "**t-test between independent groups that does not assume equal population variances**",
                ),
                (
                    paired_t_test,
                    "Paired T-Test",
                    "**t-test between paired groups**",
                ),
            )
            for stat_fn, tab_title, tab_blurb in two_group_tests:
                create_t_test_tab(
                    test_function=stat_fn,
                    test_name=tab_title,
                    description=tab_blurb,
                )

            # Remaining tabs each have a dedicated builder; call order is the
            # order in which the tabs are created.
            dedicated_tab_builders = (
                create_one_sample_t_test_tab,
                create_anova_tab,
                create_multi_way_anova_tab,
                create_chi_square_tab,
                create_correlation_tab,
            )
            for build_tab in dedicated_tab_builders:
                build_tab()

    return demo

# Main execution
# Build and launch the app only when this file is run as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    demo = create_t_test_interface()
    # mcp_server=True is forwarded to Gradio's launch(); per the handler
    # comments in this file, the run-button functions are the ones meant to be
    # exposed over MCP (the other handlers are registered with show_api=False).
    demo.launch(mcp_server=True)