Advanced Statistical Analysis

Comprehensive statistical analysis toolkit with hypothesis testing, regression analysis, and predictive modeling.

Analysis Overview

This framework demonstrates advanced statistical methods and their practical application in business analytics. The toolkit provides a complete suite of statistical tests, regression models, and time series analysis capabilities, and it showcases best practices in statistical inference, model validation, and result interpretation, making complex statistical concepts accessible for business decision-making.

Project Objectives

  • Implement comprehensive statistical testing frameworks
  • Demonstrate advanced regression and predictive modeling techniques
  • Create automated statistical reporting and interpretation
  • Build robust model validation and performance assessment tools
  • Establish statistical best practices for business analytics

Analytical Goals

  • Identify statistically significant business relationships
  • Quantify the impact of business interventions
  • Build predictive models for key business metrics
  • Detect trends and seasonal patterns in business data
  • Provide statistical validation for business hypotheses

Key Features

  • Comprehensive descriptive statistics with outlier detection
  • Hypothesis testing suite (t-tests, ANOVA, chi-square)
  • Multiple regression models with feature importance
  • Time series analysis and stationarity testing
  • Automated insights generation and reporting
  • Statistical significance interpretation and effect size calculation

Business Value & Impact

  • Enable evidence-based decision making through statistical rigor
  • Reduce the risk of false conclusions through proper hypothesis testing
  • Improve forecast accuracy through advanced modeling techniques
  • Quantify business impact with confidence intervals (see the sketch after this list)
  • Standardize statistical analysis processes across teams
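
To make the confidence-interval point concrete, here is a minimal sketch of a two-sample interval for a difference in means. The helper mean_diff_ci is illustrative rather than part of the toolkit below; it uses the Welch approximation, which does not assume equal variances:

import numpy as np
from scipy import stats

def mean_diff_ci(a, b, confidence: float = 0.95):
    """Welch confidence interval for the difference in group means."""
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    diff = a.mean() - b.mean()
    var_a, var_b = a.var(ddof=1) / len(a), b.var(ddof=1) / len(b)
    se = np.sqrt(var_a + var_b)
    # Welch-Satterthwaite degrees of freedom
    df = (var_a + var_b) ** 2 / (var_a ** 2 / (len(a) - 1) + var_b ** 2 / (len(b) - 1))
    t_crit = stats.t.ppf((1 + confidence) / 2, df)
    return diff - t_crit * se, diff + t_crit * se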

Technical Highlights

  • Comprehensive statistical test selection and validation
  • Automated effect size calculation and interpretation (sketched after this list)
  • Robust handling of different data types and distributions
  • Model comparison and selection frameworks
  • Time series decomposition and stationarity testing
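
As a concrete instance of the automated interpretation highlighted above, Cohen's d can be mapped to the conventional qualitative labels; a minimal sketch (the function name interpret_cohens_d is illustrative, not part of the class below):

def interpret_cohens_d(d: float) -> str:
    """Map Cohen's d to the conventional small/medium/large labels."""
    d = abs(d)
    if d < 0.2:
        return "negligible"
    if d < 0.5:
        return "small"
    if d < 0.8:
        return "medium"
    return "large"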

Implementation

import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from statsmodels.tsa.stattools import adfuller
from typing import Dict, List

class StatisticalAnalyzer:
    def __init__(self, data: pd.DataFrame):
        self.data = data
        self.results = {}
    
    def descriptive_statistics(self) -> Dict:
        """Generate comprehensive descriptive statistics"""
        numeric_cols = self.data.select_dtypes(include=[np.number]).columns
        
        stats_summary = {
            'basic_stats': self.data[numeric_cols].describe(),
            'skewness': self.data[numeric_cols].skew(),
            'kurtosis': self.data[numeric_cols].kurtosis(),
            'correlation_matrix': self.data[numeric_cols].corr()
        }
        
        # Outlier detection using the 1.5 * IQR rule
        outliers = {}
        for col in numeric_cols:
            Q1 = self.data[col].quantile(0.25)
            Q3 = self.data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Compute the outlier mask once and reuse it for count and percentage
            outlier_mask = (self.data[col] < lower_bound) | (self.data[col] > upper_bound)
            outliers[col] = {
                'count': int(outlier_mask.sum()),
                'percentage': outlier_mask.mean() * 100
            }
        
        stats_summary['outliers'] = outliers
        return stats_summary
    
    def hypothesis_testing(self, group_col: str, target_col: str, test_type: str = 'ttest') -> Dict:
        """Perform various hypothesis tests"""
        groups = self.data[group_col].unique()
        result = {}  # Stays empty if no test matches the given inputs
        
        if test_type == 'ttest' and len(groups) == 2:
            group1 = self.data[self.data[group_col] == groups[0]][target_col]
            group2 = self.data[self.data[group_col] == groups[1]][target_col]
            
            # Perform independent t-test
            statistic, p_value = stats.ttest_ind(group1, group2)
            
            # Cohen's d with a pooled standard deviation weighted by degrees of freedom
            n1, n2 = len(group1), len(group2)
            pooled_std = np.sqrt(((n1 - 1) * group1.var() + (n2 - 1) * group2.var()) / (n1 + n2 - 2))
            
            result = {
                'test_type': 'Independent T-Test',
                'statistic': statistic,
                'p_value': p_value,
                'significant': p_value < 0.05,
                'group1_mean': group1.mean(),
                'group2_mean': group2.mean(),
                'effect_size': (group1.mean() - group2.mean()) / pooled_std
            }
            
        elif test_type == 'anova' and len(groups) > 2:
            group_data = [self.data[self.data[group_col] == group][target_col] for group in groups]
            statistic, p_value = stats.f_oneway(*group_data)
            
            result = {
                'test_type': 'One-Way ANOVA',
                'statistic': statistic,
                'p_value': p_value,
                'significant': p_value < 0.05,
                'group_means': {group: self.data[self.data[group_col] == group][target_col].mean() for group in groups}
            }
        
        elif test_type == 'chi2':
            # Chi-square test for categorical variables
            contingency_table = pd.crosstab(self.data[group_col], self.data[target_col])
            statistic, p_value, dof, expected = stats.chi2_contingency(contingency_table)
            
            result = {
                'test_type': 'Chi-Square Test',
                'statistic': statistic,
                'p_value': p_value,
                'degrees_of_freedom': dof,
                'significant': p_value < 0.05,
                'contingency_table': contingency_table
            }
        
        return result
    
    def regression_analysis(self, target_col: str, feature_cols: List[str]) -> Dict:
        """Perform comprehensive regression analysis"""
        X = self.data[feature_cols]
        y = self.data[target_col]
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        results = {}
        
        # Linear Regression
        lr_model = LinearRegression()
        lr_model.fit(X_train, y_train)
        lr_pred = lr_model.predict(X_test)
        
        results['linear_regression'] = {
            'r2_score': r2_score(y_test, lr_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, lr_pred)),
            'coefficients': dict(zip(feature_cols, lr_model.coef_)),
            'intercept': lr_model.intercept_
        }
        
        # Random Forest Regression
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)
        rf_pred = rf_model.predict(X_test)
        
        results['random_forest'] = {
            'r2_score': r2_score(y_test, rf_pred),
            'rmse': np.sqrt(mean_squared_error(y_test, rf_pred)),
            'feature_importance': dict(zip(feature_cols, rf_model.feature_importances_))
        }
        
        # Model comparison
        results['model_comparison'] = {
            'best_model': 'Linear Regression' if results['linear_regression']['r2_score'] > results['random_forest']['r2_score'] else 'Random Forest',
            'performance_difference': abs(results['linear_regression']['r2_score'] - results['random_forest']['r2_score'])
        }
        
        return results
    
    def time_series_analysis(self, date_col: str, value_col: str) -> Dict:
        """Perform time series analysis and forecasting"""
        # Ensure date column is datetime
        self.data[date_col] = pd.to_datetime(self.data[date_col])
        ts_data = self.data.set_index(date_col)[value_col].sort_index()
        
        # Basic time series statistics
        results = {
            # Naive direction check: compares only the first and last observations
            'trend': 'increasing' if ts_data.iloc[-1] > ts_data.iloc[0] else 'decreasing',
            'volatility': ts_data.std(),
            'mean': ts_data.mean(),
            'seasonal_decomposition': self._seasonal_decompose(ts_data)
        }
        
        # Stationarity test (Augmented Dickey-Fuller, from statsmodels)
        adf_statistic, adf_p_value = adfuller(ts_data.dropna())[:2]
        results['stationarity'] = {
            'adf_statistic': adf_statistic,
            'p_value': adf_p_value,
            'is_stationary': adf_p_value < 0.05
        }
        
        return results
    
    def _seasonal_decompose(self, ts_data: pd.Series) -> Dict:
        """Simple seasonal decomposition"""
        # Calculate moving averages for trend
        window = max(2, min(12, len(ts_data) // 4))  # Window scales with data length; guard against very short series
        trend = ts_data.rolling(window=window, center=True).mean()
        
        # Calculate seasonal component (simplified)
        detrended = ts_data - trend
        seasonal = detrended.groupby(detrended.index.month).mean()
        
        return {
            'trend_strength': trend.std() / ts_data.std() if ts_data.std() > 0 else 0,
            'seasonal_strength': seasonal.std() / ts_data.std() if ts_data.std() > 0 else 0
        }
    
    def generate_insights_report(self) -> str:
        """Generate a comprehensive insights report"""
        report = []
        report.append("=== STATISTICAL ANALYSIS REPORT ===")
        report.append(f"Dataset Shape: {self.data.shape}")
        report.append(f"Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
        report.append("\n")
        
        # Add descriptive statistics summary
        desc_stats = self.descriptive_statistics()
        report.append("DESCRIPTIVE STATISTICS:")
        report.append(f"- Number of numeric variables: {len(desc_stats['basic_stats'].columns)}")
        report.append(f"- Variables with high skewness (|skew| > 1): {(desc_stats['skewness'].abs() > 1).sum()}")
        
        # Add correlation insights
        corr_matrix = desc_stats['correlation_matrix']
        high_corr_pairs = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:
                    high_corr_pairs.append(f"{corr_matrix.columns[i]} - {corr_matrix.columns[j]}: {corr_val:.3f}")
        
        if high_corr_pairs:
            report.append("\nHIGH CORRELATIONS (|r| > 0.7):")
            for pair in high_corr_pairs:
                report.append(f"- {pair}")
        
        return "\n".join(report)

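A minimal usage sketch on synthetic data, assuming the StatisticalAnalyzer class above; the column names group, marketing_spend, and revenue are purely illustrative:

# Hypothetical dataset for demonstration purposes only
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'group': rng.choice(['A', 'B'], size=500),
    'marketing_spend': rng.normal(100, 20, size=500),
})
df['revenue'] = 3 * df['marketing_spend'] + rng.normal(0, 15, size=500)

analyzer = StatisticalAnalyzer(df)
summary = analyzer.descriptive_statistics()
ttest = analyzer.hypothesis_testing('group', 'revenue', test_type='ttest')
regression = analyzer.regression_analysis('revenue', ['marketing_spend'])

print(f"t-test p-value: {ttest['p_value']:.4f}")
print(f"Linear regression R^2: {regression['linear_regression']['r2_score']:.3f}")
print(analyzer.generate_insights_report())
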
Analysis Details

Complexity Level: Expert

Estimated Time: 8-10 hours

Skill Level: Senior Data Scientist

Language: Python

Use Cases

  • A/B testing and experimental design analysis (see the sketch after this list)
  • Market research and customer behavior studies
  • Quality control and process improvement
  • Financial risk assessment and modeling
  • Healthcare outcomes and clinical trial analysis
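
For the A/B testing use case noted above, a conversion-rate comparison maps directly onto the toolkit's chi-square path; a minimal sketch with hypothetical counts and illustrative column names (variant, converted):

# Hypothetical A/B conversion data: roughly 12% vs. 15% conversion
ab_data = pd.DataFrame({
    'variant': ['A'] * 1000 + ['B'] * 1000,
    'converted': [0] * 880 + [1] * 120 + [0] * 850 + [1] * 150,
})

ab_result = StatisticalAnalyzer(ab_data).hypothesis_testing(
    'variant', 'converted', test_type='chi2'
)
print(f"Chi-square p-value: {ab_result['p_value']:.4f}, significant: {ab_result['significant']}")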