Advanced Statistical Analysis
Comprehensive statistical analysis toolkit with hypothesis testing, regression analysis, and predictive modeling.
Analysis Overview
This comprehensive statistical analysis framework demonstrates advanced statistical methods and their practical application in business analytics. The toolkit provides a complete suite of statistical tests, regression models, and time series analysis capabilities. It showcases best practices in statistical inference, model validation, and result interpretation, making complex statistical concepts accessible for business decision-making.
Project Objectives
- Implement comprehensive statistical testing frameworks
- Demonstrate advanced regression and predictive modeling techniques
- Create automated statistical reporting and interpretation
- Build robust model validation and performance assessment tools
- Establish statistical best practices for business analytics
Analytical Goals
- Identify statistically significant business relationships
- Quantify the impact of business interventions
- Build predictive models for key business metrics
- Detect trends and seasonal patterns in business data
- Provide statistical validation for business hypotheses
Key Features
Business Value & Impact
Technical Highlights
- Comprehensive statistical test selection and validation
- Automated effect size calculation and interpretation
- Robust handling of different data types and distributions
- Model comparison and selection frameworks
- Time series decomposition and stationarity testing
Implementation
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, Tuple, List
class StatisticalAnalyzer:
    """Statistical analysis helpers bound to a single pandas DataFrame.

    The analyzer never modifies the DataFrame it is given. Every method
    returns a plain dictionary (or a string for the report) so results
    can be serialized or rendered by the caller.

    Attributes:
        data: The DataFrame under analysis.
        results: Empty dict reserved for callers that want to cache
            results; no method of this class writes to it.
    """

    # Significance level shared by every test in this class.
    _ALPHA = 0.05

    # Asymptotic Dickey-Fuller critical values for a test regression with
    # a constant and no trend. Finite-sample values are slightly more
    # negative, so decisions on short series are mildly liberal.
    _ADF_CRITICAL_VALUES = {'1%': -3.43, '5%': -2.86, '10%': -2.57}

    def __init__(self, data: pd.DataFrame):
        self.data = data
        self.results = {}

    def descriptive_statistics(self) -> Dict:
        """Summarize every numeric column of the dataset.

        Returns:
            Dict with keys 'basic_stats' (``describe()`` output),
            'skewness', 'kurtosis', 'correlation_matrix' and 'outliers'.
            'outliers' maps each numeric column to the 'count' and
            'percentage' of rows outside the 1.5 * IQR fences.
        """
        numeric = self.data.select_dtypes(include=[np.number])
        stats_summary = {
            'basic_stats': numeric.describe(),
            'skewness': numeric.skew(),
            'kurtosis': numeric.kurtosis(),
            'correlation_matrix': numeric.corr(),
        }

        n_rows = len(numeric)
        outliers = {}
        for col in numeric.columns:
            series = numeric[col]
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            # Tukey fences; the mask is built once and reused for both
            # the count and the percentage.
            outside = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
            count = int(outside.sum())
            outliers[col] = {
                'count': count,
                # An empty frame has no outliers rather than a 0/0 error.
                'percentage': count / n_rows * 100 if n_rows else 0.0,
            }
        stats_summary['outliers'] = outliers
        return stats_summary

    def hypothesis_testing(self, group_col: str, target_col: str, test_type: str = 'ttest') -> Dict:
        """Run a two-sample t-test, a one-way ANOVA or a chi-square test.

        Rows with a missing group label or a missing target value are
        ignored by the t-test and the ANOVA.

        Args:
            group_col: Column holding the group labels.
            target_col: Column holding the measured values (numeric for
                'ttest' / 'anova', categorical for 'chi2').
            test_type: One of 'ttest', 'anova' or 'chi2'.

        Returns:
            Dict with at least 'test_type', 'statistic', 'p_value' and
            'significant' (p < 0.05), plus test-specific entries.

        Raises:
            ValueError: If ``test_type`` is unknown, or the number of
                groups does not fit the requested test.
        """
        if test_type == 'chi2':
            contingency_table = pd.crosstab(self.data[group_col], self.data[target_col])
            statistic, p_value, dof, _expected = stats.chi2_contingency(contingency_table)
            return {
                'test_type': 'Chi-Square Test',
                'statistic': float(statistic),
                'p_value': float(p_value),
                'degrees_of_freedom': dof,
                'significant': bool(p_value < self._ALPHA),
                'contingency_table': contingency_table,
            }

        if test_type not in ('ttest', 'anova'):
            raise ValueError(
                f"Unsupported test_type {test_type!r}; expected 'ttest', 'anova' or 'chi2'"
            )

        # Keep only complete rows: a NaN label would create an empty group
        # and a NaN target would turn the test statistic into NaN.
        complete = self.data[group_col].notna() & self.data[target_col].notna()
        labels_col = self.data.loc[complete, group_col]
        target = self.data.loc[complete, target_col]
        labels = labels_col.unique()
        samples = [target[labels_col == label] for label in labels]

        if test_type == 'ttest':
            if len(labels) != 2:
                raise ValueError(
                    f"Independent t-test needs exactly 2 groups in {group_col!r}, "
                    f"found {len(labels)}"
                )
            group1, group2 = samples
            statistic, p_value = stats.ttest_ind(group1, group2)

            # Cohen's d with the df-weighted pooled standard deviation,
            # the same variance estimate the pooled t-test uses.
            n1, n2 = len(group1), len(group2)
            pooled_dof = n1 + n2 - 2
            if pooled_dof > 0:
                pooled_var = ((n1 - 1) * group1.var() + (n2 - 1) * group2.var()) / pooled_dof
            else:
                pooled_var = float('nan')
            mean_diff = group1.mean() - group2.mean()
            effect_size = float(mean_diff / np.sqrt(pooled_var)) if pooled_var > 0 else float('nan')

            return {
                'test_type': 'Independent T-Test',
                'statistic': float(statistic),
                'p_value': float(p_value),
                'significant': bool(p_value < self._ALPHA),
                'group1_mean': group1.mean(),
                'group2_mean': group2.mean(),
                'effect_size': effect_size,
            }

        # test_type == 'anova'
        if len(labels) < 2:
            raise ValueError(
                f"One-way ANOVA needs at least 2 groups in {group_col!r}, "
                f"found {len(labels)}"
            )
        statistic, p_value = stats.f_oneway(*samples)
        return {
            'test_type': 'One-Way ANOVA',
            'statistic': float(statistic),
            'p_value': float(p_value),
            'significant': bool(p_value < self._ALPHA),
            'group_means': {label: sample.mean() for label, sample in zip(labels, samples)},
        }

    def regression_analysis(self, target_col: str, feature_cols: List[str]) -> Dict:
        """Fit and compare a linear regression and a random forest.

        The data is split 80/20 with a fixed seed; both models are
        scored on the held-out 20%.

        Args:
            target_col: Column to predict.
            feature_cols: Columns used as predictors.

        Returns:
            Dict with 'linear_regression' (r2_score, rmse, coefficients,
            intercept), 'random_forest' (r2_score, rmse,
            feature_importance) and 'model_comparison' (best_model,
            performance_difference).
        """
        X = self.data[feature_cols]
        y = self.data[target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        def _fit_and_score(model) -> Dict:
            # Train on the training split, score on the held-out split.
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            return {
                'r2_score': r2_score(y_test, predictions),
                'rmse': np.sqrt(mean_squared_error(y_test, predictions)),
            }

        lr_model = LinearRegression()
        linear = _fit_and_score(lr_model)
        linear['coefficients'] = dict(zip(feature_cols, lr_model.coef_))
        linear['intercept'] = lr_model.intercept_

        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        forest = _fit_and_score(rf_model)
        forest['feature_importance'] = dict(zip(feature_cols, rf_model.feature_importances_))

        # Ties go to the random forest.
        linear_wins = linear['r2_score'] > forest['r2_score']
        return {
            'linear_regression': linear,
            'random_forest': forest,
            'model_comparison': {
                'best_model': 'Linear Regression' if linear_wins else 'Random Forest',
                'performance_difference': abs(linear['r2_score'] - forest['r2_score']),
            },
        }

    def time_series_analysis(self, date_col: str, value_col: str) -> Dict:
        """Describe a time series and test it for a unit root.

        The date column is parsed on a copy; the analyzer's DataFrame is
        left unchanged. Rows without a date are ignored.

        Args:
            date_col: Column convertible with ``pd.to_datetime``.
            value_col: Numeric column holding the observations.

        Returns:
            Dict with 'trend' ('increasing', 'decreasing' or 'flat',
            comparing the last and first non-missing observation),
            'volatility', 'mean', 'seasonal_decomposition' and
            'stationarity'. 'stationarity' holds 'adf_statistic',
            'p_value' (an upper bound read from the critical-value
            table: 0.01, 0.05, 0.10 or 1.0), 'is_stationary' (statistic
            below the 5% critical value) and 'critical_values'. When the
            statistic cannot be computed it is NaN and 'is_stationary'
            is False.

        Raises:
            ValueError: If the series has no non-missing observation.
        """
        timestamps = pd.DatetimeIndex(pd.to_datetime(self.data[date_col]), name=date_col)
        ts_data = pd.Series(
            self.data[value_col].to_numpy(), index=timestamps, name=value_col
        )
        ts_data = ts_data[ts_data.index.notna()].sort_index()

        observed = ts_data.dropna()
        if observed.empty:
            raise ValueError(f"Column {value_col!r} has no dated, non-missing observations")

        first, last = observed.iloc[0], observed.iloc[-1]
        if last > first:
            trend = 'increasing'
        elif last < first:
            trend = 'decreasing'
        else:
            trend = 'flat'

        results = {
            'trend': trend,
            'volatility': ts_data.std(),
            'mean': ts_data.mean(),
            'seasonal_decomposition': self._seasonal_decompose(ts_data),
        }

        # scipy.stats has no ADF test, so the test regression is run
        # directly with numpy and judged against tabulated critical values.
        adf_statistic = self._adf_statistic(observed.to_numpy(dtype=float))
        results['stationarity'] = {
            'adf_statistic': adf_statistic,
            'p_value': self._adf_p_value_bound(adf_statistic),
            'is_stationary': bool(adf_statistic < self._ADF_CRITICAL_VALUES['5%']),
            'critical_values': dict(self._ADF_CRITICAL_VALUES),
        }
        return results

    @staticmethod
    def _adf_statistic(values: np.ndarray) -> float:
        """Return the augmented Dickey-Fuller t-statistic of ``values``.

        Fits ``dy[t] = a + g * y[t-1] + sum_j b_j * dy[t-j] + e[t]`` by
        least squares and returns the t-ratio of ``g``. The lag count is
        ``floor((n - 1) ** (1/3))``, reduced when the series is too short
        to leave residual degrees of freedom.

        Returns NaN when the statistic is undefined: fewer than four
        observations, collinear regressors (for example a constant
        series) or a numerically perfect fit.
        """
        nan = float('nan')
        n_obs = len(values)
        if n_obs < 4:
            return nan

        n_lags = min(int((n_obs - 1) ** (1 / 3)), (n_obs - 4) // 2)
        diffs = np.diff(values)
        response = diffs[n_lags:]

        # Columns: intercept, lagged level, then one column per lagged
        # difference, all aligned with ``response``.
        columns = [np.ones(len(response)), values[n_lags:-1]]
        columns.extend(
            diffs[n_lags - lag:len(diffs) - lag] for lag in range(1, n_lags + 1)
        )
        design = np.column_stack(columns)

        dof = design.shape[0] - design.shape[1]
        coef, _, rank, _ = np.linalg.lstsq(design, response, rcond=None)
        if dof <= 0 or rank < design.shape[1]:
            return nan

        residuals = response - design @ coef
        sigma2 = float(residuals @ residuals) / dof
        # A residual variance at rounding-error scale makes the t-ratio
        # pure noise, so report it as undefined.
        noise_floor = np.finfo(float).eps * float(response @ response) / len(response)
        if sigma2 <= noise_floor:
            return nan

        level_variance = sigma2 * np.linalg.inv(design.T @ design)[1, 1]
        if not level_variance > 0:
            return nan
        return float(coef[1] / np.sqrt(level_variance))

    @classmethod
    def _adf_p_value_bound(cls, statistic: float) -> float:
        """Return the smallest tabulated level at which the test rejects.

        The result is an upper bound on the p-value (0.01, 0.05 or 0.10),
        1.0 when the statistic is above every critical value, or NaN when
        the statistic itself is NaN.
        """
        if np.isnan(statistic):
            return float('nan')
        for label, level in (('1%', 0.01), ('5%', 0.05), ('10%', 0.10)):
            if statistic < cls._ADF_CRITICAL_VALUES[label]:
                return level
        return 1.0

    def _seasonal_decompose(self, ts_data: pd.Series) -> Dict:
        """Estimate trend and seasonal strength with a moving average.

        Args:
            ts_data: Series indexed by a DatetimeIndex.

        Returns:
            Dict with 'trend_strength' and 'seasonal_strength': the
            standard deviation of the centered moving-average trend, and
            of the per-calendar-month means of the detrended series,
            each divided by the standard deviation of the series. Both
            are 0.0 when the series has no positive standard deviation.
        """
        # At most 12 periods, a quarter of the series when shorter, and
        # never below 1 so very short series still work.
        window = max(1, min(12, len(ts_data) // 4))
        trend = ts_data.rolling(window=window, center=True).mean()

        detrended = ts_data - trend
        seasonal = detrended.groupby(detrended.index.month).mean()

        total_std = ts_data.std()
        if not total_std > 0:
            return {'trend_strength': 0.0, 'seasonal_strength': 0.0}
        return {
            'trend_strength': float(trend.std() / total_std),
            'seasonal_strength': float(seasonal.std() / total_std),
        }

    def generate_insights_report(self) -> str:
        """Build a plain-text summary of the descriptive statistics.

        Returns:
            A newline-joined report with the dataset shape, the current
            timestamp, the number of numeric variables, the number of
            variables with skewness above 1 and every pair of numeric
            columns whose absolute correlation exceeds 0.7.
        """
        desc_stats = self.descriptive_statistics()
        skewness = desc_stats['skewness']
        corr_matrix = desc_stats['correlation_matrix']

        report = [
            "=== STATISTICAL ANALYSIS REPORT ===",
            f"Dataset Shape: {self.data.shape}",
            f"Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}",
            "\n",
            "DESCRIPTIVE STATISTICS:",
            f"- Number of numeric variables: {len(desc_stats['basic_stats'].columns)}",
            f"- Variables with high skewness (>1): {len(skewness[skewness > 1])}",
        ]

        # Walk the upper triangle so each pair is reported once.
        columns = list(corr_matrix.columns)
        high_corr_pairs = []
        for i, left in enumerate(columns):
            for j in range(i + 1, len(columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:
                    high_corr_pairs.append(f"{left} - {columns[j]}: {corr_val:.3f}")

        if high_corr_pairs:
            report.append("\nHIGH CORRELATIONS (>0.7):")
            report.extend(f"- {pair}" for pair in high_corr_pairs)

        return "\n".join(report)
Analysis Details
Complexity Level
Estimated Time
8-10 hours
Skill Level
Senior Data Scientist
Language
Use Cases
- A/B testing and experimental design analysis
- Market research and customer behavior studies
- Quality control and process improvement
- Financial risk assessment and modeling
- Healthcare outcomes and clinical trial analysis
Related Examples
Pandas Data Analysis & Visualization
Comprehensive data analysis using pandas with statistical insights and advanced ...
Tableau Dashboard Automation
Python script to automate Tableau dashboard creation and data refresh using Tabl...
Data Pipeline Processor
A robust data processing pipeline with error handling, retry logic, and monitori...