Machine Learning Model Pipeline
End-to-end ML pipeline with feature engineering, model training, validation, and deployment preparation.
Analysis Overview
This machine learning pipeline demonstrates the complete ML lifecycle, from data preprocessing to deployment preparation. It covers feature engineering, comparison of multiple algorithms, hyperparameter tuning, and production-ready model management, and it provides a framework for building, validating, and deploying models with an emphasis on reproducibility, performance monitoring, and model governance.
Project Objectives
- Demonstrate end-to-end machine learning pipeline development
- Implement advanced feature engineering and selection techniques
- Create comprehensive model evaluation and comparison frameworks
- Build production-ready model deployment and monitoring systems
- Establish ML best practices for reproducibility and governance
Analytical Goals
- Build accurate predictive models for business outcomes
- Identify key features driving model predictions
- Optimize model performance through systematic tuning
- Ensure model reliability through comprehensive validation
- Enable scalable model deployment and monitoring
Key Features
Business Value & Impact
Technical Highlights
- Comprehensive preprocessing pipeline with encoding and scaling (see the scikit-learn pipeline sketch after this list)
- Multiple algorithm comparison with performance metrics
- Automated hyperparameter optimization
- Feature engineering with interaction and polynomial terms
- Model persistence and loading for production deployment
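The preprocessing highlight above is implemented by hand inside the MLPipeline class in the Implementation section. For comparison, here is a minimal sketch of the same impute/encode/scale steps expressed with scikit-learn's ColumnTransformer and Pipeline; the column lists (num_cols, cat_cols) and the final estimator are illustrative assumptions, not part of the implementation below.

# Sketch (illustrative assumptions): equivalent preprocessing composed with scikit-learn pipelines.
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_cols = ['age', 'income']        # hypothetical numeric columns
cat_cols = ['region', 'plan_type']  # hypothetical categorical columns

preprocess = ColumnTransformer([
    # Numeric: median imputation followed by standardization
    ('num', Pipeline([('impute', SimpleImputer(strategy='median')),
                      ('scale', StandardScaler())]), num_cols),
    # Categorical: most-frequent imputation followed by one-hot encoding
    ('cat', Pipeline([('impute', SimpleImputer(strategy='most_frequent')),
                      ('encode', OneHotEncoder(handle_unknown='ignore'))]), cat_cols),
])

clf = Pipeline([('preprocess', preprocess),
                ('model', RandomForestClassifier(n_estimators=100, random_state=42))])
# clf.fit(X_train, y_train) then clf.predict(X_new) reapplies the identical transforms.

Wrapping the steps in a single Pipeline keeps the fitted imputers, encoder, and scaler attached to the model, so the transforms applied at inference are guaranteed to match the ones used in training.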
Implementation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib
from typing import Dict, Tuple, Any
import warnings
warnings.filterwarnings('ignore')
class MLPipeline:
    def __init__(self):
        self.models = {}
        self.scalers = {}
        self.encoders = {}
        self.feature_names = []
        self.target_name = ''
    def preprocess_data(self, df: pd.DataFrame, target_col: str) -> Tuple[pd.DataFrame, pd.Series]:
        """Comprehensive data preprocessing"""
        # Separate features and target
        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Identify column types
        numeric_cols = X.select_dtypes(include=[np.number]).columns
        categorical_cols = X.select_dtypes(include=['object']).columns

        # Fill numeric missing values with the median
        for col in numeric_cols:
            X[col] = X[col].fillna(X[col].median())

        # Fill categorical missing values with the mode (or 'Unknown' if no mode exists)
        for col in categorical_cols:
            X[col] = X[col].fillna(X[col].mode()[0] if not X[col].mode().empty else 'Unknown')

        # Encode categorical variables
        for col in categorical_cols:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))
            self.encoders[col] = le

        # Feature engineering
        X = self._create_features(X)

        # Store feature and target names for later use
        self.feature_names = X.columns.tolist()
        self.target_name = target_col

        return X, y
    def _create_features(self, X: pd.DataFrame) -> pd.DataFrame:
        """Create additional features"""
        numeric_cols = X.select_dtypes(include=[np.number]).columns

        # Interaction and polynomial features from the first two numeric columns
        if len(numeric_cols) >= 2:
            top_cols = numeric_cols[:2]
            X[f'{top_cols[0]}_x_{top_cols[1]}'] = X[top_cols[0]] * X[top_cols[1]]
            X[f'{top_cols[0]}_squared'] = X[top_cols[0]] ** 2

        # Binned versions of the first three numeric columns
        for col in numeric_cols[:3]:
            X[f'{col}_binned'] = pd.cut(X[col], bins=5, labels=False)

        return X
    def train_models(self, X: pd.DataFrame, y: pd.Series) -> Dict[str, Any]:
        """Train multiple models and compare performance"""
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Scale features (used only by the linear model; tree-based models get the raw features)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        self.scalers['standard'] = scaler

        # Define models
        models_to_train = {
            'logistic_regression': LogisticRegression(random_state=42, max_iter=1000),
            'random_forest': RandomForestClassifier(random_state=42, n_estimators=100),
            'gradient_boosting': GradientBoostingClassifier(random_state=42, n_estimators=100)
        }

        results = {}

        for name, model in models_to_train.items():
            print(f"Training {name}...")

            # Use scaled data for logistic regression, original features for tree-based models
            if name == 'logistic_regression':
                X_train_model = X_train_scaled
                X_test_model = X_test_scaled
            else:
                X_train_model = X_train
                X_test_model = X_test

            # Train model
            model.fit(X_train_model, y_train)

            # Make predictions (the probability column assumes a binary target)
            y_pred = model.predict(X_test_model)
            y_pred_proba = model.predict_proba(X_test_model)[:, 1] if hasattr(model, 'predict_proba') else None

            # Calculate metrics
            cv_scores = cross_val_score(model, X_train_model, y_train, cv=5, scoring='accuracy')

            model_results = {
                'model': model,
                'accuracy': model.score(X_test_model, y_test),
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'classification_report': classification_report(y_test, y_pred, output_dict=True),
                'confusion_matrix': confusion_matrix(y_test, y_pred).tolist()
            }

            if y_pred_proba is not None:
                model_results['roc_auc'] = roc_auc_score(y_test, y_pred_proba)

            # Feature importance (for tree-based models)
            if hasattr(model, 'feature_importances_'):
                feature_importance = dict(zip(self.feature_names, model.feature_importances_))
                model_results['feature_importance'] = dict(
                    sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
                )

            results[name] = model_results
            self.models[name] = model

        # Select the best model by held-out accuracy
        best_model_name = max(results.keys(), key=lambda k: results[k]['accuracy'])
        results['best_model'] = best_model_name
        results['best_accuracy'] = results[best_model_name]['accuracy']

        return results
    def hyperparameter_tuning(self, X: pd.DataFrame, y: pd.Series, model_name: str = 'random_forest') -> Dict:
        """Perform hyperparameter tuning for specified model"""
        if model_name == 'random_forest':
            model = RandomForestClassifier(random_state=42)
            param_grid = {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }
        elif model_name == 'gradient_boosting':
            model = GradientBoostingClassifier(random_state=42)
            param_grid = {
                'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7]
            }
        else:
            return {'error': f'Hyperparameter tuning not implemented for {model_name}'}

        # Perform grid search
        grid_search = GridSearchCV(
            model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1
        )
        grid_search.fit(X, y)

        return {
            'best_params': grid_search.best_params_,
            'best_score': grid_search.best_score_,
            'best_model': grid_search.best_estimator_
        }
    def save_model(self, model_name: str, filepath: str) -> bool:
        """Save trained model to disk"""
        try:
            if model_name in self.models:
                model_package = {
                    'model': self.models[model_name],
                    'scaler': self.scalers.get('standard'),
                    'encoders': self.encoders,
                    'feature_names': self.feature_names,
                    'target_name': self.target_name
                }
                joblib.dump(model_package, filepath)
                return True
            else:
                print(f"Model {model_name} not found")
                return False
        except Exception as e:
            print(f"Error saving model: {e}")
            return False

    def load_model(self, filepath: str) -> bool:
        """Load trained model from disk"""
        try:
            model_package = joblib.load(filepath)
            self.models['loaded'] = model_package['model']
            self.scalers['standard'] = model_package.get('scaler')
            self.encoders = model_package.get('encoders', {})
            self.feature_names = model_package.get('feature_names', [])
            self.target_name = model_package.get('target_name', '')
            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            return False
    def predict_new_data(self, new_data: pd.DataFrame, model_name: str = 'loaded') -> np.ndarray:
        """Make predictions on new data"""
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not found")
        model = self.models[model_name]

        # Preprocess new data with the same transformations used in training
        processed_data = new_data.copy()

        # Apply the fitted label encoders (categories unseen in training will raise a ValueError)
        for col, encoder in self.encoders.items():
            if col in processed_data.columns:
                processed_data[col] = encoder.transform(processed_data[col].astype(str))

        # Recreate engineered features (note: bin edges are recomputed on the new data)
        processed_data = self._create_features(processed_data)

        # Align columns with the training feature set
        processed_data = processed_data.reindex(columns=self.feature_names, fill_value=0)

        # Apply scaling for linear models that were trained on scaled features
        if isinstance(model, LogisticRegression) and self.scalers.get('standard') is not None:
            processed_data = self.scalers['standard'].transform(processed_data)

        # Make predictions
        return model.predict(processed_data)
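A minimal usage sketch of the class above follows. The synthetic dataset, its column names, the 'churned' target, and the model_package.joblib path are illustrative assumptions, not part of the original implementation.

# Usage sketch (hypothetical data): end-to-end run of the pipeline defined above.
if __name__ == '__main__':
    rng = np.random.default_rng(42)
    demo = pd.DataFrame({
        'age': rng.integers(18, 70, size=500),
        'income': rng.normal(50_000, 15_000, size=500),
        'region': rng.choice(['north', 'south', 'east', 'west'], size=500),
    })
    demo['churned'] = (demo['income'] < 45_000).astype(int)  # hypothetical binary target

    pipeline = MLPipeline()
    X, y = pipeline.preprocess_data(demo, target_col='churned')

    results = pipeline.train_models(X, y)
    print(f"Best model: {results['best_model']} (accuracy={results['best_accuracy']:.3f})")

    tuned = pipeline.hyperparameter_tuning(X, y, model_name='random_forest')
    print(f"Best random forest params: {tuned['best_params']}")

    # Persist the best model, then reload it into a fresh pipeline and score new rows
    pipeline.save_model(results['best_model'], 'model_package.joblib')
    scorer = MLPipeline()
    scorer.load_model('model_package.joblib')
    print(scorer.predict_new_data(demo.drop(columns=['churned']).head(10)))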
Analysis Details
- Estimated Time: 10-12 hours
- Skill Level: Senior ML Engineer
- Language: Python
Use Cases
- Customer churn prediction and retention modeling
- Sales forecasting and demand planning
- Risk assessment and fraud detection
- Recommendation systems and personalization
- Quality control and predictive maintenance
Related Examples
- Pandas Data Analysis & Visualization
- Tableau Dashboard Automation
- Advanced Statistical Analysis