Demo 2: Basic Classification with Synthetic Health Data 🏥¶
Learning Objectives 🎯¶
By the end of this demo, you will be able to:

1. Generate and visualize synthetic health data
2. Compare different classification algorithms
3. Evaluate model performance using various metrics
4. Interpret model decisions using feature importance
Setup and Imports 🛠️¶
# Data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# Model evaluation
from sklearn.metrics import (confusion_matrix, classification_report,
                             roc_curve, auc)
# Set random seed for reproducibility
np.random.seed(42)
1. Generate Synthetic Health Data 🧬¶
Let's create a synthetic dataset that mimics health measurements for diabetes risk prediction:
# Generate synthetic data
n_samples = 1000
# Generate features with more realistic distributions and correlations
age = np.random.normal(50, 15, n_samples) # Age: mean=50, std=15
bmi = np.random.normal(28, 5, n_samples) # BMI: mean=28, std=5
glucose = np.random.normal(100, 25, n_samples) # Glucose: mean=100, std=25
bp = np.random.normal(130, 15, n_samples) # Blood Pressure: mean=130, std=15
# Add some noise and interactions
noise = np.random.normal(0, 0.2, n_samples) # Random noise
age_bmi_interaction = (age/50) * (bmi/25) * 0.1 # Interaction effect
glucose_bp_interaction = (glucose/100) * (bp/120) * 0.1 # Another interaction
# Create more complex, noisy risk score
risk_score = (
    0.2 * stats.norm.cdf((age - 50) / 15) +       # Smoother age effect
    0.2 * stats.norm.cdf((bmi - 25) / 5) +        # Smoother BMI effect
    0.3 * stats.norm.cdf((glucose - 110) / 25) +  # Smoother glucose effect
    0.1 * stats.norm.cdf((bp - 130) / 15) +       # Smoother BP effect
    0.1 * age_bmi_interaction +                   # Add interaction effects
    0.1 * glucose_bp_interaction +
    noise                                         # Add random noise
)
# Convert to binary outcome with some randomness
probability = 1 / (1 + np.exp(-(risk_score - np.mean(risk_score)) * 3)) # Logistic function
y = (probability > np.random.uniform(0.3, 0.7, n_samples)).astype(int) # Random threshold
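# Optional sanity check (not part of the original recipe): compare the noisy
# labels against a fixed 0.5 cutoff to estimate how much label noise the
# random threshold injects
label_flip_rate = np.mean(y != (probability > 0.5).astype(int))
print(f"Labels flipped relative to a 0.5 cutoff: {label_flip_rate:.1%}")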
# Create DataFrame
df = pd.DataFrame({
    'Age': age,
    'BMI': bmi,
    'Glucose': glucose,
    'BloodPressure': bp,
    'DiabetesRisk': y
})
print("\nDataset Shape:", df.shape)
print("\nClass Distribution:")
print(df['DiabetesRisk'].value_counts(normalize=True))
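# Optional numeric companion to the plots below: mean of each feature by class
print("\nFeature means by class:")
print(df.groupby('DiabetesRisk')[['Age', 'BMI', 'Glucose', 'BloodPressure']].mean().round(1))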
# Visualize feature distributions by class
plt.figure(figsize=(12, 8))
for i, feature in enumerate(['Age', 'BMI', 'Glucose', 'BloodPressure']):
    plt.subplot(2, 2, i + 1)
    sns.kdeplot(data=df, x=feature, hue='DiabetesRisk')
    plt.title(f'{feature} Distribution by Risk')
plt.tight_layout()
plt.show()
# Create correlation heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(df.corr(), annot=True, cmap='RdBu_r', center=0)
plt.title('Feature Correlations')
plt.show()
2. Prepare Data for Modeling 📊¶
# Split features and target
X = df.drop('DiabetesRisk', axis=1)
y = df['DiabetesRisk']
# Split into train and test sets first
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit the scaler on the training data only, then apply it to both sets.
# Fitting on the full dataset before splitting would leak test-set
# information into training.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X.columns, index=X_test.index
)
print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)
3. Train and Compare Models 🤖¶
Let's compare three different classifiers:
# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42)
}
# Train and evaluate each model
results = {}
plt.figure(figsize=(10, 8))
for name, model in models.items():
    # Train model
    model.fit(X_train_scaled, y_train)
    # Get predictions and probabilities
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]
    # Calculate ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')
    # Store results
    results[name] = {
        'predictions': y_pred,
        'probabilities': y_prob,
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred)
    }
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Models')
plt.legend(loc="lower right")
plt.show()
# Print classification reports
for name, result in results.items():
    print(f"\n{name} Results:")
    print(result['classification_report'])
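The confusion matrices stored in `results` can also be plotted directly; here is a minimal sketch using seaborn's heatmap (the side-by-side layout is just one option):

# Plot the stored confusion matrices side by side
plt.figure(figsize=(12, 4))
for i, (name, result) in enumerate(results.items()):
    plt.subplot(1, 3, i + 1)
    sns.heatmap(result['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
    plt.title(name)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
plt.tight_layout()
plt.show()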
4. Feature Importance Analysis 🔍¶
Let's examine which features are most important for each model:
# Get feature importance for each model
plt.figure(figsize=(12, 4))
for i, (name, model) in enumerate(models.items()):
    plt.subplot(1, 3, i + 1)
    if name == 'Logistic Regression':
        importance = np.abs(model.coef_[0])
    else:
        importance = model.feature_importances_
    # Create importance DataFrame
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': importance
    }).sort_values('importance', ascending=True)
    # Plot horizontal bar chart using seaborn's barplot
    sns.barplot(data=importance_df, y='feature', x='importance')
    plt.title(f'{name}\nFeature Importance')
plt.tight_layout()
plt.show()
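Note that coefficient magnitudes and impurity-based importances are not directly comparable across model types. A model-agnostic alternative is permutation importance; below is a sketch using scikit-learn's `permutation_importance` on one of the models (the model choice and number of repeats are arbitrary):

from sklearn.inspection import permutation_importance

# Permutation importance: drop in test-set score when each feature is shuffled
perm = permutation_importance(
    models['Random Forest'], X_test_scaled, y_test,
    n_repeats=10, random_state=42
)
for feature, mean_imp in sorted(
        zip(X.columns, perm.importances_mean), key=lambda t: -t[1]):
    print(f"{feature}: {mean_imp:.3f}")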
5. Interactive Prediction Function 🎯¶
Let's create a function to make predictions for new patients:
def predict_diabetes_risk(age, bmi, glucose, bp, model=models['Random Forest']):
    """Make a diabetes risk prediction for a new patient."""
    # Build a single-row DataFrame so the scaler sees the same feature
    # names it was fitted with (a plain array triggers a warning)
    X_new = pd.DataFrame([[age, bmi, glucose, bp]], columns=X.columns)
    # Scale features with the scaler fitted on the training data
    X_new_scaled = pd.DataFrame(scaler.transform(X_new), columns=X.columns)
    # Get prediction and probability
    prediction = model.predict(X_new_scaled)[0]
    probability = model.predict_proba(X_new_scaled)[0][1]
    print("\nPatient Information:")
    print(f"Age: {age} years")
    print(f"BMI: {bmi:.1f}")
    print(f"Glucose: {glucose} mg/dL")
    print(f"Blood Pressure: {bp} mmHg")
    print(f"\nPrediction: {'High Risk' if prediction == 1 else 'Low Risk'}")
    print(f"Risk Probability: {probability:.1%}")
    return prediction, probability
# Example prediction
predict_diabetes_risk(
    age=65,
    bmi=32,
    glucose=140,
    bp=150
)
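Since the function accepts a `model` argument, we can also compare what each trained classifier says about the same patient:

# Compare all three models on the same hypothetical patient
for name, model in models.items():
    print(f"\n=== {name} ===")
    predict_diabetes_risk(age=65, bmi=32, glucose=140, bp=150, model=model)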
🧠 Comprehension Check¶
- Which model performed best? Why might that be?
- What are the most important features for predicting diabetes risk?
- How would you improve this model for real-world use?
🚀 Next Steps¶
- Try different feature combinations
- Experiment with model hyperparameters
- Add more health-related features
- Implement cross-validation (see the sketch after this list)
- Add confidence intervals to predictions
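As a starting point for the cross-validation item above, here is a minimal sketch using scikit-learn's `cross_val_score` (5 folds and ROC AUC scoring are arbitrary choices). Wrapping the scaler and model in a pipeline refits the scaler inside each fold, avoiding the leakage discussed in section 2:

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

# Pipeline so scaling is refit within each fold
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(n_estimators=100, random_state=42)
)
scores = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')
print(f"Cross-validated AUC: {scores.mean():.3f} ± {scores.std():.3f}")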