Statistics & Mathematics
Mathematical foundations for data science, machine learning, and statistical analysis.
Quick Start import numpy as np import scipy.stats as stats from sklearn.linear_model import LinearRegression
Descriptive Statistics
data = np.array([23, 45, 67, 32, 45, 67, 89, 12, 34, 56]) print(f"Mean: {np.mean(data):.2f}") print(f"Median: {np.median(data):.2f}") print(f"Std Dev: {np.std(data, ddof=1):.2f}") print(f"IQR: {np.percentile(data, 75) - np.percentile(data, 25):.2f}")
Hypothesis Testing
sample_a = [23, 45, 67, 32, 45] sample_b = [56, 78, 45, 67, 89] t_stat, p_value = stats.ttest_ind(sample_a, sample_b) print(f"T-statistic: {t_stat:.4f}, p-value: {p_value:.4f}")
if p_value < 0.05:
    print("Reject null hypothesis: significant difference")
else:
    print("Fail to reject null hypothesis")
Core Concepts 1. Probability Distributions import numpy as np import scipy.stats as stats import matplotlib.pyplot as plt
Normal Distribution
mu, sigma = 100, 15 normal_dist = stats.norm(loc=mu, scale=sigma) x = np.linspace(50, 150, 100)
PDF, CDF calculations
print(f"P(X < 85): {normal_dist.cdf(85):.4f}") print(f"P(X > 115): {1 - normal_dist.cdf(115):.4f}") print(f"95th percentile: {normal_dist.ppf(0.95):.2f}")
Binomial Distribution (discrete)
n, p = 100, 0.3 binom_dist = stats.binom(n=n, p=p) print(f"P(X = 30): {binom_dist.pmf(30):.4f}") print(f"P(X <= 30): {binom_dist.cdf(30):.4f}")
Poisson Distribution (events per time)
lambda_param = 5 poisson_dist = stats.poisson(mu=lambda_param) print(f"P(X = 3): {poisson_dist.pmf(3):.4f}")
Central Limit Theorem demonstration
population = np.random.exponential(scale=10, size=100000) sample_means = [np.mean(np.random.choice(population, 30)) for _ in range(1000)] print(f"Sample means are approximately normal: mean={np.mean(sample_means):.2f}")
- Hypothesis Testing Framework from scipy import stats import numpy as np
class HypothesisTest: """Framework for statistical hypothesis testing."""
@staticmethod
def two_sample_ttest(group_a, group_b, alpha=0.05):
"""Independent samples t-test."""
t_stat, p_value = stats.ttest_ind(group_a, group_b)
effect_size = (np.mean(group_a) - np.mean(group_b)) / np.sqrt(
(np.var(group_a) + np.var(group_b)) / 2
)
return {
"t_statistic": t_stat,
"p_value": p_value,
"significant": p_value < alpha,
"effect_size_cohens_d": effect_size
}
@staticmethod
def chi_square_test(observed, expected=None, alpha=0.05):
"""Chi-square test for categorical data."""
if expected is None:
chi2, p_value, dof, expected = stats.chi2_contingency(observed)
else:
chi2, p_value = stats.chisquare(observed, expected)
dof = len(observed) - 1
return {
"chi2_statistic": chi2,
"p_value": p_value,
"degrees_of_freedom": dof,
"significant": p_value < alpha
}
@staticmethod
def ab_test_proportion(conversions_a, total_a, conversions_b, total_b, alpha=0.05):
"""Two-proportion z-test for A/B testing."""
p_a = conversions_a / total_a
p_b = conversions_b / total_b
p_pooled = (conversions_a + conversions_b) / (total_a + total_b)
se = np.sqrt(p_pooled * (1 - p_pooled) * (1/total_a + 1/total_b))
z_stat = (p_a - p_b) / se
p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
return {
"conversion_a": p_a,
"conversion_b": p_b,
"lift": (p_b - p_a) / p_a * 100,
"z_statistic": z_stat,
"p_value": p_value,
"significant": p_value < alpha
}
Usage
result = HypothesisTest.ab_test_proportion(
    conversions_a=120, total_a=1000,
    conversions_b=150, total_b=1000
)
print(f"Lift: {result['lift']:.1f}%, p-value: {result['p_value']:.4f}")
- Linear Algebra Essentials import numpy as np
Matrix operations
A = np.array([[1, 2], [3, 4]]) B = np.array([[5, 6], [7, 8]])
Basic operations
print("Matrix multiplication:", A @ B) print("Element-wise:", A * B) print("Transpose:", A.T) print("Inverse:", np.linalg.inv(A)) print("Determinant:", np.linalg.det(A))
Eigenvalues and eigenvectors (PCA foundation)
eigenvalues, eigenvectors = np.linalg.eig(A) print(f"Eigenvalues: {eigenvalues}")
Singular Value Decomposition (dimensionality reduction)
U, S, Vt = np.linalg.svd(A) print(f"Singular values: {S}")
Solving linear systems: Ax = b
b = np.array([5, 11]) x = np.linalg.solve(A, b) print(f"Solution: {x}")
Cosine similarity (NLP, recommendations)
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors *v1* and *v2*.

    Ranges from -1 (opposite) through 0 (orthogonal) to 1 (parallel).
    Common in NLP and recommendation systems.
    """
    numerator = np.dot(v1, v2)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    return numerator / denominator
vec1 = np.array([1, 2, 3]) vec2 = np.array([4, 5, 6]) print(f"Cosine similarity: {cosine_similarity(vec1, vec2):.4f}")
- Regression Analysis import numpy as np from sklearn.linear_model import LinearRegression, Ridge, Lasso from sklearn.metrics import r2_score, mean_squared_error import statsmodels.api as sm
Multiple Linear Regression with statsmodels
X = np.random.randn(100, 3)
y = 2*X[:, 0] + 3*X[:, 1] - X[:, 2] + np.random.randn(100)*0.5
X_with_const = sm.add_constant(X) model = sm.OLS(y, X_with_const).fit()
print(model.summary()) print(f"R-squared: {model.rsquared:.4f}") print(f"Coefficients: {model.params}") print(f"P-values: {model.pvalues}")
Regularization comparison
X_train, y_train = X[:80], y[:80] X_test, y_test = X[80:], y[80:]
models = { "OLS": LinearRegression(), "Ridge": Ridge(alpha=1.0), "Lasso": Lasso(alpha=0.1) }
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name}: R²={r2_score(y_test, y_pred):.4f}, RMSE={np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")
Tools & Technologies Tool Purpose Version (2025) NumPy Numerical computing 1.26+ SciPy Scientific computing 1.12+ pandas Data manipulation 2.2+ statsmodels Statistical models 0.14+ scikit-learn ML algorithms 1.4+ Troubleshooting Guide Issue Symptoms Root Cause Fix Low p-value, small effect Significant but meaningless Large sample size Check effect size High variance Unstable estimates Small sample, outliers More data, robust methods Multicollinearity Inflated coefficients Correlated features VIF check, remove features Heteroscedasticity Invalid inference Non-constant variance Weighted least squares Best Practices
✅ DO: Check assumptions before testing
from scipy.stats import shapiro

stat, p = shapiro(data)
if p > 0.05:
    print("Data is approximately normal")
✅ DO: Use effect sizes, not just p-values
✅ DO: Correct for multiple comparisons (Bonferroni)
✅ DO: Report confidence intervals
❌ DON'T: p-hack by trying many tests
❌ DON'T: Confuse correlation with causation
❌ DON'T: Ignore sample size requirements
Resources Khan Academy Statistics StatQuest with Josh Starmer "Introduction to Statistical Learning" (ISLR)
Skill Certification Checklist:
Can calculate descriptive statistics Can perform hypothesis tests (t-test, chi-square) Can implement A/B testing Can perform regression analysis Can use matrix operations for ML