"""
Linear and Quadratic Discriminant Analysis (LDA/QDA) classifiers.
This module implements classical discriminant analysis methods:
- `linear_discriminant_analysis` (LDA): assumes shared covariance across classes.
Provides a low-dimensional discriminant projection and class posterior
probabilities under a Gaussian generative model with equal covariance.
- `quadratic_discriminant_analysis` (QDA): allows class-specific covariance
matrices with optional ridge-type regularization for numerical stability.
Utility:
`_quad_form_rows(X, A)` efficiently computes row-wise quadratic forms
xᵢᵀ A xᵢ using `numpy.einsum`, which is used by QDA.
Factory Aliases:
LinearDiscriminantAnalysis: Wrapper for `linear_discriminant_analysis`
via `ClassificationModel`.
QuadraticDiscriminantAnalysis: Wrapper for `quadratic_discriminant_analysis`
via `ClassificationModel`.
Typical usage example:
>>> from cleands.Classification.lda import LinearDiscriminantAnalysis
>>> model = LinearDiscriminantAnalysis(x, y)
>>> model.tidy
>>> model.glance
"""
from ..base import *
from ..utils import *
from functools import partial
import numpy as np
from typing import Optional
def _quad_form_rows(X: np.ndarray, A: np.ndarray) -> np.ndarray:
"""Compute row-wise quadratic forms xᵢᵀ A xᵢ.
Uses `numpy.einsum` for speed and numerical stability.
Args:
X (np.ndarray): Matrix of shape (n_samples, n_features) containing row
vectors xᵢ.
A (np.ndarray): Square matrix of shape (n_features, n_features).
Returns:
np.ndarray: Vector of length n_samples with values xᵢᵀ A xᵢ.
"""
# einsum is fast and stable for this pattern
return np.einsum("ij,jk,ik->i", X, A, X)
[docs]
class linear_discriminant_analysis(classification_model):
"""Linear Discriminant Analysis (LDA) classifier.
Fits class means and a pooled within-class covariance (shared across
classes) to derive a linear discriminant subspace and compute class
posterior probabilities under a Gaussian generative model.
Attributes:
mean_vectors (list[np.ndarray]): Per-class mean vectors of shape
(n_features, 1).
priors (np.ndarray): Class prior probabilities of shape (n_classes,).
Sigma_within (np.ndarray): Pooled within-class covariance matrix of
shape (n_features, n_features).
overall_mean (np.ndarray): Overall mean vector of shape (n_features, 1).
Sigma_between (np.ndarray): Between-class scatter matrix of shape
(n_features, n_features).
eigenvalues (np.ndarray): Eigenvalues from generalized eigenproblem
`inv(S_w) S_b`, sorted descending.
eigenvectors (np.ndarray): Top `n_classes - 1` eigenvectors forming the
discriminant projection matrix of shape (n_features, n_classes-1).
"""
def __init__(self, x: np.array, y: np.array) -> None:
"""Initialize and fit the LDA model.
Args:
x (np.ndarray or pd.DataFrame): Feature matrix (n_samples, n_features).
y (np.ndarray): Integer class labels (n_samples,).
"""
super(linear_discriminant_analysis, self).__init__(x, y)
self.mean_vectors = [x[y == i].mean(0).reshape(-1, 1) for i in range(self.n_classes)]
self.priors = itemprob(y)
self.Sigma_within = np.zeros((self.n_feat, self.n_feat))
for i, mean in enumerate(self.mean_vectors):
for row in x[y == i]:
row = row.reshape(-1, 1)
self.Sigma_within += (row - mean) @ (row - mean).T
self.overall_mean = x.mean(0).reshape(-1, 1)
self.Sigma_between = np.zeros((self.n_feat, self.n_feat))
for i, mean in enumerate(self.mean_vectors):
n_class = x[y == i, :].shape[0]
self.Sigma_between += n_class * (mean - self.overall_mean) @ (mean - self.overall_mean).T
self.eigenvalues, self.eigenvectors = np.linalg.eig(np.linalg.solve(self.Sigma_within, self.Sigma_between))
self.eigenvalues = np.flip(self.eigenvalues)
self.eigenvectors = np.fliplr(self.eigenvectors)[:, :self.n_classes - 1]
self.eigenvectors = self.eigenvectors
[docs]
def discriminant(self, target: np.array) -> np.array:
"""Project data into the discriminant space.
Args:
target (np.ndarray or pd.DataFrame): Feature matrix (n_samples, n_features).
Returns:
np.ndarray: Discriminant scores of shape (n_samples, n_classes-1).
"""
return target @ self.eigenvectors
[docs]
def predict_proba(self, target: np.array) -> np.array:
"""Compute posterior probabilities for each class.
Implements the LDA log-posterior up to a constant and returns
softmax-normalized probabilities.
Args:
target (np.ndarray or pd.DataFrame): Feature matrix (n_samples, n_features).
Returns:
np.ndarray: Class probabilities of shape (n_samples, n_classes).
"""
outp = np.zeros((target.shape[0], self.n_classes))
Sw_inv = np.linalg.inv(self.Sigma_within)
for i, mean_vec in enumerate(self.mean_vectors):
for j, x in enumerate(target):
tmp = x.reshape(-1, 1) - mean_vec
outp[j, i] = -0.5 * tmp.T @ Sw_inv @ tmp + np.log(self.priors[i])
outp -= outp.max(1).reshape(-1, 1)
outp = np.exp(outp)
outp /= outp.sum(1).reshape(-1, 1)
return outp
[docs]
class quadratic_discriminant_analysis(classification_model):
"""Quadratic Discriminant Analysis (QDA) classifier.
QDA models each class with its own Gaussian distribution:
x | y = k ~ N(μ_k, Σ_k). Predictions use the quadratic log-density
with class-specific covariance, allowing non-linear decision boundaries.
Attributes:
classes (np.ndarray): Sorted unique class labels of shape (n_classes,).
n_classes (int): Number of classes.
priors (np.ndarray): Class prior probabilities of shape (n_classes,).
means (np.ndarray): Per-class mean vectors, shape (n_classes, n_features).
covs (np.ndarray): Per-class covariance matrices,
shape (n_classes, n_features, n_features).
inv_covs (np.ndarray): Inverses of covariance matrices, same shape as `covs`.
log_dets (np.ndarray): Log-determinants of covariance matrices, shape (n_classes,).
"""
def __init__(
self,
x: np.ndarray,
y: np.ndarray,
priors: Optional[np.ndarray] = None,
reg: float = 1e-6,
sample_weight: Optional[np.ndarray] = None,
) -> None:
"""Initialize and fit the QDA model.
Uses (optionally weighted) MLEs for class means and covariances with
ridge-type regularization to ensure positive definiteness.
Args:
x (np.ndarray or pd.DataFrame): Feature matrix (n_samples, n_features).
y (np.ndarray): Integer class labels (n_samples,).
priors (np.ndarray, optional): Class prior probabilities of shape
(n_classes,). If None, estimated from sample weights. Defaults to None.
reg (float, optional): Nonnegative regularization added to each
class covariance (λ I) for numerical stability. Defaults to 1e-6.
sample_weight (np.ndarray, optional): Nonnegative weights of shape
(n_samples,). If None, uses uniform weights. Defaults to None.
Raises:
ValueError: On invalid `sample_weight` shape or priors shape/values.
np.linalg.LinAlgError: If a class covariance is not PD even after
regularization.
"""
super().__init__(x, y)
# Validate sample weights
if sample_weight is None:
w = np.ones(self.n_obs, dtype=float)
else:
w = np.asarray(sample_weight, dtype=float)
if w.ndim != 1 or w.shape[0] != self.n_obs:
raise ValueError("sample_weight must be shape (n_obs,).")
if np.sum(w) <= 0:
raise ValueError("sample_weight must sum to a positive value.")
# Classes and encoded indices
classes, y_idx = np.unique(y, return_inverse=True)
self.classes = classes # store labels
self.n_classes = classes.size # <-- keep this an INT
k_classes = self.n_classes
# Priors
if priors is None:
cw = np.bincount(y_idx, weights=w, minlength=k_classes).astype(float)
priors = cw / cw.sum()
else:
priors = np.asarray(priors, dtype=float)
if priors.shape != (k_classes,):
raise ValueError(f"priors must be shape ({k_classes},)")
if np.any(priors < 0) or priors.sum() <= 0:
raise ValueError("priors must be nonnegative and sum to a positive value.")
priors = priors / priors.sum()
self.priors = priors
# Means, covariances, inverses, log determinants
self.means = np.zeros((k_classes, self.n_feat))
self.covs = np.zeros((k_classes, self.n_feat, self.n_feat))
self.inv_covs = np.zeros_like(self.covs)
self.log_dets = np.zeros(k_classes)
for k in range(k_classes):
mask = (y_idx == k)
wk = w[mask]
Xk = x[mask]
W = wk.sum()
if W <= 0:
raise ValueError(f"Class {classes[k]} has zero total weight.")
mu = np.average(Xk, axis=0, weights=wk)
self.means[k] = mu
Xc = Xk - mu
cov = (Xc.T * wk) @ Xc / W # MLE covariance
if reg > 0:
cov = cov + reg * np.eye(self.n_feat)
sign, logdet = np.linalg.slogdet(cov)
if sign <= 0:
cov = cov + max(reg, 1e-8) * np.eye(self.n_feat)
sign, logdet = np.linalg.slogdet(cov)
if sign <= 0:
raise np.linalg.LinAlgError(
f"Covariance for class {classes[k]} is not PD even after regularization."
)
self.covs[k] = cov
self.inv_covs[k] = np.linalg.inv(cov)
self.log_dets[k] = logdet
[docs]
def decision_function(self, x: np.ndarray) -> np.ndarray:
"""Compute unnormalized class scores (log-posterior up to a constant).
For each class k:
score_k(x) = log π_k - 0.5 * log|Σ_k| - 0.5 * (x - μ_k)ᵀ Σ_k⁻¹ (x - μ_k)
Args:
x (np.ndarray or pd.DataFrame): Feature matrix (n_samples, n_features).
Returns:
np.ndarray: Scores of shape (n_samples, n_classes).
"""
self._check_is_fitted()
x = np.asarray(x, dtype=float)
if x.ndim != 2:
raise ValueError("X must be a 2D array of shape (n_samples, n_features).")
if x.shape[1] != self.n_feat:
raise ValueError(f"X has {x.shape[1]} features; expected {self.n_feat}")
k_classes = self.n_classes # int
scores = np.zeros((x.shape[0], k_classes))
for k in range(k_classes):
mu = self.means[k]
inv_cov = self.inv_covs[k]
quad = _quad_form_rows(x - mu, inv_cov)
scores[:, k] = np.log(self.priors[k]) - 0.5 * self.log_dets[k] - 0.5 * quad
return scores
[docs]
def predict_proba(self, x: np.ndarray) -> np.ndarray:
"""Compute class posterior probabilities via softmax over scores.
Args:
x (np.ndarray or pd.DataFrame): Feature matrix (n_samples, n_features).
Returns:
np.ndarray: Class probabilities of shape (n_samples, n_classes).
"""
scores = self.decision_function(x)
m = scores.max(axis=1, keepdims=True)
P = np.exp(scores - m)
P /= P.sum(axis=1, keepdims=True)
return P
def _check_is_fitted(self) -> None:
"""Ensure the model has been fitted before scoring/predicting.
Raises:
AttributeError: If any required fitted attributes are missing.
"""
attrs = ("means", "covs", "inv_covs", "log_dets", "priors")
missing = [a for a in attrs if not hasattr(self, a)]
if missing:
raise AttributeError(
f"{self.__class__.__name__} is not fitted. Missing: {', '.join(missing)}"
)
[docs]
class LinearDiscriminantAnalysis(ClassificationModel):
"""Convenience wrapper for Linear Discriminant Analysis (LDA).
LDA projects data into a lower-dimensional space that maximizes
class separability, assuming normally distributed features with
equal covariance matrices. Provides a formula/DataFrame interface
for the :class:`linear_discriminant_analysis`.
Attributes:
MODEL_TYPE: Underlying model type, fixed to
:class:`linear_discriminant_analysis`.
Example:
>>> model = LinearDiscriminantAnalysis.from_formula("y ~ x1 + x2", data=df)
>>> model.classify(df[["x1", "x2"]])
>>> model.predict_proba(df[["x1", "x2"]])
"""
MODEL_TYPE = linear_discriminant_analysis
[docs]
class QuadraticDiscriminantAnalysis(ClassificationModel):
"""Convenience wrapper for Quadratic Discriminant Analysis (QDA).
QDA is similar to LDA but allows each class to have its own
covariance matrix, resulting in quadratic rather than linear
decision boundaries. Provides a formula/DataFrame interface
for the :class:`quadratic_discriminant_analysis`.
Attributes:
MODEL_TYPE: Underlying model type, fixed to
:class:`quadratic_discriminant_analysis`.
Example:
>>> model = QuadraticDiscriminantAnalysis.from_formula("y ~ x1 + x2 + x3", data=df)
>>> model.classify(df[["x1", "x2", "x3"]])
>>> model.predict_proba(df[["x1", "x2", "x3"]])
"""
MODEL_TYPE = quadratic_discriminant_analysis