
Naive bayes #68

Merged
merged 10 commits on Jun 23, 2021
1 change: 1 addition & 0 deletions numpy_ml/linear_models/README.md
@@ -8,6 +8,7 @@ The `lm.py` module implements:
3. [Bayesian linear regression](https://en.wikipedia.org/wiki/Bayesian_linear_regression) with maximum a posteriori parameter estimates via [conjugacy](https://en.wikipedia.org/wiki/Conjugate_prior#Table_of_conjugate_distributions)
- Known coefficient prior mean and known error variance
- Known coefficient prior mean and unknown error variance
4. [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) with Gaussian feature likelihoods.

## Plots
<p align="center">
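A minimal usage sketch for the new classifier (illustrative toy data, not part of the PR; the import path assumes the `__init__.py` export added below):

import numpy as np
from numpy_ml.linear_models import GaussianNBClassifier

# toy data: 100 examples with 3 real-valued features and binary labels
X = np.random.randn(100, 3)
y = np.random.randint(0, 2, size=100)

clf = GaussianNBClassifier(eps=1e-6)   # eps guards against zero per-class variances
clf.fit(X, y)                          # MLE estimates of "mean", "sigma", "prior"
preds = clf.predict(np.random.randn(10, 3))  # predicted labels for 10 new examples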
1 change: 1 addition & 0 deletions numpy_ml/linear_models/__init__.py
@@ -1 +1,2 @@
from .lm import *
from .naive_bayes import *
211 changes: 211 additions & 0 deletions numpy_ml/linear_models/naive_bayes.py
@@ -0,0 +1,211 @@
import numpy as np


class GaussianNBClassifier:
def __init__(self, eps=1e-6):
r"""
A naive Bayes classifier for real-valued data.

Notes
-----
The naive Bayes model assumes the features of each training example
:math:`\mathbf{x}` are mutually independent given the example label
:math:`y`:

.. math::

P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)

where :math:`M` is the dimensionality of the `i`th example :math:`\mathbf{x}_i`
and :math:`y_i` is the label associated with the `i`th example.

Combining the conditional independence assumption with a simple
application of Bayes' theorem gives the naive Bayes classification
rule:

.. math::

\hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
&= \arg \max_y P(y) P(\mathbf{x} \mid y) \\
&= \arg \max_y P(y) \prod_{j=1}^M P(x_j \mid y)

In the final expression, the prior class probability :math:`P(y)` can
be specified in advance or estimated empirically from the training
data.

In the Gaussian version of the naive Bayes model, the feature
likelihood is assumed to be normally distributed for each class:

.. math::

\mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)

where :math:`\theta` is the set of model parameters: :math:`\{\mu_1,
\Sigma_1, \ldots, \mu_K, \Sigma_K\}`, :math:`K` is the total number of
unique classes present in the data, and the parameters for the Gaussian
associated with class :math:`c`, :math:`\mu_c` and :math:`\Sigma_c`
(where :math:`1 \leq c \leq K`), are estimated via MLE from the set of
training examples with label :math:`c`.

Parameters
----------
eps : float
A small value added to each feature variance to avoid division by zero
when a feature has zero variance within a class. Default is 1e-6.

Attributes
----------
parameters : dict
Dictionary of model parameters: "mean", the `(K, M)` array of
feature means under each class, "sigma", the `(K, M)` array of
feature variances under each class, and "prior", the `(K,)` array of
empirical prior probabilities for each class label.
hyperparameters : dict
Dictionary of model hyperparameters
labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
An array containing the unique class labels for the training
examples.
"""
self.labels = None
self.hyperparameters = {"eps": eps}
self.parameters = {
"mean": None, # shape: (K, M)
"sigma": None, # shape: (K, M)
"prior": None, # shape: (K,)
}

def fit(self, X, y):
"""
Fit the model parameters via maximum likelihood.

Notes
-----
The model parameters are stored in the :py:attr:`parameters` attribute.
The following keys are present:

mean: :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
Feature means for each of the `K` label classes
sigma: :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
Feature variances for each of the `K` label classes
prior : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
Prior probability of each of the `K` label classes, estimated
empirically from the training data

Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset consisting of `N` examples, each of dimension `M`
y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
The class label for each of the `N` examples in `X`

Returns
-------
self : object
"""
P = self.parameters
H = self.hyperparameters

self.labels = np.unique(y)

K = len(self.labels)
N, M = X.shape

P["mean"] = np.zeros((K, M))
P["sigma"] = np.zeros((K, M))
P["prior"] = np.zeros((K,))

for i, c in enumerate(self.labels):
X_c = X[y == c, :]

P["mean"][i, :] = np.mean(X_c, axis=0)
P["sigma"][i, :] = np.var(X_c, axis=0) + H["eps"]
P["prior"][i] = X_c.shape[0] / N
return self

def predict(self, X):
"""
Use the trained classifier to predict the class label for each example
in **X**.

Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`

Returns
-------
labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
The predicted class labels for each example in `X`
"""
return self.labels[self._log_posterior(X).argmax(axis=1)]

def _log_posterior(self, X):
r"""
Compute the (unnormalized) log posterior for each class.

Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`

Returns
-------
log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
Unnormalized log posterior probability of each class for each
example in `X`
"""
K = len(self.labels)
log_posterior = np.zeros((X.shape[0], K))
for i in range(K):
log_posterior[:, i] = self._log_class_posterior(X, i)
return log_posterior

def _log_class_posterior(self, X, class_idx):
r"""
Compute the (unnormalized) log posterior for the label at index
`class_idx` in :py:attr:`labels`.

Notes
-----
Unnormalized log posterior for example :math:`\mathbf{x}_i` and class
:math:`c` is:

.. math::

\log P(y_i = c \mid \mathbf{x}_i, \theta)
&\propto \log P(y = c \mid \theta) +
\log P(\mathbf{x}_i \mid y_i = c, \theta) \\
&\propto \log P(y = c \mid \theta) +
\sum_{j=1}^M \log P(x_{i,j} \mid y_i = c, \theta)

In the Gaussian naive Bayes model, the feature likelihood for class
:math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)`, is assumed to
be normally distributed:

.. math::

\mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)


Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`
class_idx : int
The index of the current class in :py:attr:`labels`

Returns
-------
log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
Unnormalized log probability of the label at index `class_idx`
in :py:attr:`labels` for each example in `X`
"""
P = self.parameters
mu = P["mean"][class_idx]
prior = P["prior"][class_idx]
sigsq = P["sigma"][class_idx]

# log likelihood of each example under N(mu, diag(sigsq)), summed over features
@sfsf9797 this is what I believe the proper log likelihood + log class posterior calc should be. Let me know if you agree!

log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * sigsq))
log_likelihood -= 0.5 * np.sum(((X - mu) ** 2) / sigsq, axis=1)
return log_likelihood + np.log(prior)
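As a sanity check on the log likelihood and log class posterior discussed in the review comment above, each per-class score should equal the log prior plus a sum of independent univariate Gaussian log densities, and `predict` should return the argmax of those scores. A sketch of that cross-check (scipy is assumed to be available here; it is not a dependency of the PR itself):

import numpy as np
from scipy.stats import norm
from numpy_ml.linear_models import GaussianNBClassifier

rng = np.random.RandomState(0)
X = rng.randn(50, 4)
y = rng.randint(0, 3, size=50)

nb = GaussianNBClassifier().fit(X, y)  # fit() returns self
P = nb.parameters

# unnormalized log posterior for class 0, computed feature-by-feature:
# log P(y=0) + sum_j log N(x_j; mean_{0,j}, sigma_{0,j})
manual = np.log(P["prior"][0]) + norm.logpdf(
    X, loc=P["mean"][0], scale=np.sqrt(P["sigma"][0])
).sum(axis=1)

np.testing.assert_allclose(manual, nb._log_posterior(X)[:, 0], rtol=1e-6)

# the predicted label is the argmax of these per-class scores
assert np.array_equal(nb.predict(X), nb.labels[nb._log_posterior(X).argmax(axis=1)])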
72 changes: 72 additions & 0 deletions numpy_ml/tests/test_naive_bayes.py
@@ -0,0 +1,72 @@
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn import naive_bayes

from numpy_ml.linear_models import GaussianNBClassifier
from numpy_ml.utils.testing import random_tensor


def test_GaussianNB(N=10):
np.random.seed(12345)
N = np.inf if N is None else N

i = 1
while i < N + 1:
n_ex = np.random.randint(1, 300)
n_feats = np.random.randint(1, 100)
n_classes = np.random.randint(2, 10)

X = random_tensor((n_ex, n_feats), standardize=True)
y = np.random.randint(0, n_classes, size=n_ex)

X_test = random_tensor((n_ex, n_feats), standardize=True)

NB = GaussianNBClassifier(eps=1e-09)
NB.fit(X, y)

preds = NB.predict(X_test)

sklearn_NB = naive_bayes.GaussianNB()
sklearn_NB.fit(X, y)

sk_preds = sklearn_NB.predict(X_test)

# use `k` for the class index so the outer while-loop counter `i` is not overwritten
for k in range(len(NB.labels)):
P = NB.parameters
jointi = np.log(sklearn_NB.class_prior_[k])
jointi_mine = np.log(P["prior"][k])

np.testing.assert_almost_equal(jointi, jointi_mine)

n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[k, :]))
n_ij_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P["sigma"][k]))

np.testing.assert_almost_equal(n_ij_mine, n_ij)

n_ij2 = n_ij - 0.5 * np.sum(
((X_test - sklearn_NB.theta_[k, :]) ** 2) / (sklearn_NB.sigma_[k, :]), 1
)

n_ij2_mine = n_ij_mine - 0.5 * np.sum(
((X_test - P["mean"][k]) ** 2) / (P["sigma"][k]), 1
)
np.testing.assert_almost_equal(n_ij2_mine, n_ij2, decimal=4)

llh = jointi + n_ij2
llh_mine = jointi_mine + n_ij2_mine

np.testing.assert_almost_equal(llh_mine, llh, decimal=4)

np.testing.assert_almost_equal(P["prior"], sklearn_NB.class_prior_)
np.testing.assert_almost_equal(P["mean"], sklearn_NB.theta_)
np.testing.assert_almost_equal(P["sigma"], sklearn_NB.sigma_)
np.testing.assert_almost_equal(
sklearn_NB._joint_log_likelihood(X_test),
NB._log_posterior(X_test),
decimal=4,
)
np.testing.assert_almost_equal(preds, sk_preds)
print("PASSED")
i += 1
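One portability note on the comparisons above: they read `sklearn_NB.sigma_` directly, which matches the scikit-learn release this test was written against; newer releases rename the per-class variance attribute (the sketch below assumes the newer name is `var_`). A small helper would keep the assertions working either way:

def sklearn_class_variances(model):
    # `sigma_` is the name used in the assertions above; `var_` is assumed
    # to be the newer name for the same (K, M) array of per-class variances
    return model.var_ if hasattr(model, "var_") else model.sigma_

# e.g., np.testing.assert_almost_equal(P["sigma"], sklearn_class_variances(sklearn_NB))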