
Naive bayes #68

Merged
merged 10 commits on Jun 23, 2021
1 change: 1 addition & 0 deletions numpy_ml/linear_models/README.md
@@ -8,6 +8,7 @@ The `lm.py` module implements:
3. [Bayesian linear regression](https://en.wikipedia.org/wiki/Bayesian_linear_regression) with maximum a posteriori parameter estimates via [conjugacy](https://en.wikipedia.org/wiki/Conjugate_prior#Table_of_conjugate_distributions)
- Known coefficient prior mean and known error variance
- Known coefficient prior mean and unknown error variance
4. [Naive Bayes classifier](https://en.wikipedia.org/wiki/Naive_Bayes_classifier) with Gaussian feature likelihoods.

## Plots
<p align="center">
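A minimal usage sketch for the new classifier (illustrative toy data, not part of the PR; the import path assumes the `__init__.py` export added below):

import numpy as np
from numpy_ml.linear_models import GaussianNBClassifier

# toy data: 100 examples with 3 real-valued features and binary labels
X = np.random.randn(100, 3)
y = np.random.randint(0, 2, size=100)

clf = GaussianNBClassifier(eps=1e-6)   # eps guards against zero per-class variances
clf.fit(X, y)                          # MLE estimates of "mean", "sigma", "prior"
preds = clf.predict(np.random.randn(10, 3))  # predicted labels for 10 new examples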
1 change: 1 addition & 0 deletions numpy_ml/linear_models/__init__.py
@@ -1 +1,2 @@
from .lm import *
from .naive_bayes import *
211 changes: 211 additions & 0 deletions numpy_ml/linear_models/naive_bayes.py
@@ -0,0 +1,211 @@
import numpy as np


class GaussianNBClassifier:
def __init__(self, eps=1e-6):
r"""
A naive Bayes classifier for real-valued data.

Notes
-----
The naive Bayes model assumes the features of each training example
:math:`\mathbf{x}` are mutually independent given the example label
:math:`y`:

.. math::

P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)

where :math:`M` is the dimensionality of the `i`th example :math:`\mathbf{x}_i`
and :math:`y_i` is the label associated with the `i`th example.

Combining the conditional independence assumption with a simple
application of Bayes' theorem gives the naive Bayes classification
rule:

.. math::

\hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
&= \arg \max_y P(y) P(\mathbf{x} \mid y) \\
&= \arg \max_y P(y) \prod_{j=1}^M P(x_j \mid y)

In the final expression, the prior class probability :math:`P(y)` can
be specified in advance or estimated empirically from the training
data.

In the Gaussian version of the naive Bayes model, the feature
likelihood is assumed to be normally distributed for each class:

.. math::

\mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)

where :math:`\theta` is the set of model parameters: :math:`\{\mu_1,
\Sigma_1, \ldots, \mu_K, \Sigma_K\}`, :math:`K` is the total number of
unique classes present in the data, and the parameters for the Gaussian
associated with class :math:`c`, :math:`\mu_c` and :math:`\Sigma_c`
(where :math:`1 \leq c \leq K`), are estimated via MLE from the set of
training examples with label :math:`c`.

Parameters
----------
eps : float
A small value added to each feature variance to avoid division by zero
when a feature has zero variance within a class. Default is 1e-6.

Attributes
----------
parameters : dict
Dictionary of model parameters: "mean", the `(K, M)` array of
feature means under each class, "sigma", the `(K, M)` array of
feature variances under each class, and "prior", the `(K,)` array of
empirical prior probabilities for each class label.
hyperparameters : dict
Dictionary of model hyperparameters
labels : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
An array containing the unique class labels for the training
examples.
"""
self.labels = None
self.hyperparameters = {"eps": eps}
self.parameters = {
"mean": None, # shape: (K, M)
"sigma": None, # shape: (K, M)
"prior": None, # shape: (K,)
}

def fit(self, X, y):
"""
Fit the model parameters via maximum likelihood.

Notes
-----
The model parameters are stored in the :py:attr:`parameters` attribute.
The following keys are present:

mean: :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
Feature means for each of the `K` label classes
sigma: :py:class:`ndarray <numpy.ndarray>` of shape `(K, M)`
Feature variances for each of the `K` label classes
prior : :py:class:`ndarray <numpy.ndarray>` of shape `(K,)`
Prior probability of each of the `K` label classes, estimated
empirically from the training data

Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset consisting of `N` examples, each of dimension `M`
y : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
The class label for each of the `N` examples in `X`

Returns
-------
self : object
"""
P = self.parameters
H = self.hyperparameters

self.labels = np.unique(y)

K = len(self.labels)
N, M = X.shape

P["mean"] = np.zeros((K, M))
P["sigma"] = np.zeros((K, M))
P["prior"] = np.zeros((K,))

for i, c in enumerate(self.labels):
X_c = X[y == c, :]

P["mean"][i, :] = np.mean(X_c, axis=0)
P["sigma"][i, :] = np.var(X_c, axis=0) + H["eps"]
P["prior"][i] = X_c.shape[0] / N
return self

def predict(self, X):
"""
Use the trained classifier to predict the class label for each example
in **X**.

Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`

Returns
-------
labels : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
The predicted class labels for each example in `X`
"""
return self.labels[self._log_posterior(X).argmax(axis=1)]

def _log_posterior(self, X):
r"""
Compute the (unnormalized) log posterior for each class.

Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`

Returns
-------
log_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
Unnormalized log posterior probability of each class for each
example in `X`
"""
K = len(self.labels)
log_posterior = np.zeros((X.shape[0], K))
for i in range(K):
log_posterior[:, i] = self._log_class_posterior(X, i)
return log_posterior

def _log_class_posterior(self, X, class_idx):
r"""
Compute the (unnormalized) log posterior for the label at index
`class_idx` in :py:attr:`labels`.

Notes
-----
Unnormalized log posterior for example :math:`\mathbf{x}_i` and class
:math:`c` is:

.. math::

\log P(y_i = c \mid \mathbf{x}_i, \theta)
&\propto \log P(y = c \mid \theta) +
\log P(\mathbf{x}_i \mid y_i = c, \theta) \\
&\propto \log P(y = c \mid \theta) +
\sum_{j=1}^M \log P(x_{i,j} \mid y_i = c, \theta)

In the Gaussian naive Bayes model, the feature likelihood for class
:math:`c`, :math:`P(\mathbf{x}_i \mid y_i = c, \theta)`, is assumed to
be normally distributed:

.. math::

\mathbf{x}_i \mid y_i = c, \theta \sim \mathcal{N}(\mu_c, \Sigma_c)


Parameters
----------
X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
A dataset of `N` examples, each of dimension `M`
class_idx : int
The index of the current class in :py:attr:`labels`

Returns
-------
log_class_posterior : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
Unnormalized log probability of the label at index `class_idx`
in :py:attr:`labels` for each example in `X`
"""
P = self.parameters
mu = P["mean"][class_idx]
prior = P["prior"][class_idx]
sigsq = P["sigma"][class_idx]

# log likelihood of each example under N(mu, diag(sigsq)), summed over features
@sfsf9797 this is what I believe the proper log likelihood + log class posterior calc should be. Let me know if you agree!

log_likelihood = -0.5 * np.sum(np.log(2 * np.pi * sigsq))
log_likelihood -= 0.5 * np.sum(((X - mu) ** 2) / sigsq, axis=1)
return log_likelihood + np.log(prior)
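As a sanity check on the log likelihood and log class posterior discussed in the review comment above, each per-class score should equal the log prior plus a sum of independent univariate Gaussian log densities, and `predict` should return the argmax of those scores. A sketch of that cross-check (scipy is assumed to be available here; it is not a dependency of the PR itself):

import numpy as np
from scipy.stats import norm
from numpy_ml.linear_models import GaussianNBClassifier

rng = np.random.RandomState(0)
X = rng.randn(50, 4)
y = rng.randint(0, 3, size=50)

nb = GaussianNBClassifier().fit(X, y)  # fit() returns self
P = nb.parameters

# unnormalized log posterior for class 0, computed feature-by-feature:
# log P(y=0) + sum_j log N(x_j; mean_{0,j}, sigma_{0,j})
manual = np.log(P["prior"][0]) + norm.logpdf(
    X, loc=P["mean"][0], scale=np.sqrt(P["sigma"][0])
).sum(axis=1)

np.testing.assert_allclose(manual, nb._log_posterior(X)[:, 0], rtol=1e-6)

# the predicted label is the argmax of these per-class scores
assert np.array_equal(nb.predict(X), nb.labels[nb._log_posterior(X).argmax(axis=1)])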
72 changes: 72 additions & 0 deletions numpy_ml/tests/test_naive_bayes.py
@@ -0,0 +1,72 @@
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split

from sklearn import naive_bayes

from numpy_ml.linear_models import GaussianNBClassifier
from numpy_ml.utils.testing import random_tensor


def test_GaussianNB(N=10):
np.random.seed(12345)
N = np.inf if N is None else N

i = 1
while i < N + 1:
n_ex = np.random.randint(1, 300)
n_feats = np.random.randint(1, 100)
n_classes = np.random.randint(2, 10)

X = random_tensor((n_ex, n_feats), standardize=True)
y = np.random.randint(0, n_classes, size=n_ex)

X_test = random_tensor((n_ex, n_feats), standardize=True)

NB = GaussianNBClassifier(eps=1e-09)
NB.fit(X, y)

preds = NB.predict(X_test)

sklearn_NB = naive_bayes.GaussianNB()
sklearn_NB.fit(X, y)

sk_preds = sklearn_NB.predict(X_test)

# use `k` for the class index so the outer while-loop counter `i` is not overwritten
for k in range(len(NB.labels)):
P = NB.parameters
jointi = np.log(sklearn_NB.class_prior_[k])
jointi_mine = np.log(P["prior"][k])

np.testing.assert_almost_equal(jointi, jointi_mine)

n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * sklearn_NB.sigma_[k, :]))
n_ij_mine = -0.5 * np.sum(np.log(2.0 * np.pi * P["sigma"][k]))

np.testing.assert_almost_equal(n_ij_mine, n_ij)

n_ij2 = n_ij - 0.5 * np.sum(
((X_test - sklearn_NB.theta_[k, :]) ** 2) / (sklearn_NB.sigma_[k, :]), 1
)

n_ij2_mine = n_ij_mine - 0.5 * np.sum(
((X_test - P["mean"][k]) ** 2) / (P["sigma"][k]), 1
)
np.testing.assert_almost_equal(n_ij2_mine, n_ij2, decimal=4)

llh = jointi + n_ij2
llh_mine = jointi_mine + n_ij2_mine

np.testing.assert_almost_equal(llh_mine, llh, decimal=4)

np.testing.assert_almost_equal(P["prior"], sklearn_NB.class_prior_)
np.testing.assert_almost_equal(P["mean"], sklearn_NB.theta_)
np.testing.assert_almost_equal(P["sigma"], sklearn_NB.sigma_)
np.testing.assert_almost_equal(
sklearn_NB._joint_log_likelihood(X_test),
NB._log_posterior(X_test),
decimal=4,
)
np.testing.assert_almost_equal(preds, sk_preds)
print("PASSED")
i += 1
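One portability note on the comparisons above: they read `sklearn_NB.sigma_` directly, which matches the scikit-learn release this test was written against; newer releases rename the per-class variance attribute (the sketch below assumes the newer name is `var_`). A small helper would keep the assertions working either way:

def sklearn_class_variances(model):
    # `sigma_` is the name used in the assertions above; `var_` is assumed
    # to be the newer name for the same (K, M) array of per-class variances
    return model.var_ if hasattr(model, "var_") else model.sigma_

# e.g., np.testing.assert_almost_equal(P["sigma"], sklearn_class_variances(sklearn_NB))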