Test 10 Solution
Anomaly Detection¶
Here, we'll analyze the decision boundaries of One Class SVM and Isolation Forest and observe how sensitive the former, and how insensitive the latter, is to outliers.
# Useful in beautifying numpy arrays.
from IPython.display import HTML, display
import tabulate
def pp(a, show_head=True):
    '''
    Render a numpy array as an HTML table in the notebook output.

    args: a -> ndarray => array to display; an array with fewer than 2
                          dimensions is shown as a single row.
          show_head -> if True print only first 5 rows.
    return: None
    '''
    # tabulate expects a sequence of rows, so wrap lower-dimensional input.
    rows = a if a.ndim >= 2 else [a]
    if show_head:
        rows = rows[:5]
    display(HTML(tabulate.tabulate(rows, tablefmt='html')))
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_blobs
from sklearn import svm
from sklearn.ensemble import IsolationForest
import nose.tools as test_# For testin
We said that One Class SVM is sensitive to outliers, which makes it a novelty detector rather than a good outlier detector. Here, we'll plot its decision boundaries and observe that sensitivity.
¶
# Global variables. Make sure you understand what each var means.
n_samples = 1000            # total number of points: inliers + outliers
outliers_fraction = 0.15    # share of n_samples that are outliers
n_outliers = int(n_samples * outliers_fraction)
n_inliers = n_samples - n_outliers
Question 1¶
Generate 2 isotropic Gaussian blobs using make_blobs given the total number of samples to generate, centers locations, standard deviation of each cluster, and random state.
# Example inputs for Question 1 (x, y location of each cluster centre).
# Other centre locations to try for cluster 1: [2,0], [0,2]
center_1_coordinates, center_2_coordinates = [0, 2], [2, 2]
cluster_std_ = 0.4      # standard deviation used for both clusters
rs = 42                 # random state
n_samples_ = n_inliers  # number of points to generate
def generate_guassian_clusters_dataset(centers_coords, cluster_std_, rs, n_samples_):
    '''
    Generate two isotropic Gaussian blobs with sklearn's make_blobs.

    args: centers_coords -> tuple of len 2. => first element is a pair of x,y coordinates
                                               for cluster 1 location
                                               second element is a pair of x,y coordinates
                                               for cluster 2 location
          cluster_std_ -> float => cluster standard deviation for both clusters.
          rs -> int => random state
          n_samples_ -> int => total number of points to generate (shared by both clusters)
    return: the (X, y) tuple returned by make_blobs => X holds the generated points,
            y the index of the cluster each point was drawn from.
    '''
    ### BEGIN SOLUTION
    center_1_coordinates, center_2_coordinates = centers_coords[0], centers_coords[1]
    # Honour the n_samples_ argument; the previous version silently used the
    # global n_inliers, so the caller could not control the dataset size.
    data = make_blobs(n_samples=n_samples_,
                      centers=[center_1_coordinates, center_2_coordinates],
                      cluster_std=cluster_std_, random_state=rs)
    return data
    ### END SOLUTION
all_centers_locations = [([0,0],[2,2]), ([2,0],[2,2]), ([0,0],[0,0])]
data_sets = []
for center_location in all_centers_locations:
    # Generate n_inliers points per dataset: the expected values below (and the
    # later len(y_pred) == 850 check) were produced with that size.
    data_sets.append(generate_guassian_clusters_dataset(centers_coords=center_location,
                                                        cluster_std_=.5, rs=42,
                                                        n_samples_=n_inliers))
test_.eq_ (len(data_sets), 3)
test_.eq_ (len(data_sets[0][0]), n_inliers)
test_.ok_ (np.isclose (data_sets[0][0][:5], np.asarray([[ 1.50518593, 2.47038559],
                                                         [ 0.03868415, -0.4306421 ],
                                                         [-0.18048308, 0.5796649 ],
                                                         [ 0.04352353, -0.14950368],
                                                         [ 2.16067861, 2.21096038]])).all())
test_.ok_ (((data_sets[0][1][:5]) == np.asarray([1, 0, 0, 0, 1])).all())
# Generate three datasets
all_centers_locations = [([0,0],[2,2]), ([2,0],[2,2]), ([0,0],[0,0])]
data_sets = []
for center_location in all_centers_locations:
    data_sets.append(generate_guassian_clusters_dataset(centers_coords=center_location,
                                                        cluster_std_=.5, rs=42,
                                                        n_samples_=n_inliers))
Observe the three datasets. Think about the anomalous locations (where points will be called anomalies).¶
# Dataset 1: take the features (first element of the (X, y) pair) and
# scatter the first feature column against the second.
X1 = data_sets[0][0]
plt.scatter(X1[:, 0], X1[:, 1])
<matplotlib.collections.PathCollection at 0x7fe044a47f28>
# Dataset 2: features only, first column against second column.
X2 = data_sets[1][0]
plt.scatter(X2[:, 0], X2[:, 1])
<matplotlib.collections.PathCollection at 0x7fe044c7ceb8>
# Dataset 3: both cluster centres are [0,0] in all_centers_locations.
X3 = data_sets[2][0]
plt.scatter(X3[:, 0], X3[:, 1])
<matplotlib.collections.PathCollection at 0x7fe044668f28>
Question 2¶
Given an identifier for a classifier and the features, instantiate and fit that classifier, then draw its decision boundary.
You may also find np.ravel useful. Read the tutorial for more information on plotting decision boundaries.
You may also find xlim useful but please make sure you don't hardcode any values; Use your judgement such that all data points are displayed and no point is hidden for any input dataset.
To develop the full functionality and pass the test cases, you'll have to call the following functions in your code (the list is non-exhaustive). Read the relevant documentation.
- plt.scatter
- plt.contour
- np.meshgrid
- plt.xticks
- plt.title
def plot_decision_boundary_after_training_clf(X, dataset_id, with_outliers=False,
                                              _outliers_fraction=outliers_fraction, rs=42, clf_id=1):
    '''
    Fit an outlier detector on X, draw its decision boundary together with the
    points of X (coloured by predicted label) and return the predictions.

    args:
        X -> ndarray -> shape (m, 2) => features only (no labels)
        dataset_id -> int -> use for plt.title()
        with_outliers -> Bool => whether the input data contains manually added outliers
                                 (only affects the plot title).
        _outliers_fraction -> float => `nu` of OneClassSVM / `contamination` of IsolationForest
        rs -> int => random state for IsolationForest
        clf_id -> int => if 1 use One Class SVM else IsolationForest
    return: y_pred -> ndarray -> shape (m,) => which are predicted labels (-1 or 1) of
            X using the given classifier. More info in docs.
    Our solution has ~35 lines of code.
    '''
    if clf_id == 1:
        clf = svm.OneClassSVM(nu=_outliers_fraction, kernel="rbf",)
    else:
        clf = IsolationForest(contamination=_outliers_fraction, random_state=rs)
    ### BEGIN SOLUTION
    clf_name = 'One Class SVM' if clf_id == 1 else 'Isolation Forest'

    # Fit once and label the training points.
    y_pred = clf.fit(X).predict(X)

    feat1 = X[:, 0]
    feat2 = X[:, 1]
    # Axis limits follow the data (with a margin of 1) so that no point is
    # hidden, whatever dataset is passed in.
    x_min, x_max = feat1.min() - 1, feat1.max() + 1
    y_min, y_max = feat2.min() - 1, feat2.max() + 1

    # Evaluate the classifier on a grid covering exactly the visible area,
    # instead of a hard-coded [-7, 7] square.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 150),
                         np.linspace(y_min, y_max, 150))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # ravel- flatten the array- 1D
    Z = Z.reshape(xx.shape)
    # Predictions are -1 / 1, so the level 0 contour is the decision boundary.
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')

    # (y_pred + 1) // 2 maps -1 -> 0 ('black') and 1 -> 1 ('orange').
    colors = np.array(['black', 'orange'])
    plt.scatter(feat1, feat2, s=10, color=colors[(y_pred + 1) // 2])

    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    outliers_note = ' (with Outliers)' if with_outliers else ''
    plt.title('Decision Boundary for dataset %d%s with %s'
              % (dataset_id, outliers_note, clf_name))
    return y_pred
    ### END SOLUTION
y_pred = plot_decision_boundary_after_training_clf(X1, dataset_id=1)
test_.eq_ (len(y_pred), 850)
### BEGIN HIDDEN TESTS
test_.ok_ ((y_pred[10:30] == np.array(
    [
        1, -1, 1, 1, 1, 1, 1,
        -1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, -1, 1, 1,
    ]
)).all())


def _assert_solution_calls(module, func_name):
    '''
    Check that the students' function calls module.func_name.

    The attribute is removed from the module while the students' code runs:
    if the code still finishes without an error it cannot have used the
    function. The attribute is always restored afterwards.
    '''
    original = getattr(module, func_name)
    delattr(module, func_name)
    try:
        plot_decision_boundary_after_training_clf(X1, dataset_id=1)
    except Exception:
        # Any error counts as "the removed function was needed"; unlike a
        # bare except this does not swallow KeyboardInterrupt/SystemExit.
        pass
    else:
        raise AssertionError("plot_decision_boundary_after_training_clf does "
                             "not call the required function %s" % func_name)
    finally:
        # restore the original function
        setattr(module, func_name, original)


_assert_solution_calls(plt, 'scatter')
_assert_solution_calls(np, 'meshgrid')
_assert_solution_calls(plt, 'title')
del _assert_solution_calls
# ### END HIDDEN TESTS
_ = plot_decision_boundary_after_training_clf(X1, dataset_id=1)
Expected Output
¶
# Same plot for datasets 2 and 3; clf_id keeps its default of 1 (One Class SVM).
_ = plot_decision_boundary_after_training_clf(X2, dataset_id=2)
_ = plot_decision_boundary_after_training_clf(X3, dataset_id=3)
We argued in the tutorial that OneClassSVM is sensitive to outliers. It works OK if the number of outliers in the training data is not too large. If there are many outliers, they will distort the decision boundary. Let's see if that's the case in our datasets.
Question 3¶
Add outliers in all three datasets using np.random.uniform given low and high values range within which you'll generate outliers, number of outliers and random state (again, random state is very important for test cases).
def pollute_dataset(datasets_, rs, n_outliers, low_, high_): # add outliers.
    '''
    Append uniformly distributed 2-D outlier points to the features of
    every dataset.

    datasets_ -> list => containing n datasets (X, y)
    rs -> int => random state
    n_outliers -> int => number of outliers to put in each dataset
    low_ -> int => min possible val of generated outliers. arg for np.random.uniform
    high_ -> int => max possible val of generated outliers. arg for np.random.uniform
    return: list of (X_with_outliers, y) tuples, one per input dataset.
    Other:
        Return labels unchanged.
    '''
    polluted_data_sets = []
    ### BEGIN SOLUTION
    # One generator is shared by all datasets, so each dataset receives a
    # different draw of outliers.
    generator = np.random.RandomState(rs)
    for dataset in datasets_:
        features, labels = dataset[0], dataset[1]
        outliers = generator.uniform(low=low_, high=high_, size=(n_outliers, 2))
        features_with_outliers = np.concatenate([features, outliers], axis=0)
        # labels are passed through as they are
        polluted_data_sets.append((features_with_outliers, labels))
    ### END SOLUTION
    return polluted_data_sets
polluted_datasets = pollute_dataset(data_sets, 42, n_outliers, -5, 4)
# eq_ compares the two values. ok_(len(...), 3) only checked that the length
# was truthy and used 3 as the failure message.
test_.eq_ (len(polluted_datasets), 3)
# The outliers are appended at the end, so the first rows are the original points.
test_.ok_ ((np.isclose (polluted_datasets[0][0][:3], np.asarray(
    [[ 1.50518593, 2.47038559],
     [ 0.03868415, -0.4306421 ],
     [-0.18048308, 0.5796649 ]
    ]))).all())
polluted_datasets = pollute_dataset(data_sets, 42, n_outliers, -5, 4)
def plot_datasets_with_outliers(polluted_datasets, fig_size=(5,10),):
    '''
    Scatter-plot every polluted dataset in its own subplot, stacked vertically.

    args:
        polluted_datasets -> list => datasets (X, y); only the first two
                                     columns of X are plotted
        fig_size -> tuple => passed to plt.figure as figsize
    return: None
    '''
    ### BEGIN SOLUTION
    plt.figure(figsize=fig_size)
    # One row per dataset instead of a hard-coded 3.
    n_rows = len(polluted_datasets)
    for idx, dataset in enumerate(polluted_datasets, start=1):
        # Plot the idx-th dataset (previously dataset 0 was drawn every time).
        X_polluted = dataset[0]
        plt.subplot(n_rows, 1, idx)
        plt.scatter(X_polluted[:, 0], X_polluted[:, 1])
        plt.title('Dataset %d with outliers'% (idx))
    plt.show()
    ### END SOLUTION
# All test cases hidden.
### BEGIN HIDDEN TESTS
# save a reference to plt.subplot, then remove it from pyplot
old_f = plt.subplot
del plt.subplot
# try running the students' code
try:
    # Call the function with the arguments it really takes. The previous call
    # (X1, dataset_id=1) always raised a TypeError, so the check could never fail.
    plot_datasets_with_outliers(polluted_datasets)
except Exception:
    # an error means the function needed plt.subplot; close the figure that
    # may have been opened before the failure
    plt.close()
# if no error is thrown, that means their function does not call plt.subplot
else:
    raise AssertionError("plot_datasets_with_outliers does "
                         "not call the required function plt.subplot")
# restore the original function
finally:
    plt.subplot = old_f
    del old_f
# ### END HIDDEN TESTS
plot_datasets_with_outliers(polluted_datasets)
Expected Output
¶
# One Class SVM (clf_id=1) on dataset 1 after outliers were added.
X1_pol = polluted_datasets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=1, X=X1_pol, dataset_id=1, \
                                              with_outliers=True)
Expected Output
¶
# One Class SVM (clf_id=1) on the polluted datasets 2 and 3.
# with_outliers=True so the titles say "(with Outliers)", as for dataset 1.
X2_pol = polluted_datasets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=1, X=X2_pol, dataset_id=2,
                                              with_outliers=True)
X3_pol = polluted_datasets[2][0]
_ = plot_decision_boundary_after_training_clf(X=X3_pol, clf_id=1, dataset_id=3,
                                              with_outliers=True)
- Observe how distorted the boundaries are because of the outliers.
- Also, notice the three separate boundaries. Now, SVM is wrongly covering anomalies inside its decision boundary.
Isolation Forest¶
On the other hand, Isolation Forest is not so sensitive to outliers.
# Isolation Forest (any clf_id other than 1) on dataset 1 without added outliers.
X1 = data_sets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X1, dataset_id=1)
Expected Output
¶
# Isolation Forest on datasets 2 and 3 without added outliers.
X2 = data_sets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X2, dataset_id=2)
# Dataset 3 is data_sets[2]; index 1 would plot dataset 2 again.
X3 = data_sets[2][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X3, dataset_id=3)
With Outliers (Isolation Forest is not too sensitive to Outliers).¶
Notice the unchanged (relatively speaking) decision boundaries.
# Isolation Forest on the polluted datasets. Pass the polluted features
# (X*_pol); the clean X1/X2/X3 were passed before, so the plots titled
# "with Outliers" did not contain any of the added outliers.
X1_pol = polluted_datasets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X1_pol, dataset_id=1, with_outliers=True)
X2_pol = polluted_datasets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X2_pol, dataset_id=2, with_outliers=True)
X3_pol = polluted_datasets[2][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X3_pol, dataset_id=3, with_outliers=True)
Expected Output
¶