# Useful in beautifying numpy arrays.
from IPython.display import HTML, display
import tabulate
def pp(a, show_head=True):
    '''
    Render an array as an HTML table in the notebook output.

    args: a -> object exposing `.ndim` and slicing (e.g. a numpy ndarray).
          show_head -> if True render only the first 5 rows, else every row.
    return: None
    '''
    # Anything with fewer than two dimensions is wrapped so that it is
    # rendered as a single table row.
    rows = a if a.ndim >= 2 else [a]
    if show_head:
        rows = rows[:5]
    display(HTML(tabulate.tabulate(rows, tablefmt='html')))


import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_blobs
from sklearn import svm
from sklearn.ensemble import IsolationForest
import nose.tools as test_# For testin


# Global variables. Make sure you understand what each var means.
n_samples = 1000  # total number of points per dataset (inliers + outliers)
outliers_fraction = 0.15  # share of n_samples that are outliers; also the default
                          # nu / contamination of the classifiers defined below
n_outliers = int(outliers_fraction * n_samples)  # number of outlier points
n_inliers = n_samples - n_outliers  # number of regular (cluster) points


# Example argument values for generate_guassian_clusters_dataset.
# NOTE(review): these names do not appear to be read anywhere else in the
# visible code (the function's parameters shadow them) -- confirm before removing.
center_1_coordinates = [0,2] # [2,0], [0,2]
center_2_coordinates = [2,2]
cluster_std_ = 0.4
rs = 42
n_samples_ = n_inliers
def generate_guassian_clusters_dataset(centers_coords, cluster_std_, rs, n_samples_):
    '''
    Generate a two-cluster Gaussian dataset with sklearn's make_blobs.

    args: centers_coords -> tuple of len 2. => first element is a pair of x,y coordinates
                                               for cluster 1 location
                                               second element is a pair of x,y coordinates
                                               for cluster 2 location
          cluster_std_ -> float => cluster standard deviation for both clusters.
          rs -> int => random state
          n_samples_ -> int => total number of points to generate (shared by both clusters)

    return: whatever make_blobs returns; the rest of this notebook reads index 0
            as the features and index 1 as the cluster labels.
    '''
    ### BEGIN SOLUTION
    center_1_coordinates, center_2_coordinates = centers_coords[0], centers_coords[1]
    # Honour the n_samples_ argument instead of silently reading the
    # module-level n_inliers, so the caller controls the dataset size.
    data = make_blobs(centers=[center_1_coordinates, center_2_coordinates],
                      cluster_std=cluster_std_, random_state=rs, n_samples=n_samples_)
    return data
    ### END SOLUTION


# Build the three clean datasets. Only the inliers are generated here
# (n_inliers points each); the outliers are appended later by pollute_dataset.
all_centers_locations = [([0,0],[2,2]), ([2,0],[2,2]), ([0,0],[0,0])]
data_sets = []
for center_location in all_centers_locations:
    data_sets.append(generate_guassian_clusters_dataset(centers_coords=center_location,
                                                        cluster_std_=.5, rs=42, n_samples_=n_inliers))
test_.eq_ (len(data_sets), 3)
test_.ok_ (np.isclose (data_sets[0][0][:5], np.asarray([[ 1.50518593,  2.47038559],
       [ 0.03868415, -0.4306421 ],
       [-0.18048308,  0.5796649 ],
       [ 0.04352353, -0.14950368],
       [ 2.16067861,  2.21096038]])).all())

test_.ok_ (((data_sets[0][1][:5]) == np.asarray([1, 0, 0, 0, 1])).all())


# Same construction again (separate notebook cell), so data_sets is available
# even when the test cell above is not run.
all_centers_locations = [([0,0],[2,2]), ([2,0],[2,2]), ([0,0],[0,0])]
data_sets = []
for center_location in all_centers_locations:
    data_sets.append(generate_guassian_clusters_dataset(centers_coords=center_location,
                                                        cluster_std_=.5, rs=42, n_samples_=n_inliers))


# Dataset 1 (clusters centred at [0,0] and [2,2]): index 0 holds the features.
X1 = data_sets[0][0]
plt.scatter(X1[:, 0], X1[:, 1])

<matplotlib.collections.PathCollection at 0x7fe044a47f28>


# Dataset 2 (clusters centred at [2,0] and [2,2]): index 0 holds the features.
X2 = data_sets[1][0]
plt.scatter(X2[:, 0], X2[:, 1])

<matplotlib.collections.PathCollection at 0x7fe044c7ceb8>


# Dataset 3 (both clusters centred at [0,0]): index 0 holds the features.
X3 = data_sets[2][0]
plt.scatter(X3[:, 0], X3[:, 1])

<matplotlib.collections.PathCollection at 0x7fe044668f28>


def plot_decision_boundary_after_training_clf(X, dataset_id, with_outliers=False,\
    _outliers_fraction=outliers_fraction, rs=42, clf_id=1):
    
    '''
    Fit an outlier detector on X, draw its decision boundary together with
    the points of X using pyplot, and return the predictions for X.

    args: 
        X -> ndarray -> shape (m, 2) => features only (no labels)
        dataset_id -> int -> used for plt.title()
        with_outliers -> Bool => whether the input data contains manually added outliers
                                 (only changes the plot title).
        _outliers_fraction -> float => passed as `nu` to OneClassSVM or as
                                       `contamination` to IsolationForest.
        rs -> int => random state for IsolationForest
        clf_id -> int => if 1 use One Class SVM, otherwise IsolationForest
          
    return: y_pred -> ndarray -> shape (m,) => which are predicted labels (-1 or 1) of
                                 X using the given classifier. More info in docs.
                                 
    Our solution has ~35 lines of code.
    '''
    if clf_id == 1:
        clf = svm.OneClassSVM(nu=_outliers_fraction, kernel="rbf",)
    else:
        clf = IsolationForest(contamination=_outliers_fraction, random_state=rs)
        
    ### BEGIN SOLUTION
    # A single fit is enough; predict() then labels the training points.
    y_pred = clf.fit(X).predict(X)

    # Evaluate the classifier on a fixed 150x150 grid over [-7, 7] x [-7, 7].
    grid_axis = np.linspace(-7, 7, 150)
    xx, yy = np.meshgrid(grid_axis, grid_axis)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    # Predictions are -1/+1, so the level-0 contour separates the two labels.
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')

    # Label -1 maps to index 0 ('black'), label +1 to index 1 ('orange').
    colors = np.array(['black', 'orange'])
    feat1 = X[:, 0]
    feat2 = X[:, 1]
    plt.scatter(feat1, feat2, s=10, color=colors[(y_pred + 1) // 2])
    # Crop the view to the data with a margin of 1 on every side.
    plt.xlim(feat1.min() - 1, feat1.max() + 1)
    plt.ylim(feat2.min() - 1, feat2.max() + 1)
    plt.xticks(())
    plt.yticks(())

    # Name the classifier that was actually trained (the non-SVM branch is an
    # IsolationForest, not a random forest).
    clf_name = 'One Class SVM' if clf_id == 1 else 'Isolation Forest'
    outliers_note = ' (with Outliers)' if with_outliers else ''
    plt.title('Decision Boundary for dataset %d%s with %s'
              % (dataset_id, outliers_note, clf_name))
    return y_pred
    ### END SOLUTION


y_pred = plot_decision_boundary_after_training_clf(X1, dataset_id=1)
test_.eq_ (len(y_pred), 850)

### BEGIN HIDDEN TESTS
test_.ok_ ((y_pred[10:30] == np.array(
    [ 
        1, -1, 1, 1, 1, 1, 1, 
        -1, 1,  1,  1, 1, 1, 1,
        1, 1, 1, -1, 1, 1,
    ]
)).all())


def _assert_plot_fn_uses(module, attr_name):
    '''
    Check that plot_decision_boundary_after_training_clf uses module.attr_name.

    The attribute is temporarily removed from the module. If the function
    under test still runs to completion it cannot have used the attribute,
    so an AssertionError is raised. The attribute is always restored.

    args: module -> module object (e.g. plt or np)
          attr_name -> str => name of the function that must be called
    return: None
    '''
    original = getattr(module, attr_name)
    delattr(module, attr_name)
    try:
        plot_decision_boundary_after_training_clf(X1, dataset_id=1)
    except Exception:
        # Expected path: any failure while the attribute is missing is taken
        # as evidence that the function relies on it.
        pass
    else:
        raise AssertionError(
            "plot_decision_boundary_after_training_clf does not call the "
            "required function %s.%s" % (module.__name__, attr_name))
    finally:
        setattr(module, attr_name, original)


_assert_plot_fn_uses(plt, 'scatter')
_assert_plot_fn_uses(np, 'meshgrid')
_assert_plot_fn_uses(plt, 'title')
# ### END HIDDEN TESTS


# Decision boundaries for the three clean datasets with the default
# classifier (clf_id=1, One Class SVM); the returned predictions are discarded.
_ = plot_decision_boundary_after_training_clf(X1, dataset_id=1)


_ = plot_decision_boundary_after_training_clf(X2, dataset_id=2)


_ = plot_decision_boundary_after_training_clf(X3, dataset_id=3)


def pollute_dataset(datasets_, rs, n_outliers, low_, high_): # add outliers.
    '''
    Append uniformly distributed 2-D outlier points to every dataset.

    datasets_ -> list => containing n datasets (X, y)
    rs -> int => random state; one generator is seeded once and shared by
                 all datasets, so each dataset receives different outliers
    n_outliers -> int => number of outliers to put in each dataset
    low_ -> int => min possible val of generated outliers. arg for np.random.uniform
    high_ -> int => max possible val of generated outliers. arg for np.random.uniform

    return: list of (X_with_outliers, y) tuples in the order of datasets_

    Other:
        Return labels unchanged (so y has n_outliers fewer rows than X).
    '''
    polluted_data_sets = []
    ### BEGIN SOLUTION
    generator = np.random.RandomState(rs)
    outliers_shape = (n_outliers, 2)  # outliers always have two features
    for entry in datasets_:
        clean_points = entry[0]
        outliers = generator.uniform(low=low_, high=high_, size=outliers_shape)
        stacked = np.concatenate([clean_points, outliers], axis=0)
        # Labels are passed through untouched.
        polluted_data_sets.append((stacked, entry[1]))
    ### END SOLUTION
    return polluted_data_sets


polluted_datasets = pollute_dataset(data_sets, 42, n_outliers,  -5, 4)
# eq_ rather than ok_: ok_(expr, msg) would treat 3 as the failure message
# and pass for any non-empty list.
test_.eq_ (len(polluted_datasets), 3)
# The original points must still come first, unchanged.
test_.ok_ ((np.isclose (polluted_datasets[0][0][:3], np.asarray(
    [[ 1.50518593,  2.47038559],
     [ 0.03868415, -0.4306421 ],
     [-0.18048308,  0.5796649 ]
    ]))).all())


polluted_datasets = pollute_dataset(data_sets, 42, n_outliers, -5, 4)


def plot_datasets_with_outliers(polluted_datasets, fig_size=(5,10),):
    '''
    Scatter-plot every polluted dataset in its own row of a single figure.

    args: polluted_datasets -> list => datasets as (X, y); only X (index 0) is
                                       plotted, using its first two columns.
          fig_size -> tuple => figure size passed to plt.figure()
    return: None
    '''
    ### BEGIN SOLUTION
    n_datasets = len(polluted_datasets)
    plt.figure(figsize=fig_size)
    for idx, dataset in enumerate(polluted_datasets, start=1):
        # Plot the idx-th dataset (not always the first one).
        X_polluted = dataset[0]
        # One row per dataset, so the layout follows the list length.
        plt.subplot(n_datasets, 1, idx)
        plt.scatter(X_polluted[:, 0], X_polluted[:, 1])
        plt.title('Dataset %d with outliers'% (idx))
    plt.show()
    ### END SOLUTION


# All test cases hidden.
### BEGIN HIDDEN TESTS
# Temporarily remove plt.subplot: if the students' function still runs to
# completion, it cannot have called it.
old_f = plt.subplot
del plt.subplot

try:
    # Call with valid arguments so that a failure can only come from the
    # missing plt.subplot, not from a bad call signature.
    plot_datasets_with_outliers(polluted_datasets)

except Exception:
    # Expected path: the function failed because plt.subplot is missing.
    pass

else:
    raise AssertionError(
        "plot_datasets_with_outliers does not call the required function "
        "matplotlib.pyplot.subplot")

# restore the original function
finally:
    plt.subplot = old_f
    del old_f
# ### END HIDDEN TESTS


plot_datasets_with_outliers(polluted_datasets)


# One Class SVM (clf_id=1) on the polluted datasets. with_outliers=True is
# passed for all three so every title states that outliers are present.
X1_pol = polluted_datasets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=1, X=X1_pol, dataset_id=1, \
                                          with_outliers=True)


X2_pol = polluted_datasets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=1, X=X2_pol, dataset_id=2, \
                                          with_outliers=True)


X3_pol = polluted_datasets[2][0]
_ = plot_decision_boundary_after_training_clf(X=X3_pol, clf_id=1, dataset_id=3, \
                                          with_outliers=True)


# IsolationForest (clf_id=2) on the three clean datasets.
X1 = data_sets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X1, dataset_id=1)


X2 = data_sets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X2, dataset_id=2)


# Dataset 3 lives at index 2 (index 1 is dataset 2).
X3 = data_sets[2][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X3, dataset_id=3)


# IsolationForest (clf_id=2) on the polluted datasets: pass the polluted
# features (X*_pol), not the clean ones, to match with_outliers=True.
X1_pol = polluted_datasets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X1_pol, dataset_id=1, with_outliers=True)


X2_pol = polluted_datasets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X2_pol, dataset_id=2, with_outliers=True)


X3_pol = polluted_datasets[2][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X3_pol, dataset_id=3, with_outliers=True)

Test 10 Solution

Anomaly Detection¶

`We said that the One-Class SVM is sensitive to outliers, so it is not really a good outlier detector but rather a novelty detector. Here, we'll plot the decision boundaries and observe that sensitivity.`¶

Question 1¶

Generate three datasets¶

Observe the three datasets. Think about the anomalous locations (where points would be called anomalies).¶

Question 2¶

`Expected Output`¶

Question 3¶

Question 4¶

`Expected Output`¶

`Expected Output`¶

Isolation Forest¶

`Expected Output`¶

With Outliers (Isolation Forest is not too sensitive to Outliers).¶

`Expected Output`¶

Anomaly Detection¶

We said that the One-Class SVM is sensitive to outliers, so it is not really a good outlier detector but rather a novelty detector. Here, we'll plot the decision boundaries and observe that sensitivity.¶

Question 1¶

Generate three datasets¶

Observe the three datasets. Think about the anomalous locations (where points would be called anomalies).¶

Question 2¶

Expected Output¶

Question 3¶

Question 4¶

Expected Output¶

Expected Output¶

Isolation Forest¶

Expected Output¶

With Outliers (Isolation Forest is not too sensitive to Outliers).¶

Expected Output¶

`We said that the One-Class SVM is sensitive to outliers, so it is not really a good outlier detector but rather a novelty detector. Here, we'll plot the decision boundaries and observe that sensitivity.`¶

`Expected Output`¶

`Expected Output`¶

`Expected Output`¶

`Expected Output`¶

`Expected Output`¶