Test 10
Before you turn this problem in, make sure everything runs as expected. First, restart the kernel (in the menubar, select Kernel$\rightarrow$Restart) and then run all cells (in the menubar, select Cell$\rightarrow$Run All).
Make sure you fill in any place that says "YOUR CODE HERE" or "YOUR ANSWER HERE", as well as your name and collaborators below:
NAME = ""
COLLABORATORS = ""
Anomaly Detection¶
Here, we'll analyze the decision boundaries of One-Class SVM and Isolation Forest and observe how sensitive the former, and how insensitive the latter, is to outliers.
# Helper for pretty-printing numpy arrays as HTML tables.
from IPython.display import HTML, display
import tabulate

def pp(a, show_head=True):
    '''
    args: a -> array-like to display
          show_head -> if True, print only the first 5 rows.
    return: None
    '''
    if a.ndim < 2:
        a = [a]
    if show_head:
        display(HTML(tabulate.tabulate(a[:5], tablefmt='html')))
        return
    display(HTML(tabulate.tabulate(a, tablefmt='html')))
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_blobs
from sklearn import svm
from sklearn.ensemble import IsolationForest
import nose.tools as test_  # For testing
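As a quick, purely illustrative sanity check of the pp helper above (not required by any test):
pp(np.arange(6).reshape(2, 3))  # renders a 2x3 HTML table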
We said that SVM is sensitive to outliers, and is thus not really a good outlier detector but rather a novelty detector. Here, we'll plot decision boundaries and observe that sensitivity.
# Global variables. Make sure you understand what each var means.
n_samples = 1000              # total points per dataset
outliers_fraction = 0.15      # fraction of points that will be outliers
n_outliers = int(outliers_fraction * n_samples)  # 150
n_inliers = n_samples - n_outliers               # 850
Question 1¶
Generate 2 isotropic Gaussian blobs using make_blobs, given the total number of samples to generate, the center locations, the standard deviation of each cluster, and a random state.
center_1_coordinates = [0,2] # [2,0], [0,2]
center_2_coordinates = [2,2]
cluster_std_ = 0.4
rs = 42
n_samples_ = n_inliers
def generate_guassian_clusters_dataset(centers_coords, cluster_std_, rs, n_samples_):
    '''
    args: centers_coords -> tuple of len 2 => first element is a pair of x,y coordinates
                                              for cluster 1's location;
                                              second element is a pair of x,y coordinates
                                              for cluster 2's location
          cluster_std_ -> float => cluster standard deviation for both clusters
          rs -> int => random state
          n_samples_ -> int => total number of samples across both clusters
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
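If you're unsure where to start, here is a minimal sketch of one possible implementation (the helper name _generate_gaussian_clusters_sketch is illustrative; it assumes make_blobs splits n_samples_ evenly across the two given centers and labels each point 0/1 by cluster):
def _generate_gaussian_clusters_sketch(centers_coords, cluster_std_, rs, n_samples_):
    # make_blobs accepts a list of center coordinates and returns (X, y),
    # where y labels each point with the index of its cluster.
    X, y = make_blobs(n_samples=n_samples_, centers=list(centers_coords),
                      cluster_std=cluster_std_, random_state=rs)
    return X, y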
all_centers_locations = [([0,0],[2,2]), ([2,0],[2,2]), ([0,0],[0,0])]
data_sets = []
for center_location in all_centers_locations:
    data_sets.append(generate_guassian_clusters_dataset(centers_coords=center_location,
                                                        cluster_std_=.5, rs=42, n_samples_=n_inliers))
test_.eq_ (len(data_sets), 3)
test_.ok_ (np.isclose (data_sets[0][0][:5], np.asarray([[ 1.50518593, 2.47038559],
[ 0.03868415, -0.4306421 ],
[-0.18048308, 0.5796649 ],
[ 0.04352353, -0.14950368],
[ 2.16067861, 2.21096038]])).all())
test_.ok_ (((data_sets[0][1][:5]) == np.asarray([1, 0, 0, 0, 1])).all())
Generate three datasets¶
all_centers_locations = [([0,0],[2,2]), ([2,0],[2,2]), ([0,0],[0,0])]
data_sets = []
for center_location in all_centers_locations:
    data_sets.append(generate_guassian_clusters_dataset(centers_coords=center_location,
                                                        cluster_std_=.5, rs=42, n_samples_=n_inliers))
Observe the three datasets. Think about the anomalous locations (where points would be called anomalies).¶
X1 = data_sets[0][0]
plt.scatter(X1[:, 0], X1[:, 1])
X2 = data_sets[1][0]
plt.scatter(X2[:, 0], X2[:, 1])
X3 = data_sets[2][0]
plt.scatter(X3[:, 0], X3[:, 1])
Question 2¶
Given an identifier for a classifier and the features, instantiate and train that classifier, then draw its decision boundary.
You may also find np.ravel useful. Read the tutorial for more information on plotting decision boundaries.
You may also find xlim useful, but please make sure you don't hardcode any values; use your judgement such that all data points are displayed and no point is hidden for any input dataset.
To get full functionality and pass the test cases, you'll have to call the following functions in your code (non-exhaustive). Read the relevant documentation.
- plt.scatter
- plt.contour
- np.meshgrid
- plt.xticks
- plt.title
def plot_decision_boundary_after_training_clf(X, dataset_id, with_outliers=False,
                                              _outliers_fraction=outliers_fraction, rs=42, clf_id=1):
    '''
    args:
        X -> ndarray -> shape (m, 2) => features only (no labels)
        dataset_id -> int => used for plt.title()
        with_outliers -> Bool => whether the input data contains manually added outliers
        _outliers_fraction -> float => arg for OneClassSVM (nu) and IsolationForest (contamination)
        rs -> int => random state for IsolationForest
        clf_id -> int => if 1 use One-Class SVM, else IsolationForest
    return: y_pred -> ndarray -> shape (m,) => predicted labels (-1 or 1) of
        X using the given classifier. More info in the docs.
    Our solution has ~35 lines of code.
    '''
    if clf_id == 1:
        clf = svm.OneClassSVM(nu=_outliers_fraction, kernel="rbf")
    else:
        clf = IsolationForest(contamination=_outliers_fraction, random_state=rs)
    # YOUR CODE HERE
    raise NotImplementedError()
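For reference, here is a minimal sketch of the fit-and-plot logic, assuming the classifier clf has already been instantiated as above (the grid resolution of 100 points per axis and the 1.0 margin are assumptions, not requirements of the tests; ticks, limits, and styling are left to you):
def _plot_boundary_sketch(X, clf, dataset_id):
    clf.fit(X)
    y_pred = clf.predict(X)  # -1 for outliers, 1 for inliers
    # Build a grid that covers all points with a small margin, so nothing is hidden.
    margin = 1.0
    xs = np.linspace(X[:, 0].min() - margin, X[:, 0].max() + margin, 100)
    ys = np.linspace(X[:, 1].min() - margin, X[:, 1].max() + margin, 100)
    xx, yy = np.meshgrid(xs, ys)
    # Predict on every grid point; the boundary sits between the -1 and 1 regions.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')
    plt.scatter(X[:, 0], X[:, 1], c=y_pred, s=10)
    plt.title('Dataset %d' % dataset_id)
    plt.show()
    return y_pred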
y_pred = plot_decision_boundary_after_training_clf(X1, dataset_id=1)
test_.eq_ (len(y_pred), 850)
_ = plot_decision_boundary_after_training_clf(X1, dataset_id=1)
Expected Output¶
_ = plot_decision_boundary_after_training_clf(X2, dataset_id=2)
_ = plot_decision_boundary_after_training_clf(X3, dataset_id=3)
We argued in the tutorial that OneClassSVM is sensitive to outliers. It works OK if the number of outliers in training is not too large; too many outliers will distort the decision boundary. Let's see whether that's the case in our datasets.
Question 3¶
Add outliers to all three datasets using np.random.uniform, given the low and high values between which the outliers are generated, the number of outliers, and a random state (again, the random state is very important for the test cases).
def pollute_dataset(datasets_, rs, n_outliers, low_, high_):  # add outliers.
    '''
    args: datasets_ -> list => containing n datasets (X, y)
          rs -> int => random state
          n_outliers -> int => number of outliers to put in each dataset
          low_ -> int => min possible value of generated outliers; arg for np.random.uniform
          high_ -> int => max possible value of generated outliers; arg for np.random.uniform
    Other:
        Return labels unchanged.
    '''
    polluted_data_sets = []
    # YOUR CODE HERE
    raise NotImplementedError()
    return polluted_data_sets
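A minimal sketch of one approach, assuming outliers are appended after the original points (consistent with the test below, which checks that the first rows are unchanged) and that a single seeded RandomState is shared across all datasets:
def _pollute_dataset_sketch(datasets_, rs, n_outliers, low_, high_):
    rng = np.random.RandomState(rs)
    polluted = []
    for X, y in datasets_:
        # Uniform outliers in the square [low_, high_] x [low_, high_].
        outliers = rng.uniform(low=low_, high=high_, size=(n_outliers, 2))
        polluted.append((np.concatenate([X, outliers], axis=0), y))
    return polluted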
polluted_datasets = pollute_dataset(data_sets, 42, n_outliers, -5, 4)
test_.eq_ (len(polluted_datasets), 3)
test_.ok_ ((np.isclose (polluted_datasets[0][0][:3], np.asarray(
[[ 1.50518593, 2.47038559],
[ 0.03868415, -0.4306421 ],
[-0.18048308, 0.5796649 ]
]))).all())
polluted_datasets = pollute_dataset(data_sets, 42, n_outliers, -5, 4)
def plot_datasets_with_outliers(polluted_datasets, fig_size=(5,10),):
    # YOUR CODE HERE
    raise NotImplementedError()
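Since all test cases for this cell are hidden, here is just one plausible sketch, assuming a vertical stack of one scatter subplot per polluted dataset (layout and styling are assumptions):
def _plot_polluted_sketch(polluted_datasets, fig_size=(5, 10)):
    fig, axes = plt.subplots(len(polluted_datasets), 1, figsize=fig_size)
    for i, (ax, (X, y)) in enumerate(zip(axes, polluted_datasets)):
        ax.scatter(X[:, 0], X[:, 1], s=10)
        ax.set_title('Dataset %d with outliers' % (i + 1))
    plt.tight_layout()
    plt.show()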
# All test cases hidden.
plot_datasets_with_outliers(polluted_datasets)
Expected Output¶
X1_pol = polluted_datasets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=1, X=X1_pol, dataset_id=1,
                                              with_outliers=True)
Expected Output¶
X2_pol = polluted_datasets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=1, X=X2_pol, dataset_id=2, with_outliers=True)
X3_pol = polluted_datasets[2][0]
_ = plot_decision_boundary_after_training_clf(clf_id=1, X=X3_pol, dataset_id=3, with_outliers=True)
- Observe how distorted the boundaries are because of the outliers.
- Also, note the three separate boundaries. The SVM is now wrongly including anomalies inside its decision boundary.
Isolation Forest¶
On the other hand, Isolation Forest is not so sensitive to outliers.
X1 = data_sets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X1, dataset_id=1)
Expected Output¶
X2 = data_sets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X2, dataset_id=2)
X3 = data_sets[2][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X3, dataset_id=3)
With Outliers (Isolation Forest is not too sensitive to outliers)¶
Notice the unchanged (relatively speaking) decision boundaries.
X1_pol = polluted_datasets[0][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X1_pol, dataset_id=1, with_outliers=True)
X2_pol = polluted_datasets[1][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X2_pol, dataset_id=2, with_outliers=True)
X3_pol = polluted_datasets[2][0]
_ = plot_decision_boundary_after_training_clf(clf_id=2, X=X3_pol, dataset_id=3, with_outliers=True)
Expected Output¶