Test 7
Before you turn this problem in, make sure everything runs as expected. First, restart the kernel (in the menubar, select Kernel$\rightarrow$Restart) and then run all cells (in the menubar, select Cell$\rightarrow$Run All).
Make sure you fill in any place that says YOUR CODE HERE
or "YOUR ANSWER HERE", as well as your name and collaborators below:
NAME = ""
COLLABORATORS = ""
Advanced Experiments¶
In this test, we'll analyze when each sampling technique performs better, something we only hinted at in the tutorial.
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_gaussian_quantiles
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import nose.tools as test_  # For testing
from collections import Counter
from sklearn.model_selection import StratifiedShuffleSplit
# Useful in beautifying numpy arrays.
from IPython.display import HTML, display
import tabulate
def pp(a, show_head=True):
    '''
    args: show_head -> if True, print only the first 5 rows.
    return: None
    '''
    if a.ndim < 2:
        a = [a]
    if show_head:
        display(HTML(tabulate.tabulate(a[:5], tablefmt='html')))
        return
    display(HTML(tabulate.tabulate(a, tablefmt='html')))
Let's copy the take-away message from the tutorial here for reference:
Whether to oversample, undersample, or not sample at all?
So, what should we do: oversampling, undersampling, or no sampling at all? There is also a hybrid approach (both oversampling and undersampling), as well as other techniques such as the Synthetic Minority Over-sampling Technique (SMOTE). It turns out there is no easy answer to which technique you should use. It depends on your metric (accuracy, type 1 error, etc.), on your classifier (Logistic Regression, Neural Net, etc.), on how imbalanced your data is (the imbalance ratio), and so on, as argued by this recently published paper. You're highly encouraged to go through this easy-to-read paper, and especially to study its graphs.
The take-away message is that in the real world, with imbalanced data, you should usually consider various combinations of resampling techniques (whether to undersample, for instance) and base classification methods (whether to use an SVM, for instance). Don't assume undersampling is always the way to go.
In this test, we won't exhaustively cover many classifiers with different sampling combinations as is done in the paper. Instead, we'll use a Support Vector Machine only and observe how different metrics behave with no sampling, undersampling, and oversampling. We'll begin by generating a dataset, following an approach similar to the one described in the paper.
Data Generation¶
Our dataset consists of two classes. The conditional distribution of each class is a multivariate Gaussian with a common covariance matrix but a different mean vector. Each example has two features.
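As a rough illustration of that setup (not the generator we'll actually use below, and with made-up mean vectors), two such classes could be drawn directly from numpy like this:
# Illustrative sketch only: two Gaussians sharing one covariance matrix but with
# different (made-up) mean vectors. The test itself uses sklearn's make_gaussian_quantiles.
_rng_demo = np.random.RandomState(0)
_cov_demo = np.eye(2)                                             # common covariance matrix
_class0_demo = _rng_demo.multivariate_normal([0.0, 0.0], _cov_demo, size=5)
_class1_demo = _rng_demo.multivariate_normal([0.5, 0.2], _cov_demo, size=5)
pp(_class0_demo)
pp(_class1_demo)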
Question 1¶
Let's start by generating mean vectors. In the function below, return an ndarray of shape (2,) with random values between 0 and 1. We need a mean vector of length 2, equal to the number of features. For testing, make sure the function returns the same random vector for a given random state by using a RandomState.
Hint: You may have to Google how to set up a RandomState if it's unclear.
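If RandomState is new to you, here is a minimal sketch of the idea: seeding it makes the draws reproducible (the shape and values below are purely illustrative, not the ones the tests expect).
# Two RandomState objects seeded identically produce identical draws.
_rs_a = np.random.RandomState(7)
_rs_b = np.random.RandomState(7)
print(_rs_a.rand(3))
print(_rs_b.rand(3))   # same three numbers as above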
def generate_mean_vector(rs=42):
    '''
    args: rs -> int => random state
    return: numpy.ndarray -> shape (2,)
    Note: we want to generate the same random vector on every function call;
    that's why we passed rs (random state) as an argument.
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
test_.ok_(len(generate_mean_vector()) == 2)
test_.ok_(generate_mean_vector()[0] > 0)
# With same random state, each call should return the same vector.
first_call_random_V = generate_mean_vector(rs=42)
second_call_random_V = generate_mean_vector(rs=42)
# should be same since random state is same:
test_.ok_(np.isclose(first_call_random_V, second_call_random_V).all())
# With different random states, each call should return different vectors.
first_call_random_V = generate_mean_vector(rs=42)
second_call_random_V = generate_mean_vector(rs=45)
test_.ok_(not np.isclose(first_call_random_V, second_call_random_V).all())
mean_vector_class0 = generate_mean_vector()
mean_vector_class1 = generate_mean_vector()
Question 2¶
In the function below, use the sklearn function make_gaussian_quantiles to generate n samples for a class, given the mean vector for that class.
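If you haven't used it before, here is a small sketch of how make_gaussian_quantiles is typically called; the argument values are illustrative, not the ones the tests expect.
# Illustrative call: returns a (features, labels) tuple.
_X_demo, _y_demo = make_gaussian_quantiles(mean=[0.0, 0.0], n_samples=10,
                                           n_features=2, n_classes=1,
                                           random_state=0)
print(_X_demo.shape, _y_demo.shape)   # (10, 2) (10,)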
def generate_samples_for_a_class(mean_vector, n_samples, rs):
    '''
    args: mean_vector -> ndarray -> shape (2,)
          n_samples -> int => number of samples to generate
          rs -> int => random state
    return: tuple of length 2 whose first element is an ndarray
            containing features of shape (n_samples, 2) and whose
            second element is an ndarray containing labels of shape
            (n_samples,)
    Pass '1' for the n_classes argument, since all samples will belong to
    a single class.
    Leave the other arguments at their default values. This will also automatically
    use an identity matrix as the covariance matrix.
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
(test_.ok_(np.isclose(generate_samples_for_a_class([.5,.2], 1000, 42)[0][:5], \
np.asarray( [[ 2.14496771, -0.04903604],
[ 1.68947049, -1.02760782],
[ 0.56980208, -0.1853136 ],
[ 2.346637 , -0.87008477],
[ 0.86163603, -0.44511975]])).all()))
test_.eq_(generate_samples_for_a_class([.5,.2], 1000, \
42)[0].shape, (1000, 2))
n_samples_class0 = 1000 # arbitrary number
Generate samples for both classes¶
gaussian_data_class0 = generate_samples_for_a_class(mean_vector_class0, \
                                                    n_samples_class0, 42)
features_class0 = gaussian_data_class0[0]
labels_class0 = gaussian_data_class0[1]
pp(features_class0)  # features head
pp(labels_class0)  # all labels
How many samples should we generate for the other class, class1, so that the dataset is imbalanced? We can quantify the imbalance with the imbalance ratio IR:
$$ IR = \frac{\#\ \text{samples in majority class}}{\#\ \text{samples in minority class}} $$
Say class0 is our majority class.
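For example, with IR = 128 and 1000 majority-class samples, the minority class gets int(1000 / 128) = 7 samples, which is exactly what the first test below checks.
int(1000 / 128)   # -> 7 minority-class samples for IR = 128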
Question 3¶
Given an imbalance ratio IR and the number of samples in the majority class, determine the number of samples in the minority class in the function below. Round your answer down to an integer using int().
def minority_class_samples_count(IR, n_samples_majority):
    '''
    Find the number of samples in the minority class.
    args: IR -> int
          n_samples_majority -> int
    return: int
    Make sure you check for an invalid IR:
    return 0 if IR is less than 1.
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
test_.eq_(minority_class_samples_count(128, 1000), 7)
test_.eq_(minority_class_samples_count(256, 10000), 39)
n_samples_class1 = minority_class_samples_count(128, n_samples_class0)
gaussian_data_class1 = generate_samples_for_a_class(mean_vector_class1, \
                                                    n_samples_class1, 42)
features_class1 = gaussian_data_class1[0]
# Let's overwrite the labels to 1s instead of 0s for class1.
# The labels returned by generate_samples_for_a_class are all 0s by default.
labels_class1 = np.asarray([1 for i in range(n_samples_class1)])
pp(features_class1)
Counter(labels_class1)
Counter(labels_class0)
There are 7 examples in class 1 and 1000 examples in class 0 at this point.
Question 4¶
Now, combine the features of class 0 and class 1 to form a single dataset of features.
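One common way to stack 2-D feature arrays row-wise is np.vstack (np.concatenate along axis 0 behaves the same); a tiny sketch with made-up arrays:
_a_demo = np.zeros((3, 2))
_b_demo = np.ones((4, 2))
print(np.vstack([_a_demo, _b_demo]).shape)   # (7, 2)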
def merge_features(features_class1, features_class0):
    '''
    Combine features.
    args: features_class1 -> ndarray -> shape (a, 2)
          features_class0 -> ndarray -> shape (b, 2)
    return: ndarray -> shape (a+b, 2)
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
test_.eq_(merge_features(features_class1, features_class0).shape, (1007, 2))
X = merge_features(features_class1, features_class0)
pp(X)
Question 5¶
Now, combine the labels of class 0 and class 1 to form an ndarray of labels corresponding to the merged dataset.
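For 1-D arrays, np.concatenate (or np.hstack) does the analogous join; a tiny made-up example:
print(np.concatenate([np.ones(3, dtype=int), np.zeros(4, dtype=int)]))   # length-7 label array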
def merge_labels(labels_class1, labels_class0):
    '''
    Combine labels.
    args: labels_class1 -> ndarray -> shape (a,)
          labels_class0 -> ndarray -> shape (b,)
    return: ndarray -> shape (a+b,)
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
test_.eq_(merge_labels(labels_class1, labels_class0).shape, (1007,))
y = merge_labels(labels_class1, labels_class0)
pp(y)
Train Test Split¶
Solution Provided
The data is highly imbalanced, so with a naive random split it's likely that one part of the split ends up with only one class. We provide a function that splits the data in a stratified way, so that both classes are present in both X_train and X_test.
def __train_test_split(X, y):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = __train_test_split(X, y)
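A quick sanity check that the stratified split really kept both classes in each part:
# Both Counters should contain the keys 0 and 1.
print(Counter(y_train), Counter(y_test))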
Ungraded Question¶
Solution Provided
Train an SVC using your newly created features and labels. You have implemented this function before, so we provide its solution as well.
def train_svc(X_train, y_train):
    _svc_clf_non_linear = SVC()
    _svc_clf_non_linear.fit(X_train, y_train)
    return _svc_clf_non_linear
_svc = train_svc(X_train, y_train)
def predict_test_data_labels(X_test, svc_):
    y_pred = svc_.predict(X_test)
    return y_pred
y_pred = predict_test_data_labels(X_test, _svc)
Question 6¶
In the function below, given true labels and predicted labels, return the following metrics (in a dict with 7 keys named as follows):
- accuracy
- precision_score
- recall_score
- f1_score_macro
- f1_score_micro
- f1_score_weighted
- auc_score
Read Sklearn Docs.
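If the metric functions are unfamiliar, here is a minimal sketch of their call signatures on made-up labels (the numbers are only illustrative; zero_division is available in reasonably recent scikit-learn versions):
_y_true_demo = np.array([0, 0, 1, 1])
_y_pred_demo = np.array([0, 1, 1, 1])
print(accuracy_score(_y_true_demo, _y_pred_demo))
print(precision_score(_y_true_demo, _y_pred_demo, zero_division=1))
print(recall_score(_y_true_demo, _y_pred_demo))
print(f1_score(_y_true_demo, _y_pred_demo, average='macro'))   # also 'micro' and 'weighted'
print(roc_auc_score(_y_true_demo, _y_pred_demo))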
def get_metrics(y_test, y_pred):
    '''
    Compute metrics and put them in a dict.
    args: y_test -> ndarray -> shape (a,)
          y_pred -> ndarray -> shape (a,)
    return: a dict with the following string keys: "accuracy",
            "precision_score",
            "recall_score",
            "f1_score_macro",
            "f1_score_micro",
            "f1_score_weighted",
            "auc_score"
            The value against each key is the corresponding metric (float).
    Other:
        For precision, set 'zero_division=1' to silence warnings;
        read the docs to learn what setting it to 1 means.
    '''
    metrics = {}
    # YOUR CODE HERE
    raise NotImplementedError()
    return metrics
_met = get_metrics(y_test, y_pred)
test_.ok_(list(get_metrics(y_test, y_pred).keys()) == ['accuracy',
                                                       'precision_score',
                                                       'recall_score',
                                                       'f1_score_macro',
                                                       'f1_score_micro',
                                                       'f1_score_weighted',
                                                       'auc_score'])
test_.ok_(np.isclose(_met['f1_score_macro'], 0.4987593052109181))
metrics_no_sampling = get_metrics(y_test, y_pred)
metrics_no_sampling
Take a moment to observe the performance with no sampling at all. Now let's resample, starting with undersampling.
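Both resamplers from imblearn follow the same fit_resample pattern (assuming a recent imblearn; older versions called it fit_sample). A small sketch on made-up data:
# Made-up imbalanced toy data: 8 samples of class 0, 2 of class 1.
_X_toy = np.arange(20).reshape(10, 2)
_y_toy = np.array([0] * 8 + [1] * 2)
_X_under, _y_under = RandomUnderSampler(random_state=0).fit_resample(_X_toy, _y_toy)
_X_over, _y_over = RandomOverSampler(random_state=0).fit_resample(_X_toy, _y_toy)
print(Counter(_y_under))   # both classes reduced to the minority count
print(Counter(_y_over))    # both classes grown to the majority count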
def undersample(X, y, rs):
    '''
    args: X -> ndarray -> shape (k, 2)
          y -> ndarray -> shape (k,)
          rs -> int => random state
    return: tuple -> (X_undersampled, y_undersampled)
            X_undersampled -> ndarray -> shape (j, 2) => features with reduced majority class samples
            y_undersampled -> ndarray -> shape (j,) => labels with reduced majority class samples
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
test_.ok_(np.isclose(undersample(X, y, 42)[0][:5], np.asarray([[ 0.74568599, 0.34672912],
[ 0.03976479, 0.54706584],
[-1.25300232, 0.99879925],
[ 0.14968414, 0.30071173],
[-0.38871904, -0.85416779]])).all())
test_.eq_(undersample(X, y, 42)[1].shape, (14,))
Counter((undersample(X, y, 42)[1])) # 7 Examples in each class.
X_us, y_us = undersample(X, y, 42)
pp(X_us)
pp(y_us)
Train Test Split¶
X_train_us, X_test_us, y_train_us, y_test_us = __train_test_split(X_us, y_us)
Train SVC on Undersampled Data¶
svc_undersampled = train_svc(X_train_us, y_train_us) # calling the function defined above.
y_pred_us = predict_test_data_labels(X_test_us, svc_undersampled)
Metrics on Undersampled Data¶
metrics_undersampling = get_metrics(y_test_us, y_pred_us)
metrics_undersampling
Notice whether the precision and recall scores increased and the other metrics decreased after undersampling.
svc_undersampled.score(X_test_us, y_test_us)
def oversample(X, y, rs):
    '''
    args: X -> ndarray -> shape (k, 2)
          y -> ndarray -> shape (k,)
          rs -> int => random state
    return: tuple -> (X_oversampled, y_oversampled)
            X_oversampled -> ndarray -> shape (j, 2) => features with increased minority class samples
            y_oversampled -> ndarray -> shape (j,) => labels with increased minority class samples
    '''
    # YOUR CODE HERE
    raise NotImplementedError()
test_.ok_(np.isclose(oversample(X, y, 42)[0][:5], np.asarray([[ 0.87125427, 0.81245001],
[ 1.02222866, 2.47374416],
[ 0.61650239, -0.96256594],
[-0.08887757, 0.48498455],
[ 1.95375293, 1.71814904]])).all())
test_.eq_(oversample(X, y, 42)[1].shape, (2000,))
Counter((oversample(X, y, 42)[1])) # 1000 examples each.
X_os, y_os = oversample(X, y, 42)
pp(X_os)  # only head displayed
pp(y_os)
X_train_os, X_test_os, y_train_os, y_test_os = __train_test_split(X_os, y_os)
svc_oversampled = train_svc(X_train_os, y_train_os)
y_pred_os = predict_test_data_labels(X_test_os, svc_oversampled)
metrics_oversampling = get_metrics(y_test_os, y_pred_os)
metrics_oversampling
svc_oversampled.score(X_test_os, y_test_os)
Observe how the metrics compare across the three settings:
print('metrics No Sampling:', metrics_no_sampling)
print()
print('metrics_undersampling:', metrics_undersampling)
print()
print('metrics_oversampling:', metrics_oversampling)
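As an optional extra (not part of the graded work), the three metric dicts can be laid out side by side with the tabulate helper that's already imported:
_rows = [[name] + [round(m[k], 3) for k in metrics_no_sampling]
         for name, m in [('no sampling', metrics_no_sampling),
                         ('undersampling', metrics_undersampling),
                         ('oversampling', metrics_oversampling)]]
display(HTML(tabulate.tabulate(_rows,
                               headers=['technique'] + list(metrics_no_sampling.keys()),
                               tablefmt='html')))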