Test 4
Before you turn this problem in, make sure everything runs as expected. First, restart the kernel (in the menubar, select Kernel$\rightarrow$Restart) and then run all cells (in the menubar, select Cell$\rightarrow$Run All).
Make sure you fill in any place that says YOUR CODE HERE or "YOUR ANSWER HERE", as well as your name and collaborators below:
NAME = ""
COLLABORATORS = ""
Feel free to go online; in fact, we encourage you to read documentation where needed. However, you may not collaborate with anybody. To certify that you didn't collaborate with anyone, write 'Nobody' in COLLABORATORS above.
Classification¶
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
MNIST dataset¶
Finding a dataset is the first step in building any machine learning algorithm. As a machine learning engineer, you may at times need to create your own dataset and label each example, which can get highly expensive in terms of both time and money. For example, you could start taking pictures with your camera and save each image with a label against it, e.g. 'cat'. Theoretically, you could create a giant dataset with a picture of every object; ImageNet is one such dataset, with 14 million images. Here, you won't create a dataset; instead, you'll find one already on the web. Specifically, you'll find the MNIST dataset, a dataset of handwritten digits. See a few examples below.
Question 1 (5 points)¶
Download MNIST dataset¶
Download the MNIST dataset from sklearn.datasets. Feel free to Google/StackOverflow.
def download_data():
'''
args: None
return: dict with at least two keys. The first key is 'data' of shape (70000, 784)
(i.e., it contains 70,000 examples (images of digits), each with 784 pixels).
The second key is 'target' of shape (70000,), which contains the label for each example.
Staff's solution is two lines of code.
This function may take a few seconds to execute.
'''
# YOUR CODE HERE
raise NotImplementedError()
mnist = download_data() # takes a few seconds.
assert mnist.target.shape == (70000,)
assert mnist.data.shape == (70000,784)
assert mnist.data[0].shape == (784,)
assert list(mnist.keys()) == ['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url']
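For reference, here is a minimal sketch of one possible solution. It assumes the OpenML mirror of MNIST ('mnist_784') and as_frame=False so that 'data' and 'target' come back as ndarrays; the staff solution may differ.
from sklearn.datasets import fetch_openml

def download_data():
    # fetch the 70,000-example MNIST dataset from OpenML (takes a few seconds)
    return fetch_openml('mnist_784', version=1, as_frame=False)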
Question 2 (3 points)¶
Convert the string labels to integer labels.
def labels_to_int_dtype(mnist):
'''
args: dict (mnist containing above keys)
returns: ndarray of shape (70000,) -> the labels (in mnist at key 'target') are strings, e.g. '5', '3'.
Return an ndarray with integer labels, e.g. 5, 3.
Staff's solution contains two lines of code. Using map(lambda) here is good practice but
not required; you can also use a list comprehension.
'''
# YOUR CODE HERE
raise NotImplementedError()
assert labels_to_int_dtype(mnist).shape == (70000,)
labels = labels_to_int_dtype(mnist)
labels.shape # (70000,)
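A minimal sketch of one way to do this, using map(lambda) as the hint suggests (mnist['target'].astype(int) would also work):
def labels_to_int_dtype(mnist):
    # map each string label (e.g. '5') to an int (e.g. 5), then pack into an ndarray
    return np.asarray(list(map(lambda s: int(s), mnist['target'])))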
Transition to a Smaller Dataset¶
The questions above tested you on finding real-world data and doing some preprocessing. It turns out MNIST is a large dataset, and some of the algorithms you'll implement in this test might take more than a few minutes on it, which we don't want considering that it's a test. To this end, we'll load another, smaller digits dataset, and you'll work with that to expedite training and testing. We provide the code for that. In MNIST each image has 784 features/dims/pixels, whereas in the smaller dataset there are only 64, so images might appear blurry.
digits_dict = load_digits()
_data = digits_dict.data
labels = digits_dict.target
Split the Dataset into Train and Test Splits¶
You should be comfortable with this from the previous test. Therefore, it's already provided for you.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(_data, labels, \
test_size=.2, random_state=42)
# A sanity check
assert x_train.shape[0] == y_train.shape[0] and x_test.shape[0] == y_test.shape[0]
Question 3¶
4 points
Visualize Digits¶
Each example in our train/test data contains an image of a digit (currently as an ndarray). Let's visualize a few examples. You will plot n digits as subplots of the current figure instead of plotting each digit in a separate figure. Note that each image/digit is an ndarray of shape (64,); you'll have to reshape it (e.g., to (8, 8)) to fit your needs.
def plot_digits(x, y, _fig_size=(20,4)):
'''
args: x -> ndarray of shape (m,n) -> (m examples/digits each of shape n). Think of x
as truncated x_train or truncated x_test
y -> ndarray of shape (m,) -> (corresponding labels of x)
_fig_size: tuple (a,b) -> (width, height in inches of the figure to be created)
return: None (just observe the digits images yourself)
'''
# YOUR CODE HERE
raise NotImplementedError()
plot_digits(x_train[:5], y_train[:5])
# We'll use sophisticated hidden tests to check plots.
# Make sure you follow the instructions and you're good to go.
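One possible sketch of plot_digits, assuming each row of x reshapes to an (8, 8) image as in the smaller digits dataset:
def plot_digits(x, y, _fig_size=(20, 4)):
    plt.figure(figsize=_fig_size)
    for i, (image, label) in enumerate(zip(x, y)):
        plt.subplot(1, len(x), i + 1)                      # n digits side by side in one figure
        plt.imshow(image.reshape(8, 8), cmap=plt.cm.gray)  # each row is a flattened 8x8 image
        plt.title('Label: {}'.format(label), fontsize=15)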
Scaling Features¶
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.transform(x_test) # transform only: reuse the scaler already fit on the training data
Question 4¶
Train a Logistic Regression Classifier¶
def fit_log_reg_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.linear_model._logistic.LogisticRegression object which can predict.
'''
# YOUR CODE HERE
raise NotImplementedError()
logst_reg = fit_log_reg_model(scaled_x_train, y_train)
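A minimal sketch; max_iter is raised here only as a precaution against convergence warnings (an assumption, not a requirement):
def fit_log_reg_model(x_train, y_train):
    # LogisticRegression was imported at the top of the notebook
    return LogisticRegression(max_iter=1000).fit(x_train, y_train)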
Score¶
Find the model's score on the test data. Note that for classifiers, score returns the mean accuracy (not the coefficient of determination, which applies to regressors).
score = logst_reg.score(scaled_x_test, y_test)
score
logst_reg.predict(scaled_x_test[30].reshape(1,-1)) # predict on a single (scaled) test example
score = logst_reg.score(scaled_x_test, y_test)
assert score > .96
Predictions¶
predictions = logst_reg.predict(scaled_x_test)
Misclassified Labels¶
Store the indexes of the images (digits) in the test data that were misclassified (i.e., whose correct label was not predicted).
def misclassified_idx(y_test, predictions):
'''
args: y_test: actual labels -> ndarray
predictions: predicted labels -> ndarray
return: ndarray (note that you'll work with a Python list; we provide the code to convert your list to an ndarray)
'''
misclassifiedIndexes = [] # empty list
# YOUR CODE HERE
raise NotImplementedError()
return np.asarray(misclassifiedIndexes)
misclassified_idx(y_test, predictions)
assert len(misclassified_idx(y_test, predictions)) == 13
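A hedged sketch using a list comprehension over mismatched labels:
def misclassified_idx(y_test, predictions):
    # collect every index where the predicted label differs from the true label
    misclassifiedIndexes = [i for i in range(len(y_test)) if y_test[i] != predictions[i]]
    return np.asarray(misclassifiedIndexes)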
Plot Misclassified Digits¶
misclassifiedIndexes = misclassified_idx(y_test, predictions)
plt.figure(figsize=(20,4))
for plotIndex, badIndex in enumerate(misclassifiedIndexes[0:5]):
plt.subplot(1, 5, plotIndex + 1)
plt.imshow(np.reshape(x_test[badIndex], (8,8)), cmap=plt.cm.gray)
plt.title('Predicted: {}, Actual: {}'.format(predictions[badIndex], y_test[badIndex]), fontsize = 15)
Question 5¶
Train a k-Nearest Neighbors Classifier¶
You'll now implement kNN. Look up the sklearn documentation and see how we can do classification using kNN. Import the relevant library below. Note for the library function: use 10 neighbors (n_neighbors=10) and leave the default values for the other parameters.
# import any packages you need.
# YOUR CODE HERE
raise NotImplementedError()
def fit_kNN_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.neighbors._classification.KNeighborsClassifier object which can predict.
'''
# YOUR CODE HERE
raise NotImplementedError()
kNN_model = fit_kNN_model(scaled_x_train, y_train)
kNN_model.score(scaled_x_test, y_test)
neigh_score = kNN_model.score(scaled_x_test, y_test) # recall, this step is very expensive
assert neigh_score > .95
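A minimal sketch: KNeighborsClassifier with 10 neighbors, defaults otherwise.
from sklearn.neighbors import KNeighborsClassifier

def fit_kNN_model(x_train, y_train):
    # n_neighbors=10 per the instructions; all other parameters left at their defaults
    return KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)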
Ungraded Question¶
Observe the images misclassified by kNN, as we did for logistic regression.
Question 6¶
Train a Linear Support Vector Classifier¶
# import any packages you need.
# YOUR CODE HERE
raise NotImplementedError()
def fit_svm_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.svm._classes.LinearSVC object which can predict.
'''
# YOUR CODE HERE
raise NotImplementedError()
svm_model = fit_svm_model(scaled_x_train, y_train) # ignore any warnings.
svm_model.score(scaled_x_test, y_test)
score_svm = svm_model.score(scaled_x_test, y_test)
print(score_svm)
assert score_svm > .95
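A minimal sketch with LinearSVC at its default settings (it may emit convergence warnings, which can be ignored as noted above):
from sklearn.svm import LinearSVC

def fit_svm_model(x_train, y_train):
    # default parameters; LinearSVC fits a linear multi-class SVM (one-vs-rest)
    return LinearSVC().fit(x_train, y_train)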
Question 7¶
2 points
Train a Kernel Support Vector Classifier.¶
You'll now implement a kernel SVC. Look up the sklearn documentation and see how we can do classification using a kernel SVC. Import the relevant library below. Specifically, implement a non-linear SVC with the RBF kernel, and leave the default values for the other parameters.
# import any packages you need.
# YOUR CODE HERE
raise NotImplementedError()
def fit_kernel_svm_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.svm._classes.SVC object which can predict.
'''
# YOUR CODE HERE
raise NotImplementedError()
kernel_svm = fit_kernel_svm_model(scaled_x_train, y_train)
kernel_svm.score(scaled_x_test, y_test)
score_k_svm = kernel_svm.score(scaled_x_test, y_test)
assert score_k_svm > .97
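A minimal sketch; 'rbf' is already the default kernel for SVC, so passing it explicitly just documents the intent:
from sklearn.svm import SVC

def fit_kernel_svm_model(x_train, y_train):
    # non-linear SVC with the RBF kernel, everything else at defaults
    return SVC(kernel='rbf').fit(x_train, y_train)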
Question 8¶
Train a Decision Tree Classifier¶
# import any packages you need.
# YOUR CODE HERE
raise NotImplementedError()
def fit_dt_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.tree._classes.DecisionTreeClassifier object which can predict.
'''
# YOUR CODE HERE
raise NotImplementedError()
dt_model = fit_dt_model(scaled_x_train, y_train)
dt_model.score(scaled_x_test, y_test)
score_dt = dt_model.score(scaled_x_test, y_test)
assert score_dt > .84
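A minimal sketch with DecisionTreeClassifier at its defaults (note that tree accuracy can vary run to run unless a random_state is fixed):
from sklearn.tree import DecisionTreeClassifier

def fit_dt_model(x_train, y_train):
    # default parameters; consider passing random_state for reproducibility
    return DecisionTreeClassifier().fit(x_train, y_train)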
Ungraded Question¶
Which model performed the best in terms of score (mean accuracy)?
Question 9¶
Compute the confusion matrix for svm_model (the one you implemented in Q-6).
# Import any libraries you need.
# YOUR CODE HERE
raise NotImplementedError()
def get_confusion_matrix(x_test):
'''
args: x_test -> ndarray
return: ndarray (the confusion matrix)
'''
# YOUR CODE HERE
raise NotImplementedError()
assert (get_confusion_matrix(scaled_x_test)[:5] == np.array([[33, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 28, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 1, 32, 0, 1, 0, 0, 0, 0],
[ 0, 1, 0, 0, 45, 0, 0, 0, 0, 0]])).all()
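A hedged sketch; it assumes the y_test and svm_model defined in the cells above are still in scope:
from sklearn.metrics import confusion_matrix

def get_confusion_matrix(x_test):
    # rows are true labels, columns are svm_model's predicted labels
    return confusion_matrix(y_test, svm_model.predict(x_test))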