import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets 
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits


def download_data():
    '''
    Fetch the MNIST digits dataset ('mnist_784') from OpenML.

    args: None
    return: dict-like object with at least two keys. The first key is 'data', an ndarray of
    shape (70000, 784) (i.e. it contains 70,000 examples (images of digits) and each example
    contains 784 pixels). The second key is 'target', an ndarray of shape (70000,) which
    contains the label of each example.

    as_frame=False is passed explicitly so that 'data' and 'target' are ndarrays regardless
    of the library's default; positional indexing such as mnist.data[0] relies on that.

    Staff's solution is two lines of code.

    This function downloads over the network and may take a few seconds to execute.
    '''
    ### BEGIN SOLUTION
    mnist = sklearn.datasets.fetch_openml('mnist_784', as_frame=False)
    return mnist
    ### END SOLUTION


# Runs the download at module level. The interrupted run recorded below was
# still blocked reading the HTTPS response when it was cancelled.
mnist = download_data() # takes a few seconds.

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-3-1d435d1ea221> in <module>
----> 1 mnist = download_data() # takes a few seconds.

<ipython-input-2-97a7053a1781> in download_data()
     11     '''
     12     ### BEGIN SOLUTION
---> 13     mnist = sklearn.datasets.fetch_openml('mnist_784')
     14     return mnist
     15     ### END SOLUTION

/opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

/opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in fetch_openml(name, version, data_id, data_home, target_column, cache, return_X_y, as_frame)
    773 
    774     # download data features, meta-info about column types
--> 775     features_list = _get_data_features(data_id, data_home)
    776 
    777     if not as_frame:

/opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _get_data_features(data_id, data_home)
    409     url = _DATA_FEATURES.format(data_id)
    410     error_message = "Dataset with data_id {} not found.".format(data_id)
--> 411     json_data = _get_json_content_from_openml_api(url, error_message, True,
    412                                                   data_home)
    413     return json_data['data_features']['feature']

/opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _get_json_content_from_openml_api(url, error_message, raise_if_error, data_home)
    159 
    160     try:
--> 161         return _load_json()
    162     except HTTPError as error:
    163         # 412 is an OpenML specific error code, indicating a generic error

/opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in wrapper(*args, **kw)
     51                 return f(*args, **kw)
     52             try:
---> 53                 return f(*args, **kw)
     54             except HTTPError:
     55                 raise

/opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _load_json()
    155     @_retry_with_clean_cache(url, data_home)
    156     def _load_json():
--> 157         with closing(_open_openml_url(url, data_home)) as response:
    158             return json.loads(response.read().decode("utf-8"))
    159 

/opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _open_openml_url(openml_path, data_home)
    110                 else:
    111                     with gzip.GzipFile(local_path, 'wb') as fdst:
--> 112                         shutil.copyfileobj(fsrc, fdst)
    113         except Exception:
    114             if os.path.exists(local_path):

/opt/anaconda3/lib/python3.8/shutil.py in copyfileobj(fsrc, fdst, length)
    200     fdst_write = fdst.write
    201     while True:
--> 202         buf = fsrc_read(length)
    203         if not buf:
    204             break

/opt/anaconda3/lib/python3.8/http/client.py in read(self, amt)
    456             # Amount is given, implement using readinto
    457             b = bytearray(amt)
--> 458             n = self.readinto(b)
    459             return memoryview(b)[:n].tobytes()
    460         else:

/opt/anaconda3/lib/python3.8/http/client.py in readinto(self, b)
    500         # connection, and the user is reading more bytes than will be provided
    501         # (for example, reading in 1k chunks)
--> 502         n = self.fp.readinto(b)
    503         if not n and b:
    504             # Ideally, we would raise IncompleteRead if the content-length

/opt/anaconda3/lib/python3.8/socket.py in readinto(self, b)
    667         while True:
    668             try:
--> 669                 return self._sock.recv_into(b)
    670             except timeout:
    671                 self._timeout_occurred = True

/opt/anaconda3/lib/python3.8/ssl.py in recv_into(self, buffer, nbytes, flags)
   1239                   "non-zero flags not allowed in calls to recv_into() on %s" %
   1240                   self.__class__)
-> 1241             return self.read(nbytes, buffer)
   1242         else:
   1243             return super().recv_into(buffer, nbytes, flags)

/opt/anaconda3/lib/python3.8/ssl.py in read(self, len, buffer)
   1097         try:
   1098             if buffer is not None:
-> 1099                 return self._sslobj.read(len, buffer)
   1100             else:
   1101                 return self._sslobj.read(len)

KeyboardInterrupt:


# Shape checks: 70,000 examples, 784 pixels each, one label per example.
assert mnist.target.shape == (70000,)
assert mnist.data.shape == (70000,784)
assert mnist.data[0].shape == (784,)
# download_data only promises "at least" the 'data' and 'target' keys, so check
# for those two instead of pinning the exact key list and its order.
assert {'data', 'target'} <= set(mnist.keys())

### BEGIN HIDDEN TESTS
assert mnist.data[56].shape == (784,)
assert mnist.data[5356].shape == (784,)
### END HIDDEN TESTS


## 
def labels_to_int_dtype(mnist):
    '''
    Convert the string labels stored at mnist.target into integers.

    args: mnist -> dataset object exposing its labels as the attribute `target`
                   (labels are strings e.g. '5', '3')
    returns: ndarray holding one integer per label e.g. 5, 3
             (shape (70000,) for the full MNIST targets)

    Each label goes through int(), so anything int() accepts is converted.
    '''
    ### BEGIN SOLUTION
    return np.asarray([int(label) for label in mnist.target])
    ### END SOLUTION


# Public check: one converted label per MNIST example.
assert labels_to_int_dtype(mnist).shape == (70000,)

### BEGIN HIDDEN TESTS
# Spot-check a single converted label.
assert labels_to_int_dtype(mnist)[53] == 4
### END HIDDEN TESTS


# Keep the integer labels around and echo their shape.
labels = labels_to_int_dtype(mnist)
labels.shape # expected: (70000,)

(70000,)


# Switch to the dataset returned by load_digits(); from here on `labels` holds
# these targets rather than the MNIST labels computed earlier.
digits_dict = load_digits()
_data, labels = digits_dict.data, digits_dict.target


from sklearn.model_selection import train_test_split

# Hold out 20% of the examples for testing, with a fixed seed for the split.
x_train, x_test, y_train, y_test = train_test_split(
    _data,
    labels,
    test_size=.2,
    random_state=42,
)


# A sanity check: each split has as many labels as it has examples.
assert x_train.shape[0] == y_train.shape[0]
assert x_test.shape[0] == y_test.shape[0]


def plot_digits(x, y, _fig_size=(20,4)):
    '''
    Draw the digits in x side by side in a single row, each titled with its label from y.

    args: x -> ndarray of shape (m,n) -> (m examples/digits each of shape n); every row is
               reshaped to 8x8 for display. Think of x as truncated x_train or truncated x_test
          y -> ndarray of shape (m,) -> (corresponding labels of x)

          _fig_size: tuple (a,b) -> (width, height in inches of the figure to be created)

    return: None (the figure is drawn as a side effect; just observe the digit images yourself)
    '''

    ### BEGIN SOLUTION
    plt.figure(figsize=_fig_size)
    n_columns = len(x)
    for position, (pixels, digit) in enumerate(zip(x, y), start=1):
        # One row with a column per digit; subplot positions are 1-based.
        plt.subplot(1, n_columns, position)
        plt.imshow(pixels.reshape(8, 8), cmap=plt.cm.gray)
        plt.title('Digit: %i\n' % digit, fontsize=20)
    ### END SOLUTION


# Preview the first five training digits together with their labels.
plot_digits(x_train[:5], y_train[:5])


# We'll use sophisticated hidden tests to check plots. 
# Make sure you follow the instructions and you're good to go.

### BEGIN HIDDEN TESTS
# This test checks that plot_digits draws through plt.subplot, as per the instructions.

# Swap plt.subplot for a wrapper that records every call and then delegates to
# the real function, so the figure is still produced.
_real_subplot = plt.subplot
_subplot_calls = []


def _recording_subplot(*args, **kwargs):
    _subplot_calls.append((args, kwargs))
    return _real_subplot(*args, **kwargs)


plt.subplot = _recording_subplot

# try running the students' code
try:
    plot_digits(x_train[:5], y_train[:5])

# always restore the original function, even if the students' code raises
finally:
    plt.subplot = _real_subplot

# no recorded call means the students' function never used plt.subplot
if not _subplot_calls:
    raise AssertionError("plot_digits does not call plt.subplot")
### END HIDDEN TESTS


from sklearn.preprocessing import StandardScaler
# Standardize the pixel features of both splits.
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
# NOTE(review): fit_transform re-fits the scaler on x_test, so the test split is
# scaled with its own statistics instead of the training ones, and `scaler` is
# left fitted on x_test. The usual pattern is scaler.transform(x_test). Switching
# would change the scores, misclassified indexes and confusion matrix pinned by
# the asserts further down, so those would have to be regenerated - TODO confirm.
scaled_x_test = scaler.fit_transform(x_test)


def fit_log_reg_model(x_train, y_train):
    '''
    Train a logistic regression classifier created with default arguments.

    args: x_train-> ndarray (m, n) -> training examples
          y_train-> ndarray (m, ) -> training labels

    return: the fitted sklearn.linear_model._logistic.LogisticRegression object which can predict.
    '''
    ### BEGIN SOLUTION
    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    return classifier
    ### END SOLUTION


# Train on the standardized training split.
logst_reg = fit_log_reg_model(scaled_x_train, y_train)


# Score the model on the standardized test split and echo the result.
score = logst_reg.score(scaled_x_test, y_test)
score

0.9638888888888889


# The model was fitted on standardized features, so predict on the standardized
# row as well; the raw pixels of x_test[30] gave a different answer.
logst_reg.predict(scaled_x_test[30].reshape(1,-1))

array([4])


# Public check: test-split score of the logistic regression model.
score = logst_reg.score(scaled_x_test, y_test)
assert score > .96

### BEGIN HIDDEN TESTS
# Standardized test example 30 must be classified as a 4.
assert logst_reg.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS


# Logistic-regression predictions for the whole standardized test split.
predictions = logst_reg.predict(scaled_x_test)


def misclassified_idx(y_test, predictions):
    '''
    Find the positions at which the predicted label differs from the actual label.

    args: y_test: actual labels -> ndarray
          predictions: predicted labels  -> ndarray (compared with y_test position by position)

    return: ndarray of integer positions in increasing order. The dtype is integer even when
            nothing is misclassified, so the result can always be used to index another array.
            (Note that you'll work with Python list. We provide code to convert your list to ndarray)
    '''
    ### BEGIN SOLUTION
    misclassifiedIndexes = [
        index
        for index, (label, predict) in enumerate(zip(y_test, predictions))
        if label != predict
    ]
    ### END SOLUTION
    # dtype=int: an empty list would otherwise become a float64 array, which numpy
    # refuses to accept as an index.
    return np.asarray(misclassifiedIndexes, dtype=int)


# Assign idxs before slicing it so the file runs top to bottom. Only 13 test
# digits are misclassified, so this slice is empty.
idxs = misclassified_idx(y_test, predictions)
idxs[50:60]

array([], dtype=int64)


misclassified_idx(y_test, predictions)

array([ 52,  71, 124, 133, 149, 159, 193, 222, 234, 239, 244, 262, 339])


# Public check: number of misclassified test digits.
assert len(misclassified_idx(y_test, predictions)) == 13

### BEGIN HIDDEN TESTS
# The exact misclassified positions must match the reference run.
idxs = misclassified_idx(y_test, predictions)
assert (idxs == np.array([52,  71, 124, 133, 149, 159, 193, 222, 234, 239, 244, 262, 339])).all()
### END HIDDEN TESTS


misclassifiedIndexes = misclassified_idx(y_test, predictions)


# Show the first five misclassified test digits (raw, unscaled pixels) in one
# row, each captioned with the predicted and the actual label.
plt.figure(figsize=(20,4))
for plotIndex, badIndex in enumerate(misclassifiedIndexes[:5]):
    wrong_pixels = x_test[badIndex].reshape(8, 8)
    caption = 'Predicted: {}, Actual: {}'.format(predictions[badIndex], y_test[badIndex])
    plt.subplot(1, 5, plotIndex + 1)
    plt.imshow(wrong_pixels, cmap=plt.cm.gray)
    plt.title(caption, fontsize=15)


# import any packages you need.
### BEGIN SOLUTION
from sklearn.neighbors import KNeighborsClassifier
### END SOLUTION


def fit_kNN_model(x_train, y_train):
    '''
    Train a k-nearest-neighbours classifier configured with n_neighbors=10.

    args: x_train-> ndarray (m, n) -> training examples
          y_train-> ndarray (m, ) -> training labels

    return: the fitted sklearn.neighbors._classification.KNeighborsClassifier object which can predict.
    '''
    ### BEGIN SOLUTION
    knn_clf = KNeighborsClassifier(n_neighbors=10)
    knn_clf.fit(x_train, y_train)
    return knn_clf
    ### END SOLUTION


# Train the k-nearest-neighbours model on the standardized training split.
kNN_model = fit_kNN_model(scaled_x_train, y_train)


# Echo its score on the standardized test split.
kNN_model.score(scaled_x_test, y_test)

0.9611111111111111


# Public check: test-split score of the k-nearest-neighbours model.
neigh_score = kNN_model.score(scaled_x_test, y_test) # recall, this step is very expensive
assert neigh_score > .95

### BEGIN HIDDEN TESTS
# Check the kNN model itself; this cell previously re-tested logst_reg.
# NOTE(review): 4 is the value the other models' hidden tests expect for test
# example 30; re-run once to confirm kNN classifies it the same way.
assert kNN_model.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS


# import any packages you need.
### BEGIN SOLUTION
from sklearn.svm import LinearSVC
### END SOLUTION


def fit_svm_model(x_train, y_train):
    '''
    Train a linear support vector classifier created with default arguments.

    args: x_train-> ndarray (m, n) -> training examples
          y_train-> ndarray (m, ) -> training labels

    return: the fitted sklearn.svm._classes.LinearSVC object which can predict.
    '''
    ### BEGIN SOLUTION
    linear_svc = LinearSVC()
    linear_svc.fit(x_train, y_train)
    return linear_svc
    ### END SOLUTION


# Train the linear SVM on the standardized training split. The recorded run
# below emitted a ConvergenceWarning from liblinear.
svm_model = fit_svm_model(scaled_x_train, y_train) # ignore any warnings.

/Users/hamzaliaqet/.local/lib/python3.6/site-packages/sklearn/svm/_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)


svm_model.score(scaled_x_test, y_test)

0.9527777777777777


# Public check: test-split score of the linear SVM.
score_svm = svm_model.score(scaled_x_test, y_test)
print(score_svm)
assert score_svm > .95

### BEGIN HIDDEN TESTS
# Standardized test example 30 must be classified as a 4.
assert svm_model.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS

0.9527777777777777


# import any packages you need.
### BEGIN SOLUTION
from sklearn.svm import SVC
### END SOLUTION


def fit_kernel_svm_model(x_train, y_train):
    '''
    Train a support vector classifier (SVC) created with default arguments.

    args: x_train-> ndarray (m, n) -> training examples
          y_train-> ndarray (m, ) -> training labels

    return: the fitted sklearn.svm._classes.SVC object which can predict.
    '''
    ### BEGIN SOLUTION
    # fit() hands back the estimator it was called on, so it can be returned directly.
    return SVC().fit(x_train, y_train)
    ### END SOLUTION


# Train the kernel SVM on the standardized training split and echo its test score.
kernel_svm = fit_kernel_svm_model(scaled_x_train, y_train)
kernel_svm.score(scaled_x_test, y_test)

0.975


# Public check: test-split score of the kernel SVM.
score_k_svm = kernel_svm.score(scaled_x_test, y_test)
assert score_k_svm > .97

### BEGIN HIDDEN TESTS
# Standardized test example 30 must be classified as a 4.
assert kernel_svm.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS


# import any packages you need.
### BEGIN SOLUTION
from sklearn.tree import DecisionTreeClassifier
### END SOLUTION


def fit_dt_model(x_train, y_train):
    '''
    Train a decision tree classifier created with default arguments.

    args: x_train-> ndarray (m, n) -> training examples
          y_train-> ndarray (m, ) -> training labels

    return: the fitted sklearn.tree._classes.DecisionTreeClassifier object which can predict.
    '''
    ### BEGIN SOLUTION
    # fit() hands back the estimator it was called on, so it can be returned directly.
    return DecisionTreeClassifier().fit(x_train, y_train)
    ### END SOLUTION


# Train the decision tree on the standardized training split.
dt_model = fit_dt_model(scaled_x_train, y_train)


# Echo its score on the standardized test split.
dt_model.score(scaled_x_test, y_test)

0.8583333333333333


# Public check: test-split score of the decision tree.
score_dt = dt_model.score(scaled_x_test, y_test)
assert score_dt > .84

### BEGIN HIDDEN TESTS
# NOTE(review): this re-checks kernel_svm, not dt_model, so the decision tree's
# prediction is never tested here. The tree is built without a random_state, so
# a dt_model check on a single example may not be reproducible - TODO confirm
# before switching the model under test.
assert kernel_svm.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS


# Import any libraries you need.
### BEGIN SOLUTION
from sklearn.metrics import confusion_matrix
### END SOLUTION


def get_confusion_matrix(x_test):
    '''
    Build the confusion matrix of the linear SVM's predictions for x_test.

    Reads the module-level `svm_model` (fitted classifier) and `y_test` (actual labels),
    so x_test must hold the features that belong to `y_test`, in the same order.

    args: x_test -> ndarray of test features
    return ndarray -> result of confusion_matrix(y_test, predicted labels)
    '''
    ### BEGIN SOLUTION
    return confusion_matrix(y_test, svm_model.predict(x_test))
    ### END SOLUTION


# Public check: the first five rows of the confusion matrix must match the reference run.
# NOTE(review): exact-equality check against counts from one recorded run. LinearSVC()
# is created without a random_state and did not converge in that run, so these counts
# may not reproduce on every machine - TODO confirm.
assert (get_confusion_matrix(scaled_x_test)[:5] == np.array([[33,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 28,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 33,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1, 32,  0,  1,  0,  0,  0,  0],
       [ 0,  1,  0,  0, 45,  0,  0,  0,  0,  0]])).all()

### BEGIN HIDDEN TESTS
# Hidden check: the full 10x10 confusion matrix must match the reference run.
assert (get_confusion_matrix(scaled_x_test) == np.array([[33,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 28,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 33,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  1, 32,  0,  1,  0,  0,  0,  0],
       [ 0,  1,  0,  0, 45,  0,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0, 43,  2,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  1, 34,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 33,  0,  1],
       [ 0,  2,  0,  0,  1,  1,  0,  0, 26,  0],
       [ 0,  0,  0,  1,  0,  0,  0,  0,  3, 36]])).all()

### END HIDDEN TESTS

Test 4 Solution

Classification¶

MNIST dataset¶

Question 1 (5 points)¶

Download MNIST dataset¶

Question 2 (3 points)¶

Transition to a Smaller Dataset¶

Split Dataset into Train and Test Sets¶

Question 3¶

Visualize Digits¶

Scaling Features¶

Question 4¶

Train a Logistic Regression on x_train¶

Score¶

Predictions¶

Misclassified Labels:¶

Plot Misclassified Digits¶

Question 5¶

Train a k-Nearest Neighbours Classifier¶

Ungraded Question¶

Question 6¶

Train a Linear Support Vector Classifier.¶

Question 7¶

Train a Kernel Support Vector Classifier.¶

Question 8¶

Train a Decision Tree (DT) Classifier.¶

Ungraded Question¶

Question 9¶