Test 4 Solution
Feel free to go online. In fact, we encourage you to read documentations where needed. However, you may not collaborate with anybody. To certify that you didn't collaborate with anyone you'll write 'Nobody' in 'collaborators' above.
Classification¶
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_digits
MNIST dataset¶
Finding dataset is foremost part to build any Machine Learning Algorithm. As a Machine Learning engineer, you may at times need to create your own dataset and label each example which could get highly expensive both in terms of time and money. For example, you can start taking pictures with your camera and save each image by putting a label against it e.g. cat. Theoretically, you can create a giant dataset with picture of every object. Imagenet is such a dataset with 14 million images. Here, you'll not create any dataset instead you'll find a dataset already on web. Specifically, you'll find MNIST dataset. Which is a dataset of hand written digits. See a few examples below.
Question 1 (5 poins)¶
Download MNIST dataset¶
Download MNIST dataset from 'sklearn.datasets'sklearn.datasets. Feel free to Google/StackOverFlow.
def download_data():
'''
args: None
return: dict with at least two keys. The first key is 'data' of shape (70000, 784)
(i.e. it contains 70,000 examples (images of digits) and each example contains 784 pixels.
The second key is 'target' of shape (70,000,) which contains labels for each example.
Staff's solution is two lines of code.
This function may take a few seconds to execute.
'''
### BEGIN SOLUTION
mnist = sklearn.datasets.fetch_openml('mnist_784')
return mnist
### END SOLUTION
mnist = download_data() # takes a few seconds.
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-3-1d435d1ea221> in <module> ----> 1 mnist = download_data() # takes a few seconds. <ipython-input-2-97a7053a1781> in download_data() 11 ''' 12 ### BEGIN SOLUTION ---> 13 mnist = sklearn.datasets.fetch_openml('mnist_784') 14 return mnist 15 ### END SOLUTION /opt/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs) 70 FutureWarning) 71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)}) ---> 72 return f(**kwargs) 73 return inner_f 74 /opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in fetch_openml(name, version, data_id, data_home, target_column, cache, return_X_y, as_frame) 773 774 # download data features, meta-info about column types --> 775 features_list = _get_data_features(data_id, data_home) 776 777 if not as_frame: /opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _get_data_features(data_id, data_home) 409 url = _DATA_FEATURES.format(data_id) 410 error_message = "Dataset with data_id {} not found.".format(data_id) --> 411 json_data = _get_json_content_from_openml_api(url, error_message, True, 412 data_home) 413 return json_data['data_features']['feature'] /opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _get_json_content_from_openml_api(url, error_message, raise_if_error, data_home) 159 160 try: --> 161 return _load_json() 162 except HTTPError as error: 163 # 412 is an OpenML specific error code, indicating a generic error /opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in wrapper(*args, **kw) 51 return f(*args, **kw) 52 try: ---> 53 return f(*args, **kw) 54 except HTTPError: 55 raise /opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _load_json() 155 @_retry_with_clean_cache(url, data_home) 156 def _load_json(): --> 157 with closing(_open_openml_url(url, data_home)) as response: 158 return json.loads(response.read().decode("utf-8")) 159 /opt/anaconda3/lib/python3.8/site-packages/sklearn/datasets/_openml.py in _open_openml_url(openml_path, data_home) 110 else: 111 with gzip.GzipFile(local_path, 'wb') as fdst: --> 112 shutil.copyfileobj(fsrc, fdst) 113 except Exception: 114 if os.path.exists(local_path): /opt/anaconda3/lib/python3.8/shutil.py in copyfileobj(fsrc, fdst, length) 200 fdst_write = fdst.write 201 while True: --> 202 buf = fsrc_read(length) 203 if not buf: 204 break /opt/anaconda3/lib/python3.8/http/client.py in read(self, amt) 456 # Amount is given, implement using readinto 457 b = bytearray(amt) --> 458 n = self.readinto(b) 459 return memoryview(b)[:n].tobytes() 460 else: /opt/anaconda3/lib/python3.8/http/client.py in readinto(self, b) 500 # connection, and the user is reading more bytes than will be provided 501 # (for example, reading in 1k chunks) --> 502 n = self.fp.readinto(b) 503 if not n and b: 504 # Ideally, we would raise IncompleteRead if the content-length /opt/anaconda3/lib/python3.8/socket.py in readinto(self, b) 667 while True: 668 try: --> 669 return self._sock.recv_into(b) 670 except timeout: 671 self._timeout_occurred = True /opt/anaconda3/lib/python3.8/ssl.py in recv_into(self, buffer, nbytes, flags) 1239 "non-zero flags not allowed in calls to recv_into() on %s" % 1240 self.__class__) -> 1241 return self.read(nbytes, buffer) 1242 else: 1243 return super().recv_into(buffer, nbytes, flags) /opt/anaconda3/lib/python3.8/ssl.py in read(self, len, buffer) 1097 try: 1098 if buffer is not None: -> 1099 return self._sslobj.read(len, buffer) 1100 else: 1101 return self._sslobj.read(len) KeyboardInterrupt:
assert mnist.target.shape == (70000,)
assert mnist.data.shape == (70000,784)
assert mnist.data[0].shape == (784,)
assert list(mnist.keys()) == ['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url']
### BEGIN HIDDEN TESTS
assert mnist.data[56].shape == (784,)
assert mnist.data[5356].shape == (784,)
### END HIDDEN TESTS
Question 2 (3 points)¶
Convert labels of strings to labels of integers.
##
def labels_to_int_dtype(mnist):
'''
args: dict (mnist containing above keys)
returns: ndarray of shape (70000,) -> Labels (in mnist at key target) are strings e.g. '5', '3'.
Return an ndarray with integer labels e.g. 5, 3
Staff's solution contains two lines of code. You should use map(lambda) for this which is good practice but
not required. Or you can use list comprehension.
'''
### BEGIN SOLUTION
labels = mnist.target
labels = np.asarray(list(map(lambda label: int(label), labels)))
return labels
### END SOLUTION
assert labels_to_int_dtype(mnist).shape == (70000,)
### BEGIN HIDDEN TESTS
assert labels_to_int_dtype(mnist)[53] == 4
### END HIDDEN TESTS
labels = labels_to_int_dtype(mnist)
labels.shape # (70,000)
(70000,)
Transition to a Smaller Dataset¶
The questions above tested you on 'finding real-world data' and some preprocessing. It turns out MNIST dataset above is a large dataset and somme of the algorithms you'll implement in this test might take more than a few minutes on that dataset which we don't want considering that it's a test. To this end, we'll load another smaller digits dataset and you'll work with that to expediate training and testing. We provide you code for that. In MNIST each image has 784 features/dims/pixels whereas in the smaller dataset there are only 64 and images might appear blurry.
digits_dict = load_digits()
_data = digits_dict.data
labels = digits_dict.target
Split dataset into Train Split¶
You should be comfortable with this from previous test. Therefore, it's already provided you.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(_data, labels, \
test_size=.2, random_state=42)
# A sanity check
assert x_train.shape[0] == y_train.shape[0] and x_test.shape[0] == y_test.shape[0]
Question 3¶
4 points
Visualize Digits¶
Each example in our train/test data contains an image of a digit (currently, as ndarray). Let's visualize a few examples. You will plot/show_image subplots of n digits to the current figure instead of plotting each digit in a separate figure. Note that each image/digit is ndarray of shape (784,); You'll have to reshape it to fit your needs.
def plot_digits(x, y, _fig_size=(20,4)):
'''
args: x -> ndarray of shape (m,n) -> (m examples/digits each of shape n). Think of x
as truncated x_train or truncated x_test
y -> ndarray of shape (m,) -> (corresponding labels of x)
_fig_size: tupe (a,b) -> (Width, height in inches of the figure to be created)
return: None (just observe the digits images yourself)
'''
### BEGIN SOLUTION
plt.figure(figsize=_fig_size)
n_digits_to_show = len(x)
for index, (image, label) in enumerate(zip(x, y)):
plt.subplot(1, n_digits_to_show, index + 1)
plt.imshow(image.reshape(8,8), cmap=plt.cm.gray)
plt.title('Digit: %i\n' % label, fontsize = 20)
### END SOLUTION
plot_digits(x_train[:5], y_train[:5])
# We'll use sophisticated hidden tests to check plots.
# Make sure you follow the instructions and you're good to go.
### BEGIN HIDDEN TESTS
# This test checks if plt.subplot has been called by the student as per the instructions.
# save a reference to the original function, then delete it from the
# global namespace
must_use_func = plt.subplot
old_must_use_func = must_use_func
del must_use_func
# try running the students' code
try:
plot_mse()
# if an NameError is thrown, that means their function calls mse
except NameError:
pass
# if no error is thrown, that means their function does not call mse
else:
raise AssertionError("plot_mse does not call plt.subplot")
# restore the original function
finally:
must_use_func = old_must_use_func
del old_must_use_func
### END HIDDEN TESTS
Scaling Features¶
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.fit_transform(x_test)
def fit_log_reg_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.linear_model._logistic.LogisticRegression object which can predict.
'''
### BEGIN SOLUTION
logisticRegr = LogisticRegression()
return logisticRegr.fit(x_train, y_train)
### END SOLUTION
logst_reg = fit_log_reg_model(scaled_x_train, y_train)
Score¶
Find coefficient of determination a.k.a. score.
score = logst_reg.score(scaled_x_test, y_test)
score
0.9638888888888889
logst_reg.predict(x_test[30].reshape(1,-1))
array([1])
score = logst_reg.score(scaled_x_test, y_test)
assert score > .96
### BEGIN HIDDEN TESTS
assert logst_reg.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS
Predictions¶
predictions = logst_reg.predict(scaled_x_test)
Misclassified Labels:¶
Store the indexes of images (digits) in test data which were misclassfied (whose correct label was not predicted)
def misclassified_idx(y_test, predictions):
'''
args: y_test: actual labels -> ndarray
predictions: predicted labels -> ndarray
return: ndarray (Note that you'll work with Python list. We provide code to convert your list to ndarray)
'''
misclassifiedIndexes = [] # empty list
### BEGIN SOLUTION
index = 0
for label, predict in zip(y_test, predictions):
if label != predict:
misclassifiedIndexes.append(index)
index +=1
### END SOLUTION
return np.asarray(misclassifiedIndexes)
idxs[50:60]
array([], dtype=int64)
misclassified_idx(y_test, predictions)
array([ 52, 71, 124, 133, 149, 159, 193, 222, 234, 239, 244, 262, 339])
assert len(misclassified_idx(y_test, predictions)) == 13
### BEGIN HIDDEN TESTS
idxs = misclassified_idx(y_test, predictions)
assert (idxs == np.array([52, 71, 124, 133, 149, 159, 193, 222, 234, 239, 244, 262, 339])).all()
### END HIDDEN TESTS
Plot Misclassified Digits¶
misclassifiedIndexes = misclassified_idx(y_test, predictions)
plt.figure(figsize=(20,4))
for plotIndex, badIndex in enumerate(misclassifiedIndexes[0:5]):
plt.subplot(1, 5, plotIndex + 1)
plt.imshow(np.reshape(x_test[badIndex], (8,8)), cmap=plt.cm.gray)
plt.title('Predicted: {}, Actual: {}'.format(predictions[badIndex], y_test[badIndex]), fontsize = 15)
You'll now implement kNN. Look up the sklearn documentation and see if we do classification using kNN. Import the relevant library below. Note for library function: Use 10 Number of neighbors. Leave the default values for other parameters.
# import any packages you need.
### BEGIN SOLUTION
from sklearn.neighbors import KNeighborsClassifier
### END SOLUTION
def fit_kNN_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.neighbors._classification.KNeighborsClassifier object which can predict.
'''
### BEGIN SOLUTION
neigh = KNeighborsClassifier(n_neighbors=10)
return neigh.fit(x_train, y_train)
### END SOLUTION
kNN_model = fit_kNN_model(scaled_x_train, y_train)
kNN_model.score(scaled_x_test, y_test)
0.9611111111111111
neigh_score = kNN_model.score(scaled_x_test, y_test) # recall, this step is very expensive
assert neigh_score > .95
### BEGIN HIDDEN TESTS
assert logst_reg.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS
Ungraded Question¶
Observe the misclassified images by kNN as we did for logistic regression.
# import any packages you need.
### BEGIN SOLUTION
from sklearn.svm import LinearSVC
### END SOLUTION
def fit_svm_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.svm._classes.LinearSVC object which can predict.
'''
### BEGIN SOLUTION
svm_lin_clf = LinearSVC()
return svm_lin_clf.fit(x_train, y_train)
### END SOLUTION
svm_model = fit_svm_model(scaled_x_train, y_train) # ignore any warnings.
/Users/hamzaliaqet/.local/lib/python3.6/site-packages/sklearn/svm/_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
svm_model.score(scaled_x_test, y_test)
0.9527777777777777
score_svm = svm_model.score(scaled_x_test, y_test)
print(score_svm)
assert score_svm > .95
### BEGIN HIDDEN TESTS
assert svm_model.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS
0.9527777777777777
Question 7¶
2 points
Train a Kernel Support Vector Classifier.¶
You'll now implement Kernel SVC. Look up the sklearn documentation and see if we can do classification using kernel svc. Import the relevant library below. Note for library function: Leave the default values for other parameters. Specifically, implement Non-linear SVC with RBF kernel.
# import any packages you need.
### BEGIN SOLUTION
from sklearn.svm import SVC
### END SOLUTION
def fit_kernel_svm_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.svm._classes.SVC object which can predict.
'''
### BEGIN SOLUTION
_svc_clf_non_linear = SVC()
_svc_clf_non_linear.fit(x_train, y_train)
return _svc_clf_non_linear
### END SOLUTION
kernel_svm = fit_kernel_svm_model(scaled_x_train, y_train)
kernel_svm.score(scaled_x_test, y_test)
0.975
score_k_svm = kernel_svm.score(scaled_x_test, y_test)
assert score_k_svm > .97
### BEGIN HIDDEN TESTS
assert kernel_svm.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS
# import any packages you need.
### BEGIN SOLUTION
from sklearn.tree import DecisionTreeClassifier
### END SOLUTION
def fit_dt_model(x_train, y_train):
'''
args: x_train-> ndarray (m, n)
y_train-> ndarray (m, )
return: a fitted sklearn.tree._classes.DecisionTreeClassifier object which can predict.
'''
### BEGIN SOLUTION
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)
return dt_clf
### END SOLUTION
dt_model = fit_dt_model(scaled_x_train, y_train)
dt_model.score(scaled_x_test, y_test)
0.8583333333333333
score_dt = dt_model.score(scaled_x_test, y_test)
assert score_dt > .84
### BEGIN HIDDEN TESTS
assert kernel_svm.predict(scaled_x_test[30].reshape(1,-1)) == np.array([4])
### END HIDDEN TESTS
Ungrader Question¶
Which Model performed the best in terms of good coefficient of determination?
Question 9¶
Compute Confusion matrix for svm_model
(you implemented in Q-6)
# Import any libraries you need.
### BEGIN SOLUTION
from sklearn.metrics import confusion_matrix
### END SOLUTION
def get_confusion_matrix(x_test):
'''
args: x_test -> ndarray
return ndarray
'''
### BEGIN SOLUTION
y_pred = svm_model.predict(x_test)
return confusion_matrix(y_test, y_pred)
### END SOLUTION
assert (get_confusion_matrix(scaled_x_test)[:5] == np.array([[33, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 28, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 1, 32, 0, 1, 0, 0, 0, 0],
[ 0, 1, 0, 0, 45, 0, 0, 0, 0, 0]])).all()
### BEGIN HIDDEN TESTS
assert (get_confusion_matrix(scaled_x_test) == np.array([[33, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 28, 0, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 33, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 1, 32, 0, 1, 0, 0, 0, 0],
[ 0, 1, 0, 0, 45, 0, 0, 0, 0, 0],
[ 0, 0, 1, 0, 0, 43, 2, 0, 0, 1],
[ 0, 0, 0, 0, 0, 1, 34, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 0, 33, 0, 1],
[ 0, 2, 0, 0, 1, 1, 0, 0, 26, 0],
[ 0, 0, 0, 1, 0, 0, 0, 0, 3, 36]])).all()
### END HIDDEN TESTS