Linear Regression¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
Load Diabetes dataset¶
In this dataset, ten baseline variables (age, sex, body mass index, average blood pressure, and six blood serum measurements) were obtained for each of n = 442 diabetes patients.
Target (labels)¶
Labels: A quantitative measure of disease progression one year after baseline.
Observe the dataset below for more information.
from sklearn.datasets import load_diabetes
diabetes_all_data = load_diabetes()
labels = diabetes_all_data.target
# feature names
feature_names = diabetes_all_data.feature_names
print('feature names', feature_names)
# features
features = diabetes_all_data.data
df = pd.DataFrame(features, columns = feature_names)
df
feature names ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
| | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 |
|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019908 | -0.017646 |
1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068330 | -0.092204 |
2 | 0.085299 | 0.050680 | 0.044451 | -0.005671 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002864 | -0.025930 |
3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022692 | -0.009362 |
4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031991 | -0.046641 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
437 | 0.041708 | 0.050680 | 0.019662 | 0.059744 | -0.005697 | -0.002566 | -0.028674 | -0.002592 | 0.031193 | 0.007207 |
438 | -0.005515 | 0.050680 | -0.015906 | -0.067642 | 0.049341 | 0.079165 | -0.028674 | 0.034309 | -0.018118 | 0.044485 |
439 | 0.041708 | 0.050680 | -0.015906 | 0.017282 | -0.037344 | -0.013840 | -0.024993 | -0.011080 | -0.046879 | 0.015491 |
440 | -0.045472 | -0.044642 | 0.039062 | 0.001215 | 0.016318 | 0.015283 | -0.028674 | 0.026560 | 0.044528 | -0.025930 |
441 | -0.045472 | -0.044642 | -0.073030 | -0.081414 | 0.083740 | 0.027809 | 0.173816 | -0.039493 | -0.004220 | 0.003064 |
442 rows × 10 columns
Dataset Exploration¶
Select the most useful feature¶
Before moving on to linear regression, we'll extract a subset of the features. In fact, for simplicity, we'll select only one feature, and we'd like it to be the one most informative of the labels. But which feature (age, sex, bmi, etc.) should that be? This kind of question is part of feature engineering. One simple approach is to find the feature that is most highly correlated with the labels, so let's plot each feature against the labels and look for trends.
plt.scatter(df["age"], labels, marker='x')
plt.scatter(df["sex"], labels, marker='x')
plt.scatter(df["bmi"], labels, marker='x')
plt.scatter(df["bp"], labels, marker='x')
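Instead of plotting the features one at a time, a small loop can show all ten at once. Here is a minimal sketch (the grid layout and figure size are arbitrary choices, not part of the original cells):
# Sketch: scatter every feature against the labels in a single 2x5 grid.
fig, axes = plt.subplots(2, 5, figsize=(16, 6), sharey=True)
for ax, name in zip(axes.ravel(), feature_names):
    ax.scatter(df[name], labels, marker='x', s=10)
    ax.set_title(name)
fig.tight_layout()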
df['labels'] = labels
Alternatively, use the correlation matrix to find the most useful feature.¶
It makes intuitive sense to pick the feature that has a high correlation with the labels. Contrast this with a feature that is independent of the labels (corr = 0): your hair color is uninformative about whether it will rain tomorrow (corr = 0), whereas the presence of clouds in the sky is highly correlated with rain. Alternatively, we could have used mutual information as the measure instead of correlation. See these notes for a mathematical treatment.
df.corr()
| | age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | labels |
|---|---|---|---|---|---|---|---|---|---|---|---|
age | 1.000000 | 0.173737 | 0.185085 | 0.335427 | 0.260061 | 0.219243 | -0.075181 | 0.203841 | 0.270777 | 0.301731 | 0.187889 |
sex | 0.173737 | 1.000000 | 0.088161 | 0.241013 | 0.035277 | 0.142637 | -0.379090 | 0.332115 | 0.149918 | 0.208133 | 0.043062 |
bmi | 0.185085 | 0.088161 | 1.000000 | 0.395415 | 0.249777 | 0.261170 | -0.366811 | 0.413807 | 0.446159 | 0.388680 | 0.586450 |
bp | 0.335427 | 0.241013 | 0.395415 | 1.000000 | 0.242470 | 0.185558 | -0.178761 | 0.257653 | 0.393478 | 0.390429 | 0.441484 |
s1 | 0.260061 | 0.035277 | 0.249777 | 0.242470 | 1.000000 | 0.896663 | 0.051519 | 0.542207 | 0.515501 | 0.325717 | 0.212022 |
s2 | 0.219243 | 0.142637 | 0.261170 | 0.185558 | 0.896663 | 1.000000 | -0.196455 | 0.659817 | 0.318353 | 0.290600 | 0.174054 |
s3 | -0.075181 | -0.379090 | -0.366811 | -0.178761 | 0.051519 | -0.196455 | 1.000000 | -0.738493 | -0.398577 | -0.273697 | -0.394789 |
s4 | 0.203841 | 0.332115 | 0.413807 | 0.257653 | 0.542207 | 0.659817 | -0.738493 | 1.000000 | 0.617857 | 0.417212 | 0.430453 |
s5 | 0.270777 | 0.149918 | 0.446159 | 0.393478 | 0.515501 | 0.318353 | -0.398577 | 0.617857 | 1.000000 | 0.464670 | 0.565883 |
s6 | 0.301731 | 0.208133 | 0.388680 | 0.390429 | 0.325717 | 0.290600 | -0.273697 | 0.417212 | 0.464670 | 1.000000 | 0.382483 |
labels | 0.187889 | 0.043062 | 0.586450 | 0.441484 | 0.212022 | 0.174054 | -0.394789 | 0.430453 | 0.565883 | 0.382483 | 1.000000 |
It seems that the labels have the highest correlation with 'bmi' (observe the last column or row).¶
Let's just use 'bmi' as the feature.
(Can you see why the diagonal of the matrix above is all 1's?)
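As an aside, here is a minimal sketch of the mutual-information alternative mentioned above, assuming scikit-learn's mutual_info_regression (it is not used in the rest of this tutorial):
# Sketch: rank features by estimated mutual information with the target.
from sklearn.feature_selection import mutual_info_regression
mi = mutual_info_regression(features, labels, random_state=0)
for name, score in sorted(zip(feature_names, mi), key=lambda pair: -pair[1]):
    print(name, round(score, 3))  # higher = more informative about the target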
df = df.iloc[:,:-1] # remove the labels cols again.
plt.scatter(df["bmi"], labels, marker='x')
Hypothesis¶
Thus, we hypothesize that we can form a fairly good prediction of the labels using a linear model:
$$ \text{DiseaseProgression} = \theta_1 \times \text{bmi} + \theta_2 $$
where $\theta_1$ is the slope of the line and $\theta_2$ is the intercept term (together called the parameters of the model).
Before finding these parameters (thetas), let's first normalize and split our data.
Normalize Features and Labels¶
df.columns
Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')
bmi = df['bmi']
_bmi_nor = (bmi - min(bmi)) / (max(bmi) - min(bmi)) # min-max normalize the feature; usually very useful
_labels_nor = (labels - min(labels)) / (max(labels) - min(labels)) # normalizing the labels is usually not necessary, but we do it here
Train/Test Split¶
We'll always split our dataset into two parts: a train set and a test set, where the test set contains 20 to 30% of the dataset (a good rule of thumb). The point is to do all the learning on the train set, and only touch the test data when we finally want to evaluate the learned model on unseen data.
Why not just use the train set for evaluation? A model giving very good results on the train set might still be a very poor model: it might be overfitting (explained later) and generalize poorly, giving poor accuracy on unseen data.
x_train, x_test, y_train, y_test = train_test_split(_bmi_nor, _labels_nor,\
test_size=.3, random_state=32)
y_train.shape
(309,)
Exercise¶
Verify lengths of x_train, x_test, y_train, y_test. Size of 'x_train' should be ~70% of 'df' (original dataset).
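One possible answer (a sketch; any equivalent check is fine):
# Sketch: verify the split sizes against the original dataset.
print(len(x_train), len(x_test), len(y_train), len(y_test))  # 309 133 309 133
print(len(x_train) / len(df))  # ~0.7, since we asked for test_size=.3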
Normalized labels and features¶
For the normalized data, our guess is 1 for $ \theta_1 $ (slope) and 0 for $\theta_2$ (y-intercept).
theta = np.array([1, 0]) # our guess for the params, found by trial and error
plt.scatter(x_train, y_train, marker='x')
xlim, ylim =(plt.gca().get_xlim(), plt.gca().get_ylim())
print(xlim, ylim)
plt.plot(xlim, [theta[0]*xlim[0]+theta[1], theta[0]*xlim[1]+theta[1]], 'C1')
plt.xlim(xlim)
plt.ylim(ylim)
plt.xlabel("bmi")
plt.ylabel("Disease Prog")
(-0.05985663082437276, 1.0598566308243726) (-0.025924263992902985, 1.0633074415629964)
A Disciplined Way to Find Parameters¶
We guessed the parameters for this simple 2d case where the features include only 'bmi'. The question, of course, is how we find "good" values for $\theta_1$ and $\theta_2$ that fit this data well, i.e., so that the line fits the data as "closely" as possible. The method we will describe for doing this (called gradient descent) is probably not the simplest algorithm for finding this fit. In fact, as we will see, there is a very simple closed-form expression that will immediately give us the same solution for the framework we consider here. But gradient descent is an extremely powerful and general algorithm (and is actually quite simple compared to some alternative approaches), and it is no exaggeration to say that gradient descent underlies virtually all modern machine learning. So, with these caveats in place, let's dive into understanding how we find "good" parameters $\theta_1$ and $\theta_2$ in a disciplined manner.
Objective Function¶
In order to find good values for the parameters, we need to formally define what "good" means in this setting. This is actually one of the key questions for machine learning algorithms in general, and different notions of goodness lead to different algorithms. Fortunately, there are some very well-studied definitions in this context, so we have some "standard" options we can try. The notion we consider here captures the "squared error" between the predictions and the actual values, i.e., we want to minimize $$ \frac{1}{\#\text{Patients}} \sum_{i \in \text{Patients}} \left(\theta_1 \times \text{bmi}^{(i)} + \theta_2 - \text{DiseaseProgression}^{(i)}\right)^2 = E(\theta)$$
which we abbreviate as $E(\theta)$ to emphasize that we are going to minimize this error by tuning our $\theta$ variables.
How do we go about finding the values of $\theta_1$ and $\theta_2$ that minimize $E(\theta)$? We can use the gradient descent algorithm. Specifically, gradient descent repeatedly takes a small step in the direction of the negative partial derivative for each coordinate; a minimal sketch is given below.
For a more in-depth theoretical treatment, you may find these notes helpful.
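Here is a minimal sketch of gradient descent for this objective on our single (normalized) bmi feature. The step size and iteration count are arbitrary choices for this sketch; the analytic solution in the next section gives the same answer directly.
# Sketch: gradient descent on E(theta) for the single-feature linear model.
x_gd = np.asarray(x_train)
y_gd = np.asarray(y_train)
theta_gd = np.zeros(2)                # [slope, intercept], initialized at zero
step_size, n_iters = 0.5, 5000        # arbitrary choices for this sketch

for _ in range(n_iters):
    residual = theta_gd[0] * x_gd + theta_gd[1] - y_gd        # predictions minus labels
    grad = 2 * np.array([np.mean(residual * x_gd),            # dE/d(theta_1)
                         np.mean(residual)])                  # dE/d(theta_2)
    theta_gd = theta_gd - step_size * grad                    # step against the gradient

theta_gd  # should be close to the analytic solution computed below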
Analytic Solution¶
Gradient descent is appealing in its generality but certainly also has some annoyances (picking step sizes, number of iterations, ensuring proper data normalization, etc.). These issues are unavoidable for many of the problems we'll encounter, but it turns out that for least squares in particular, a closed-form analytic solution also exists. We'll state it (in matrix form) without deriving it; derivations are available in the notes shared above. $$ (X^T X) \Theta = X^T y $$ where $\Theta$ is the set of parameters we're estimating (here simply $\theta_1$ and $\theta_2$), $X$ is the feature matrix (ndarray), and $y$ is the labels. If we use n features, $\Theta$ is $[\theta_1, \theta_2, \ldots, \theta_{n+1}]$: one $\theta$ is for the intercept (also called the bias), and we append a 1 to each row of $X$ so that the bias term falls out of the same equation. More information is in the notes linked above.
Solving the normal equation with NumPy¶
X = np.array([x_train, np.ones(len(x_train))]).T
print(X.shape)
y = y_train
computed_theta = np.linalg.solve(X.T @ X, X.T @ y) # solve the normal equation above to find params.
(309, 2)
computed_theta # Actual values. Close to our guess
array([0.77890484, 0.1318822 ])
Let's plot this
theta = np.array([1, 0]) # our guess for params
theta = computed_theta # overwrite guess with theta that minimizes error.
plt.scatter(x_train, y_train, marker='x')
xlim, ylim =(plt.gca().get_xlim(), plt.gca().get_ylim())
print(xlim, ylim)
plt.plot(xlim, [theta[0]*xlim[0]+theta[1], theta[0]*xlim[1]+theta[1]], 'C1')
plt.xlim(xlim)
plt.ylim(ylim)
plt.xlabel("bmi")
plt.ylabel("Disease Prog")
(-0.05985663082437276, 1.0598566308243726) (-0.025924263992902985, 1.0633074415629964)
That's the best-fit line, i.e., the one that minimizes the least-squares error.
x_test
149    0.326446
314    0.500000
214    0.219008
95     0.123967
19     0.276860
         ...
409    0.301653
108    0.520661
172    0.619835
386    0.190083
18     0.305785
Name: bmi, Length: 133, dtype: float64
# Add bias
_x_test = np.array([x_test, np.ones(len(x_test))]).T
# Let's predict label of example at index 0 in test set
example_0 = _x_test[0]
example_0 # observe the 1 (as mentioned before, this 1 simplifies our work by automatically handling the intercept).
array([0.32644628, 1. ])
hypo = computed_theta * example_0
hypo # the slope times example_0's bmi, and the intercept times 1. Add these in the next step to get the prediction.
array([0.25427059, 0.1318822 ])
predicted_label = hypo.sum()
predicted_label # this is the predicted label (value).
0.3861527881352883
y_test[0] # this is the actual label for this test example
Finding Parameters Using sklearn¶
reg = LinearRegression(fit_intercept=False)
reg.fit(X, y)
reg.coef_
array([0.77890484, 0.1318822 ])
type(reg) # a _base.LinearRegression model which you also use to make predictions.
sklearn.linear_model._base.LinearRegression
Predict Using sklearn¶
sklearn provides a neat method to predict (avoiding the manual multiplications).
reg.predict([example_0])
array([0.38615279])
Note that the prediction is exactly the same as the one we got using NumPy.
Exercise¶
Predict labels of the whole test set in one line of code using sklearn.
# Answer
predictions = reg.predict(_x_test)
predictions.shape # shape (133,)
(133,)
RMSE and R2¶
Earlier we claimed that the line we found, both by solving the normal equations with NumPy and by using sklearn, minimizes the mean squared error (and thereby also its square root, the RMSE). Recall that RMSE quantifies how close the observed data points are to the model's predicted values. But how large is the RMSE in this case? Let's find it:
# using sklearn 'mean_squared_error' func
mse = (mean_squared_error(y_test, predictions))
rmse = np.sqrt(mse)
rmse # lower the better
0.18776380189555186
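As a sanity check, the same number can be computed directly with NumPy:
# Sketch: RMSE by hand; should match the sklearn value above.
np.sqrt(np.mean((y_test - predictions) ** 2))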
To gauge the performance of a model, another commonly used statistic in ML/statistics is the coefficient of determination, a.k.a. R2, which measures the proportion of the variance in the labels (dependent variable) that is predictable from the features (independent variables). See the sklearn documentation for how it is implemented mathematically, and this Wikipedia link for the caveats R2 has.
R2 is always at most 1, and the higher the value the better.
Let's find R2 in our case:
score = reg.score(_x_test, y_test) # R2 value- ~0.33
score # higher the better
0.3316041747499109
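For reference, here is a minimal sketch computing R2 by hand as one minus the ratio of the residual sum of squares to the total sum of squares; it should match reg.score above.
# Sketch: R2 = 1 - SS_res / SS_tot.
ss_res = np.sum((y_test - predictions) ** 2)        # residual sum of squares
ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)    # total sum of squares
1 - ss_res / ss_tot                                 # ~0.33, higher the better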
Overfitting and Regularization¶
In general we are interested in how our ML model performs on test data (unseen data), i.e., we are interested in reducing the 'generalization error'. It is usually true that if the model performs well on training data it will likely perform well on test data, but not always. Specifically, if the model tries too hard to fit the parameters to the training data, it can capture too much noise, thereby increasing its generalization error: accuracy on the training data might be very good while accuracy on the test data is very poor. In this case, the model is said to overfit.
We can prevent overfitting by regularization. Regularization shrinks (regularizes) the parameters towards zero; it discourages learning an overly complex model that tries too hard to fit the training data by adding a penalty term to the loss/objective function. Sometimes the absolute values of the parameters are used as the penalty (called L-1 regularization), and sometimes the squared values of the parameters are used (called L-2 regularization). A short sketch using sklearn follows below.
Aside: there is a theoretical justification for each choice (L1/L2) from a Bayesian perspective (if the prior follows a Gaussian distribution, use L-2; if it follows a Laplacian distribution, use L-1), which we wish we could talk more about; however, it is beyond the scope of this tutorial. You are encouraged to read these notes for more information.
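As an illustration, scikit-learn offers Ridge (L-2 penalty) and Lasso (L-1 penalty) estimators with the same fit/predict interface as LinearRegression. A minimal sketch on our single-feature training data (the alpha value is an arbitrary choice):
# Sketch: L-2 regularized linear regression (ridge) on the same training data.
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0, fit_intercept=False)  # alpha controls the penalty strength
ridge.fit(X, y)
ridge.coef_  # shrunk toward zero compared to reg.coef_
# Note: since we appended the column of ones to X ourselves, the intercept is
# penalized here too; normally you'd keep fit_intercept=True and drop that column.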
Base for ML Algorithms (Setting the Stage)¶
ML Terms to define a general ML Algorithm¶
We have learned the first machine learning algorithm in this course: linear regression. To define machine learning in more general terms, we're going to first introduce some terminology. The following elements form the foundation for mathematically defining machine learning algorithms.
- Inputs (features) $ x^{(i)} \in R^n $, where $x^{(i)}$ is the n-dimensional feature vector of the $i$-th example
Features are the information fed into the machine learning algorithm.
- Outputs $ y^{(i)} \in Y $
Outputs in a machine learning setting are quantities in some set $Y$, denoting the output space. Common output spaces are real-valued scalars (for regression tasks), or binary or multivariate discrete quantities (for classification tasks).
- Parameters $ \theta \in R^d $
Parameters of a machine learning algorithm are the numbers that define our prediction function. They are the elements that we optimize to make the prediction as good as possible.
- Hypothesis function $ h_{\theta}: R^n \rightarrow \widehat{Y}$ (here a linear hypothesis)
A hypothesis function is a mapping from the input space $R^n$ to the prediction space $\widehat{Y}$.
- Loss function $ \ell: \widehat{Y} \times Y \rightarrow R_+ $ (here the least-squares loss)
Finally, a loss function (a mapping from predictions and true outputs to non-negative real numbers) is a measure of how good a prediction is. If the prediction is "close" to the true output, we want this quantity to be small.
ML Algorithms: General Form¶
With these ingredients defined, we can write just about every machine learning algorithm that exists in the following canonical form: the goal of any machine learning algorithm is to find the parameters that minimize the average of the losses on the data. This problem is written formally as:
$$ \operatorname{minimize}_{\theta} \ \frac{1}{m} \sum_{i=1}^{m} \ell\left(h_{\theta}(x^{(i)}), y^{(i)}\right) = \operatorname{minimize}_{\theta} \ E(\theta)$$In the next tutorial, we'll study classification algorithms (logistic regression, support vector machines, etc.), and to define them (or any ML algorithm in general) we'll specify a hypothesis function, a loss function, and a way to optimize the loss function, just as we did here. A concrete sketch of these ingredients for our linear regression model follows below.
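To make this canonical form concrete, here is a minimal sketch of the three ingredients for the linear regression model above (the function names are illustrative, not from any library):
# Sketch: hypothesis, loss, and average loss E(theta) for our linear model.
def hypothesis(theta, x):            # h_theta: maps a feature to a prediction
    return theta[0] * x + theta[1]

def squared_loss(prediction, y):     # l: how far a prediction is from the true label
    return (prediction - y) ** 2

def E(theta, xs, ys):                # average loss over the m training examples
    return np.mean([squared_loss(hypothesis(theta, x), y) for x, y in zip(xs, ys)])

print(E(computed_theta, x_train, y_train))    # the fitted parameters minimize this on the training set
print(E(np.array([1, 0]), x_train, y_train))  # our hand-picked guess does worse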