Test 3 Solution
Feel free to go online; in fact, we encourage you to read documentation where needed. However, you may not collaborate with anybody. To certify that you didn't collaborate with anyone, write 'Nobody' in the 'collaborators' field above.
# Set up library imports. These imports also give you pointers on how to approach a question.
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
Stocks Dataset¶
We'll use a wide variety of datasets in the tutorials and tests (image datasets, text datasets, stock datasets, etc.). Here, we'll use a stocks dataset. We'll use it to illustrate ML concepts - this tutorial is not about 'predicting the stock market', nor is it investment advice by any means.
If you want to know where the US market is headed (performing well or poorly), the one index to look at is the S&P 500, which measures the stock performance of the 500 biggest companies in the U.S. (Google, Apple, Facebook, and Amazon are a few of them).
You can download the daily values of the S&P 500 for the previous year from this link. We've already downloaded this dataset and provided it in the Tutorial3 directory. The autograder will use the provided dataset, so you should too.
Question 1¶
Let's read the dataset from the provided csv file into a dataframe. Set the index to 'Date'.
def read_into_df(csv_file_path):
    df = pd.read_csv(csv_file_path)          # read from the path passed in, not a hard-coded file
    df.set_index('Date', inplace=True)       # use the daily date as the index
    return df
assert list(read_into_df('^GSPC.csv').columns) == ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
df = read_into_df('^GSPC.csv')
df.head()
| Date | Open | High | Low | Close | Adj Close | Volume |
|---|---|---|---|---|---|---|
| 2019-07-31 | 3016.219971 | 3017.399902 | 2958.080078 | 2980.379883 | 2980.379883 | 4623430000 |
| 2019-08-01 | 2980.320068 | 3013.590088 | 2945.229980 | 2953.560059 | 2953.560059 | 4762300000 |
| 2019-08-02 | 2943.899902 | 2945.500000 | 2914.110107 | 2932.050049 | 2932.050049 | 3874660000 |
| 2019-08-05 | 2898.070068 | 2898.070068 | 2822.120117 | 2844.739990 | 2844.739990 | 4513730000 |
| 2019-08-06 | 2861.179932 | 2884.399902 | 2847.419922 | 2881.770020 | 2881.770020 | 4154240000 |
df = df.loc[:, df.columns != 'Close']  # drop Close; we'll use Adj Close as labels
Adj Close Price¶
The value of most interest to a stock market predictor is the 'Adj Close' price. If you can predict that the adjusted close price is going to fall tomorrow, you'll sell your stocks today and avoid the loss. Similarly, if you can predict the price will go up next week, you'll buy the stocks the day before the rise and earn a profit. We're not going to predict future values of 'Adj Close' here - instead, we'll do something intellectually more rewarding: learn linear regression using this dataset. Linear regression is a simple yet very powerful ML algorithm.
Plot df¶
Let's plot these values.
df.plot()
# plt.show()
[Figure: df.plot() output - line plot of Open, High, Low, Adj Close, and Volume against Date]
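Since Volume is on the order of 10^9 while the price columns sit near 3,000, the combined plot above is dominated by Volume. As an ungraded aside (a minimal sketch - it assumes matplotlib is installed, which pandas' .plot() already requires), plotting only 'Adj Close' is much easier to read:
import matplotlib.pyplot as plt
# Ungraded: plot only the adjusted close so the price scale is visible.
df['Adj Close'].plot(title='S&P 500 Adj Close', figsize=(10, 4))
plt.ylabel('Adj Close')
plt.show()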
Question 2¶
## Extract all columns but 'Adj Close'
def extract_features(df):
'''
arg: Pandas DF
return: Pandas DF (with one less column)
'''
### BEGIN SOLUTION
return df.loc[:, df.columns != 'Adj Close']
### END SOLUTION
assert (list (extract_features(df).columns) == ['Open', 'High', 'Low', 'Volume'])
# np.isclose compares floats.
assert (np.isclose(extract_features(df).values[0], np.array([3.01621997e+03, 3.01739990e+03, \
2.95808008e+03, 4.62343000e+09]))).all()
### BEGIN HIDDEN TESTS
assert len(extract_features(df)) == 253
### END HIDDEN TESTS
features = extract_features(df)
features.head()
| Date | Open | High | Low | Volume |
|---|---|---|---|---|
| 2019-07-31 | 3016.219971 | 3017.399902 | 2958.080078 | 4623430000 |
| 2019-08-01 | 2980.320068 | 3013.590088 | 2945.229980 | 4762300000 |
| 2019-08-02 | 2943.899902 | 2945.500000 | 2914.110107 | 3874660000 |
| 2019-08-05 | 2898.070068 | 2898.070068 | 2822.120117 | 4513730000 |
| 2019-08-06 | 2861.179932 | 2884.399902 | 2847.419922 | 4154240000 |
Question 3¶
def extract_labels(df):
'''
arg: Pandas DF
return: pandas.core.series.Series
'''
### BEGIN SOLUTION
return df.loc[:,'Adj Close']
### END SOLUTION
assert np.isclose(np.asarray(extract_labels(df))[:10], np.array([2980.379883, 2953.560059, 2932.050049, 2844.73999 , 2881.77002 ,
2883.97998 , 2938.090088, 2918.649902, 2882.699951, 2926.320068])).all()
### BEGIN HIDDEN TESTS
assert np.isclose(np.asarray(extract_labels(df))[-10:], np.array([3224.72998 , 3251.840088, 3257.300049, 3276.02002 , 3235.659912,
3215.629883, 3239.409912, 3218.439941, 3258.439941, 3246.219971])).all()
### END HIDDEN TESTS
labels = extract_labels(df)
labels
Date
2019-07-31    2980.379883
2019-08-01    2953.560059
2019-08-02    2932.050049
2019-08-05    2844.739990
2019-08-06    2881.770020
                 ...
2020-07-24    3215.629883
2020-07-27    3239.409912
2020-07-28    3218.439941
2020-07-29    3258.439941
2020-07-30    3246.219971
Name: Adj Close, Length: 253, dtype: float64
Train Test Split¶
Split the scaled features and the corresponding labels into train and test data given a test size (a float between 0 and 1; 0.5 means 50% of the data goes to the test set). Look up the sklearn documentation. Since this question was tested in the previous test, we provide its solution here as well.
def split(scaled_features, labels, _test_size, _random_state=42):
'''
args: scaled_features -> numpy.ndarray
labels -> pandas.core.series.Series
_test_size -> float (between 0 to 1)
_random_state -> int (to reproduce the same results across multiple runs)
return:
X_train -> pandas.core.frame.DataFrame
X_test -> pandas.core.frame.DataFrame
y_train -> pandas.core.series.Series
y_test -> pandas.core.series.Series
'''
x_train, x_test, y_train, y_test = train_test_split(scaled_features, labels,\
test_size=_test_size, random_state=_random_state)
return x_train, x_test, y_train, y_test
x_train, x_test, y_train, y_test = split(features, labels, .2, _random_state=42)
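As a quick, ungraded sanity check (a sketch, assuming sklearn's usual rounding, which puts ceil(n * test_size) rows in the test set), you can confirm the 80/20 split of the 253 rows:
# Ungraded: confirm the split sizes. ceil(253 * 0.2) = 51 test rows, 202 train rows.
print(x_train.shape, x_test.shape)   # expected: (202, 4) (51, 4)
print(y_train.shape, y_test.shape)   # expected: (202,) (51,)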
Scaling¶
scaler = StandardScaler()
# Fit the scaler on the training features only, then apply the same
# transformation to the test features, so no test information leaks into training.
_scaled_features_train = scaler.fit_transform(x_train)
_scaled_features_test = scaler.transform(x_test)
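As another ungraded check (a minimal sketch), the standardized training features should have roughly zero mean and unit variance per column, while the test features will only be approximately standardized because the scaler was fitted on the training data alone:
# Ungraded: per-column statistics after standardization.
print(_scaled_features_train.mean(axis=0).round(6))   # ~[0, 0, 0, 0]
print(_scaled_features_train.std(axis=0).round(6))    # ~[1, 1, 1, 1]
print(_scaled_features_test.mean(axis=0).round(3))    # close to, but not exactly, 0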
Question 4¶
Add an extra column containing 1s.
Read the next question first to understand why we need this function.
def add_bias(x):
    '''
    args: ndarray -> x -> of shape (m, d)
    return: ndarray -> of shape (m, d+1) <- bias included.
    Append a final column of ones to the input array.
    '''
    ### BEGIN SOLUTION
    ones_ = np.ones(len(x)).reshape(-1, 1)          # column of ones, shape (m, 1)
    x_bias_included = np.append(x, ones_, axis=1)   # append as the last column
    return x_bias_included
    ### END SOLUTION
assert (add_bias(np.asarray([[5,3],[3,2]])) == np.array([[5., 3., 1.],
[3., 2., 1.]])).all()
### BEGIN HIDDEN TESTS
assert (add_bias(np.asarray([[55,33],[33,52]])) == np.array([[55., 33., 1.],
[33., 52., 1.]])).all()
### END HIDDEN TESTS
Question 5¶
Normal Equations¶
In the tutorial, we learned the analytical solution for the parameters, given by the normal equations: $$ (X^T X)\,\theta = X^T y $$
We also saw that numpy provides a solve function for exactly this. Recall that we always include a constant 1 as the final feature so the intercept is handled automatically. Our hypothesis simplifies to: $$ h_{\theta}(x) = \sum_{j=1}^{n} \theta_j x_j = \theta^T x $$ Since the final feature is always one, the final coefficient $\theta_n$ serves as the bias term without any special handling.
Here, you'll find $\theta$ as in the tutorial, using numpy. Make sure you add a 1 as the final feature of each example.
You may find it helpful to first attempt the previous question.
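To make this concrete, here is a tiny ungraded sketch (the toy data is made up purely for illustration) of solving the normal equations with np.linalg.solve; the graded function below does exactly the same thing on the scaled stock features:
# Toy example: recover y = 2*x + 1 via the normal equations (illustration only).
X_toy = np.array([[0., 1.],   # each row is [x, 1]; the trailing 1 is the bias feature
                  [1., 1.],
                  [2., 1.]])
y_toy = np.array([1., 3., 5.])
theta_toy = np.linalg.solve(X_toy.T @ X_toy, X_toy.T @ y_toy)
print(theta_toy)  # approximately [2., 1.]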
def solve_normal_eq(_scaled_features_train, y_train):
'''
args: ndarray -> _scaled_features_train -> shape (m, d)
pandas.core.series.Series -> y_train
return: ndarray -> parameters -> values of thetas -> shape (d+1,) # plus 1 for bias.
Make sure you add bias in _scaled_features_train as a final feature in each example.
'''
train_scaled_feats_bias_included = add_bias(_scaled_features_train) # implement this function (add_bias) above first.
### BEGIN SOLUTION
X = train_scaled_feats_bias_included
y = y_train
computed_theta = np.linalg.solve(X.T @ X, X.T @ y)
return computed_theta
### END SOLUTION
assert solve_normal_eq(_scaled_features_train, y_train).shape[0] == 5
assert np.isclose(solve_normal_eq(_scaled_features_train, y_train), np.array([-1.83668608e+02, 2.14610082e+02, 1.86071338e+02, -4.34628838e-01,
3.02967905e+03])).all()
### BEGIN HIDDEN TESTS
"Make sure student is not using sklearn"
# This test checks if plt.subplot has been called by the student as per the instructions.
# save a reference to the original function, then delete it from the
# global namespace
dont_use_func = LinearRegression
old_dont_use_func = dont_use_func
del dont_use_func
# try running the students' code
try:
solve_normal_eq(_scaled_features_train, y_train)
# if a NameError is thrown, that means their function calls dont_use_func
except NameError:
raise AssertionError("func calls dont_use_func")
# if no error is thrown, that means their function does not call dont_use_func
else:
pass
# restore the original function
finally:
dont_use_func = old_dont_use_func
del old_dont_use_func
### END HIDDEN TESTS
params_using_numpy = solve_normal_eq(_scaled_features_train, y_train)
params_using_numpy.shape # shape (5,)
(5,)
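Purely for interpretation (ungraded), you can pair each learned parameter with its feature name; the last entry is the bias added by add_bias(). Keep in mind the coefficients refer to the standardized features, not the raw prices:
# Ungraded: label the parameters. The final one is the bias term.
for name, theta in zip(list(features.columns) + ['bias'], params_using_numpy):
    print(f'{name}: {theta:.4f}')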
Question 6¶
Predict labels of test data using parameters. You may only use numpy.
def predict_labels_using_computed_params(_scaled_features_test, thetas):
'''
args: _scaled_features_test -> ndarray -> shape (m, d)
      thetas -> ndarray -> shape (d+1,)
return: ndarray -> predictions -> shape (m,)
Make sure you add bias to _scaled_features_test. You should call add_bias() to reduce your work.
'''
### BEGIN SOLUTION
test_scaled_feats_bias_included = add_bias(_scaled_features_test)
predictions = test_scaled_feats_bias_included @ thetas
return predictions
### END SOLUTION
assert predict_labels_using_computed_params(_scaled_features_test, params_using_numpy).shape[0] == _scaled_features_test.shape[0]
assert np.isclose(predict_labels_using_computed_params(_scaled_features_test, params_using_numpy)[:10], np.array([3048.70132238, 2936.30572427, 3099.06483764, 2951.08687085,
3317.45752556, 2829.58150383, 2852.31739358, 2812.48676972,
2940.1100774 , 3012.70870057])).all()
### BEGIN HIDDEN TESTS
assert np.isclose(predict_labels_using_computed_params(_scaled_features_test, params_using_numpy)[-10:], np.array([2803.96334424, 3157.84204694, 2214.30147566, 2885.86118152,
3065.87824258, 2915.98801594, 3080.55080936, 2943.13002932,
2828.90198244, 2729.42115593])).all()
### END HIDDEN TESTS
Question 7¶
Now, we'll use the sklearn library's linear regression to compute the parameters (thetas) as we did above. The parameter values should be the same as the ones we got using numpy.
def find_linear_reg_model_params_using_sklearn(train_scaled_feats_bias_included, y_train):
'''
args: ndarray -> train_scaled_feats_bias_included -> shape (m, d+1), already includes bias (1 as the final feature)
      pandas.core.series.Series -> y_train -> shape (m,)
return: tuple (model, params) -> model is of type "sklearn.linear_model._base.LinearRegression",
params is of type ndarray of shape (d+1,)
Pay attention to argument 'fit_intercept'. Note that we're passing features with bias included.
'''
### BEGIN SOLUTION
reg = LinearRegression(fit_intercept=False)
reg.fit(train_scaled_feats_bias_included, y_train)
return (reg, reg.coef_)
### END SOLUTION
# To test this, you have to implement add_bias() first.
train_scaled_feats_bias_included = add_bias(_scaled_features_train)
assert np.isclose(find_linear_reg_model_params_using_sklearn(train_scaled_feats_bias_included, y_train)[1], np.array([-1.83668608e+02, 2.14610082e+02, 1.86071338e+02, -4.34628838e-01,
3.02967905e+03])).all()
### BEGIN HIDDEN TESTS
params_using_sklearn = find_linear_reg_model_params_using_sklearn(train_scaled_feats_bias_included, y_train)[1]
assert np.isclose(params_using_numpy, params_using_sklearn).all()
### END HIDDEN TESTS
Note that parameters found using numpy or sklearn are exactly the same. That's one of the hidden tests above.
reg_model = find_linear_reg_model_params_using_sklearn(train_scaled_feats_bias_included, y_train)[0]
reg_model
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)
Coefficient of Determination¶
def get_coefficient_of_determination(model, _scaled_features_test, y_test):
'''
args: model -> sklearn.linear_model._base.LinearRegression
      _scaled_features_test -> ndarray -> shape (m, d)
      y_test -> pandas.core.series.Series
return: float (score)
Note that the model was originally fitted on data of shape (m, d+1) - bias included.
Use sklearn.
'''
### BEGIN SOLUTION
_scaled_features_test_bias_included = add_bias(_scaled_features_test)
return model.score(_scaled_features_test_bias_included, y_test)
### END SOLUTION
### BEGIN HIDDEN TESTS
assert np.isclose(get_coefficient_of_determination(reg_model, _scaled_features_test, y_test), 0.9952866095664767)
### END HIDDEN TESTS
score = get_coefficient_of_determination(reg_model, _scaled_features_test, y_test)
score # ~.99
0.9952866095664767
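model.score returns the coefficient of determination, R^2 = 1 - SS_res / SS_tot. As an ungraded cross-check (a sketch using numpy only), you can reproduce it from the model's predictions:
# Ungraded cross-check: compute R^2 by hand from the predictions.
y_pred = reg_model.predict(add_bias(_scaled_features_test))
ss_res = np.sum((np.asarray(y_test) - y_pred) ** 2)           # residual sum of squares
ss_tot = np.sum((np.asarray(y_test) - y_test.mean()) ** 2)    # total sum of squares
print(1 - ss_res / ss_tot)  # should match the score above (~0.995)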
RMSE¶
Now, compute root mean squared error. You may use sklearn.
## Import any library you need. We've not imported any packages for this one.
### BEGIN SOLUTION
from sklearn.metrics import mean_squared_error
### END SOLUTION
def rmse(model, _scaled_features_test):
'''
args: model -> type "sklearn.linear_model._base.LinearRegression"
_scaled_features_test -> ndarray -> of shape (m, d)
return: float (rmse)
Note that model was trained on data of shape (m, d+1)- bias included.
Hint: pay attention to details. It's easy to get this wrong since all test cases are hidden.
'''
### BEGIN SOLUTION
# y_test comes from the notebook's global scope, since it isn't a parameter here.
y_pred = model.predict(add_bias(_scaled_features_test))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
return rmse
### END SOLUTION
### BEGIN HIDDEN TESTS
assert np.isclose(rmse(reg_model, _scaled_features_test), 13.805553283267848)
### END HIDDEN TESTS
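For intuition (an ungraded sketch using numpy only), RMSE is simply the square root of the mean squared residual; here it comes out to roughly 13.8 index points on an index trading around 3,000, which is consistent with the high R^2 above:
# Ungraded cross-check: RMSE computed directly with numpy.
y_pred = reg_model.predict(add_bias(_scaled_features_test))
print(np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2)))  # ~13.8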