Test 3
Before you turn this problem in, make sure everything runs as expected. First, restart the kernel (in the menubar, select Kernel$\rightarrow$Restart) and then run all cells (in the menubar, select Cell$\rightarrow$Run All).
Make sure you fill in any place that says YOUR CODE HERE
or "YOUR ANSWER HERE", as well as your name and collaborators below:
NAME = ""
COLLABORATORS = ""
Feel free to go online. In fact, we encourage you to read documentation where needed. However, you may not collaborate with anybody. To certify that you didn't collaborate with anyone, write 'Nobody' in 'COLLABORATORS' above.
# Set up library imports. These imports also give you pointers on how to approach a question.
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
Stocks Dataset¶
We'll use a wide variety of datasets in tutorials and tests (image datasets, text datasets, stock datasets, etc.). Here, we'll use a stock dataset to illustrate an ML concept. This tutorial is not about 'predicting the stock market', nor is it investment advice by any means.
If you want to know where the US market is headed (performing well or poorly), the one index you should look at is the S&P 500, which measures the stock performance of the 500 biggest companies in the U.S. (Google, Apple, Facebook, and Amazon are a few of them).
You can download the daily values of the S&P 500 for the previous year from this link. We've already downloaded this dataset and provided it in the Tutorial3 directory. The autograder will use the provided dataset, so you should too.
Question 1¶
Let's read the dataset from the provided CSV file into a DataFrame. Set the index to 'Date'.
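For reference, here is a minimal sketch of what such a reader could look like (the name read_into_df_sketch is illustrative only; this is one possible approach, not necessarily the graded solution):
def read_into_df_sketch(path):
    '''Read the CSV at `path` into a DataFrame indexed by the 'Date' column.'''
    # index_col='Date' makes the date column the index instead of a regular column.
    return pd.read_csv(path, index_col='Date')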
# YOUR CODE HERE
raise NotImplementedError()
assert list(read_into_df('^GSPC.csv').columns) == ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
df = read_into_df('^GSPC.csv')
df.head()
df = df.loc[:, df.columns != 'Close'] # drop 'Close'; we'll use 'Adj Close' as the labels
Adj Close Price¶
The value of most interest to a stock market predictor is the 'Adj Close' price. If you can predict that the adjusted close price is going to fall tomorrow, you'll sell your stocks today and avoid the loss. Similarly, if you can predict that the price will go up next week, you'll buy stocks the day before the rise and earn a profit. We're not going to predict future values of 'Adj Close' here; instead, we'll do something intellectually more rewarding and learn linear regression using this dataset. Linear regression is a simple yet very powerful ML algorithm.
Plot df¶
Let's plot these values.
df.plot()
# plt.show()
## Extract all feature columns, i.e., everything except the label column 'Adj Close'
def extract_features(df):
'''
arg: Pandas DF
return: Pandas DF (with one less column)
'''
# YOUR CODE HERE
raise NotImplementedError()
assert (list (extract_features(df).columns) == ['Open', 'High', 'Low', 'Volume'])
# np.isclose compares floats.
assert (np.isclose(extract_features(df).values[0], np.array([3.01621997e+03, 3.01739990e+03, \
2.95808008e+03, 4.62343000e+09]))).all()
features = extract_features(df)
features.head()
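A minimal sketch of the feature-extraction step, assuming the label column is 'Adj Close' (as the asserts above suggest); the name extract_features_sketch is illustrative only:
def extract_features_sketch(df):
    '''Return every column except the label column 'Adj Close'.'''
    # drop(columns=...) leaves the original DataFrame untouched and returns a copy.
    return df.drop(columns=['Adj Close'])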
def extract_labels(df):
'''
arg: Pandas DF
return: pandas.core.series.Series (the 'Adj Close' labels)
'''
# YOUR CODE HERE
raise NotImplementedError()
assert np.isclose(np.asarray(extract_labels(df))[:10], np.array([2980.379883, 2953.560059, 2932.050049, 2844.73999 , 2881.77002 ,
2883.97998 , 2938.090088, 2918.649902, 2882.699951, 2926.320068])).all()
labels = extract_labels(df)
labels
Train Test Split¶
Split the scaled features and corresponding labels into train and test sets given a test size (between 0 and 1); 0.5 means 50% of the data goes to the test set. Look up the sklearn documentation. Since this question was tested in the previous test, we'll provide its solution here as well.
def split(scaled_features, labels, _test_size, _random_state=42):
'''
args: scaled_features -> numpy.ndarray
labels -> pandas.core.series.Series
_test_size -> float (between 0 to 1)
_random_state -> int (to reproduce the same results across multiple runs)
return:
x_train -> pandas.core.frame.DataFrame
x_test -> pandas.core.frame.DataFrame
y_train -> pandas.core.series.Series
y_test -> pandas.core.series.Series
'''
x_train, x_test, y_train, y_test = train_test_split(scaled_features, labels,\
test_size=_test_size, random_state=_random_state)
return x_train, x_test, y_train, y_test
x_train, x_test, y_train, y_test = split(features, labels, .2, _random_state=42)
Scaling¶
scaler = StandardScaler()
_scaled_features_train = scaler.fit_transform(x_train)
_scaled_features_test = scaler.transform(x_test)
Question 4¶
Add an extra column containing 1s.
Read the next question first to understand why we need this function.
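A minimal sketch of one way to append the bias column with numpy (the name add_bias_sketch is illustrative; the graded function is add_bias):
def add_bias_sketch(X):
    '''Append a column of 1s on the right: shape (m, d) -> (m, d+1).'''
    ones = np.ones((X.shape[0], 1))   # one bias entry per example
    return np.hstack([X, ones])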
# YOUR CODE HERE
raise NotImplementedError()
assert (add_bias(np.asarray([[5,3],[3,2]])) == np.array([[5., 3., 1.],
[3., 2., 1.]])).all()
Question 5¶
Normal Equations¶
In the tutorial, we learned the analytical solution for the parameters, of the form: $$ (X^T X) \theta = X^T y $$
We also saw that numpy provides a solve function for exactly this. Recall that we always have a constant term 1 as the final feature to include the intercept. Our hypothesis simplifies to: $$ h_{\theta}(x) = \sum_{j=1}^{n} \theta_j x_j = \theta^T x $$ Since the final feature is always 1, the final coefficient $\theta_n$ serves as the bias term without any special handling.
Here, you'll find $\theta$ using numpy, as in the tutorial. Make sure you add a 1 as the final feature of each example.
You may find it helpful to first attempt the previous question.
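As a rough sketch of the numpy route (assuming add_bias from Question 4 is implemented; one possible approach, not necessarily the graded solution):
def solve_normal_eq_sketch(X, y):
    '''Solve (X_b^T X_b) theta = X_b^T y, where X_b is X with the bias column appended.'''
    X_b = add_bias(X)                 # shape (m, d+1)
    A = X_b.T @ X_b                   # (d+1, d+1)
    b = X_b.T @ np.asarray(y)         # (d+1,)
    return np.linalg.solve(A, b)      # theta, shape (d+1,)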
def solve_normal_eq(_scaled_features_train, y_train):
'''
args: ndarray -> _scaled_features_train -> shape (m, d)
pandas.core.series.Series -> y_train
return: ndarray -> parameters -> values of thetas -> shape (d+1,) # plus 1 for bias.
Make sure you add bias in _scaled_features_train as a final feature in each example.
'''
train_scaled_feats_bias_included = add_bias(_scaled_features_train) # implement this function (add_bias) above first.
# YOUR CODE HERE
raise NotImplementedError()
assert solve_normal_eq(_scaled_features_train, y_train).shape[0] == 5
assert np.isclose(solve_normal_eq(_scaled_features_train, y_train), np.array([-1.83668608e+02, 2.14610082e+02, 1.86071338e+02, -4.34628838e-01,
3.02967905e+03])).all()
params_using_numpy = solve_normal_eq(_scaled_features_train, y_train)
params_using_numpy.shape # shape (5,)
Question 6¶
Predict the labels of the test data using the computed parameters. You may only use numpy.
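One possible way to do this, sketched under the assumption that add_bias is implemented and thetas has shape (d+1,):
def predict_sketch(X, thetas):
    '''Predict labels as X_b @ thetas, where X_b includes the bias column.'''
    X_b = add_bias(X)    # shape (m, d+1)
    return X_b @ thetas  # shape (m,)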
def predict_labels_using_computed_params(_scaled_features_test, thetas):
'''
args: _scaled_features_test -> ndarray -> shape (m, d)
thetas -> ndarray -> shape (d+1,)
return: ndarray -> predicted labels -> shape (m,)
Make sure you add the bias column to _scaled_features_test. Calling add_bias() will save you work.
'''
# YOUR CODE HERE
raise NotImplementedError()
assert predict_labels_using_computed_params(_scaled_features_test, params_using_numpy).shape[0] == _scaled_features_test.shape[0]
assert np.isclose(predict_labels_using_computed_params(_scaled_features_test, params_using_numpy)[:10], np.array([3048.70132238, 2936.30572427, 3099.06483764, 2951.08687085,
3317.45752556, 2829.58150383, 2852.31739358, 2812.48676972,
2940.1100774 , 3012.70870057])).all()
Question 7¶
Now we'll use the sklearn library's linear regression to compute the parameters (thetas), as we did above. The parameter values should be the same as the ones we got using numpy.
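For orientation, a sketch of the sklearn route; the key detail is fit_intercept=False, since the bias column is already part of the features (the name fit_with_sklearn_sketch is illustrative):
def fit_with_sklearn_sketch(X_b, y):
    '''Fit LinearRegression on features that already include a bias column.'''
    model = LinearRegression(fit_intercept=False)  # the last feature acts as the intercept
    model.fit(X_b, y)
    return model, model.coef_                      # coef_ has shape (d+1,)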
def find_linear_reg_model_params_using_sklearn(train_scaled_feats_bias_included, y_train):
'''
args: ndarray -> train_scaled_feats_bias_included -> shape (m, d+1), already includes the bias (1 as the final feature)
pandas.core.series.Series -> y_train -> shape (m,)
return: tuple (model, params) -> model is of type "sklearn.linear_model._base.LinearRegression",
params is of type ndarray of shape (d+1,)
Pay attention to argument 'fit_intercept'. Note that we're passing features with bias included.
'''
# YOUR CODE HERE
raise NotImplementedError()
# To test this, you have to first implement add_bias()
train_scaled_feats_bias_included = add_bias(_scaled_features_train)
assert np.isclose(find_linear_reg_model_params_using_sklearn(train_scaled_feats_bias_included, y_train)[1], np.array([-1.83668608e+02, 2.14610082e+02, 1.86071338e+02, -4.34628838e-01,
3.02967905e+03])).all()
Note that the parameters found using numpy and sklearn are exactly the same. That's one of the hidden tests above.
reg_model = find_linear_reg_model_params_using_sklearn(train_scaled_feats_bias_included, y_train)[0]
reg_model
def get_coefficient_of_determination(model, _scaled_features_test, y_test):
'''
args: model -> of type sklearn.linear_model._base.LinearRegression
_scaled_features_test -> ndarray -> shape (m, d)
y_test -> pandas.core.series.Series
return: float (score)
Note that the model was originally fitted on data of shape (m, d+1), bias included.
Use sklearn.
'''
# YOUR CODE HERE
raise NotImplementedError()
score = get_coefficient_of_determination(reg_model, _scaled_features_test, y_test)
score # ~.99
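For reference, a regressor's score method returns the coefficient of determination R²; a sketch assuming add_bias is implemented and the model was fitted on bias-included features:
def r2_sketch(model, X, y_true):
    '''Coefficient of determination of `model` on (X, y_true).'''
    return model.score(add_bias(X), y_true)  # add the bias column to match the training features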
RMSE¶
Now, compute the root mean squared error (RMSE). You may use sklearn.
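A sketch of one possible approach using sklearn's mean_squared_error (which returns the MSE, so take the square root); it assumes add_bias is implemented and that the true labels are passed in explicitly:
from sklearn.metrics import mean_squared_error

def rmse_sketch(model, X, y_true):
    '''Root mean squared error of `model` on (X, y_true).'''
    preds = model.predict(add_bias(X))                  # model was fitted on bias-included features
    return np.sqrt(mean_squared_error(y_true, preds))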
## Import any library you need. We've not imported any packages for this one.
# YOUR CODE HERE
raise NotImplementedError()
def rmse(model, _scaled_features_test):
'''
args: model -> type "sklearn.linear_model._base.LinearRegression"
_scaled_features_test -> ndarray -> of shape (m, d)
return: float (rmse)
Note that the model was trained on data of shape (m, d+1), bias included.
Hint: pay attention to details. It's easy to get this wrong since all test cases are hidden.
'''
# YOUR CODE HERE
raise NotImplementedError()