Test 2
Before you turn this problem in, make sure everything runs as expected. First, restart the kernel (in the menubar, select Kernel$\rightarrow$Restart) and then run all cells (in the menubar, select Cell$\rightarrow$Run All).
Make sure you fill in any place that says YOUR CODE HERE
or "YOUR ANSWER HERE", as well as your name and collaborators below:
NAME = ""
COLLABORATORS = ""
Feel free to go online. In fact, we encourage you to read documentation where needed. However, you may not collaborate with anybody. To certify that you didn't collaborate with anyone, write 'Nobody' in COLLABORATORS above.
# Set up library imports. These imports also give you pointers on how to approach a question.
from sklearn.datasets import load_boston  # note: load_boston was deprecated and later removed in newer scikit-learn releases; this notebook assumes a version that still provides it
import pandas as pd
from sklearn.svm import SVR
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import numpy as np
def minimum_features_to_classify_dataset():
'''
arg: None
return: list (containing strings)
Features are called x1, x2, x1^2, x2^2, x1x2. Return your answer as a list of strings.
For example, if your answer is x1 and x2, return ['x1', 'x2']
'''
# YOUR CODE HERE
raise NotImplementedError()
## This test is only checking whether your list contains a string. Other tests are hidden
assert isinstance(minimum_features_to_classify_dataset()[0], str)
# YOUR CODE HERE
raise NotImplementedError()
all_data_boston = get_boston_data()
assert list(all_data_boston.keys()) == ['data', 'target', 'feature_names', 'DESCR', 'filename']
Raw Features¶
The data as it is, before any feature engineering.
raw_features = get_boston_data().data # ndarray of shape (n_samples, n_features)
raw_features.shape
Question 3¶
(2 points)
Store the data in a pandas DataFrame with the correct column names.
Hint: all_data_boston
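For intuition, here is a minimal sketch (on a tiny made-up array, not the Boston data) of how a numpy array and a list of column names combine into a DataFrame:
# Illustrative sketch only (toy data): an ndarray plus column names gives a labelled DataFrame.
import numpy as np
import pandas as pd
toy_data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
toy_df = pd.DataFrame(toy_data, columns=['a', 'b', 'c'])
toy_df.head()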
def get_df(all_data_boston):
'''
args: all_data_boston -> dict containing the Boston dataset returned by get_boston_data()
return: pandas DataFrame
Put the data into a DataFrame with the correct column names.
'''
# YOUR CODE HERE
raise NotImplementedError()
assert list(get_df(all_data_boston).columns) == ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
'PTRATIO', 'B', 'LSTAT']
df_head = get_df(all_data_boston).head()
df_head
What does each column name mean?
Answer
labels = all_data_boston.target
Labels¶
MEDV Median value of owner-occupied homes in $1000’s
# First 10 labels
labels[:10]
Question 4¶
(2 points)
(A question on Sklearn)
We'll observe the performance of our ML model on a chunk of data it has not previously seen. To this end, divide the raw features into two parts: call one the training set and the other the test set. We'll see more of the train-test split in the future as well. Read the documentation on sklearn's train_test_split function for more information.
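As an illustration of the interface (a minimal sketch on made-up data, separate from the graded cell below), train_test_split divides features and labels consistently:
# Illustrative sketch on toy data: 10 samples, 2 features.
import numpy as np
from sklearn.model_selection import train_test_split
toy_X = np.arange(20).reshape(10, 2)
toy_y = np.arange(10)
# 30% of the rows go to the test split; random_state makes the shuffle reproducible.
toy_X_train, toy_X_test, toy_y_train, toy_y_test = train_test_split(toy_X, toy_y, test_size=0.3, random_state=42)
toy_X_train.shape, toy_X_test.shape  # (7, 2) and (3, 2)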
def split(raw_features, labels, _test_size, _random_state=42):
'''
args: raw_features -> numpy.ndarray
labels -> pandas.core.series.Series
_test_size -> float (between 0 and 1)
_random_state -> int (to reproduce the same results across multiple runs)
return:
X_train -> numpy.ndarray
X_test -> numpy.ndarray
y_train -> pandas.core.series.Series
y_test -> pandas.core.series.Series
'''
# YOUR CODE HERE
raise NotImplementedError()
x_train_raw_feats, x_test_raw_feats, y_train_raw_feats, y_test_raw_feats = split(\
raw_features, labels, _test_size=.3)
assert x_train_raw_feats.shape == (354, 13)
assert x_test_raw_feats.shape == (152, 13)
assert y_train_raw_feats.shape[0] == x_train_raw_feats.shape[0]
assert y_test_raw_feats.shape[0] == x_test_raw_feats.shape[0]
x_train_raw_feats, x_test_raw_feats, y_train_raw_feats, y_test_raw_feats = split(\
raw_features, labels, _test_size=.3)
Importance of Feature Engineering¶
We'll use an algorithm called support vector regression (SVR). For now, you may assume SVR is just another ML algorithm. You'll observe how SVR performs better when features are engineered well.
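As a quick illustration (a toy sketch on made-up numbers, independent of the graded cells below), an SVR model exposes the same fit/predict interface as other sklearn estimators:
# Illustrative sketch: fit an SVR on five one-dimensional toy points and predict on a new one.
import numpy as np
from sklearn.svm import SVR
toy_X = np.array([[0.0], [1.0], [2.0], [3.0], [4.0]])
toy_y = np.array([0.0, 0.9, 2.1, 2.9, 4.2])
toy_svr = SVR().fit(toy_X, toy_y)   # fit returns the estimator itself
toy_svr.predict(np.array([[2.5]]))  # predict on unseen input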
Question 5¶
(2 points)
(A question on Sklearn)
As mentioned above, you don't need to know anything about SVR to answer this question. This question tests how well you can read documentation (sklearn's, in this case), just like Question 4.
In this question you'll write a function that returns an object of type sklearn.svm._classes.SVR
which is fitted (trained) on x_train (raw features) and y_train (labels). We can use the SVR returned by this function to make predictions.
You may also find the documentation on the fit method helpful.
# YOUR CODE HERE
raise NotImplementedError()
Fit¶
Fitting on unscaled x_train_raw_feats and y_train_raw_feats by calling the function you implemented above.
svr_model = fit_model(x_train_raw_feats, y_train_raw_feats)
assert str(type(svr_model)) == "<class 'sklearn.svm._classes.SVR'>"
To see how well SVR performed on the unseen chunk of data (x_test_raw_feats, y_test_raw_feats), we'll use the coefficient of determination as a measure, called score (its best possible value is 1). The higher this score, the better the performance.
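Concretely, score here is the $R^2$ statistic, $R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$. A small sketch of computing it by hand on made-up values (toy numbers, not the Boston data):
# Illustrative sketch: the same R^2 that .score() reports, computed directly with numpy.
import numpy as np
toy_y_true = np.array([3.0, 2.0, 7.0, 4.0])
toy_y_pred = np.array([2.5, 2.0, 8.0, 4.5])
ss_res = np.sum((toy_y_true - toy_y_pred) ** 2)         # residual sum of squares
ss_tot = np.sum((toy_y_true - toy_y_true.mean()) ** 2)  # total sum of squares
1.0 - ss_res / ss_tot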
svr_model.score(x_test_raw_feats, y_test_raw_feats)
You should see a score of ~0.28. Let's try to improve this score.
Question 6¶
(5 points)
Zero-mean and unit-variance scaling using numpy only¶
Given x_train (raw features in an ndarray), you'll standard-scale it such that each feature (column) in x_train has mean 0 and variance 1. Specifically, for each feature subtract its mean and divide by its standard deviation. Note that you may only use numpy functions for this one; you may not use any sklearn functions. Fill in the two functions below in the class.
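For intuition, here is a minimal numpy-only sketch of column-wise standardization on a tiny made-up matrix (illustrative, separate from the graded class below):
# Illustrative sketch: standardize each column of a toy (3, 2) array.
import numpy as np
toy = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
col_mean = toy.mean(axis=0)   # per-feature (column) mean
col_std = toy.std(axis=0)     # per-feature standard deviation
toy_scaled = (toy - col_mean) / col_std
toy_scaled.mean(axis=0), toy_scaled.std(axis=0)   # approximately 0 and 1 per column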
class _Standard_Scaler(object):
def __init__(self, x):
self.mean_each_feat = 0.0 # mean of each feature- will become an ndarray
self.std_dev_each_feat = 0.0 # standard deviation of each feature- will become an ndarray
self.x = x
def compute_mean_and_std_dev(self):
'''
args: None (uses self.x -> ndarray of raw features, of shape (m,n))
return: None (compute the mean and standard deviation of each feature and
store them in self.mean_each_feat and self.std_dev_each_feat respectively)
You may only use numpy functions.
'''
# YOUR CODE HERE
raise NotImplementedError()
def get_params(self):
return self.mean_each_feat, self.std_dev_each_feat
def get_scaled_input(self, _mean=None, _std=None):
'''
args: None
return: scaled x such that each feature has 0 mean and unit var (also unit std)
'''
if _mean is None:
_mean = self.mean_each_feat
if _std is None:
_std = self.std_dev_each_feat
# YOUR CODE HERE
raise NotImplementedError()
_scaler = _Standard_Scaler(x_train_raw_feats)
_scaler.compute_mean_and_std_dev()
_scaler.get_scaled_input()
assert _scaler.get_scaled_input().shape == x_train_raw_feats.shape
scaled_feats = _scaler.get_scaled_input()
assert np.isclose(scaled_feats[:2], np.array([[-0.41425879, -0.50512499, -1.29214218, -0.28154625, -0.85108479,
0.14526384, -0.365584 , 1.08162833, -0.74617905, -1.11279004,
0.18727079, 0.39651419, -1.01531611],
[-0.40200818, -0.50512499, -0.16208345, -0.28154625, -0.08796708,
-0.20840082, 0.13394078, -0.48787608, -0.39846419, 0.15008778,
-0.21208981, 0.3870674 , -0.05366252]])).all()
_scaler = _Standard_Scaler(x_train_raw_feats)
_scaler.compute_mean_and_std_dev()
scaled_feats_x_train = _scaler.get_scaled_input()
_scaler.get_params()
Ungraded Question 7¶
Can you see the importance of creating a class to implement a _Standard_Scaler?
Answer¶
To scale features in the test set we'd like to use the same mean and std as the train set, since we assume the train and test sets are drawn from the same distribution. Not computing the mean and std on the test set is especially useful if your test set is very small, or if you want to predict the label of only one data point, for example. We can simply call get_params() to get the mean and std and use them to scale the test set as well.
_scaler.get_params()
(array([3.46988686e+00, 1.14039548e+01, 1.11330508e+01, 7.34463277e-02,
5.57259322e-01, 6.32567232e+00, 6.87997175e+01, 3.76587401e+00,
9.43785311e+00, 4.07042373e+02, 1.82779661e+01, 3.59701808e+02,
1.24211299e+01]),
array([8.30407703e+00, 2.25765011e+01, 6.92884344e+00, 2.60867715e-01,
1.16626831e-01, 7.18194456e-01, 2.76262572e+01, 2.12302684e+00,
8.62775916e+00, 1.66286870e+02, 2.25360235e+00, 8.68019175e+01,
7.10234960e+00]))
Along these lines, also think about the other design decisions we made while writing this class, especially the arguments of get_scaled_input(self, _mean=None, _std=None).
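For example (a sketch that assumes your class above is implemented), a previously unseen row could be scaled with the training-set statistics like this:
# Illustrative sketch: reuse the training-set mean and std on new, unseen data.
train_mean, train_std = _scaler.get_params()
x_new = x_test_raw_feats[:1]                       # pretend this single row just arrived
x_new_scaled = (x_new - train_mean) / train_std    # scaled with *training* statistics only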
def standard_scale_with_sklearn(x):
'''
args: x -> ndarray (here, containing raw features) of shape (m,n)
return: standard-scaled x of shape (m,n)
'''
# YOUR CODE HERE
raise NotImplementedError()
scaled_feats = standard_scale_with_sklearn(x_train_raw_feats)
assert scaled_feats.shape == x_train_raw_feats.shape
assert np.isclose(scaled_feats[:2], np.array([[-0.41425879, -0.50512499, -1.29214218, -0.28154625, -0.85108479,
0.14526384, -0.365584 , 1.08162833, -0.74617905, -1.11279004,
0.18727079, 0.39651419, -1.01531611],
[-0.40200818, -0.50512499, -0.16208345, -0.28154625, -0.08796708,
-0.20840082, 0.13394078, -0.48787608, -0.39846419, 0.15008778,
-0.21208981, 0.3870674 , -0.05366252]])).all()
Remarks on Question 7¶
Odds are you called a fit method and a transform method. This interface is uniform across sklearn and you'll see it a lot. Did you notice the similarities in design between our standard scaler class and sklearn's StandardScaler? How does sklearn's StandardScaler resolve the problem of scaling the test set?
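For reference, a short sketch of that uniform interface with sklearn's own StandardScaler, using the variables already defined in this notebook: the statistics are learned once on the training data and then reused on the test data.
# Illustrative sketch: fit the scaler on training features only, then reuse it on the test features.
from sklearn.preprocessing import StandardScaler
sk_scaler = StandardScaler().fit(x_train_raw_feats)  # learns per-feature mean and std
x_train_sk = sk_scaler.transform(x_train_raw_feats)
x_test_sk = sk_scaler.transform(x_test_raw_feats)    # no refitting on the test set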
Question 9¶
(2 points)
As a quick sanity check, verify that each scaled feature indeed has mean 0 and unit variance, or at least values very close to 0 and 1.
def mean_and_std_of_each_feature(x_scaled):
''' args: x_scaled -> ndarray of shape (m,n)
return: tuple (mean, std), each of shape (n,)
'''
# YOUR CODE HERE
raise NotImplementedError()
x_scaled_feats = standard_scale_with_sklearn(x_train_raw_feats)
assert np.isclose(mean_and_std_of_each_feature(x_scaled_feats)[0], np.array([-1.26232985e-16, -4.82978378e-17, 3.72473552e-15, -6.68015549e-17,
-5.44322904e-15, -1.59406386e-15, -6.96241558e-17, -2.24459497e-15,
-8.18554264e-17, -1.89035855e-16, 1.72849807e-14, 8.11654573e-15,
-7.53320821e-16])).all()
assert np.isclose(mean_and_std_of_each_feature(x_scaled_feats)[1], np.array(([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]))).all()
x_train_scaled_feats = standard_scale_with_sklearn(x_train_raw_feats)
Train SVR on scaled input¶
svr_model_for_scaled_input = fit_model(x_train_scaled_feats, y_train_raw_feats)
Compute score on scaled test set¶
## test features scaled with the same scaling function
x_test_scaled = standard_scale_with_sklearn(x_test_raw_feats)
svr_model_for_scaled_input.score(x_test_scaled, y_test_raw_feats)
You should see a considerably higher score of ~0.65.
Question 10¶
(5 points)
Curse of Dimensionality¶
Extract Best Features¶
Extract the best $k$ features, where $k < n$ and $n$ is the original number of features (i.e. the number of features in x_train), such that the score improves over 0.6516 (the score will improve only very slightly in this case, though). For this question, import the relevant sklearn function in the cell below. We've intentionally not imported it for you at the top.
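One common approach in sklearn (a sketch under the assumption that you use univariate feature selection; other selectors are equally valid, and k=8 is purely illustrative):
# Illustrative sketch: keep the k columns with the highest univariate f_regression scores.
from sklearn.feature_selection import SelectKBest, f_regression
toy_selector = SelectKBest(f_regression, k=8).fit(x_train_scaled_feats, y_train_raw_feats)
x_train_small = toy_selector.transform(x_train_scaled_feats)  # shape (n_train_samples, 8)
x_test_small = toy_selector.transform(x_test_scaled)          # the same 8 columns are kept for the test set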
# Import necessary sklearn packages here.
# YOUR CODE HERE
raise NotImplementedError()
def select_k_best_feats(_data_dict, _K):
'''args: _data_dict -> dict with the following keys: x_train_scaled_feats,
y_train_raw_feats, x_test_scaled, y_test_raw_feats
_K -> int (number of best features to keep)
return: tuple -> (score, k) -> (float, int)
where k is the number of best features.
'''
# unpack train and test data from _data_dict
x_train_scaled_feats, y_train_raw_feats, x_test_scaled, y_test_raw_feats = _data_dict['x_train_scaled_feats'],\
_data_dict['y_train_raw_feats'], _data_dict['x_test_scaled'], _data_dict['y_test_raw_feats'],
# truncated features of x_test (currently set to None; assigned and used at the end)
x_test_truncated = None
# Try different values of k and report which one gives the highest score.
##########################
# Once you determine it, overwrite the value of '_K' below with the one which gives the highest score, e.g. replace _K by 8.
_K = _K
##########################
# YOUR CODE HERE
raise NotImplementedError()
svr_model_for_k_best_feats = svr_model_for_scaled_input.fit(x_train_truncated, y_train_raw_feats)
# score on x_test_truncated and y_test_raw_feats
_score = svr_model_for_k_best_feats.score(x_test_truncated, y_test_raw_feats)
return _score, _K
# Use this cell for your scratch work. For example, you can call
# the function above inside a for loop to print scores and the corresponding values of k, thereby noting the best k.
_data_dict = {'x_train_scaled_feats' : x_train_scaled_feats, 'y_train_raw_feats' : y_train_raw_feats, \
'x_test_scaled' : x_test_scaled, 'y_test_raw_feats' : y_test_raw_feats}
assert len(select_k_best_feats(_data_dict, 4)) == 2
assert isinstance(select_k_best_feats(_data_dict, 4)[1], int)
assert isinstance(select_k_best_feats(_data_dict, 4)[0], float)
# The tests above are superficial. Other tests are hidden as in every other question.
We used fewer features, reduced computational expense, and got a higher score. Win-win.