Test 8
Before you turn this problem in, make sure everything runs as expected. First, restart the kernel (in the menubar, select Kernel$\rightarrow$Restart) and then run all cells (in the menubar, select Cell$\rightarrow$Run All).
Make sure you fill in any place that says YOUR CODE HERE
or "YOUR ANSWER HERE", as well as your name and collaborators below:
NAME = ""
COLLABORATORS = ""
Ensemble Learning¶
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
import nose.tools as test_  # for testing
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Useful for pretty-printing numpy arrays.
from IPython.display import HTML, display
import tabulate
def pp(a, show_head=True):
'''
args: a -> array (1-D or 2-D) to display
show_head -> if True, display only the first 5 rows.
return: None
'''
if a.ndim < 2:
a = [a]
if show_head:
display(HTML(tabulate.tabulate(a[:5], tablefmt='html')))
return
display(HTML(tabulate.tabulate(a, tablefmt='html')))
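A quick, optional sanity check of the helper (the array here is arbitrary, just to show what pp renders):
# Render a small 3x4 array as an HTML table using the helper above.
pp(np.arange(12).reshape(3, 4))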
In this test, we'll use SVC, bagging-style ensembles (voting, stacking, etc.) and boosting, and compare their performances. To this end, let's use a difficult-to-learn dataset with 16 classes and 7 features (5 informative and 2 redundant).
Question 1¶
3 Points
Generate a dataset using make_classification given a number of samples to generate, number of classes and number of features.
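For reference, here is a minimal sketch of how make_classification maps onto these arguments (a small demo call, not the graded solution; adapt it inside generate_dataset):
# Sketch only: make_classification returns (X, y) as ndarrays.
X_demo, y_demo = make_classification(
    n_samples=1000, n_classes=16, n_features=7,
    n_informative=5, n_redundant=2,
    random_state=42, shuffle=True)
print(X_demo.shape, y_demo.shape)  # (1000, 7) (1000,)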
def generate_dataset(n_samples_, n_classes_, n_features_, \
n_informative_, n_redundant_, random_state_, shuffle_):
'''
args: n_samples_ -> int => number of samples to generate
n_classes_ -> int => number of classes in your dataset
n_features_ -> int => total number of features (informative + redundant)
n_informative_ -> int => number of informative features
n_redundant_ -> int => number of redundant features
random_state_ -> int => random state (for reproducible results)
shuffle_ -> bool => whether to shuffle the data.
return: tuple (X, y) => X is an ndarray of features, shape (m, 7)
=> y is an ndarray of labels, shape (m,)
'''
# YOUR CODE HERE
raise NotImplementedError()
X_for_test, y_for_test = generate_dataset(10000, 16, 7, 5, 2, 42, True)
test_.eq_(X_for_test.shape, (10000, 7))
test_.eq_(y_for_test.shape, (10000,))
X, y = generate_dataset(10000, 16, 7, 5, 2, 42, True)
print('Dataset with 5 informative features and 2 redundant features:')
pp(X)
print('Labels (Total Classes-16):')
pp(y)
Train Test Split¶
X_train, X_test, y_train, y_test = train_test_split(X, y, \
test_size=0.33, random_state=42)
Train SVC¶
svc = SVC()
svc.fit(X_train, y_train)
Predict Labels¶
y_pred = svc.predict(X_test)
svc.score(X_test, y_test)
accuracy_score(y_test, y_pred)
Both accuracy_score and svc.score are low (~56%) on this difficult-to-learn multiclass dataset (note that for classifiers, .score returns mean accuracy, not an R² score).
Think about how many decision boundaries are needed to separate 16 classes: a one-vs-one scheme, as SVC uses internally, trains $\binom{16}{2} = 120$ binary classifiers, while one-vs-rest needs 16. With only a few informative features, this is a much harder task than binary classification.
Bagging¶
Let's try to improve on this accuracy with bagging-style ensembles, if possible.
Question 2¶
4 Points
Implement a voting classifier, using 4 classifiers:
- LogisticRegression
- RandomForestClassifier (with 100 estimators)
- Gaussian Naive Bayes (GaussianNB)
- Support Vector Classifier (SVC).
Import any sklearn libraries/functions you need below.
Note that SVC in general performs better when data is standardized; we're intentionally passing the data as-is for this question.
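For orientation, a hedged sketch of what the four (unfitted) estimators could look like; the exact constructor arguments are implied by the assertion cell further below, so treat this as a sketch rather than the graded answer. Inside the function, substitute the rs and n_estimators_ arguments for the concrete values used here:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Sketch with concrete values (rs=42, 100 estimators).
clf1_demo = LogisticRegression(multi_class='multinomial', random_state=42)
clf2_demo = RandomForestClassifier(n_estimators=100, random_state=42)
clf3_demo = GaussianNB()
clf4_demo = SVC(random_state=42)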
# Set up any library imports here
# YOUR CODE HERE
raise NotImplementedError()
def instantiate_classifiers_for_voting(rs, n_estimators_):
'''
Return 4 instantiated (not fitted) classifiers.
args: rs -> int => random state for all classifiers which accept it.
n_estimators_ -> int => number of estimators for RandomForestClassifier
return: tuple of length 4 -> (clf1, clf2, clf3, clf4) => each element is
an instantiated classifier, in this order: LogisticRegression, RandomForestClassifier,
GaussianNB, SVC.
Other:
> In LogisticRegression, use 'multinomial' for the 'multi_class' arg.
> For all classifiers, leave the other parameters at their default values.
'''
# YOUR CODE HERE
raise NotImplementedError()
test_.eq_ (len(instantiate_classifiers_for_voting(34, 10)), 4)
logr = instantiate_classifiers_for_voting(34, 10)[0]
test_.eq_(logr.random_state, 34)
_classifiers_voting = instantiate_classifiers_for_voting(42, 100)
_classifiers_voting
Question 3¶
4 Points
Fit a voting classifier using the classifiers you instantiated above, using sklearn. Look up which sklearn class/function you'll need and import it below.
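One relevant sklearn class is VotingClassifier from sklearn.ensemble, which takes a list of (name, estimator) pairs. A minimal, self-contained sketch of hard voting on a toy dataset (the names and data here are illustrative only):
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=200, random_state=0)
voter = VotingClassifier(
    estimators=[('rf', RandomForestClassifier(random_state=0)),
                ('nb', GaussianNB())],
    voting='hard')          # majority vote over predicted labels
voter.fit(X_toy, y_toy)     # fitting the ensemble also fits each estimator
print(voter.estimators_)    # the fitted clones live in .estimators_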
# YOUR CODE HERE
raise NotImplementedError()
def fit_voting_classifier(classifiers_, voting_):
'''
Fit a Voting classifier which uses classifiers_ as estimators. Note that in sklearn
fitting a voting classifier will automatically fit its estimators. Read relevant documentation
for more information.
args: classifiers_ -> tuple of length 4 -> (clf1, clf2, clf3, clf4) => Each element is
an instantiated classifier.
voting_ -> string -> 'hard', 'soft'. We'll only do 'hard' voting here. Do not
worry about soft voting at this point. Read relevant documentation
for more information.
return: fitted voting classifier.
'''
# YOUR CODE HERE
raise NotImplementedError()
_vot_clf = fit_voting_classifier(_classifiers_voting, 'hard')
estimators_list = _vot_clf.estimators_
test_.eq_ (
[str(est_name) for est_name in estimators_list], [
"LogisticRegression(multi_class='multinomial', random_state=42)",
'RandomForestClassifier(random_state=42)',
'GaussianNB()',
'SVC(random_state=42)'
]
)
Train¶
_vot_clf = fit_voting_classifier(_classifiers_voting, 'hard')
Predict¶
y_pred_vot_clf = _vot_clf.predict(X_test)
Accuracy¶
print('Mean accuracy (clf.score):', _vot_clf.score(X_test, y_test))
print('Accuracy:', accuracy_score(y_test, y_pred_vot_clf))
You should see an accuracy of ~62%, a clear improvement over using SVC alone. Voting improved performance.
Aside: Netflix awarded a $1 million prize to a developer team in 2009 for an algorithm that increased the accuracy of the company's recommendation engine by 10 percent.
Stacking¶
Let's now try stacking and observe how it behaves on our dataset. Recall what stacking is (reread the tutorial if needed).
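In sklearn, stacking is available as StackingClassifier: the base estimators' (cross-validated) predictions become the input features of a final estimator. A minimal sketch on toy data (illustrative only, not the graded setup):
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import make_classification

X_toy, y_toy = make_classification(n_samples=200, random_state=0)
stacker = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(random_state=0)),
                ('nb', GaussianNB())],
    final_estimator=LogisticRegression())  # meta-learner trained on base predictions
stacker.fit(X_toy, y_toy)
print(stacker.score(X_toy, y_toy))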
Question 4¶
4 Points
Implement a stacking classifier, using 4 base classifiers:
- RandomForestClassifier (with 100 estimators)
- SVC, this time with the data standardized using StandardScaler
- LogisticRegression
- Gaussian Naive Bayes (GaussianNB)
plus a LogisticRegression as the final (meta) estimator.
Import any sklearn libraries/functions you need below.
Note that SVC in general performs better when data is standardized; we'll do so in this question. You may find make_pipeline useful (see the sketch below).
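To standardize inputs just for the SVC, one common pattern is to wrap it in a pipeline, so scaling happens inside that one estimator's fit/predict. A hedged sketch (the max_iter and random_state values mirror the assertion cell below):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# The pipeline behaves like a single classifier: it scales, then runs SVC.
scaled_svc_demo = make_pipeline(StandardScaler(),
                                SVC(max_iter=100000, random_state=42))
print(scaled_svc_demo)  # Pipeline(steps=[('standardscaler', ...), ('svc', ...)])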
# Set up any library imports here
# YOUR CODE HERE
raise NotImplementedError()
def instantiate_classifiers_for_stacking(rs, n_estimators_, max_iter_):
'''
Return 5 instantiated (not fitted) classifiers.
args: rs -> int => random state for all classifiers which accept it.
n_estimators_ -> int => number of estimators for RandomForestClassifier
max_iter_ -> int => pass this to SVC and both Logistic Regressions to silence warnings.
Read relevant docs for more information.
return: tuple of length 5 -> (clf1, clf2, clf3, clf4, final_clf) => each of the first four is
an instantiated classifier, in this order: RandomForestClassifier,
SVC (wrapped in a standardizing pipeline), LogisticRegression, GaussianNB. final_clf is also a
LogisticRegression.
Other:
> In both LogisticRegressions use 'multinomial' for 'multi_class' arg.
> For all classifiers, leave other parameters with their default values.
'''
# YOUR CODE HERE
raise NotImplementedError()
test_.eq_ (len(instantiate_classifiers_for_stacking(34, 10, 10000)), 5)
logr = instantiate_classifiers_for_stacking(34, 10, 10000)[0]
test_.eq_(logr.random_state, 34)
classifiers_stacking = instantiate_classifiers_for_stacking(42, 100, 100000)
# Set up any library imports here
# YOUR CODE HERE
raise NotImplementedError()
def fit_stacking_classifier(classifiers_):
'''
Fit a stacking classifier which uses classifiers_ as estimators (the last element being the
final estimator). Note that in sklearn, fitting a stacking classifier will automatically fit
its estimators. Read the relevant documentation for more information.
args: classifiers_ -> tuple of length 5 -> (clf1, clf2, clf3, clf4, final_clf) => each element is
an instantiated classifier.
return: fitted stacking classifier.
The solution should be very similar to that of fit_voting_classifier.
'''
# YOUR CODE HERE
raise NotImplementedError()
_stack_clf = fit_stacking_classifier(classifiers_stacking)
estimators_list = _stack_clf.estimators_
test_.eq_ (
["".join(str(est_name).split()) for est_name in estimators_list], # remove white spaces then compare
[
'RandomForestClassifier(random_state=42)',
"Pipeline(steps=[('standardscaler',StandardScaler()),('svc',SVC(max_iter=100000,random_state=42))])",
"LogisticRegression(max_iter=100000,multi_class='multinomial',random_state=42)",
'GaussianNB()'
]
)
# YOUR CODE HERE
raise NotImplementedError()
Observe the performance.
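For example, a minimal way to look at test accuracy for the fitted stacking classifier (a sketch; it reuses _stack_clf, X_test, y_test and accuracy_score from earlier cells):
y_pred_stack = _stack_clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred_stack))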
Question 5¶
7 Points
This question is open-ended and will test you on several things, including how well you can read documentation. Train an XGBoost classifier on the given data; tune hyperparameters if you wish. To earn credit on this question, your accuracy on the test data should be $\ge 56\%$.
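One possible route (a sketch only, with hypothetical hyperparameter values): use xgboost's native API, which returns an xgboost.core.Booster, wrap the data in DMatrix objects, set a multiclass objective, and tune from there (e.g. with xgb.cv, or GridSearchCV on the sklearn wrapper). The sketch reuses the notebook's X_train/X_test split and accuracy_score:
import xgboost as xgb

# Hypothetical hyperparameters; tune them to clear the 56% bar.
params_demo = {
    'objective': 'multi:softmax',  # predict hard class labels directly
    'num_class': 16,               # 16 classes in this dataset
    'max_depth': 6,
    'eta': 0.3,
    'eval_metric': 'merror',
}
dtrain_demo = xgb.DMatrix(X_train, label=y_train)
dtest_demo = xgb.DMatrix(X_test, label=y_test)
bst_demo = xgb.train(params_demo, dtrain_demo, num_boost_round=100)

y_pred_demo = bst_demo.predict(dtest_demo)  # class labels (returned as floats)
print('Accuracy:', accuracy_score(y_test, y_pred_demo))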
# Set up any library imports here
# YOUR CODE HERE
raise NotImplementedError()
# You may use this cell (or create other cells)
# for scratch work, e.g. GridSearchCV for hyperparameter tuning.
def train_XGBoost(X_train_, y_train_):
'''
Train a booster object on the given training data.
args: X_train_ -> ndarray -> shape (m, 7)
y_train_ -> ndarray -> shape (m,)
return: xgboost.core.Booster object
The returned Booster should be trained (and hyperparameter-tuned, if needed) so that its
accuracy on the test data is at least 56% to earn credit.
'''
bst = None
# YOUR CODE HERE
raise NotImplementedError()
return bst # xgboost.core.Booster object
bst = train_XGBoost(X_train, y_train)
test_.eq_ (str(type(bst)), "<class 'xgboost.core.Booster'>")
Other Insights From XGBoost¶
xgb.plot_importance(bst)  # feature importance
# !pip install graphviz
xgb.to_graphviz(bst, num_trees=2)  # tree at index 2 (0-based). Change this number to display a different tree.