Ensemble Learning
# Useful in beautifying numpy arrays.
from IPython.display import HTML, display
import tabulate
def pp(a, show_head=True):
    '''
    Pretty-print a numpy array as an HTML table.
    args: show_head -> if True, print only the first 5 rows.
    return: None
    '''
    if a.ndim < 2:
        a = [a]  # wrap a 1-D array so tabulate treats it as a single row
    if show_head:
        display(HTML(tabulate.tabulate(a[:5], tablefmt='html')))
        return
    display(HTML(tabulate.tabulate(a, tablefmt='html')))
Ensemble Learning¶
Combining more than one trained model usually decreases generalization error (i.e. gives better performance on test data); this is called ensembling. In fact, the more de-correlated the models (estimators), the better the performance on test data. For example, if both of your estimators are SVMs trained on the same dataset, they are not very de-correlated.
See these notes for a theoretical justification of the previous argument.
Ways to generate de-correlated models (a minimal resampling sketch follows this list):
- Use different algorithms
- Use different training sets
- Bagging
- Boosting
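To make the "different training sets" idea concrete, here is a minimal sketch (not from the original notebook; the dataset and model choices are assumptions for illustration) that fits the same tree model on two bootstrap resamples of the same data and measures how often the two resulting estimators disagree:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
# Hypothetical toy data, just for illustration.
X_demo, y_demo = make_classification(n_samples=200, n_features=4, random_state=0)
rng = np.random.RandomState(0)
preds = []
for _ in range(2):
    # Bootstrap sample: draw indices with replacement.
    idx = rng.choice(len(X_demo), size=len(X_demo), replace=True)
    tree = DecisionTreeClassifier(random_state=0).fit(X_demo[idx], y_demo[idx])
    preds.append(tree.predict(X_demo))
# Fraction of points where the two bootstrapped trees disagree.
print('disagreement rate:', np.mean(preds[0] != preds[1]))
The non-zero disagreement rate is the de-correlation that averaging methods like bagging exploit.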
Bagging¶
Bagging is an example of an averaging method for ensembling: the driving principle is to build several estimators independently and then average their predictions. On average, the combined estimator is usually better than any single base estimator because its variance is reduced. We'll do Bagging by:
- Voting
- Stacking
- Random Forest
Voting¶
Soft Voting/Majority Rule classifier for unfitted estimators
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])
eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                         voting='hard')
eclf1 = eclf1.fit(X, y)
pp(eclf1.predict(X))
1 | 1 | 1 | 2 | 2 | 2 |
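For completeness, here is a hedged sketch of soft voting on the same toy estimators (an addition, not part of the original notebook): with voting='soft' the VotingClassifier averages the predicted class probabilities, so every base estimator must implement predict_proba.
# Soft voting: average predict_proba across estimators instead of counting hard votes.
eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
                         voting='soft')
eclf2 = eclf2.fit(X, y)
pp(eclf2.predict(X))  # on this tiny toy set the soft vote is expected to match the hard vote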
Stacking¶
The idea is to train $(n+1)$ classifiers such that $n$ of them are trained on the training data and the last, $(n+1)^{\text{th}}$, classifier is trained on the outputs of the other $n$ classifiers. Essentially, the outputs of the $n$ classifiers are stacked and fed to the $(n+1)^{\text{th}}$ classifier as input.
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
estimators = [('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
              ('svr', make_pipeline(StandardScaler(),
                                    LinearSVC(random_state=42)))
              ]
s_clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
s_clf
StackingClassifier(estimators=[('rf', RandomForestClassifier(n_estimators=10, random_state=42)), ('svr', Pipeline(steps=[('standardscaler', StandardScaler()), ('linearsvc', LinearSVC(random_state=42))]))], final_estimator=LogisticRegression())
# s_clf.fit(X, y)  # needs a larger dataset (more samples per class) before fitting; see below
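As a hedged follow-up (an assumption for illustration, not in the original notebook), the stacking classifier can be fit on a larger synthetic dataset; with more samples per class, the 5-fold cross-validation that StackingClassifier runs internally has enough data to work with.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Hypothetical larger dataset so the internal 5-fold CV has enough samples per class.
Xs, ys = make_classification(n_samples=500, n_features=6, random_state=42)
Xs_train, Xs_test, ys_train, ys_test = train_test_split(Xs, ys, random_state=42)
s_clf.fit(Xs_train, ys_train)
print('stacking test accuracy:', s_clf.score(Xs_test, ys_test))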
Random Forest¶
It trains a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting (i.e. reduce variance).
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
# Generate a synthetic random dataset with 4 features and 1000 examples (datapoints)
X, y = make_classification(n_samples=1000, n_features=4,
n_informative=2, n_redundant=0,
random_state=0, shuffle=False)
pp(X) # features
print('labels:')
pp(y) # labels. Binary.
-1.66853 | -1.29901 | 0.274647 | -0.60362 |
-2.97288 | -1.08878 | 0.70886 | 0.422819 |
-0.596141 | -1.37007 | -3.11686 | 0.644452 |
-1.06895 | -1.17506 | -1.91374 | 0.663562 |
-1.30527 | -0.965926 | -0.154072 | 1.19361 |
labels:
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | … | 1 | 1 | 1 | 1 | 1 | (single row of 1000 binary labels, truncated here)
Train a Random Forest on the dataset, limiting each tree to a maximum depth of 2 (the number of trees is left at the default of 100).
# Train a Random Forest
rf_clf = RandomForestClassifier(max_depth=2, random_state=0)
rf_clf.fit(X, y)
RandomForestClassifier(max_depth=2, random_state=0)
Predict label of a random test example.
random_test_example = [[0, 1, 0.5, 0.5]]  # An example with 4 features, matching the training data.
print('predicted label:')
pp(rf_clf.predict(random_test_example)) # prints 1 as label.
predicted label:
1 |
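A common follow-up (added here as an illustration, not in the original notebook) is to inspect which features the fitted forest relied on via its feature_importances_ attribute; since the data was generated with n_informative=2, two of the four features should dominate.
pp(rf_clf.feature_importances_)  # per-feature importance scores; they sum to 1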
Boosting¶
Bagging is a variance-reducing technique, whereas boosting is used for bias reduction. Since boosting reduces bias, to end up with good models (low variance and low bias) we feed the boosting algorithm high-bias, low-variance models, also known as weak learners; typically these are shallow decision trees.
Boosting is an ensemble technique where new models are added to correct the errors made by the existing models, and models are added sequentially until no further improvement can be made. The key idea is to track which examples the current classifier gets wrong and increase their relative weight compared to the correctly classified examples. We then train a new weak learner (e.g. a decision stump) that is more incentivized to correctly classify these "hard negatives." We continue incrementally re-weighting examples at each step, and at the end we output a weighted combination of the weak learners as the ensemble classifier.
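The example-re-weighting scheme described above is essentially AdaBoost. The sketch below (an illustrative addition using sklearn's AdaBoostClassifier, not the GradientBoostingClassifier used in the next cell; the toy dataset is an assumption) shows the idea with depth-1 decision stumps as the weak learners.
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
# Hypothetical toy data for the sketch.
Xb, yb = make_classification(n_samples=500, n_features=10, random_state=0)
# AdaBoost's default weak learner is a depth-1 decision stump; after each stump
# is fit, the misclassified examples receive larger weights so the next stump
# focuses on them.
ada = AdaBoostClassifier(n_estimators=50, random_state=0)
ada.fit(Xb, yb)
print('AdaBoost training accuracy:', ada.score(Xb, yb))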
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
X, y = make_hastie_10_2(random_state=0)
pp(X[:5]) # head of the dataset (features)
1.76405 | 0.400157 | 0.978738 | 2.24089 | 1.86756 | -0.977278 | 0.950088 | -0.151357 | -0.103219 | 0.410599 |
0.144044 | 1.45427 | 0.761038 | 0.121675 | 0.443863 | 0.333674 | 1.49408 | -0.205158 | 0.313068 | -0.854096 |
-2.55299 | 0.653619 | 0.864436 | -0.742165 | 2.26975 | -1.45437 | 0.0457585 | -0.187184 | 1.53278 | 1.46936 |
0.154947 | 0.378163 | -0.887786 | -1.9808 | -0.347912 | 0.156349 | 1.23029 | 1.20238 | -0.387327 | -0.302303 |
-1.04855 | -1.42002 | -1.70627 | 1.95078 | -0.509652 | -0.438074 | -1.2528 | 0.77749 | -1.6139 | -0.21274 |
pp(y[:5]) # first 5 labels.
1 | -1 | 1 | -1 | 1 |
training_size = int(len(X) * .8) # 20% data for testing
X_train, X_test = X[:training_size], X[training_size:]
y_train, y_test = y[:training_size], y[training_size:]
# Gradient boosting with shallow trees doesn't overfit easily; 100 is a good number of trees to use.
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
max_depth=1, random_state=0)
gb_clf.fit(X_train, y_train)
GradientBoostingClassifier(learning_rate=1.0, max_depth=1, random_state=0)
gb_clf.score(X_test, y_test) # score
0.92375
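To see how the test score evolves as trees are added, the fitted model's staged_predict generator can be used; it yields predictions after each boosting stage (a hedged add-on, not in the original notebook).
# Test accuracy after every 20th boosting stage.
for i, y_stage in enumerate(gb_clf.staged_predict(X_test), start=1):
    if i % 20 == 0:
        print(i, 'trees ->', np.mean(y_stage == y_test))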
Xgboost (Extreme Gradient Boosting)¶
XGBoost has recently gained a reputation as the algorithm of choice for many winning teams in machine learning competitions. Essentially, XGBoost is a better-engineered version of the gradient boosting we discussed above, in the sense that XGBoost offers:
- Higher execution speed
- Better model performance
It is really fast compared to other gradient boosting implementations and is therefore widely used.
Xgboost Docs and Xgboost Python.
Note that we're using XGBoost, which is an optimized distributed gradient boosting library; we're not using sklearn here.
# !pip3 install xgboost # If not already in your env.
import xgboost as xgb
# labels 0 and 1 instead of -1 and 1.
_y_train = [0 if x == -1 else 1 for x in y_train]
_y_test = [0 if x == -1 else 1 for x in y_test]
dtrain = xgb.DMatrix(X_train, _y_train)
dtest = xgb.DMatrix(X_test, _y_test)
dtrain
<xgboost.core.DMatrix at 0x7f853c593b00>
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
evallist = [(dtest, 'eval'), (dtrain, 'train')]
pp(X)
1.76405 | 0.400157 | 0.978738 | 2.24089 | 1.86756 | -0.977278 | 0.950088 | -0.151357 | -0.103219 | 0.410599 |
0.144044 | 1.45427 | 0.761038 | 0.121675 | 0.443863 | 0.333674 | 1.49408 | -0.205158 | 0.313068 | -0.854096 |
-2.55299 | 0.653619 | 0.864436 | -0.742165 | 2.26975 | -1.45437 | 0.0457585 | -0.187184 | 1.53278 | 1.46936 |
0.154947 | 0.378163 | -0.887786 | -1.9808 | -0.347912 | 0.156349 | 1.23029 | 1.20238 | -0.387327 | -0.302303 |
-1.04855 | -1.42002 | -1.70627 | 1.95078 | -0.509652 | -0.438074 | -1.2528 | 0.77749 | -1.6139 | -0.21274 |
num_round = 10
bst = xgb.train(param, dtrain, num_round)
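Note that evallist defined earlier isn't used in the call above; if you want the eval metric reported on both DMatrix sets each round, it can be passed through xgb.train's evals argument (shown here as a usage sketch).
# Same training call, but printing the eval metric on dtest and dtrain at every round.
bst = xgb.train(param, dtrain, num_round, evals=evallist)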
Prediction¶
ypred = bst.predict(dtest)  # predicted probabilities (the objective is binary:logistic)
_ypred = [1 if y > 0.5 else 0 for y in ypred]  # threshold at 0.5 to recover class labels
pp(np.asarray(_ypred))
1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | … | 0 | 0 | 1 | (full row of predicted labels for the test set, truncated here)
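To quantify these predictions (a small add-on, not in the original notebook), compare them against the held-out labels:
# Fraction of test examples classified correctly.
print('xgboost test accuracy:', np.mean(np.asarray(_ypred) == np.asarray(_y_test)))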
GPU Support¶
For now, keep in mind that XGBoost comes with GPU support to accelerate training and testing. You can run XGBoost on Colab with a GPU backend to observe much faster performance compared to CPUs. GPUs become extremely important in Deep Learning, which we'll come back to later.
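As a hedged illustration (the exact option names depend on your XGBoost version, so check the docs), GPU training is usually enabled through the tree_method setting:
# Older XGBoost releases: select the GPU histogram algorithm directly.
gpu_param = dict(param, tree_method='gpu_hist')
# Newer releases (2.0+) use tree_method='hist' together with device='cuda' instead.
bst_gpu = xgb.train(gpu_param, dtrain, num_round)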