# Module-level metadata strings; both are left empty in this file.
NAME = ""
COLLABORATORS = ""


# Useful in beautifying numpy arrays.
from IPython.display import HTML, display
import tabulate
def pp(a, show_head=True):
    '''
    Display an array as an HTML table.

    args: a -> object exposing `.ndim` and slicing (e.g. a numpy array).
               When it has fewer than 2 dimensions it is wrapped in a list
               so that it is rendered as a single row.
          show_head -> if True print only first 5 rows.
    return: None
    '''
    rows = [a] if a.ndim < 2 else a
    if show_head:
        rows = rows[:5]
    display(HTML(tabulate.tabulate(rows, tablefmt='html')))


import nltk
import collections
import string
import numpy as np
import sklearn
import pandas as pd
import csv
import re
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import nose.tools as test_

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Download the three NLTK resources this file asks for, in the same order.
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)


# Read the csv file dataset into a dataframe.
# The path is relative, so it is resolved against the current working directory.
tweets_df = pd.read_csv("UK_MPs_tweets/MPsTweets_from_24Aug_31Aug_2020.csv")
len(tweets_df) # total tweets (bare expression: has no effect when the file runs as a plain script)


# You may define any helper functions in this cell or any other cell if needed.
# NOTE: until the placeholder below is replaced, this top-level raise stops
# execution here when the file is run from top to bottom.
# YOUR CODE HERE
raise NotImplementedError()


# NLTK's English stopword list extended with four extra tokens
# ("http", "co", "rt", "amp") that are also to be dropped from tweets.
stopwords = {"http", "co", "rt", "amp"}.union(nltk.corpus.stopwords.words('english'))


# Create class (instead of a function) so that we don't have to pass 
# stopwords in every func call

class PreprocessTweets(object):
    '''
    Callable tweet preprocessor that carries its own stopword collection.

    The stopwords are supplied once, at construction, and stored on the
    instance so they do not have to be passed on every call.
    '''

    def __init__(self, _stopwords=None):
        '''
        args: _stopwords -> collection of stopwords. Omitted (or None) means
                            "no stopwords": a fresh empty list per instance.
        '''
        # A literal `[]` default would be one list object shared by every
        # instance created without arguments; build a new one each time.
        self.stopwords = [] if _stopwords is None else _stopwords

    def __call__(self, tweet_text):  # runs each time an instance is called, e.g. preprocess(text)
        '''
        args: tweet_text -> raw text of a single tweet.
        return: not implemented yet; this placeholder always raises.
        '''
        # YOUR CODE HERE
        raise NotImplementedError()


"  >>>>> Testing Without Stopwords <<<<< "
#___________________________________________________
# Instance built with no arguments, i.e. with the default (empty) stopword collection.
preprocess = PreprocessTweets()
# Possessive "'s" is expected to vanish entirely ("teacher's" -> 'teacher', "Let's" -> 'let').
test_.eq_ ((preprocess("teacher's",)), ['teacher'])
test_.eq_ ((preprocess("Let's get them back to classroom",)), ['let', 'get', 'them', 'back', 'to', 'classroom'])

# Underscores are kept: the handle is expected intact (minus the leading '@') in the output.
test_.eq_ ((preprocess("@gone_too_far__ Read the article ",)), 
           ['gone_too_far__', 'read', 'the', 'article'])

# Emoji and commas act as separators and do not appear in the expected tokens.
test_.eq_ ((preprocess("Good luck👍👍,Alex_Stafford")), ['good', 'luck', 'alex_stafford'])
# NOTE(review): the expected order here is ['world', 'hello'], i.e. reversed relative to the
# input, whereas "Hello-Griffitha" further down is expected in input order — confirm intended.
test_.eq_  (preprocess("$%#hello-^^world!!"), ['world', 'hello'])
#___________________________________________________

# URL: a well-formed URL is expected to be removed completely.
test_.eq_ (preprocess("Register 👉 https://t.co/dCjWFDKoKO"), ['register'])
# There is text embedded after the URL. Extract it.
test_.eq_ (preprocess("https://t.co/MAdn2K1PwH,Alex_Stafford,Conservative"), \
           ['alex_stafford', 'conservative'])
# NOTE(review): the expected output implies the URL ends after "3zi5fXSrOk"
# (presumably a fixed-length t.co path) so that "Hello" is kept — confirm.
test_.eq_ (preprocess("Register 👉 https://t.co/3zi5fXSrOkHello-Griffitha,Conservative"), \
           ['register', 'hello', 'griffitha', 'conservative'])

# URL contains non-ASCII chars: Invalid URL. Don't remove it entirely;
# it is expected to be split into ordinary tokens instead.
test_.eq_ (preprocess("https://t.co/ZhEyÄ¶aaaa"), ['http', 't', 'co', 'zhey', 'aaaa'])

# Apostrophes inside contractions are dropped without splitting ("I'm" -> 'im').
# NOTE(review): 'as' is expected as 'a' — presumably a lemmatizer artifact; confirm.
test_.eq_ ((preprocess("I'm live now with @AngelaRayner of @UKLabour as I")), 
           ['im', 'live', 'now', 'with', 'angelarayner', 'of', 'uklabour', 'a', 'i'])

test_.eq_ (preprocess("Don't…',GwynneMP,Labour"), ['dont', 'gwynnemp', 'labour'])

#___________________________________________________

# Hashtags Split at words. For simplicity, 
# assume that next word starts with a capital letter 

test_.eq_ (preprocess(' #ShopLocal'), ['shop', 'local'])
test_.eq_ (preprocess(' #EatOutHelpOut'), ['eat', 'out', 'help', 'out'])
# Lower-case runs are not split further: "Dayofthe" stays a single token.
test_.eq_ (preprocess('#InternationalDayoftheDisappeared'),
           ['international', 'dayofthe', 'disappeared'])
'''

'Aside':  # It's possible to split the above example into:
['International', 'Day', 'of', 'the', 'Disappeared'] using probablistic models. 
We' won't ask you to do so for the sake of this test- primarily for simplicity.
If interested, check this small module out >>> pip install wordninja

'''



"  >>>>> Testing With Stopwords <<<<< "

#___________________________________________________

# Same preprocessor, this time constructed with the module-level `stopwords` set.
preprocess_with_stopwords = PreprocessTweets(stopwords)
# Input is upper-case "LET'S": the expected output is still the lower-case 'let'.
test_.eq_ ((preprocess_with_stopwords("LET'S get them back to classroom")), 
           ['let', 'get', 'back', 'classroom'])

test_.eq_ ((preprocess_with_stopwords("I'm live now with @AngelaRayner of @UKLabour as I")), 
                                ['im', 'live', 'angelarayner', 'uklabour'])
# 'http' and 'co' are among the stopwords added in this file, so the URL fragments
# that survive for an invalid URL are now removed ('t' presumably comes from the NLTK list).
test_.eq_ (preprocess_with_stopwords("https://t.co/ZhEyÄ¶cccc"), ['zhey', 'cccc'])

test_.eq_ ((preprocess_with_stopwords('https://t.co/vVzR52faue",Afzal4Gorton,Labour')), 
                    ['afzal4gorton', 'labour'])
#___________________________________________________

# End-to-end check on a long tweet: emoji, a trailing URL and stopwords are all
# expected to be removed, and plural forms reduced ('Fisheries' -> 'fishery').
test_.eq_ (
    preprocess_with_stopwords
    ( # Don't confuse: backslashes are to break the line (not part of the tweet text).
      # The indentation at the start of each continued line IS part of the string.
        "✈ It's back to London today for a new Parliamentary\
        Session🗓 This Week I'm👨‍⚕️ In Health Questions🐟  \
        In the Fisheries Bill Debate🙋‍♂️ Question Number     \
        One in PMQs󠁢󠁳󠁣󠁴󠁿 Talking more Fish in Scottish Affairs       \
        Committee 🚢 Raising issue of Freedom of Navigation on   \
        the South China Sea https://t.co/aisLmptsCR"
    ), 
    [
        'back',
        'london',
        'today',
        'new',
        'parliamentary',
        'session',
        'week',
        'im',
        'health',
        'question',
        'fishery',
        'bill',
        'debate',
        'question',
        'number',
        'one',
        'pmqs',
        'talking',
        'fish',
        'scottish',
        'affair',
        'committee',
        'raising',
        'issue',
        'freedom',
        'navigation',
        'south',
        'china',
        'sea'
    ]
)
#___________________________________________________


# Pull the raw tweet text (features) and the party labels out of the dataframe.
raw_features_tweets = tweets_df['tweets']
labels = tweets_df['Party']


# Run every tweet through `preprocess` (the instance built without stopwords).
preprocessed_features = raw_features_tweets.apply(func=preprocess)


# Re-join the preprocessed features with their labels, column-wise.
preprocessed_df = pd.concat([preprocessed_features, labels], axis=1)
preprocessed_df.head()


# Shuffle and hold out 15% of the rows as the test set, with a fixed seed.
train_df, test_df = train_test_split(
    preprocessed_df,
    test_size=0.15,
    random_state=42,
    shuffle=True,
)


# Preview of the train split.
train_df.head()


def fit_TFIDF_vectorizer(train_data):
    '''
    Placeholder: fit a TFIDF vectorizer on the training data.

    args: train_data -> pandas.core.series.Series
    return: TFIDF Vectorizer that has been fitted but not used to transform
            (sklearn.feature_extraction.text.TfidfVectorizer)
    raises: NotImplementedError -> always, until the body is written.
    '''
    # YOUR CODE HERE
    raise NotImplementedError


# Extract Train tweets text
train_corpus = train_df['tweets']


# Fit a throwaway vectorizer and check it against hard-coded expected values.
tfidf__ = fit_TFIDF_vectorizer(train_corpus)
X_train__ = tfidf__.transform(train_corpus)
# Compares the last 15 entries of the first array returned by nonzero() with the
# fixture below. Presumably these are row indices of the non-zero TFIDF entries
# (scipy sparse nonzero() returns a (row, col) pair) — TODO confirm.
# NOTE(review): on a 2-tuple the `[:5]` slice looks like a no-op.
test_.ok_ ((X_train__.nonzero()[:5][0][-15:] == np.asarray(
    [
        2942, 2942, 2942, 2942, 2942, 2942, 
        2942, 2942, 2942, 2942, 2943,
        2943, 2943, 2943, 2943
    ]
)).all())


# Fit the vectorizer that is used for the model below.
tfidf = fit_TFIDF_vectorizer(train_corpus)


# In the transform method, actual TFIDF matrix will be created.
X_train = tfidf.transform(train_corpus)
# Convert labels to integers [0,1].
Train_labels = train_df['Party']
# Any label other than 'Conservative' maps to 0.
y_train = [1 if l == 'Conservative' else 0 for l in Train_labels]
y_train[:5] # First 5 labels. 1 for conservative. 0 for labour.


# Instantiate a SVC.
clf = SVC()


# Train the classifier on train data.
clf.fit(X_train, y_train)


# Performance on train data. For a classifier, score() returns mean accuracy (not R2).
clf.score(X_train, y_train)


# Make TFIDF features of test data.
# The vectorizer fitted on the train corpus is reused; it is not refitted here.
test_corpus = test_df['tweets']
X_test = tfidf.transform(test_corpus)
Test_labels = test_df['Party']
y_test = [1 if l == 'Conservative' else 0 for l in Test_labels]


# Performance on test data. For a classifier, score() returns mean accuracy (not R2).
clf.score(X_test, y_test) # Highest possible = 1


y_pred = clf.predict(X_test)


# Accuracy on test data (the same metric that clf.score reports above).
accuracy_score(y_test, y_pred)

# Test 9

# Natural Language Processing

# Question

# Features (TFIDF Vectorization)

# Question