Test 9 Solution
Natural Language Processing
Here, we'll predict the political party ('Conservative' or 'Labour') of a Member of Parliament (MP) of the UK Parliament based on their tweets. The data (tweets of ~500 MPs since 23 Aug 2020) were extracted using v2 of the Twitter API.
# Useful in beautifying numpy arrays.
from IPython.display import HTML, display
import tabulate

def pp(a, show_head=True):
    '''
    args: show_head -> if True, print only the first 5 rows.
    return: None
    '''
    if a.ndim < 2:
        a = [a]
    if show_head:
        display(HTML(tabulate.tabulate(a[:5], tablefmt='html')))
        return
    display(HTML(tabulate.tabulate(a, tablefmt='html')))
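A quick, made-up usage example of pp (it simply renders an array as an HTML table; the sample values are illustrative only):
# Illustrative only: pretty-print a small 2-D array with pp.
import numpy as np
pp(np.array([['token', 'count'], ['labour', '3'], ['conservative', '5']]))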
import nltk
import collections
import string
import numpy as np
import sklearn
import pandas as pd
import csv
import re
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import nose.tools as test_
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
# Read the csv file dataset into a dataframe
tweets_df = pd.read_csv("UK_MPs_tweets/MPsTweets_from_24Aug_31Aug_2020.csv")
len(tweets_df) # total tweets
3464
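For orientation, a quick peek at the two columns used in the rest of the notebook, 'tweets' and 'Party' (illustrative only):
# Illustrative only: inspect the columns the later cells rely on.
tweets_df[['tweets', 'Party']].head()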
# You may define any helper functions in this cell or any other cell if needed.
### BEGIN SOLUTION
# First, find hashtags.
def extract_hashtags(tweet_text):
    match_hashtag = re.compile(r'#\w+')
    hashtags_list = match_hashtag.findall(tweet_text)
    hashtags_list_without_hash_symbol = [hashtag[1:] for hashtag in hashtags_list]
    return hashtags_list_without_hash_symbol

# Split hashtags into words, assuming each word starts with a capital letter.
def get_words_from_hashtags(hashtag):
    expanded = [a for a in re.split('([A-Z][a-z]+)', hashtag) if a]
    return expanded  # Simpler than a probabilistic approach (e.g. wordninja) and sufficient here.
### END SOLUTION
stopwords = set(nltk.corpus.stopwords.words('english')) | set(["http", "co", "rt", "amp"])
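A quick, illustrative check of the two helpers above (the example text is made up):
# Illustrative only: exercise the helper functions defined above.
print(extract_hashtags('Back to school soon #BackToSchool #EatOutHelpOut'))  # ['BackToSchool', 'EatOutHelpOut']
print(get_words_from_hashtags('EatOutHelpOut'))                              # ['Eat', 'Out', 'Help', 'Out']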
Question
In the function below, given a text, return preprocessed text such that:
- it is tokenized.
- it only contains word characters (letters, digits, or underscore).
- Trailing `'s` is removed, e.g. `teacher's` becomes `teacher`.
- Apostrophes are removed, e.g. `don't` becomes `dont`.
- Tokens are lemmatized.
- It contains no stopwords if a list of stopwords is provided.
- Words are extracted from hashtags, e.g. `#BackToSchool` becomes `back, to, school` (if the stopwords list is empty; otherwise `to` will also be removed).
- Each token is lowercased.
- All valid `t.co` URLs are removed.
Hints:
- For hashtags, each word starts with a capital letter; assume that this is always true.
- For URLs, pay attention to the length of a valid `t.co` URL.
- If the above preprocessing steps are applied in the right order, solving this problem is simpler.
More hints and pointers with examples are in the test cases below; comments are also included for clarification.
A hint from Twitter:
'You cannot add spaces or punctuation in a hashtag, or it will not work properly.' (Twitter)
Useful Libraries:
# Create a class (instead of a function) so that we don't have to pass
# stopwords in every call.
class PreprocessTweets(object):
    def __init__(self, _stopwords=[]):
        self.stopwords = _stopwords

    def __call__(self, tweet_text):  # invoked each time an instance of this class is called like a function
        ### BEGIN SOLUTION
        hashtags = extract_hashtags(tweet_text)

        # Remove exactly 10 chars after t.co/ ; anything beyond that is meaningful text.
        t_dot_co_url_re = re.compile(r'https://t\.co/\w{10}')
        tweet_text_no_url = t_dot_co_url_re.sub('', tweet_text)

        # Remove 's, e.g. teacher's => teacher
        re_for_removing_s = re.compile("('s)|('S)")  # step 1
        tweet_text_no_s = re_for_removing_s.sub('', tweet_text_no_url)

        # Remove apostrophes, e.g. won't => wont
        re_for_removing_apostrophe = re.compile("'")  # step 2
        tweet_text_no_apostrophe = re_for_removing_apostrophe.sub('', tweet_text_no_s)

        tokenized_text = nltk.word_tokenize(tweet_text_no_apostrophe)

        # Keep only word characters (letters, digits, underscore).
        re_for_removing_non_alphanumeric_chars = re.compile("[a-zA-Z0-9_]+")
        tokens_with_alphanumeric_words = []
        for word in tokenized_text:
            words_with_alpha_numeric_chars = re_for_removing_non_alphanumeric_chars.findall(word)
            tokens_with_alphanumeric_words = tokens_with_alphanumeric_words \
                + words_with_alpha_numeric_chars

        # From the tokenized text, remove hashtags; otherwise duplicates might occur
        # once the hashtag words are added back below.
        tokenized_text = [token for token in tokens_with_alphanumeric_words if token not in hashtags]

        # Hashtags to words.
        hashtag_words_extracted = list(map(lambda hashtag: get_words_from_hashtags(hashtag),
                                           hashtags))
        hashtag_words_in_1D_list = [item for sublist in hashtag_words_extracted
                                    for item in sublist]
        tokenized_text = tokenized_text + hashtag_words_in_1D_list

        # Convert each word to lower case.
        tokenized_text_lowercase = list(map(lambda word: word.lower(), tokenized_text))

        # Lemmatize.
        wnl = WordNetLemmatizer()
        lemmatized_tokens = list(map(lambda word: str(wnl.lemmatize(word)), tokenized_text_lowercase))

        # Stop-word removal.
        tokens_without_stop_words = [word for word in lemmatized_tokens
                                     if word not in self.stopwords]
        return tokens_without_stop_words
        ### END SOLUTION
" >>>>> Testing Without Stopwords <<<<< "
#___________________________________________________
preprocess = PreprocessTweets()
test_.eq_ ((preprocess("teacher's",)), ['teacher'])
test_.eq_ ((preprocess("Let's get them back to classroom",)), ['let', 'get', 'them', 'back', 'to', 'classroom'])
# Underscore is a unicode char. It should be in the output
test_.eq_ ((preprocess("@gone_too_far__ Read the article ",)),
['gone_too_far__', 'read', 'the', 'article'])
test_.eq_ ((preprocess("Good luck👍👍,Alex_Stafford")), ['good', 'luck', 'alex_stafford'])
test_.eq_ (preprocess("$%#hello-^^world!!"), ['world', 'hello'])
#___________________________________________________
# URL
test_.eq_ (preprocess("Register 👉 https://t.co/dCjWFDKoKO"), ['register'])
# There is text embedded after the URL. Extract it.
test_.eq_ (preprocess("https://t.co/MAdn2K1PwH,Alex_Stafford,Conservative"), \
['alex_stafford', 'conservative'])
test_.eq_ (preprocess("Register 👉 https://t.co/3zi5fXSrOkHello-Griffitha,Conservative"), \
['register', 'hello', 'griffitha', 'conservative'])
# URL contains a non-word character (Ķ): not a valid t.co URL, so it isn't removed and gets tokenized instead.
test_.eq_ (preprocess("https://t.co/ZhEyĶaaaa"), ['http', 't', 'co', 'zhey', 'aaaa'])
test_.eq_ ((preprocess("I'm live now with @AngelaRayner of @UKLabour as I")),
['im', 'live', 'now', 'with', 'angelarayner', 'of', 'uklabour', 'a', 'i'])
test_.eq_ (preprocess("Don't…',GwynneMP,Labour"), ['dont', 'gwynnemp', 'labour'])
#___________________________________________________
# Hashtags Split at words. For simplicity,
# assume that next word starts with a capital letter
test_.eq_ (preprocess(' #ShopLocal'), ['shop', 'local'])
test_.eq_ (preprocess(' #EatOutHelpOut'), ['eat', 'out', 'help', 'out'])
test_.eq_ (preprocess('#InternationalDayoftheDisappeared'),
['international', 'dayofthe', 'disappeared'])
'''
'Aside': It's possible to split the above example into
['International', 'Day', 'of', 'the', 'Disappeared'] using probabilistic models.
We won't ask you to do so for this test, primarily for simplicity.
If interested, check this small module out >>> pip install wordninja
'''
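Purely optional and not needed for the test: if the wordninja module mentioned above is installed (pip install wordninja), the probabilistic split looks roughly like this (the output shown is approximate and depends on the module's word list):
# Optional illustration only; requires `pip install wordninja`.
import wordninja
print(wordninja.split('InternationalDayoftheDisappeared'))
# e.g. ['International', 'Day', 'of', 'the', 'Disappeared']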
" >>>>> Testing With Stopwords <<<<< "
#___________________________________________________
preprocess_with_stopwords = PreprocessTweets(stopwords)
test_.eq_ ((preprocess_with_stopwords("LET'S get them back to classroom")),
['let', 'get', 'back', 'classroom'])
test_.eq_ ((preprocess_with_stopwords("I'm live now with @AngelaRayner of @UKLabour as I")),
['im', 'live', 'angelarayner', 'uklabour'])
# https etc are stopwords. Now they'll be removed.
test_.eq_ (preprocess_with_stopwords("https://t.co/ZhEyĶcccc"), ['zhey', 'cccc'])
test_.eq_ ((preprocess_with_stopwords('https://t.co/vVzR52faue",Afzal4Gorton,Labour')),
['afzal4gorton', 'labour'])
#___________________________________________________
test_.eq_ (
preprocess_with_stopwords
( # Don't confuse: backslashes are to break the line (not part of the tweet text)
"✈ It's back to London today for a new Parliamentary\
Session🗓 This Week I'm👨⚕️ In Health Questions🐟 \
In the Fisheries Bill Debate🙋♂️ Question Number \
One in PMQs Talking more Fish in Scottish Affairs \
Committee 🚢 Raising issue of Freedom of Navigation on \
the South China Sea https://t.co/aisLmptsCR"
),
[
'back',
'london',
'today',
'new',
'parliamentary',
'session',
'week',
'im',
'health',
'question',
'fishery',
'bill',
'debate',
'question',
'number',
'one',
'pmqs',
'talking',
'fish',
'scottish',
'affair',
'committee',
'raising',
'issue',
'freedom',
'navigation',
'south',
'china',
'sea'
]
)
#___________________________________________________
### BEGIN HIDDEN TESTS
test_.eq_ (
preprocess(
'On #InternationalDayoftheDisappeared, \
I think of the loved ones and friends of \
the hundreds of thousands of people who have \
“disappeared” in #Syria - imprisoned or murdered \
by the Dictator Assad. Thank you Caesar for the \
photos you smuggled out to ensure the world knew \
the truth https://t.co/Lg5jm8Iwu9.',
),
[
'on',
'i',
'think',
'of',
'the',
'loved',
'one',
'and',
'friend',
'of',
'the',
'hundred',
'of',
'thousand',
'of',
'people',
'who',
'have',
'disappeared',
'in',
'imprisoned',
'or',
'murdered',
'by',
'the',
'dictator',
'assad',
'thank',
'you',
'caesar',
'for',
'the',
'photo',
'you',
'smuggled',
'out',
'to',
'ensure',
'the',
'world',
'knew',
'the',
'truth',
'international',
'dayofthe',
'disappeared',
'syria'
])
### END HIDDEN TESTS
# Extract tweets text (raw features) and labels.
raw_features_tweets = tweets_df['tweets']
labels = tweets_df['Party']
# Preprocess the raw features using the `preprocess` callable defined above (instantiated without stopwords).
preprocessed_features = raw_features_tweets.apply(func=lambda tweet_text: preprocess(tweet_text))
# Put preprocessed features and labels together again.
preprocessed_df = pd.concat([preprocessed_features, labels], axis=1)
preprocessed_df.head()
|   | tweets | Party |
|---|--------|-------|
| 0 | [it, back, to, london, today, for, a, new, par... | Conservative |
| 1 | [rt, morrison_jf, counter, terror, arrest, at,... | Conservative |
| 2 | [do, you, know, of, any, school, with, electri... | Conservative |
| 3 | [rt, aliciakearns, diplomacy, and, peace, are,... | Conservative |
| 4 | [shop, across, the, uk, are, following, govern... | Conservative |
# Split into train/test dataset.
train_df, test_df = train_test_split(preprocessed_df, test_size=0.15,
random_state=42, shuffle=True)
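As a quick sanity check (not part of the test), a 15% split of the 3464 tweets should give 2944 training and 520 test tweets; this matches the row indices (up to 2943) asserted in the TF-IDF tests further below.
# Sanity check of the split sizes (illustrative only).
print(len(train_df), len(test_df))  # expected: 2944 520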
# This is what the train set looks like.
train_df.head()
|      | tweets | Party |
|------|--------|-------|
| 756  | [thank, you, to, krissy, community, champion, ... | Labour |
| 433  | [rt, lichfield_dc, only, a, few, more, day, of... | Conservative |
| 1623 | [thank, you, for, coming, to, visit, alex, a, ... | Labour |
| 371  | [rt, amandamilling, today, we, re, launching, ... | Conservative |
| 2851 | [rt, scrumqueens, thread, there, a, massive, g... | Labour |
def fit_TFIDF_vectorizer(train_data):
    '''
    args: train_data -> pandas.core.series.Series
    return: Fitted (not transformed) TFIDF Vectorizer, i.e.
            sklearn.feature_extraction.text.TfidfVectorizer
    '''
    ### BEGIN SOLUTION
    # The tweets are already tokenized lists, so the vectorizer's own
    # preprocessing and tokenization are replaced with identity functions;
    # token_pattern=None silences the "token_pattern will not be used" warning.
    def do_nothing(doc): return doc

    # Instantiate a TFIDF vectorizer.
    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=do_nothing,
        preprocessor=do_nothing,
        token_pattern=None
    )
    tfidf.fit(train_data)
    return tfidf
    ### END SOLUTION
# Extract Train tweets text
train_corpus = train_df['tweets']
tfidf__ = fit_TFIDF_vectorizer(train_corpus)
X_train__ = tfidf__.transform(train_corpus)
test_.ok_ ((X_train__.nonzero()[:5][0][-15:] == np.asarray(
[
2942, 2942, 2942, 2942, 2942, 2942,
2942, 2942, 2942, 2942, 2943,
2943, 2943, 2943, 2943
]
)).all())
### BEGIN HIDDEN TESTS
test_.ok_ ((X_train__.nonzero()[-100:][0][-100:] == np.asarray(
[
2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939,
2939, 2939, 2939, 2939, 2939, 2940, 2940, 2940, 2940, 2940, 2940,
2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940,
2940, 2940, 2940, 2940, 2940, 2940, 2941, 2941, 2941, 2941, 2941,
2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941,
2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941,
2941, 2941, 2941, 2941, 2941, 2941, 2942, 2942, 2942, 2942, 2942,
2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942,
2942, 2942, 2942, 2942, 2942, 2942, 2942, 2943, 2943, 2943, 2943,
2943
]
)).all())
### END HIDDEN TESTS
tfidf = fit_TFIDF_vectorizer(train_corpus)
# The transform method builds the actual TFIDF matrix.
X_train = tfidf.transform(train_corpus)
# Convert labels to integers [0,1].
Train_labels = train_df['Party']
y_train = [1 if l == 'Conservative' else 0 for l in Train_labels]
y_train[:5] # First 5 labels. 1 for conservative. 0 for labour.
[0, 1, 0, 1, 0]
# Instantiate a SVC.
clf = SVC()
# Train the classifier on train data.
clf.fit(X_train, y_train)
SVC()
# Performance (mean accuracy) on train data.
clf.score(X_train, y_train)
0.9925271739130435
# Make TFIDF features of test data.
test_corpus = test_df['tweets']
X_test = tfidf.transform(test_corpus)
Test_labels = test_df['Party']
y_test = [1 if l == 'Conservative' else 0 for l in Test_labels]
# Performance (mean accuracy) on test data.
clf.score(X_test, y_test) # Highest possible = 1
0.8173076923076923
y_pred = clf.predict(X_test)
# Accuracy on test data.
accuracy_score(y_test, y_pred)
0.8173076923076923
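Not part of the test, but a per-class breakdown makes the remarks below more concrete; a minimal sketch reusing y_test and y_pred from above:
# Optional: per-class view of the test-set predictions (illustrative only).
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))  # rows = true class, columns = predicted class
print(classification_report(y_test, y_pred, target_names=['Labour', 'Conservative']))  # 0 = Labour, 1 = Conservative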
Remarks:
The classifier may, for example, learn that if a tweet says something negative about Boris Johnson (of the 'Conservative' party), then the author of that tweet is most likely not a 'Conservative' MP. It learned all of this from the labelled training examples we provided, and its predictions are correct about ~82% of the time.