Test 9 Solution
Natural Language Processing
Here, we'll predict the political party ('Conservative' or 'Labour') of a Member of Parliament (MP) of the UK Parliament based on their tweets. The data (tweets of ~500 MPs since 23 Aug 2020) were extracted using v2 of the Twitter API.
# Useful in beautifying numpy arrays.
from IPython.display import HTML, display
import tabulate

def pp(a, show_head=True):
    '''
    args: show_head -> if True, print only the first 5 rows.
    return: None
    '''
    if a.ndim < 2:
        a = [a]
    if show_head:
        display(HTML(tabulate.tabulate(a[:5], tablefmt='html')))
        return
    display(HTML(tabulate.tabulate(a, tablefmt='html')))
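A quick, made-up usage example of pp (it simply renders an array as an HTML table; the sample values are illustrative only):
# Illustrative only: pretty-print a small 2-D array with pp.
import numpy as np
pp(np.array([['token', 'count'], ['labour', '3'], ['conservative', '5']]))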
import nltk
import collections
import string
import numpy as np
import sklearn
import pandas as pd
import csv
import re
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import nose.tools as test_
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hamzaliaqet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
# Read the csv file dataset into a dataframe
tweets_df = pd.read_csv("UK_MPs_tweets/MPsTweets_from_24Aug_31Aug_2020.csv")
len(tweets_df) # total tweets
3464
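For orientation, a quick peek at the two columns used in the rest of the notebook, 'tweets' and 'Party' (illustrative only):
# Illustrative only: inspect the columns the later cells rely on.
tweets_df[['tweets', 'Party']].head()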
# You may define any helper functions in this cell or any other cell if needed.
### BEGIN SOLUTION
# First, find hashtags.
def extract_hashtags(tweet_text):
    match_hashtag = re.compile(r'#\w+')
    hashtags_list = match_hashtag.findall(tweet_text)
    hashtags_list_without_hash_symbol = [hashtag[1:] for hashtag in hashtags_list]
    return hashtags_list_without_hash_symbol

# Split hashtags into words, assuming each word starts with a capital letter.
def get_words_from_hashtags(hashtag):
    expanded = [a for a in re.split('([A-Z][a-z]+)', hashtag) if a]
    return expanded  # Simpler than a probabilistic approach (e.g. wordninja) and sufficient here.
### END SOLUTION
stopwords = set(nltk.corpus.stopwords.words('english')) | set(["http", "co", "rt", "amp"])
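A quick, illustrative check of the two helpers above (the example text is made up):
# Illustrative only: exercise the helper functions defined above.
print(extract_hashtags('Back to school soon #BackToSchool #EatOutHelpOut'))  # ['BackToSchool', 'EatOutHelpOut']
print(get_words_from_hashtags('EatOutHelpOut'))                              # ['Eat', 'Out', 'Help', 'Out']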
Question
In the function below, given a text, return preprocessed text such that:
- it is tokenized.
- it only contains word characters (letters, digits, or underscore).
- Trailing `'s` is removed, e.g. `teacher's` becomes `teacher`.
- Apostrophes are removed, e.g. `don't` becomes `dont`.
- Tokens are lemmatized.
- It contains no stopwords if a list of stopwords is provided.
- Words are extracted from hashtags, e.g. `#BackToSchool` becomes `back, to, school` (if the stopwords list is empty; otherwise `to` will also be removed).
- Each token is lowercased.
- All valid `t.co` URLs are removed.
Hints:
- For hashtags, each word starts with a capital letter; assume that this is always true.
- For URLs, pay attention to the length of a valid `t.co` URL.
- If the above preprocessing steps are applied in the right order, solving this problem is simpler.
More hints and pointers with examples are in the test cases below; comments are also included for clarification.
A hint from Twitter:
'You cannot add spaces or punctuation in a hashtag, or it will not work properly.' (Twitter)
Useful Libraries:
# Create a class (instead of a function) so that we don't have to pass
# stopwords in every call.
class PreprocessTweets(object):
    def __init__(self, _stopwords=[]):
        self.stopwords = _stopwords

    def __call__(self, tweet_text):  # invoked each time an instance of this class is called like a function
        ### BEGIN SOLUTION
        hashtags = extract_hashtags(tweet_text)

        # Remove exactly 10 chars after t.co/ ; anything beyond that is meaningful text.
        t_dot_co_url_re = re.compile(r'https://t\.co/\w{10}')
        tweet_text_no_url = t_dot_co_url_re.sub('', tweet_text)

        # Remove 's, e.g. teacher's => teacher
        re_for_removing_s = re.compile("('s)|('S)")  # step 1
        tweet_text_no_s = re_for_removing_s.sub('', tweet_text_no_url)

        # Remove apostrophes, e.g. won't => wont
        re_for_removing_apostrophe = re.compile("'")  # step 2
        tweet_text_no_apostrophe = re_for_removing_apostrophe.sub('', tweet_text_no_s)

        tokenized_text = nltk.word_tokenize(tweet_text_no_apostrophe)

        # Keep only word characters (letters, digits, underscore).
        re_for_removing_non_alphanumeric_chars = re.compile("[a-zA-Z0-9_]+")
        tokens_with_alphanumeric_words = []
        for word in tokenized_text:
            words_with_alpha_numeric_chars = re_for_removing_non_alphanumeric_chars.findall(word)
            tokens_with_alphanumeric_words = tokens_with_alphanumeric_words \
                + words_with_alpha_numeric_chars

        # From the tokenized text, remove hashtags; otherwise duplicates might occur
        # once the hashtag words are added back below.
        tokenized_text = [token for token in tokens_with_alphanumeric_words if token not in hashtags]

        # Hashtags to words.
        hashtag_words_extracted = list(map(lambda hashtag: get_words_from_hashtags(hashtag),
                                           hashtags))
        hashtag_words_in_1D_list = [item for sublist in hashtag_words_extracted
                                    for item in sublist]
        tokenized_text = tokenized_text + hashtag_words_in_1D_list

        # Convert each word to lower case.
        tokenized_text_lowercase = list(map(lambda word: word.lower(), tokenized_text))

        # Lemmatize.
        wnl = WordNetLemmatizer()
        lemmatized_tokens = list(map(lambda word: str(wnl.lemmatize(word)), tokenized_text_lowercase))

        # Stop-word removal.
        tokens_without_stop_words = [word for word in lemmatized_tokens
                                     if word not in self.stopwords]
        return tokens_without_stop_words
        ### END SOLUTION
" >>>>> Testing Without Stopwords <<<<< "
#___________________________________________________
preprocess = PreprocessTweets()
test_.eq_ ((preprocess("teacher's",)), ['teacher'])
test_.eq_ ((preprocess("Let's get them back to classroom",)), ['let', 'get', 'them', 'back', 'to', 'classroom'])
# Underscore is a unicode char. It should be in the output
test_.eq_ ((preprocess("@gone_too_far__ Read the article ",)),
['gone_too_far__', 'read', 'the', 'article'])
test_.eq_ ((preprocess("Good luck👍👍,Alex_Stafford")), ['good', 'luck', 'alex_stafford'])
test_.eq_ (preprocess("$%#hello-^^world!!"), ['world', 'hello'])
#___________________________________________________
# URL
test_.eq_ (preprocess("Register 👉 https://t.co/dCjWFDKoKO"), ['register'])
# There is text embedded after the URL. Extract it.
test_.eq_ (preprocess("https://t.co/MAdn2K1PwH,Alex_Stafford,Conservative"), \
['alex_stafford', 'conservative'])
test_.eq_ (preprocess("Register 👉 https://t.co/3zi5fXSrOkHello-Griffitha,Conservative"), \
['register', 'hello', 'griffitha', 'conservative'])
# URL contains a non-word character (Ķ): not a valid t.co URL, so it isn't removed and gets tokenized instead.
test_.eq_ (preprocess("https://t.co/ZhEyĶaaaa"), ['http', 't', 'co', 'zhey', 'aaaa'])
test_.eq_ ((preprocess("I'm live now with @AngelaRayner of @UKLabour as I")),
['im', 'live', 'now', 'with', 'angelarayner', 'of', 'uklabour', 'a', 'i'])
test_.eq_ (preprocess("Don't…',GwynneMP,Labour"), ['dont', 'gwynnemp', 'labour'])
#___________________________________________________
# Hashtags Split at words. For simplicity,
# assume that next word starts with a capital letter
test_.eq_ (preprocess(' #ShopLocal'), ['shop', 'local'])
test_.eq_ (preprocess(' #EatOutHelpOut'), ['eat', 'out', 'help', 'out'])
test_.eq_ (preprocess('#InternationalDayoftheDisappeared'),
['international', 'dayofthe', 'disappeared'])
'''
'Aside': It's possible to split the above example into
['International', 'Day', 'of', 'the', 'Disappeared'] using probabilistic models.
We won't ask you to do so for this test, primarily for simplicity.
If interested, check this small module out >>> pip install wordninja
'''
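Purely optional and not needed for the test: if the wordninja module mentioned above is installed (pip install wordninja), the probabilistic split looks roughly like this (the output shown is approximate and depends on the module's word list):
# Optional illustration only; requires `pip install wordninja`.
import wordninja
print(wordninja.split('InternationalDayoftheDisappeared'))
# e.g. ['International', 'Day', 'of', 'the', 'Disappeared']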
" >>>>> Testing With Stopwords <<<<< "
#___________________________________________________
preprocess_with_stopwords = PreprocessTweets(stopwords)
test_.eq_ ((preprocess_with_stopwords("LET'S get them back to classroom")),
['let', 'get', 'back', 'classroom'])
test_.eq_ ((preprocess_with_stopwords("I'm live now with @AngelaRayner of @UKLabour as I")),
['im', 'live', 'angelarayner', 'uklabour'])
# https etc are stopwords. Now they'll be removed.
test_.eq_ (preprocess_with_stopwords("https://t.co/ZhEyĶcccc"), ['zhey', 'cccc'])
test_.eq_ ((preprocess_with_stopwords('https://t.co/vVzR52faue",Afzal4Gorton,Labour')),
['afzal4gorton', 'labour'])
#___________________________________________________
test_.eq_ (
preprocess_with_stopwords
( # Don't confuse: backslashes are to break the line (not part of the tweet text)
"✈ It's back to London today for a new Parliamentary\
Session🗓 This Week I'm👨⚕️ In Health Questions🐟 \
In the Fisheries Bill Debate🙋♂️ Question Number \
One in PMQs Talking more Fish in Scottish Affairs \
Committee 🚢 Raising issue of Freedom of Navigation on \
the South China Sea https://t.co/aisLmptsCR"
),
[
'back',
'london',
'today',
'new',
'parliamentary',
'session',
'week',
'im',
'health',
'question',
'fishery',
'bill',
'debate',
'question',
'number',
'one',
'pmqs',
'talking',
'fish',
'scottish',
'affair',
'committee',
'raising',
'issue',
'freedom',
'navigation',
'south',
'china',
'sea'
]
)
#___________________________________________________
### BEGIN HIDDEN TESTS
test_.eq_ (
preprocess(
'On #InternationalDayoftheDisappeared, \
I think of the loved ones and friends of \
the hundreds of thousands of people who have \
“disappeared” in #Syria - imprisoned or murdered \
by the Dictator Assad. Thank you Caesar for the \
photos you smuggled out to ensure the world knew \
the truth https://t.co/Lg5jm8Iwu9.',
),
[
'on',
'i',
'think',
'of',
'the',
'loved',
'one',
'and',
'friend',
'of',
'the',
'hundred',
'of',
'thousand',
'of',
'people',
'who',
'have',
'disappeared',
'in',
'imprisoned',
'or',
'murdered',
'by',
'the',
'dictator',
'assad',
'thank',
'you',
'caesar',
'for',
'the',
'photo',
'you',
'smuggled',
'out',
'to',
'ensure',
'the',
'world',
'knew',
'the',
'truth',
'international',
'dayofthe',
'disappeared',
'syria'
])
### END HIDDEN TESTS
# Extract tweets text (raw features) and labels.
raw_features_tweets = tweets_df['tweets']
labels = tweets_df['Party']
# Preprocess the raw features using the `preprocess` callable defined above (instantiated without stopwords).
preprocessed_features = raw_features_tweets.apply(func=lambda tweet_text: preprocess(tweet_text))
# Put preprocessed features and labels together again.
preprocessed_df = pd.concat([preprocessed_features, labels], axis=1)
preprocessed_df.head()
|   | tweets | Party |
|---|--------|-------|
| 0 | [it, back, to, london, today, for, a, new, par... | Conservative |
| 1 | [rt, morrison_jf, counter, terror, arrest, at,... | Conservative |
| 2 | [do, you, know, of, any, school, with, electri... | Conservative |
| 3 | [rt, aliciakearns, diplomacy, and, peace, are,... | Conservative |
| 4 | [shop, across, the, uk, are, following, govern... | Conservative |
# Split into train/test dataset.
train_df, test_df = train_test_split(preprocessed_df, test_size=0.15,
random_state=42, shuffle=True)
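As a quick sanity check (not part of the test), a 15% split of the 3464 tweets should give 2944 training and 520 test tweets; this matches the row indices (up to 2943) asserted in the TF-IDF tests further below.
# Sanity check of the split sizes (illustrative only).
print(len(train_df), len(test_df))  # expected: 2944 520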
# This is what the train set looks like.
train_df.head()
|      | tweets | Party |
|------|--------|-------|
| 756  | [thank, you, to, krissy, community, champion, ... | Labour |
| 433  | [rt, lichfield_dc, only, a, few, more, day, of... | Conservative |
| 1623 | [thank, you, for, coming, to, visit, alex, a, ... | Labour |
| 371  | [rt, amandamilling, today, we, re, launching, ... | Conservative |
| 2851 | [rt, scrumqueens, thread, there, a, massive, g... | Labour |
def fit_TFIDF_vectorizer(train_data):
    '''
    args: train_data -> pandas.core.series.Series
    return: Fitted (not transformed) TFIDF Vectorizer, i.e.
            sklearn.feature_extraction.text.TfidfVectorizer
    '''
    ### BEGIN SOLUTION
    # The tweets are already tokenized lists, so the vectorizer's own
    # preprocessing and tokenization are replaced with identity functions;
    # token_pattern=None silences the "token_pattern will not be used" warning.
    def do_nothing(doc): return doc

    # Instantiate a TFIDF vectorizer.
    tfidf = TfidfVectorizer(
        analyzer='word',
        tokenizer=do_nothing,
        preprocessor=do_nothing,
        token_pattern=None
    )
    tfidf.fit(train_data)
    return tfidf
    ### END SOLUTION
# Extract Train tweets text
train_corpus = train_df['tweets']
tfidf__ = fit_TFIDF_vectorizer(train_corpus)
X_train__ = tfidf__.transform(train_corpus)
test_.ok_ ((X_train__.nonzero()[:5][0][-15:] == np.asarray(
[
2942, 2942, 2942, 2942, 2942, 2942,
2942, 2942, 2942, 2942, 2943,
2943, 2943, 2943, 2943
]
)).all())
### BEGIN HIDDEN TESTS
test_.ok_ ((X_train__.nonzero()[-100:][0][-100:] == np.asarray(
[
2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939, 2939,
2939, 2939, 2939, 2939, 2939, 2940, 2940, 2940, 2940, 2940, 2940,
2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940, 2940,
2940, 2940, 2940, 2940, 2940, 2940, 2941, 2941, 2941, 2941, 2941,
2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941,
2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941, 2941,
2941, 2941, 2941, 2941, 2941, 2941, 2942, 2942, 2942, 2942, 2942,
2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942, 2942,
2942, 2942, 2942, 2942, 2942, 2942, 2942, 2943, 2943, 2943, 2943,
2943
]
)).all())
### END HIDDEN TESTS
tfidf = fit_TFIDF_vectorizer(train_corpus)
# The transform method builds the actual TFIDF matrix.
X_train = tfidf.transform(train_corpus)
# Convert labels to integers [0,1].
Train_labels = train_df['Party']
y_train = [1 if l == 'Conservative' else 0 for l in Train_labels]
y_train[:5] # First 5 labels. 1 for conservative. 0 for labour.
[0, 1, 0, 1, 0]
# Instantiate a SVC.
clf = SVC()
# Train the classifier on train data.
clf.fit(X_train, y_train)
SVC()
# Performance (mean accuracy) on train data.
clf.score(X_train, y_train)
0.9925271739130435
# Make TFIDF features of test data.
test_corpus = test_df['tweets']
X_test = tfidf.transform(test_corpus)
Test_labels = test_df['Party']
y_test = [1 if l == 'Conservative' else 0 for l in Test_labels]
# Performance (mean accuracy) on test data.
clf.score(X_test, y_test) # Highest possible = 1
0.8173076923076923
y_pred = clf.predict(X_test)
# Accuracy on test data.
accuracy_score(y_test, y_pred)
0.8173076923076923
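Not part of the test, but a per-class breakdown makes the remarks below more concrete; a minimal sketch reusing y_test and y_pred from above:
# Optional: per-class view of the test-set predictions (illustrative only).
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))  # rows = true class, columns = predicted class
print(classification_report(y_test, y_pred, target_names=['Labour', 'Conservative']))  # 0 = Labour, 1 = Conservative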
Remarks:
The classifier may, for example, learn that if a tweet says something negative about Boris Johnson (of the 'Conservative' party), then the author of that tweet is most likely not a 'Conservative' MP. It learned all of this from the labelled training examples we provided, and its predictions are correct about ~82% of the time.