Natural Language Processing (NLP)¶
Tutorial 1 of 2
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
# useful for printing numpy arrays.
from IPython.display import HTML, display
import tabulate
def pp(a):
    if a.ndim < 2:
        a = [a]
    display(HTML(tabulate.tabulate(a, tablefmt='html')))
Here, we'll deal with free text (natural language), which is a widely available form of data.
NLP seeks to understand the structure behind free text, e.g. to perform tasks like parsing sentences grammatically, describing the entities and properties that the text refers to, etc.
Most basic tasks involving free text can be carried out using relatively simple methods that we can easily understand without any reference to complex machine learning: methods like bag-of-words models, TFIDF vectors, and (simple n-gram) language models. Deep learning methods may well render these methods obsolete in the future, but we are not quite there yet.
A truly intelligent agent is not really intelligent if it cannot understand natural language; yet, unfortunately, we're not there yet. State-of-the-art systems still fall well short of genuine language understanding, and you can easily come up with examples that trip them up.
All of the ML algorithms we have seen so far deal with numerical values. Natural language, on the other hand, is unstructured text, so we first need to turn it into numerical features.
A Note on NLP Terminology
"Words" or "terms" refer to individual tokens separated by whitespace; punctuation marks also count as tokens.
"Document" refers to an individual piece of free text, i.e. a document consists of words.
"Corpus" is a collection of documents.
"Vocabulary" is the set of unique words across all documents (i.e. in the corpus).
Feature Extraction¶
We'll transform textual features into numerical features usable for machine learning algorithms.
documents = ["machine learning is such an amazing field",
             "i enjoy writing machine learning programs",
             "i think of tests as learning instruments yet they also provide the course staff"
             " a guage of my understanding"]
Bag of Words (BOW)¶
As the name suggests, BOW is simply the set of words that make up a document, irrespective of their relative positions.
Despite the obvious information we throw away with this representation, it tends to work surprisingly well in practice. The general "gist" of many documents can be obtained by looking only at the presence/absence of words in the text.
- We'll tokenize the text (giving an integer id to each possible token, using space/comma as separators).
- We'll count the occurrences of tokens (words) in each document: TF (explained below).
- We'll normalize and weight, with diminishing importance, tokens that occur in the majority of samples/documents: TFIDF (explained below).
This specific strategy (tokenization, counting and normalization) is called the Bag of Words.
More in sklearn docs.
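Before handing this over to sklearn, here is a minimal hand-rolled sketch of a bag-of-words count using collections.Counter on the documents defined above:
from collections import Counter

# Each document becomes an unordered bag (multiset) of its tokens; word order is discarded.
bags = [Counter(doc.split()) for doc in documents]
print(bags[0])  # Counter({'machine': 1, 'learning': 1, 'is': 1, 'such': 1, 'an': 1, 'amazing': 1, 'field': 1})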
N-grams¶
Although the bag-of-words representation does work remarkably well in many cases, it is of course still an incredibly simple representation of documents and will miss basic properties. For example, the documents
- Go, do not stop.
- Stop, do not go.
have the exact same bag of words representation (as they contain the same words but different ordering) yet have the exact opposite semantics (meaning).
Resolution of the above drawback: the n-gram model.
A probabilistic language model attempts to predict the probability of each word in the document given all the words that preceded it. Our objective here is to build a rudimentary language model that captures some basic context from the preceding words when predicting the next one. The idea of the n-gram model is to put a cap on the size of the context used to predict the next word, so that under an n-gram model we have $$ P(word_i \mid word_1, \ldots, word_{i-1}) \approx P(word_i \mid word_{i-n+1}, \ldots, word_{i-1}) $$
For example, for a 3-gram model (n=3), we'll look at the previous two words.
N-gram models are typically built upon some fixed corpus, and when we refer to probabilities in an n-gram model, we are just referring to counts in this corpus. Specifically, we count the number of times we have seen the entire sequence of n words and divide by the number of times we have seen the preceding n-1 words. For example, for a 4-gram model, we simply count in our corpus: $$ \frac{\#(walking \ on \ the \ road)}{\#(walking \ on \ the)} $$
This simple n-gram model has amazing applications, such as automatically generating text.
This can easily be implemented in a few lines of code.
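As an illustration, here is a minimal sketch of a bigram (n=2) text generator trained on our tiny documents corpus; the corpus size, lack of smoothing, and sampling scheme are all simplifications:
import random
from collections import defaultdict

# Build bigram counts: for each word, count which words follow it in the corpus.
following = defaultdict(lambda: defaultdict(int))
for doc in documents:
    words = doc.split()
    for prev, nxt in zip(words, words[1:]):
        following[prev][nxt] += 1

def generate(start, length=8):
    # Weighted random walk through the bigram counts.
    out = [start]
    for _ in range(length):
        candidates = following[out[-1]]
        if not candidates:          # dead end: no observed continuation
            break
        words, counts = zip(*candidates.items())
        out.append(random.choices(words, weights=counts)[0])
    return " ".join(out)

print(generate("machine"))  # e.g. "machine learning is such an amazing field"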
Sklearn on ngrams
n-gram extraction using sklearn¶
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2)) # (2, 2) means extract bigrams only
bigrams = ngram_vectorizer.fit_transform(documents)
ngram_vectorizer.get_feature_names()
['also provide', 'amazing field', 'an amazing', 'as learning', 'course staff', 'enjoy writing', 'guage of', 'instruments yet', 'is such', 'learning instruments', 'learning is', 'learning programs', 'machine learning', 'my understanding', 'of my', 'of tests', 'provide the', 'staff guage', 'such an', 'tests as', 'the course', 'they also', 'think of', 'writing machine', 'yet they']
bigrams # shown here for shape comparison (next)
<3x25 sparse matrix of type '<class 'numpy.int64'>' with 26 stored elements in Compressed Sparse Row format>
Read docs for more info on n-grams.
Exercise¶
Try 3-gram/4-gram feature extraction on our corpus.
Features Transformation¶
Hashing¶
Vectorizers (such as CountVectorizer and the TF/TFIDF vectorizers explained later) consume a lot of memory. For example, the feature names were kept in memory:
ngram_vectorizer.get_feature_names() # in countvectorizer above
Think about a corpus containing thousands/millions of documents. Vectorizers keep an in-memory mapping from the string tokens to the integer feature indices (the vocabulary_ attribute), which causes several problems when dealing with large datasets.
To mitigate this problem, the hashing trick is used: instead of building a hash table of the features encountered in training, as the vectorizers do, instances of FeatureHasher apply a hash function to the features to determine their column index in sample matrices directly. More in docs.
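The core idea can be sketched in a few lines (a simplified illustration; sklearn uses a dedicated hash function and a signed variant to reduce the effect of collisions, details we skip here):
n_features = 10  # chosen up front; no vocabulary is ever stored

def hash_vectorize(doc, n_features=n_features):
    # Map each token to a column via a hash; no token -> index dictionary needed.
    vec = np.zeros(n_features)
    for token in doc.split():
        vec[hash(token) % n_features] += 1  # built-in hash() as a stand-in hash function
    return vec

hashed = np.vstack([hash_vectorize(doc) for doc in documents])
print(hashed.shape)  # (3, 10): the number of columns is fixed regardless of vocabulary size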
Sklearn HashingVectorizer¶
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=10)
hv.transform(documents)
<3x10 sparse matrix of type '<class 'numpy.float64'>' with 19 stored elements in Compressed Sparse Row format>
Compare the number of columns (10) in sparse matrix returned by HashingVectorizer with those returned by CountVectorizer (25).
Notice that HashingVectorizer is stateless: there is nothing to fit, so transform can be called directly.
TF-IDF¶
A Matrix
Before discussing TFIDF, let's talk about TF and IDF; TFIDF is simply their product.
TF (Term Frequency)¶
We can represent the documents using a term frequency matrix, an $m \times n$ matrix where $m$ denotes the number of documents and $n$ denotes the vocabulary size. Specifically, the entry in row $i$ and column $j$ is the number of times word $j$ occurs in document $i$, where 'word $j$' is the $j$-th word in the vocabulary.
documents
['machine learning is such an amazing field', 'i enjoy writing machine learning programs', 'i think of tests as learning instruments yet they also provide the course staff a guage of my understanding']
document_words = [doc.split() for doc in documents]
vocab = sorted(set(sum(document_words, [])))
print(vocab, "\n") # list containing unique words in corpus.
['a', 'also', 'amazing', 'an', 'as', 'course', 'enjoy', 'field', 'guage', 'i', 'instruments', 'is', 'learning', 'machine', 'my', 'of', 'programs', 'provide', 'staff', 'such', 'tests', 'the', 'they', 'think', 'understanding', 'writing', 'yet']
vocab_dict = {k:i for i,k in enumerate(vocab)} # word-index mapping. Useful to have.
print(vocab_dict, "\n")
{'a': 0, 'also': 1, 'amazing': 2, 'an': 3, 'as': 4, 'course': 5, 'enjoy': 6, 'field': 7, 'guage': 8, 'i': 9, 'instruments': 10, 'is': 11, 'learning': 12, 'machine': 13, 'my': 14, 'of': 15, 'programs': 16, 'provide': 17, 'staff': 18, 'such': 19, 'tests': 20, 'the': 21, 'they': 22, 'think': 23, 'understanding': 24, 'writing': 25, 'yet': 26}
TF = np.zeros((len(documents), len(vocab)), dtype=int)
for i, doc in enumerate(document_words):
for word in doc:
TF[i, vocab_dict[word]] += 1
print('shape of TF matrix', TF.shape)
print('TF Matrix:')
pp(TF)
shape of TF matrix (3, 27)
TF Matrix:
0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 2 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
There are 3 documents and therefore the TF matrix has 3 rows. Similarly, there are 27 words in the vocabulary and hence 27 columns.
TF in sklearn¶
Let's compute TF using count vectorizer of sklearn.
vectorizer = CountVectorizer()
TF_sklearn = vectorizer.fit_transform(documents,)
print(TF_sklearn.shape) # (3,25) but it should be (3,27).
pp(TF_sklearn.toarray())
(3, 25)
0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 2 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
Are they the same TF matrices? It appears sklearn skipped two terms.
TF_sklearn.shape == TF.shape
False
print((vectorizer.get_feature_names()))
['also', 'amazing', 'an', 'as', 'course', 'enjoy', 'field', 'guage', 'instruments', 'is', 'learning', 'machine', 'my', 'of', 'programs', 'provide', 'staff', 'such', 'tests', 'the', 'they', 'think', 'understanding', 'writing', 'yet']
Exercise¶
Write code to find out which two terms are missing in sklearn TF yet present in our implementation.
Answer¶
It turns out that by default sklearn only includes words with at least 2 characters. We can change this default behavior by overriding the constructor argument token_pattern, which accepts a regex. Regular expressions are really powerful, but we won't discuss them in depth here. You're encouraged to read the docs.
vectorizer_inclcudes_single_char = CountVectorizer(token_pattern=u"(?u)\\b\\w+\\b")
TF_sklearn_inclcudes_single_char = vectorizer_inclcudes_single_char.fit_transform(documents,)
print(TF_sklearn_inclcudes_single_char.shape)
pp(TF_sklearn_inclcudes_single_char.toarray())
(3, 27)
0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 2 | 0 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 0 | 1 |
Now the shape (3, 27) matches the one from our implementation.
TF_sklearn_inclcudes_single_char.shape == TF.shape
True
Drawback of TF¶
The issue with the TF representation of documents is that common words (such as is, a, to) that occur frequently in most documents will dominate the corresponding document vector. These common words do not convey much information about the document. Consider the task of finding how similar two documents are: the similarity score will be influenced heavily by these common words, e.g. two highly dissimilar documents with a lot of 'a's and 'to's will be scored as similar. We somehow want to diminish the entries for these common words in a document vector.
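To see this concretely, here is a small sketch (a made-up pair of sentences, raw term counts, cosine similarity) showing how shared common words alone can make two unrelated sentences look similar:
# Two unrelated sentences that share only common words ('a', 'is', 'the', 'to').
pair = ["a trip to the beach is a joy to remember",
        "a bug in the parser is a pain to debug"]

cv = CountVectorizer(token_pattern=u"(?u)\\b\\w+\\b")  # keep 1-char words like 'a'
X = cv.fit_transform(pair).toarray().astype(float)

# Cosine similarity of the raw term-frequency vectors.
cos = X[0] @ X[1] / (np.linalg.norm(X[0]) * np.linalg.norm(X[1]))
print(round(cos, 2))  # fairly high, even though the sentences are about different things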
IDF (Inverse Document Frequency)¶
A Vector
The above drawback can be resolved via the inverse document frequency weight for words in vocabulary. IDF for $word_j$ is commonly defined as: $$ IDF_j = \log \frac{\# documents}{\# documents \ with \ word_j}$$
Sklearn's definition of IDF is a little different.
count_docs_with_word_j = TF.astype(bool).sum(axis=0)
total_docs = TF.shape[0]
IDF = np.log(total_docs/count_docs_with_word_j)
pp(IDF)
1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 0.405465 | 1.09861 | 1.09861 | 0 | 0.405465 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 |
Exercise¶
Notice the 0 in IDF; it means that word occurs in all documents ($\log(1) = 0$). Which word is it? (Warning: the solution is provided in the next cell.)
idx_arr = np.where(IDF == 0) # (array([12]),)
idx = int(idx_arr[0][0]) # extract int 12
vocab[idx] # 'learning' which indeed occurs in all three docs (verify).
'learning'
TFIDF (Term Frequency Inverse Document Frequency)¶
A matrix
TFIDF scales the columns of the term frequency matrix by their inverse document frequency. In doing so, we still have an effective bag-of-words representation of each document, but we discount words that occur very frequently and increase the weight of less frequent terms.
TFIDF = TF * IDF
print(TFIDF.shape) # (3,27)
pp(TFIDF)
(3, 27)
0 | 0 | 1.09861 | 1.09861 | 0 | 0 | 0 | 1.09861 | 0 | 0 | 0 | 1.09861 | 0 | 0.405465 | 0 | 0 | 0 | 0 | 0 | 1.09861 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0 | 1.09861 | 0 | 0 | 0.405465 | 0 | 0 | 0 | 0.405465 | 0 | 0 | 1.09861 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.09861 | 0 |
1.09861 | 1.09861 | 0 | 0 | 1.09861 | 1.09861 | 0 | 0 | 1.09861 | 0.405465 | 1.09861 | 0 | 0 | 0 | 1.09861 | 2.19722 | 0 | 1.09861 | 1.09861 | 0 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 1.09861 | 0 | 1.09861 |
TFIDF in Sklearn¶
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_sklearn = tfidf_vectorizer.fit_transform(documents)
tfidf_sklearn
<3x25 sparse matrix of type '<class 'numpy.float64'>' with 28 stored elements in Compressed Sparse Row format>
A Couple of Remarks:
Notice that by default the TFIDF is returned as a sparse matrix (as was TF_sklearn), which is a very good design decision and works really well in practice. Why? A TFIDF matrix will mostly contain 0's, since each document contains only a small fraction of the vocabulary (which for real-world data is a very large list); see the quick check after these remarks.
Also, notice the shape mismatch again, as with TF. Again two words are missing, which is expected since under the hood TfidfVectorizer calls CountVectorizer. According to the documentation, TfidfVectorizer is "equivalent to CountVectorizer followed by TfidfTransformer".
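As a quick check on that sparsity claim (using only the sparse matrix's nnz and shape attributes):
n_rows, n_cols = tfidf_sklearn.shape
sparsity = 1 - tfidf_sklearn.nnz / (n_rows * n_cols)   # fraction of entries that are zero
print(f"{sparsity:.1%} of the entries are zero")       # 28 non-zeros out of 3*25 = 75 entries here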
pp(tfidf_sklearn.toarray()) # readable numpy array
0 | 0.410747 | 0.410747 | 0 | 0 | 0 | 0.410747 | 0 | 0 | 0.410747 | 0.242594 | 0.312384 | 0 | 0 | 0 | 0 | 0 | 0.410747 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
0 | 0 | 0 | 0 | 0 | 0.504611 | 0 | 0 | 0 | 0 | 0.298032 | 0.38377 | 0 | 0 | 0.504611 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.504611 | 0 |
0.233451 | 0 | 0 | 0.233451 | 0.233451 | 0 | 0 | 0.233451 | 0.233451 | 0 | 0.13788 | 0 | 0.233451 | 0.466902 | 0 | 0.233451 | 0.233451 | 0 | 0.233451 | 0.233451 | 0.233451 | 0.233451 | 0.233451 | 0 | 0.233451 |
Notice that our TFIDF is noticeably different from the sklearn implementation. It appears that sklearn's TFIDF returns normalized rows (which is indeed the case). There are other differences, such as the smoothing 1's sklearn adds when defining IDF. See the docs for in-depth information on the sklearn implementation. We'll end our discussion of TFIDF here.
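To make the connection explicit, here is a sketch that reproduces sklearn's numbers from our count matrix, assuming the default settings (smooth_idf=True, norm='l2'); the smoothed formula idf = ln((1 + n) / (1 + df)) + 1 comes from the sklearn docs:
counts = TF_sklearn.toarray().astype(float)       # the (3, 25) count matrix from CountVectorizer
n_docs = counts.shape[0]
df = (counts > 0).sum(axis=0)                     # document frequency of each term

idf_smooth = np.log((1 + n_docs) / (1 + df)) + 1  # sklearn's smoothed IDF
tfidf_manual = counts * idf_smooth

# L2-normalize each row, as sklearn does with norm='l2' (the default).
tfidf_manual /= np.linalg.norm(tfidf_manual, axis=1, keepdims=True)

print(np.allclose(tfidf_manual, tfidf_sklearn.toarray()))  # True if the assumptions above hold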
Text classification Using TFIDF Features¶
Support Vector Classifier
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
The dataset is large and it'll take some time to train. To train quickly you can use fewer samples (not recommended for real-world projects; to state the obvious, more training data will decrease generalization error).
n_samples = 1000
# tf_news = vectorizer.fit_transform(newsgroups_train.data)
raw_text = newsgroups_train.data[:n_samples]
tfidf_feats = tfidf_vectorizer.fit_transform(raw_text)
labels = newsgroups_train.target[:n_samples]
from sklearn.svm import SVC
svc_clf = SVC()
tfidf_feats
<1000x32190 sparse matrix of type '<class 'numpy.float64'>' with 157775 stored elements in Compressed Sparse Row format>
train_size = int(n_samples * .8) # 80% data for training
x_train = tfidf_feats[:train_size]
y_train = labels[:train_size]
svc_clf.fit(x_train, y_train)
SVC()
x_test = tfidf_feats[train_size:]
y_test = labels[train_size:]
svc_clf.score(x_test, y_test)
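A more robust version of this pipeline (a sketch using sklearn's train_test_split and make_pipeline utilities) splits the raw text before vectorizing, so the TFIDF vocabulary and IDF weights are fit on the training documents only:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Split the raw text, not the already-vectorized features.
text_train, text_test, y_tr, y_te = train_test_split(
    raw_text, labels, test_size=0.2, random_state=0)

# The pipeline fits the TFIDF vocabulary/IDF on the training text only.
clf = make_pipeline(TfidfVectorizer(), SVC())
clf.fit(text_train, y_tr)
print(clf.score(text_test, y_te))  # held-out accuracy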