## Classification with Sklearn

## Loading the IMDb dataset

In [None]:
import urllib.request
import os

def download_file(url,local_file, force=False):
    """
    Helper function to download a file and store it locally.

    The payload is written to a temporary ``<local_file>.part`` file and only
    moved into place once the transfer has finished. An interrupted download
    therefore never leaves a truncated ``local_file`` behind that a later
    call would mistake for a completed one.

    :param url: address to fetch; anything ``urllib.request.urlopen`` accepts
    :param local_file: path under which the downloaded bytes are stored
    :param force: when true, download even if ``local_file`` already exists
    :raises urllib.error.URLError: if the URL cannot be opened
    :raises OSError: if the local file cannot be written
    """
    if os.path.exists(local_file) and not force:
        print('Already downloaded')
        return

    print('Downloading')
    target = os.fspath(local_file)
    # Keep str and bytes paths working alike.
    partial_file = target + ('.part' if isinstance(target, str) else b'.part')
    try:
        with urllib.request.urlopen(url) as opener, \
                open(partial_file, mode='wb') as outfile:
            outfile.write(opener.read())
        # Replace the target in one step once the data is fully on disk.
        os.replace(partial_file, target)
    finally:
        # After a successful replace the temporary file is gone; on failure
        # make sure no half-written leftover remains.
        if os.path.exists(partial_file):
            os.remove(partial_file)

In [None]:
# Active dataset: the IMDb files, whose columns are separated by tabs.
delimiter = '\t'
train_file, train_url = 'imdb_train.txt', 'https://goo.gl/FPFnfh'
test_file, test_url = 'imdb_test.txt', 'https://goo.gl/mg8bsD'

# Alternative dataset (comma separated); uncomment to use it instead.
#train_file = 'news_train.txt'
#train_url='https://goo.gl/vxHzTR'
#test_file = 'news_test.txt'
#test_url = 'https://goo.gl/Lz5xNS'
#delimiter = ','

# Fetch the training split first, then the test split; download_file skips
# any file that already exists locally.
for source_url, target_path in ((train_url, train_file), (test_url, test_file)):
    download_file(source_url, target_path)

In [None]:
import csv
def _load_dataset(path, delimiter):
    """
    Read a delimited text file into parallel lists of texts and labels.

    The first column of each row is taken as the text and the second as its
    label. Completely empty rows (``csv.reader`` yields ``[]`` for a blank
    line) are skipped instead of failing on the column lookup.

    :param path: file to read, decoded as UTF-8
    :param delimiter: column separator handed to ``csv.reader``
    :return: ``(texts, labels)``, two lists of equal length
    :raises IndexError: if a non-empty row has fewer than two columns
    """
    texts = []
    labels = []
    # newline='' leaves newline handling to the csv module.
    with open(path, encoding='utf-8', newline='') as infile:
        for row in csv.reader(infile, delimiter=delimiter):
            if not row:
                continue
            texts.append(row[0])
            labels.append(row[1])
    return texts, labels


x_train, y_train = _load_dataset(train_file, delimiter)
x_test, y_test = _load_dataset(test_file, delimiter)


In [None]:
# Display the text of the second training example (index 1).
x_train[1]

In [None]:
# Display the label of the second training example (index 1).
y_train[1]

## Feature extraction with NLTK and spaCy

In [None]:
import spacy
from nltk.sentiment.util import mark_negation
from time import time
# Load the spaCy model once at module level; spacy_tokenizer reuses it on
# every call.
# NOTE(review): 'en' is a shortcut model name. spaCy 3 dropped shortcut names
# and expects the full package name (e.g. 'en_core_web_sm') - confirm against
# the installed spaCy version.
nlp = spacy.load('en')
from nltk.corpus import sentiwordnet as swn

# First letter of a spaCy tag -> SentiWordNet part of speech
# (adverb, noun, verb, adjective). Tags starting with anything else carry no
# sentiment feature.
_SWN_POS_BY_TAG_INITIAL = {'R': 'r', 'N': 'n', 'V': 'v', 'J': 'a'}


def _adjacent_pairs(prefix, items):
    """Return ``prefix + a + '_' + b`` for every pair of neighbouring items."""
    return [prefix + first + '_' + second
            for first, second in zip(items, items[1:])]


def _swn_score(synsets):
    """
    Rank-weighted mean of (positive - negative) score over the given senses.

    The sense at rank ``i`` (1-based) gets weight ``1 / i``, and the result is
    divided by the sum of exactly those weights. Returns ``0.0`` when there
    are no senses.
    """
    weighted_total = 0.0
    weight_total = 0.0
    for rank, synset in enumerate(synsets, start=1):
        weight = 1.0 / rank
        weighted_total += (synset.pos_score() - synset.neg_score()) * weight
        weight_total += weight
    if not weight_total:
        return 0.0
    return weighted_total / weight_total


def spacy_tokenizer(text):
    """
    Turn a text into the list of string features fed to the vectorizer.

    The returned list concatenates, in this order:

    * the tokens after NLTK negation marking (``_NEG`` suffix),
    * the lemmas, each prefixed with ``LEMMA_``,
    * bigrams over the negation-marked tokens, prefixed with ``BI_``,
    * bigrams over the spaCy tags, prefixed with ``POS_BI_``,
    * ``_SWN_POS_<pos>`` / ``_SWN_NEG_<pos>`` markers for adverbs, nouns,
      verbs and adjectives whose SentiWordNet score exceeds the threshold in
      absolute value; the polarity is flipped for negated tokens.

    :param text: the document to analyse
    :return: list of feature strings (duplicates are kept)
    """
    doc = nlp(text)
    tokens = [token.text for token in doc]
    lemmas = [token.lemma_ for token in doc]
    taggedtokens = [token.tag_ for token in doc]
    tokens_w_neg = mark_negation(tokens,double_neg_flip=True)

    # Built with zip() rather than nltk.ngrams: the bare name ``nltk`` is not
    # bound by the ``from nltk... import`` statements this file uses.
    bigrams = _adjacent_pairs('BI_', tokens_w_neg)
    pos_bigrams = _adjacent_pairs('POS_BI_', taggedtokens)

    # sentiment features
    swntokens = []
    swn_threshold = 0.1
    for pos,lemma,neg_token in zip(taggedtokens,lemmas,tokens_w_neg):
        # pos[:1] (not pos[0]) so an empty tag is skipped instead of raising.
        swnpos = _SWN_POS_BY_TAG_INITIAL.get(pos[:1])
        if swnpos is None:
            continue
        score = _swn_score(swn.senti_synsets(lemma, swnpos))
        if score > swn_threshold:
            positive = True
        elif score < -swn_threshold:
            positive = False
        else:
            # Too close to neutral (or no senses found): no feature.
            continue
        if neg_token.endswith('_NEG'):
            # A negated token contributes the opposite polarity.
            positive = not positive
        swntokens.append('_SWN_%s_%s' % ('POS' if positive else 'NEG', swnpos))

    all_tokens = list(tokens_w_neg)
    all_tokens.extend('LEMMA_'+lemma for lemma in lemmas)
    all_tokens.extend(bigrams)
    all_tokens.extend(pos_bigrams)
    all_tokens.extend(swntokens)
    return all_tokens

## Using a scikit-learn Pipeline object

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Minimum document frequency handed to the vectorizer as min_df.
mindf = 5

# The stages run in the order listed.
pipeline_steps = [
    # feature extraction
    ('vect', CountVectorizer(analyzer=spacy_tokenizer, min_df=mindf)),
    # feature selection
    ('sel', SelectKBest(chi2, k=5000)),
    # weighting
    ('tfidf', TfidfTransformer()),
    # learning algorithm
    ('learner', LinearSVC()),
]
pipeline = Pipeline(pipeline_steps)

# Train on the training split, then label the held-out test texts.
classifier = pipeline.fit(x_train, y_train)
predictions = classifier.predict(x_test)

In [None]:
# `accuracy` holds the number of predictions equal to the expected label;
# the printed value is that count divided by the number of predictions.
accuracy = sum(
    1 for predicted, expected in zip(predictions, y_test)
    if predicted == expected
)
print(accuracy / len(predictions))

In [None]:
# Display the features the tokenizer produces for a short negated sentence.
spacy_tokenizer("I don't like it")

In [None]:
import pickle
# Serialize the pipeline object to disk.
with open('movie_class_feats.pkl', 'wb') as pickle_file:
    pickle.Pickler(pickle_file).dump(pipeline)

In [None]:
# Read the pipeline back from the pickle file, rebinding `pipeline`.
with open('movie_class_feats.pkl', 'rb') as pickle_file:
    pipeline = pickle.Unpickler(pickle_file).load()

In [None]:
# Pull the individual stages out of the pipeline by their step names.
# Note that `classifier` now refers to the 'learner' step alone.
stages = pipeline.named_steps
tokenizer, selector, classifier = stages['vect'], stages['sel'], stages['learner']


In [None]:
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
if hasattr(tokenizer, 'get_feature_names_out'):
    feature_names = list(tokenizer.get_feature_names_out())
else:
    feature_names = tokenizer.get_feature_names()

# One (score, selected flag, feature name) tuple per vectorizer feature,
# using the scores and support mask stored on the fitted selector.
feats_w_score = list(
    zip(selector.scores_, selector.get_support(), feature_names)
)

In [None]:
# Number of features produced by the vectorizer (before feature selection).
len(feature_names)

In [None]:
# All features ordered by selector score, highest first; ties are broken by
# the selected flag and then by the feature name (tuple comparison).
sorted(feats_w_score,reverse=True)

In [None]:
# inverse_transform maps the learner's coefficients back onto the full
# feature space so positions line up with feature_names; [0] keeps the first
# row of coefficients only.
full_weights = selector.inverse_transform(classifier.coef_)[0]

# Keep (weight, feature name) pairs for every feature with a non-zero weight.
feats_w_classifier_weight = [
    (weight, feature_names[position])
    for position, weight in enumerate(full_weights)
    if weight != 0
]

In [None]:
# Features ordered by classifier weight, largest weight first.
sorted(feats_w_classifier_weight,reverse=True)

In [None]:
# Features ordered by classifier weight, smallest (most negative) first.
sorted(feats_w_classifier_weight)