## Classification with Sklearn

## Loading the IMDB dataset

In [None]:
import urllib.request
import os

def download_file(url,local_file, force=False):
    """
    Helper function to download a file and store it locally.

    The payload is streamed into a temporary ``<local_file>.part`` file and
    only moved over ``local_file`` once the transfer has finished, so an
    interrupted download never leaves a truncated file that a later call
    would mistake for a complete one.

    Parameters:
        url: address to fetch; anything ``urllib.request.urlopen`` accepts.
        local_file: path where the downloaded content is stored.
        force: when true, download even if ``local_file`` already exists.

    Raises:
        Whatever ``urllib.request.urlopen`` or the file operations raise;
        in that case an existing ``local_file`` is left untouched.
    """
    # Local import so the helper does not rely on a file-level import.
    import shutil

    if os.path.exists(local_file) and not force:
        print('Already downloaded')
        return

    print('Downloading')
    partial_file = os.fspath(local_file) + '.part'
    try:
        with urllib.request.urlopen(url) as opener, \
                open(partial_file, mode='wb') as outfile:
            # Copy in chunks instead of holding the whole response in memory.
            shutil.copyfileobj(opener, outfile)
        # Publish the finished download in one step (overwrites on force).
        os.replace(partial_file, local_file)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete file, then
        # let the original error propagate.
        if os.path.exists(partial_file):
            os.remove(partial_file)
        raise

In [None]:
# Active dataset: the IMDB train/test files, one tab-separated record per line.
train_url = 'https://goo.gl/FPFnfh'
train_file = 'imdb_train.txt'
test_url = 'https://goo.gl/mg8bsD'
test_file = 'imdb_test.txt'
delimiter = '\t'

# Alternative dataset (comma-separated news files): uncomment these lines
# to replace the IMDB settings above.
# train_file = 'news_train.txt'
# train_url='https://goo.gl/vxHzTR'
# test_file = 'news_test.txt'
# test_url = 'https://goo.gl/Lz5xNS'
# delimiter = ','

# Fetch the training split first, then the test split; files that are
# already present locally are not downloaded again.
for source_url, target_path in ((train_url, train_file), (test_url, test_file)):
    download_file(source_url, target_path)

In [None]:
import csv


def _load_dataset(path, delimiter):
    """
    Read a delimited text file into parallel lists of texts and labels.

    The first column of every row is taken as the document text and the
    second column as its label. Blank lines (which ``csv.reader`` yields
    as empty rows) are skipped instead of crashing the load.

    Parameters:
        path: file to read; decoded as UTF-8.
        delimiter: single-character column separator.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists of strings.

    Raises:
        IndexError: if a non-blank row has fewer than two columns.
    """
    texts = []
    labels = []
    # newline='' lets the csv module do its own newline handling.
    with open(path, encoding='utf-8', newline='') as infile:
        for row in csv.reader(infile, delimiter=delimiter):
            if not row:
                continue
            texts.append(row[0])
            labels.append(row[1])
    return texts, labels


# Raw documents (x_*) and their labels (y_*) for both splits.
x_train, y_train = _load_dataset(train_file, delimiter)
x_test, y_test = _load_dataset(test_file, delimiter)


In [None]:
# Show the raw text of the second training document.
x_train[1]

In [None]:
# Show the distinct label values that occur in the training data.
set(y_train)

## Setup of pipeline

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Tokenization and feature extraction: the vocabulary is learned from the
# training texts only, and the test texts are mapped onto that vocabulary.
vect = CountVectorizer()
X_train = vect.fit_transform(x_train)
X_test = vect.transform(x_test)

In [None]:
# Show the vocabulary learned by the vectorizer, ordered by feature index.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() is available from scikit-learn 1.0.
vect.get_feature_names_out()

In [None]:
# Show the first row of X_train, i.e. the features of the first training
# document as produced by the vectorizer.
X_train[0,:]

In [None]:
# Look up the token behind one feature index.
# NOTE(review): 60314 is a hard-coded index, presumably picked from earlier
# output; it depends on the vocabulary and will differ for other data.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() is available from scikit-learn 1.0.
vect.get_feature_names_out()[60314]

In [None]:
# Collect the column indices of all non-zero features of the first
# training document, then print them together with their count.
n_features = X_train[0,:].shape[1]
feats = [col for col in range(n_features) if X_train[0, col] != 0]
print(feats, len(feats))

In [None]:
# Translate the collected feature indices back into their tokens.
# The vocabulary array is fetched once rather than once per index.
# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() is available from scikit-learn 1.0.
feature_names = vect.get_feature_names_out()
# str() keeps the printed list as plain Python strings.
print([str(feature_names[i]) for i in feats])

In [None]:
# Feature selection: keep the 5000 features with the highest chi-squared
# score against the training labels, and apply that selection to both splits.
sel = SelectKBest(chi2, k=5000)
X_train = sel.fit_transform(X_train, y_train)
X_test = sel.transform(X_test)

In [None]:
# Show which of the original features the selector kept.
sel.get_support()

In [None]:
# Show the training matrix after feature selection.
X_train

In [None]:
# Show the first training document's row after feature selection.
X_train[0,:]

In [None]:
# Weighting: fit the tf-idf statistics on the training matrix only, then
# reweight both the training and the test matrix with them.
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [None]:
# Print the first training document's row after tf-idf weighting.
print(X_train[0,:])

In [None]:
# Learning algorithm: train a linear SVM on the weighted training matrix.
learner = LinearSVC()
learner.fit(X_train, y_train)
# fit() returns the estimator itself, so the trained classifier is `learner`.
classifier = learner
# Predict one label per test document.
predictions = classifier.predict(X_test)

In [None]:
# Show how many predictions were produced.
len(predictions)

In [None]:
# Show the predicted labels.
predictions

## Evaluation of accuracy

In [None]:
# Accuracy: the share of test documents whose predicted label equals the
# true label.
correct = sum(1 for guess, gold in zip(predictions, y_test) if guess == gold)
print(correct/len(predictions))

## Using the sklearn Pipeline object

In [None]:
# The same four processing stages as in the step-by-step cells, chained
# into a single estimator that works directly on the raw texts.
steps = [
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC()),  # learning algorithm
]
pipeline = Pipeline(steps)

# fit() returns the pipeline itself, which then serves as the classifier.
pipeline.fit(x_train, y_train)
classifier = pipeline
predictions = classifier.predict(x_test)

# Accuracy: the share of test documents predicted correctly.
correct = sum(1 for guess, gold in zip(predictions, y_test) if guess == gold)
print(correct/len(predictions))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Compute both summaries from the true test labels and the predictions,
# then print each under its own heading.
report = classification_report(y_test, predictions)
cm = confusion_matrix(y_test, predictions)

print('Classification report:')
print(report)
print('Confusion matrix:')
print(cm)