
# Sentiment classification with VADER

## loading IMDB dataset
Only the test data is loaded, since VADER is a lexicon-based analyzer and does not require training data.

In [None]:
# Download the IMDB test file and cache it locally as 'imdb_test.txt'.
# Run this cell only once; the following cells read the cached copy.
from urllib import request
url = "https://goo.gl/mg8bsD"
# The context manager closes the HTTP response even if reading or
# decoding raises, so the connection is never leaked.
with request.urlopen(url) as response:
    text = response.read().decode('utf-8')
with open('imdb_test.txt',mode='w',encoding='utf-8') as outputfile:
    outputfile.write(text)


In [None]:
# Load the cached dataset file back into memory as a single string.
with open('imdb_test.txt', mode='r', encoding='utf-8') as cached:
    text = cached.read()

In [None]:
# Parse the cached tab-separated file into two parallel lists:
# x_test receives the first column (the document text) and y_test the
# second column converted to an integer label.
import csv
x_test = []
y_test = []
with open('imdb_test.txt', encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile, delimiter='\t')
    for row in reader:
        # csv.reader yields an empty list for a blank line (for example a
        # trailing newline at the end of the file); skip it rather than
        # failing with IndexError on row[0].
        if not row:
            continue
        x_test.append(row[0])
        y_test.append(int(row[1]))
# Display the first document and its label as a quick sanity check.
x_test[0],y_test[0]

In [None]:
# Number of test documents that were loaded.
len(x_test)

## creating VADER classifier

In [None]:
# Build the NLTK VADER analyzer once; the same instance is reused for every
# scoring call in the cells below.
# NOTE(review): NLTK presumably needs its 'vader_lexicon' resource installed
# (nltk.download('vader_lexicon')) — confirm on a fresh environment.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader = SentimentIntensityAnalyzer()

In [None]:
# Try the analyzer on a negated phrase to inspect how it is scored.
vader.polarity_scores('not the best experience I had')

In [None]:
# Try the analyzer on an emoticon to inspect how it is scored.
vader.polarity_scores(':P')

## classification of test data

In [None]:
# Score every test document with VADER, keeping only the 'compound' value.
# The iteration variable is deliberately not called `text`: at module level
# a for-loop variable would overwrite the `text` that holds the raw file
# contents read earlier in the notebook.
scores = [vader.polarity_scores(review)['compound'] for review in x_test]

In [None]:
# Show each compound score next to the gold label of the same document.
list(zip(scores,y_test))

Let's try to assign labels only when the confidence is very high, i.e. when the absolute value of the compound score exceeds 0.99.
This is not useful when you must classify all documents.
It is useful instead when you want to bootstrap a training set.

In [None]:
# Keep only the (score, label) pairs whose compound score is extreme in
# either direction, i.e. the documents VADER is most confident about.
# The threshold is named so it can be tuned in one place.
CONFIDENCE_THRESHOLD = 0.99
selection = [
    (score, label)
    for score, label in zip(scores, y_test)  # zip is consumed lazily; no intermediate list
    if abs(score) > CONFIDENCE_THRESHOLD
]

In [None]:
# Compare how many documents passed the confidence filter with the total.
len(selection),len(y_test)

## Evaluation of accuracy

In [None]:
# Evaluate VADER on every test document.
# A compound score > 0 is read as a positive prediction (label 1); a score
# <= 0, which includes the neutral 0.0, is read as negative (label 0).
# `accuracy` holds the number of correct predictions; the ratio is computed
# only when printing.
accuracy = sum(
    (prediction > 0 and correct == 1) or (prediction <= 0 and correct == 0)
    for prediction, correct in zip(scores, y_test)
)
total = len(scores)
# Guard against an empty score list, which would otherwise raise
# ZeroDivisionError; report NaN because the accuracy is undefined.
print(total, accuracy / total if total else float('nan'))

The following evaluation is not a fair comparison, because it is computed only on the high-confidence subset rather than on the full test set.
Yet it shows that the subset of documents that get a label is labeled more accurately.
So if such a subset is used as a training set for a supervised learning algorithm, you can expect to learn a better classifier.

In [None]:
# Evaluate VADER only on the high-confidence (score, label) pairs kept in
# `selection`.
# A compound score > 0 is read as a positive prediction (label 1); a score
# <= 0 is read as negative (label 0).
# `accuracy` holds the number of correct predictions; the ratio is computed
# only when printing.
accuracy = sum(
    (prediction > 0 and correct == 1) or (prediction <= 0 and correct == 0)
    for prediction, correct in selection
)
total = len(selection)
# With a strict confidence threshold the selection may be empty; report NaN
# instead of raising ZeroDivisionError.
print(total, accuracy / total if total else float('nan'))