In [None]:
from keras.utils.data_utils import get_file
import csv

In [None]:
# Download (or reuse the Keras cache of) the train and test splits.
train_file = get_file('imdb_train.txt', origin='https://goo.gl/FPFnfh')
test_file = get_file('imdb_test.txt', origin='https://goo.gl/mg8bsD')

# Collect the first tab-separated field of every row (presumably the review
# text -- confirm against the file layout) from both files.
reviews = []
for path in (train_file, test_file):
    with open(path, encoding='utf-8', newline='') as infile:
        for row in csv.reader(infile, delimiter='\t'):
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on row[0].
            if row:
                reviews.append(row[0])

# Join with a newline so the last word of one review and the first word of the
# next stay separate tokens (plain concatenation would glue them together).
text = '\n'.join(reviews)
print('corpus length:', len(text))

In [None]:
from nltk.tokenize import word_tokenize
# Lower-case the whole corpus and split it into a flat sequence of word tokens.
# NOTE(review): word_tokenize needs NLTK's tokenizer data to be installed
# locally -- confirm it is available on a fresh environment.
words = word_tokenize(text.lower())

In [None]:
from collections import Counter, deque
from math import log

class SOPMI:
    """Semantic-orientation scorer built from seed-word co-occurrence counts.

    For every occurrence of a seed word in the token sequence, the tokens
    within ``near`` positions on either side (and the seed itself) are counted
    as co-occurring with that seed. ``so(word)`` then combines add-one
    smoothed counts into a score: larger values mean ``word`` co-occurs
    relatively more with the positive seeds, smaller values with the negative
    seeds.

    Attributes:
        w_count: Counter mapping each seed to its number of occurrences.
        p_count: dict mapping each seed that occurred to a Counter of the
            tokens found in its window (the seed itself included).
        pos_seeds: set of positive seed words.
        neg_seeds: set of negative seed words.
    """

    def __init__(self, pos_seeds, neg_seeds, words, near = 10):
        """Count seed occurrences and their neighbouring tokens.

        Args:
            pos_seeds: iterable of positive seed words.
            neg_seeds: iterable of negative seed words.
            words: iterable of tokens, in corpus order.
            near: how many tokens on each side of a seed count as "near" it.

        Raises:
            ValueError: if ``near`` is negative.
        """
        if near < 0:
            raise ValueError('near must be non-negative')
        self.w_count = Counter()
        self.p_count = dict()
        self.pos_seeds = set(pos_seeds)
        self.neg_seeds = set(neg_seeds)
        seeds = self.pos_seeds | self.neg_seeds
        # Random access is needed to slice a window around each seed; only
        # materialise the input when it is not already an indexable sequence.
        tokens = words if isinstance(words, (list, tuple)) else list(words)
        for index, token in enumerate(tokens):
            if token not in seeds:
                continue
            self.w_count[token] += 1
            # Clamp the left edge so seeds closer than `near` tokens to the
            # start of the corpus are still counted; slicing already clamps
            # the right edge. Away from the edges this is the same
            # 2*near+1 token window centred on the seed.
            start = max(0, index - near)
            neighbours = self.p_count.setdefault(token, Counter())
            neighbours.update(tokens[start:index + near + 1])

    def hits(self,word):
        """Return the add-one smoothed occurrence count of seed ``word``."""
        return self.w_count[word]+1

    def hits_near(self,w1,w2):
        """Return the add-one smoothed count of ``w2`` in windows of seed ``w1``.

        Yields 1 when ``w1`` never occurred as a seed.
        """
        if w1 in self.p_count:
            return self.p_count[w1][w2]+1
        else:
            return 1

    def so(self, word):
        """Return the semantic-orientation score of ``word``.

        The score is the sum over positive seeds of
        ``log(hits_near(seed, word) / hits(seed))`` minus the same sum over
        negative seeds, using the natural logarithm.
        """
        score = 0
        for pos_seed in self.pos_seeds:
            score += log(self.hits_near(pos_seed,word)/self.hits(pos_seed))
        for neg_seed in self.neg_seeds:
            score -= log(self.hits_near(neg_seed,word)/self.hits(neg_seed))
        return score

In [None]:
# Seed vocabularies that anchor the two ends of the orientation scale.
# NOTE(review): these look like the paradigm words of Turney & Littman (2003)
# -- confirm the intended source.
pos_seeds = [
    'good',
    'nice',
    'excellent',
    'positive',
    'fortunate',
    'correct',
    'superior',
]
neg_seeds = [
    'bad',
    'nasty',
    'poor',
    'negative',
    'unfortunate',
    'wrong',
    'inferior',
]

In [None]:
# Sanity check: show the first 200 tokens of the tokenised corpus.
print(words[:200])

In [None]:
# Build the co-occurrence model over the full token sequence, using the
# default neighbourhood size (near=10 tokens on each side of a seed).
model = SOPMI(pos_seeds, neg_seeds, words)

In [None]:
# Spot check: orientation score of a single word. Lower scores mean the word
# co-occurs relatively more with the negative seeds.
model.so('unnatural')

In [None]:
# Spot check: orientation score of a single word. Higher scores mean the word
# co-occurs relatively more with the positive seeds.
model.so('elegant')

In [None]:
from nltk.corpus import stopwords

# English stop-word list from NLTK, used below to filter the vocabulary.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand -- confirm on a fresh environment.
sw = stopwords.words('english')

In [None]:
# Distinct tokens in the corpus.
vocab = set(words)

# Words that must not be scored: the seeds themselves and stop words. Merging
# them into one set makes each membership test O(1) instead of scanning three
# lists for every vocabulary word.
excluded = set(pos_seeds) | set(neg_seeds) | set(sw)

# (word, score) pairs for every remaining vocabulary word.
so_values = [(word, model.so(word)) for word in vocab if word not in excluded]

# Ascending by score. Many words share the exact same score, so break ties
# alphabetically; otherwise their order follows set iteration order, which
# varies between kernel restarts and makes the displayed extremes unstable.
so_values.sort(key = lambda pair: (pair[1], pair[0]))

In [None]:
# The 50 lowest-scoring words (so_values is sorted ascending by score).
so_values[:50]

In [None]:
# The 50 highest-scoring words (so_values is sorted ascending by score).
so_values[-50:]