# Convolutional neural net with embeddings
Adapted from code at https://github.com/fchollet/keras/blob/master/examples/imdb_cnn.py

## Importing objects and setting parameters

In [None]:
'''This example demonstrates the use of Convolution1D for text classification.
Gets to 0.89 test accuracy after 2 epochs.
90s/epoch on Intel i5 2.4Ghz CPU.
10s/epoch on Tesla K40 GPU.
'''

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

# Hyperparameters, grouped by the stage that consumes them.

# -- data preparation --
max_features = 5000  # passed as num_words to imdb.load_data and as the Embedding input size
maxlen = 80  # length every review is padded to by pad_sequences

# -- network shape --
embedding_dims = 50  # width of each embedding vector
filters = 250  # number of Conv1D filters
kernel_size = 3  # width of each convolution window
hidden_dims = 250  # units in the dense hidden layer

# -- optimisation --
batch_size = 32
epochs = 2

## Loading data

In [None]:
# Fetch the IMDB reviews with the vocabulary limited via num_words=max_features,
# then report how many reviews each split contains.
print('Loading data...')
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split
for split_name, split_x in (('train', x_train), ('test', x_test)):
    print(len(split_x), split_name, 'sequences')

# Bring every review to exactly `maxlen` entries so each split becomes a
# rectangular (samples x time) array.
print('Pad sequences (samples x time)')
x_train, x_test = (sequence.pad_sequences(split_x, maxlen=maxlen)
                   for split_x in (x_train, x_test))
print(f'x_train shape: {x_train.shape}')
print(f'x_test shape: {x_test.shape}')

In [None]:
# Peek at the first training example after padding (a row of vocabulary indices).
x_train[0]

In [None]:
print('Build model...')

# The network is declared in one go as an ordered list of layers; data flows
# through them from top to bottom.
model = Sequential([
    # Map each of the `max_features` vocabulary indices to a trainable
    # `embedding_dims`-wide vector, for reviews of length `maxlen`.
    Embedding(max_features, embedding_dims, input_length=maxlen),
    Dropout(0.2),

    # Learn `filters` detectors over windows of `kernel_size` consecutive
    # word vectors (no padding, stride of one).
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    # Keep only the maximum response of each filter across the review.
    GlobalMaxPooling1D(),

    # Plain fully connected hidden layer; dropout is applied before the ReLU.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),

    # Single output unit, squashed by a sigmoid.
    Dense(1),
    Activation('sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

## Training

In [None]:
# Train for `epochs` passes over the training split; the test split is
# supplied as validation data so held-out metrics are reported during training.
held_out = (x_test, y_test)
model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs, validation_data=held_out)

## Testing
We already passed the test data as validation data during training, so accuracy on held-out data has already been reported and this step isn't strictly necessary.

However, let's see how to do it after the training.

In [None]:
# Run the trained model over the padded test reviews and keep its outputs.
predictions = model.predict(x_test)

In [None]:
# Raw model outputs for the test set.
predictions

In [None]:
# Round the sigmoid outputs to the nearest integer to get hard 0/1 labels.
predictions.round()

In [None]:
# Ground-truth test labels, for comparison with the rounded predictions.
y_test

In [None]:
import numpy as np

# Difference between the true labels and the rounded, flattened predictions:
# an entry is zero wherever the two agree.
predicted_labels = predictions.round().flatten()
errors = y_test - predicted_labels
errors

In [None]:
# Count the test reviews whose error is exactly zero, i.e. correct predictions.
correct_mask = errors == 0
len(errors[correct_mask])

In [None]:
# Test accuracy: share of reviews with zero error.
n_correct = len(errors[errors == 0])
n_total = len(errors)
n_correct / n_total

## Exploring the embeddings

In [None]:
# List the model's layer objects.
model.layers

In [None]:
# The embedding matrix is the first weight array of the first layer
# (the Embedding layer); show its dimensions.
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]
embeddings.shape

In [None]:
# Embedding row for index 0.
embeddings[0]

In [None]:
# Dot product between the embedding rows for indices 0 and 1.
row_zero, row_one = embeddings[0], embeddings[1]
row_zero.dot(row_one)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of embedding rows; entry [i, j]
# compares the vectors for indices i and j.
similarities = cosine_similarity(embeddings)

In [None]:
# Select matplotlib's interactive notebook backend.
%matplotlib notebook
import matplotlib.pyplot as plt
# Draw the similarity matrix as an image, without interpolation between cells.
plt.imshow(similarities, interpolation='nearest')
plt.show()

In [None]:
# Word -> integer index mapping that ships with the IMDB dataset.
# NOTE(review): imdb.load_data() defaults to index_from=3, so the indices in
# x_train are presumably shifted by 3 relative to this mapping — confirm
# against the Keras docs before using these values as embedding rows.
word_index = imdb.get_word_index()
word_index

In [None]:
# Reverse lookup table: integer index -> word.
index_words = {token_id: word for word, token_id in word_index.items()}
index_words

In [None]:
# imdb.load_data() reserves the lowest indices for special tokens and, with
# its default index_from=3, encodes every word as (word_index value + 3).
# The Embedding rows follow the encoded data, so the raw word_index value
# must be shifted by the same amount to reach the row for 'good'.
index_from = 3
idx = word_index['good'] + index_from
# Pair every row index with its cosine similarity to the chosen row.
sim = list(enumerate(similarities[idx]))
idx,sim

In [None]:
# Rank all embedding rows by similarity to the query row, highest first.
most_sim = sorted(sim, key=lambda pair: pair[1], reverse=True)
# Embedding rows use the indices produced by imdb.load_data(), which by
# default (index_from=3) are the get_word_index() values shifted up by 3.
# Undo that shift before decoding; the reserved low rows have no dictionary
# word, so fall back to a placeholder instead of raising KeyError.
index_from = 3
for sim_idx, value in most_sim[:20]:
    print(index_words.get(sim_idx - index_from, '<reserved>'), value)