{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Convolutional neural net with embeddings\n", "Adapted from code at https://github.com/fchollet/keras/blob/master/examples/imdb_cnn.py" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Importing objects and setting parameters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "'''This example demonstrates the use of Convolution1D for text classification.\n", "Gets to 0.89 test accuracy after 2 epochs.\n", "90s/epoch on Intel i5 2.4Ghz CPU.\n", "10s/epoch on Tesla K40 GPU.\n", "'''\n", "\n", "from keras.preprocessing import sequence\n", "from keras.models import Sequential\n", "from keras.layers import Dense, Dropout, Activation, Input\n", "from keras.layers import Embedding\n", "from keras.layers import Conv1D, GlobalMaxPooling1D\n", "from keras.datasets import imdb\n", "\n", "# set parameters:\n", "max_features = 5000\n", "maxlen = 80\n", "batch_size = 32\n", "embedding_dims = 50\n", "filters = 250\n", "kernel_size = 3\n", "hidden_dims = 250\n", "epochs = 2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Loading data...')\n", "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n", "print(len(x_train), 'train sequences')\n", "print(len(x_test), 'test sequences')\n", "\n", "print('Pad sequences (samples x time)')\n", "x_train = sequence.pad_sequences(x_train, maxlen=maxlen)\n", "x_test = sequence.pad_sequences(x_test, maxlen=maxlen)\n", "print('x_train shape:', x_train.shape)\n", "print('x_test shape:', x_test.shape)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train[0]" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "## Defining the network" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print('Build model...')\n", "model = Sequential()\n", "\n", "# we start off with an efficient embedding layer which maps\n", "# our vocab indices into embedding_dims dimensions\n", "model.add(Embedding(max_features,\n", " embedding_dims,\n", " input_length=maxlen))\n", "\n", "model.add(Dropout(0.2))\n", "\n", "# we add a Convolution1D, which will learn filters\n", "# word group filters of size filter_length:\n", "model.add(Conv1D(filters,\n", " kernel_size,\n", " padding='valid',\n", " activation='relu',\n", " strides=1))\n", "# we use max pooling:\n", "model.add(GlobalMaxPooling1D())\n", "\n", "# We add a vanilla hidden layer:\n", "model.add(Dense(hidden_dims))\n", "model.add(Dropout(0.2))\n", "model.add(Activation('relu'))\n", "\n", "# We project onto a single unit output layer, and squash it with a sigmoid:\n", "model.add(Dense(1))\n", "model.add(Activation('sigmoid'))\n", "\n", "model.compile(loss='binary_crossentropy',\n", " optimizer='adam',\n", " metrics=['accuracy'])\n", "model.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Training" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.fit(x_train, y_train,\n", " batch_size=batch_size,\n", " epochs=epochs,\n", " validation_data=(x_test, y_test))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Testing\n", "We already used test data during training to check accuracy on held out data, so this step isn't really necessary.\n", "\n", "However, let's see how to do 
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# predicted probability of a positive review, one row per test example\n", "predictions = model.predict(x_test)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions.round()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_test" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "# 0 where the rounded prediction matches the label, +/-1 where it doesn't\n", "errors = y_test - predictions.round().flatten()\n", "errors" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# number of correctly classified test examples\n", "len(errors[errors==0])" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# fraction correct, i.e. test accuracy\n", "len(errors[errors==0])/len(errors)" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Exploring the embeddings" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model.layers" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# the Embedding layer's weights: one embedding_dims-dimensional row per word index\n", "embeddings = model.get_weights()[0]\n", "embeddings.shape" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeddings[0]" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "embeddings[0].dot(embeddings[1])" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics.pairwise import cosine_similarity\n", "# pairwise cosine similarities between all embedding vectors\n", "similarities = cosine_similarity(embeddings)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "%matplotlib notebook\n", "import matplotlib.pyplot as plt\n", "plt.imshow(similarities, interpolation='nearest')\n", "plt.show()" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "word_index = imdb.get_word_index()\n", "word_index" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# imdb.load_data reserves indices 0-2 (padding, start, unknown) and shifts real\n", "# words up by index_from=3, so word_index[word] + 3 is the embedding row of word\n", "index_words = { value + 3: key for key, value in word_index.items()}\n", "index_words" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "idx = word_index['good'] + 3  # row of 'good' in the embedding matrix\n", "sim = list(enumerate(similarities[idx]))\n", "idx, sim" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "most_sim = sorted(sim, key=lambda x: x[1], reverse=True)\n", "for sim_idx, value in most_sim[:20]:\n", "    # fall back to a placeholder for the reserved indices 0-2\n", "    print(index_words.get(sim_idx, '<pad/start/unk>'), value)" ] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3 (TA)", "language": "python", "name": "ta" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }