{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification with Sklearn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## loading imdb dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "import os\n",
    "\n",
    "def download_file(url,local_file, force=False):\n",
    "    \"\"\"\n",
    "    Helper function to download a file and store it locally\n",
    "    \"\"\"\n",
    "    if not os.path.exists(local_file) or force:\n",
    "        print('Downloading')\n",
    "        with urllib.request.urlopen(url) as opener, \\\n",
    "             open(local_file, mode='wb') as outfile:\n",
    "                    outfile.write(opener.read())\n",
    "    else:\n",
    "        print('Already downloaded')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file = 'imdb_train.txt'\n",
    "train_url='https://goo.gl/FPFnfh'\n",
    "test_file = 'imdb_test.txt'\n",
    "test_url = 'https://goo.gl/mg8bsD'\n",
    "delimiter = '\\t'\n",
    "\n",
    "# train_file = 'news_train.txt'\n",
    "# train_url='https://goo.gl/vxHzTR'\n",
    "# test_file = 'news_test.txt'\n",
    "# test_url = 'https://goo.gl/Lz5xNS'\n",
    "# delimiter = ','\n",
    "\n",
    "download_file(train_url, train_file)\n",
    "download_file(test_url, test_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "x_train = list()\n",
    "y_train = list()\n",
    "with open(train_file, encoding='utf-8', newline='') as infile:\n",
    "    reader = csv.reader(infile, delimiter=delimiter)\n",
    "    for row in reader:\n",
    "        x_train.append(row[0])\n",
    "        y_train.append(row[1])\n",
    "\n",
    "x_test = list()\n",
    "y_test = list()\n",
    "with open(test_file, encoding='utf-8', newline='') as infile:\n",
    "    reader = csv.reader(infile, delimiter=delimiter)\n",
    "    for row in reader:\n",
    "        x_test.append(row[0])\n",
    "        y_test.append(row[1])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set(y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup of pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer\n",
    "from sklearn.feature_selection import SelectKBest, chi2\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.svm import LinearSVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vect = CountVectorizer()  # tokenization and feature extraction\n",
    "vect.fit(x_train)\n",
    "X_train = vect.transform(x_train)\n",
    "X_test =vect.transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vect.get_feature_names()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train[0,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vect.get_feature_names()[60314]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feats = list()\n",
    "for i in range(X_train[0,:].shape[1]):\n",
    "    if X_train[0,i]!=0:\n",
    "        feats.append(i)\n",
    "print(feats,len(feats))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print([vect.get_feature_names()[i] for i in feats])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sel = SelectKBest(chi2, k=5000)  # feature selection\n",
    "sel.fit(X_train,y_train)\n",
    "X_train = sel.transform(X_train)\n",
    "X_test = sel.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sel.get_support()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train[0,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf = TfidfTransformer()  # weighting\n",
    "tfidf.fit(X_train)\n",
    "X_train = tfidf.transform(X_train)\n",
    "X_test =tfidf.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X_train[0,:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "learner = LinearSVC()  # learning algorithm\n",
    "classifier = learner.fit(X_train,y_train)\n",
    "predictions = classifier.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation of accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "correct = 0\n",
    "for prediction,true_label in zip(predictions, y_test):\n",
    "    if prediction==true_label:\n",
    "        correct += 1\n",
    "print(correct/len(predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using sklearn pipeline object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer()),  # feature extraction\n",
    "    ('sel', SelectKBest(chi2, k=5000)),  # feature selection\n",
    "    ('tfidf', TfidfTransformer()),  # weighting\n",
    "    ('learner', LinearSVC())  # learning algorithm\n",
    "])\n",
    "\n",
    "classifier = pipeline.fit(x_train,y_train)\n",
    "predictions = classifier.predict(x_test)\n",
    "correct = 0\n",
    "for prediction,true_label in zip(predictions, y_test):\n",
    "    if prediction==true_label:\n",
    "        correct += 1\n",
    "print(correct/len(predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "print('Classification report:')\n",
    "print(classification_report(y_test, predictions))\n",
    "print('Confusion matrix:')\n",
    "cm = confusion_matrix(y_test, predictions)\n",
    "print(cm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (TA)",
   "language": "python",
   "name": "ta"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}