{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification with Sklearn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## loading imdb dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "import os\n",
    "\n",
    "def download_file(url,local_file, force=False):\n",
    "    \"\"\"\n",
    "    Helper function to download a file and store it locally\n",
    "    \"\"\"\n",
    "    if not os.path.exists(local_file) or force:\n",
    "        print('Downloading')\n",
    "        with urllib.request.urlopen(url) as opener, \\\n",
    "             open(local_file, mode='wb') as outfile:\n",
    "                    outfile.write(opener.read())\n",
    "    else:\n",
    "        print('Already downloaded')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file = 'imdb_train.txt'\n",
    "train_url='https://goo.gl/FPFnfh'\n",
    "test_file = 'imdb_test.txt'\n",
    "test_url = 'https://goo.gl/mg8bsD'\n",
    "delimiter = '\\t'\n",
    "\n",
    "#train_file = 'news_train.txt'\n",
    "#train_url='https://goo.gl/vxHzTR'\n",
    "#test_file = 'news_test.txt'\n",
    "#test_url = 'https://goo.gl/Lz5xNS'\n",
    "#delimiter = ','\n",
    "\n",
    "download_file(train_url, train_file)\n",
    "download_file(test_url, test_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "x_train = list()\n",
    "y_train = list()\n",
    "with open(train_file, encoding='utf-8', newline='') as infile:\n",
    "    reader = csv.reader(infile, delimiter=delimiter)\n",
    "    for row in reader:\n",
    "        x_train.append(row[0])\n",
    "        y_train.append(row[1])\n",
    "\n",
    "x_test = list()\n",
    "y_test = list()\n",
    "with open(test_file, encoding='utf-8', newline='') as infile:\n",
    "    reader = csv.reader(infile, delimiter=delimiter)\n",
    "    for row in reader:\n",
    "        x_test.append(row[0])\n",
    "        y_test.append(row[1])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train[1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature extraction with nltk and spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "from nltk.sentiment.util import mark_negation\n",
    "from time import time\n",
    "nlp = spacy.load('en')\n",
    "from nltk.corpus import sentiwordnet as swn\n",
    "\n",
    "def spacy_tokenizer(text):\n",
    "    doc = nlp(text)\n",
    "    # tokens\n",
    "    tokens = [token.text for token in doc]\n",
    "    # lemmatized\n",
    "    lemmas = [token.lemma_ for token in doc]\n",
    "    # tokens with negation\n",
    "    tokens_w_neg = mark_negation(tokens,double_neg_flip=True)\n",
    "    bigrams = ['BI_'+w1+'_'+w2 for w1,w2 in nltk.ngrams(tokens_w_neg,2)]\n",
    "    # pos tagging features\n",
    "    taggedtokens = [token.tag_ for token in doc]\n",
    "    pos_bigrams = ['POS_BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(taggedtokens,2)]\n",
    "    # sentiment features\n",
    "    swntokens = list()\n",
    "    swn_threshold = 0.1\n",
    "    for pos,lemma,neg_token in zip(taggedtokens,lemmas,tokens_w_neg):\n",
    "        if pos[0] in ('R', 'N', 'J', 'V'): # translating spacy postag to nltk.swn postag\n",
    "            if pos[0] == 'R':\n",
    "                swnpos = 'r'\n",
    "            if pos[0] == 'N':\n",
    "                swnpos = 'n'\n",
    "            if pos[0] == 'V':\n",
    "                swnpos = 'v'\n",
    "            if pos[0] == 'J':\n",
    "                swnpos = 'a'\n",
    "            values = list(swn.senti_synsets(lemma, swnpos))\n",
    "            if len(values) > 0:\n",
    "                score = 0.0\n",
    "                i = 1\n",
    "                sum = 0.0\n",
    "                for value in values:\n",
    "                    score += value.pos_score() / i\n",
    "                    score -= value.neg_score() / i\n",
    "                    i += 1\n",
    "                    sum += 1.0 / i\n",
    "                score /= sum\n",
    "                if score > swn_threshold:\n",
    "                    if neg_token.endswith('_NEG'):\n",
    "                        swntokens.append('_SWN_NEG_%s' % swnpos)\n",
    "                    else:\n",
    "                        swntokens.append('_SWN_POS_%s' % swnpos)\n",
    "                elif score < -swn_threshold:\n",
    "                    if neg_token.endswith('_NEG'):\n",
    "                        swntokens.append('_SWN_POS_%s' % swnpos)\n",
    "                    else:\n",
    "                        swntokens.append('_SWN_NEG_%s' % swnpos)\n",
    "    all_tokens = list()\n",
    "    all_tokens.extend(tokens_w_neg)\n",
    "    all_tokens.extend(['LEMMA_'+lemma for lemma in lemmas])\n",
    "    all_tokens.extend(bigrams)\n",
    "    all_tokens.extend(pos_bigrams)\n",
    "    all_tokens.extend(swntokens)\n",
    "    return all_tokens"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using sklearn pipeline object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer\n",
    "from sklearn.feature_selection import SelectKBest, chi2\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "mindf = 5\n",
    "pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer(analyzer=spacy_tokenizer, min_df=mindf)),  # feature extraction\n",
    "    ('sel', SelectKBest(chi2, k=5000)),  # feature selection\n",
    "    ('tfidf', TfidfTransformer()),  # weighting\n",
    "    ('learner', LinearSVC())  # learning algorithm\n",
    "])\n",
    "\n",
    "classifier = pipeline.fit(x_train,y_train)\n",
    "predictions = classifier.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy = 0\n",
    "for prediction,correct in zip(predictions, y_test):\n",
    "    if prediction==correct:\n",
    "        accuracy += 1\n",
    "print(accuracy/len(predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spacy_tokenizer(\"I don't like it\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "with open('movie_class_feats.pkl',mode='wb') as outfile:\n",
    "    pickle.dump(pipeline,outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('movie_class_feats.pkl',mode='rb') as infile:\n",
    "    pipeline = pickle.load(infile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = pipeline.named_steps['vect']\n",
    "selector = pipeline.named_steps['sel']\n",
    "classifier = pipeline.named_steps['learner']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = tokenizer.get_feature_names()\n",
    "feats_w_score = list()\n",
    "for index,(selected,score) in enumerate(zip(selector.get_support(),selector.scores_)):\n",
    "    feats_w_score.append((score,selected,feature_names[index]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(feature_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(feats_w_score,reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feats_w_classifier_weight = list()\n",
    "for index,weight in enumerate(selector.inverse_transform(classifier.coef_)[0]):\n",
    "    if weight!=0:\n",
    "        feats_w_classifier_weight.append((weight,feature_names[index]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(feats_w_classifier_weight,reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(feats_w_classifier_weight)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (TA)",
   "language": "python",
   "name": "ta"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}