{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Classification with Sklearn" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## loading imdb dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import urllib.request\n", "import os\n", "\n", "def download_file(url,local_file, force=False):\n", " \"\"\"\n", " Helper function to download a file and store it locally\n", " \"\"\"\n", " if not os.path.exists(local_file) or force:\n", " print('Downloading')\n", " with urllib.request.urlopen(url) as opener, \\\n", " open(local_file, mode='wb') as outfile:\n", " outfile.write(opener.read())\n", " else:\n", " print('Already downloaded')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "train_file = 'imdb_train.txt'\n", "train_url='https://goo.gl/FPFnfh'\n", "test_file = 'imdb_test.txt'\n", "test_url = 'https://goo.gl/mg8bsD'\n", "delimiter = '\\t'\n", "\n", "#train_file = 'news_train.txt'\n", "#train_url='https://goo.gl/vxHzTR'\n", "#test_file = 'news_test.txt'\n", "#test_url = 'https://goo.gl/Lz5xNS'\n", "#delimiter = ','\n", "\n", "download_file(train_url, train_file)\n", "download_file(test_url, test_file)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import csv\n", "x_train = list()\n", "y_train = list()\n", "with open(train_file, encoding='utf-8', newline='') as infile:\n", " reader = csv.reader(infile, delimiter=delimiter)\n", " for row in reader:\n", " x_train.append(row[0])\n", " y_train.append(row[1])\n", "\n", "x_test = list()\n", "y_test = list()\n", "with open(test_file, encoding='utf-8', newline='') as infile:\n", " reader = csv.reader(infile, delimiter=delimiter)\n", " for row in reader:\n", " x_test.append(row[0])\n", " y_test.append(row[1])\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "x_train[1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_train[1]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature extraction with nltk and spacy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import spacy\n", "from nltk.sentiment.util import mark_negation\n", "from time import time\n", "nlp = spacy.load('en')\n", "from nltk.corpus import sentiwordnet as swn\n", "\n", "def spacy_tokenizer(text):\n", " doc = nlp(text)\n", " # tokens\n", " tokens = [token.text for token in doc]\n", " # lemmatized\n", " lemmas = [token.lemma_ for token in doc]\n", " # tokens with negation\n", " tokens_w_neg = mark_negation(tokens,double_neg_flip=True)\n", " bigrams = ['BI_'+w1+'_'+w2 for w1,w2 in nltk.ngrams(tokens_w_neg,2)]\n", " # pos tagging features\n", " taggedtokens = [token.tag_ for token in doc]\n", " pos_bigrams = ['POS_BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(taggedtokens,2)]\n", " # sentiment features\n", " swntokens = list()\n", " swn_threshold = 0.1\n", " for pos,lemma,neg_token in zip(taggedtokens,lemmas,tokens_w_neg):\n", " if pos[0] in ('R', 'N', 'J', 'V'): # translating spacy postag to nltk.swn postag\n", " if pos[0] == 'R':\n", " swnpos = 'r'\n", " if pos[0] == 'N':\n", " swnpos = 'n'\n", " if pos[0] == 'V':\n", " swnpos = 'v'\n", " if pos[0] == 'J':\n", " swnpos = 'a'\n", " values = list(swn.senti_synsets(lemma, swnpos))\n", " if len(values) > 0:\n", " score = 0.0\n", " i = 1\n", " sum = 0.0\n", " for value in values:\n", " score += 
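  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The tokenizer defined below relies on the WordNet and SentiWordNet corpora from NLTK and on a spaCy English model. If these resources are not yet installed locally (an assumption about your environment), the next cell is a minimal sketch of how to fetch them; skip it if they are already available."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "\n",
    "# Fetch the lexical resources used by spacy_tokenizer below (a no-op if already present)\n",
    "nltk.download('wordnet')\n",
    "nltk.download('sentiwordnet')\n",
    "\n",
    "# The spaCy English model is installed from the command line, e.g.:\n",
    "#   python -m spacy download en"
   ]
  },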
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "import spacy\n",
    "from nltk.corpus import sentiwordnet as swn\n",
    "from nltk.sentiment.util import mark_negation\n",
    "\n",
    "nlp = spacy.load('en')  # with newer spaCy versions use spacy.load('en_core_web_sm')\n",
    "\n",
    "# first letter of a Penn Treebank tag (as used by spaCy) -> WordNet POS tag\n",
    "swn_pos_map = {'R': 'r', 'N': 'n', 'V': 'v', 'J': 'a'}\n",
    "\n",
    "def spacy_tokenizer(text):\n",
    "    doc = nlp(text)\n",
    "    # tokens\n",
    "    tokens = [token.text for token in doc]\n",
    "    # lemmas\n",
    "    lemmas = [token.lemma_ for token in doc]\n",
    "    # tokens with negation marking\n",
    "    tokens_w_neg = mark_negation(tokens, double_neg_flip=True)\n",
    "    bigrams = ['BI_' + w1 + '_' + w2 for w1, w2 in nltk.ngrams(tokens_w_neg, 2)]\n",
    "    # POS tagging features\n",
    "    taggedtokens = [token.tag_ for token in doc]\n",
    "    pos_bigrams = ['POS_BI_' + p1 + '_' + p2 for p1, p2 in nltk.ngrams(taggedtokens, 2)]\n",
    "    # sentiment features from SentiWordNet\n",
    "    swntokens = list()\n",
    "    swn_threshold = 0.1\n",
    "    for pos, lemma, neg_token in zip(taggedtokens, lemmas, tokens_w_neg):\n",
    "        if pos[0] in swn_pos_map:\n",
    "            swnpos = swn_pos_map[pos[0]]\n",
    "            values = list(swn.senti_synsets(lemma, swnpos))\n",
    "            if len(values) > 0:\n",
    "                # weighted average of the synset sentiment scores,\n",
    "                # giving more weight to the more frequent senses\n",
    "                score = 0.0\n",
    "                norm = 0.0\n",
    "                for i, value in enumerate(values, start=1):\n",
    "                    score += (value.pos_score() - value.neg_score()) / i\n",
    "                    norm += 1.0 / i\n",
    "                score /= norm\n",
    "                if score > swn_threshold:\n",
    "                    if neg_token.endswith('_NEG'):\n",
    "                        swntokens.append('_SWN_NEG_%s' % swnpos)\n",
    "                    else:\n",
    "                        swntokens.append('_SWN_POS_%s' % swnpos)\n",
    "                elif score < -swn_threshold:\n",
    "                    if neg_token.endswith('_NEG'):\n",
    "                        swntokens.append('_SWN_POS_%s' % swnpos)\n",
    "                    else:\n",
    "                        swntokens.append('_SWN_NEG_%s' % swnpos)\n",
    "    all_tokens = list()\n",
    "    all_tokens.extend(tokens_w_neg)\n",
    "    all_tokens.extend(['LEMMA_' + lemma for lemma in lemmas])\n",
    "    all_tokens.extend(bigrams)\n",
    "    all_tokens.extend(pos_bigrams)\n",
    "    all_tokens.extend(swntokens)\n",
    "    return all_tokens"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using the scikit-learn Pipeline object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer\n",
    "from sklearn.feature_selection import SelectKBest, chi2\n",
    "from sklearn.naive_bayes import MultinomialNB  # alternative learner to LinearSVC\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "mindf = 5\n",
    "pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer(analyzer=spacy_tokenizer, min_df=mindf)),  # feature extraction\n",
    "    ('sel', SelectKBest(chi2, k=5000)),  # feature selection\n",
    "    ('tfidf', TfidfTransformer()),  # weighting\n",
    "    ('learner', LinearSVC())  # learning algorithm\n",
    "])\n",
    "\n",
    "classifier = pipeline.fit(x_train, y_train)\n",
    "predictions = classifier.predict(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "accuracy = 0\n",
    "for prediction, correct in zip(predictions, y_test):\n",
    "    if prediction == correct:\n",
    "        accuracy += 1\n",
    "print(accuracy / len(predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spacy_tokenizer(\"I don't like it\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "# NB: unpickling this pipeline later requires spacy_tokenizer to be defined again,\n",
    "# because pickle only stores a reference to the custom analyzer function\n",
    "with open('movie_class_feats.pkl', mode='wb') as outfile:\n",
    "    pickle.dump(pipeline, outfile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('movie_class_feats.pkl', mode='rb') as infile:\n",
    "    pipeline = pickle.load(infile)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = pipeline.named_steps['vect']\n",
    "selector = pipeline.named_steps['sel']\n",
    "classifier = pipeline.named_steps['learner']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = tokenizer.get_feature_names()  # get_feature_names_out() in newer scikit-learn\n",
    "feats_w_score = list()\n",
    "for index, (selected, score) in enumerate(zip(selector.get_support(), selector.scores_)):\n",
    "    feats_w_score.append((score, selected, feature_names[index]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(feature_names)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted(feats_w_score, reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feats_w_classifier_weight = list()\n",
    "for index, weight in enumerate(selector.inverse_transform(classifier.coef_)[0]):\n",
    "    if weight != 0:\n",
    "        feats_w_classifier_weight.append((weight, feature_names[index]))"
   ]
  },
  {
   "cell_type": 
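"markdown",
   "metadata": {},
   "source": [
    "For a linear model such as `LinearSVC`, the learned weights double as feature importances: in this binary task, strongly positive weights pull predictions towards `classifier.classes_[1]` and strongly negative weights towards `classifier.classes_[0]`. The next two cells list every selected feature sorted by weight; the optional sketch below only previews the ten strongest features in each direction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional preview: the ten features with the largest positive and negative SVM weights.\n",
    "# Positive weights pull towards classifier.classes_[1], negative ones towards classifier.classes_[0].\n",
    "print(classifier.classes_)\n",
    "top_pos = sorted(feats_w_classifier_weight, reverse=True)[:10]\n",
    "top_neg = sorted(feats_w_classifier_weight)[:10]\n",
    "top_pos, top_neg"
   ]
  },
  {
   "cell_type": 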
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sorted(feats_w_classifier_weight,reverse=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sorted(feats_w_classifier_weight)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (TA)", "language": "python", "name": "ta" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 }