{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Classification with Sklearn"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading the IMDb dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "import urllib.request\n",
    "\n",
    "def download_file(url, local_file, force=False):\n",
    "    \"\"\"\n",
    "    Download `url` and store the response body as `local_file`.\n",
    "\n",
    "    Nothing is fetched when `local_file` already exists, unless `force`\n",
    "    is true. The data is streamed to `local_file + '.part'` and renamed\n",
    "    into place only after the transfer finished, so an interrupted\n",
    "    download cannot leave a truncated file that the next run would\n",
    "    report as 'Already downloaded'.\n",
    "    \"\"\"\n",
    "    if os.path.exists(local_file) and not force:\n",
    "        print('Already downloaded')\n",
    "        return\n",
    "    print('Downloading')\n",
    "    partial_file = local_file + '.part'\n",
    "    try:\n",
    "        with urllib.request.urlopen(url) as response, \\\n",
    "                open(partial_file, mode='wb') as outfile:\n",
    "            shutil.copyfileobj(response, outfile)\n",
    "        os.replace(partial_file, local_file)\n",
    "    finally:\n",
    "        # Still present only if the transfer failed before the rename.\n",
    "        if os.path.exists(partial_file):\n",
    "            os.remove(partial_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_file = 'imdb_train.txt'\n",
    "train_url = 'https://goo.gl/FPFnfh'\n",
    "test_file = 'imdb_test.txt'\n",
    "test_url = 'https://goo.gl/mg8bsD'\n",
    "delimiter = '\\t'\n",
    "\n",
    "# Alternative settings: uncomment this block (and comment out the one\n",
    "# above) to work with the news_* files instead.\n",
    "# train_file = 'news_train.txt'\n",
    "# train_url='https://goo.gl/vxHzTR'\n",
    "# test_file = 'news_test.txt'\n",
    "# test_url = 'https://goo.gl/Lz5xNS'\n",
    "# delimiter = ','\n",
    "\n",
    "download_file(train_url, train_file)\n",
    "download_file(test_url, test_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "def read_labeled_file(path, delimiter):\n",
    "    \"\"\"\n",
    "    Read a delimited text file and return two parallel lists: the first\n",
    "    column of every row (the text) and the second column (its label).\n",
    "    \"\"\"\n",
    "    texts = list()\n",
    "    labels = list()\n",
    "    with open(path, encoding='utf-8', newline='') as infile:\n",
    "        for row in csv.reader(infile, delimiter=delimiter):\n",
    "            if not row:\n",
    "                # csv.reader yields an empty list for a blank line\n",
    "                continue\n",
    "            texts.append(row[0])\n",
    "            labels.append(row[1])\n",
    "    return texts, labels\n",
    "\n",
    "x_train, y_train = read_labeled_file(train_file, delimiter)\n",
    "x_test, y_test = read_labeled_file(test_file, delimiter)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_train[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set(y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup of pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer\n",
    "from sklearn.feature_selection import SelectKBest, chi2\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.svm import LinearSVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vect = CountVectorizer()  # tokenization and feature extraction\n",
    "vect.fit(x_train)\n",
    "# The raw count matrices get their own names so that the feature\n",
    "# selection cell below can be re-run without consuming its own output.\n",
    "X_train_counts = vect.transform(x_train)\n",
    "X_test_counts = vect.transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_feature_names() was deprecated in scikit-learn 1.0 and removed in\n",
    "# 1.2 in favour of get_feature_names_out(); use whichever is available.\n",
    "# The vocabulary is resolved once here and reused by the cells below.\n",
    "if hasattr(vect, 'get_feature_names_out'):\n",
    "    feature_names = vect.get_feature_names_out().tolist()\n",
    "else:\n",
    "    feature_names = vect.get_feature_names()\n",
    "feature_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_counts[0,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names[60314]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column indices of the features that are non-zero in the first training\n",
    "# document. nonzero() reads them from the sparse row directly instead of\n",
    "# probing every vocabulary column one at a time.\n",
    "feats = sorted(X_train_counts[0,:].nonzero()[1].tolist())\n",
    "print(feats,len(feats))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print([feature_names[i] for i in feats])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sel = SelectKBest(chi2, k=5000)  # feature selection\n",
    "sel.fit(X_train_counts,y_train)\n",
    "# From here on X_train / X_test hold only the k selected columns.\n",
    "X_train = sel.transform(X_train_counts)\n",
    "X_test = sel.transform(X_test_counts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sel.get_support()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata":
{},
   "outputs": [],
   "source": [
    "X_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train[0,:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf = TfidfTransformer()  # weighting\n",
    "tfidf.fit(X_train)\n",
    "# Write to new names: re-running this cell must not apply the weighting\n",
    "# a second time to matrices that are already weighted.\n",
    "X_train_tfidf = tfidf.transform(X_train)\n",
    "X_test_tfidf = tfidf.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X_train_tfidf[0,:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# random_state is fixed so that repeated runs give the same model.\n",
    "learner = LinearSVC(random_state=0)  # learning algorithm\n",
    "classifier = learner.fit(X_train_tfidf,y_train)\n",
    "predictions = classifier.predict(X_test_tfidf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(predictions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation of accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def accuracy(predicted, gold):\n",
    "    \"\"\"\n",
    "    Return the fraction of items in `predicted` that equal the label at\n",
    "    the same position in `gold`.\n",
    "    \"\"\"\n",
    "    correct = sum(1 for prediction, true_label in zip(predicted, gold)\n",
    "                  if prediction == true_label)\n",
    "    return correct / len(predicted)\n",
    "\n",
    "print(accuracy(predictions, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using sklearn pipeline object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same four steps as above, chained in one estimator that works on the\n",
    "# raw texts.\n",
    "pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer()),  # feature extraction\n",
    "    ('sel', SelectKBest(chi2, k=5000)),  # feature selection\n",
    "    ('tfidf', TfidfTransformer()),  # weighting\n",
    "    ('learner', LinearSVC(random_state=0))  # learning algorithm\n",
    "])\n",
    "\n",
    "classifier = pipeline.fit(x_train,y_train)\n",
    "predictions = classifier.predict(x_test)\n",
    "print(accuracy(predictions, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "print('Classification report:')\n",
    "print(classification_report(y_test, predictions))\n",
    "print('Confusion matrix:')\n",
    "# Rows are the true labels, columns the predicted labels, both in sorted\n",
    "# label order.\n",
    "cm = confusion_matrix(y_test, predictions)\n",
    "print(cm)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (TA)",
   "language": "python",
   "name": "ta"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}