{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature selection: most important word in a decision tree\n",
    "\n",
    "Loads the pickled word data and author labels, vectorizes the text with TF-IDF, fits a decision tree, and lists the words whose feature importance exceeds a threshold."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:00:22.776639Z",
     "start_time": "2019-08-27T15:00:18.961286Z"
    }
   },
   "outputs": [],
   "source": [
    "import pickle\n",
    "\n",
    "import numpy\n",
    "from sklearn import model_selection  # successor of the old cross_validation module\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "# One seed shared by numpy, the train/test split and the classifier.\n",
    "SEED = 42\n",
    "numpy.random.seed(SEED)\n",
    "\n",
    "# The words (features) and authors (labels), already largely processed.\n",
    "# These files should have been created from the previous (Lesson 10)\n",
    "# mini-project.\n",
    "words_file = \"../text_learning/your_word_data.pkl\"\n",
    "authors_file = \"../text_learning/your_email_authors.pkl\"\n",
    "\n",
    "# NOTE: unpickling can execute arbitrary code, so only load files you\n",
    "# generated yourself. The context managers close the file handles, which\n",
    "# a bare pickle.load(open(...)) leaves open.\n",
    "with open(words_file, \"rb\") as words_handle:\n",
    "    word_data = pickle.load(words_handle)\n",
    "with open(authors_file, \"rb\") as authors_handle:\n",
    "    authors = pickle.load(authors_handle)\n",
    "\n",
    "# test_size is the fraction of events assigned to the test set (the\n",
    "# remainder go into training)\n",
    "features_train, features_test, labels_train, labels_test = model_selection.train_test_split(\n",
    "    word_data, authors, test_size=0.1, random_state=SEED)\n",
    "\n",
    "vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,\n",
    "                             stop_words='english')\n",
    "features_train = vectorizer.fit_transform(features_train)\n",
    "# Only the test matrix is converted to a dense array; the training matrix\n",
    "# is left as returned by fit_transform.\n",
    "features_test = vectorizer.transform(features_test).toarray()\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2; prefer its\n",
    "# replacement get_feature_names_out() and fall back on older releases.\n",
    "if hasattr(vectorizer, \"get_feature_names_out\"):\n",
    "    words = vectorizer.get_feature_names_out()\n",
    "else:\n",
    "    words = vectorizer.get_feature_names()\n",
    "\n",
    "# A classic way to overfit is to use a small number of data points and a\n",
    "# large number of features. TRAIN_LIMIT = None keeps every training event\n",
    "# (what this notebook has been doing); set it to e.g. 150 to train on only\n",
    "# the first 150 events and put ourselves in the overfitting regime.\n",
    "TRAIN_LIMIT = None\n",
    "features_train = features_train[:TRAIN_LIMIT]\n",
    "labels_train = labels_train[:TRAIN_LIMIT]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:03:53.439131Z",
     "start_time": "2019-08-27T15:03:48.110785Z"
    },
    "run_control": {
     "marked": true
    }
   },
   "outputs": [],
   "source": [
    "# Fit the decision tree. random_state is fixed so that re-running this\n",
    "# cell rebuilds the same tree instead of one that depends on how far the\n",
    "# global numpy random state has advanced.\n",
    "clf = DecisionTreeClassifier(min_samples_split=40, random_state=SEED)\n",
    "clf.fit(features_train, labels_train)\n",
    "pred = clf.predict(features_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:00:37.775144Z",
     "start_time": "2019-08-27T15:00:37.349169Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Acc: 99.26 %\n"
     ]
    }
   ],
   "source": [
    "# Mean accuracy of the fitted tree on the held-out test set.\n",
    "test_accuracy = clf.score(features_test, labels_test)\n",
    "print(\"Acc:\", round(test_accuracy * 100, 2), \"%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:07:21.513721Z",
     "start_time": "2019-08-27T15:07:21.487146Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy:99.20%\n",
      "\n",
      "The most important feature's index: 22819\n",
      "\n",
      "Important features:\n",
      "feature no: 22819\n",
      "importance: 0.2542521680212961\n",
      "word: houectect\n"
     ]
    }
   ],
   "source": [
    "# Features whose importance exceeds this value are listed below.\n",
    "IMPORTANCE_THRESHOLD = 0.2\n",
    "\n",
    "print(\"Accuracy:%.2f%%\" % (accuracy_score(labels_test, pred) * 100))\n",
    "print(\"\\nThe most important feature's index:\",\n",
    "      clf.feature_importances_.argmax())\n",
    "print(\"\\nImportant features:\")\n",
    "# words[index] maps a feature column back to the vocabulary term it counts.\n",
    "for index, importance in enumerate(clf.feature_importances_):\n",
    "    if importance > IMPORTANCE_THRESHOLD:\n",
    "        print(\"feature no:\", index)\n",
    "        print(\"importance:\", importance)\n",
    "        print(\"word:\", words[index])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "487px",
    "left": "385px",
    "right": "20px",
    "top": "98px",
    "width": "800px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}