{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:00:22.776639Z",
     "start_time": "2019-08-27T15:00:18.961286Z"
    }
   },
   "outputs": [],
   "source": [
    "#!/usr/bin/python\n",
    "\n",
    "import pickle\n",
    "\n",
    "import numpy\n",
    "from sklearn import model_selection  # cross_validation\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Seed numpy's global RNG so the run is repeatable from a fresh kernel.\n",
    "numpy.random.seed(42)\n",
    "\n",
    "\n",
    "# The words (features) and authors (labels), already largely processed.\n",
    "# These files should have been created from the previous (Lesson 10)\n",
    "# mini-project.\n",
    "# words_file = \"word_data.pkl\"\n",
    "# authors_file = \"email_authors.pkl\"\n",
    "words_file = \"../text_learning/your_word_data.pkl\"\n",
    "authors_file = \"../text_learning/your_email_authors.pkl\"\n",
    "\n",
    "# NOTE(security): pickle.load can run arbitrary code from the file, so only\n",
    "# load pickles you produced yourself. The `with` blocks make sure each file\n",
    "# handle is closed as soon as it has been read.\n",
    "with open(words_file, \"rb\") as words_in:\n",
    "    word_data = pickle.load(words_in)\n",
    "with open(authors_file, \"rb\") as authors_in:\n",
    "    authors = pickle.load(authors_in)\n",
    "\n",
    "\n",
    "# test_size is the fraction of events assigned to the test set (the\n",
    "# remainder go into training)\n",
    "features_train, features_test, labels_train, labels_test = model_selection.train_test_split(\n",
    "    word_data, authors, test_size=0.1, random_state=42)\n",
    "\n",
    "# Fit the TF-IDF vocabulary on the training texts only, then reuse it for\n",
    "# the test texts.\n",
    "vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,\n",
    "                             stop_words='english')\n",
    "features_train = vectorizer.fit_transform(features_train)\n",
    "# Only the test matrix is converted to a dense array (originally done for\n",
    "# compatibility with classifier functions in versions 0.15.2 and earlier);\n",
    "# the training matrix is left as returned by fit_transform.\n",
    "features_test = vectorizer.transform(features_test).toarray()\n",
    "\n",
    "# get_feature_names() was removed in scikit-learn 1.2 in favour of\n",
    "# get_feature_names_out(); use whichever this installation provides and\n",
    "# keep `words` a plain list so it can be indexed by feature number.\n",
    "if hasattr(vectorizer, \"get_feature_names_out\"):\n",
    "    words = list(vectorizer.get_feature_names_out())\n",
    "else:\n",
    "    words = vectorizer.get_feature_names()\n",
    "\n",
    "# a classic way to overfit is to use a small number\n",
    "# of data points and a large number of features;\n",
    "# set N_TRAIN_EVENTS to a small number (e.g. 150) to put ourselves in this\n",
    "# regime. None keeps every training event, which is the current setting.\n",
    "N_TRAIN_EVENTS = None\n",
    "features_train = features_train[:N_TRAIN_EVENTS]\n",
    "labels_train = labels_train[:N_TRAIN_EVENTS]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:03:53.439131Z",
     "start_time": "2019-08-27T15:03:48.110785Z"
    },
    "run_control": {
     "marked": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "# Fit a decision tree on the TF-IDF training features and predict the\n",
    "# labels of the test set.\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "# Pin random_state so that re-running this cell rebuilds the same tree.\n",
    "# Without it the fit draws from numpy's global RNG, so a second run of the\n",
    "# cell can produce a different tree and a different accuracy.\n",
    "clf = DecisionTreeClassifier(min_samples_split=40, random_state=42)\n",
    "clf.fit(features_train, labels_train)\n",
    "pred = clf.predict(features_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:00:37.775144Z",
     "start_time": "2019-08-27T15:00:37.349169Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Acc: 99.26 %\n"
     ]
    }
   ],
   "source": [
    "# Score the fitted tree on the test split and show it as a percentage\n",
    "# rounded to two decimals.\n",
    "score_pct = round(clf.score(features_test, labels_test) * 100, 2)\n",
    "print(\"Acc:\", score_pct, \"%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-08-27T15:07:21.513721Z",
     "start_time": "2019-08-27T15:07:21.487146Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy:99.20%\n",
      "\n",
      "The most important feature's index: 22819\n",
      "\n",
      "Important features:\n",
      "feature no: 22819\n",
      "importance: 0.2542521680212961\n",
      "word: houectect\n"
     ]
    }
   ],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# Report the test accuracy of the predictions, the index of the single\n",
    "# most important feature, and every feature whose importance in the\n",
    "# fitted tree is above 0.2 together with the word it stands for.\n",
    "importances = clf.feature_importances_\n",
    "accuracy_pct = accuracy_score(labels_test, pred) * 100\n",
    "\n",
    "print(\"Accuracy:%.2f%s\" % (accuracy_pct, \"%\"))\n",
    "print(\"\\nThe most important feature's index:\", importances.argmax())\n",
    "print(\"\\nImportant features:\")\n",
    "\n",
    "flagged = [(feature_no, importance)\n",
    "           for feature_no, importance in enumerate(importances)\n",
    "           if importance > 0.2]\n",
    "for feature_no, importance in flagged:\n",
    "    print(\"feature no:\", feature_no)\n",
    "    print(\"importance:\", importance)\n",
    "    print(\"word:\", words[feature_no])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "position": {
    "height": "487px",
    "left": "385px",
    "right": "20px",
    "top": "98px",
    "width": "800px"
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}