{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import re\n",
    "import sys\n",
    "import gc\n",
    "\n",
    "sys.path.append( \"../tools/\" )\n",
    "from parse_out_email_text import parseOutText"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "    Starter code to process the emails from Sara and Chris to extract\n",
    "    the features and get the documents ready for classification.\n",
    "\n",
    "    The list of all the emails from Sara are in the from_sara list\n",
    "    likewise for emails from Chris (from_chris)\n",
    "\n",
    "    The actual documents are in the Enron email dataset, which\n",
    "    you downloaded/unpacked in Part 0 of the first mini-project. If you have\n",
    "    not obtained the Enron email corpus, run startup.py in the tools folder.\n",
    "\n",
    "    The data is stored in lists and packed away in pickle files at the end."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "emails processed\n"
     ]
    }
   ],
   "source": [
    "from_sara  = open(\"from_sara.txt\", \"r\")\n",
    "from_chris = open(\"from_chris.txt\", \"r\")\n",
    "\n",
    "from_data = []\n",
    "word_data = []\n",
    "\n",
    "### temp_counter is a way to speed up the development--there are\n",
    "### thousands of emails from Sara and Chris, so running over all of them\n",
    "### can take a long time\n",
    "### temp_counter helps you only look at the first 200 emails in the list so you\n",
    "### can iterate your modifications quicker\n",
    "temp_counter = 0\n",
    "\n",
    "\n",
    "for name, from_person in [(\"sara\", from_sara), (\"chris\", from_chris)]:\n",
    "    for path in from_person:\n",
    "        ### only look at first 200 emails when developing\n",
    "        ### once everything is working, remove this line to run over full dataset\n",
    "        #temp_counter += 1\n",
    "        if temp_counter < 201:\n",
    "            path = os.path.join('..', path[:-1])\n",
    "            #print (path)\n",
    "            email = open(path, \"r\")\n",
    "            #print(email.read())\n",
    "            ### use parseOutText to extract the text from the opened email\n",
    "            words = parseOutText(email)\n",
    "            #words = re.sub(\"\\n\",\" \",words)\n",
    "            #words = re.sub(\"\\t\",\" \",words)\n",
    "            # 遍历十次，把 2 个和  2 个以上的空格全部替换成 1 个空格\n",
    "            #for num in range(0,10):\n",
    "                #words = re.sub(\"  \",\" \",words)\n",
    "            #print(words)\n",
    "            #print(type(words))\n",
    "            ### use str.replace() to remove any instances of the words\n",
    "            ### [\"sara\", \"shackleton\", \"chris\", \"germani\"]\n",
    "            name_list = [\"sara\", \"shackleton\", \"chris\", \"germani\",\"sshacklensf\",\"cgermannsf\"]\n",
    "            for word in name_list:            \n",
    "                words = words.replace(word, \"\") \n",
    "            del word\n",
    "            #gc.collect()\n",
    "            ### append the text to word_data\n",
    "            word_data.append(words)\n",
    "            #print(word_data)\n",
    "            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris\n",
    "            #print(name)\n",
    "            if name == \"sara\":from_data.append([0])\n",
    "            if name == \"chris\":from_data.append([1])\n",
    "            #print(from_data)\n",
    "            email.close()\n",
    "#print(from_data)\n",
    "print (\"emails processed\")\n",
    "from_sara.close()\n",
    "from_chris.close()\n",
    "\n",
    "pickle.dump( word_data, open(\"your_word_data.pkl\", \"wb\") )\n",
    "pickle.dump( from_data, open(\"your_email_authors.pkl\", \"wb\") )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tjonesnsf\n",
      "\n",
      "stephani and sam need nymex calendar\n"
     ]
    }
   ],
   "source": [
    "print(word_data[152])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "### in Part 4, do TfIdf vectorization here\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# 从 nltk 导入停止词\n",
    "stop_words = stopwords.words(\"english\")\n",
    "\n",
    "words = pickle.load(open(\"your_word_data.pkl\",\"rb\"))\n",
    "#words = word_data\n",
    "\"\"\"\n",
    "class sklearn.feature_extraction.text.TfidfVectorizer(input=’content’, encoding=’utf-8’,\n",
    "decode_error=’strict’, strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None,\n",
    "analyzer=’word’, stop_words=None, token_pattern=’(?u)\\b\\w\\w+\\b’, ngram_range=(1, 1), max_df=1.0, \n",
    "min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class ‘numpy.float64’>, norm=’l2’, \n",
    "use_idf=True, smooth_idf=True, sublinear_tf=False)[source]¶\n",
    "\"\"\"\n",
    "vectorizer = TfidfVectorizer(stop_words=\"english\", max_df=0.5,sublinear_tf=True)\n",
    "X = vectorizer.fit_transform(words)\n",
    "#print(vectorizer.get_feature_names())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "这里课程要求要打印出有多少个特征词，以及提取出第 34597 个特征词是什么，此处输出的结果全都不对。\n",
    "\n",
    "没有办法知道原因和正确答案，系统提示这里的结果大致正确但是有一些偏差，**我认为问题是在 Stop——words 上，**\n",
    "因为上次在学习使用 NLTK 导入 Stop Words 的时候，**教程中输出了 127 个停止词，而我的 NLTK 输出了更多，\n",
    "有 179 个**，这里应该是差异的主要来源，但是那 127 个停止词我也不知道是什么，所以应该就没法还原出教程中的答案了。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The lenth of stop words 179\n",
      "The shape of words' feature is (17578, 42283)\n",
      "The 34597th feature words is repswarranti\n"
     ]
    }
   ],
   "source": [
    "print(\"The lenth of stop words\", len(stop_words))\n",
    "print(\"The shape of words' feature is\",X.shape)\n",
    "\n",
    "feature_names = vectorizer.get_feature_names()\n",
    "print(\"The 34597th feature words is\", feature_names[34596])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "varInspector": {
   "cols": {
    "lenName": 16,
    "lenType": 16,
    "lenVar": 40
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}