{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The type is \n", "['\\n\\nhi', 'everyon', 'if', 'you', 'can', 'read', 'this', 'messag', 'your', 'proper', 'use', 'parseouttext', 'pleas', 'proceed', 'to', 'the', 'next', 'part', 'of', 'the', 'project\\n']\n" ] } ], "source": [ "#!/usr/bin/python\n", "\n", "from nltk.stem.snowball import SnowballStemmer\n", "import string\n", "\n", "def parseOutText(f):\n", " \"\"\" given an opened email file f, parse out all text below the\n", " metadata block at the top\n", " (in Part 2, you will also add stemming capabilities)\n", " and return a string that contains all the words\n", " in the email (space-separated) \n", " \n", " example use case:\n", " f = open(\"email_file_name.txt\", \"r\")\n", " text = parseOutText(f)\n", " \n", " \"\"\"\n", "\n", "\n", " f.seek(0) ### go back to beginning of file (annoying)\n", " all_text = f.read()\n", "\n", " ### split off metadata\n", " content = all_text.split(\"X-FileName:\")\n", " words = \"\"\n", " if len(content) > 1:\n", " ### remove punctuation\n", " text_string = content[1].translate(str.maketrans(\"\", \"\", string.punctuation))\n", "\n", " ### project part 2: comment out the line below\n", " #words = text_string\n", "\n", " ### split the text string into individual words, stem each word,\n", " ### and append the stemmed word to words (make sure there's a single\n", " ### space between each stemmed word)\n", " str_list = text_string.split(\" \")\n", " #print(str_list)\n", " #不知道为什么会有空的元素夹杂在里边,所以写个循环删掉空的元素\n", " for txt in str_list:\n", " if txt == \"\":\n", " str_list.remove(\"\")\n", " \n", " stemmer = SnowballStemmer(\"english\")\n", " for x in range(0,len(str_list)):\n", " str_list[x] = stemmer.stem(str_list[x])\n", " \n", " # 将一个列表转换为字符串并且用空格隔开\n", " #words = \" \".join(str_list)\n", " words = str_list\n", " print(\"The type is\", type(str_list))\n", " \n", " return words\n", "\n", " \n", "\n", "def main():\n", " ff = open(\"../text_learning/test_email.txt\", \"r\")\n", " text = parseOutText(ff)\n", " print(text)\n", "\n", "\n", "\n", "if __name__ == '__main__':\n", " main()\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }