# parse_out_email_text.py
  1. #!/usr/bin/python
  2. from nltk.stem.snowball import SnowballStemmer
  3. import string
  4. def parseOutText(f):
  5. """ given an opened email file f, parse out all text below the
  6. metadata block at the top
  7. (in Part 2, you will also add stemming capabilities)
  8. and return a string that contains all the words
  9. in the email (space-separated)
  10. example use case:
  11. f = open("email_file_name.txt", "r")
  12. text = parseOutText(f)
  13. """
  14. f.seek(0) ### go back to beginning of file (annoying)
  15. all_text = f.read()
  16. ### split off metadata
  17. content = all_text.split("X-FileName:")
  18. words = ""
  19. if len(content) > 1:
  20. ### remove punctuation
  21. text_string = content[1].translate(str.maketrans("", "", string.punctuation))
  22. ### project part 2: comment out the line below
  23. #words = text_string
  24. ### split the text string into individual words, stem each word,
  25. ### and append the stemmed word to words (make sure there's a single
  26. ### space between each stemmed word)
  27. str_list = text_string.split(" ")
  28. #print(str_list)
  29. #不知道为什么会有空的元素夹杂在里边,所以写个循环删掉空的元素
  30. for txt in str_list:
  31. if txt == "":
  32. str_list.remove("")
  33. stemmer = SnowballStemmer("english")
  34. for x in range(0,len(str_list)):
  35. str_list[x] = stemmer.stem(str_list[x])
  36. # 将一个列表转换为字符串并且用空格隔开
  37. words = " ".join(str_list)
  38. #words = str_list
  39. #print("The type is", type(str_list))
  40. return words
  41. def main():
  42. ff = open("../text_learning/test_email.txt", "r")
  43. text = parseOutText(ff)
  44. print(text)
  45. if __name__ == '__main__':
  46. main()