#!/usr/bin/python
"""
Starter code to process the emails from Sara and Chris to extract
the features and get the documents ready for classification.

The lists of all the emails from Sara and from Chris are read from
from_sara.txt and from_chris.txt respectively (one email path per line).

The actual documents are in the Enron email dataset, which
you downloaded/unpacked in Part 0 of the first mini-project. If you have
not obtained the Enron email corpus, run startup.py in the tools folder.

The data is stored in lists and packed away in pickle files at the end:
word_data goes to your_word_data.pkl and from_data goes to
your_email_authors.pkl.
"""

import os
import pickle
import re
import sys

### parse_out_email_text lives in ../tools/, so that directory has to be on
### the import path before the import below runs
sys.path.append( "../tools/" )
from parse_out_email_text import parseOutText

from_data = []
word_data = []

### temp_counter is a way to speed up the development--there are
### thousands of emails from Sara and Chris, so running over all of them
### can take a long time
### temp_counter helps you only look at the first emails in the list so you
### can iterate your modifications quicker
### NOTE: the counter is shared by both lists (it is never reset) and the
### check below is a strict "< 200", so while the limit is in place only the
### first 199 paths overall are opened
temp_counter = 0

### "with" guarantees both list files are closed even if an email fails to open
with open("from_sara.txt", "r") as from_sara, \
        open("from_chris.txt", "r") as from_chris:
    for name, from_person in [("sara", from_sara), ("chris", from_chris)]:
        for path in from_person:
            ### only look at first 200 emails when developing
            ### once everything is working, remove this line to run over full dataset
            temp_counter += 1
            if temp_counter < 200:
                ### strip only the line terminator; slicing off the last
                ### character would truncate a final line that has no newline
                path = os.path.join('..', path.rstrip("\r\n"))
                print(path)
                with open(path, "r") as email:
                    ### use parseOutText to extract the text from the opened email

                    ### use str.replace() to remove any instances of the words
                    ### ["sara", "shackleton", "chris", "germani"]

                    ### append the text to word_data

                    ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris

                    ### placeholder so the block is valid until the steps
                    ### above are filled in
                    pass

print("emails processed")

### pickle needs binary-mode files; "with" flushes and closes them
with open("your_word_data.pkl", "wb") as word_file:
    pickle.dump(word_data, word_file)
with open("your_email_authors.pkl", "wb") as author_file:
    pickle.dump(from_data, author_file)

### in Part 4, do TfIdf vectorization here