#!/usr/bin/python
"""find_signature.py -- prepare an overfitting-prone text-learning problem.

Loads the pre-processed email word data and author labels, splits them into
training and test sets, converts the text into TF-IDF feature matrices, and
then keeps only the first 150 training events so that the number of data
points is small compared with the number of features.
"""

import pickle

import numpy

### sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
### in 0.20; train_test_split now lives in sklearn.model_selection.  Prefer
### the new module and fall back to the old one on older installs.  The name
### `cross_validation` stays bound either way so existing call sites keep
### working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer

numpy.random.seed(42)


def _load_pickle(path):
    """Return the object stored in the pickle file at *path*.

    The file is opened in binary mode, because pickle data is bytes: text
    mode fails on Python 3 and can mangle binary pickles on Windows.  The
    `with` block closes the file as soon as loading finishes.

    NOTE: unpickling can execute arbitrary code, so only load files that
    you produced yourself.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


### The words (features) and authors (labels), already largely processed.
### These files should have been created from the previous (Lesson 10)
### mini-project.
words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"
word_data = _load_pickle(words_file)
authors = _load_pickle(authors_file)

### test_size is the fraction of events assigned to the test set (the
### remainder go into training)
features_train, features_test, labels_train, labels_test = (
    cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42
    )
)

### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here