#!/usr/bin/python
"""poi_id.py -- starter script that builds and exports a "poi" classifier.

The script runs top to bottom as a flat sequence of tasks:

1. choose the feature names to use (the label "poi" must come first),
2. load the pickled dataset dictionary,
3. turn it into label / feature lists with the course helper functions,
4. create a classifier named ``clf``,
5. make a train/test split for local experiments,
6. dump the classifier, dataset and feature list for the grading script.
"""
import sys
import pickle

# The helper imported right below is presumably located in ../tools/ --
# confirm the relative path matches your checkout before running.
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary']  # You will need to use more features

### Load the dictionary containing the dataset
# Pickle data must be read from a file opened in binary mode ("rb"); text
# mode ("r") fails on Python 3 and is unreliable on Windows.
# NOTE(security): unpickling can execute arbitrary code -- only load a
# dataset file that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Task 2: Remove outliers

### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer sklearn.model_selection and fall back only on old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)