#!/usr/bin/python
  2. """ a basic script for importing student's POI identifier,
  3. and checking the results that they get from it
  4. requires that the algorithm, dataset, and features list
  5. be written to my_classifier.pkl, my_dataset.pkl, and
  6. my_feature_list.pkl, respectively
  7. that process should happen at the end of poi_id.py
  8. """
  9. import sys
  10. from sklearn.model_selection import StratifiedShuffleSplit
  11. sys.path.append("../tools/")
  12. from feature_format import featureFormat, targetFeatureSplit
  13. import pickle
  14. PERF_FORMAT_STRING = "\
  15. \tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
  16. Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
  17. RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
  18. \tFalse negatives: {:4d}\tTrue negatives: {:4d}"
  19. def test_classifier(clf, dataset, feature_list, folds=1000):
  20. data = featureFormat(dataset, feature_list, sort_keys=True)
  21. labels, features = targetFeatureSplit(data)
  22. cv = StratifiedShuffleSplit(labels, folds, random_state=42)
  23. true_negatives = 0
  24. false_negatives = 0
  25. true_positives = 0
  26. false_positives = 0
  27. for train_idx, test_idx in cv:
  28. features_train = []
  29. features_test = []
  30. labels_train = []
  31. labels_test = []
  32. for ii in train_idx:
  33. features_train.append(features[ii])
  34. labels_train.append(labels[ii])
  35. for jj in test_idx:
  36. features_test.append(features[jj])
  37. labels_test.append(labels[jj])
  38. # fit the classifier using training set, and test on test set
  39. clf.fit(features_train, labels_train)
  40. predictions = clf.predict(features_test)
  41. for prediction, truth in zip(predictions, labels_test):
  42. if prediction == 0 and truth == 0:
  43. true_negatives += 1
  44. elif prediction == 0 and truth == 1:
  45. false_negatives += 1
  46. elif prediction == 1 and truth == 0:
  47. false_positives += 1
  48. elif prediction == 1 and truth == 1:
  49. true_positives += 1
  50. else:
  51. print("Warning: Found a predicted label not == 0 or 1.")
  52. print("All predictions should take value 0 or 1.")
  53. print("Evaluating performance for processed predictions:")
  54. break
  55. try:
  56. total_predictions = true_negatives + \
  57. false_negatives + false_positives + true_positives
  58. accuracy = 1.0*(true_positives + true_negatives)/total_predictions
  59. precision = 1.0*true_positives/(true_positives+false_positives)
  60. recall = 1.0*true_positives/(true_positives+false_negatives)
  61. f1 = 2.0 * true_positives / \
  62. (2*true_positives + false_positives+false_negatives)
  63. f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
  64. print(clf)
  65. print(PERF_FORMAT_STRING.format(accuracy, precision,
  66. recall, f1, f2, display_precision=5))
  67. print(RESULTS_FORMAT_STRING.format(total_predictions,
  68. true_positives, false_positives, false_negatives, true_negatives))
  69. print("")
  70. except:
  71. print("Got a divide by zero when trying out:", clf)
  72. print("Precision or recall may be undefined due to a lack of true positive predicitons.")
  73. CLF_PICKLE_FILENAME = "my_classifier.pkl"
  74. DATASET_PICKLE_FILENAME = "my_dataset.pkl"
  75. FEATURE_LIST_FILENAME = "my_feature_list.pkl"
  76. def dump_classifier_and_data(clf, dataset, feature_list):
  77. with open(CLF_PICKLE_FILENAME, "wb+") as clf_outfile:
  78. pickle.dump(clf, clf_outfile)
  79. #clf_outfile.dump(clf, clf_outfile)
  80. with open(DATASET_PICKLE_FILENAME, "wb+") as dataset_outfile:
  81. pickle.dump(dataset, dataset_outfile)
  82. with open(FEATURE_LIST_FILENAME, "wb+") as featurelist_outfile:
  83. pickle.dump(feature_list, featurelist_outfile)
  84. def load_classifier_and_data():
  85. with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
  86. clf = pickle.load(clf_infile)
  87. with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
  88. dataset = pickle.load(dataset_infile)
  89. with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
  90. feature_list = pickle.load(featurelist_infile)
  91. return clf, dataset, feature_list
  92. def main():
  93. # load up student's classifier, dataset, and feature_list
  94. clf, dataset, feature_list = load_classifier_and_data()
  95. # Run testing script
  96. test_classifier(clf, dataset, feature_list)
  97. if __name__ == '__main__':
  98. main()