outlier_removal_regression.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. #!/usr/bin/python
  2. from sklearn.model_selection import train_test_split
  3. import random
  4. import numpy
  5. import matplotlib.pyplot as plt
  6. import pickle
  7. import sys
  8. from outlier_cleaner import outlierCleaner
  9. ### load up some practice data with outliers in it
  10. # sys.path.append("/Users/l-zl/Desktop/ud120-projects/outliers/")
  11. # sys.path.append("/Users/l-zl/Desktop/ud120-projects/outliers/")
  12. ages = pickle.load(open("practice_outliers_ages.pkl", "r"))
  13. net_worths = pickle.load(open("practice_outliers_net_worths.pkl", "r"))
  14. ### ages and net_worths need to be reshaped into 2D numpy arrays
  15. ### second argument of reshape command is a tuple of integers: (n_rows, n_columns)
  16. ### by convention, n_rows is the number of data points
  17. ### and n_columns is the number of features
  18. ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
  19. net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))
  20. ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(
  21. ages, net_worths, test_size=0.1, random_state=42)
  22. ### fill in a regression here! Name the regression object reg so that
  23. ### the plotting code below works, and you can see what your regression looks like
  24. from sklearn import linear_model
  25. reg = linear_model.LinearRegression()
  26. reg.fit(ages_train,net_worths_train)
  27. print "The slope is", reg.coef_
  28. print "The score is",reg.score(ages_test,net_worths_test)
  29. try:
  30. plt.plot(ages, reg.predict(ages), color="blue")
  31. except NameError:
  32. pass
  33. plt.scatter(ages, net_worths)
  34. plt.show()
  35. ### identify and remove the most outlier-y points
  36. cleaned_data = []
  37. try:
  38. predictions = reg.predict(ages_train)
  39. cleaned_data = outlierCleaner(predictions, ages_train, net_worths_train)
  40. except NameError:
  41. print "your regression object doesn't exist, or isn't name reg"
  42. print "can't make predictions to use in identifying outliers"
  43. ### only run this code if cleaned_data is returning data
  44. if len(cleaned_data) > 0:
  45. ages, net_worths, errors = zip(*cleaned_data)
  46. ages = numpy.reshape(numpy.array(ages), (len(ages), 1))
  47. net_worths = numpy.reshape(numpy.array(net_worths), (len(net_worths), 1))
  48. ### refit your cleaned data!
  49. try:
  50. reg.fit(ages, net_worths)
  51. print "The cleaned data slope is",reg.coef_
  52. print "The cleaned data score is",reg.score(ages_test,net_worths_test)
  53. plt.plot(ages, reg.predict(ages), color="blue")
  54. except NameError:
  55. print "you don't seem to have regression imported/created,"
  56. print " or else your regression object isn't named reg"
  57. print " either way, only draw the scatter plot of the cleaned data"
  58. plt.scatter(ages, net_worths)
  59. plt.xlabel("ages")
  60. plt.ylabel("net worths")
  61. plt.show()
  62. else:
  63. print "outlierCleaner() is returning an empty list, no refitting to be done"