|
|
@@ -0,0 +1,227 @@
|
|
|
+{
|
|
|
+ "cells": [
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 1,
|
|
|
+ "metadata": {
|
|
|
+ "ExecuteTime": {
|
|
|
+ "end_time": "2019-08-11T07:24:24.912061Z",
|
|
|
+ "start_time": "2019-08-11T07:24:24.894997Z"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "#!/usr/bin/python\n",
|
|
|
+ "import random\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "def makeTerrainData(n_points=100000):\n",
|
|
|
+ " ###############################################################################\n",
|
|
|
+ " # make the toy dataset\n",
|
|
|
+ " random.seed(42)\n",
|
|
|
+ " grade = [random.random() for ii in range(0, n_points)]\n",
|
|
|
+ " bumpy = [random.random() for ii in range(0, n_points)]\n",
|
|
|
+ " error = [random.random() for ii in range(0, n_points)]\n",
|
|
|
+ " y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii])\n",
|
|
|
+ " for ii in range(0, n_points)]\n",
|
|
|
+ " for ii in range(0, len(y)):\n",
|
|
|
+ " if grade[ii] > 0.8 or bumpy[ii] > 0.8:\n",
|
|
|
+ " y[ii] = 1.0\n",
|
|
|
+ "\n",
|
|
|
+ "# split into train/test sets\n",
|
|
|
+ " X = [[gg, ss] for gg, ss in zip(grade, bumpy)]\n",
|
|
|
+ " split = int(0.75*n_points)\n",
|
|
|
+ " X_train = X[0:split]\n",
|
|
|
+ " X_test = X[split:]\n",
|
|
|
+ " y_train = y[0:split]\n",
|
|
|
+ " y_test = y[split:]\n",
|
|
|
+ "\n",
|
|
|
+ " grade_sig = [X_train[ii][0]\n",
|
|
|
+ " for ii in range(0, len(X_train)) if y_train[ii] == 0]\n",
|
|
|
+ " bumpy_sig = [X_train[ii][1]\n",
|
|
|
+ " for ii in range(0, len(X_train)) if y_train[ii] == 0]\n",
|
|
|
+ " grade_bkg = [X_train[ii][0]\n",
|
|
|
+ " for ii in range(0, len(X_train)) if y_train[ii] == 1]\n",
|
|
|
+ " bumpy_bkg = [X_train[ii][1]\n",
|
|
|
+ " for ii in range(0, len(X_train)) if y_train[ii] == 1]\n",
|
|
|
+ "\n",
|
|
|
+ " training_data = {\"fast\": {\"grade\": grade_sig, \"bumpiness\": bumpy_sig}, \"slow\": {\n",
|
|
|
+ " \"grade\": grade_bkg, \"bumpiness\": bumpy_bkg}}\n",
|
|
|
+ "\n",
|
|
|
+ " grade_sig = [X_test[ii][0]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 0]\n",
|
|
|
+ " bumpy_sig = [X_test[ii][1]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 0]\n",
|
|
|
+ " grade_bkg = [X_test[ii][0]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 1]\n",
|
|
|
+ " bumpy_bkg = [X_test[ii][1]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 1]\n",
|
|
|
+ "\n",
|
|
|
+ " test_data = {\"fast\": {\"grade\": grade_sig, \"bumpiness\": bumpy_sig},\n",
|
|
|
+ " \"slow\": {\"grade\": grade_bkg, \"bumpiness\": bumpy_bkg}}\n",
|
|
|
+ "\n",
|
|
|
+ " return X_train, y_train, X_test, y_test"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 2,
|
|
|
+ "metadata": {
|
|
|
+ "ExecuteTime": {
|
|
|
+ "end_time": "2019-08-11T07:24:25.888057Z",
|
|
|
+ "start_time": "2019-08-11T07:24:25.874599Z"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "#!/usr/bin/python\n",
|
|
|
+ "\n",
|
|
|
+ "import base64\n",
|
|
|
+ "import json\n",
|
|
|
+ "import matplotlib.pyplot as plt\n",
|
|
|
+ "import numpy as np\n",
|
|
|
+ "import pylab as pl\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "def prettyPicture(clf, X_test, y_test):\n",
|
|
|
+ " x_min = 0.0\n",
|
|
|
+ " x_max = 1.0\n",
|
|
|
+ " y_min = 0.0\n",
|
|
|
+ " y_max = 1.0\n",
|
|
|
+ "\n",
|
|
|
+ " # Plot the decision boundary. For that, we will assign a color to each\n",
|
|
|
+ " # point in the mesh [x_min, m_max]x[y_min, y_max].\n",
|
|
|
+ " h = .01 # step size in the mesh\n",
|
|
|
+ " xx, yy = np.meshgrid(np.arange(x_min, x_max, h),\n",
|
|
|
+ " np.arange(y_min, y_max, h))\n",
|
|
|
+ " Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
|
|
|
+ "\n",
|
|
|
+ " # Put the result into a color plot\n",
|
|
|
+ " Z = Z.reshape(xx.shape)\n",
|
|
|
+ " plt.xlim(xx.min(), xx.max())\n",
|
|
|
+ " plt.ylim(yy.min(), yy.max())\n",
|
|
|
+ "\n",
|
|
|
+ " plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)\n",
|
|
|
+ "\n",
|
|
|
+ " # Plot also the test points\n",
|
|
|
+ " grade_sig = [X_test[ii][0]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 0]\n",
|
|
|
+ " bumpy_sig = [X_test[ii][1]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 0]\n",
|
|
|
+ " grade_bkg = [X_test[ii][0]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 1]\n",
|
|
|
+ " bumpy_bkg = [X_test[ii][1]\n",
|
|
|
+ " for ii in range(0, len(X_test)) if y_test[ii] == 1]\n",
|
|
|
+ "\n",
|
|
|
+ " plt.scatter(grade_sig, bumpy_sig, color=\"b\", label=\"fast\")\n",
|
|
|
+ " plt.scatter(grade_bkg, bumpy_bkg, color=\"r\", label=\"slow\")\n",
|
|
|
+ " plt.legend()\n",
|
|
|
+ " plt.xlabel(\"bumpiness\")\n",
|
|
|
+ " plt.ylabel(\"grade\")\n",
|
|
|
+ "\n",
|
|
|
+ " #plt.savefig(\"test.png\",dpi=300)\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "def output_image(name, format, bytes):\n",
|
|
|
+ " image_start = \"BEGIN_IMAGE_f9825uweof8jw9fj4r8\"\n",
|
|
|
+ " image_end = \"END_IMAGE_0238jfw08fjsiufhw8frs\"\n",
|
|
|
+ " data = {}\n",
|
|
|
+ " data['name'] = name\n",
|
|
|
+ " data['format'] = format\n",
|
|
|
+ " data['bytes'] = base64.encodestring(bytes)\n",
|
|
|
+ " print(image_start+json.dumps(data)+image_end)"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 3,
|
|
|
+ "metadata": {
|
|
|
+ "ExecuteTime": {
|
|
|
+ "end_time": "2019-08-11T07:24:27.027564Z",
|
|
|
+ "start_time": "2019-08-11T07:24:27.022499Z"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [],
|
|
|
+ "source": [
|
|
|
+ "import matplotlib.pyplot as plt\n",
|
|
|
+ "%matplotlib inline"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": 4,
|
|
|
+ "metadata": {
|
|
|
+ "ExecuteTime": {
|
|
|
+ "end_time": "2019-08-11T07:24:29.714375Z",
|
|
|
+ "start_time": "2019-08-11T07:24:28.724489Z"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "outputs": [
|
|
|
+ {
|
|
|
+ "name": "stdout",
|
|
|
+ "output_type": "stream",
|
|
|
+ "text": [
|
|
|
+ "Training data lenth is 75000\n",
|
|
|
+ "training time: 0.072 s\n",
|
|
|
+ "predicting time: 0.679 s\n",
|
|
|
+ "23902 25000\n",
|
|
|
+ "The accuracy is 95.608 %\n"
|
|
|
+ ]
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "source": [
|
|
|
+ "from time import time\n",
|
|
|
+ "from sklearn import neighbors\n",
|
|
|
+ "\n",
|
|
|
+ "features_train, labels_train, features_test, labels_test = makeTerrainData()\n",
|
|
|
+ "print(\"Training data lenth is\", len(features_train))\n",
|
|
|
+ "\n",
|
|
|
+ "\n",
|
|
|
+ "clf = neighbors.KNeighborsClassifier()\n",
|
|
|
+ "t0 = time()\n",
|
|
|
+ "clf = clf.fit(features_train, labels_train)\n",
|
|
|
+ "print(\"training time:\", round(time() - t0, 3), \"s\")\n",
|
|
|
+ "\n",
|
|
|
+ "t1 = time()\n",
|
|
|
+ "result = clf.predict(features_test)\n",
|
|
|
+ "print(\"predicting time:\", round(time() - t1, 3), \"s\")\n",
|
|
|
+ "corr = 0\n",
|
|
|
+ "for x in range(0, len(result)):\n",
|
|
|
+ " if result[x] == labels_test[x]:\n",
|
|
|
+ " corr = corr + 1\n",
|
|
|
+ "print(corr, len(result))\n",
|
|
|
+ "num = float(len(result))\n",
|
|
|
+ "acc = round(corr/num*100, 3)\n",
|
|
|
+ "print(\"The accuracy is\", acc, \"%\")\n"
|
|
|
+ ]
|
|
|
+ },
|
|
|
+ {
|
|
|
+ "cell_type": "code",
|
|
|
+ "execution_count": null,
|
|
|
+ "metadata": {},
|
|
|
+ "outputs": [],
|
|
|
+ "source": []
|
|
|
+ }
|
|
|
+ ],
|
|
|
+ "metadata": {
|
|
|
+ "kernelspec": {
|
|
|
+ "display_name": "Python 3",
|
|
|
+ "language": "python",
|
|
|
+ "name": "python3"
|
|
|
+ },
|
|
|
+ "language_info": {
|
|
|
+ "codemirror_mode": {
|
|
|
+ "name": "ipython",
|
|
|
+ "version": 3
|
|
|
+ },
|
|
|
+ "file_extension": ".py",
|
|
|
+ "mimetype": "text/x-python",
|
|
|
+ "name": "python",
|
|
|
+ "nbconvert_exporter": "python",
|
|
|
+ "pygments_lexer": "ipython3",
|
|
|
+ "version": "3.7.3"
|
|
|
+ }
|
|
|
+ },
|
|
|
+ "nbformat": 4,
|
|
|
+ "nbformat_minor": 4
|
|
|
+}
|