{ "cells": [ { "cell_type": "markdown", "id": "879144d9", "metadata": {}, "source": [ "### Load MNIST dataset" ] },
{ "cell_type": "code", "execution_count": 1, "id": "bd032860", "metadata": {}, "outputs": [], "source": [ "# Python ≥3.6 is required (the cells below use f-strings)\n", "import sys\n", "assert sys.version_info >= (3, 6)" ] },
{ "cell_type": "code", "execution_count": 2, "id": "30da011c", "metadata": {}, "outputs": [], "source": [ "# scikit-learn ≥0.20 is required\n", "import re\n", "import sklearn\n", "# compare numerically: as plain strings \"0.9\" would sort above \"0.20\"\n", "assert tuple(map(int, re.findall(r\"\\d+\", sklearn.__version__)[:2])) >= (0, 20)" ] },
{ "cell_type": "code", "execution_count": 3, "id": "4f555050", "metadata": {}, "outputs": [], "source": [ "# common imports\n", "import numpy as np" ] },
{ "cell_type": "code", "execution_count": 4, "id": "e4de4331", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "sklearn.utils.Bunch" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import function to scikit-learn datasets\n", "from sklearn.datasets import fetch_openml\n", "\n", "# load specified dataset (MNIST)\n", "mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n", "\n", "# print type of dataset\n", "type(mnist)" ] },
{ "cell_type": "code", "execution_count": 5, "id": "b5221963", "metadata": {}, "outputs": [], "source": [ "X, y = mnist[\"data\"], mnist[\"target\"]" ] },
{ "cell_type": "markdown", "id": "811db75a", "metadata": {}, "source": [ "### labels to int" ] },
{ "cell_type": "code", "execution_count": 6, "id": "2bcc19ad", "metadata": {}, "outputs": [], "source": [ "# import plotting libraries\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt" ] },
{ "cell_type": "code", "execution_count": 7, "id": "cc4b728f", "metadata": {}, "outputs": [], "source": [ "# convert string labels to int\n", "y = y.astype(np.uint8)" ] },
{ "cell_type": "markdown", "id": "d7113df3", "metadata": {}, "source": [ "### Prepare data for machine learning" ] },
{ "cell_type": "markdown", "id": "570f328e", "metadata": {}, "source": [ "### 
Identify Train Set and Test Set" ] }, { "cell_type": "code", "execution_count": 8, "id": "80e1ca03", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X_train: 56000, (56000, 784)\n", "X_test: 14000, (14000, 784)\n", "y_train: 56000, (56000,)\n", "y_test: 14000, (14000,)\n" ] } ], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=177013)\n", "\n", "print(f\"X_train: {len(X_train)}, {X_train.shape}\")\n", "print(f\"X_test: {len(X_test)}, {X_test.shape}\")\n", "print(f\"y_train: {len(y_train)}, {y_train.shape}\")\n", "print(f\"y_test: {len(y_test)}, {y_test.shape}\")" ] },
{ "cell_type": "markdown", "id": "ade8a1f6", "metadata": {}, "source": [ "## Pipeline Declaration" ] },
{ "cell_type": "code", "execution_count": 9, "id": "bc5896c2", "metadata": {}, "outputs": [], "source": [
 "from sklearn.pipeline import Pipeline\n",
 "from sklearn.decomposition import PCA, KernelPCA\n",
 "from sklearn.preprocessing import (StandardScaler, \n",
 " MinMaxScaler, \n",
 " MaxAbsScaler,\n",
 " PowerTransformer,\n",
 " Binarizer)\n",
 "from sklearn.neighbors import KNeighborsClassifier\n",
 "# cross_val_predict is needed by cv_train / cv_test below\n",
 "from sklearn.model_selection import cross_validate, cross_val_predict\n",
 "from sklearn.metrics import classification_report, accuracy_score\n",
 "\n",
 "n_neighbors = 3\n",
 "metric = 'euclidean'\n",
 "weights = 'distance'\n",
 "n95_components = 0.95\n",
 "n99_components = 0.99\n",
 "\n",
 "names = []\n",
 "classifiers = []\n",
 "accuracies = []" ] },
{ "cell_type": "markdown", "id": "9e905584", "metadata": {}, "source": [ "# Crossvalidation" ] },
{ "cell_type": "code", "execution_count": 10, "id": "bbbb447c", "metadata": {}, "outputs": [], "source": [
 "# not this\n",
 "def cv_train(num,cv):\n",
 "    \"\"\"Report cross-validated predictions of pipeline `num` on the training set.\n",
 "\n",
 "    `cv` is forwarded to cross_val_predict. Prints the accuracy and the\n",
 "    classification report; returns the accuracy in percent.\n",
 "    \"\"\"\n",
 "    name = names[num]\n",
 "    clf = classifiers[num]\n",
 "    y_train_pred = cross_val_predict(clf, X_train, y_train, cv=cv, n_jobs=-1)\n",
 "    accuracy = accuracy_score(y_train, y_train_pred, normalize=True)*100\n",
 "    print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
 "    print(classification_report(y_train, y_train_pred))\n",
 "    return accuracy" ] },
{ "cell_type": "code", "execution_count": 11, "id": "4a8240c4", "metadata": {}, "outputs": [], "source": [
 "# also not this\n",
 "def cv_test(num):\n",
 "    \"\"\"Report 5-fold cross-validated predictions of pipeline `num` on the test set.\n",
 "\n",
 "    Prints the accuracy and the classification report; returns the accuracy in percent.\n",
 "    \"\"\"\n",
 "    name = names[num]\n",
 "    clf = classifiers[num]\n",
 "    y_test_pred = cross_val_predict(clf, X_test, y_test, cv=5, n_jobs=-1)\n",
 "    accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100\n",
 "    print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
 "    print(classification_report(y_test, y_test_pred))\n",
 "    return accuracy" ] },
{ "cell_type": "code", "execution_count": 12, "id": "f397cf42", "metadata": {}, "outputs": [], "source": [
 "import pandas as pd\n",
 "\n",
 "# this\n",
 "def cv(num,cv_arg=10):\n",
 "    \"\"\"Fit pipeline `num` on the training set and score it on the held-out test set.\n",
 "\n",
 "    Also runs a `cv_arg`-fold cross_validate on the training set and prints its\n",
 "    per-fold table below the test-set classification report.\n",
 "    Returns the test-set accuracy in percent.\n",
 "    \"\"\"\n",
 "    name = names[num]\n",
 "    clf = classifiers[num]\n",
 "    clf = clf.fit(X_train, y_train)\n",
 "    # named cv_results so the local no longer shadows this function's own name\n",
 "    cv_results = cross_validate(clf, X_train, y_train, cv=cv_arg, n_jobs=-1)\n",
 "    best_fold = np.argmax(cv_results['test_score'])\n",
 "    y_test_pred = clf.predict(X_test)\n",
 "    accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100\n",
 "    # the second figure is the fit_time of the fold with the highest test_score\n",
 "    print(f\"Pipeline: {name} ({accuracy:.4f}%) ({cv_results['fit_time'][best_fold]:.4})\")\n",
 "    print(classification_report(y_test, y_test_pred))\n",
 "    print(pd.DataFrame.from_dict(cv_results))\n",
 "    return accuracy" ] },
{ "cell_type": "markdown", "id": "a543706f", "metadata": {}, "source": [ "# Fitting" ] },
{ "cell_type": "code", "execution_count": 13, "id": "45452ceb", "metadata": {}, "outputs": [], "source": [ "op_kNN = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1, metric=metric, weights=weights)" ] },
{ "cell_type": "code", "execution_count": 14, "id": "03c01cd0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: knn (baseline) (97.3071%) (0.1162)\n", " precision recall f1-score support\n", "\n", 
" 0 0.98 1.00 0.99 1359\n", " 1 0.97 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.96 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.94 0.96 1411\n", " 9 0.96 0.97 0.96 1449\n", "\n", " accuracy 0.97 14000\n", " macro avg 0.97 0.97 0.97 14000\n", "weighted avg 0.97 0.97 0.97 14000\n", "\n", " fit_time score_time test_score\n", "0 0.125081 9.229141 0.972143\n", "1 0.123761 8.918457 0.971429\n", "2 0.125360 9.193326 0.971607\n", "3 0.119904 8.944645 0.970714\n", "4 0.244280 9.182126 0.972143\n", "5 0.233787 9.137941 0.972857\n", "6 0.123714 9.257920 0.975179\n", "7 0.116261 9.135718 0.972500\n", "8 0.116210 9.125259 0.975357\n", "9 0.141345 9.308920 0.973393\n" ] } ], "source": [ "names.append('knn (baseline)')\n", "classifiers.append(Pipeline([\n", " ('knn', op_kNN)\n", "]))\n", " \n", "accuracies.append(cv(0))" ] }, { "cell_type": "code", "execution_count": 15, "id": "18f02d0c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: scalar+knn (94.8000%) (1.146)\n", " precision recall f1-score support\n", "\n", " 0 0.97 0.99 0.98 1359\n", " 1 0.97 0.99 0.98 1594\n", " 2 0.95 0.93 0.94 1369\n", " 3 0.94 0.95 0.95 1415\n", " 4 0.94 0.93 0.94 1373\n", " 5 0.93 0.91 0.92 1257\n", " 6 0.96 0.97 0.97 1351\n", " 7 0.94 0.95 0.94 1422\n", " 8 0.96 0.91 0.93 1411\n", " 9 0.92 0.94 0.93 1449\n", "\n", " accuracy 0.95 14000\n", " macro avg 0.95 0.95 0.95 14000\n", "weighted avg 0.95 0.95 0.95 14000\n", "\n", " fit_time score_time test_score\n", "0 0.758668 8.452150 0.949643\n", "1 0.885169 8.631167 0.945000\n", "2 1.160197 8.948529 0.949107\n", "3 1.353553 8.502009 0.942500\n", "4 0.912583 9.036203 0.948393\n", "5 0.833251 9.013008 0.945179\n", "6 0.886441 8.995505 0.943214\n", "7 0.935067 8.891803 0.946607\n", "8 1.146215 8.633170 0.951607\n", "9 1.234975 8.846080 0.947143\n" ] } ], "source": [ "names.append('scalar+knn') \n", 
"classifiers.append(Pipeline([\n", " ('standard', StandardScaler()),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(1,10)) # Pipeline: standard+knn (94.3714%)" ] }, { "cell_type": "code", "execution_count": 16, "id": "b2e7ee09", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: minmax+knn (97.3071%) (0.5789)\n", " precision recall f1-score support\n", "\n", " 0 0.98 1.00 0.99 1359\n", " 1 0.97 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.96 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.94 0.96 1411\n", " 9 0.96 0.97 0.96 1449\n", "\n", " accuracy 0.97 14000\n", " macro avg 0.97 0.97 0.97 14000\n", "weighted avg 0.97 0.97 0.97 14000\n", "\n", " fit_time score_time test_score\n", "0 0.389049 8.747383 0.972143\n", "1 0.385594 9.063268 0.971429\n", "2 0.527658 8.926810 0.971786\n", "3 0.564983 8.949160 0.970714\n", "4 0.607976 9.087395 0.972143\n", "5 0.689083 8.948745 0.972857\n", "6 0.578933 8.950404 0.975179\n", "7 0.647489 9.248726 0.972679\n", "8 0.634490 9.124195 0.975179\n", "9 0.619997 8.861808 0.973393\n" ] } ], "source": [ "names.append('minmax+knn')\n", "classifiers.append(Pipeline([\n", " ('minmax', MinMaxScaler()),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(2))" ] }, { "cell_type": "code", "execution_count": 17, "id": "23ae34c3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: standard+pca95+knn (95.3929%) (9.151)\n", " precision recall f1-score support\n", "\n", " 0 0.97 0.99 0.98 1359\n", " 1 0.98 0.99 0.98 1594\n", " 2 0.96 0.94 0.95 1369\n", " 3 0.94 0.96 0.95 1415\n", " 4 0.95 0.94 0.95 1373\n", " 5 0.94 0.92 0.93 1257\n", " 6 0.96 0.97 0.97 1351\n", " 7 0.95 0.96 0.95 1422\n", " 8 0.97 0.92 0.94 1411\n", " 9 0.92 0.94 0.93 1449\n", "\n", " accuracy 0.95 14000\n", " macro avg 0.95 0.95 0.95 14000\n", "weighted avg 0.95 0.95 
0.95 14000\n", "\n", " fit_time score_time test_score\n", "0 7.882113 6.468349 0.953036\n", "1 8.902088 5.919568 0.950357\n", "2 8.750770 5.959131 0.953571\n", "3 7.767436 6.355160 0.945179\n", "4 9.307496 5.559806 0.952500\n", "5 7.845258 6.534233 0.950000\n", "6 8.874461 5.844108 0.948214\n", "7 7.704501 6.488686 0.950714\n", "8 9.151180 5.622928 0.955893\n", "9 7.257694 6.649508 0.953571\n" ] } ], "source": [ "names.append('standard+pca95+knn')\n", "classifiers.append(Pipeline([\n", " ('standard', StandardScaler()),\n", " ('pca95', PCA(n_components=n95_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(3,10)) # Pipeline: standard+pca95+knn (95.0500%)" ] }, { "cell_type": "code", "execution_count": 18, "id": "cac23616", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: minmax+pca95+knn (97.5500%) (8.162)\n", " precision recall f1-score support\n", "\n", " 0 0.98 1.00 0.99 1359\n", " 1 0.98 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.97 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.95 0.97 1411\n", " 9 0.96 0.97 0.97 1449\n", "\n", " accuracy 0.98 14000\n", " macro avg 0.98 0.98 0.98 14000\n", "weighted avg 0.98 0.98 0.98 14000\n", "\n", " fit_time score_time test_score\n", "0 8.170779 5.294063 0.973750\n", "1 8.088926 5.365682 0.972857\n", "2 6.857623 5.776273 0.974286\n", "3 7.262239 5.806243 0.973036\n", "4 8.057861 5.374735 0.974464\n", "5 7.276100 5.745563 0.975179\n", "6 8.202060 5.424418 0.975893\n", "7 8.114796 5.319392 0.975357\n", "8 8.162190 5.424424 0.976607\n", "9 7.437983 5.727555 0.975536\n" ] } ], "source": [ "names.append('minmax+pca95+knn')\n", "classifiers.append(Pipeline([\n", " ('minmax', MinMaxScaler()),\n", " ('pca95', PCA(n_components=n95_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(4))" ] }, { "cell_type": "code", "execution_count": 
19, "id": "a57eb660", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: standard+pca99+knn (94.9071%) (9.396)\n", " precision recall f1-score support\n", "\n", " 0 0.97 0.99 0.98 1359\n", " 1 0.97 0.99 0.98 1594\n", " 2 0.95 0.93 0.94 1369\n", " 3 0.94 0.95 0.95 1415\n", " 4 0.95 0.93 0.94 1373\n", " 5 0.93 0.91 0.92 1257\n", " 6 0.96 0.97 0.97 1351\n", " 7 0.94 0.95 0.95 1422\n", " 8 0.97 0.91 0.94 1411\n", " 9 0.92 0.94 0.93 1449\n", "\n", " accuracy 0.95 14000\n", " macro avg 0.95 0.95 0.95 14000\n", "weighted avg 0.95 0.95 0.95 14000\n", "\n", " fit_time score_time test_score\n", "0 9.473285 6.515933 0.949821\n", "1 7.714788 7.746434 0.946786\n", "2 7.546448 7.823940 0.950179\n", "3 7.546017 7.754220 0.943036\n", "4 7.534473 7.836074 0.948750\n", "5 7.524864 7.952840 0.945893\n", "6 7.699588 7.758791 0.946071\n", "7 9.373224 6.639953 0.947679\n", "8 9.395972 6.550251 0.953214\n", "9 9.444257 6.667987 0.947679\n" ] } ], "source": [ "names.append('standard+pca99+knn')\n", "classifiers.append(Pipeline([\n", " ('standard', StandardScaler()),\n", " ('pca99', PCA(n_components=n99_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(5,10)) # Pipeline: standard+pca99+knn (94.5357%)" ] }, { "cell_type": "code", "execution_count": 20, "id": "bcbedf38", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: minmax+pca99+knn (97.3643%) (9.134)\n", " precision recall f1-score support\n", "\n", " 0 0.98 1.00 0.99 1359\n", " 1 0.97 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.96 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.95 0.97 1411\n", " 9 0.96 0.97 0.97 1449\n", "\n", " accuracy 0.97 14000\n", " macro avg 0.97 0.97 0.97 14000\n", "weighted avg 0.97 0.97 0.97 14000\n", "\n", " fit_time score_time test_score\n", "0 8.213089 6.006415 0.972679\n", "1 7.115110 
6.500668 0.971429\n", "2 7.381336 6.424851 0.972143\n", "3 7.099746 6.442294 0.970179\n", "4 9.174922 5.636195 0.972679\n", "5 6.937273 6.640518 0.973214\n", "6 8.361929 6.048856 0.975536\n", "7 9.058427 5.660812 0.972857\n", "8 9.134326 5.704015 0.975714\n", "9 9.088895 5.631931 0.974286\n" ] } ], "source": [ "names.append('minmax+pca99+knn')\n", "classifiers.append(Pipeline([\n", " ('minmax', MinMaxScaler()),\n", " ('pca99', PCA(n_components=n99_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(6))" ] }, { "cell_type": "code", "execution_count": 21, "id": "5bc4f44b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: maxabs+pca95+knn (97.5500%) (8.568)\n", " precision recall f1-score support\n", "\n", " 0 0.98 1.00 0.99 1359\n", " 1 0.98 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.97 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.95 0.97 1411\n", " 9 0.96 0.97 0.97 1449\n", "\n", " accuracy 0.98 14000\n", " macro avg 0.98 0.98 0.98 14000\n", "weighted avg 0.98 0.98 0.98 14000\n", "\n", " fit_time score_time test_score\n", "0 7.831842 5.514181 0.973750\n", "1 7.772907 5.701575 0.972857\n", "2 8.581123 5.059241 0.974286\n", "3 7.486752 5.769173 0.973036\n", "4 7.644667 5.623410 0.974464\n", "5 8.568252 5.032640 0.975179\n", "6 7.305880 5.804784 0.975893\n", "7 7.243364 5.869418 0.975357\n", "8 8.568058 5.175154 0.976607\n", "9 8.545392 5.035311 0.975536\n" ] } ], "source": [ "names.append('maxabs+pca95+knn')\n", "classifiers.append(Pipeline([\n", " ('maxabs', MaxAbsScaler()),\n", " ('pca95', PCA(n_components=n95_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(7))" ] }, { "cell_type": "code", "execution_count": 22, "id": "a901ad5d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: maxabs+pca99+knn (97.3643%) (7.451)\n", 
" precision recall f1-score support\n", "\n", " 0 0.98 1.00 0.99 1359\n", " 1 0.97 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.96 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.95 0.97 1411\n", " 9 0.96 0.97 0.97 1449\n", "\n", " accuracy 0.97 14000\n", " macro avg 0.97 0.97 0.97 14000\n", "weighted avg 0.97 0.97 0.97 14000\n", "\n", " fit_time score_time test_score\n", "0 7.066843 6.724576 0.972679\n", "1 8.163119 5.973942 0.971429\n", "2 7.298925 6.530500 0.972143\n", "3 8.869127 5.643595 0.970179\n", "4 8.898886 5.403235 0.972679\n", "5 7.828445 6.313535 0.973214\n", "6 8.803470 5.529734 0.975536\n", "7 8.294126 5.839177 0.972857\n", "8 7.450544 6.585145 0.975714\n", "9 7.483754 6.628288 0.974286\n" ] } ], "source": [ "names.append('maxabs+pca99+knn')\n", "classifiers.append(Pipeline([\n", " ('maxabs', MaxAbsScaler()),\n", " ('pca99', PCA(n_components=n99_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(8))" ] }, { "cell_type": "code", "execution_count": 23, "id": "19e87457", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n", " loglike = -n_samples / 2 * np.log(x_trans.var())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: power+pca95+knn (94.9786%) (39.14)\n", " precision recall f1-score support\n", "\n", " 0 0.97 0.99 0.98 1359\n", " 1 0.98 0.98 0.98 1594\n", " 2 0.95 0.94 0.95 1369\n", " 3 0.94 0.94 0.94 1415\n", " 4 0.96 0.93 0.95 1373\n", " 5 0.94 0.91 0.93 1257\n", " 6 0.95 0.98 0.96 1351\n", " 7 0.94 0.95 0.94 1422\n", " 8 0.96 0.92 0.94 1411\n", " 9 0.92 0.94 0.93 1449\n", "\n", " accuracy 0.95 14000\n", " macro avg 0.95 0.95 0.95 14000\n", "weighted avg 0.95 0.95 0.95 14000\n", "\n", " fit_time score_time test_score\n", "0 38.802822 
5.871142 0.946429\n", "1 37.150519 6.993097 0.945357\n", "2 37.138519 7.017011 0.948929\n", "3 38.357648 6.108760 0.945714\n", "4 37.175505 7.018548 0.949107\n", "5 37.249800 6.882046 0.945357\n", "6 37.172034 6.747881 0.949643\n", "7 38.738593 5.878512 0.948571\n", "8 39.135663 5.579420 0.956607\n", "9 37.060059 6.904209 0.949821\n" ] } ], "source": [ "names.append('power+pca95+knn')\n", "classifiers.append(Pipeline([\n", " ('power', PowerTransformer()),\n", " ('pca95', PCA(n_components=n95_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(9,10)) # likes to die # Pipeline: power+pca95+knn (94.3714%)" ] }, { "cell_type": "code", "execution_count": 24, "id": "6146ccb1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n", " loglike = -n_samples / 2 * np.log(x_trans.var())\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: power+pca99+knn (94.9786%) (37.94)\n", " precision recall f1-score support\n", "\n", " 0 0.97 0.99 0.98 1359\n", " 1 0.98 0.98 0.98 1594\n", " 2 0.95 0.94 0.95 1369\n", " 3 0.94 0.94 0.94 1415\n", " 4 0.96 0.93 0.95 1373\n", " 5 0.94 0.91 0.93 1257\n", " 6 0.95 0.98 0.96 1351\n", " 7 0.94 0.95 0.94 1422\n", " 8 0.96 0.92 0.94 1411\n", " 9 0.92 0.94 0.93 1449\n", "\n", " accuracy 0.95 14000\n", " macro avg 0.95 0.95 0.95 14000\n", "weighted avg 0.95 0.95 0.95 14000\n", "\n", " fit_time score_time test_score\n", "0 39.081897 5.799132 0.946429\n", "1 39.650227 5.733288 0.945357\n", "2 36.740629 6.834802 0.948929\n", "3 39.349449 5.738063 0.945714\n", "4 38.051169 6.654780 0.949107\n", "5 38.139323 6.575838 0.945357\n", "6 36.967650 6.827009 0.949643\n", "7 37.961230 6.632579 0.948571\n", "8 37.944791 6.445367 0.956607\n", "9 39.280427 5.751522 0.949821\n" ] } ], "source": [ "names.append('power+pca99+knn')\n", 
"classifiers.append(Pipeline([\n",
 "    ('power', PowerTransformer()),\n",
 "    # keep 99% of the variance, as the pipeline name says; with n95_components\n",
 "    # this run was an exact duplicate of power+pca95+knn\n",
 "    ('pca99', PCA(n_components=n99_components)),\n",
 "    ('knn', op_kNN)\n",
 "]))\n",
 "\n",
 "accuracies.append(cv(10,10)) # likes to die # Pipeline: power+pca99+knn (94.6429%)" ] },
{ "cell_type": "code", "execution_count": 25, "id": "66a7637b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: bin+pca95+knn (96.8286%) (8.674)\n", " precision recall f1-score support\n", "\n", " 0 0.98 0.99 0.99 1359\n", " 1 0.96 0.99 0.98 1594\n", " 2 0.98 0.97 0.98 1369\n", " 3 0.96 0.96 0.96 1415\n", " 4 0.98 0.95 0.97 1373\n", " 5 0.97 0.94 0.95 1257\n", " 6 0.97 0.99 0.98 1351\n", " 7 0.97 0.97 0.97 1422\n", " 8 0.97 0.95 0.96 1411\n", " 9 0.94 0.97 0.95 1449\n", "\n", " accuracy 0.97 14000\n", " macro avg 0.97 0.97 0.97 14000\n", "weighted avg 0.97 0.97 0.97 14000\n", "\n", " fit_time score_time test_score\n", "0 8.605724 5.490393 0.967679\n", "1 7.290196 6.624226 0.969107\n", "2 7.169216 6.591240 0.968571\n", "3 7.225861 6.645801 0.965357\n", "4 7.196922 6.595732 0.969643\n", "5 7.294749 6.646243 0.966429\n", "6 8.720850 5.419779 0.969821\n", "7 8.711336 5.597409 0.967679\n", "8 7.221519 6.624207 0.970536\n", "9 8.673788 5.737247 0.970714\n" ] } ], "source": [ "names.append('bin+pca95+knn')\n", "classifiers.append(Pipeline([\n", " ('bin', Binarizer()),\n", " ('pca95', PCA(n_components=n95_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(11,10))" ] },
{ "cell_type": "code", "execution_count": 26, "id": "d4fadaac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: bin+pca99+knn (96.6571%) (7.146)\n", " precision recall f1-score support\n", "\n", " 0 0.98 0.99 0.98 1359\n", " 1 0.96 0.99 0.98 1594\n", " 2 0.98 0.97 0.97 1369\n", " 3 0.96 0.96 0.96 1415\n", " 4 0.98 0.95 0.97 1373\n", " 5 0.97 0.94 0.95 1257\n", " 6 0.96 0.99 0.97 1351\n", " 7 0.97 0.97 0.97 1422\n", " 8 0.97 0.94 0.96 1411\n", " 
9 0.93 0.97 0.95 1449\n", "\n", " accuracy 0.97 14000\n", " macro avg 0.97 0.97 0.97 14000\n", "weighted avg 0.97 0.97 0.97 14000\n", "\n", " fit_time score_time test_score\n", "0 6.961178 6.772386 0.965536\n", "1 8.445867 6.193574 0.967143\n", "2 9.256319 5.771667 0.966607\n", "3 7.188147 6.795888 0.965179\n", "4 9.336459 5.739643 0.966964\n", "5 7.107809 6.823560 0.964821\n", "6 9.127543 5.848893 0.968393\n", "7 9.106971 5.873384 0.966429\n", "8 7.145829 6.698655 0.969107\n", "9 8.202323 6.141933 0.968750\n" ] } ], "source": [ "names.append('bin+pca99+knn')\n", "classifiers.append(Pipeline([\n", " ('bin', Binarizer()),\n", " ('pca99', PCA(n_components=n99_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(12))" ] }, { "cell_type": "code", "execution_count": 27, "id": "d15fb11c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: pca95+knn (97.5500%) (7.416)\n", " precision recall f1-score support\n", "\n", " 0 0.98 1.00 0.99 1359\n", " 1 0.98 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.97 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.95 0.97 1411\n", " 9 0.96 0.97 0.97 1449\n", "\n", " accuracy 0.98 14000\n", " macro avg 0.98 0.98 0.98 14000\n", "weighted avg 0.98 0.98 0.98 14000\n", "\n", " fit_time score_time test_score\n", "0 7.440297 5.405928 0.973750\n", "1 7.141649 5.609115 0.972857\n", "2 6.829969 5.687647 0.974286\n", "3 8.291485 5.057073 0.973036\n", "4 8.281738 4.872021 0.974464\n", "5 6.882516 5.804941 0.975179\n", "6 8.275357 5.190776 0.975893\n", "7 8.244917 5.013079 0.975357\n", "8 7.416139 5.446140 0.976607\n", "9 6.752497 5.906509 0.975536\n" ] } ], "source": [ "names.append('pca95+knn')\n", "classifiers.append(Pipeline([\n", " ('pca95', PCA(n_components=n95_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(13))" ] }, { "cell_type": "code", 
"execution_count": 28, "id": "2db8577b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Pipeline: pca99+knn (97.3643%) (6.88)\n", " precision recall f1-score support\n", "\n", " 0 0.98 1.00 0.99 1359\n", " 1 0.97 0.99 0.98 1594\n", " 2 0.98 0.96 0.97 1369\n", " 3 0.97 0.97 0.97 1415\n", " 4 0.98 0.97 0.97 1373\n", " 5 0.96 0.96 0.96 1257\n", " 6 0.98 0.99 0.98 1351\n", " 7 0.97 0.98 0.97 1422\n", " 8 0.99 0.95 0.97 1411\n", " 9 0.96 0.97 0.97 1449\n", "\n", " accuracy 0.97 14000\n", " macro avg 0.97 0.97 0.97 14000\n", "weighted avg 0.97 0.97 0.97 14000\n", "\n", " fit_time score_time test_score\n", "0 8.113361 5.777437 0.972857\n", "1 6.950958 6.503708 0.971429\n", "2 8.775186 5.424367 0.972143\n", "3 8.805392 5.562713 0.970179\n", "4 6.980318 6.581474 0.972679\n", "5 8.412863 5.525903 0.973214\n", "6 7.188666 6.495338 0.975536\n", "7 6.802638 6.664372 0.972857\n", "8 6.879545 6.603721 0.975714\n", "9 7.031381 6.631324 0.974286\n" ] } ], "source": [ "names.append('pca99+knn')\n", "classifiers.append(Pipeline([\n", " ('pca99', PCA(n_components=n99_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "accuracies.append(cv(14))" ] },
{ "cell_type": "code", "execution_count": 29, "id": "a5702428", "metadata": {}, "outputs": [], "source": [
 "names.append('kpca+pca95+knn')\n",
 "classifiers.append(Pipeline([\n",
 "    ('kpca', KernelPCA(n_jobs=-1)),\n",
 "    # 95% variance to match the pipeline name; n99_components made this\n",
 "    # pipeline identical to kpca+pca99+knn\n",
 "    ('pca', PCA(n_components=n95_components)),\n",
 "    ('knn', op_kNN)\n",
 "]))\n",
 "\n",
 "# accuracies.append(cv(15))" ] },
{ "cell_type": "code", "execution_count": 30, "id": "0ee57cfe", "metadata": {}, "outputs": [], "source": [ "names.append('kpca+pca99+knn')\n", "classifiers.append(Pipeline([\n", " ('kpca', KernelPCA(n_jobs=-1)),\n", " ('pca', PCA(n_components=n99_components)),\n", " ('knn', op_kNN)\n", "]))\n", "\n", "# accuracies.append(cv(16))" ] },
{ "cell_type": "markdown", "id": "7fbbc930", "metadata": {}, "source": [ "# Auswertung" ] }, { "cell_type": "code", "execution_count": 
31, "id": "e5d609aa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Maximum accuracy (97.55%) for ['minmax+pca95+knn', 'maxabs+pca95+knn', 'pca95+knn']\n" ] } ], "source": [
 "best_accuracy = max(accuracies)\n",
 "# Compare against an ndarray: `list == scalar` is only element-wise while the scores\n",
 "# are NumPy scalars; with plain floats it is a single False and nothing is reported.\n",
 "best_names = [names[n] for n in np.flatnonzero(np.asarray(accuracies) == best_accuracy)]\n",
 "print(f\"Maximum accuracy ({best_accuracy:.6}%) for {best_names}\")" ] },
{ "cell_type": "code", "execution_count": 38, "id": "234f14bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name: knn (baseline) (97.3071%)\n", "name: scalar+knn (94.8000%)\n", "name: minmax+knn (97.3071%)\n", "name: standard+pca95+knn (95.3929%)\n", "name: minmax+pca95+knn (97.5500%)\n", "name: standard+pca99+knn (94.9071%)\n", "name: minmax+pca99+knn (97.3643%)\n", "name: maxabs+pca95+knn (97.5500%)\n", "name: maxabs+pca99+knn (97.3643%)\n", "name: power+pca95+knn (94.9786%)\n", "name: power+pca99+knn (94.9786%)\n", "name: bin+pca95+knn (96.8286%)\n", "name: bin+pca99+knn (96.6571%)\n", "name: pca95+knn (97.5500%)\n", "name: pca99+knn (97.3643%)\n" ] } ], "source": [
 "# names also holds the two kpca pipelines that were never scored; zip stops at the shorter list\n",
 "for n, a in zip(names, accuracies):\n",
 "    print(f\"name: {n:20} ({a:.4f}%)\")" ] },
{ "cell_type": "markdown", "id": "a6ddb6f2", "metadata": {}, "source": [ "Default n=3\\\n", "name: knn (baseline) (97.2143%)\\\n", "name: scalar+knn (94.6286%)\\\n", "name: minmax+knn (97.2143%)\\\n", "name: standard+pca95+knn (95.1429%)\\\n", "name: minmax+pca95+knn (97.4357%)\\\n", "name: standard+pca99+knn (94.7214%)\\\n", "name: minmax+pca99+knn (97.2714%)\\\n", "name: maxabs+pca95+knn (97.4357%)\\\n", "name: maxabs+pca99+knn (97.2714%)\\\n", "name: power+pca95+knn (94.8071%)\\\n", "name: power+pca99+knn (94.8071%)\\\n", "name: bin+pca95+knn (96.6643%)\\\n", "name: bin+pca99+knn (96.5500%)\\\n", "name: pca95+knn (97.4357%)\\\n", "name: pca99+knn (97.2714%)\\" ] },
{ "cell_type": "markdown", "id": "bde6e847", "metadata": {}, "source": [ "n=3 euclid distance\\\n", "name: knn (baseline) (97.3071%)\\\n", "name: scalar+knn (94.8000%)\\\n", 
"name: minmax+knn (97.3071%)\\\n", "name: standard+pca95+knn (95.3929%)\\\n", "name: minmax+pca95+knn (97.5500%)\\\n", "name: standard+pca99+knn (94.9071%)\\\n", "name: minmax+pca99+knn (97.3643%)\\\n", "name: maxabs+pca95+knn (97.5500%)\\\n", "name: maxabs+pca99+knn (97.3643%)\\\n", "name: power+pca95+knn (94.9786%)\\\n", "name: power+pca99+knn (94.9786%)\\\n", "name: bin+pca95+knn (96.8286%)\\\n", "name: bin+pca99+knn (96.6571%)\\\n", "name: pca95+knn (97.5500%)\\\n", "name: pca99+knn (97.3643%)\\\n" ] },
{ "cell_type": "markdown", "id": "4c625bc3", "metadata": {}, "source": [ "# Hyper Parameter Optimization" ] },
{ "cell_type": "code", "execution_count": 34, "id": "24ff7ea2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fitting 3 folds for each of 32 candidates, totalling 96 fits\n" ] } ], "source": [
 "from pathlib import Path\n",
 "\n",
 "from sklearn.model_selection import GridSearchCV\n",
 "\n",
 "# The search (32 candidates x 3 folds) is expensive, so it only runs when one of its\n",
 "# CSV exports is missing. gs_results stays None when the search is skipped, which\n",
 "# keeps the export cells below runnable on a fresh kernel.\n",
 "gs_results = None\n",
 "if not (Path('./gs_result.csv').exists() and Path('./gs_result_filtered.csv').exists()):\n",
 "    grid_params = {\n",
 "        'n_neighbors': [3, 5, 7 , 11],\n",
 "        'weights': ['uniform', 'distance'],\n",
 "        'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],\n",
 "    }\n",
 "\n",
 "    gs = GridSearchCV(\n",
 "        KNeighborsClassifier(n_jobs=-1),\n",
 "        grid_params,\n",
 "        cv = 3,\n",
 "        verbose=1,\n",
 "        n_jobs = -1\n",
 "    )\n",
 "\n",
 "    gs_results = gs.fit(X_train, y_train)\n",
 "# Results are exported below to gs_result.csv and gs_result_filtered.csv" ] },
{ "cell_type": "code", "execution_count": 36, "id": "b3b0eac3", "metadata": {}, "outputs": [], "source": [
 "# full cv_results_ table, best-ranked candidate first\n",
 "if gs_results is not None:\n",
 "    pd.DataFrame.from_dict(gs_results.cv_results_).sort_values('rank_test_score').to_csv('./gs_result.csv')" ] },
{ "cell_type": "code", "execution_count": 37, "id": "b68589fe", "metadata": {}, "outputs": [], "source": [
 "# reduced table: rank, fit time, the three tuned parameters and the mean score\n",
 "if gs_results is not None:\n",
 "    pd.DataFrame.from_dict(gs_results.cv_results_)[['rank_test_score','mean_fit_time','param_n_neighbors', 'param_metric', 'param_weights', 'mean_test_score']].to_csv('./gs_result_filtered.csv')" ] },
{ "cell_type": "code", "execution_count": null, "id": "b162e908", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { 
"kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 }