iui-group-l-name-zensiert/0-pilot-project/MNIST-kNN-Abgabe.ipynb

1320 lines
43 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "8301251c",
"metadata": {},
"source": [
"### Load MNIST dataset"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3368e2c3",
"metadata": {},
"outputs": [],
"source": [
"# Python ≥3.6 is required: later cells use f-strings, which Python 3.5 cannot parse\n",
"import sys\n",
"assert sys.version_info >= (3, 6)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0dc2fe45",
"metadata": {},
"outputs": [],
"source": [
"# scikit-learn ≥0.20 is required\n",
"import re\n",
"import sklearn\n",
"\n",
"# Compare (major, minor) as integers: a plain string comparison is lexicographic,\n",
"# so e.g. '0.9' >= '0.20' would wrongly pass.\n",
"sklearn_major_minor = tuple(map(int, re.match(r\"(\\d+)\\.(\\d+)\", sklearn.__version__).groups()))\n",
"assert sklearn_major_minor >= (0, 20)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "30459411",
"metadata": {},
"outputs": [],
"source": [
"# common imports\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0be717f8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sklearn.utils.Bunch"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# fetch_openml loads a dataset from OpenML by name\n",
"from sklearn.datasets import fetch_openml\n",
"\n",
"# MNIST, requested as arrays rather than as a DataFrame\n",
"mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)\n",
"\n",
"# show the container type of the loaded dataset\n",
"type(mnist)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ae9b3d51",
"metadata": {},
"outputs": [],
"source": [
"# feature matrix and label vector of the loaded dataset\n",
"X = mnist[\"data\"]\n",
"y = mnist[\"target\"]"
]
},
{
"cell_type": "markdown",
"id": "7c89b6d3",
"metadata": {},
"source": [
"### Convert labels to int"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "30880538",
"metadata": {},
"outputs": [],
"source": [
"# import plotting libraries\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "34b6be41",
"metadata": {},
"outputs": [],
"source": [
"# the labels arrive as strings; store them as unsigned 8-bit integers\n",
"y = y.astype(dtype=np.uint8)"
]
},
{
"cell_type": "markdown",
"id": "361fea4c",
"metadata": {},
"source": [
"### Prepare data for machine learning"
]
},
{
"cell_type": "markdown",
"id": "cab5977a",
"metadata": {},
"source": [
"### Identify Train Set and Test Set"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9bb80760",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_train: 56000, (56000, 784)\n",
"X_test: 14000, (14000, 784)\n",
"y_train: 56000, (56000,)\n",
"y_test: 14000, (14000,)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the samples as the test set, using a fixed random_state.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    y,\n",
"    test_size=0.2,\n",
"    random_state=177013,\n",
")\n",
"\n",
"# Report length and shape of every split.\n",
"print(f\"X_train: {len(X_train)}, {X_train.shape}\")\n",
"print(f\"X_test: {len(X_test)}, {X_test.shape}\")\n",
"print(f\"y_train: {len(y_train)}, {y_train.shape}\")\n",
"print(f\"y_test: {len(y_test)}, {y_test.shape}\")"
]
},
{
"cell_type": "markdown",
"id": "aac09882",
"metadata": {},
"source": [
"## Pipeline Declaration"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "ca389b56",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.decomposition import PCA, KernelPCA\n",
"from sklearn.metrics import classification_report, accuracy_score\n",
"from sklearn.model_selection import cross_val_predict, cross_validate\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import (StandardScaler, \n",
"                                   MinMaxScaler, \n",
"                                   MaxAbsScaler,\n",
"                                   PowerTransformer,\n",
"                                   Binarizer)\n",
"\n",
"# kNN hyper-parameters shared by every pipeline (used to build `op_kNN`)\n",
"n_neighbors = 3\n",
"weights = 'distance'\n",
"metric = 'euclidean'\n",
"\n",
"# `n_components` values handed to the PCA steps\n",
"n99_components = 0.99\n",
"n95_components = 0.95\n",
"\n",
"# bookkeeping lists that the pipeline cells append to\n",
"names, classifiers, accuracies = [], [], []"
]
},
{
"cell_type": "markdown",
"id": "b7c97601",
"metadata": {},
"source": [
"# Cross-validation"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "cd37833d",
"metadata": {},
"outputs": [],
"source": [
"# not this (alternative helper; the pipeline cells call `cv` instead)\n",
"def cv_train(num, cv):\n",
"    \"\"\"Cross-validate pipeline `num` on the training split and print a report.\n",
"\n",
"    Args:\n",
"        num: Index into the global `names` / `classifiers` lists.\n",
"        cv: Forwarded as `cv` to `cross_val_predict`.\n",
"\n",
"    Returns:\n",
"        Accuracy of the cross-validated predictions against `y_train`, in percent.\n",
"    \"\"\"\n",
"    name = names[num]\n",
"    clf = classifiers[num]\n",
"    # cross_val_predict is imported in the scikit-learn import cell\n",
"    y_train_pred = cross_val_predict(clf, X_train, y_train, cv=cv, n_jobs=-1)\n",
"    accuracy = accuracy_score(y_train, y_train_pred, normalize=True)*100\n",
"    print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
"    print(classification_report(y_train, y_train_pred))\n",
"    return accuracy"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f738a4ca",
"metadata": {},
"outputs": [],
"source": [
"# also not this (alternative helper; the pipeline cells call `cv` instead)\n",
"def cv_test(num, cv=5):\n",
"    \"\"\"Cross-validate pipeline `num` on the test split and print a report.\n",
"\n",
"    Args:\n",
"        num: Index into the global `names` / `classifiers` lists.\n",
"        cv: Forwarded as `cv` to `cross_val_predict`; defaults to the\n",
"            previously hard-coded value 5.\n",
"\n",
"    Returns:\n",
"        Accuracy of the cross-validated predictions against `y_test`, in percent.\n",
"    \"\"\"\n",
"    name = names[num]\n",
"    clf = classifiers[num]\n",
"    # cross_val_predict is imported in the scikit-learn import cell\n",
"    y_test_pred = cross_val_predict(clf, X_test, y_test, cv=cv, n_jobs=-1)\n",
"    accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100\n",
"    print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
"    print(classification_report(y_test, y_test_pred))\n",
"    return accuracy"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "756c8015",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# this is the helper the pipeline cells call\n",
"def cv(num, cv_arg=10):\n",
"    \"\"\"Fit pipeline `num`, cross-validate it on the training split and score it on the test split.\n",
"\n",
"    Prints the test accuracy together with the fit time of the best-scoring fold,\n",
"    the classification report for the test split and the per-fold table.\n",
"\n",
"    Args:\n",
"        num: Index into the global `names` / `classifiers` lists.\n",
"        cv_arg: Forwarded as `cv` to `cross_validate` (default 10).\n",
"\n",
"    Returns:\n",
"        Accuracy on (`X_test`, `y_test`), in percent.\n",
"    \"\"\"\n",
"    name = names[num]\n",
"    fitted = classifiers[num].fit(X_train, y_train)\n",
"    fold_results = cross_validate(fitted, X_train, y_train, cv=cv_arg, n_jobs=-1)\n",
"\n",
"    # fit time of the fold that reached the highest validation score\n",
"    best_fold = np.argmax(fold_results['test_score'])\n",
"    best_fit_time = fold_results['fit_time'][best_fold]\n",
"\n",
"    y_test_pred = fitted.predict(X_test)\n",
"    accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100\n",
"    print(f\"Pipeline: {name} ({accuracy:.4f}%) ({best_fit_time:.4})\")\n",
"    print(classification_report(y_test, y_test_pred))\n",
"    print(pd.DataFrame.from_dict(fold_results))\n",
"    return accuracy"
]
},
{
"cell_type": "markdown",
"id": "5d3b1484",
"metadata": {},
"source": [
"# Fitting"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "1ea6a154",
"metadata": {},
"outputs": [],
"source": [
"# The one kNN estimator instance that every pipeline below uses as its final step.\n",
"# NOTE(review): the same object is passed to all pipelines - confirm that sharing is intended.\n",
"op_kNN = KNeighborsClassifier(\n",
"    n_neighbors=n_neighbors,\n",
"    weights=weights,\n",
"    metric=metric,\n",
"    n_jobs=-1,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "ac4c7a18",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: knn (baseline) (97.3071%) (0.1162)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.97 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.96 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.94 0.96 1411\n",
" 9 0.96 0.97 0.96 1449\n",
"\n",
" accuracy 0.97 14000\n",
" macro avg 0.97 0.97 0.97 14000\n",
"weighted avg 0.97 0.97 0.97 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 0.125081 9.229141 0.972143\n",
"1 0.123761 8.918457 0.971429\n",
"2 0.125360 9.193326 0.971607\n",
"3 0.119904 8.944645 0.970714\n",
"4 0.244280 9.182126 0.972143\n",
"5 0.233787 9.137941 0.972857\n",
"6 0.123714 9.257920 0.975179\n",
"7 0.116261 9.135718 0.972500\n",
"8 0.116210 9.125259 0.975357\n",
"9 0.141345 9.308920 0.973393\n"
]
}
],
"source": [
"# Baseline: kNN applied to X directly, without a preprocessing step\n",
"names.append('knn (baseline)')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=0))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "23c51b9e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: scalar+knn (94.8000%) (1.146)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.99 0.98 1359\n",
" 1 0.97 0.99 0.98 1594\n",
" 2 0.95 0.93 0.94 1369\n",
" 3 0.94 0.95 0.95 1415\n",
" 4 0.94 0.93 0.94 1373\n",
" 5 0.93 0.91 0.92 1257\n",
" 6 0.96 0.97 0.97 1351\n",
" 7 0.94 0.95 0.94 1422\n",
" 8 0.96 0.91 0.93 1411\n",
" 9 0.92 0.94 0.93 1449\n",
"\n",
" accuracy 0.95 14000\n",
" macro avg 0.95 0.95 0.95 14000\n",
"weighted avg 0.95 0.95 0.95 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 0.758668 8.452150 0.949643\n",
"1 0.885169 8.631167 0.945000\n",
"2 1.160197 8.948529 0.949107\n",
"3 1.353553 8.502009 0.942500\n",
"4 0.912583 9.036203 0.948393\n",
"5 0.833251 9.013008 0.945179\n",
"6 0.886441 8.995505 0.943214\n",
"7 0.935067 8.891803 0.946607\n",
"8 1.146215 8.633170 0.951607\n",
"9 1.234975 8.846080 0.947143\n"
]
}
],
"source": [
"# StandardScaler followed by kNN (registered under the label 'scalar+knn')\n",
"names.append('scalar+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('standard', StandardScaler()),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"# author's note, presumably from an earlier run: standard+knn (94.3714%)\n",
"accuracies.append(cv(num=1, cv_arg=10))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "8c92a008",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: minmax+knn (97.3071%) (0.5789)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.97 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.96 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.94 0.96 1411\n",
" 9 0.96 0.97 0.96 1449\n",
"\n",
" accuracy 0.97 14000\n",
" macro avg 0.97 0.97 0.97 14000\n",
"weighted avg 0.97 0.97 0.97 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 0.389049 8.747383 0.972143\n",
"1 0.385594 9.063268 0.971429\n",
"2 0.527658 8.926810 0.971786\n",
"3 0.564983 8.949160 0.970714\n",
"4 0.607976 9.087395 0.972143\n",
"5 0.689083 8.948745 0.972857\n",
"6 0.578933 8.950404 0.975179\n",
"7 0.647489 9.248726 0.972679\n",
"8 0.634490 9.124195 0.975179\n",
"9 0.619997 8.861808 0.973393\n"
]
}
],
"source": [
"# MinMaxScaler followed by kNN\n",
"names.append('minmax+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('minmax', MinMaxScaler()),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=2))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "811c3930",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: standard+pca95+knn (95.3929%) (9.151)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.99 0.98 1359\n",
" 1 0.98 0.99 0.98 1594\n",
" 2 0.96 0.94 0.95 1369\n",
" 3 0.94 0.96 0.95 1415\n",
" 4 0.95 0.94 0.95 1373\n",
" 5 0.94 0.92 0.93 1257\n",
" 6 0.96 0.97 0.97 1351\n",
" 7 0.95 0.96 0.95 1422\n",
" 8 0.97 0.92 0.94 1411\n",
" 9 0.92 0.94 0.93 1449\n",
"\n",
" accuracy 0.95 14000\n",
" macro avg 0.95 0.95 0.95 14000\n",
"weighted avg 0.95 0.95 0.95 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 7.882113 6.468349 0.953036\n",
"1 8.902088 5.919568 0.950357\n",
"2 8.750770 5.959131 0.953571\n",
"3 7.767436 6.355160 0.945179\n",
"4 9.307496 5.559806 0.952500\n",
"5 7.845258 6.534233 0.950000\n",
"6 8.874461 5.844108 0.948214\n",
"7 7.704501 6.488686 0.950714\n",
"8 9.151180 5.622928 0.955893\n",
"9 7.257694 6.649508 0.953571\n"
]
}
],
"source": [
"# StandardScaler -> PCA(n_components=n95_components) -> kNN\n",
"names.append('standard+pca95+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('standard', StandardScaler()),\n",
"        ('pca95', PCA(n_components=n95_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"# author's note, presumably from an earlier run: standard+pca95+knn (95.0500%)\n",
"accuracies.append(cv(num=3, cv_arg=10))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "3c7440ff",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: minmax+pca95+knn (97.5500%) (8.162)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.98 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.97 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.95 0.97 1411\n",
" 9 0.96 0.97 0.97 1449\n",
"\n",
" accuracy 0.98 14000\n",
" macro avg 0.98 0.98 0.98 14000\n",
"weighted avg 0.98 0.98 0.98 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 8.170779 5.294063 0.973750\n",
"1 8.088926 5.365682 0.972857\n",
"2 6.857623 5.776273 0.974286\n",
"3 7.262239 5.806243 0.973036\n",
"4 8.057861 5.374735 0.974464\n",
"5 7.276100 5.745563 0.975179\n",
"6 8.202060 5.424418 0.975893\n",
"7 8.114796 5.319392 0.975357\n",
"8 8.162190 5.424424 0.976607\n",
"9 7.437983 5.727555 0.975536\n"
]
}
],
"source": [
"# MinMaxScaler -> PCA(n_components=n95_components) -> kNN\n",
"names.append('minmax+pca95+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('minmax', MinMaxScaler()),\n",
"        ('pca95', PCA(n_components=n95_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=4))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8b491b79",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: standard+pca99+knn (94.9071%) (9.396)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.99 0.98 1359\n",
" 1 0.97 0.99 0.98 1594\n",
" 2 0.95 0.93 0.94 1369\n",
" 3 0.94 0.95 0.95 1415\n",
" 4 0.95 0.93 0.94 1373\n",
" 5 0.93 0.91 0.92 1257\n",
" 6 0.96 0.97 0.97 1351\n",
" 7 0.94 0.95 0.95 1422\n",
" 8 0.97 0.91 0.94 1411\n",
" 9 0.92 0.94 0.93 1449\n",
"\n",
" accuracy 0.95 14000\n",
" macro avg 0.95 0.95 0.95 14000\n",
"weighted avg 0.95 0.95 0.95 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 9.473285 6.515933 0.949821\n",
"1 7.714788 7.746434 0.946786\n",
"2 7.546448 7.823940 0.950179\n",
"3 7.546017 7.754220 0.943036\n",
"4 7.534473 7.836074 0.948750\n",
"5 7.524864 7.952840 0.945893\n",
"6 7.699588 7.758791 0.946071\n",
"7 9.373224 6.639953 0.947679\n",
"8 9.395972 6.550251 0.953214\n",
"9 9.444257 6.667987 0.947679\n"
]
}
],
"source": [
"# StandardScaler -> PCA(n_components=n99_components) -> kNN\n",
"names.append('standard+pca99+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('standard', StandardScaler()),\n",
"        ('pca99', PCA(n_components=n99_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"# author's note, presumably from an earlier run: standard+pca99+knn (94.5357%)\n",
"accuracies.append(cv(num=5, cv_arg=10))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "080ea6b8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: minmax+pca99+knn (97.3643%) (9.134)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.97 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.96 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.95 0.97 1411\n",
" 9 0.96 0.97 0.97 1449\n",
"\n",
" accuracy 0.97 14000\n",
" macro avg 0.97 0.97 0.97 14000\n",
"weighted avg 0.97 0.97 0.97 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 8.213089 6.006415 0.972679\n",
"1 7.115110 6.500668 0.971429\n",
"2 7.381336 6.424851 0.972143\n",
"3 7.099746 6.442294 0.970179\n",
"4 9.174922 5.636195 0.972679\n",
"5 6.937273 6.640518 0.973214\n",
"6 8.361929 6.048856 0.975536\n",
"7 9.058427 5.660812 0.972857\n",
"8 9.134326 5.704015 0.975714\n",
"9 9.088895 5.631931 0.974286\n"
]
}
],
"source": [
"# MinMaxScaler -> PCA(n_components=n99_components) -> kNN\n",
"names.append('minmax+pca99+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('minmax', MinMaxScaler()),\n",
"        ('pca99', PCA(n_components=n99_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=6))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "6ee320cd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: maxabs+pca95+knn (97.5500%) (8.568)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.98 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.97 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.95 0.97 1411\n",
" 9 0.96 0.97 0.97 1449\n",
"\n",
" accuracy 0.98 14000\n",
" macro avg 0.98 0.98 0.98 14000\n",
"weighted avg 0.98 0.98 0.98 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 7.831842 5.514181 0.973750\n",
"1 7.772907 5.701575 0.972857\n",
"2 8.581123 5.059241 0.974286\n",
"3 7.486752 5.769173 0.973036\n",
"4 7.644667 5.623410 0.974464\n",
"5 8.568252 5.032640 0.975179\n",
"6 7.305880 5.804784 0.975893\n",
"7 7.243364 5.869418 0.975357\n",
"8 8.568058 5.175154 0.976607\n",
"9 8.545392 5.035311 0.975536\n"
]
}
],
"source": [
"# MaxAbsScaler -> PCA(n_components=n95_components) -> kNN\n",
"names.append('maxabs+pca95+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('maxabs', MaxAbsScaler()),\n",
"        ('pca95', PCA(n_components=n95_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=7))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "17934567",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: maxabs+pca99+knn (97.3643%) (7.451)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.97 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.96 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.95 0.97 1411\n",
" 9 0.96 0.97 0.97 1449\n",
"\n",
" accuracy 0.97 14000\n",
" macro avg 0.97 0.97 0.97 14000\n",
"weighted avg 0.97 0.97 0.97 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 7.066843 6.724576 0.972679\n",
"1 8.163119 5.973942 0.971429\n",
"2 7.298925 6.530500 0.972143\n",
"3 8.869127 5.643595 0.970179\n",
"4 8.898886 5.403235 0.972679\n",
"5 7.828445 6.313535 0.973214\n",
"6 8.803470 5.529734 0.975536\n",
"7 8.294126 5.839177 0.972857\n",
"8 7.450544 6.585145 0.975714\n",
"9 7.483754 6.628288 0.974286\n"
]
}
],
"source": [
"# MaxAbsScaler -> PCA(n_components=n99_components) -> kNN\n",
"names.append('maxabs+pca99+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('maxabs', MaxAbsScaler()),\n",
"        ('pca99', PCA(n_components=n99_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=8))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "88fb14a4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n",
" loglike = -n_samples / 2 * np.log(x_trans.var())\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: power+pca95+knn (94.9786%) (39.14)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.99 0.98 1359\n",
" 1 0.98 0.98 0.98 1594\n",
" 2 0.95 0.94 0.95 1369\n",
" 3 0.94 0.94 0.94 1415\n",
" 4 0.96 0.93 0.95 1373\n",
" 5 0.94 0.91 0.93 1257\n",
" 6 0.95 0.98 0.96 1351\n",
" 7 0.94 0.95 0.94 1422\n",
" 8 0.96 0.92 0.94 1411\n",
" 9 0.92 0.94 0.93 1449\n",
"\n",
" accuracy 0.95 14000\n",
" macro avg 0.95 0.95 0.95 14000\n",
"weighted avg 0.95 0.95 0.95 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 38.802822 5.871142 0.946429\n",
"1 37.150519 6.993097 0.945357\n",
"2 37.138519 7.017011 0.948929\n",
"3 38.357648 6.108760 0.945714\n",
"4 37.175505 7.018548 0.949107\n",
"5 37.249800 6.882046 0.945357\n",
"6 37.172034 6.747881 0.949643\n",
"7 38.738593 5.878512 0.948571\n",
"8 39.135663 5.579420 0.956607\n",
"9 37.060059 6.904209 0.949821\n"
]
}
],
"source": [
"# PowerTransformer -> PCA(n_components=n95_components) -> kNN\n",
"names.append('power+pca95+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('power', PowerTransformer()),\n",
"        ('pca95', PCA(n_components=n95_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"# author's notes: \"likes to die\"; presumably from an earlier run: power+pca95+knn (94.3714%)\n",
"accuracies.append(cv(num=9, cv_arg=10))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "378c092b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n",
" loglike = -n_samples / 2 * np.log(x_trans.var())\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: power+pca99+knn (94.9786%) (37.94)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.97 0.99 0.98 1359\n",
" 1 0.98 0.98 0.98 1594\n",
" 2 0.95 0.94 0.95 1369\n",
" 3 0.94 0.94 0.94 1415\n",
" 4 0.96 0.93 0.95 1373\n",
" 5 0.94 0.91 0.93 1257\n",
" 6 0.95 0.98 0.96 1351\n",
" 7 0.94 0.95 0.94 1422\n",
" 8 0.96 0.92 0.94 1411\n",
" 9 0.92 0.94 0.93 1449\n",
"\n",
" accuracy 0.95 14000\n",
" macro avg 0.95 0.95 0.95 14000\n",
"weighted avg 0.95 0.95 0.95 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 39.081897 5.799132 0.946429\n",
"1 39.650227 5.733288 0.945357\n",
"2 36.740629 6.834802 0.948929\n",
"3 39.349449 5.738063 0.945714\n",
"4 38.051169 6.654780 0.949107\n",
"5 38.139323 6.575838 0.945357\n",
"6 36.967650 6.827009 0.949643\n",
"7 37.961230 6.632579 0.948571\n",
"8 37.944791 6.445367 0.956607\n",
"9 39.280427 5.751522 0.949821\n"
]
}
],
"source": [
"# PowerTransformer -> PCA(n_components=n99_components) -> kNN\n",
"names.append('power+pca99+knn')\n",
"classifiers.append(Pipeline([\n",
"    ('power', PowerTransformer()),\n",
"    # This step previously passed n95_components, which made the pipeline a\n",
"    # duplicate of 'power+pca95+knn' despite its 'pca99' label.\n",
"    ('pca99', PCA(n_components=n99_components)),\n",
"    ('knn', op_kNN)\n",
"]))\n",
"\n",
"# author's notes: \"likes to die\"; presumably from an earlier run: power+pca99+knn (94.6429%)\n",
"accuracies.append(cv(10,10))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "3005da1d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: bin+pca95+knn (96.8286%) (8.674)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.99 0.99 1359\n",
" 1 0.96 0.99 0.98 1594\n",
" 2 0.98 0.97 0.98 1369\n",
" 3 0.96 0.96 0.96 1415\n",
" 4 0.98 0.95 0.97 1373\n",
" 5 0.97 0.94 0.95 1257\n",
" 6 0.97 0.99 0.98 1351\n",
" 7 0.97 0.97 0.97 1422\n",
" 8 0.97 0.95 0.96 1411\n",
" 9 0.94 0.97 0.95 1449\n",
"\n",
" accuracy 0.97 14000\n",
" macro avg 0.97 0.97 0.97 14000\n",
"weighted avg 0.97 0.97 0.97 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 8.605724 5.490393 0.967679\n",
"1 7.290196 6.624226 0.969107\n",
"2 7.169216 6.591240 0.968571\n",
"3 7.225861 6.645801 0.965357\n",
"4 7.196922 6.595732 0.969643\n",
"5 7.294749 6.646243 0.966429\n",
"6 8.720850 5.419779 0.969821\n",
"7 8.711336 5.597409 0.967679\n",
"8 7.221519 6.624207 0.970536\n",
"9 8.673788 5.737247 0.970714\n"
]
}
],
"source": [
"# Binarizer -> PCA(n_components=n95_components) -> kNN\n",
"names.append('bin+pca95+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('bin', Binarizer()),\n",
"        ('pca95', PCA(n_components=n95_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=11, cv_arg=10))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "cbf8e245",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: bin+pca99+knn (96.6571%) (7.146)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.99 0.98 1359\n",
" 1 0.96 0.99 0.98 1594\n",
" 2 0.98 0.97 0.97 1369\n",
" 3 0.96 0.96 0.96 1415\n",
" 4 0.98 0.95 0.97 1373\n",
" 5 0.97 0.94 0.95 1257\n",
" 6 0.96 0.99 0.97 1351\n",
" 7 0.97 0.97 0.97 1422\n",
" 8 0.97 0.94 0.96 1411\n",
" 9 0.93 0.97 0.95 1449\n",
"\n",
" accuracy 0.97 14000\n",
" macro avg 0.97 0.97 0.97 14000\n",
"weighted avg 0.97 0.97 0.97 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 6.961178 6.772386 0.965536\n",
"1 8.445867 6.193574 0.967143\n",
"2 9.256319 5.771667 0.966607\n",
"3 7.188147 6.795888 0.965179\n",
"4 9.336459 5.739643 0.966964\n",
"5 7.107809 6.823560 0.964821\n",
"6 9.127543 5.848893 0.968393\n",
"7 9.106971 5.873384 0.966429\n",
"8 7.145829 6.698655 0.969107\n",
"9 8.202323 6.141933 0.968750\n"
]
}
],
"source": [
"# Binarizer -> PCA(n_components=n99_components) -> kNN\n",
"names.append('bin+pca99+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('bin', Binarizer()),\n",
"        ('pca99', PCA(n_components=n99_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=12))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cc1c7c77",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: pca95+knn (97.5500%) (7.416)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.98 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.97 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.95 0.97 1411\n",
" 9 0.96 0.97 0.97 1449\n",
"\n",
" accuracy 0.98 14000\n",
" macro avg 0.98 0.98 0.98 14000\n",
"weighted avg 0.98 0.98 0.98 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 7.440297 5.405928 0.973750\n",
"1 7.141649 5.609115 0.972857\n",
"2 6.829969 5.687647 0.974286\n",
"3 8.291485 5.057073 0.973036\n",
"4 8.281738 4.872021 0.974464\n",
"5 6.882516 5.804941 0.975179\n",
"6 8.275357 5.190776 0.975893\n",
"7 8.244917 5.013079 0.975357\n",
"8 7.416139 5.446140 0.976607\n",
"9 6.752497 5.906509 0.975536\n"
]
}
],
"source": [
"# PCA(n_components=n95_components) -> kNN, without a scaler\n",
"names.append('pca95+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('pca95', PCA(n_components=n95_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=13))"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "562c937f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline: pca99+knn (97.3643%) (6.88)\n",
" precision recall f1-score support\n",
"\n",
" 0 0.98 1.00 0.99 1359\n",
" 1 0.97 0.99 0.98 1594\n",
" 2 0.98 0.96 0.97 1369\n",
" 3 0.97 0.97 0.97 1415\n",
" 4 0.98 0.97 0.97 1373\n",
" 5 0.96 0.96 0.96 1257\n",
" 6 0.98 0.99 0.98 1351\n",
" 7 0.97 0.98 0.97 1422\n",
" 8 0.99 0.95 0.97 1411\n",
" 9 0.96 0.97 0.97 1449\n",
"\n",
" accuracy 0.97 14000\n",
" macro avg 0.97 0.97 0.97 14000\n",
"weighted avg 0.97 0.97 0.97 14000\n",
"\n",
" fit_time score_time test_score\n",
"0 8.113361 5.777437 0.972857\n",
"1 6.950958 6.503708 0.971429\n",
"2 8.775186 5.424367 0.972143\n",
"3 8.805392 5.562713 0.970179\n",
"4 6.980318 6.581474 0.972679\n",
"5 8.412863 5.525903 0.973214\n",
"6 7.188666 6.495338 0.975536\n",
"7 6.802638 6.664372 0.972857\n",
"8 6.879545 6.603721 0.975714\n",
"9 7.031381 6.631324 0.974286\n"
]
}
],
"source": [
"# PCA(n_components=n99_components) -> kNN, without a scaler\n",
"names.append('pca99+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('pca99', PCA(n_components=n99_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"accuracies.append(cv(num=14))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0c661938",
"metadata": {},
"outputs": [],
"source": [
"# KernelPCA -> PCA(n_components=n95_components) -> kNN\n",
"names.append('kpca+pca95+knn')\n",
"classifiers.append(Pipeline([\n",
"    ('kpca', KernelPCA(n_jobs=-1)),\n",
"    # This step previously passed n99_components, which made the pipeline a\n",
"    # duplicate of 'kpca+pca99+knn' despite its pca95 name.\n",
"    ('pca95', PCA(n_components=n95_components)),\n",
"    ('knn', op_kNN)\n",
"]))\n",
"\n",
"# Evaluation is disabled, so this pipeline gets an entry in `names` and\n",
"# `classifiers` but none in `accuracies`.\n",
"# accuracies.append(cv(15))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "05b7b881",
"metadata": {},
"outputs": [],
"source": [
"# KernelPCA -> PCA(n_components=n99_components) -> kNN\n",
"names.append('kpca+pca99+knn')\n",
"classifiers.append(\n",
"    Pipeline(steps=[\n",
"        ('kpca', KernelPCA(n_jobs=-1)),\n",
"        ('pca', PCA(n_components=n99_components)),\n",
"        ('knn', op_kNN),\n",
"    ])\n",
")\n",
"\n",
"# Evaluation is disabled, so this pipeline gets an entry in `names` and\n",
"# `classifiers` but none in `accuracies`.\n",
"# accuracies.append(cv(16))"
]
},
{
"cell_type": "markdown",
"id": "0a37b9d8",
"metadata": {},
"source": [
"# Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "480adf73",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Maximum accuracy (97.55%) for ['minmax+pca95+knn', 'maxabs+pca95+knn', 'pca95+knn']\n"
]
}
],
"source": [
"# Wrap the list in an array so the equality test is element-wise even when the\n",
"# stored accuracies are plain Python floats (list == float would just be False\n",
"# and no pipeline name would be reported).\n",
"print(f\"Maximum accuracy ({max(accuracies):.6}%) for {[names[n] for n in np.where(np.asarray(accuracies)==max(accuracies))[0]]}\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "202ff9a7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"name: knn (baseline) (97.3071%)\n",
"name: scalar+knn (94.8000%)\n",
"name: minmax+knn (97.3071%)\n",
"name: standard+pca95+knn (95.3929%)\n",
"name: minmax+pca95+knn (97.5500%)\n",
"name: standard+pca99+knn (94.9071%)\n",
"name: minmax+pca99+knn (97.3643%)\n",
"name: maxabs+pca95+knn (97.5500%)\n",
"name: maxabs+pca99+knn (97.3643%)\n",
"name: power+pca95+knn (94.9786%)\n",
"name: power+pca99+knn (94.9786%)\n",
"name: bin+pca95+knn (96.8286%)\n",
"name: bin+pca99+knn (96.6571%)\n",
"name: pca95+knn (97.5500%)\n",
"name: pca99+knn (97.3643%)\n"
]
}
],
"source": [
"for n, a in zip(names, accuracies):\n",
" print(f\"name: {n:20} ({a:.4f}%)\")"
]
},
{
"cell_type": "markdown",
"id": "94f1af95",
"metadata": {},
"source": [
"Default n=3\\\n",
"name: knn (baseline) (97.2143%)\\\n",
"name: scalar+knn (94.6286%)\\\n",
"name: minmax+knn (97.2143%)\\\n",
"name: standard+pca95+knn (95.1429%)\\\n",
"name: minmax+pca95+knn (97.4357%)\\\n",
"name: standard+pca99+knn (94.7214%)\\\n",
"name: minmax+pca99+knn (97.2714%)\\\n",
"name: maxabs+pca95+knn (97.4357%)\\\n",
"name: maxabs+pca99+knn (97.2714%)\\\n",
"name: power+pca95+knn (94.8071%)\\\n",
"name: power+pca99+knn (94.8071%)\\\n",
"name: bin+pca95+knn (96.6643%)\\\n",
"name: bin+pca99+knn (96.5500%)\\\n",
"name: pca95+knn (97.4357%)\\\n",
"name: pca99+knn (97.2714%)\\"
]
},
{
"cell_type": "markdown",
"id": "99ad8309",
"metadata": {},
"source": [
"n=3, metric=euclidean, weights=distance\\\n",
"name: knn (baseline) (97.3071%)\\\n",
"name: scalar+knn (94.8000%)\\\n",
"name: minmax+knn (97.3071%)\\\n",
"name: standard+pca95+knn (95.3929%)\\\n",
"name: minmax+pca95+knn (97.5500%)\\\n",
"name: standard+pca99+knn (94.9071%)\\\n",
"name: minmax+pca99+knn (97.3643%)\\\n",
"name: maxabs+pca95+knn (97.5500%)\\\n",
"name: maxabs+pca99+knn (97.3643%)\\\n",
"name: power+pca95+knn (94.9786%)\\\n",
"name: power+pca99+knn (94.9786%)\\\n",
"name: bin+pca95+knn (96.8286%)\\\n",
"name: bin+pca99+knn (96.6571%)\\\n",
"name: pca95+knn (97.5500%)\\\n",
"name: pca99+knn (97.3643%)\\\n"
]
},
{
"cell_type": "markdown",
"id": "497d3216",
"metadata": {},
"source": [
"# Hyper Parameter Optimization"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "e5e0c930",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 3 folds for each of 32 candidates, totalling 96 fits\n"
]
}
],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"# The search is expensive (32 candidates x 3 folds = 96 fits), so it only runs on request.\n",
"# Set to True to recompute `gs_results`; earlier results were written to ./gs_result.csv.\n",
"RUN_GRID_SEARCH = False\n",
"\n",
"grid_params = {\n",
"    'n_neighbors': [3, 5, 7, 11],\n",
"    'weights': ['uniform', 'distance'],\n",
"    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],\n",
"}\n",
"\n",
"gs = GridSearchCV(\n",
"    KNeighborsClassifier(n_jobs=-1),\n",
"    grid_params,\n",
"    cv=3,\n",
"    verbose=1,\n",
"    n_jobs=-1,\n",
")\n",
"\n",
"if RUN_GRID_SEARCH:\n",
"    gs_results = gs.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "41349c36",
"metadata": {},
"outputs": [],
"source": [
"# `gs_results` only exists once the grid search has been run in this kernel;\n",
"# skip the export instead of failing with a NameError on a fresh run.\n",
"if 'gs_results' in globals():\n",
"    pd.DataFrame.from_dict(gs_results.cv_results_).sort_values('rank_test_score').to_csv('./gs_result.csv')\n",
"else:\n",
"    print('gs_results is not defined - run the grid search first; nothing exported')"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "91d2f4bc",
"metadata": {},
"outputs": [],
"source": [
"# `gs_results` only exists once the grid search has been run in this kernel;\n",
"# skip the export instead of failing with a NameError on a fresh run.\n",
"if 'gs_results' in globals():\n",
"    pd.DataFrame.from_dict(gs_results.cv_results_)[['rank_test_score','mean_fit_time','param_n_neighbors', 'param_metric', 'param_weights', 'mean_test_score']].to_csv('./gs_result_filtered.csv')\n",
"else:\n",
"    print('gs_results is not defined - run the grid search first; nothing exported')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c031b179",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}