1320 lines
43 KiB
Plaintext
1320 lines
43 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "879144d9",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"### Load MNIST dataset"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"id": "bd032860",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Python ≥3.5 is required\n",
|
||
|
"import sys\n",
|
||
|
"assert sys.version_info >= (3, 5)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 2,
|
||
|
"id": "30da011c",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# scikit-learn ≥0.20 is required\n",
|
||
|
"import sklearn\n",
|
||
|
"assert sklearn.__version__ >= \"0.20\""
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 3,
|
||
|
"id": "4f555050",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# common imports\n",
|
||
|
"import numpy as np"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 4,
|
||
|
"id": "e4de4331",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": [
|
||
|
"sklearn.utils.Bunch"
|
||
|
]
|
||
|
},
|
||
|
"execution_count": 4,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# import function to scikit-learn datasets\n",
|
||
|
"from sklearn.datasets import fetch_openml\n",
|
||
|
"\n",
|
||
|
"# load specified dataset (MNIST)\n",
|
||
|
"mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n",
|
||
|
"\n",
|
||
|
"# print type of dataset\n",
|
||
|
"type(mnist)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 5,
|
||
|
"id": "b5221963",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"X, y = mnist[\"data\"], mnist[\"target\"]"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "811db75a",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"### Convert labels to int"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 6,
|
||
|
"id": "2bcc19ad",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# import plotting libraries\n",
|
||
|
"import matplotlib as mpl\n",
|
||
|
"import matplotlib.pyplot as plt"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 7,
|
||
|
"id": "cc4b728f",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# convert string labels to int\n",
|
||
|
"y = y.astype(np.uint8)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "d7113df3",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"### Prepare data for machine learning"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "570f328e",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"### Identify Train Set and Test Set"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 8,
|
||
|
"id": "80e1ca03",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"X_train: 56000, (56000, 784)\n",
|
||
|
"X_test: 14000, (14000, 784)\n",
|
||
|
"y_train: 56000, (56000,)\n",
|
||
|
"y_test: 14000, (14000,)\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"\n",
|
||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=177013)\n",
|
||
|
"\n",
|
||
|
"print(f\"X_train: {len(X_train)}, {X_train.shape}\")\n",
|
||
|
"print(f\"X_test: {len(X_test)}, {X_test.shape}\")\n",
|
||
|
"print(f\"y_train: {len(y_train)}, {y_train.shape}\")\n",
|
||
|
"print(f\"y_test: {len(y_test)}, {y_test.shape}\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "ade8a1f6",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"## Pipeline Declaration"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 9,
|
||
|
"id": "bc5896c2",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"from sklearn.pipeline import Pipeline\n",
|
||
|
"from sklearn.decomposition import PCA, KernelPCA\n",
|
||
|
"from sklearn.preprocessing import (StandardScaler, \n",
|
||
|
" MinMaxScaler, \n",
|
||
|
" MaxAbsScaler,\n",
|
||
|
" PowerTransformer,\n",
|
||
|
" Binarizer)\n",
|
||
|
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||
|
"from sklearn.model_selection import cross_validate\n",
|
||
|
"from sklearn.metrics import classification_report, accuracy_score\n",
|
||
|
"\n",
|
||
|
"n_neighbors = 3\n",
|
||
|
"metric = 'euclidean'\n",
|
||
|
"weights = 'distance'\n",
|
||
|
"n95_components = 0.95\n",
|
||
|
"n99_components = 0.99\n",
|
||
|
"\n",
|
||
|
"names = []\n",
|
||
|
"classifiers = []\n",
|
||
|
"accuracies = []"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "9e905584",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Cross-validation"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 10,
|
||
|
"id": "bbbb447c",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# not this\n",
|
||
|
"def cv_train(num,cv):\n",
|
||
|
" name = names[num]\n",
|
||
|
" clf = classifiers[num]\n",
|
||
|
" y_train_pred = cross_val_predict(clf, X_train, y_train, cv=cv, n_jobs=-1)\n",
|
||
|
" accuracy = accuracy_score(y_train, y_train_pred, normalize=True)*100\n",
|
||
|
" print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
|
||
|
" print(classification_report(y_train, y_train_pred))\n",
|
||
|
" return accuracy"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 11,
|
||
|
"id": "4a8240c4",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# also not this\n",
|
||
|
"def cv_test(num):\n",
|
||
|
" name = names[num]\n",
|
||
|
" clf = classifiers[num]\n",
|
||
|
" y_test_pred = cross_val_predict(clf, X_test, y_test, cv=5, n_jobs=-1)\n",
|
||
|
" accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100\n",
|
||
|
" print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
|
||
|
" print(classification_report(y_test, y_test_pred))\n",
|
||
|
" return accuracy"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 12,
|
||
|
"id": "f397cf42",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"\n",
|
||
|
"# this\n",
|
||
|
"def cv(num,cv_arg=10):\n",
|
||
|
" name = names[num]\n",
|
||
|
" clf = classifiers[num]\n",
|
||
|
" clf = clf.fit(X_train, y_train)\n",
|
||
|
" cv = cross_validate(clf, X_train, y_train, cv=cv_arg, n_jobs=-1)\n",
|
||
|
" # cv_clf = cv['estimator'][np.argmax(cv['test_score'])] # get the estimator where the max(test_score) on the cross validation\n",
|
||
|
" y_test_pred = clf.predict(X_test)\n",
|
||
|
" accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100\n",
|
||
|
" print(f\"Pipeline: {name} ({accuracy:.4f}%) ({cv['fit_time'][np.argmax(cv['test_score'])]:.4})\")\n",
|
||
|
" print(classification_report(y_test, y_test_pred))\n",
|
||
|
" print(pd.DataFrame.from_dict(cv))\n",
|
||
|
" return accuracy"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "a543706f",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Fitting"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 13,
|
||
|
"id": "45452ceb",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"op_kNN = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1, metric=metric, weights=weights)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 14,
|
||
|
"id": "03c01cd0",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: knn (baseline) (97.3071%) (0.1162)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.97 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.96 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.94 0.96 1411\n",
|
||
|
" 9 0.96 0.97 0.96 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.97 14000\n",
|
||
|
" macro avg 0.97 0.97 0.97 14000\n",
|
||
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 0.125081 9.229141 0.972143\n",
|
||
|
"1 0.123761 8.918457 0.971429\n",
|
||
|
"2 0.125360 9.193326 0.971607\n",
|
||
|
"3 0.119904 8.944645 0.970714\n",
|
||
|
"4 0.244280 9.182126 0.972143\n",
|
||
|
"5 0.233787 9.137941 0.972857\n",
|
||
|
"6 0.123714 9.257920 0.975179\n",
|
||
|
"7 0.116261 9.135718 0.972500\n",
|
||
|
"8 0.116210 9.125259 0.975357\n",
|
||
|
"9 0.141345 9.308920 0.973393\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('knn (baseline)')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
" \n",
|
||
|
"accuracies.append(cv(0))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 15,
|
||
|
"id": "18f02d0c",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: scalar+knn (94.8000%) (1.146)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.97 0.99 0.98 1359\n",
|
||
|
" 1 0.97 0.99 0.98 1594\n",
|
||
|
" 2 0.95 0.93 0.94 1369\n",
|
||
|
" 3 0.94 0.95 0.95 1415\n",
|
||
|
" 4 0.94 0.93 0.94 1373\n",
|
||
|
" 5 0.93 0.91 0.92 1257\n",
|
||
|
" 6 0.96 0.97 0.97 1351\n",
|
||
|
" 7 0.94 0.95 0.94 1422\n",
|
||
|
" 8 0.96 0.91 0.93 1411\n",
|
||
|
" 9 0.92 0.94 0.93 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.95 14000\n",
|
||
|
" macro avg 0.95 0.95 0.95 14000\n",
|
||
|
"weighted avg 0.95 0.95 0.95 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 0.758668 8.452150 0.949643\n",
|
||
|
"1 0.885169 8.631167 0.945000\n",
|
||
|
"2 1.160197 8.948529 0.949107\n",
|
||
|
"3 1.353553 8.502009 0.942500\n",
|
||
|
"4 0.912583 9.036203 0.948393\n",
|
||
|
"5 0.833251 9.013008 0.945179\n",
|
||
|
"6 0.886441 8.995505 0.943214\n",
|
||
|
"7 0.935067 8.891803 0.946607\n",
|
||
|
"8 1.146215 8.633170 0.951607\n",
|
||
|
"9 1.234975 8.846080 0.947143\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('scalar+knn') \n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('standard', StandardScaler()),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(1,10)) # Pipeline: standard+knn (94.3714%)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 16,
|
||
|
"id": "b2e7ee09",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: minmax+knn (97.3071%) (0.5789)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.97 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.96 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.94 0.96 1411\n",
|
||
|
" 9 0.96 0.97 0.96 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.97 14000\n",
|
||
|
" macro avg 0.97 0.97 0.97 14000\n",
|
||
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 0.389049 8.747383 0.972143\n",
|
||
|
"1 0.385594 9.063268 0.971429\n",
|
||
|
"2 0.527658 8.926810 0.971786\n",
|
||
|
"3 0.564983 8.949160 0.970714\n",
|
||
|
"4 0.607976 9.087395 0.972143\n",
|
||
|
"5 0.689083 8.948745 0.972857\n",
|
||
|
"6 0.578933 8.950404 0.975179\n",
|
||
|
"7 0.647489 9.248726 0.972679\n",
|
||
|
"8 0.634490 9.124195 0.975179\n",
|
||
|
"9 0.619997 8.861808 0.973393\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('minmax+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('minmax', MinMaxScaler()),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(2))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 17,
|
||
|
"id": "23ae34c3",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: standard+pca95+knn (95.3929%) (9.151)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.97 0.99 0.98 1359\n",
|
||
|
" 1 0.98 0.99 0.98 1594\n",
|
||
|
" 2 0.96 0.94 0.95 1369\n",
|
||
|
" 3 0.94 0.96 0.95 1415\n",
|
||
|
" 4 0.95 0.94 0.95 1373\n",
|
||
|
" 5 0.94 0.92 0.93 1257\n",
|
||
|
" 6 0.96 0.97 0.97 1351\n",
|
||
|
" 7 0.95 0.96 0.95 1422\n",
|
||
|
" 8 0.97 0.92 0.94 1411\n",
|
||
|
" 9 0.92 0.94 0.93 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.95 14000\n",
|
||
|
" macro avg 0.95 0.95 0.95 14000\n",
|
||
|
"weighted avg 0.95 0.95 0.95 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 7.882113 6.468349 0.953036\n",
|
||
|
"1 8.902088 5.919568 0.950357\n",
|
||
|
"2 8.750770 5.959131 0.953571\n",
|
||
|
"3 7.767436 6.355160 0.945179\n",
|
||
|
"4 9.307496 5.559806 0.952500\n",
|
||
|
"5 7.845258 6.534233 0.950000\n",
|
||
|
"6 8.874461 5.844108 0.948214\n",
|
||
|
"7 7.704501 6.488686 0.950714\n",
|
||
|
"8 9.151180 5.622928 0.955893\n",
|
||
|
"9 7.257694 6.649508 0.953571\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('standard+pca95+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('standard', StandardScaler()),\n",
|
||
|
" ('pca95', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(3,10)) # Pipeline: standard+pca95+knn (95.0500%)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 18,
|
||
|
"id": "cac23616",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: minmax+pca95+knn (97.5500%) (8.162)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.98 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.97 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.95 0.97 1411\n",
|
||
|
" 9 0.96 0.97 0.97 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.98 14000\n",
|
||
|
" macro avg 0.98 0.98 0.98 14000\n",
|
||
|
"weighted avg 0.98 0.98 0.98 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 8.170779 5.294063 0.973750\n",
|
||
|
"1 8.088926 5.365682 0.972857\n",
|
||
|
"2 6.857623 5.776273 0.974286\n",
|
||
|
"3 7.262239 5.806243 0.973036\n",
|
||
|
"4 8.057861 5.374735 0.974464\n",
|
||
|
"5 7.276100 5.745563 0.975179\n",
|
||
|
"6 8.202060 5.424418 0.975893\n",
|
||
|
"7 8.114796 5.319392 0.975357\n",
|
||
|
"8 8.162190 5.424424 0.976607\n",
|
||
|
"9 7.437983 5.727555 0.975536\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('minmax+pca95+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('minmax', MinMaxScaler()),\n",
|
||
|
" ('pca95', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(4))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 19,
|
||
|
"id": "a57eb660",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: standard+pca99+knn (94.9071%) (9.396)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.97 0.99 0.98 1359\n",
|
||
|
" 1 0.97 0.99 0.98 1594\n",
|
||
|
" 2 0.95 0.93 0.94 1369\n",
|
||
|
" 3 0.94 0.95 0.95 1415\n",
|
||
|
" 4 0.95 0.93 0.94 1373\n",
|
||
|
" 5 0.93 0.91 0.92 1257\n",
|
||
|
" 6 0.96 0.97 0.97 1351\n",
|
||
|
" 7 0.94 0.95 0.95 1422\n",
|
||
|
" 8 0.97 0.91 0.94 1411\n",
|
||
|
" 9 0.92 0.94 0.93 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.95 14000\n",
|
||
|
" macro avg 0.95 0.95 0.95 14000\n",
|
||
|
"weighted avg 0.95 0.95 0.95 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 9.473285 6.515933 0.949821\n",
|
||
|
"1 7.714788 7.746434 0.946786\n",
|
||
|
"2 7.546448 7.823940 0.950179\n",
|
||
|
"3 7.546017 7.754220 0.943036\n",
|
||
|
"4 7.534473 7.836074 0.948750\n",
|
||
|
"5 7.524864 7.952840 0.945893\n",
|
||
|
"6 7.699588 7.758791 0.946071\n",
|
||
|
"7 9.373224 6.639953 0.947679\n",
|
||
|
"8 9.395972 6.550251 0.953214\n",
|
||
|
"9 9.444257 6.667987 0.947679\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('standard+pca99+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('standard', StandardScaler()),\n",
|
||
|
" ('pca99', PCA(n_components=n99_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(5,10)) # Pipeline: standard+pca99+knn (94.5357%)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 20,
|
||
|
"id": "bcbedf38",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: minmax+pca99+knn (97.3643%) (9.134)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.97 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.96 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.95 0.97 1411\n",
|
||
|
" 9 0.96 0.97 0.97 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.97 14000\n",
|
||
|
" macro avg 0.97 0.97 0.97 14000\n",
|
||
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 8.213089 6.006415 0.972679\n",
|
||
|
"1 7.115110 6.500668 0.971429\n",
|
||
|
"2 7.381336 6.424851 0.972143\n",
|
||
|
"3 7.099746 6.442294 0.970179\n",
|
||
|
"4 9.174922 5.636195 0.972679\n",
|
||
|
"5 6.937273 6.640518 0.973214\n",
|
||
|
"6 8.361929 6.048856 0.975536\n",
|
||
|
"7 9.058427 5.660812 0.972857\n",
|
||
|
"8 9.134326 5.704015 0.975714\n",
|
||
|
"9 9.088895 5.631931 0.974286\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('minmax+pca99+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('minmax', MinMaxScaler()),\n",
|
||
|
" ('pca99', PCA(n_components=n99_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(6))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 21,
|
||
|
"id": "5bc4f44b",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: maxabs+pca95+knn (97.5500%) (8.568)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.98 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.97 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.95 0.97 1411\n",
|
||
|
" 9 0.96 0.97 0.97 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.98 14000\n",
|
||
|
" macro avg 0.98 0.98 0.98 14000\n",
|
||
|
"weighted avg 0.98 0.98 0.98 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 7.831842 5.514181 0.973750\n",
|
||
|
"1 7.772907 5.701575 0.972857\n",
|
||
|
"2 8.581123 5.059241 0.974286\n",
|
||
|
"3 7.486752 5.769173 0.973036\n",
|
||
|
"4 7.644667 5.623410 0.974464\n",
|
||
|
"5 8.568252 5.032640 0.975179\n",
|
||
|
"6 7.305880 5.804784 0.975893\n",
|
||
|
"7 7.243364 5.869418 0.975357\n",
|
||
|
"8 8.568058 5.175154 0.976607\n",
|
||
|
"9 8.545392 5.035311 0.975536\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('maxabs+pca95+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('maxabs', MaxAbsScaler()),\n",
|
||
|
" ('pca95', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(7))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 22,
|
||
|
"id": "a901ad5d",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: maxabs+pca99+knn (97.3643%) (7.451)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.97 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.96 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.95 0.97 1411\n",
|
||
|
" 9 0.96 0.97 0.97 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.97 14000\n",
|
||
|
" macro avg 0.97 0.97 0.97 14000\n",
|
||
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 7.066843 6.724576 0.972679\n",
|
||
|
"1 8.163119 5.973942 0.971429\n",
|
||
|
"2 7.298925 6.530500 0.972143\n",
|
||
|
"3 8.869127 5.643595 0.970179\n",
|
||
|
"4 8.898886 5.403235 0.972679\n",
|
||
|
"5 7.828445 6.313535 0.973214\n",
|
||
|
"6 8.803470 5.529734 0.975536\n",
|
||
|
"7 8.294126 5.839177 0.972857\n",
|
||
|
"8 7.450544 6.585145 0.975714\n",
|
||
|
"9 7.483754 6.628288 0.974286\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('maxabs+pca99+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('maxabs', MaxAbsScaler()),\n",
|
||
|
" ('pca99', PCA(n_components=n99_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(8))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 23,
|
||
|
"id": "19e87457",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n",
|
||
|
" loglike = -n_samples / 2 * np.log(x_trans.var())\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: power+pca95+knn (94.9786%) (39.14)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.97 0.99 0.98 1359\n",
|
||
|
" 1 0.98 0.98 0.98 1594\n",
|
||
|
" 2 0.95 0.94 0.95 1369\n",
|
||
|
" 3 0.94 0.94 0.94 1415\n",
|
||
|
" 4 0.96 0.93 0.95 1373\n",
|
||
|
" 5 0.94 0.91 0.93 1257\n",
|
||
|
" 6 0.95 0.98 0.96 1351\n",
|
||
|
" 7 0.94 0.95 0.94 1422\n",
|
||
|
" 8 0.96 0.92 0.94 1411\n",
|
||
|
" 9 0.92 0.94 0.93 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.95 14000\n",
|
||
|
" macro avg 0.95 0.95 0.95 14000\n",
|
||
|
"weighted avg 0.95 0.95 0.95 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 38.802822 5.871142 0.946429\n",
|
||
|
"1 37.150519 6.993097 0.945357\n",
|
||
|
"2 37.138519 7.017011 0.948929\n",
|
||
|
"3 38.357648 6.108760 0.945714\n",
|
||
|
"4 37.175505 7.018548 0.949107\n",
|
||
|
"5 37.249800 6.882046 0.945357\n",
|
||
|
"6 37.172034 6.747881 0.949643\n",
|
||
|
"7 38.738593 5.878512 0.948571\n",
|
||
|
"8 39.135663 5.579420 0.956607\n",
|
||
|
"9 37.060059 6.904209 0.949821\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('power+pca95+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('power', PowerTransformer()),\n",
|
||
|
" ('pca95', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(9,10)) # likes to die # Pipeline: power+pca95+knn (94.3714%)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 24,
|
||
|
"id": "6146ccb1",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n",
|
||
|
" loglike = -n_samples / 2 * np.log(x_trans.var())\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: power+pca99+knn (94.9786%) (37.94)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.97 0.99 0.98 1359\n",
|
||
|
" 1 0.98 0.98 0.98 1594\n",
|
||
|
" 2 0.95 0.94 0.95 1369\n",
|
||
|
" 3 0.94 0.94 0.94 1415\n",
|
||
|
" 4 0.96 0.93 0.95 1373\n",
|
||
|
" 5 0.94 0.91 0.93 1257\n",
|
||
|
" 6 0.95 0.98 0.96 1351\n",
|
||
|
" 7 0.94 0.95 0.94 1422\n",
|
||
|
" 8 0.96 0.92 0.94 1411\n",
|
||
|
" 9 0.92 0.94 0.93 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.95 14000\n",
|
||
|
" macro avg 0.95 0.95 0.95 14000\n",
|
||
|
"weighted avg 0.95 0.95 0.95 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 39.081897 5.799132 0.946429\n",
|
||
|
"1 39.650227 5.733288 0.945357\n",
|
||
|
"2 36.740629 6.834802 0.948929\n",
|
||
|
"3 39.349449 5.738063 0.945714\n",
|
||
|
"4 38.051169 6.654780 0.949107\n",
|
||
|
"5 38.139323 6.575838 0.945357\n",
|
||
|
"6 36.967650 6.827009 0.949643\n",
|
||
|
"7 37.961230 6.632579 0.948571\n",
|
||
|
"8 37.944791 6.445367 0.956607\n",
|
||
|
"9 39.280427 5.751522 0.949821\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('power+pca99+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('power', PowerTransformer()),\n",
|
||
|
" ('pca99', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(10,10)) # likes to die # Pipeline: power+pca99+knn (94.6429%)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 25,
|
||
|
"id": "66a7637b",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: bin+pca95+knn (96.8286%) (8.674)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 0.99 0.99 1359\n",
|
||
|
" 1 0.96 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.97 0.98 1369\n",
|
||
|
" 3 0.96 0.96 0.96 1415\n",
|
||
|
" 4 0.98 0.95 0.97 1373\n",
|
||
|
" 5 0.97 0.94 0.95 1257\n",
|
||
|
" 6 0.97 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.97 0.97 1422\n",
|
||
|
" 8 0.97 0.95 0.96 1411\n",
|
||
|
" 9 0.94 0.97 0.95 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.97 14000\n",
|
||
|
" macro avg 0.97 0.97 0.97 14000\n",
|
||
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 8.605724 5.490393 0.967679\n",
|
||
|
"1 7.290196 6.624226 0.969107\n",
|
||
|
"2 7.169216 6.591240 0.968571\n",
|
||
|
"3 7.225861 6.645801 0.965357\n",
|
||
|
"4 7.196922 6.595732 0.969643\n",
|
||
|
"5 7.294749 6.646243 0.966429\n",
|
||
|
"6 8.720850 5.419779 0.969821\n",
|
||
|
"7 8.711336 5.597409 0.967679\n",
|
||
|
"8 7.221519 6.624207 0.970536\n",
|
||
|
"9 8.673788 5.737247 0.970714\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('bin+pca95+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('bin', Binarizer()),\n",
|
||
|
" ('pca95', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(11,10))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 26,
|
||
|
"id": "d4fadaac",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: bin+pca99+knn (96.6571%) (7.146)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 0.99 0.98 1359\n",
|
||
|
" 1 0.96 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.97 0.97 1369\n",
|
||
|
" 3 0.96 0.96 0.96 1415\n",
|
||
|
" 4 0.98 0.95 0.97 1373\n",
|
||
|
" 5 0.97 0.94 0.95 1257\n",
|
||
|
" 6 0.96 0.99 0.97 1351\n",
|
||
|
" 7 0.97 0.97 0.97 1422\n",
|
||
|
" 8 0.97 0.94 0.96 1411\n",
|
||
|
" 9 0.93 0.97 0.95 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.97 14000\n",
|
||
|
" macro avg 0.97 0.97 0.97 14000\n",
|
||
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 6.961178 6.772386 0.965536\n",
|
||
|
"1 8.445867 6.193574 0.967143\n",
|
||
|
"2 9.256319 5.771667 0.966607\n",
|
||
|
"3 7.188147 6.795888 0.965179\n",
|
||
|
"4 9.336459 5.739643 0.966964\n",
|
||
|
"5 7.107809 6.823560 0.964821\n",
|
||
|
"6 9.127543 5.848893 0.968393\n",
|
||
|
"7 9.106971 5.873384 0.966429\n",
|
||
|
"8 7.145829 6.698655 0.969107\n",
|
||
|
"9 8.202323 6.141933 0.968750\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('bin+pca99+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('bin', Binarizer()),\n",
|
||
|
" ('pca99', PCA(n_components=n99_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(12))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 27,
|
||
|
"id": "d15fb11c",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: pca95+knn (97.5500%) (7.416)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.98 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.97 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.95 0.97 1411\n",
|
||
|
" 9 0.96 0.97 0.97 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.98 14000\n",
|
||
|
" macro avg 0.98 0.98 0.98 14000\n",
|
||
|
"weighted avg 0.98 0.98 0.98 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 7.440297 5.405928 0.973750\n",
|
||
|
"1 7.141649 5.609115 0.972857\n",
|
||
|
"2 6.829969 5.687647 0.974286\n",
|
||
|
"3 8.291485 5.057073 0.973036\n",
|
||
|
"4 8.281738 4.872021 0.974464\n",
|
||
|
"5 6.882516 5.804941 0.975179\n",
|
||
|
"6 8.275357 5.190776 0.975893\n",
|
||
|
"7 8.244917 5.013079 0.975357\n",
|
||
|
"8 7.416139 5.446140 0.976607\n",
|
||
|
"9 6.752497 5.906509 0.975536\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"names.append('pca95+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('pca95', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"accuracies.append(cv(13))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 28,
|
||
|
"id": "2db8577b",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Pipeline: pca99+knn (97.3643%) (6.88)\n",
|
||
|
" precision recall f1-score support\n",
|
||
|
"\n",
|
||
|
" 0 0.98 1.00 0.99 1359\n",
|
||
|
" 1 0.97 0.99 0.98 1594\n",
|
||
|
" 2 0.98 0.96 0.97 1369\n",
|
||
|
" 3 0.97 0.97 0.97 1415\n",
|
||
|
" 4 0.98 0.97 0.97 1373\n",
|
||
|
" 5 0.96 0.96 0.96 1257\n",
|
||
|
" 6 0.98 0.99 0.98 1351\n",
|
||
|
" 7 0.97 0.98 0.97 1422\n",
|
||
|
" 8 0.99 0.95 0.97 1411\n",
|
||
|
" 9 0.96 0.97 0.97 1449\n",
|
||
|
"\n",
|
||
|
" accuracy 0.97 14000\n",
|
||
|
" macro avg 0.97 0.97 0.97 14000\n",
|
||
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
||
|
"\n",
|
||
|
" fit_time score_time test_score\n",
|
||
|
"0 8.113361 5.777437 0.972857\n",
|
||
|
"1 6.950958 6.503708 0.971429\n",
|
||
|
"2 8.775186 5.424367 0.972143\n",
|
||
|
"3 8.805392 5.562713 0.970179\n",
|
||
|
"4 6.980318 6.581474 0.972679\n",
|
||
|
"5 8.412863 5.525903 0.973214\n",
|
||
|
"6 7.188666 6.495338 0.975536\n",
|
||
|
"7 6.802638 6.664372 0.972857\n",
|
||
|
"8 6.879545 6.603721 0.975714\n",
|
||
|
"9 7.031381 6.631324 0.974286\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Register the PCA + kNN pipeline under the label 'pca99+knn' and evaluate it.\n",
|
||
|
"names.append('pca99+knn')\n",
|
||
|
"classifiers.append(\n",
|
||
|
"    Pipeline(steps=[\n",
|
||
|
"        ('pca99', PCA(n_components=n99_components)),\n",
|
||
|
"        ('knn', op_kNN),\n",
|
||
|
"    ])\n",
|
||
|
")\n",
|
||
|
"\n",
|
||
|
"# The value returned by cv(14) is stored as this pipeline's accuracy\n",
|
||
|
"# (14 is presumably the pipeline's position in classifiers -- confirm against cv).\n",
|
||
|
"accuracies.append(cv(14))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 29,
|
||
|
"id": "a5702428",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Kernel PCA -> PCA -> kNN pipeline, registered under the label 'kpca+pca95+knn'.\n",
|
||
|
"# The PCA step uses n95_components so that the pipeline matches its 'pca95' label\n",
|
||
|
"# and does not duplicate the 'kpca+pca99+knn' entry.\n",
|
||
|
"names.append('kpca+pca95+knn')\n",
|
||
|
"classifiers.append(Pipeline([\n",
|
||
|
" ('kpca', KernelPCA(n_jobs=-1)),\n",
|
||
|
" ('pca', PCA(n_components=n95_components)),\n",
|
||
|
" ('knn', op_kNN)\n",
|
||
|
"]))\n",
|
||
|
"\n",
|
||
|
"# Cross-validation is disabled here, so no accuracy is appended for this pipeline.\n",
|
||
|
"# accuracies.append(cv(15))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 30,
|
||
|
"id": "0ee57cfe",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Kernel PCA -> PCA -> kNN pipeline, registered under the label 'kpca+pca99+knn'.\n",
|
||
|
"names.append('kpca+pca99+knn')\n",
|
||
|
"classifiers.append(\n",
|
||
|
"    Pipeline(steps=[\n",
|
||
|
"        ('kpca', KernelPCA(n_jobs=-1)),\n",
|
||
|
"        ('pca', PCA(n_components=n99_components)),\n",
|
||
|
"        ('knn', op_kNN),\n",
|
||
|
"    ])\n",
|
||
|
")\n",
|
||
|
"\n",
|
||
|
"# Cross-validation is disabled here, so no accuracy is appended for this pipeline.\n",
|
||
|
"# accuracies.append(cv(16))"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "7fbbc930",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Evaluation"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 31,
|
||
|
"id": "e5d609aa",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Maximum accuracy (97.55%) for ['minmax+pca95+knn', 'maxabs+pca95+knn', 'pca95+knn']\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Report the best accuracy and every pipeline that reaches it.\n",
|
||
|
"# names and accuracies are paired with zip, so names registered without an\n",
|
||
|
"# accuracy (the disabled kpca pipelines) are ignored. Comparing element by\n",
|
||
|
"# element avoids `list == scalar`, which is a single False for plain Python\n",
|
||
|
"# floats and would make the result silently empty.\n",
|
||
|
"best_accuracy = max(accuracies)\n",
|
||
|
"best_names = [name for name, accuracy in zip(names, accuracies) if accuracy == best_accuracy]\n",
|
||
|
"print(f\"Maximum accuracy ({best_accuracy:.6}%) for {best_names}\")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 38,
|
||
|
"id": "234f14bb",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"name: knn (baseline) (97.3071%)\n",
|
||
|
"name: scalar+knn (94.8000%)\n",
|
||
|
"name: minmax+knn (97.3071%)\n",
|
||
|
"name: standard+pca95+knn (95.3929%)\n",
|
||
|
"name: minmax+pca95+knn (97.5500%)\n",
|
||
|
"name: standard+pca99+knn (94.9071%)\n",
|
||
|
"name: minmax+pca99+knn (97.3643%)\n",
|
||
|
"name: maxabs+pca95+knn (97.5500%)\n",
|
||
|
"name: maxabs+pca99+knn (97.3643%)\n",
|
||
|
"name: power+pca95+knn (94.9786%)\n",
|
||
|
"name: power+pca99+knn (94.9786%)\n",
|
||
|
"name: bin+pca95+knn (96.8286%)\n",
|
||
|
"name: bin+pca99+knn (96.6571%)\n",
|
||
|
"name: pca95+knn (97.5500%)\n",
|
||
|
"name: pca99+knn (97.3643%)\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# One summary line per evaluated pipeline: label padded to 20 characters,\n",
|
||
|
"# accuracy printed with four decimals.\n",
|
||
|
"for label, score in zip(names, accuracies):\n",
|
||
|
"    summary = f\"name: {label:20} ({score:.4f}%)\"\n",
|
||
|
"    print(summary)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "a6ddb6f2",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"Default n=3\\\n",
|
||
|
"name: knn (baseline) (97.2143%)\\\n",
|
||
|
"name: scalar+knn (94.6286%)\\\n",
|
||
|
"name: minmax+knn (97.2143%)\\\n",
|
||
|
"name: standard+pca95+knn (95.1429%)\\\n",
|
||
|
"name: minmax+pca95+knn (97.4357%)\\\n",
|
||
|
"name: standard+pca99+knn (94.7214%)\\\n",
|
||
|
"name: minmax+pca99+knn (97.2714%)\\\n",
|
||
|
"name: maxabs+pca95+knn (97.4357%)\\\n",
|
||
|
"name: maxabs+pca99+knn (97.2714%)\\\n",
|
||
|
"name: power+pca95+knn (94.8071%)\\\n",
|
||
|
"name: power+pca99+knn (94.8071%)\\\n",
|
||
|
"name: bin+pca95+knn (96.6643%)\\\n",
|
||
|
"name: bin+pca99+knn (96.5500%)\\\n",
|
||
|
"name: pca95+knn (97.4357%)\\\n",
|
||
|
"name: pca99+knn (97.2714%)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "bde6e847",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"n=3 euclid distance\\\n",
|
||
|
"name: knn (baseline) (97.3071%)\\\n",
|
||
|
"name: scalar+knn (94.8000%)\\\n",
|
||
|
"name: minmax+knn (97.3071%)\\\n",
|
||
|
"name: standard+pca95+knn (95.3929%)\\\n",
|
||
|
"name: minmax+pca95+knn (97.5500%)\\\n",
|
||
|
"name: standard+pca99+knn (94.9071%)\\\n",
|
||
|
"name: minmax+pca99+knn (97.3643%)\\\n",
|
||
|
"name: maxabs+pca95+knn (97.5500%)\\\n",
|
||
|
"name: maxabs+pca99+knn (97.3643%)\\\n",
|
||
|
"name: power+pca95+knn (94.9786%)\\\n",
|
||
|
"name: power+pca99+knn (94.9786%)\\\n",
|
||
|
"name: bin+pca95+knn (96.8286%)\\\n",
|
||
|
"name: bin+pca99+knn (96.6571%)\\\n",
|
||
|
"name: pca95+knn (97.5500%)\\\n",
|
||
|
"name: pca99+knn (97.3643%)"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"id": "4c625bc3",
|
||
|
"metadata": {},
|
||
|
"source": [
|
||
|
"# Hyperparameter Optimization"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 34,
|
||
|
"id": "24ff7ea2",
|
||
|
"metadata": {},
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Fitting 3 folds for each of 32 candidates, totalling 96 fits\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# from sklearn.model_selection import GridSearchCV\n",
|
||
|
"\n",
|
||
|
"# grid_params = {\n",
|
||
|
"# 'n_neighbors': [3, 5, 7 , 11],\n",
|
||
|
"# 'weights': ['uniform', 'distance'],\n",
|
||
|
"# 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],\n",
|
||
|
"# }\n",
|
||
|
"\n",
|
||
|
"# gs = GridSearchCV(\n",
|
||
|
"# KNeighborsClassifier(n_jobs=-1),\n",
|
||
|
"# grid_params,\n",
|
||
|
"# cv = 3,\n",
|
||
|
"# verbose=1,\n",
|
||
|
"# n_jobs = -1\n",
|
||
|
"# )\n",
|
||
|
"\n",
|
||
|
"# gs_results = gs.fit(X_train, y_train)\n",
|
||
|
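"# Results are written to ./gs_result.csv and ./gs_result_filtered.csv by the next two cells (they need gs_results from this search)."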
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 36,
|
||
|
"id": "b3b0eac3",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Export the complete grid-search results, sorted by rank_test_score.\n",
|
||
|
"(\n",
|
||
|
"    pd.DataFrame.from_dict(gs_results.cv_results_)\n",
|
||
|
"    .sort_values('rank_test_score')\n",
|
||
|
"    .to_csv('./gs_result.csv')\n",
|
||
|
")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 37,
|
||
|
"id": "b68589fe",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Export only the selected result columns; row order is left as returned by the search.\n",
|
||
|
"(\n",
|
||
|
"    pd.DataFrame.from_dict(gs_results.cv_results_)[[\n",
|
||
|
"        'rank_test_score',\n",
|
||
|
"        'mean_fit_time',\n",
|
||
|
"        'param_n_neighbors',\n",
|
||
|
"        'param_metric',\n",
|
||
|
"        'param_weights',\n",
|
||
|
"        'mean_test_score',\n",
|
||
|
"    ]]\n",
|
||
|
"    .to_csv('./gs_result_filtered.csv')\n",
|
||
|
")"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"id": "b162e908",
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": []
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.8.5"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|