838 lines
26 KiB
Plaintext
838 lines
26 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "03d83636",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Load MNIST dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "242d14f0",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Python ≥3.5 is required\n",
|
|
"import sys\n",
|
|
"assert sys.version_info >= (3, 5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "cfd3a54a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# scikit-learn ≥0.20 is required\n",
"import re\n",
"\n",
"import sklearn\n",
"\n",
"# Compare the numeric (major, minor) parts rather than the raw strings: string\n",
"# comparison is lexicographic, so e.g. \"0.9\" >= \"0.20\" would wrongly pass.\n",
"sklearn_version = tuple(int(part) for part in re.match(r\"(\\d+)\\.(\\d+)\", sklearn.__version__).groups())\n",
"assert sklearn_version >= (0, 20), f\"scikit-learn >= 0.20 is required, found {sklearn.__version__}\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "7e1587b3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# common imports\n",
|
|
"import numpy as np"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "bbccfc32",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"sklearn.utils.Bunch"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# import function to scikit-learn datasets\n",
|
|
"from sklearn.datasets import fetch_openml\n",
|
|
"\n",
|
|
"# load specified dataset (MNIST)\n",
|
|
"mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n",
|
|
"\n",
|
|
"# print type of dataset\n",
|
|
"type(mnist)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "6db7c96a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X, y = mnist[\"data\"], mnist[\"target\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "459780d0",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Fix labels"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "48c4e861",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import plotting libraries\n",
|
|
"import matplotlib as mpl\n",
|
|
"import matplotlib.pyplot as plt"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "72876ab1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# convert string labels to int\n",
|
|
"y = y.astype(np.uint8)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c9dacae4",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Prepare data for machine learning"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b44b3f87",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Identify Train Set and Test Set"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "51c5da44",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"X_train: 56000, (56000, 784)\n",
|
|
"X_test: 14000, (14000, 784)\n",
|
|
"y_train: 56000, (56000,)\n",
|
|
"y_test: 14000, (14000,)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)\n",
|
|
"\n",
|
|
"print(f\"X_train: {len(X_train)}, {X_train.shape}\")\n",
|
|
"print(f\"X_test: {len(X_test)}, {X_test.shape}\")\n",
|
|
"print(f\"y_train: {len(y_train)}, {y_train.shape}\")\n",
|
|
"print(f\"y_test: {len(y_test)}, {y_test.shape}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "673e237d",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Pipeline Declaration"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "8ca34ce2",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(11, 11)"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.pipeline import Pipeline\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import (StandardScaler,\n",
"                                   MinMaxScaler,\n",
"                                   MaxAbsScaler,\n",
"                                   PowerTransformer)\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import cross_validate, cross_val_predict\n",
"from sklearn.metrics import classification_report, accuracy_score\n",
"\n",
"n_neighbors = 3\n",
"# Fractional n_components: share of the variance PCA is asked to retain.\n",
"n95_components = 0.95\n",
"n99_components = 0.99\n",
"\n",
"\n",
"def knn_pipeline(*steps):\n",
"    \"\"\"Build a Pipeline from the given (name, transformer) steps with a k-NN classifier appended.\"\"\"\n",
"    knn = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1)\n",
"    return Pipeline([*steps, ('knn', knn)])\n",
"\n",
"\n",
"# Every label is declared next to the pipeline it describes, and `names` /\n",
"# `classifiers` are derived from this single list, so index i of one always\n",
"# refers to index i of the other. (Maintained as two separate lists, the\n",
"# pca95/pca99 variants of the maxabs and power pipelines were in opposite\n",
"# orders, so their results were reported under the wrong label.)\n",
"pipeline_specs = [\n",
"    ('knn (baseline)',\n",
"     knn_pipeline()),\n",
"    ('scalar+knn',\n",
"     knn_pipeline(('standard', StandardScaler()))),\n",
"    ('minmax+knn',\n",
"     knn_pipeline(('minmax', MinMaxScaler()))),\n",
"    ('standard+pca95+knn',\n",
"     knn_pipeline(('standard', StandardScaler()), ('pca', PCA(n_components=n95_components)))),\n",
"    ('minmax+pca95+knn',\n",
"     knn_pipeline(('minmax', MinMaxScaler()), ('pca', PCA(n_components=n95_components)))),\n",
"    ('standard+pca99+knn',\n",
"     knn_pipeline(('standard', StandardScaler()), ('pca', PCA(n_components=n99_components)))),\n",
"    ('minmax+pca99+knn',\n",
"     knn_pipeline(('minmax', MinMaxScaler()), ('pca', PCA(n_components=n99_components)))),\n",
"    ('maxabs+pca95+knn',\n",
"     knn_pipeline(('maxabs', MaxAbsScaler()), ('pca', PCA(n_components=n95_components)))),\n",
"    ('maxabs+pca99+knn',\n",
"     knn_pipeline(('maxabs', MaxAbsScaler()), ('pca', PCA(n_components=n99_components)))),\n",
"    ('power+pca95+knn',\n",
"     knn_pipeline(('power', PowerTransformer()), ('pca', PCA(n_components=n95_components)))),\n",
"    ('power+pca99+knn',\n",
"     knn_pipeline(('power', PowerTransformer()), ('pca', PCA(n_components=n99_components)))),\n",
"]\n",
"\n",
"names = [name for name, _ in pipeline_specs]\n",
"classifiers = [clf for _, clf in pipeline_specs]\n",
"\n",
"len(names), len(classifiers)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f38b2bb2",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Cross-validation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "3465f546",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"accuracies = []"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "1f087f92",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def cv_train(num, cv):\n",
"    \"\"\"Cross-validate pipeline ``num`` on the training set and print a report.\n",
"\n",
"    Args:\n",
"        num: Index into the notebook-level ``names`` / ``classifiers`` lists.\n",
"        cv: Passed through unchanged as the ``cv`` argument of ``cross_val_predict``.\n",
"\n",
"    Returns:\n",
"        Accuracy, in percent, of the cross-validated predictions for ``X_train``\n",
"        measured against ``y_train``.\n",
"    \"\"\"\n",
"    # Imported locally so the function does not depend on which cells ran before it;\n",
"    # without an import of cross_val_predict the call below raises NameError.\n",
"    from sklearn.model_selection import cross_val_predict\n",
"\n",
"    name = names[num]\n",
"    clf = classifiers[num]\n",
"    y_train_pred = cross_val_predict(clf, X_train, y_train, cv=cv, n_jobs=-1)\n",
"    accuracy = accuracy_score(y_train, y_train_pred, normalize=True) * 100\n",
"    print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
"    print(classification_report(y_train, y_train_pred))\n",
"    return accuracy"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "9cb9b3e7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def cv_test(num):\n",
"    \"\"\"Run 5-fold cross-validated prediction of pipeline ``num`` on the test set.\n",
"\n",
"    Note that the pipeline is fitted on folds of ``X_test`` / ``y_test`` themselves;\n",
"    the training split is not used here.\n",
"\n",
"    Args:\n",
"        num: Index into the notebook-level ``names`` / ``classifiers`` lists.\n",
"\n",
"    Returns:\n",
"        Accuracy, in percent, of the cross-validated predictions for ``X_test``\n",
"        measured against ``y_test``.\n",
"    \"\"\"\n",
"    # Imported locally so the function does not depend on which cells ran before it;\n",
"    # without an import of cross_val_predict the call below raises NameError.\n",
"    from sklearn.model_selection import cross_val_predict\n",
"\n",
"    name = names[num]\n",
"    clf = classifiers[num]\n",
"    y_test_pred = cross_val_predict(clf, X_test, y_test, cv=5, n_jobs=-1)\n",
"    accuracy = accuracy_score(y_test, y_test_pred, normalize=True) * 100\n",
"    print(f\"Pipeline: {name} ({accuracy:.4f}%)\")\n",
"    print(classification_report(y_test, y_test_pred))\n",
"    return accuracy"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "462e340f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def cv(num, cv_arg=10):\n",
"    \"\"\"Cross-validate pipeline ``num`` on the training set and score it on the test set.\n",
"\n",
"    The pipeline is fitted on the full training split, then ``cross_validate`` is run\n",
"    on the same data. The fold estimator with the highest validation score is used\n",
"    to predict ``X_test``; the resulting accuracy and a classification report are printed.\n",
"\n",
"    Args:\n",
"        num: Index into the notebook-level ``names`` / ``classifiers`` lists.\n",
"        cv_arg: Passed through as the ``cv`` argument of ``cross_validate``.\n",
"\n",
"    Returns:\n",
"        Test-set accuracy of the selected fold estimator, in percent.\n",
"    \"\"\"\n",
"    label = names[num]\n",
"    # NOTE(review): the predictions below come from the estimators returned by\n",
"    # cross_validate, not from this fit; it presumably only leaves classifiers[num]\n",
"    # fitted as a side effect - confirm whether it is still needed.\n",
"    pipeline = classifiers[num].fit(X_train, y_train)\n",
"    results = cross_validate(pipeline, X_train, y_train, cv=cv_arg, n_jobs=-1, return_estimator=True)\n",
"    best_fold = np.argmax(results['test_score'])\n",
"    best_estimator = results['estimator'][best_fold]\n",
"    y_test_pred = best_estimator.predict(X_test)\n",
"    accuracy = accuracy_score(y_test, y_test_pred, normalize=True) * 100\n",
"    print(f\"Pipeline: {label} ({accuracy:.4f}%)\")\n",
"    print(classification_report(y_test, y_test_pred))\n",
"    return accuracy"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "cbca3b1f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: knn (baseline) (96.9857%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.96 0.99 0.98 1404\n",
|
|
" 1 0.97 0.99 0.98 1590\n",
|
|
" 2 0.98 0.97 0.97 1395\n",
|
|
" 3 0.98 0.96 0.97 1462\n",
|
|
" 4 0.98 0.97 0.97 1374\n",
|
|
" 5 0.96 0.96 0.96 1245\n",
|
|
" 6 0.98 0.98 0.98 1334\n",
|
|
" 7 0.97 0.97 0.97 1447\n",
|
|
" 8 0.99 0.92 0.95 1321\n",
|
|
" 9 0.95 0.96 0.96 1428\n",
|
|
"\n",
|
|
" accuracy 0.97 14000\n",
|
|
" macro avg 0.97 0.97 0.97 14000\n",
|
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.model_selection import LeaveOneOut\n",
|
|
"# accuracies.append(cv(0,5))\n",
|
|
"accuracies.append(cv(0,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "ad92d1f0",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: scalar+knn (94.2143%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.95 0.98 0.96 1404\n",
|
|
" 1 0.96 0.99 0.97 1590\n",
|
|
" 2 0.93 0.95 0.94 1395\n",
|
|
" 3 0.94 0.94 0.94 1462\n",
|
|
" 4 0.95 0.93 0.94 1374\n",
|
|
" 5 0.93 0.92 0.93 1245\n",
|
|
" 6 0.96 0.97 0.96 1334\n",
|
|
" 7 0.93 0.94 0.94 1447\n",
|
|
" 8 0.97 0.88 0.92 1321\n",
|
|
" 9 0.91 0.91 0.91 1428\n",
|
|
"\n",
|
|
" accuracy 0.94 14000\n",
|
|
" macro avg 0.94 0.94 0.94 14000\n",
|
|
"weighted avg 0.94 0.94 0.94 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(1,5))\n",
|
|
"accuracies.append(cv(1,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "881d8a07",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: minmax+knn (96.9857%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.96 0.99 0.98 1404\n",
|
|
" 1 0.97 0.99 0.98 1590\n",
|
|
" 2 0.98 0.97 0.97 1395\n",
|
|
" 3 0.98 0.96 0.97 1462\n",
|
|
" 4 0.98 0.97 0.97 1374\n",
|
|
" 5 0.96 0.96 0.96 1245\n",
|
|
" 6 0.98 0.98 0.98 1334\n",
|
|
" 7 0.97 0.97 0.97 1447\n",
|
|
" 8 0.99 0.92 0.95 1321\n",
|
|
" 9 0.95 0.96 0.96 1428\n",
|
|
"\n",
|
|
" accuracy 0.97 14000\n",
|
|
" macro avg 0.97 0.97 0.97 14000\n",
|
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(2,5))\n",
|
|
"accuracies.append(cv(2,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "1402e10b",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: standard+pca95+knn (94.7000%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.95 0.98 0.97 1404\n",
|
|
" 1 0.96 0.99 0.98 1590\n",
|
|
" 2 0.94 0.95 0.94 1395\n",
|
|
" 3 0.94 0.94 0.94 1462\n",
|
|
" 4 0.96 0.94 0.95 1374\n",
|
|
" 5 0.93 0.93 0.93 1245\n",
|
|
" 6 0.96 0.97 0.97 1334\n",
|
|
" 7 0.94 0.95 0.94 1447\n",
|
|
" 8 0.97 0.89 0.93 1321\n",
|
|
" 9 0.92 0.92 0.92 1428\n",
|
|
"\n",
|
|
" accuracy 0.95 14000\n",
|
|
" macro avg 0.95 0.95 0.95 14000\n",
|
|
"weighted avg 0.95 0.95 0.95 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(3,5))\n",
|
|
"accuracies.append(cv(3,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "24035514",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: minmax+pca95+knn (97.2571%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.97 0.99 0.98 1404\n",
|
|
" 1 0.98 0.99 0.99 1590\n",
|
|
" 2 0.98 0.97 0.97 1395\n",
|
|
" 3 0.98 0.97 0.97 1462\n",
|
|
" 4 0.98 0.97 0.97 1374\n",
|
|
" 5 0.97 0.96 0.97 1245\n",
|
|
" 6 0.98 0.99 0.98 1334\n",
|
|
" 7 0.97 0.98 0.97 1447\n",
|
|
" 8 0.99 0.93 0.96 1321\n",
|
|
" 9 0.95 0.97 0.96 1428\n",
|
|
"\n",
|
|
" accuracy 0.97 14000\n",
|
|
" macro avg 0.97 0.97 0.97 14000\n",
|
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(4,5))\n",
|
|
"accuracies.append(cv(4,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "1c27528e",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: standard+pca99+knn (94.2929%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.95 0.98 0.96 1404\n",
|
|
" 1 0.96 0.99 0.97 1590\n",
|
|
" 2 0.94 0.95 0.94 1395\n",
|
|
" 3 0.94 0.94 0.94 1462\n",
|
|
" 4 0.96 0.93 0.94 1374\n",
|
|
" 5 0.93 0.92 0.92 1245\n",
|
|
" 6 0.96 0.97 0.96 1334\n",
|
|
" 7 0.93 0.94 0.94 1447\n",
|
|
" 8 0.97 0.88 0.92 1321\n",
|
|
" 9 0.91 0.91 0.91 1428\n",
|
|
"\n",
|
|
" accuracy 0.94 14000\n",
|
|
" macro avg 0.94 0.94 0.94 14000\n",
|
|
"weighted avg 0.94 0.94 0.94 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(5,5))\n",
|
|
"accuracies.append(cv(5,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "46bcb35f",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: minmax+pca99+knn (96.9929%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.96 0.99 0.98 1404\n",
|
|
" 1 0.97 0.99 0.98 1590\n",
|
|
" 2 0.97 0.97 0.97 1395\n",
|
|
" 3 0.97 0.96 0.97 1462\n",
|
|
" 4 0.98 0.97 0.97 1374\n",
|
|
" 5 0.97 0.96 0.96 1245\n",
|
|
" 6 0.98 0.98 0.98 1334\n",
|
|
" 7 0.97 0.97 0.97 1447\n",
|
|
" 8 0.99 0.93 0.96 1321\n",
|
|
" 9 0.95 0.96 0.96 1428\n",
|
|
"\n",
|
|
" accuracy 0.97 14000\n",
|
|
" macro avg 0.97 0.97 0.97 14000\n",
|
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(6,5))\n",
|
|
"accuracies.append(cv(6,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "45d8092d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: maxabs+pca95+knn (96.9929%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.96 0.99 0.98 1404\n",
|
|
" 1 0.97 0.99 0.98 1590\n",
|
|
" 2 0.97 0.97 0.97 1395\n",
|
|
" 3 0.97 0.96 0.97 1462\n",
|
|
" 4 0.98 0.97 0.97 1374\n",
|
|
" 5 0.97 0.96 0.96 1245\n",
|
|
" 6 0.98 0.98 0.98 1334\n",
|
|
" 7 0.97 0.97 0.97 1447\n",
|
|
" 8 0.99 0.93 0.96 1321\n",
|
|
" 9 0.95 0.96 0.96 1428\n",
|
|
"\n",
|
|
" accuracy 0.97 14000\n",
|
|
" macro avg 0.97 0.97 0.97 14000\n",
|
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(7,5))\n",
|
|
"accuracies.append(cv(7,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "a805b3fd",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: maxabs+pca99+knn (97.2571%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.97 0.99 0.98 1404\n",
|
|
" 1 0.98 0.99 0.99 1590\n",
|
|
" 2 0.98 0.97 0.97 1395\n",
|
|
" 3 0.98 0.97 0.97 1462\n",
|
|
" 4 0.98 0.97 0.97 1374\n",
|
|
" 5 0.97 0.96 0.97 1245\n",
|
|
" 6 0.98 0.99 0.98 1334\n",
|
|
" 7 0.97 0.98 0.97 1447\n",
|
|
" 8 0.99 0.93 0.96 1321\n",
|
|
" 9 0.95 0.97 0.96 1428\n",
|
|
"\n",
|
|
" accuracy 0.97 14000\n",
|
|
" macro avg 0.97 0.97 0.97 14000\n",
|
|
"weighted avg 0.97 0.97 0.97 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(8,5))\n",
|
|
"accuracies.append(cv(8,10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "3af8abf8",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n",
|
|
" loglike = -n_samples / 2 * np.log(x_trans.var())\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: power+pca95+knn (94.1071%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.95 0.98 0.97 1404\n",
|
|
" 1 0.96 0.99 0.97 1590\n",
|
|
" 2 0.94 0.95 0.94 1395\n",
|
|
" 3 0.94 0.93 0.94 1462\n",
|
|
" 4 0.95 0.93 0.94 1374\n",
|
|
" 5 0.93 0.91 0.92 1245\n",
|
|
" 6 0.95 0.97 0.96 1334\n",
|
|
" 7 0.94 0.94 0.94 1447\n",
|
|
" 8 0.95 0.89 0.92 1321\n",
|
|
" 9 0.90 0.92 0.91 1428\n",
|
|
"\n",
|
|
" accuracy 0.94 14000\n",
|
|
" macro avg 0.94 0.94 0.94 14000\n",
|
|
"weighted avg 0.94 0.94 0.94 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(9,5))\n",
|
|
"accuracies.append(cv(9,10)) # likes to die"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"id": "d971b4df",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"/opt/jupyterhub/lib/python3.8/site-packages/sklearn/preprocessing/_data.py:3237: RuntimeWarning: divide by zero encountered in log\n",
|
|
" loglike = -n_samples / 2 * np.log(x_trans.var())\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Pipeline: power+pca99+knn (94.4429%)\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.95 0.98 0.97 1404\n",
|
|
" 1 0.96 0.99 0.98 1590\n",
|
|
" 2 0.94 0.95 0.94 1395\n",
|
|
" 3 0.94 0.93 0.94 1462\n",
|
|
" 4 0.95 0.93 0.94 1374\n",
|
|
" 5 0.94 0.91 0.93 1245\n",
|
|
" 6 0.95 0.97 0.96 1334\n",
|
|
" 7 0.94 0.94 0.94 1447\n",
|
|
" 8 0.95 0.90 0.93 1321\n",
|
|
" 9 0.91 0.92 0.92 1428\n",
|
|
"\n",
|
|
" accuracy 0.94 14000\n",
|
|
" macro avg 0.94 0.94 0.94 14000\n",
|
|
"weighted avg 0.94 0.94 0.94 14000\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# accuracies.append(cv(10,5))\n",
|
|
"accuracies.append(cv(10,10)) # likes to die"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "281e0f59",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Evaluation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "e3eeabc7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Maximum accuracy (97.2571%) at: ['minmax+pca95+knn', 'maxabs+pca99+knn']\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# `accuracies` is a plain list; convert it so the comparison is element-wise\n",
"# regardless of whether the stored scores are NumPy scalars or built-in floats\n",
"# (list == float is simply False, which would report no pipeline at all).\n",
"best_accuracy = max(accuracies)\n",
"best_indices = np.where(np.asarray(accuracies) == best_accuracy)[0]\n",
"print(f\"Maximum accuracy ({best_accuracy:.6}%) at: {[names[n] for n in best_indices]}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"id": "7754b1e8",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"name: knn (baseline) (96.9857%)\n",
|
|
"name: scalar+knn (94.2143%)\n",
|
|
"name: minmax+knn (96.9857%)\n",
|
|
"name: standard+pca95+knn (94.7000%)\n",
|
|
"name: minmax+pca95+knn (97.2571%)\n",
|
|
"name: standard+pca99+knn (94.2929%)\n",
|
|
"name: minmax+pca99+knn (96.9929%)\n",
|
|
"name: maxabs+pca95+knn (96.9929%)\n",
|
|
"name: maxabs+pca99+knn (97.2571%)\n",
|
|
"name: power+pca95+knn (94.1071%)\n",
|
|
"name: power+pca99+knn (94.4429%)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"for n, a in zip(names, accuracies):\n",
|
|
" print(f\"name: {n:20} ({a:.4f}%)\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "22316563",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|