### Load MNIST dataset

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

In [2]:
# scikit-learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

In [3]:
# common imports
import numpy as np

In [4]:
# import function to scikit-learn datasets
from sklearn.datasets import fetch_openml

# load specified dataset (MNIST)
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# print type of dataset
type(mnist)

sklearn.utils.Bunch

In [5]:
X, y = mnist["data"], mnist["target"]

### labels to int

In [6]:
# import plotting libraries
import matplotlib as mpl
import matplotlib.pyplot as plt

In [7]:
# convert string labels to int
y = y.astype(np.uint8)

### Prepare data for machine learning

### Identify Train Set and Test Set

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=177013)

print(f"X_train: {len(X_train)}, {X_train.shape}")
print(f"X_test: {len(X_test)}, {X_test.shape}")
print(f"y_train: {len(y_train)}, {y_train.shape}")
print(f"y_test: {len(y_test)}, {y_test.shape}")

X_train: 56000, (56000, 784)
X_test: 14000, (14000, 784)
y_train: 56000, (56000,)
y_test: 14000, (14000,)


## Pipeline Declaration

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import (StandardScaler, 
 MinMaxScaler, 
 MaxAbsScaler,
 PowerTransformer,
 Binarizer)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, accuracy_score

n_neighbors = 3
metric = 'euclidean'
weights = 'distance'
n95_components = 0.95
n99_components = 0.99

names = []
classifiers = []
accuracies = []

# Crossvalidation

In [10]:
# not this
def cv_train(num,cv):
 name = names[num]
 clf = classifiers[num]
 y_train_pred = cross_val_predict(clf, X_train, y_train, cv=cv, n_jobs=-1)
 accuracy = accuracy_score(y_train, y_train_pred, normalize=True)*100
 print(f"Pipeline: {name} ({accuracy:.4f}%)")
 print(classification_report(y_train, y_train_pred))
 return accuracy

In [11]:
# also not this
def cv_test(num):
 name = names[num]
 clf = classifiers[num]
 y_test_pred = cross_val_predict(clf, X_test, y_test, cv=5, n_jobs=-1)
 accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100
 print(f"Pipeline: {name} ({accuracy:.4f}%)")
 print(classification_report(y_test, y_test_pred))
 return accuracy

In [12]:
import pandas as pd

# this
def cv(num,cv_arg=10):
 name = names[num]
 clf = classifiers[num]
 clf = clf.fit(X_train, y_train)
 cv = cross_validate(clf, X_train, y_train, cv=cv_arg, n_jobs=-1)
 # cv_clf = cv['estimator'][np.argmax(cv['test_score'])] # get the estimator where the max(test_score) on the cross validation
 y_test_pred = clf.predict(X_test)
 accuracy = accuracy_score(y_test, y_test_pred, normalize=True)*100
 print(f"Pipeline: {name} ({accuracy:.4f}%) ({cv['fit_time'][np.argmax(cv['test_score'])]:.4})")
 print(classification_report(y_test, y_test_pred))
 print(pd.DataFrame.from_dict(cv))
 return accuracy

# Fitting

In [13]:
op_kNN = KNeighborsClassifier(n_neighbors=n_neighbors, n_jobs=-1, metric=metric, weights=weights)

In [14]:
names.append('knn (baseline)')
classifiers.append(Pipeline([
 ('knn', op_kNN)
]))
 
accuracies.append(cv(0))

Pipeline: knn (baseline) (97.3071%) (0.1162)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.97 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.96 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.94 0.96 1411
 9 0.96 0.97 0.96 1449

 accuracy 0.97 14000
 macro avg 0.97 0.97 0.97 14000
weighted avg 0.97 0.97 0.97 14000

 fit_time score_time test_score
0 0.125081 9.229141 0.972143
1 0.123761 8.918457 0.971429
2 0.125360 9.193326 0.971607
3 0.119904 8.944645 0.970714
4 0.244280 9.182126 0.972143
5 0.233787 9.137941 0.972857
6 0.123714 9.257920 0.975179
7 0.116261 9.135718 0.972500
8 0.116210 9.125259 0.975357
9 0.141345 9.308920 0.973393


In [15]:
names.append('scalar+knn') 
classifiers.append(Pipeline([
 ('standard', StandardScaler()),
 ('knn', op_kNN)
]))

accuracies.append(cv(1,10)) # Pipeline: standard+knn (94.3714%)

Pipeline: scalar+knn (94.8000%) (1.146)
 precision recall f1-score support

 0 0.97 0.99 0.98 1359
 1 0.97 0.99 0.98 1594
 2 0.95 0.93 0.94 1369
 3 0.94 0.95 0.95 1415
 4 0.94 0.93 0.94 1373
 5 0.93 0.91 0.92 1257
 6 0.96 0.97 0.97 1351
 7 0.94 0.95 0.94 1422
 8 0.96 0.91 0.93 1411
 9 0.92 0.94 0.93 1449

 accuracy 0.95 14000
 macro avg 0.95 0.95 0.95 14000
weighted avg 0.95 0.95 0.95 14000

 fit_time score_time test_score
0 0.758668 8.452150 0.949643
1 0.885169 8.631167 0.945000
2 1.160197 8.948529 0.949107
3 1.353553 8.502009 0.942500
4 0.912583 9.036203 0.948393
5 0.833251 9.013008 0.945179
6 0.886441 8.995505 0.943214
7 0.935067 8.891803 0.946607
8 1.146215 8.633170 0.951607
9 1.234975 8.846080 0.947143


In [16]:
names.append('minmax+knn')
classifiers.append(Pipeline([
 ('minmax', MinMaxScaler()),
 ('knn', op_kNN)
]))

accuracies.append(cv(2))

Pipeline: minmax+knn (97.3071%) (0.5789)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.97 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.96 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.94 0.96 1411
 9 0.96 0.97 0.96 1449

 accuracy 0.97 14000
 macro avg 0.97 0.97 0.97 14000
weighted avg 0.97 0.97 0.97 14000

 fit_time score_time test_score
0 0.389049 8.747383 0.972143
1 0.385594 9.063268 0.971429
2 0.527658 8.926810 0.971786
3 0.564983 8.949160 0.970714
4 0.607976 9.087395 0.972143
5 0.689083 8.948745 0.972857
6 0.578933 8.950404 0.975179
7 0.647489 9.248726 0.972679
8 0.634490 9.124195 0.975179
9 0.619997 8.861808 0.973393


In [17]:
names.append('standard+pca95+knn')
classifiers.append(Pipeline([
 ('standard', StandardScaler()),
 ('pca95', PCA(n_components=n95_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(3,10)) # Pipeline: standard+pca95+knn (95.0500%)

Pipeline: standard+pca95+knn (95.3929%) (9.151)
 precision recall f1-score support

 0 0.97 0.99 0.98 1359
 1 0.98 0.99 0.98 1594
 2 0.96 0.94 0.95 1369
 3 0.94 0.96 0.95 1415
 4 0.95 0.94 0.95 1373
 5 0.94 0.92 0.93 1257
 6 0.96 0.97 0.97 1351
 7 0.95 0.96 0.95 1422
 8 0.97 0.92 0.94 1411
 9 0.92 0.94 0.93 1449

 accuracy 0.95 14000
 macro avg 0.95 0.95 0.95 14000
weighted avg 0.95 0.95 0.95 14000

 fit_time score_time test_score
0 7.882113 6.468349 0.953036
1 8.902088 5.919568 0.950357
2 8.750770 5.959131 0.953571
3 7.767436 6.355160 0.945179
4 9.307496 5.559806 0.952500
5 7.845258 6.534233 0.950000
6 8.874461 5.844108 0.948214
7 7.704501 6.488686 0.950714
8 9.151180 5.622928 0.955893
9 7.257694 6.649508 0.953571


In [18]:
names.append('minmax+pca95+knn')
classifiers.append(Pipeline([
 ('minmax', MinMaxScaler()),
 ('pca95', PCA(n_components=n95_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(4))

Pipeline: minmax+pca95+knn (97.5500%) (8.162)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.98 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.97 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.95 0.97 1411
 9 0.96 0.97 0.97 1449

 accuracy 0.98 14000
 macro avg 0.98 0.98 0.98 14000
weighted avg 0.98 0.98 0.98 14000

 fit_time score_time test_score
0 8.170779 5.294063 0.973750
1 8.088926 5.365682 0.972857
2 6.857623 5.776273 0.974286
3 7.262239 5.806243 0.973036
4 8.057861 5.374735 0.974464
5 7.276100 5.745563 0.975179
6 8.202060 5.424418 0.975893
7 8.114796 5.319392 0.975357
8 8.162190 5.424424 0.976607
9 7.437983 5.727555 0.975536


In [19]:
names.append('standard+pca99+knn')
classifiers.append(Pipeline([
 ('standard', StandardScaler()),
 ('pca99', PCA(n_components=n99_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(5,10)) # Pipeline: standard+pca99+knn (94.5357%)

Pipeline: standard+pca99+knn (94.9071%) (9.396)
 precision recall f1-score support

 0 0.97 0.99 0.98 1359
 1 0.97 0.99 0.98 1594
 2 0.95 0.93 0.94 1369
 3 0.94 0.95 0.95 1415
 4 0.95 0.93 0.94 1373
 5 0.93 0.91 0.92 1257
 6 0.96 0.97 0.97 1351
 7 0.94 0.95 0.95 1422
 8 0.97 0.91 0.94 1411
 9 0.92 0.94 0.93 1449

 accuracy 0.95 14000
 macro avg 0.95 0.95 0.95 14000
weighted avg 0.95 0.95 0.95 14000

 fit_time score_time test_score
0 9.473285 6.515933 0.949821
1 7.714788 7.746434 0.946786
2 7.546448 7.823940 0.950179
3 7.546017 7.754220 0.943036
4 7.534473 7.836074 0.948750
5 7.524864 7.952840 0.945893
6 7.699588 7.758791 0.946071
7 9.373224 6.639953 0.947679
8 9.395972 6.550251 0.953214
9 9.444257 6.667987 0.947679


In [20]:
names.append('minmax+pca99+knn')
classifiers.append(Pipeline([
 ('minmax', MinMaxScaler()),
 ('pca99', PCA(n_components=n99_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(6))

Pipeline: minmax+pca99+knn (97.3643%) (9.134)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.97 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.96 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.95 0.97 1411
 9 0.96 0.97 0.97 1449

 accuracy 0.97 14000
 macro avg 0.97 0.97 0.97 14000
weighted avg 0.97 0.97 0.97 14000

 fit_time score_time test_score
0 8.213089 6.006415 0.972679
1 7.115110 6.500668 0.971429
2 7.381336 6.424851 0.972143
3 7.099746 6.442294 0.970179
4 9.174922 5.636195 0.972679
5 6.937273 6.640518 0.973214
6 8.361929 6.048856 0.975536
7 9.058427 5.660812 0.972857
8 9.134326 5.704015 0.975714
9 9.088895 5.631931 0.974286


In [21]:
names.append('maxabs+pca95+knn')
classifiers.append(Pipeline([
 ('maxabs', MaxAbsScaler()),
 ('pca95', PCA(n_components=n95_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(7))

Pipeline: maxabs+pca95+knn (97.5500%) (8.568)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.98 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.97 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.95 0.97 1411
 9 0.96 0.97 0.97 1449

 accuracy 0.98 14000
 macro avg 0.98 0.98 0.98 14000
weighted avg 0.98 0.98 0.98 14000

 fit_time score_time test_score
0 7.831842 5.514181 0.973750
1 7.772907 5.701575 0.972857
2 8.581123 5.059241 0.974286
3 7.486752 5.769173 0.973036
4 7.644667 5.623410 0.974464
5 8.568252 5.032640 0.975179
6 7.305880 5.804784 0.975893
7 7.243364 5.869418 0.975357
8 8.568058 5.175154 0.976607
9 8.545392 5.035311 0.975536


In [22]:
names.append('maxabs+pca99+knn')
classifiers.append(Pipeline([
 ('maxabs', MaxAbsScaler()),
 ('pca99', PCA(n_components=n99_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(8))

Pipeline: maxabs+pca99+knn (97.3643%) (7.451)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.97 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.96 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.95 0.97 1411
 9 0.96 0.97 0.97 1449

 accuracy 0.97 14000
 macro avg 0.97 0.97 0.97 14000
weighted avg 0.97 0.97 0.97 14000

 fit_time score_time test_score
0 7.066843 6.724576 0.972679
1 8.163119 5.973942 0.971429
2 7.298925 6.530500 0.972143
3 8.869127 5.643595 0.970179
4 8.898886 5.403235 0.972679
5 7.828445 6.313535 0.973214
6 8.803470 5.529734 0.975536
7 8.294126 5.839177 0.972857
8 7.450544 6.585145 0.975714
9 7.483754 6.628288 0.974286


In [23]:
names.append('power+pca95+knn')
classifiers.append(Pipeline([
 ('power', PowerTransformer()),
 ('pca95', PCA(n_components=n95_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(9,10)) # likes to die # Pipeline: power+pca95+knn (94.3714%)

 loglike = -n_samples / 2 * np.log(x_trans.var())


Pipeline: power+pca95+knn (94.9786%) (39.14)
 precision recall f1-score support

 0 0.97 0.99 0.98 1359
 1 0.98 0.98 0.98 1594
 2 0.95 0.94 0.95 1369
 3 0.94 0.94 0.94 1415
 4 0.96 0.93 0.95 1373
 5 0.94 0.91 0.93 1257
 6 0.95 0.98 0.96 1351
 7 0.94 0.95 0.94 1422
 8 0.96 0.92 0.94 1411
 9 0.92 0.94 0.93 1449

 accuracy 0.95 14000
 macro avg 0.95 0.95 0.95 14000
weighted avg 0.95 0.95 0.95 14000

 fit_time score_time test_score
0 38.802822 5.871142 0.946429
1 37.150519 6.993097 0.945357
2 37.138519 7.017011 0.948929
3 38.357648 6.108760 0.945714
4 37.175505 7.018548 0.949107
5 37.249800 6.882046 0.945357
6 37.172034 6.747881 0.949643
7 38.738593 5.878512 0.948571
8 39.135663 5.579420 0.956607
9 37.060059 6.904209 0.949821


In [24]:
names.append('power+pca99+knn')
classifiers.append(Pipeline([
 ('power', PowerTransformer()),
 ('pca99', PCA(n_components=n95_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(10,10)) # likes to die # Pipeline: power+pca99+knn (94.6429%)

 loglike = -n_samples / 2 * np.log(x_trans.var())


Pipeline: power+pca99+knn (94.9786%) (37.94)
 precision recall f1-score support

 0 0.97 0.99 0.98 1359
 1 0.98 0.98 0.98 1594
 2 0.95 0.94 0.95 1369
 3 0.94 0.94 0.94 1415
 4 0.96 0.93 0.95 1373
 5 0.94 0.91 0.93 1257
 6 0.95 0.98 0.96 1351
 7 0.94 0.95 0.94 1422
 8 0.96 0.92 0.94 1411
 9 0.92 0.94 0.93 1449

 accuracy 0.95 14000
 macro avg 0.95 0.95 0.95 14000
weighted avg 0.95 0.95 0.95 14000

 fit_time score_time test_score
0 39.081897 5.799132 0.946429
1 39.650227 5.733288 0.945357
2 36.740629 6.834802 0.948929
3 39.349449 5.738063 0.945714
4 38.051169 6.654780 0.949107
5 38.139323 6.575838 0.945357
6 36.967650 6.827009 0.949643
7 37.961230 6.632579 0.948571
8 37.944791 6.445367 0.956607
9 39.280427 5.751522 0.949821


In [25]:
names.append('bin+pca95+knn')
classifiers.append(Pipeline([
 ('bin', Binarizer()),
 ('pca95', PCA(n_components=n95_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(11,10))

Pipeline: bin+pca95+knn (96.8286%) (8.674)
 precision recall f1-score support

 0 0.98 0.99 0.99 1359
 1 0.96 0.99 0.98 1594
 2 0.98 0.97 0.98 1369
 3 0.96 0.96 0.96 1415
 4 0.98 0.95 0.97 1373
 5 0.97 0.94 0.95 1257
 6 0.97 0.99 0.98 1351
 7 0.97 0.97 0.97 1422
 8 0.97 0.95 0.96 1411
 9 0.94 0.97 0.95 1449

 accuracy 0.97 14000
 macro avg 0.97 0.97 0.97 14000
weighted avg 0.97 0.97 0.97 14000

 fit_time score_time test_score
0 8.605724 5.490393 0.967679
1 7.290196 6.624226 0.969107
2 7.169216 6.591240 0.968571
3 7.225861 6.645801 0.965357
4 7.196922 6.595732 0.969643
5 7.294749 6.646243 0.966429
6 8.720850 5.419779 0.969821
7 8.711336 5.597409 0.967679
8 7.221519 6.624207 0.970536
9 8.673788 5.737247 0.970714


In [26]:
names.append('bin+pca99+knn')
classifiers.append(Pipeline([
 ('bin', Binarizer()),
 ('pca99', PCA(n_components=n99_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(12))

Pipeline: bin+pca99+knn (96.6571%) (7.146)
 precision recall f1-score support

 0 0.98 0.99 0.98 1359
 1 0.96 0.99 0.98 1594
 2 0.98 0.97 0.97 1369
 3 0.96 0.96 0.96 1415
 4 0.98 0.95 0.97 1373
 5 0.97 0.94 0.95 1257
 6 0.96 0.99 0.97 1351
 7 0.97 0.97 0.97 1422
 8 0.97 0.94 0.96 1411
 9 0.93 0.97 0.95 1449

 accuracy 0.97 14000
 macro avg 0.97 0.97 0.97 14000
weighted avg 0.97 0.97 0.97 14000

 fit_time score_time test_score
0 6.961178 6.772386 0.965536
1 8.445867 6.193574 0.967143
2 9.256319 5.771667 0.966607
3 7.188147 6.795888 0.965179
4 9.336459 5.739643 0.966964
5 7.107809 6.823560 0.964821
6 9.127543 5.848893 0.968393
7 9.106971 5.873384 0.966429
8 7.145829 6.698655 0.969107
9 8.202323 6.141933 0.968750


In [27]:
names.append('pca95+knn')
classifiers.append(Pipeline([
 ('pca95', PCA(n_components=n95_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(13))

Pipeline: pca95+knn (97.5500%) (7.416)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.98 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.97 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.95 0.97 1411
 9 0.96 0.97 0.97 1449

 accuracy 0.98 14000
 macro avg 0.98 0.98 0.98 14000
weighted avg 0.98 0.98 0.98 14000

 fit_time score_time test_score
0 7.440297 5.405928 0.973750
1 7.141649 5.609115 0.972857
2 6.829969 5.687647 0.974286
3 8.291485 5.057073 0.973036
4 8.281738 4.872021 0.974464
5 6.882516 5.804941 0.975179
6 8.275357 5.190776 0.975893
7 8.244917 5.013079 0.975357
8 7.416139 5.446140 0.976607
9 6.752497 5.906509 0.975536


In [28]:
names.append('pca99+knn')
classifiers.append(Pipeline([
 ('pca99', PCA(n_components=n99_components)),
 ('knn', op_kNN)
]))

accuracies.append(cv(14))

Pipeline: pca99+knn (97.3643%) (6.88)
 precision recall f1-score support

 0 0.98 1.00 0.99 1359
 1 0.97 0.99 0.98 1594
 2 0.98 0.96 0.97 1369
 3 0.97 0.97 0.97 1415
 4 0.98 0.97 0.97 1373
 5 0.96 0.96 0.96 1257
 6 0.98 0.99 0.98 1351
 7 0.97 0.98 0.97 1422
 8 0.99 0.95 0.97 1411
 9 0.96 0.97 0.97 1449

 accuracy 0.97 14000
 macro avg 0.97 0.97 0.97 14000
weighted avg 0.97 0.97 0.97 14000

 fit_time score_time test_score
0 8.113361 5.777437 0.972857
1 6.950958 6.503708 0.971429
2 8.775186 5.424367 0.972143
3 8.805392 5.562713 0.970179
4 6.980318 6.581474 0.972679
5 8.412863 5.525903 0.973214
6 7.188666 6.495338 0.975536
7 6.802638 6.664372 0.972857
8 6.879545 6.603721 0.975714
9 7.031381 6.631324 0.974286


In [29]:
names.append('kpca+pca95+knn')
classifiers.append(Pipeline([
 ('kpca', KernelPCA(n_jobs=-1)),
 ('pca', PCA(n_components=n99_components)),
 ('knn', op_kNN)
]))

# accuracies.append(cv(15))

In [30]:
names.append('kpca+pca99+knn')
classifiers.append(Pipeline([
 ('kpca', KernelPCA(n_jobs=-1)),
 ('pca', PCA(n_components=n99_components)),
 ('knn', op_kNN)
]))

# accuracies.append(cv(16))

# Auswertung

In [31]:
print(f"Maximum accuracy ({max(accuracies):.6}%) for {[names[n] for n in np.where(accuracies==max(accuracies))[0]]}")

Maximum accuracy (97.55%) for ['minmax+pca95+knn', 'maxabs+pca95+knn', 'pca95+knn']


In [38]:
for n, a in zip(names, accuracies):
 print(f"name: {n:20} ({a:.4f}%)")

name: knn (baseline) (97.3071%)
name: scalar+knn (94.8000%)
name: minmax+knn (97.3071%)
name: standard+pca95+knn (95.3929%)
name: minmax+pca95+knn (97.5500%)
name: standard+pca99+knn (94.9071%)
name: minmax+pca99+knn (97.3643%)
name: maxabs+pca95+knn (97.5500%)
name: maxabs+pca99+knn (97.3643%)
name: power+pca95+knn (94.9786%)
name: power+pca99+knn (94.9786%)
name: bin+pca95+knn (96.8286%)
name: bin+pca99+knn (96.6571%)
name: pca95+knn (97.5500%)
name: pca99+knn (97.3643%)


Default n=3\
name: knn (baseline) (97.2143%)\
name: scalar+knn (94.6286%)\
name: minmax+knn (97.2143%)\
name: standard+pca95+knn (95.1429%)\
name: minmax+pca95+knn (97.4357%)\
name: standard+pca99+knn (94.7214%)\
name: minmax+pca99+knn (97.2714%)\
name: maxabs+pca95+knn (97.4357%)\
name: maxabs+pca99+knn (97.2714%)\
name: power+pca95+knn (94.8071%)\
name: power+pca99+knn (94.8071%)\
name: bin+pca95+knn (96.6643%)\
name: bin+pca99+knn (96.5500%)\
name: pca95+knn (97.4357%)\
name: pca99+knn (97.2714%)\

n=3 euclid distance\
name: knn (baseline) (97.3071%)\
name: scalar+knn (94.8000%)\
name: minmax+knn (97.3071%)\
name: standard+pca95+knn (95.3929%)\
name: minmax+pca95+knn (97.5500%)\
name: standard+pca99+knn (94.9071%)\
name: minmax+pca99+knn (97.3643%)\
name: maxabs+pca95+knn (97.5500%)\
name: maxabs+pca99+knn (97.3643%)\
name: power+pca95+knn (94.9786%)\
name: power+pca99+knn (94.9786%)\
name: bin+pca95+knn (96.8286%)\
name: bin+pca99+knn (96.6571%)\
name: pca95+knn (97.5500%)\
name: pca99+knn (97.3643%)\


# Hyper Parameter Optimization

In [34]:
# from sklearn.model_selection import GridSearchCV

# grid_params = {
# 'n_neighbors': [3, 5, 7 , 11],
# 'weights': ['uniform', 'distance'],
# 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
# }

# gs = GridSearchCV(
# KNeighborsClassifier(n_jobs=-1),
# grid_params,
# cv = 3,
# verbose=1,
# n_jobs = -1
# )

# gs_results = gs.fit(X_train, y_train)
# Results in gs_results.csv

Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [36]:
pd.DataFrame.from_dict(gs_results.cv_results_).sort_values('rank_test_score').to_csv('./gs_result.csv')

In [37]:
pd.DataFrame.from_dict(gs_results.cv_results_)[['rank_test_score','mean_fit_time','param_n_neighbors', 'param_metric', 'param_weights', 'mean_test_score']].to_csv('./gs_result_filtered.csv')