{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "8c784b5a", "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "import pickle\n",
 "import pandas as pd\n",
 "import numpy as np\n",
 "import tensorflow as tf\n",
 "import matplotlib.pyplot as plt\n",
 "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
 "from sklearn.model_selection import train_test_split\n",
 "from sklearn.preprocessing import LabelEncoder, LabelBinarizer\n",
 "from tensorflow.keras.models import Sequential\n",
 "from tensorflow.keras.layers import Dense, Flatten, BatchNormalization\n",
 "\n",
 "os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'\n",
 "os.environ['CUDA_VISIBLE_DEVICES'] = '2'\n",
 "\n",
 "\n",
 "\n",
 "delim = ';'\n",
 "user_count = 100\n",
 "base_path = '/opt/iui-datarelease1-sose2021/'\n",
 "\n",
 "Xpickle_file = './X.pickle'\n",
 "\n",
 "ypickle_file = './y.pickle'"
 ] }, { "cell_type": "code", "execution_count": 2, "id": "7b486d61", "metadata": {}, "outputs": [], "source": [
 "def load_pickles():\n",
 "    \"\"\"Load the cached samples and labels from the two pickle files.\n",
 "\n",
 "    Reads Xpickle_file and ypickle_file (module-level paths).\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    tuple\n",
 "        (X, y): an object-dtype array of samples and a str-dtype array of\n",
 "        labels.\n",
 "    \"\"\"\n",
 "    # NOTE: pickle.load can execute arbitrary code embedded in the file;\n",
 "    # only load cache files that come from a trusted source.\n",
 "    # `with` closes the handles even if unpickling raises.\n",
 "    with open(Xpickle_file, 'rb') as x_file:\n",
 "        X = pickle.load(x_file)\n",
 "    with open(ypickle_file, 'rb') as y_file:\n",
 "        y = pickle.load(y_file)\n",
 "\n",
 "    return (np.asarray(X, dtype=object), np.asarray(y, dtype=str))"
 ] }, { "cell_type": "code", "execution_count": 3, "id": "5ea384ea", "metadata": {}, "outputs": [], "source": [
 "def shorten(npList, thresh=100, leeway=5):\n",
 "    \"\"\"Trim a recording to the span in which 'Force' exceeds thresh.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    npList : table indexable by column name and sliceable by row\n",
 "        One recording; must provide a 'Force' column.\n",
 "    thresh : number, default 100\n",
 "        Rows whose Force is strictly greater than this count as active.\n",
 "    leeway : int, default 5\n",
 "        Number of extra rows kept around the active span.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    The rows from `leeway` before the first active row (clamped at row 0)\n",
 "    up to, but not including, `leeway` after the last active row. If no row\n",
 "    exceeds `thresh`, the recording is returned unchanged.\n",
 "    \"\"\"\n",
 "    active = np.flatnonzero(np.asarray(npList['Force']) > thresh)\n",
 "    if active.size == 0:\n",
 "        # Nothing above the threshold: indexing active[0] used to raise\n",
 "        # IndexError here, so hand the recording back untouched instead.\n",
 "        return npList\n",
 "\n",
 "    start = max(active[0] - leeway, 0)\n",
 "    # The slice end is exclusive, so leeway - 1 rows follow the last active\n",
 "    # row; this matches the original trimming and is kept on purpose.\n",
 "    stop = active[-1] + leeway\n",
 "    return npList[start:stop]"
 ] }, { "cell_type": "code", "execution_count": 4, "id": "09aad3f2", "metadata": {}, "outputs": [], "source": [
 "def load_data():\n",
 "    \"\"\"Return (samples, labels) for every recorded letter.\n",
 "\n",
 "    When both pickle cache files exist they are loaded via load_pickles().\n",
 "    Otherwise every file under base_path + str(user) + '/split_letters_csv/'\n",
 "    is read with pandas for users 0 .. user_count - 1; the label of a file is\n",
 "    the first alphabetic character of its name.\n",
 "\n",
 "    Both paths return the same 2-tuple: an object-dtype array of samples and\n",
 "    a str-dtype array of labels.\n",
 "    \"\"\"\n",
 "    if os.path.isfile(Xpickle_file) and os.path.isfile(ypickle_file):\n",
 "        return load_pickles()\n",
 "\n",
 "    data = []\n",
 "    label = []\n",
 "    for user in range(user_count):\n",
 "        user_path = base_path + str(user) + '/split_letters_csv/'\n",
 "        # os.listdir returns entries in arbitrary order; sort so the sample\n",
 "        # order (and therefore the seeded train/test split) is reproducible.\n",
 "        for file in sorted(os.listdir(user_path)):\n",
 "            file_name = user_path + file\n",
 "            letter = ''.join(filter(lambda ch: ch.isalpha(), file))[0]\n",
 "            # sep must be passed by keyword: positional use is deprecated\n",
 "            # and rejected by pandas 2.\n",
 "            data.append(pd.read_csv(file_name, sep=delim))\n",
 "            label.append(letter)\n",
 "\n",
 "    # Return exactly two values, like load_pickles(), so that\n",
 "    # `x, y = load_data()` works on the CSV path too (it used to return a\n",
 "    # third element holding only the last file name).\n",
 "    return (np.asarray(data, dtype=object), np.asarray(label, dtype=str))"
 ] }, { "cell_type": "code", "execution_count": 5, "id": "37d66d26", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 2.76 s, sys: 205 ms, total: 2.97 s\n", "Wall time: 2.97 s\n" ] } ], "source": [
 "%%time\n",
 "x, y = load_data()\n"
 ] }, { "cell_type": "code", "execution_count": 6, "id": "3178395b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3.22 s, sys: 2.07 ms, total: 3.22 s\n", "Wall time: 3.22 s\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ ":1: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n" ] } ], "source": [
 "%%time\n",
 "# The trimmed recordings have different lengths, so the array is ragged:\n",
 "# request dtype=object explicitly (newer NumPy raises without it).\n",
 "f_data = np.array(list(map(shorten, x)), dtype=object)"
 ] }, { "cell_type": "code", "execution_count": 7, "id": "dcbb85b7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 13102.000000\n", "mean 61.169058\n", "std 30.698514\n", "min 10.000000\n", "50% 57.000000\n", "95% 102.000000\n", "96% 107.000000\n", "97% 113.000000\n", "98% 127.000000\n", "99% 156.000000\n", "max 1522.000000\n", "dtype: float64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
 "x_len = np.asarray(list(map(len, f_data)))\n",
 "sq_xlen = pd.Series(x_len)\n",
 "ptiles = [x*0.01 for x in range(100)]\n",
 "# Series.quantile accepts the whole list at once, so no per-percentile\n",
 "# Python loop is needed; `l` stays a plain list of the quantile values.\n",
 "l = sq_xlen.quantile(ptiles).tolist()\n",
 "plt.plot(l, ptiles)\n",
 "plt.xlabel('trimmed sequence length (rows)')\n",
 "plt.ylabel('fraction of sequences')\n",
 "plt.title('Cumulative distribution of sequence lengths')\n",
 "sq_xlen.describe(percentiles=[x*0.01 for x in range(95,100)])"
 ] }, { "cell_type": "code", "execution_count": 8, "id": "1878d067", "metadata": {}, "outputs": [], "source": [
 "# Keep only sequences whose length does not exceed the 99th percentile, so\n",
 "# the few very long recordings do not set the padded width.\n",
 "thresh_p = 0.99\n",
 "thresh = int(sq_xlen.quantile(thresh_p))\n",
 "len_mask = np.where(x_len <= thresh)\n",
 "\n",
 "x_filter = f_data[len_mask]\n",
 "y_filter = y[len_mask]\n",
 "\n"
 ] }, { "cell_type": "code", "execution_count": 9, "id": "3a01c1ad", "metadata": {}, "outputs": [], "source": [
 "lb = LabelBinarizer()\n",
 "# Drop the 'Millis' column and pad the frames that had it dropped. The drop\n",
 "# result used to be bound to `a` and then ignored, so Millis still reached\n",
 "# the model. Reading from f_data[len_mask] rather than x_filter keeps this\n",
 "# cell re-runnable, because x_filter is rebound to the padded array below.\n",
 "a = [x.drop(labels='Millis', axis=1) for x in f_data[len_mask]]\n",
 "x_filter = pad_sequences(a, dtype=float, padding='post')\n",
 "yt_filter = lb.fit_transform(y_filter)"
 ] }, { "cell_type": "code", "execution_count": 10, "id": "634a024c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 34.7 ms, sys: 5.84 ms, total: 40.6 ms\n", "Wall time: 39.2 ms\n" ] } ], "source": [
 "%%time\n",
 "x_train, x_test, y_train, y_test = train_test_split(x_filter, yt_filter, test_size=0.2, random_state=177013)\n"
 ] }, { "cell_type": "code", "execution_count": 11, "id": "0109b9b6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "flatten (Flatten) (None, 2340) 0 \n", "_________________________________________________________________\n", "batch_normalization (BatchNo (None, 2340) 9360 \n", "_________________________________________________________________\n", "dense (Dense) (None, 2200) 5150200 \n",
"_________________________________________________________________\n", "dense_1 (Dense) (None, 1100) 2421100 \n", "_________________________________________________________________\n", "dense_2 (Dense) (None, 550) 605550 \n", "_________________________________________________________________\n", "dense_3 (Dense) (None, 225) 123975 \n", "_________________________________________________________________\n", "dense_4 (Dense) (None, 26) 5876 \n", "=================================================================\n", "Total params: 8,316,061\n", "Trainable params: 8,311,381\n", "Non-trainable params: 4,680\n", "_________________________________________________________________\n" ] } ], "source": [
 "# One softmax unit per column of the binarized labels, instead of a\n",
 "# hard-coded 26, so the output layer always matches the targets.\n",
 "n_classes = yt_filter.shape[1]\n",
 "\n",
 "model = Sequential()\n",
 "\n",
 "# Each padded sample is a 2-D (rows, features) array; flatten it for the\n",
 "# fully connected stack.\n",
 "model.add(Flatten(input_shape=x_filter[0].shape))\n",
 "\n",
 "model.add(BatchNormalization())\n",
 "\n",
 "model.add(Dense(2200, activation='relu'))\n",
 "\n",
 "model.add(Dense(1100, activation='relu'))\n",
 "\n",
 "model.add(Dense(550, activation='relu'))\n",
 "\n",
 "model.add(Dense(225, activation='relu'))\n",
 "\n",
 "model.add(Dense(n_classes, activation='softmax'))\n",
 "\n",
 "model.compile(\n",
 "    optimizer=tf.keras.optimizers.Adam(0.001),\n",
 "    loss=\"categorical_crossentropy\", \n",
 "    metrics=[\"acc\"],\n",
 ")\n",
 "\n",
 "model.summary()"
 ] }, { "cell_type": "code", "execution_count": 12, "id": "204ed561", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/32\n", "82/82 [==============================] - 1s 3ms/step - loss: 2.8553 - acc: 0.1745\n", "Epoch 2/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 1.7793 - acc: 0.4480\n", "Epoch 3/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 1.2391 - acc: 0.6070\n", "Epoch 4/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.9623 - acc: 0.7021\n", "Epoch 5/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.8489 - acc: 0.7336\n", "Epoch 6/32\n", "82/82 [==============================] - 0s
3ms/step - loss: 0.5827 - acc: 0.8169\n", "Epoch 7/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.5208 - acc: 0.8313\n", "Epoch 8/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.5864 - acc: 0.8147\n", "Epoch 9/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.4101 - acc: 0.8710\n", "Epoch 10/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.2856 - acc: 0.9087\n", "Epoch 11/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.2789 - acc: 0.9126\n", "Epoch 12/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.3118 - acc: 0.9027\n", "Epoch 13/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.3337 - acc: 0.9054\n", "Epoch 14/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.3052 - acc: 0.9049\n", "Epoch 15/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.2052 - acc: 0.9403\n", "Epoch 16/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.4292 - acc: 0.8907\n", "Epoch 17/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1545 - acc: 0.9542\n", "Epoch 18/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1401 - acc: 0.9575\n", "Epoch 19/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1907 - acc: 0.9483\n", "Epoch 20/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.2635 - acc: 0.9303\n", "Epoch 21/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1116 - acc: 0.9671\n", "Epoch 22/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.2453 - acc: 0.9317\n", "Epoch 23/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1090 - acc: 0.9681\n", "Epoch 24/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1578 - acc: 0.9541\n", "Epoch 25/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1609 - 
acc: 0.9570\n", "Epoch 26/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.0801 - acc: 0.9775\n", "Epoch 27/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.1597 - acc: 0.9615\n", "Epoch 28/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.0695 - acc: 0.9807\n", "Epoch 29/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.0622 - acc: 0.9853\n", "Epoch 30/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.0655 - acc: 0.9841\n", "Epoch 31/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.0383 - acc: 0.9910\n", "Epoch 32/32\n", "82/82 [==============================] - 0s 3ms/step - loss: 0.0716 - acc: 0.9792\n", "CPU times: user 14 s, sys: 3.02 s, total: 17 s\n", "Wall time: 8.95 s\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# Fit the compiled model on the training split: 32 epochs, batches of 128.\n", "# No validation data is passed, so the per-epoch log reports training\n", "# loss/accuracy only; held-out performance is not visible from this cell.\n", "model.fit(x_train, y_train, \n", " epochs=32,\n", " batch_size=128,\n", " shuffle=True,\n", " verbose=1\n", " )" ] }, { "cell_type": "code", "execution_count": 13, "id": "10a0d074", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Evaluate on test data\n", "82/82 [==============================] - 0s 2ms/step - loss: 1.7331 - acc: 0.7341\n", "test loss, test acc: [1.7330855131149292, 0.7341040372848511]\n", "Generate predictions for 3 samples\n", "predictions shape: (3, 26)\n" ] }, { "data": { "text/plain": [ "(array(['N', 'U', 'I'], dtype='