Initial commit

Signed-off-by: Tuan-Dat Tran <tuan-dat.tran@tudattr.dev>
Author: Tuan-Dat Tran
Date: 2024-12-31 13:36:22 +01:00
commit 931652c494
78 changed files with 46976 additions and 0 deletions

obu-node/Dockerfile Normal file (+16)

@@ -0,0 +1,16 @@
FROM python:3.11 AS compile-image
# Build stage: create a virtual environment and install the Python dependencies into it
WORKDIR /federated-example
COPY requirements.txt .
RUN python3 -m pip install --upgrade pip
RUN python3 -m venv /venv
RUN . /venv/bin/activate && \
    python3 -m ensurepip --upgrade && \
    python3 -m pip install -r /federated-example/requirements.txt

FROM python:3.11 AS run-image
# Run stage: reuse the prebuilt virtual environment, copy the sources and start the client
COPY --from=compile-image /venv /venv
WORKDIR /federated-example/src
COPY . /federated-example/
CMD . /venv/bin/activate && python3 client.py $SERVER_IP_FLWR $PARAMETER_IP:5000 $SERVER_IP_AGG $CLIENT_ID

obu-node/README.md Normal file (+16)

@@ -0,0 +1,16 @@
# OBU node
This is the version matching the final requirements, where the clients are started from the policy executor.
## Running the code using Docker
1. Build the Docker image from the `Dockerfile` with this command: `docker build -f Dockerfile -t client-image .`
2. Create a container from the image with this command: `docker run -p 8080:8080 -p 5000:5000 -p 80:80 -e SERVER_IP_FLWR={server_ip_port_flwr} -e PARAMETER_IP=1 -e SERVER_IP_AGG={server_ip_port_agg} -e CLIENT_ID={client_id} --name client --rm client-image` (more notes and a filled-in example below)
3. The client script runs automatically. It assumes the server is already accepting connections (the expected scenario as long as no error occurs on the server side); otherwise the client fails to establish the connection and stops.
* **Notes**:
- `{server_ip_port_flwr}` is the IP address and port used by the Flower framework (port 8080 in tests) and `{server_ip_port_agg}` is the IP address and port used to communicate with the DMLO (port 5000 in tests); both should be of the form `192.168.0.1:5000`.
- `{client_id}` is the ID assigned to this specific client (each client should have a unique ID).
- The `-p` flag maps the container ports to the device's ports and should be adjusted to the ports used in the simulation (currently set to ports 8080 and 5000).
- The `-e` flag sets the environment variables needed to run the script automatically.
- The execution can be stopped by opening another terminal and running `docker kill client`.
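
For illustration, a filled-in invocation could look like this (the IP addresses and the client ID below are placeholder values, not part of the repository):

```bash
# Example only: Flower server and DMLO assumed to run on 192.168.0.1 (ports 8080 and 5000)
docker build -f Dockerfile -t client-image .
docker run -p 8080:8080 -p 5000:5000 -p 80:80 \
  -e SERVER_IP_FLWR=192.168.0.1:8080 \
  -e PARAMETER_IP=1 \
  -e SERVER_IP_AGG=192.168.0.1:5000 \
  -e CLIENT_ID=1 \
  --name client --rm client-image
```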

(file name not shown, +7)

@@ -0,0 +1,7 @@
[registry."192.168.100.2:5000"]
http = true
insecure = true
ca = ["certs/192.168.100.2:5000/ca.crt"]
[[registry."192.168.100.2:5000".keypair]]
key = "certs/192.168.100.2:5000/client.key"
cert = "certs/192.168.100.2:5000/client.cert"

(file name not shown, +3)

@@ -0,0 +1,3 @@
#!/bin/bash
docker buildx create --name iana --platform linux/amd64,linux/arm64 --bootstrap --config ./buildkitd.toml --use

obu-node/buildx/setup.sh Executable file (+18)

@@ -0,0 +1,18 @@
#!/bin/bash
# Nokia
#IANA_REGISTRY=192.168.100.2:5000
# TS
IANA_REGISTRY=192.168.100.2:5000
mkdir -p certs/"$IANA_REGISTRY"
(
cd certs/"$IANA_REGISTRY" || exit 1
openssl s_client -showcerts -connect "$IANA_REGISTRY" </dev/null | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' >ca.crt
openssl genrsa -out client.key 4096
openssl req -new -x509 -text -key client.key -out client.cert \
-subj "/C=DE/ST=Northrhine Westphalia/L=Essen/O=University Duisburg-Essen/emailAddress=tuan-dat.tran@stud.uni-due.de"
)
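
A minimal sanity check of the generated material, assuming the registry at 192.168.100.2:5000 is reachable (these commands are illustrative only and not part of the repository):

```bash
# Show subject and validity dates of the CA certificate fetched from the registry
openssl x509 -in certs/192.168.100.2:5000/ca.crt -noout -subject -dates
# Confirm that the multi-platform builder ("iana") configured with buildkitd.toml exists
docker buildx ls
```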

obu-node/docker-push.sh Executable file (+12)

@@ -0,0 +1,12 @@
#!/bin/sh
# docker tag SOURCE_IMAGE[:TAG] 192.168.100.2:5000/uulm/<COMPONENT_NAME>:<VERSION>
# docker push 192.168.100.2:5000/uulm/<COMPONENT_NAME>:<VERSION>
DOCKERFILE="./Dockerfile"
REGISTRY=192.168.100.2:5000/uulm
REMOTE_IMAGE="training_agent"
TAG=v1.3.0
docker buildx build --platform linux/amd64,linux/arm64 -f $DOCKERFILE -t \
$REGISTRY/$REMOTE_IMAGE:$TAG . --push
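
Once the image has been pushed, a node can pull it from the private registry; the command below is only an illustration using the registry, image name and tag defined above:

```bash
# Pull the multi-arch training agent image on a target node (example)
docker pull 192.168.100.2:5000/uulm/training_agent:v1.3.0
```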

obu-node/requirements.txt Normal file (+62)

@@ -0,0 +1,62 @@
absl-py==2.0.0
astunparse==1.6.3
blinker==1.7.0
cachetools==5.3.2
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cryptography==41.0.5
Flask==3.0.0
flatbuffers==23.5.26
flwr==1.5.0
gast==0.5.4
google-auth==2.23.4
google-auth-oauthlib==1.0.0
google-pasta==0.2.0
grpcio==1.59.2
h5py==3.10.0
idna==3.4
iterators==0.0.2
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
keras==2.14.0
libclang==16.0.6
Markdown==3.5.1
MarkupSafe==2.1.3
ml-dtypes==0.2.0
netifaces==0.11.0
numpy==1.26.1
oauthlib==3.2.2
opt-einsum==3.3.0
packaging==23.2
pandas==2.1.2
protobuf==3.20.3
psutil==5.9.6
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pycryptodome==3.19.0
Pympler==1.0.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
scikit-learn==1.3.2
scipy==1.11.3
six==1.16.0
tensorboard==2.14.1
tensorboard-data-server==0.7.2
tensorflow==2.14.0
tensorflow-estimator==2.14.0
tensorflow-io-gcs-filesystem==0.34.0
termcolor==2.3.0
threadpoolctl==3.2.0
typing_extensions==4.8.0
tzdata==2023.3
urllib3==2.0.7
watchdog==3.0.0
Werkzeug==3.0.1
wrapt==1.14.1

Binary file not shown.

File diff suppressed because it is too large.

obu-node/src/.gitkeep Normal file (+0)

obu-node/src/changedb.py Normal file (+8)

@@ -0,0 +1,8 @@
import pandas as pd
# Script to modify the database in use, simulating the arrival of a new database in the final version.
# The new database is the old one minus 50 rows.
df = pd.read_csv('C:/Users/Firas/Desktop/docker/data/train_c1.csv')
r = len(df) - 50
sampled = df.sample(n=r)
sampled.to_csv('C:/Users/Firas/Desktop/docker/data/train_c1.csv', index=False)
print(f"Sampled {r} rows and saved them as the new database")

(file name not shown, +31)

@@ -0,0 +1,31 @@
import requests
import sys
from time import sleep
import subprocess
def check_connection(ip):
    try:
        response = requests.post(f"http://{ip}/check_connection")
        if response.status_code == 200:
            print(f"Connection established with {ip}. The script will run in 15 seconds.")
            sleep(15)
            execute_python_file(main_script, *new_args)
    except requests.exceptions.RequestException:
        sleep(5)
        check_connection(ip)

def execute_python_file(main_script, *args):
    cmd = ['python', main_script] + list(args)
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running the script: {e}")

if __name__ == "__main__":
    ip = sys.argv[1]  # IP and port to check; for the clients this is the DMLO address
    main_script = sys.argv[2]
    new_args = sys.argv[3:]
    check_connection(ip)
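
The file name of this launcher is not shown in the view above; assuming it were saved as `check_connection.py`, an illustrative invocation (all addresses and the client ID are placeholder values) would be:

```bash
# Wait for the DMLO at 192.168.0.1:5000, then start client.py with the Flower server, DMLO, aggregation-node and client-ID arguments
python check_connection.py 192.168.0.1:5000 client.py 192.168.0.1:8080 192.168.0.1:5000 192.168.0.1:5000 1
```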

obu-node/src/client.py Normal file (+356)

@@ -0,0 +1,356 @@
import argparse
import os
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import tensorflow as tf
from tensorflow import keras
import sys
import flwr as fl
import json
import requests
from flwr.common import Scalar, Config
from time import sleep
from typing import Dict, Union
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from flask import Flask, request
import threading
from time import time_ns
# Make TensorFlow logs less verbose
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
app = Flask(__name__)
@app.route("/upload", methods=["POST"])
def upload():
global new_data, database_changed
data = request.data
data = data.decode("utf-8")
formatted_lines = []
for line in data.strip().split("\n"):
elements = line.split(",")
formatted_line = f"{elements[1]}, {elements[2]}, {elements[4].split()[0]}"
formatted_lines.append(formatted_line)
new_data = "\n".join(formatted_lines)
new_data = pd.DataFrame(
[line.split(",") for line in new_data.strip().split("\n")],
columns=["lat", "lon", "rtt"],
)
database_changed = True
return "Received new datapoints from the network monitoring tool", 200
def run_flask():
app.run(host="0.0.0.0", port=80)
flask_thread = threading.Thread(target=run_flask)
flask_thread.daemon = True
flask_thread.start()
"""
gpu_id = 0 # Index of the GPU you want to use
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.set_visible_devices(physical_devices[gpu_id], 'GPU')
tf.config.experimental.set_memory_growth(physical_devices[gpu_id], True)
"""
client_id = sys.argv[4]
server_ip = sys.argv[1]
dmlo_ip = sys.argv[2]
server_ip_kpi = sys.argv[3]
q_alpha = 0.95
n_features = 3
n_future = 1
n_past = 400
learning_rate_argv = 0.001
database_changed = False
rounds_involved, uc6_02_start_obu = (
0,
0,
) # Simple workaround to help measure the model upload time
data_df = pd.read_csv("../resources/train_c1.csv")
datapoints = len(data_df)
def reload_data(data_df): # untested change (db01)
"""Reloading the dataset after detecting a change"""
print("Database is being processed")
# data_df = pd.read_csv("data/train_c1.csv") #db01
train_df, test_df = np.split(data_df, [int(0.70 * len(data_df))])
# Scaling the dataframe
train = train_df
scalers = {}
# Scaling train data
for i in train_df.columns:
scaler = MinMaxScaler(feature_range=(-1, 1))
s_s = scaler.fit_transform(train[i].values.reshape(-1, 1))
s_s = np.reshape(s_s, len(s_s))
scalers["scaler_" + i] = scaler
train[i] = s_s
# Scaling test data
test = test_df
for i in train_df.columns:
scaler = scalers["scaler_" + i]
s_s = scaler.transform(test[i].values.reshape(-1, 1))
s_s = np.reshape(s_s, len(s_s))
scalers["scaler_" + i] = scaler
test[i] = s_s
def split_series(series, n_past, n_future):
X, y = list(), list()
# Loop to create array of every observations (past) and predictions (future) for every datapoint
for window_start in range(len(series)):
# Calculating boundaries for each datapoint
past_end = window_start + n_past
future_end = past_end + n_future
# Loop will end if the number of datapoints is less than observations (past)
if future_end > len(series):
break
past, future = (
series[window_start:past_end, :],
series[past_end:future_end, :],
)
X.append(past)
y.append(future)
return np.array(X), np.array(y)
# Creating X_train, y_train, X_test, y_test
X_train, y_train = split_series(train.values, n_past, n_future)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))
y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], n_features))
X_test, y_test = split_series(test.values, n_past, n_future)
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], n_features))
y_test = y_test.reshape((y_test.shape[0], y_test.shape[1], n_features))
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
y_train = y_train[:, :, 2]
y_test = y_test[:, :, 2]
global database_changed
database_changed = False
return X_train, y_train, X_test, y_test, train_df, scalers
class QuantileMetric(tf.keras.metrics.Metric):
def __init__(self, name="quantile_metric", **kwargs):
super(QuantileMetric, self).__init__(name=name, **kwargs)
self.quantile_metric = self.add_weight(
name="quantile_metric", initializer="zeros"
)
self.quantile_metric_count = self.add_weight(
name="quantile_metric_count", initializer="zeros"
)
def update_state(self, y_true, y_pred, sample_weight=None):
quantileCondition = tf.math.greater(y_true, tf.squeeze(y_pred))
qc = tf.math.reduce_sum(tf.cast(quantileCondition, tf.float32))
self.quantile_metric.assign_add(qc)
self.quantile_metric_count.assign_add(
tf.cast(tf.size(quantileCondition), tf.float32)
)
def result(self):
return self.quantile_metric / self.quantile_metric_count
def reset_state(self):
self.quantile_metric.assign(0.0)
self.quantile_metric_count.assign(0)
def tilted_loss(y_true, y_pred):
q = q_alpha
e = y_true - y_pred
tl = tf.stack([q * e, (q - 1) * e])
e_max = tf.math.reduce_max(tl, axis=0, keepdims=True)
return tf.reduce_mean(e_max)
class LSTMClient(fl.client.NumPyClient):
def __init__(self, best_model, X_train, y_train, X_test, y_test, train_df, scalers):
self.best_model = best_model
self.X_train, self.y_train = X_train, y_train
self.X_test, self.y_test = X_test, y_test
self.train_df = train_df
self.scalers = scalers
self.properties = {"client_id": client_id}
def get_properties(self, config: Config) -> Dict[str, Scalar]:
return self.properties
def get_parameters(self, config):
"""Get parameters of the local model."""
return self.best_model.get_weights()
def fit(self, parameters, config):
"""Train parameters on the locally held training set."""
uc6_01_end = time_ns() # Time required to download the global model from the agg.node in secs (Target <2s) has another part on the agg.node side
global uc6_02_start_obu, rounds_involved
rounds_involved += 1
uc6_02_end = time_ns() # Time required to upload the model (has another part on the agg.node side, in sec * 1000000000) (Target < 2s)
if rounds_involved > 1:
kpi_uc6_02 = uc6_02_end - uc6_02_start_obu
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi02", json={f"kpi02": kpi_uc6_02}
)
if response.status_code != 200:
print(f"Failed to send KPI_02. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_02: {e}")
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi01", json={f"kpi01": uc6_01_end}
)
if response.status_code != 200:
print(f"Failed to send KPI_01. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_01: {e}")
if database_changed == True:
try:
(
client.X_train,
client.y_train,
client.X_test,
client.y_test,
client.train_df,
client.scalers,
) = reload_data(new_data)
except Exception as e:
print(f"Error with the new data: {e}")
uc6_05_start = time_ns()
# Update local model parameters
self.best_model.set_weights(parameters)
# Get hyperparameters for this round
batch_size: int = config["batch_size"]
epochs: int = config["local_epochs"]
# Train the model using hyperparameters from config
history = self.best_model.fit(
self.X_train, self.y_train, batch_size, epochs, validation_split=0.1
)
# Return updated model parameters and results
parameters_prime = self.best_model.get_weights()
num_examples_train = len(self.X_train)
results = {
"id": client_id,
"loss": history.history["loss"][0],
"accuracy": history.history["mean_absolute_error"][0],
"val_loss": history.history["val_loss"][0],
"val_accuracy": history.history["val_mean_absolute_error"][0],
}
uc6_05_end = time_ns()
global kpi_uc6_05
kpi_uc6_05 = (
(uc6_05_end - uc6_05_start) / 1000000000
) # Time required to finish a training round (inkl. all local epochs) on the OBU side in sec (target <240s)
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi05", json={f"kpi05": kpi_uc6_05}
)
if response.status_code != 200:
print(f"Failed to send KPI_05. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_05: {e}")
uc6_02_start_obu = time_ns()
return parameters_prime, num_examples_train, results
def evaluate(self, parameters, config):
"""Evaluate parameters on the locally held test set."""
# Update local model with global parameters
self.best_model.set_weights(parameters)
# Evaluate global model parameters on the local test data and return results
loss, metric, error = self.best_model.evaluate(self.X_test, self.y_test, 32)
num_examples_test = len(self.X_test)
pred = self.best_model.predict(self.X_test)
pred_copies = np.repeat(pred, 3, axis=-1)
pred_copies = np.expand_dims(pred_copies, axis=1)
for index, i in enumerate(self.train_df.columns):
scaler = self.scalers["scaler_" + i]
pred_copies[:, :, index] = scaler.inverse_transform(
pred_copies[:, :, index]
)
np.save("prediction_client1.npy", pred_copies[:, :, 2])
return loss, num_examples_test, {"accuracy": error}
def main() -> None:
uc6_04_start = time_ns()
X_train, y_train, X_test, y_test, train_df, scalers = reload_data(data_df)
uc6_04_end = time_ns()
global kpi_uc6_04
kpi_uc6_04 = (
uc6_04_end - uc6_04_start
) / 1000000000 # Time required to process training data by OBU in sec (Target <60s)
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi04", json={f"kpi04": kpi_uc6_04}
)
if response.status_code != 200:
print(f"Failed to send KPI_04. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_04: {e}")
best_model = tf.keras.models.load_model(
"../resources/best_model_no_tuner_40.h5", compile=False
)
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate_argv)
best_model.compile(
optimizer=opt,
loss=[tilted_loss],
metrics=[QuantileMetric(), keras.metrics.MeanAbsoluteError()],
)
global client
client = LSTMClient(best_model, X_train, y_train, X_test, y_test, train_df, scalers)
for i in range(40):
try:
response = requests.post(f"http://{server_ip_kpi}/check_connection")
if response.status_code == 200:
sleep(5)
break
except:
print(
"\n\n\n\nConnection to the Agg.Node could not be established, trying again in 5 seconds...\n",
flush=True,
)
sleep(5)
fl.client.start_numpy_client(
server_address=server_ip,
client=client,
)
if __name__ == "__main__":
main()
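
For reference, the `/upload` route above parses newline-separated records and keeps the second, third and fifth comma-separated fields as `lat`, `lon` and `rtt` (the field layout is inferred from the parsing code in `upload()`). A hypothetical test post to a client, with invented values and an assumed client address of 192.168.0.20, could look like:

```bash
# Send two fake measurement lines to the client's Flask endpoint on port 80
curl -X POST --data-binary $'0,48.4010,9.9876,2024-01-01T00:00:00,12.3 ms\n1,48.4012,9.9878,2024-01-01T00:00:01,11.8 ms' \
  http://192.168.0.20/upload
```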