Initial commit

Signed-off-by: Tuan-Dat Tran <tuan-dat.tran@tudattr.dev>
Author: Tuan-Dat Tran
Date: 2024-12-31 13:36:22 +01:00
commit 931652c494
78 changed files with 46976 additions and 0 deletions

obu-node/Dockerfile Normal file (+16)

@@ -0,0 +1,16 @@
FROM python:3.11 AS compile-image
# Build stage: create a virtual environment and install the Python dependencies into it
WORKDIR /federated-example
COPY requirements.txt .
RUN python3 -m pip install --upgrade pip
RUN python3 -m venv /venv
RUN . /venv/bin/activate && \
    python3 -m ensurepip --upgrade && \
    python3 -m pip install -r /federated-example/requirements.txt

FROM python:3.11 AS run-image
# Run stage: reuse the prebuilt virtual environment, copy the sources and start the client
COPY --from=compile-image /venv /venv
WORKDIR /federated-example/src
COPY . /federated-example/
CMD . /venv/bin/activate && python3 client.py $SERVER_IP_FLWR $PARAMETER_IP:5000 $SERVER_IP_AGG $CLIENT_ID

obu-node/README.md Normal file (+16)

@@ -0,0 +1,16 @@
# OBU node
This is the version matching the final requirements, where the clients are started from the policy executor.
## Running the code using Docker
1. Build the Docker image from the `Dockerfile` with this command: `docker build -f Dockerfile -t client-image .`
2. Create a container from the image with this command: `docker run -p 8080:8080 -p 5000:5000 -p 80:80 -e SERVER_IP_FLWR={server_ip_port_flwr} -e PARAMETER_IP=1 -e SERVER_IP_AGG={server_ip_port_agg} -e CLIENT_ID={client_id} --name client --rm client-image` (more notes and a filled-in example below)
3. The client script runs automatically. It assumes the server is already accepting connections (the expected scenario as long as no error occurs on the server side); otherwise the client fails to establish the connection and stops.
* **Notes**:
- `{server_ip_port_flwr}` is the IP address and port used by the Flower framework (port 8080 in tests) and `{server_ip_port_agg}` is the IP address and port used to communicate with the DMLO (port 5000 in tests); both should be of the form `192.168.0.1:5000`.
- `{client_id}` is the ID assigned to this specific client (each client should have a unique ID).
- The `-p` flag maps the container ports to the device's ports and should be adjusted to the ports used in the simulation (currently set to ports 8080 and 5000).
- The `-e` flag sets the environment variables needed to run the script automatically.
- The execution can be stopped by opening another terminal and running `docker kill client`.
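
For illustration, a filled-in invocation could look like this (the IP addresses and the client ID below are placeholder values, not part of the repository):

```bash
# Example only: Flower server and DMLO assumed to run on 192.168.0.1 (ports 8080 and 5000)
docker build -f Dockerfile -t client-image .
docker run -p 8080:8080 -p 5000:5000 -p 80:80 \
  -e SERVER_IP_FLWR=192.168.0.1:8080 \
  -e PARAMETER_IP=1 \
  -e SERVER_IP_AGG=192.168.0.1:5000 \
  -e CLIENT_ID=1 \
  --name client --rm client-image
```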

(file name not shown, +7)

@@ -0,0 +1,7 @@
[registry."192.168.100.2:5000"]
http = true
insecure = true
ca = ["certs/192.168.100.2:5000/ca.crt"]
[[registry."192.168.100.2:5000".keypair]]
key = "certs/192.168.100.2:5000/client.key"
cert = "certs/192.168.100.2:5000/client.cert"

(file name not shown, +3)

@@ -0,0 +1,3 @@
#!/bin/bash
docker buildx create --name iana --platform linux/amd64,linux/arm64 --bootstrap --config ./buildkitd.toml --use

obu-node/buildx/setup.sh Executable file (+18)

@@ -0,0 +1,18 @@
#!/bin/bash
# Nokia
#IANA_REGISTRY=192.168.100.2:5000
# TS
IANA_REGISTRY=192.168.100.2:5000
mkdir -p certs/"$IANA_REGISTRY"
(
cd certs/"$IANA_REGISTRY" || exit 1
openssl s_client -showcerts -connect "$IANA_REGISTRY" </dev/null | sed -ne '/-BEGIN CERTIFICATE-/,/-END CERTIFICATE-/p' >ca.crt
openssl genrsa -out client.key 4096
openssl req -new -x509 -text -key client.key -out client.cert \
-subj "/C=DE/ST=Northrhine Westphalia/L=Essen/O=University Duisburg-Essen/emailAddress=tuan-dat.tran@stud.uni-due.de"
)
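
A minimal sanity check of the generated material, assuming the registry at 192.168.100.2:5000 is reachable (these commands are illustrative only and not part of the repository):

```bash
# Show subject and validity dates of the CA certificate fetched from the registry
openssl x509 -in certs/192.168.100.2:5000/ca.crt -noout -subject -dates
# Confirm that the multi-platform builder ("iana") configured with buildkitd.toml exists
docker buildx ls
```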

obu-node/docker-push.sh Executable file (+12)

@@ -0,0 +1,12 @@
#!/bin/sh
# docker tag SOURCE_IMAGE[:TAG] 192.168.100.2:5000/uulm/<COMPONENT_NAME>:<VERSION>
# docker push 192.168.100.2:5000/uulm/<COMPONENT_NAME>:<VERSION>
DOCKERFILE="./Dockerfile"
REGISTRY=192.168.100.2:5000/uulm
REMOTE_IMAGE="training_agent"
TAG=v1.3.0
docker buildx build --platform linux/amd64,linux/arm64 -f $DOCKERFILE -t \
$REGISTRY/$REMOTE_IMAGE:$TAG . --push
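
Once the image has been pushed, a node can pull it from the private registry; the command below is only an illustration using the registry, image name and tag defined above:

```bash
# Pull the multi-arch training agent image on a target node (example)
docker pull 192.168.100.2:5000/uulm/training_agent:v1.3.0
```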

obu-node/requirements.txt Normal file (+62)

@@ -0,0 +1,62 @@
absl-py==2.0.0
astunparse==1.6.3
blinker==1.7.0
cachetools==5.3.2
certifi==2023.7.22
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
cryptography==41.0.5
Flask==3.0.0
flatbuffers==23.5.26
flwr==1.5.0
gast==0.5.4
google-auth==2.23.4
google-auth-oauthlib==1.0.0
google-pasta==0.2.0
grpcio==1.59.2
h5py==3.10.0
idna==3.4
iterators==0.0.2
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.3.2
keras==2.14.0
libclang==16.0.6
Markdown==3.5.1
MarkupSafe==2.1.3
ml-dtypes==0.2.0
netifaces==0.11.0
numpy==1.26.1
oauthlib==3.2.2
opt-einsum==3.3.0
packaging==23.2
pandas==2.1.2
protobuf==3.20.3
psutil==5.9.6
pyasn1==0.5.0
pyasn1-modules==0.3.0
pycparser==2.21
pycryptodome==3.19.0
Pympler==1.0.1
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
requests-oauthlib==1.3.1
rsa==4.9
scikit-learn==1.3.2
scipy==1.11.3
six==1.16.0
tensorboard==2.14.1
tensorboard-data-server==0.7.2
tensorflow==2.14.0
tensorflow-estimator==2.14.0
tensorflow-io-gcs-filesystem==0.34.0
termcolor==2.3.0
threadpoolctl==3.2.0
typing_extensions==4.8.0
tzdata==2023.3
urllib3==2.0.7
watchdog==3.0.0
Werkzeug==3.0.1
wrapt==1.14.1

Binary file not shown.

File diff suppressed because it is too large.

obu-node/src/.gitkeep Normal file (+0)

obu-node/src/changedb.py Normal file (+8)

@@ -0,0 +1,8 @@
import pandas as pd
# Script to modify the database in use, simulating the arrival of a new database in the final version.
# The new database is the old one minus 50 rows.
df = pd.read_csv('C:/Users/Firas/Desktop/docker/data/train_c1.csv')
r = len(df) - 50
sampled = df.sample(n=r)
sampled.to_csv('C:/Users/Firas/Desktop/docker/data/train_c1.csv', index=False)
print(f"Sampled {r} rows and saved them as the new database")

(file name not shown, +31)

@@ -0,0 +1,31 @@
import requests
import sys
from time import sleep
import subprocess
def check_connection(ip):
    try:
        response = requests.post(f"http://{ip}/check_connection")
        if response.status_code == 200:
            print(f"Connection established with {ip}. The script will run in 15 seconds.")
            sleep(15)
            execute_python_file(main_script, *new_args)
    except requests.exceptions.RequestException:
        sleep(5)
        check_connection(ip)

def execute_python_file(main_script, *args):
    cmd = ['python', main_script] + list(args)
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running the script: {e}")

if __name__ == "__main__":
    ip = sys.argv[1]  # IP and port to check; for the clients this is the DMLO address
    main_script = sys.argv[2]
    new_args = sys.argv[3:]
    check_connection(ip)
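
The file name of this launcher is not shown in the view above; assuming it were saved as `check_connection.py`, an illustrative invocation (all addresses and the client ID are placeholder values) would be:

```bash
# Wait for the DMLO at 192.168.0.1:5000, then start client.py with the Flower server, DMLO, aggregation-node and client-ID arguments
python check_connection.py 192.168.0.1:5000 client.py 192.168.0.1:8080 192.168.0.1:5000 192.168.0.1:5000 1
```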

obu-node/src/client.py Normal file (+356)

@@ -0,0 +1,356 @@
import argparse
import os
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import tensorflow as tf
from tensorflow import keras
import sys
import flwr as fl
import json
import requests
from flwr.common import Scalar, Config
from time import sleep
from typing import Dict, Union
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from flask import Flask, request
import threading
from time import time_ns
# Make TensorFlow logs less verbose
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
app = Flask(__name__)
@app.route("/upload", methods=["POST"])
def upload():
global new_data, database_changed
data = request.data
data = data.decode("utf-8")
formatted_lines = []
for line in data.strip().split("\n"):
elements = line.split(",")
formatted_line = f"{elements[1]}, {elements[2]}, {elements[4].split()[0]}"
formatted_lines.append(formatted_line)
new_data = "\n".join(formatted_lines)
new_data = pd.DataFrame(
[line.split(",") for line in new_data.strip().split("\n")],
columns=["lat", "lon", "rtt"],
)
database_changed = True
return "Received new datapoints from the network monitoring tool", 200
def run_flask():
app.run(host="0.0.0.0", port=80)
flask_thread = threading.Thread(target=run_flask)
flask_thread.daemon = True
flask_thread.start()
"""
gpu_id = 0 # Index of the GPU you want to use
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.set_visible_devices(physical_devices[gpu_id], 'GPU')
tf.config.experimental.set_memory_growth(physical_devices[gpu_id], True)
"""
client_id = sys.argv[4]
server_ip = sys.argv[1]
dmlo_ip = sys.argv[2]
server_ip_kpi = sys.argv[3]
q_alpha = 0.95
n_features = 3
n_future = 1
n_past = 400
learning_rate_argv = 0.001
database_changed = False
rounds_involved, uc6_02_start_obu = (
0,
0,
) # Simple workaround to help measure the model upload time
data_df = pd.read_csv("../resources/train_c1.csv")
datapoints = len(data_df)
def reload_data(data_df): # untested change (db01)
"""Reloading the dataset after detecting a change"""
print("Database is being processed")
# data_df = pd.read_csv("data/train_c1.csv") #db01
train_df, test_df = np.split(data_df, [int(0.70 * len(data_df))])
# Scaling the dataframe
train = train_df
scalers = {}
# Scaling train data
for i in train_df.columns:
scaler = MinMaxScaler(feature_range=(-1, 1))
s_s = scaler.fit_transform(train[i].values.reshape(-1, 1))
s_s = np.reshape(s_s, len(s_s))
scalers["scaler_" + i] = scaler
train[i] = s_s
# Scaling test data
test = test_df
for i in train_df.columns:
scaler = scalers["scaler_" + i]
s_s = scaler.transform(test[i].values.reshape(-1, 1))
s_s = np.reshape(s_s, len(s_s))
scalers["scaler_" + i] = scaler
test[i] = s_s
def split_series(series, n_past, n_future):
X, y = list(), list()
# Loop to create array of every observations (past) and predictions (future) for every datapoint
for window_start in range(len(series)):
# Calculating boundaries for each datapoint
past_end = window_start + n_past
future_end = past_end + n_future
# Loop will end if the number of datapoints is less than observations (past)
if future_end > len(series):
break
past, future = (
series[window_start:past_end, :],
series[past_end:future_end, :],
)
X.append(past)
y.append(future)
return np.array(X), np.array(y)
# Creating X_train, y_train, X_test, y_test
X_train, y_train = split_series(train.values, n_past, n_future)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_features))
y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], n_features))
X_test, y_test = split_series(test.values, n_past, n_future)
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], n_features))
y_test = y_test.reshape((y_test.shape[0], y_test.shape[1], n_features))
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
y_train = y_train[:, :, 2]
y_test = y_test[:, :, 2]
global database_changed
database_changed = False
return X_train, y_train, X_test, y_test, train_df, scalers
class QuantileMetric(tf.keras.metrics.Metric):
def __init__(self, name="quantile_metric", **kwargs):
super(QuantileMetric, self).__init__(name=name, **kwargs)
self.quantile_metric = self.add_weight(
name="quantile_metric", initializer="zeros"
)
self.quantile_metric_count = self.add_weight(
name="quantile_metric_count", initializer="zeros"
)
def update_state(self, y_true, y_pred, sample_weight=None):
quantileCondition = tf.math.greater(y_true, tf.squeeze(y_pred))
qc = tf.math.reduce_sum(tf.cast(quantileCondition, tf.float32))
self.quantile_metric.assign_add(qc)
self.quantile_metric_count.assign_add(
tf.cast(tf.size(quantileCondition), tf.float32)
)
def result(self):
return self.quantile_metric / self.quantile_metric_count
def reset_state(self):
self.quantile_metric.assign(0.0)
self.quantile_metric_count.assign(0)
def tilted_loss(y_true, y_pred):
q = q_alpha
e = y_true - y_pred
tl = tf.stack([q * e, (q - 1) * e])
e_max = tf.math.reduce_max(tl, axis=0, keepdims=True)
return tf.reduce_mean(e_max)
class LSTMClient(fl.client.NumPyClient):
def __init__(self, best_model, X_train, y_train, X_test, y_test, train_df, scalers):
self.best_model = best_model
self.X_train, self.y_train = X_train, y_train
self.X_test, self.y_test = X_test, y_test
self.train_df = train_df
self.scalers = scalers
self.properties = {"client_id": client_id}
def get_properties(self, config: Config) -> Dict[str, Scalar]:
return self.properties
def get_parameters(self, config):
"""Get parameters of the local model."""
return self.best_model.get_weights()
def fit(self, parameters, config):
"""Train parameters on the locally held training set."""
uc6_01_end = time_ns() # Time required to download the global model from the agg.node in secs (Target <2s) has another part on the agg.node side
global uc6_02_start_obu, rounds_involved
rounds_involved += 1
uc6_02_end = time_ns() # Time required to upload the model (has another part on the agg.node side, in sec * 1000000000) (Target < 2s)
if rounds_involved > 1:
kpi_uc6_02 = uc6_02_end - uc6_02_start_obu
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi02", json={f"kpi02": kpi_uc6_02}
)
if response.status_code != 200:
print(f"Failed to send KPI_02. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_02: {e}")
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi01", json={f"kpi01": uc6_01_end}
)
if response.status_code != 200:
print(f"Failed to send KPI_01. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_01: {e}")
if database_changed == True:
try:
(
client.X_train,
client.y_train,
client.X_test,
client.y_test,
client.train_df,
client.scalers,
) = reload_data(new_data)
except Exception as e:
print(f"Error with the new data: {e}")
uc6_05_start = time_ns()
# Update local model parameters
self.best_model.set_weights(parameters)
# Get hyperparameters for this round
batch_size: int = config["batch_size"]
epochs: int = config["local_epochs"]
# Train the model using hyperparameters from config
history = self.best_model.fit(
self.X_train, self.y_train, batch_size, epochs, validation_split=0.1
)
# Return updated model parameters and results
parameters_prime = self.best_model.get_weights()
num_examples_train = len(self.X_train)
results = {
"id": client_id,
"loss": history.history["loss"][0],
"accuracy": history.history["mean_absolute_error"][0],
"val_loss": history.history["val_loss"][0],
"val_accuracy": history.history["val_mean_absolute_error"][0],
}
uc6_05_end = time_ns()
global kpi_uc6_05
kpi_uc6_05 = (
(uc6_05_end - uc6_05_start) / 1000000000
) # Time required to finish a training round (inkl. all local epochs) on the OBU side in sec (target <240s)
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi05", json={f"kpi05": kpi_uc6_05}
)
if response.status_code != 200:
print(f"Failed to send KPI_05. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_05: {e}")
uc6_02_start_obu = time_ns()
return parameters_prime, num_examples_train, results
def evaluate(self, parameters, config):
"""Evaluate parameters on the locally held test set."""
# Update local model with global parameters
self.best_model.set_weights(parameters)
# Evaluate global model parameters on the local test data and return results
loss, metric, error = self.best_model.evaluate(self.X_test, self.y_test, 32)
num_examples_test = len(self.X_test)
pred = self.best_model.predict(self.X_test)
pred_copies = np.repeat(pred, 3, axis=-1)
pred_copies = np.expand_dims(pred_copies, axis=1)
for index, i in enumerate(self.train_df.columns):
scaler = self.scalers["scaler_" + i]
pred_copies[:, :, index] = scaler.inverse_transform(
pred_copies[:, :, index]
)
np.save("prediction_client1.npy", pred_copies[:, :, 2])
return loss, num_examples_test, {"accuracy": error}
def main() -> None:
uc6_04_start = time_ns()
X_train, y_train, X_test, y_test, train_df, scalers = reload_data(data_df)
uc6_04_end = time_ns()
global kpi_uc6_04
kpi_uc6_04 = (
uc6_04_end - uc6_04_start
) / 1000000000 # Time required to process training data by OBU in sec (Target <60s)
try:
response = requests.post(
f"http://{server_ip_kpi}/upload_kpi04", json={f"kpi04": kpi_uc6_04}
)
if response.status_code != 200:
print(f"Failed to send KPI_04. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
print(f"Error while sending KPI_04: {e}")
best_model = tf.keras.models.load_model(
"../resources/best_model_no_tuner_40.h5", compile=False
)
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate_argv)
best_model.compile(
optimizer=opt,
loss=[tilted_loss],
metrics=[QuantileMetric(), keras.metrics.MeanAbsoluteError()],
)
global client
client = LSTMClient(best_model, X_train, y_train, X_test, y_test, train_df, scalers)
for i in range(40):
try:
response = requests.post(f"http://{server_ip_kpi}/check_connection")
if response.status_code == 200:
sleep(5)
break
except:
print(
"\n\n\n\nConnection to the Agg.Node could not be established, trying again in 5 seconds...\n",
flush=True,
)
sleep(5)
fl.client.start_numpy_client(
server_address=server_ip,
client=client,
)
if __name__ == "__main__":
main()
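
For reference, the `/upload` route above parses newline-separated records and keeps the second, third and fifth comma-separated fields as `lat`, `lon` and `rtt` (the field layout is inferred from the parsing code in `upload()`). A hypothetical test post to a client, with invented values and an assumed client address of 192.168.0.20, could look like:

```bash
# Send two fake measurement lines to the client's Flask endpoint on port 80
curl -X POST --data-binary $'0,48.4010,9.9876,2024-01-01T00:00:00,12.3 ms\n1,48.4012,9.9878,2024-01-01T00:00:01,11.8 ms' \
  http://192.168.0.20/upload
```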