Initial commit
Signed-off-by: Tuan-Dat Tran <tuan-dat.tran@tudattr.dev>
This commit is contained in:
17
aggregator-node/Dockerfile
Normal file
17
aggregator-node/Dockerfile
Normal file
@@ -0,0 +1,17 @@
|
||||
FROM python:3.11 AS compile-image
|
||||
WORKDIR /federated-example
|
||||
COPY requirements.txt .
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN python3 -m venv /venv
|
||||
RUN . /venv/bin/activate && \
|
||||
python3 -m ensurepip --upgrade && \
|
||||
python3 -m pip install -r /federated-example/requirements.txt
|
||||
|
||||
FROM python:3.11 AS run-image
|
||||
COPY --from=compile-image /venv /venv
|
||||
|
||||
WORKDIR /federated-example/src
|
||||
|
||||
COPY . /federated-example/
|
||||
# RUN apt-get update && apt-get install -y tshark && rm -rf /var/lib/apt/lists/*
|
||||
CMD . /venv/bin/activate && python server.py $FLWR_PORT $DMLO_PORT
|
||||
0
aggregator-node/Example files/.gitkeep
Normal file
0
aggregator-node/Example files/.gitkeep
Normal file
3
aggregator-node/Example files/clients_list.json
Normal file
3
aggregator-node/Example files/clients_list.json
Normal file
@@ -0,0 +1,3 @@
|
||||
{
|
||||
"eligible_clients_ids" : ["1", "2"]
|
||||
}
|
||||
10
aggregator-node/Example files/config_server.json
Normal file
10
aggregator-node/Example files/config_server.json
Normal file
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"ml_model": "../resources/best_model_no_tuner_40.h5",
|
||||
"num_epochs": 20,
|
||||
"min_working_nodes": 2,
|
||||
"hyperparam_epochs": 10,
|
||||
"hyperparam_batch_size": 2048,
|
||||
"hyperparam_learning_rate": 0.001,
|
||||
"avg_algorithm": "FedAvg",
|
||||
"training_clients_per_round": 2
|
||||
}
|
||||
25
aggregator-node/README.md
Normal file
25
aggregator-node/README.md
Normal file
@@ -0,0 +1,25 @@
|
||||
# Aggregator node
|
||||
|
||||
This is the version matching the final requirements where the client are started from the policy execution
|
||||
|
||||
## Running the code using Docker
|
||||
|
||||
1. To create the Docker Image, run "Dockerfile" using this command: `docker build -f Dockerfile -t server-image .`
|
||||
2. Create a container from the above image using this command: `docker run -p 8080:8080 -p 5000:5000 -e FLWR_PORT={flwr_port} -e DMLO_PORT={dmlo_port} --name server --rm server-image`
|
||||
3. The script for the Agg.Node will run automatically, the other nodes will await the Agg.Node if they are started first.
|
||||
|
||||
* **Notes**:
|
||||
- `flwr_port` is the port number that will be used to communicate with the clients on the flower level (8080 for tests).
|
||||
- `dmlo_port` is teh port number that will be used to communicate with the dmlo (5000 for tests).
|
||||
- The `-p` flag is used to map the docker ports to the devices ports and should be changed according to the ports used in the simulation (currently set to ports 8080 and 5000).
|
||||
- The execution can be stopped by opening another terminal and using this command `docker kill server`.
|
||||
- The "Example files" directory contains examples for json files to be sent to the server. (The list of client IDs sent to the server should be a list of strings and not integers, see the example json file)
|
||||
|
||||
* **Below are helper shell commands to simulate server functions triggered by the DMLO (if needed):**
|
||||
|
||||
- To send the param file to the server:
|
||||
`curl -X POST -H "Content-Type: application/json" -d @{file_name}.json {server_ip}:5000/config_server`
|
||||
- To send the list of eligible clients to the server:
|
||||
`curl -X POST -H "Content-Type: application/json" -d @{file_name}.json {server_ip}:5000/select_clients`
|
||||
- To terminate the training:
|
||||
`curl -X POST -d "" {server_ip}:5000/terminate_app`
|
||||
12
aggregator-node/docker-push.sh
Executable file
12
aggregator-node/docker-push.sh
Executable file
@@ -0,0 +1,12 @@
|
||||
#!/bin/sh
|
||||
|
||||
# docker tag SOURCE_IMAGE[:TAG] 192.168.100.2:5000/uulm/<COMPONENT_NAME>:<VERSION>
|
||||
# docker push 192.168.100.2:5000/uulm/<COMPONENT_NAME>:<VERSION>
|
||||
|
||||
TA_VERSION=v1.2.0
|
||||
LOCAL_IMAGE="aggregator"
|
||||
REMOTE_IMAGE="uc6aggnode"
|
||||
|
||||
docker build -t $LOCAL_IMAGE .
|
||||
docker tag $LOCAL_IMAGE:latest 192.168.100.2:5000/uulm/$REMOTE_IMAGE:$TA_VERSION
|
||||
docker push 192.168.100.2:5000/uulm/$REMOTE_IMAGE:$TA_VERSION
|
||||
151
aggregator-node/logs/2024-07-15.log
Normal file
151
aggregator-node/logs/2024-07-15.log
Normal file
@@ -0,0 +1,151 @@
|
||||
nxw@5g-iana-manager:~$ kc logs $(kc get pods --all-namespaces | grep agg | awk '{ print $2 } ') -n $(kc get pods --all-namespaces | grep agg | awk '{ print $1 } ') -f
|
||||
2024-07-15 15:48:16.807096: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
|
||||
2024-07-15 15:48:16.828642: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
|
||||
2024-07-15 15:48:16.828672: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
|
||||
2024-07-15 15:48:16.828715: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registe
|
||||
2024-07-15 15:48:16.833761: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
|
||||
2024-07-15 15:48:16.833925: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
|
||||
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
|
||||
2024-07-15 15:48:17.538321: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
|
||||
/federated-example/src/server.py:169: DeprecationWarning: setDaemon() is deprecated, set the daemon attribute instead
|
||||
flask_thread.setDaemon(True)
|
||||
* Serving Flask app 'server'
|
||||
* Debug mode: off
|
||||
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
|
||||
* Running on all addresses (0.0.0.0)
|
||||
* Running on http://127.0.0.1:5000
|
||||
* Running on http://10.1.68.68:5000
|
||||
Press CTRL+C to quit
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:00] "POST /upload_kpi04 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:01] "POST /check_connection HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:07] "POST /upload_kpi04 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:08] "POST /check_connection HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:12] "POST /config_server HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:12] "GET /select_clients HTTP/1.1" 200 -
|
||||
INFO flwr 2024-07-15 15:49:12,445 | app.py:162 | Starting Flower server, config: ServerConfig(num_rounds=5, round_timeout=None)
|
||||
INFO flwr 2024-07-15 15:49:12,449 | app.py:175 | Flower ECE: gRPC server running (5 rounds), SSL is disabled
|
||||
INFO flwr 2024-07-15 15:49:12,449 | server.py:89 | Initializing global parameters
|
||||
INFO flwr 2024-07-15 15:49:12,450 | server.py:272 | Using initial parameters provided by strategy
|
||||
INFO flwr 2024-07-15 15:49:12,450 | server.py:91 | Evaluating initial parameters
|
||||
Parameters loaded
|
||||
Inializing Model
|
||||
Model loaded
|
||||
Model Compiled
|
||||
(2003, 400, 3)
|
||||
(2003, 1, 3)
|
||||
63/63 [==============================] - 2s 23ms/step - loss: 0.0739 - quantile_metric: 0.1243 - mean_absolute_error: 0.5655
|
||||
63/63 [==============================] - 2s 22ms/step
|
||||
INFO flwr 2024-07-15 15:49:16,180 | server.py:94 | initial parameters (loss, other metrics): 0.07388024777173996, {'accuracy': 0.5655196309089661}
|
||||
INFO flwr 2024-07-15 15:49:16,180 | server.py:104 | FL starting
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:35] "POST /upload_kpi04 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:36] "POST /check_connection HTTP/1.1" 200 -
|
||||
DEBUG flwr 2024-07-15 15:49:41,146 | server.py:222 | fit_round 1: strategy sampled 2 clients (out of 2)
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:41] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:49:41] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:50:21] "POST /upload_kpi04 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:50:21] "POST /check_connection HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:50:42] "GET /select_clients HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:51:06] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:51:12] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
DEBUG flwr 2024-07-15 15:51:12,130 | server.py:236 | fit_round 1 received 2 results and 0 failures
|
||||
WARNING flwr 2024-07-15 15:51:12,131 | fedavg.py:242 | No fit_metrics_aggregation_fn provided
|
||||
2 clients connected.
|
||||
WARNING: 2 clients are needed but only 3 client IDs are received. The training will wait for another list with enough eligible clients.
|
||||
(2003, 400, 3)
|
||||
(2003, 1, 3)
|
||||
63/63 [==============================] - 1s 23ms/step - loss: 0.1734 - quantile_metric: 0.1908 - mean_absolute_error: 2.4910
|
||||
63/63 [==============================] - 1s 22ms/step
|
||||
INFO flwr 2024-07-15 15:51:15,075 | server.py:125 | fit progress: (1, 0.1733752340078354, {'accuracy': 2.490957498550415}, 118.89502924995031)
|
||||
DEBUG flwr 2024-07-15 15:51:15,149 | server.py:173 | evaluate_round 1: strategy sampled 3 clients (out of 3)
|
||||
DEBUG flwr 2024-07-15 15:51:26,920 | server.py:187 | evaluate_round 1 received 3 results and 0 failures
|
||||
WARNING flwr 2024-07-15 15:51:26,920 | fedavg.py:273 | No evaluate_metrics_aggregation_fn provided
|
||||
DEBUG flwr 2024-07-15 15:51:26,974 | server.py:222 | fit_round 2: strategy sampled 3 clients (out of 3)
|
||||
10.1.3.0 - - [15/Jul/2024 15:51:27] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:51:27] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:51:27] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:51:27] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:51:27] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:05] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:42] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:47] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
DEBUG flwr 2024-07-15 15:52:47,347 | server.py:236 | fit_round 2 received 3 results and 0 failures
|
||||
2 clients connected.
|
||||
2 clients connected.
|
||||
(2003, 400, 3)
|
||||
(2003, 1, 3)
|
||||
63/63 [==============================] - 1s 21ms/step - loss: 0.0874 - quantile_metric: 0.2492 - mean_absolute_error: 0.2591
|
||||
63/63 [==============================] - 1s 21ms/step
|
||||
INFO flwr 2024-07-15 15:52:50,161 | server.py:125 | fit progress: (2, 0.08735799789428711, {'accuracy': 0.2590666115283966}, 213.98151048796717)
|
||||
DEBUG flwr 2024-07-15 15:52:50,221 | server.py:173 | evaluate_round 2: strategy sampled 3 clients (out of 3)
|
||||
DEBUG flwr 2024-07-15 15:52:59,542 | server.py:187 | evaluate_round 2 received 3 results and 0 failures
|
||||
DEBUG flwr 2024-07-15 15:52:59,589 | server.py:222 | fit_round 3: strategy sampled 3 clients (out of 3)
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:59] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:59] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:59] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:59] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:59] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:52:59] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:53:34] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:53:36] "POST /upload_kpi04 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:53:36] "POST /check_connection HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:54:12] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
DEBUG flwr 2024-07-15 15:54:13,045 | server.py:236 | fit_round 3 received 2 results and 1 failures
|
||||
2 clients connected.
|
||||
2 clients connected.
|
||||
(2003, 400, 3)
|
||||
(2003, 1, 3)
|
||||
63/63 [==============================] - 1s 22ms/step - loss: 0.0654 - quantile_metric: 0.1364 - mean_absolute_error: 0.9301
|
||||
63/63 [==============================] - 1s 22ms/step
|
||||
INFO flwr 2024-07-15 15:54:15,922 | server.py:125 | fit progress: (3, 0.06537292897701263, {'accuracy': 0.9301236867904663}, 299.7421916149906)
|
||||
DEBUG flwr 2024-07-15 15:54:15,981 | server.py:173 | evaluate_round 3: strategy sampled 3 clients (out of 3)
|
||||
DEBUG flwr 2024-07-15 15:54:28,262 | server.py:187 | evaluate_round 3 received 3 results and 0 failures
|
||||
DEBUG flwr 2024-07-15 15:54:28,314 | server.py:222 | fit_round 4: strategy sampled 3 clients (out of 3)
|
||||
10.1.3.0 - - [15/Jul/2024 15:54:28] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:54:28] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:54:28] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:54:28] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:54:28] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:55:03] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:55:40] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:55:53] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
DEBUG flwr 2024-07-15 15:55:53,632 | server.py:236 | fit_round 4 received 3 results and 0 failures
|
||||
2 clients connected.
|
||||
2 clients connected.
|
||||
(2003, 400, 3)
|
||||
(2003, 1, 3)
|
||||
63/63 [==============================] - 1s 22ms/step - loss: 0.1268 - quantile_metric: 0.3151 - mean_absolute_error: 0.3247
|
||||
63/63 [==============================] - 1s 22ms/step
|
||||
INFO flwr 2024-07-15 15:55:56,563 | server.py:125 | fit progress: (4, 0.12679509818553925, {'accuracy': 0.3247184455394745}, 400.3833388419589)
|
||||
DEBUG flwr 2024-07-15 15:55:56,646 | server.py:173 | evaluate_round 4: strategy sampled 3 clients (out of 3)
|
||||
DEBUG flwr 2024-07-15 15:56:06,016 | server.py:187 | evaluate_round 4 received 3 results and 0 failures
|
||||
DEBUG flwr 2024-07-15 15:56:06,066 | server.py:222 | fit_round 5: strategy sampled 3 clients (out of 3)
|
||||
10.1.3.0 - - [15/Jul/2024 15:56:06] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:56:06] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:56:06] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:56:06] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:56:06] "POST /upload_kpi02 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:56:06] "POST /upload_kpi01 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:56:41] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:57:17] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:57:25] "POST /upload_kpi05 HTTP/1.1" 200 -
|
||||
DEBUG flwr 2024-07-15 15:57:25,615 | server.py:236 | fit_round 5 received 3 results and 0 failures
|
||||
2 clients connected.
|
||||
2 clients connected.
|
||||
(2003, 400, 3)
|
||||
(2003, 1, 3)
|
||||
63/63 [==============================] - 1s 22ms/step - loss: 0.0718 - quantile_metric: 0.1710 - mean_absolute_error: 0.3574
|
||||
63/63 [==============================] - 1s 22ms/step
|
||||
INFO flwr 2024-07-15 15:57:28,518 | server.py:125 | fit progress: (5, 0.0717623308300972, {'accuracy': 0.35737916827201843}, 492.3376815340016)
|
||||
DEBUG flwr 2024-07-15 15:57:28,599 | server.py:173 | evaluate_round 5: strategy sampled 3 clients (out of 3)
|
||||
DEBUG flwr 2024-07-15 15:57:37,732 | server.py:187 | evaluate_round 5 received 3 results and 0 failures
|
||||
INFO flwr 2024-07-15 15:57:37,732 | server.py:153 | FL finished in 501.5518533719587
|
||||
INFO flwr 2024-07-15 15:57:37,732 | app.py:225 | app_fit: losses_distributed [(1, 0.22432586054007211), (2, 0.05442244683702787), (3, 0.06365528702735901), (4, 0.05708811432123184), (5, 0.04476702958345413)]
|
||||
INFO flwr 2024-07-15 15:57:37,732 | app.py:226 | app_fit: metrics_distributed_fit {}
|
||||
INFO flwr 2024-07-15 15:57:37,732 | app.py:227 | app_fit: metrics_distributed {}
|
||||
INFO flwr 2024-07-15 15:57:37,732 | app.py:228 | app_fit: losses_centralized [(0, 0.07388024777173996), (1, 0.1733752340078354), (2, 0.08735799789428711), (3, 0.06537292897701263), (4, 0.12679509818553925), (5, 0.0717623308300972)]
|
||||
INFO flwr 2024-07-15 15:57:37,732 | app.py:229 | app_fit: metrics_centralized {'accuracy': [(0, 0.5655196309089661), (1, 2.490957498550415), (2, 0.2590666115283966), (3, 0.9301236867904663), (4, 0.3247184455394745), (5, 0.35737916827201843)]}
|
||||
2 clients connected.
|
||||
10.1.3.0 - - [15/Jul/2024 15:58:02] "POST /upload_kpi04 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:58:02] "POST /check_connection HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:58:13] "POST /upload_kpi04 HTTP/1.1" 200 -
|
||||
10.1.3.0 - - [15/Jul/2024 15:58:14] "POST /check_connection HTTP/1.1" 200 -
|
||||
62
aggregator-node/requirements.txt
Normal file
62
aggregator-node/requirements.txt
Normal file
@@ -0,0 +1,62 @@
|
||||
absl-py==2.0.0
|
||||
astunparse==1.6.3
|
||||
blinker==1.7.0
|
||||
cachetools==5.3.2
|
||||
certifi==2023.7.22
|
||||
cffi==1.16.0
|
||||
charset-normalizer==3.3.2
|
||||
click==8.1.7
|
||||
cryptography==41.0.5
|
||||
Flask==3.0.0
|
||||
flatbuffers==23.5.26
|
||||
flwr==1.5.0
|
||||
gast==0.5.4
|
||||
google-auth==2.23.4
|
||||
google-auth-oauthlib==1.0.0
|
||||
google-pasta==0.2.0
|
||||
grpcio==1.59.2
|
||||
h5py==3.10.0
|
||||
idna==3.4
|
||||
iterators==0.0.2
|
||||
itsdangerous==2.1.2
|
||||
Jinja2==3.1.2
|
||||
joblib==1.3.2
|
||||
keras==2.14.0
|
||||
libclang==16.0.6
|
||||
Markdown==3.5.1
|
||||
MarkupSafe==2.1.3
|
||||
ml-dtypes==0.2.0
|
||||
netifaces==0.11.0
|
||||
numpy==1.26.1
|
||||
oauthlib==3.2.2
|
||||
opt-einsum==3.3.0
|
||||
packaging==23.2
|
||||
pandas==2.1.2
|
||||
protobuf==3.20.3
|
||||
psutil==5.9.6
|
||||
pyasn1==0.5.0
|
||||
pyasn1-modules==0.3.0
|
||||
pycparser==2.21
|
||||
pycryptodome==3.19.0
|
||||
Pympler==1.0.1
|
||||
python-dateutil==2.8.2
|
||||
pytz==2023.3.post1
|
||||
requests==2.31.0
|
||||
requests-oauthlib==1.3.1
|
||||
rsa==4.9
|
||||
scikit-learn==1.3.2
|
||||
scipy==1.11.3
|
||||
six==1.16.0
|
||||
tensorboard==2.14.1
|
||||
tensorboard-data-server==0.7.2
|
||||
tensorflow==2.14.0
|
||||
tensorflow-estimator==2.14.0
|
||||
tensorflow-io-gcs-filesystem==0.34.0
|
||||
termcolor==2.3.0
|
||||
threadpoolctl==3.2.0
|
||||
typing_extensions==4.8.0
|
||||
tzdata==2023.3
|
||||
urllib3==2.0.7
|
||||
watchdog==3.0.0
|
||||
Werkzeug==3.0.1
|
||||
wrapt==1.14.1
|
||||
0
aggregator-node/resources/.gitkeep
Normal file
0
aggregator-node/resources/.gitkeep
Normal file
BIN
aggregator-node/resources/best_model_no_tuner_40.h5
Normal file
BIN
aggregator-node/resources/best_model_no_tuner_40.h5
Normal file
Binary file not shown.
24028
aggregator-node/resources/data.csv
Normal file
24028
aggregator-node/resources/data.csv
Normal file
File diff suppressed because it is too large
Load Diff
2404
aggregator-node/resources/test.csv
Normal file
2404
aggregator-node/resources/test.csv
Normal file
File diff suppressed because it is too large
Load Diff
0
aggregator-node/src/.gitkeep
Normal file
0
aggregator-node/src/.gitkeep
Normal file
506
aggregator-node/src/server.py
Normal file
506
aggregator-node/src/server.py
Normal file
@@ -0,0 +1,506 @@
|
||||
import flwr as fl
|
||||
import tensorflow as tf
|
||||
from tensorflow import keras
|
||||
from typing import Dict, Optional, Tuple, List, Union
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
import sys
|
||||
import json
|
||||
from flwr.server.client_manager import SimpleClientManager
|
||||
from flwr.server.client_proxy import ClientProxy
|
||||
from abc import ABC
|
||||
from logging import INFO
|
||||
from flwr.common.logger import log
|
||||
from time import sleep
|
||||
from time import time_ns
|
||||
from flask import Flask, request
|
||||
import threading
|
||||
import os
|
||||
|
||||
|
||||
Scalar = Union[bool, bytes, float, int, str]
|
||||
Config = Dict[str, Scalar]
|
||||
param_file = None
|
||||
global best_model, list_kpi_11
|
||||
selected_clients_ids = [] # This is the list of client IDs the Agg.Node receives from the DMLO and will use for training.
|
||||
all_round_reports = {} # The dictionary containing all the round reports
|
||||
flwr_port = sys.argv[1]
|
||||
dmlo_port = sys.argv[2]
|
||||
# server_ip = ip
|
||||
l_kpi1, l_kpi2, l_kpi4, l_kpi5, list_kpi_11 = [], [], [], [], []
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
|
||||
@app.route("/config_server", methods=["POST"])
|
||||
def config_server():
|
||||
global param_file
|
||||
param_file = request.json
|
||||
param_received.set()
|
||||
# print("_____Received a config file", flush=True)
|
||||
try:
|
||||
global req_clients
|
||||
# highest_req_clients = max(req_clients, highest_req_clients)
|
||||
req_clients = request["training_clients_per_round"]
|
||||
# print(f"_____The new number of clients (req_clients) is: {req_clients} and the highest was had so far is {highest_req_clients}", flush=True)
|
||||
# if req_clients > highest_req_clients:
|
||||
# print(f"_____Rescaled the last dimension to {req_clients}", flush=True)
|
||||
# kpis = np.resize(kpis, (epochs+1, 12, req_clients))
|
||||
except:
|
||||
# print("_____Except path triggered", flush=True)
|
||||
pass
|
||||
return "Parameters received successfully.", 200
|
||||
|
||||
|
||||
@app.route(
|
||||
"/select_clients", methods=["GET"]
|
||||
) # The method that will receive the list of client IDs the server will use for training.
|
||||
def select_clients():
|
||||
global selected_clients_ids
|
||||
selected_clients_ids = request.json["eligible_clients_ids"]
|
||||
if len(selected_clients_ids) != req_clients:
|
||||
print(
|
||||
f"WARNING: {req_clients} clients are needed but only {len(selected_clients_ids)} client IDs are received. The training will wait for another list with enough eligible clients."
|
||||
)
|
||||
# A selection logic can be added here to modify the "selected_clients_id" variable. Do not forget to modify the next line (return) if this logic is added
|
||||
return request.json, 200
|
||||
|
||||
|
||||
@app.route("/check_connection", methods=["POST"])
|
||||
def check_connection():
|
||||
"""A function part of the older system to synchronize the processes.
|
||||
It does not hurt to keep for the final version to check server availability.
|
||||
"""
|
||||
return "Agg.Node is online", 200
|
||||
|
||||
|
||||
@app.route("/terminate_app", methods=["POST"])
|
||||
def terminate_app():
|
||||
try:
|
||||
save_kpis()
|
||||
except:
|
||||
print("No KPIs saved.")
|
||||
try:
|
||||
global best_model
|
||||
tf.keras.models.save_model(
|
||||
model=best_model,
|
||||
filepath="../resources/last_model.h5",
|
||||
overwrite=True,
|
||||
save_format="h5",
|
||||
)
|
||||
except:
|
||||
print("No model has been saved")
|
||||
print("Agg.Node shutting down...")
|
||||
end_thread = threading.Thread(target=__terminate__)
|
||||
end_thread.start()
|
||||
# myserver.disconnect_all_clients(timeout=None)
|
||||
return "Agg.Node successfully received shutdown command.", 200
|
||||
|
||||
|
||||
@app.route("/upload_kpi01", methods=["POST"])
|
||||
def upload_kpi01():
|
||||
"""for automatic averaging if needed again
|
||||
received01 += 1
|
||||
if received01 != 1:
|
||||
kpi01_value = (kpi01_value*((received01-1)/received01)) + (((request.json["kpi01"] - uc6_01_start)/1000000000)/received01)
|
||||
print(f"KPI01 average so far: {kpi01_value}")
|
||||
else: kpi01_value = (request.json["kpi01"] - uc6_01_start)/1000000000
|
||||
return "", 200
|
||||
"""
|
||||
l_kpi1.append((request.json["kpi01"] - uc6_01_start) / 1000000000)
|
||||
if (
|
||||
current_training_round != 1
|
||||
): # Skipping the measurement for the first round as it is inaccurate because of the starting process
|
||||
kpis[current_training_round, 1, len(l_kpi1) - 1] = (
|
||||
request.json["kpi01"] - uc6_01_start
|
||||
) / 1000000000
|
||||
return "", 200
|
||||
|
||||
|
||||
@app.route("/upload_kpi02", methods=["POST"])
|
||||
def upload_kpi02():
|
||||
tmp = (request.json["kpi02"] - (uc6_02_help_end - uc6_02_help_start)) / 1000000000
|
||||
l_kpi2.append(tmp)
|
||||
kpis[current_training_round, 2, len(l_kpi2) - 1] = tmp
|
||||
return "", 200
|
||||
|
||||
|
||||
@app.route("/upload_kpi04", methods=["POST"])
|
||||
def upload_kpi04():
|
||||
try:
|
||||
l_kpi4.append(request.json["kpi04"])
|
||||
kpis[current_training_round, 4, len(l_kpi4) - 1] = request.json["kpi04"]
|
||||
except:
|
||||
pass
|
||||
return "", 200
|
||||
|
||||
|
||||
@app.route("/upload_kpi05", methods=["POST"])
|
||||
def upload_kpi05():
|
||||
l_kpi5.append(request.json["kpi05"])
|
||||
kpis[current_training_round, 5, len(l_kpi5) - 1] = request.json["kpi05"]
|
||||
return "", 200
|
||||
|
||||
|
||||
@app.route("/get_status", methods=["GET"])
|
||||
def get_status():
|
||||
try:
|
||||
with open("Round_report.txt", "r") as file:
|
||||
report = file.read()
|
||||
return report
|
||||
except FileNotFoundError:
|
||||
return "No report available", 200
|
||||
except Exception as e:
|
||||
return f"An error occurred: {e}", 500
|
||||
|
||||
|
||||
def __terminate__():
|
||||
sleep(2)
|
||||
os._exit(0)
|
||||
|
||||
|
||||
def run_flask():
|
||||
app.run(host="0.0.0.0", port=dmlo_port)
|
||||
|
||||
|
||||
param_received = threading.Event()
|
||||
flask_thread = threading.Thread(target=run_flask)
|
||||
flask_thread.setDaemon(True)
|
||||
flask_thread.start()
|
||||
param_received.wait()
|
||||
|
||||
local_training = param_file["hyperparam_epochs"]
|
||||
epochs = param_file["num_epochs"]
|
||||
req_clients = param_file["training_clients_per_round"] # Number of clients to train
|
||||
# highest_req_clients = req_clients # the highest number of clinets a round has had so far (to resize the KPI matrix if needed)
|
||||
hyperparam_learning_rate = param_file["hyperparam_learning_rate"]
|
||||
hyperparam_batch_size = param_file["hyperparam_batch_size"]
|
||||
ml_model = param_file["ml_model"]
|
||||
kpis = np.empty((epochs + 1, 12, 8), dtype=object)
|
||||
print("Parameters loaded")
|
||||
q_alpha = 0.95
|
||||
n_features = 3
|
||||
n_future = 1
|
||||
n_past = 400
|
||||
|
||||
|
||||
def save_kpis():
|
||||
try:
|
||||
np.save("kpis.npy", kpis)
|
||||
except:
|
||||
print("No KPIs recorded so far.")
|
||||
|
||||
|
||||
def save_round_report(round_status):
|
||||
all_round_reports[f"Round {current_training_round}"] = round_status
|
||||
try:
|
||||
with open("Round_report.txt", "w") as file:
|
||||
json.dump(all_round_reports, file, indent=4)
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {e}")
|
||||
|
||||
|
||||
class QuantileMetric(tf.keras.metrics.Metric):
|
||||
def __init__(self, name="quantile_metric", **kwargs):
|
||||
super(QuantileMetric, self).__init__(name=name, **kwargs)
|
||||
self.quantile_metric = self.add_weight(
|
||||
name="quantile_metric", initializer="zeros"
|
||||
)
|
||||
self.quantile_metric_count = self.add_weight(
|
||||
name="quantile_metric_count", initializer="zeros"
|
||||
)
|
||||
|
||||
def update_state(self, y_true, y_pred, sample_weight=None):
|
||||
quantileCondition = tf.math.greater(y_true, tf.squeeze(y_pred))
|
||||
qc = tf.math.reduce_sum(tf.cast(quantileCondition, tf.float32))
|
||||
self.quantile_metric.assign_add(qc)
|
||||
self.quantile_metric_count.assign_add(
|
||||
tf.cast(tf.size(quantileCondition), tf.float32)
|
||||
)
|
||||
|
||||
def result(self):
|
||||
return self.quantile_metric / self.quantile_metric_count
|
||||
|
||||
def reset_state(self):
|
||||
self.quantile_metric.assign(0.0)
|
||||
self.quantile_metric_count.assign(0)
|
||||
|
||||
|
||||
def tilted_loss(y_true, y_pred):
|
||||
q = q_alpha
|
||||
e = y_true - y_pred
|
||||
tl = tf.stack([q * e, (q - 1) * e])
|
||||
e_max = tf.math.reduce_max(tl, axis=0, keepdims=True)
|
||||
return tf.reduce_mean(e_max)
|
||||
|
||||
|
||||
""" Choosing GPU
|
||||
gpu_id = 0 # Index of the GPU you want to use
|
||||
physical_devices = tf.config.list_physical_devices('GPU')
|
||||
print(physical_devices)
|
||||
tf.config.set_visible_devices(physical_devices[gpu_id], 'GPU')
|
||||
tf.config.experimental.set_memory_growth(physical_devices[gpu_id], True)
|
||||
"""
|
||||
|
||||
|
||||
def main() -> None:
|
||||
global best_model
|
||||
print("Inializing Model")
|
||||
best_model = tf.keras.models.load_model(ml_model, compile=False)
|
||||
|
||||
print("Model loaded")
|
||||
|
||||
opt = tf.keras.optimizers.Adam(learning_rate=hyperparam_learning_rate)
|
||||
best_model.compile(
|
||||
optimizer=opt,
|
||||
loss=[tilted_loss],
|
||||
metrics=[QuantileMetric(), keras.metrics.MeanAbsoluteError()],
|
||||
)
|
||||
|
||||
print("Model Compiled")
|
||||
|
||||
class CustomStrategy(fl.server.strategy.FedAdagrad):
|
||||
def aggregate_fit(self, rnd, results, failures):
|
||||
uc6_03_start = time_ns()
|
||||
aggregated_parameters = super().aggregate_fit(rnd, results, failures)
|
||||
uc6_03_end = time_ns()
|
||||
global kpi_uc6_03
|
||||
kpi_uc6_03 = (
|
||||
(uc6_03_end - uc6_03_start) / 1000000000
|
||||
) # Time required to aggregate all locally trained models sent by the OBUs in sec (Target <5s)
|
||||
kpis[current_training_round, 3, 0] = kpi_uc6_03
|
||||
|
||||
per_client_accuracy = []
|
||||
per_client_loss = []
|
||||
clients_order = [] # To map the accuracy and loss to a client ID (n'th ID to the n'th accuracy/loss)
|
||||
for result in results:
|
||||
client_info = result[1].metrics
|
||||
clients_order.append(client_info["id"])
|
||||
per_client_accuracy.append(client_info["accuracy"])
|
||||
per_client_loss.append(client_info["loss"])
|
||||
round_status = {
|
||||
"is_completed": "True",
|
||||
"current_accuracy": accuracy_perc,
|
||||
"current_loss": loss_perc,
|
||||
"lost_clients": len(failures),
|
||||
"clients_order": clients_order,
|
||||
"per_client_accuracy": per_client_accuracy,
|
||||
"per_client_loss": per_client_loss,
|
||||
}
|
||||
save_round_report(round_status)
|
||||
kpi_uc6_11 = round(
|
||||
100 - ((len(failures) / (len(results) + len(failures))) * 100), 1
|
||||
) # The % of successfully uploaded trained models for a certain round (Target >90%)
|
||||
kpis[current_training_round, 11, 0] = kpi_uc6_11
|
||||
list_kpi_11.append(kpi_uc6_11)
|
||||
kpi_uc6_10 = sum(list_kpi_11) / len(
|
||||
list_kpi_11
|
||||
) # The % of successfully uploaded trained models in total (Target >90%)
|
||||
kpis[current_training_round, 10, 0] = kpi_uc6_10
|
||||
|
||||
return aggregated_parameters
|
||||
|
||||
strategy = CustomStrategy(
|
||||
evaluate_fn=get_evaluate_fn(best_model),
|
||||
on_fit_config_fn=fit_config,
|
||||
initial_parameters=fl.common.ndarrays_to_parameters(best_model.get_weights()),
|
||||
)
|
||||
|
||||
class GetPropertiesIns:
|
||||
"""Properties request for a client."""
|
||||
|
||||
def __init__(self, config: Config):
|
||||
self.config = config
|
||||
|
||||
test: GetPropertiesIns = GetPropertiesIns(config={"server_round": 1})
|
||||
|
||||
class Criterion(ABC):
    """Abstract class which allows subclasses to implement criterion
    sampling."""

    def select(self, client: ClientProxy) -> bool:
        """Decide whether a client should be eligible for sampling or not.

        The client reports its own ID via ``get_properties``; it is eligible
        when that ID appears in the pre-computed ``selected_clients_ids``
        list. (An earlier variant performed the full selection logic on the
        server side, which required all clients to be connected first.)
        """
        reported_id = client.get_properties(ins=test, timeout=None).properties[
            "client_id"
        ]
        # NOTE(review): a str/int mismatch between reported_id and the entries
        # of selected_clients_ids makes this membership test always False —
        # both sides must use the same type.
        return reported_id in selected_clients_ids


c = Criterion()
|
||||
|
||||
class CustomClientManager(SimpleClientManager):
    """Client manager that samples only clients passing the selection criterion.

    Unlike ``SimpleClientManager.sample``, this blocks until enough clients
    are connected and then retries the criterion-based filtering for a while,
    because the OBUs may need extra time to start and connect to the
    aggregator node.
    """

    def sample(
        self,
        num_clients: int = 2,  # Number of clients currently connected to the server
        rq_clients: int = req_clients,  # Number of clients to train (added)
        min_num_clients: int = 3,
        min_wait: Optional[int] = req_clients,  # Clients to wait for before selecting (added)
        criterion: Optional[Criterion] = c,  # fixed: `[Criterion]` is not a valid annotation
    ) -> List[ClientProxy]:
        """Sample a number of Flower ClientProxy instances.

        Returns an empty list when, after all retries, fewer than
        ``rq_clients`` eligible clients are available.
        """
        # Block until at least `min_wait` clients are connected.
        if min_wait is None:
            min_wait = num_clients
        self.wait_for(min_wait)
        print(f"{min_wait} clients connected.")

        # Helper counter giving the OBUs more time to start and connect to
        # the aggregator node: retry the filtering up to 40 times, 5 s apart.
        connection_attempts = 40
        available_cids: List[str] = []
        while connection_attempts != 0:
            # Sample clients which meet the criterion.
            available_cids = list(self.clients)
            if criterion is not None:
                available_cids = [
                    cid
                    for cid in available_cids
                    if criterion.select(self.clients[cid])
                ]

            if rq_clients <= len(available_cids):
                break  # Enough eligible clients — stop retrying.

            log(
                INFO,
                "Sampling failed: number of available clients"
                " (%s) is less than number of requested clients (%s).",
                len(available_cids),
                rq_clients,
            )
            connection_attempts -= 1
            print(f"Retrying in 5 seconds. Attempts left: {connection_attempts}")
            sleep(5)

        if rq_clients > len(available_cids):
            # Gave up: not enough eligible clients after all retries.
            return []

        return [self.clients[cid] for cid in available_cids]
|
||||
|
||||
# Launch the Flower server (blocking call): runs `epochs` federated rounds
# using the custom strategy and the selection-aware client manager above,
# listening on all interfaces at the configured port.
fl.server.start_server(
    server_address=f"0.0.0.0:{flwr_port}",
    config=fl.server.ServerConfig(num_rounds=epochs),
    strategy=strategy,
    client_manager=CustomClientManager(),
)
|
||||
|
||||
|
||||
def get_evaluate_fn(best_model):
    """Return an evaluation function for server-side evaluation.

    The returned closure is called by Flower after every round: it loads the
    held-out test set, scales it, evaluates the aggregated global model,
    persists predictions/targets as .npy files, and updates the global KPI
    state (``loss_perc`` / ``accuracy_perc``).
    """

    # The `evaluate` function will be called after every round
    def evaluate(
        server_round: int,
        parameters: fl.common.NDArrays,
        config: Dict[str, fl.common.Scalar],
    ) -> Optional[Tuple[float, Dict[str, fl.common.Scalar]]]:
        global uc6_02_help_start
        # Time to be subtracted as processing time to know the model upload time
        uc6_02_help_start = time_ns()

        best_model.set_weights(parameters)  # Update model with the latest parameters

        # Fix: the previous version also read "../resources/data.csv" into a
        # variable (df_train) that was never used — a pointless disk read on
        # every evaluation round. It has been removed.
        test_df = pd.read_csv("../resources/test.csv")

        # Scale every column to [-1, 1]; keep each fitted scaler so predictions
        # and targets can be inverse-transformed to original units below.
        # NOTE(review): scalers are fitted on the test data itself — confirm
        # this matches how clients scale their training data.
        scalers = {}
        for col in test_df.columns:
            scaler = MinMaxScaler(feature_range=(-1, 1))
            scaled = scaler.fit_transform(test_df[col].values.reshape(-1, 1))
            test_df[col] = np.reshape(scaled, len(scaled))
            scalers["scaler_" + col] = scaler

        def split_series(series, n_past, n_future):
            """Split `series` into sliding (past, future) window pairs."""
            X, y = list(), list()
            # One window per datapoint: observations (past) and predictions (future)
            for window_start in range(len(series)):
                past_end = window_start + n_past
                future_end = past_end + n_future
                # Stop once a full future window no longer fits in the series.
                if future_end > len(series):
                    break
                X.append(series[window_start:past_end, :])
                y.append(series[past_end:future_end, :])
            return np.array(X), np.array(y)

        X_test, y_test = split_series(test_df.values, n_past, n_future)
        X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], n_features))
        y_test = y_test.reshape((y_test.shape[0], y_test.shape[1], n_features))

        print(X_test.shape)
        print(y_test.shape)

        # Only the feature at index 2 is the prediction target.
        y_test_sliced = y_test[:, :, 2]

        np.save("X_test_server.npy", X_test)
        np.save("y_test_server.npy", y_test_sliced)

        loss, metric, error = best_model.evaluate(X_test, y_test_sliced)
        pred = best_model.predict(X_test)
        # Replicate the single predicted feature across all columns so each
        # per-column scaler can be applied, then inverse-transform both the
        # predictions and the ground truth back to original units.
        pred_copies = np.repeat(pred, 3, axis=-1)
        pred_copies = np.expand_dims(pred_copies, axis=1)
        for index, col in enumerate(test_df.columns):
            scaler = scalers["scaler_" + col]
            pred_copies[:, :, index] = scaler.inverse_transform(
                pred_copies[:, :, index]
            )
            y_test[:, :, index] = scaler.inverse_transform(y_test[:, :, index])
        np.save("prediction_server.npy", pred_copies)
        np.save("test_server.npy", y_test)

        global loss_perc, accuracy_perc
        loss_perc = loss
        accuracy_perc = error

        save_kpis()
        return loss, {"accuracy": error}

    return evaluate
|
||||
|
||||
|
||||
def fit_config(server_round: int):
    """Return the training configuration dict sent to each client for a round.

    Note: despite the earlier (stale) docstring, neither batch size nor the
    number of local epochs is hard-coded here; they come from the module
    globals ``hyperparam_batch_size`` and ``local_training`` loaded from the
    server configuration. As a side effect this also resets the per-round
    KPI accumulators and records the round-timing markers.
    """
    global \
        current_training_round, \
        uc6_02_help_end, \
        uc6_01_start, \
        l_kpi1, \
        l_kpi2, \
        l_kpi4, \
        l_kpi5
    current_training_round = server_round
    # Fresh per-round KPI accumulators.
    l_kpi1, l_kpi2, l_kpi4, l_kpi5 = [], [], [], []
    # End of the aggregation/upload window for the previous round.
    uc6_02_help_end = time_ns()

    # Start of this training round's timer.
    uc6_01_start = time_ns()
    config = {
        "batch_size": hyperparam_batch_size,
        "local_epochs": local_training,
    }
    return config
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point — runs main(), defined elsewhere in this file.
    main()
|
||||
Reference in New Issue
Block a user