{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ab5cd7d1-1a57-46fc-8282-dae0a6cc2944",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import random\n",
    "import pandas as pd\n",
    "import itertools\n",
    "from joblib import Parallel, delayed\n",
    "import os.path\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "3d1ad0b9-f6a8-4e98-84aa-6e02e4279954",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reproducibility: seed both numpy's and the stdlib's global RNGs.\n",
    "SEED = 42\n",
    "np.random.seed(SEED)\n",
    "random.seed(SEED)\n",
    "\n",
    "# Exponent of the Zipf distribution used to draw per-object request rates.\n",
    "ZIPF_CONSTANT = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5a27d416-8f98-4814-af9e-6c6bef95f4ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "def eta_star(db_object_count, c_f, cache_sz, c_delta, lambda_vals):\n",
    "    \"\"\"Multiplier for the cache-capacity constraint of the relaxed problem.\n",
    "\n",
    "    Note: the return value is clamped at 0 via max(0, ...), so callers never\n",
    "    observe a negative eta. A zero denominator (all rates infinite) is only\n",
    "    reported via print; with Zipf-drawn integer rates >= 1 it cannot occur.\n",
    "    \"\"\"\n",
    "    num = (db_object_count * c_f - cache_sz * c_delta)\n",
    "    denom = np.sum(1.0/lambda_vals)\n",
    "    if denom == 0:\n",
    "        print(\"sum(1.0/lambda_vals) == 0\")\n",
    "        print(db_object_count, c_f, cache_sz, c_delta, lambda_vals)\n",
    "    return max(0, num/denom)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6276a9ce-f839-4fe6-90f2-2195cf065fc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def h_i_star(c_f, eta, lambda_vals, c_delta):\n",
    "    \"\"\"Unclipped per-object optimal hit rate h_i* = (c_f - eta/lambda_i) / c_delta.\n",
    "\n",
    "    The result may lie outside [0, 1]; the caller clips infeasible values.\n",
    "    \"\"\"\n",
    "    optimized_hitrate = (c_f - (eta/lambda_vals)) / c_delta\n",
    "    return optimized_hitrate"
   ]
  },
  {
"cell_type": "code",
   "execution_count": 6,
   "id": "9d774304-ae68-43b3-a76a-e970c06c5236",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_index_of_furthest_hitrate_from_boundary(hitrates):\n",
    "    \"\"\"Index of the hit rate that violates [0, 1] the most.\n",
    "\n",
    "    In-bounds entries are assigned distance -inf, so only violators can win.\n",
    "    Assumes at least one entry lies outside [0, 1] (argmax of an all -inf\n",
    "    array would return 0).\n",
    "    \"\"\"\n",
    "    outside_bounds = (hitrates < 0) | (hitrates > 1)\n",
    "    distances = np.where(outside_bounds, np.maximum(np.abs(hitrates - 0), np.abs(hitrates - 1)), -np.inf)\n",
    "    index = np.argmax(distances)\n",
    "    return index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0e21c26f-058a-4e56-a5ad-1c47bf28656c",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "def optimize_hitrates(db_object_count, cache_size, c_f, c_delta, lambda_vals):\n",
    "    \"\"\"Optimize per-object cache hit rates under a cache-size budget.\n",
    "\n",
    "    Repeatedly solves the relaxed problem (eta_star / h_i_star) on the active\n",
    "    set, pins the single worst out-of-bounds hit rate to 0 or 1, removes it\n",
    "    from the active set, and re-solves until all active rates lie in [0, 1].\n",
    "    \"\"\"\n",
    "    optimized_hitrates = np.zeros(db_object_count)\n",
    "    current_db_object_count = db_object_count\n",
    "    current_cache_size = cache_size\n",
    "\n",
    "    difference_set = np.arange(db_object_count)  # indices still being optimized\n",
    "    fix_i = []  # indices whose hit rate has been pinned to 0 or 1\n",
    "    while True:\n",
    "        if current_db_object_count == 0:\n",
    "            if current_cache_size > 0:\n",
    "                # Budget left over: give the objects pinned at 0 another chance.\n",
    "                difference_set = np.where(optimized_hitrates == 0)[0]\n",
    "                fix_i = np.setdiff1d(np.arange(db_object_count), difference_set).tolist()\n",
    "                current_db_object_count = len(difference_set)\n",
    "                continue\n",
    "            else:\n",
    "                # No budget and no active objects left: stop.\n",
    "                optimized_hitrates[difference_set] = 0\n",
    "                break\n",
    "\n",
    "        eta = eta_star(current_db_object_count, c_f, current_cache_size, c_delta, lambda_vals[difference_set])\n",
    "        optimized_hitrates[difference_set] = h_i_star(c_f, eta, lambda_vals[difference_set], c_delta)\n",
    "\n",
    "        if eta < 0:\n",
    "            # NOTE(review): eta_star clamps its result at 0, so this branch\n",
    "            # appears unreachable; kept for parity with the original logic.\n",
    "            current_cache_size = current_db_object_count * c_f / c_delta  # Adjust cache size for next iteration\n",
    "            continue\n",
    "\n",
    "        active_hitrates = optimized_hitrates[difference_set]\n",
    "        if len(active_hitrates[(active_hitrates < 0) | (active_hitrates > 1)]) == 0:\n",
    "            # Every active hit rate is feasible -> done.\n",
    "            break\n",
    "\n",
    "        # Pin the worst violator to its nearest bound and drop it from the\n",
    "        # active set. Already-pinned entries are exactly 0 or 1, i.e. in\n",
    "        # bounds, so they can never be selected again.\n",
    "        max_outbound_index = get_index_of_furthest_hitrate_from_boundary(optimized_hitrates)\n",
    "        fix_i.append(max_outbound_index)\n",
    "        difference_set = np.setdiff1d(np.arange(db_object_count), fix_i)\n",
    "\n",
    "        optimized_hitrates[max_outbound_index] = (1 if optimized_hitrates[max_outbound_index] > 1 else 0)\n",
    "\n",
    "        current_db_object_count -= 1\n",
    "        current_cache_size -= optimized_hitrates[max_outbound_index]\n",
    "    return optimized_hitrates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b6bf3329-3a63-4807-ab8b-8a54f824f47e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def objective_function(optimized_hitrates, optimized_ttl, c_f, c_delta, lambda_vals):\n",
    "    \"\"\"Total cost: miss cost lambda_i*(1-h_i)*c_f plus refresh cost ttl_i*c_delta, summed.\"\"\"\n",
    "    return np.sum(lambda_vals*(1-optimized_hitrates)*c_f+optimized_ttl*c_delta)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d57309c0-2dd9-4b71-8e6d-a0af1600ea3e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def optimize_ttl(optimized_hitrates):\n",
    "    \"\"\"TTL paired with each hit rate h: ttl = h^2 / 2 (elementwise).\"\"\"\n",
    "    return 0.5*np.power(optimized_hitrates,2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bd4536e9-273b-4f49-b06c-2f00605e0f7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the task to be parallelized\n",
    "def grid_search_task(db_object_count, cache_size, c_f, c_delta, db_object_counts, cache_sizes, c_f_values, c_delta_values):\n",
    "    \"\"\"Evaluate one grid point; returns (objective, params) or None if skipped.\n",
    "\n",
    "    The trailing *_values arguments are unused but kept so the call site in\n",
    "    grid_search (and any other callers) remains compatible.\n",
    "    \"\"\"\n",
    "    if db_object_count < cache_size:\n",
    "        return None  # Skip this combination if db_object_count < cache_size\n",
    "\n",
    "    # Draw one Zipf-distributed request rate per object (same number of\n",
    "    # scalar draws as before, so the RNG stream is unchanged).\n",
    "    lambda_vals = np.array([np.random.zipf(ZIPF_CONSTANT) for _ in range(db_object_count)])\n",
    "\n",
    "    # Call the optimization function\n",
    "    optimized_hitrates = optimize_hitrates(db_object_count, cache_size, c_f, c_delta, lambda_vals)\n",
    "\n",
    "    optimized_ttl = optimize_ttl(optimized_hitrates)\n",
    "\n",
    "    # Compute the objective function\n",
    "    objective = objective_function(optimized_hitrates, optimized_ttl, c_f, c_delta, lambda_vals)\n",
    "\n",
    "    return (objective, db_object_count, cache_size, c_f, c_delta)\n",
    "\n",
    "# Perform grid search with parallelization\n",
    "def grid_search(db_object_counts, cache_sizes, c_f_values, c_delta_values):\n",
    "    \"\"\"Run grid_search_task over the cartesian product of the parameter grids.\n",
    "\n",
    "    Returns a DataFrame with one row per valid (non-skipped) combination.\n",
    "    \"\"\"\n",
    "    # Use Parallel from joblib to parallelize the grid search\n",
    "    task_results = Parallel(n_jobs=-1, verbose=1)(\n",
    "        delayed(grid_search_task)(db_object_count, cache_size, c_f, c_delta, db_object_counts, cache_sizes, c_f_values, c_delta_values)\n",
    "        for db_object_count, cache_size, c_f, c_delta in itertools.product(db_object_counts, cache_sizes, c_f_values, c_delta_values)\n",
    "    )\n",
    "\n",
    "    # Keep only the combinations that were not skipped\n",
    "    results = [result for result in task_results if result is not None]\n",
    "\n",
    "    # Convert the results into a pandas DataFrame\n",
    "    df = pd.DataFrame(results, columns=[\"Objective\", \"db_object_count\", \"cache_size\", \"c_f (Miss Cost)\", \"c_delta (Refresh Cost)\"])\n",
    "\n",
    "    return df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "a92c6772-6609-41a8-a3d1-4d640b69a864",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 23.2 ms, sys: 6.79 ms, total: 30 ms\n",
      "Wall time: 27.7 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Define the grid search space\n",
    "test_ratios = np.array([0.1, 0.2, 0.5, 0.7, 1, 1.5, 2, 5, 10])\n",
    "db_object_count_values = np.round(np.array([10, 15, 30, 100, 200, 500]))\n",
    "cache_size_values = np.unique(np.round(np.array([db_object_count_values * i for i in test_ratios]).flatten()))\n",
    "c_f_values = np.array([0.1, 0.2, 0.5, 0.7, 1, 1.5, 2, 5, 10])\n",
    "c_delta_values = np.unique(np.array([c_f_values * i for i in test_ratios]).flatten())\n",
    "\n",
    "# Cache the (expensive) grid search result so Restart-and-Run-All stays cheap.\n",
    "objective_result_file = \"./objective_grid-search_multi-core.csv\"\n",
    "\n",
    "results_df = None\n", "if not 
os.path.isfile(objective_result_file):\n", " # Call the grid search function\n", " results_df = grid_search(db_object_count_values, cache_size_values, c_f_values, c_delta_values)\n", " results_df.to_csv(objective_result_file,index=False)\n", "else:\n", " results_df = pd.read_csv(objective_result_file)" ] }, { "cell_type": "code", "execution_count": 11, "id": "45d7f86f-edee-4fc5-835f-1e311ab2e411", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | Objective | \n", "db_object_count | \n", "cache_size | \n", "c_f (Miss Cost) | \n", "c_delta (Refresh Cost) | \n", "
|---|---|---|---|---|---|
| 0 | \n", "0.620000 | \n", "10 | \n", "1.0 | \n", "0.1 | \n", "0.01 | \n", "
| 1 | \n", "0.640000 | \n", "10 | \n", "1.0 | \n", "0.1 | \n", "0.02 | \n", "
| 2 | \n", "1.820000 | \n", "10 | \n", "1.0 | \n", "0.1 | \n", "0.04 | \n", "
| 3 | \n", "1.125000 | \n", "10 | \n", "1.0 | \n", "0.1 | \n", "0.05 | \n", "
| 4 | \n", "1.635000 | \n", "10 | \n", "1.0 | \n", "0.1 | \n", "0.07 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 32719 | \n", "7563.333333 | \n", "500 | \n", "500.0 | \n", "10.0 | \n", "15.00 | \n", "
| 32720 | \n", "12545.000000 | \n", "500 | \n", "500.0 | \n", "10.0 | \n", "20.00 | \n", "
| 32721 | \n", "45496.000000 | \n", "500 | \n", "500.0 | \n", "10.0 | \n", "25.00 | \n", "
| 32722 | \n", "28060.000000 | \n", "500 | \n", "500.0 | \n", "10.0 | \n", "50.00 | \n", "
| 32723 | \n", "28987.000000 | \n", "500 | \n", "500.0 | \n", "10.0 | \n", "100.00 | \n", "
32724 rows × 5 columns
\n", "