In [1]:
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
import itertools
from tqdm import tqdm

In [2]:
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

ZIPF_CONSTANT = 2

In [3]:
def eta_star(db_object_count, c_f, cache_sz, c_delta, lambda_vals):
    num = (db_object_count * c_f - cache_sz * c_delta)
    denom = np.sum(1.0/lambda_vals)
    if denom == 0:
        print("sum(1.0/lambda_vals) == 0")
        print(db_object_count, c_f, cache_sz, c_delta, lambda_vals)
    return max(0, num/denom)

In [4]:
def h_i_star(c_f, eta, lambda_vals, c_delta):
    optimized_hitrate = (c_f - (eta/lambda_vals)) / c_delta
    return optimized_hitrate

In [5]:
def get_index_of_furthest_hitrate_from_boundary(hitrates):
    lower_bound_violation =  hitrates[(hitrates < 0)]
    upper_bound_violation = hitrates[(hitrates > 1)]
    smallest_delta = np.abs(np.min(lower_bound_violation))
    biggest_delta = np.max(upper_bound_violation) - 1
    if smallest_delta > biggest_delta:
        print(smallest_delta)
        index = np.where(hitrates == np.min(local_hitrates))[0][0]
        return index
    else:
        
        index = np.where(hitrates == np.max(local_hitrates))[0][0]
        return index

In [6]:
def get_index_of_furthest_hitrate_from_boundary(hitrates):
    outside_bounds = (hitrates < 0) | (hitrates > 1)
    distances = np.where(outside_bounds, np.maximum(np.abs(hitrates - 0), np.abs(hitrates - 1)), -np.inf)
    index = np.argmax(distances)
    return index

In [7]:
def optimize_hitrates(db_object_count, cache_size, c_f, c_delta, lambda_vals):
    optimized_hitrates = np.zeros(db_object_count)
    current_db_object_count = db_object_count
    current_cache_size = cache_size
    
    differenc_set = np.arange(db_object_count)
    fix_i = []
    while True:
        if current_db_object_count == 0:
            if current_cache_size > 0:
                # print("Re-optimize objects with optimal hitrate of 0.")
                differenc_set = np.where(optimized_hitrates == 0)[0]
                fix_i = np.setdiff1d(np.arange(db_object_count), differenc_set).tolist()
                current_db_object_count = len(differenc_set)
                continue
            else:
                # print("Stop optimization.")
                optimized_hitrates[differenc_set] = 0
                break
        
        eta = eta_star(current_db_object_count, c_f, current_cache_size, c_delta, lambda_vals[differenc_set])
        optimized_hitrates[differenc_set] = h_i_star(c_f, eta, lambda_vals[differenc_set], c_delta)

        if eta < 0:
            # print("eta was negative.")
            current_cache_size = current_db_object_count * c_f / c_delta  # Adjust cache size for next iteration
            continue
        
        if len((optimized_hitrates[differenc_set])[((optimized_hitrates[differenc_set]) < 0) | ((optimized_hitrates[differenc_set])> 1)]) == 0:
            # print("All values optimized.")
            break
        
        max_outbound_index = get_index_of_furthest_hitrate_from_boundary(optimized_hitrates)
        fix_i.append(max_outbound_index)
        differenc_set = np.setdiff1d(np.arange(db_object_count), fix_i)
    
        old_hitrate = optimized_hitrates[max_outbound_index]
        optimized_hitrates[max_outbound_index] = (1 if optimized_hitrates[max_outbound_index] > 1 else 0)
        
        current_db_object_count -= 1
        current_cache_size -= optimized_hitrates[max_outbound_index]
    return optimized_hitrates

In [8]:
def objective_function(optimized_hitrates, c_f, c_delta, lambda_vals):
    return np.sum(lambda_vals*(1-optimized_hitrates)*c_f+0.5*np.power(optimized_hitrates,2)*c_delta)

In [9]:
# Perform grid search
def grid_search(db_object_counts, cache_sizes, c_f_values, c_delta_values):
    best_objective = float('inf')
    best_params = None

    # Iterate through all combinations of parameters
    for db_object_count, cache_size, c_f, c_delta in tqdm(itertools.product(db_object_counts, cache_sizes, c_f_values, c_delta_values), total=len(db_object_counts) * len(cache_sizes) * len(c_f_values) * len(c_delta_values), desc="Grid Search Progress"):
        if db_object_count < cache_size:
            continue
        lambda_vals = np.array([np.random.zipf(ZIPF_CONSTANT) for i in np.arange(1, db_object_count + 1,1)])
        # print(db_object_count, cache_size, c_f, c_delta)
        # Call the optimization function
        optimized_hitrates = optimize_hitrates(db_object_count, cache_size, c_f, c_delta, lambda_vals)

        # Compute the objective function
        objective = objective_function(optimized_hitrates, c_f, c_delta, lambda_vals)
        
        # Track the best (minimum) objective and corresponding parameters
        if objective < best_objective:
            best_objective = objective
            best_params = (db_object_count, cache_size, c_f, c_delta)

    return best_objective, best_params

In [10]:
%%time

# Define the grid search space
test_ratios = np.array([0.1, 0.2, 0.5, 0.7, 1, 1.5, 2, 5, 10])
db_object_count_values = np.round(np.array([10, 15, 30, 100, 200, 500]))
cache_size_values = np.unique(np.round(np.array([db_object_count_values * i for i in test_ratios]).flatten()))
c_f_values = np.array([0.1, 0.2, 0.5, 0.7, 1, 1.5, 2, 5, 10])
c_delta_values = np.unique(np.array([c_f_values * i for i in test_ratios]).flatten())

best_objective, best_params = grid_search(db_object_count_values, cache_size_values, c_f_values, c_delta_values)

Grid Search Progress: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64152/64152 [12:27<00:00, 85.87it/s]

CPU times: user 12min 16s, sys: 11.5 s, total: 12min 28s
Wall time: 12min 27s





In [11]:
best_objective, best_params 

(0.05000000000000002, (10, 10.0, 1.5, 0.010000000000000002))