Author: Nele Albers
Date: January 2025
This file reproduces our analysis for RQ3 on the effect of different ethical allocation principles on the human feedback received by smoker subgroups, and thus also reproduces Figure 4.
Required files: Data/ethical_principle_relative_weights_with_prognosis, Data/ethical_principle_weights, Data/data_rl_samples_abstracted[0, 1, 2][3, 2, 2].csv, Data/all_abstract_states_with_session.csv, and the previously computed reward function, transition function, and Q-values in Intermediate_Results/.
Created files: Figures/full_stacked_cost.pdf.
Authored by Nele Albers, Francisco S. Melo, Mark A. Neerincx, Olya Kudina, and Willem-Paul Brinkman.
Let's load the packages we need.
from copy import deepcopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import random
import seaborn as sns
import optimal_policy_computations as pol_comp
And we define some variables that we use throughout.
FEAT_SEL = [0, 1, 2] # Selected base state features
NUM_VALS_PER_FEATURE = [3, 2, 2]
DISCOUNT_FACTOR = 0.85
NUM_ACTIONS = 2
PATH = "Intermediate_Results/" # pre-fix for path for storing results
path_to_save = str(str(FEAT_SEL[0]) + str(FEAT_SEL[1]) + str(FEAT_SEL[2]) + "_" + str(DISCOUNT_FACTOR) + "_" + str(NUM_VALS_PER_FEATURE))
# Human feedback cost for base policy
cost_base = 0.07
# For policies with auxiliary rewards
NUM_VALS_FEAT_PRIORITY = 3
FEAT_INDEX_PRIORITY = 3
NUM_VALS_FEAT_WAITING_TIME = 3
FEAT_INDEX_WAITING_TIME = 4
NUM_SESSIONS_PER_WAITING_TIME = 3
FEAT_INDEX_WANTING_HUMAN_SUPPORT = 2
NUM_VALS_WANTING_HUMAN_SUPPORT = 2
base_states = [[i, j, k] for i in range(NUM_VALS_PER_FEATURE[0]) for j in range(NUM_VALS_PER_FEATURE[1]) for k in range(NUM_VALS_PER_FEATURE[2])]
num_base_states = len(base_states)
states_aux = [[i, j, k, l, m] for i in range(NUM_VALS_PER_FEATURE[0]) for j in range(NUM_VALS_PER_FEATURE[1]) for k in range(NUM_VALS_PER_FEATURE[2]) for l in range(NUM_VALS_FEAT_PRIORITY) for m in range(NUM_VALS_FEAT_WAITING_TIME)]
num_states_aux = len(states_aux)
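As a quick sanity check (an addition to the original notebook), we can verify the sizes of the two state spaces and see how a full state maps to the index used by the policies and transition functions below; the example state is arbitrary.
# Sanity check (illustrative): 3 * 2 * 2 = 12 base states, 12 * 3 * 3 = 108 full states
print("Number of base states:", num_base_states)
print("Number of full states:", num_states_aux)
# A full state is [base features, priority, waiting time]; its index is its position in states_aux
example_state = [1, 0, 1, 2, 0]  # arbitrary example state
print("Index of", example_state, ":", states_aux.index(example_state))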
And we also define some simulation-related variables.
capacity = 166 # number of people we can have in the application at the same time
max_add_at_once = 100 # max. number of people we add at once
max_timesteps = 365 # max. number of timesteps we want to follow the application
num_starting_people = max_add_at_once # number of people to start with
max_sessions_per_person = NUM_VALS_FEAT_WAITING_TIME * NUM_SESSIONS_PER_WAITING_TIME
cutoff_for_avg = 40 # number of time steps to cut off at the start and end of the simulation when computing averages
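For reference (this derivation is implicit in the variables above), the maximum number of sessions per person follows from the waiting-time discretization:
# 3 waiting-time values * 3 sessions per value = 9 sessions at most
print("Max. sessions per person:", max_sessions_per_person)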
And we load 1) the weights to use for our auxiliary rewards compared to the base reward, and 2) the weights to use for the policy that uses all auxiliary rewards together.
with open("Data/ethical_principle_relative_weights_with_prognosis", 'rb') as f:
aux_relative_weights = pickle.load(f)
# Load ethical principle weights from users
with open("Data/ethical_principle_weights", 'rb') as f:
ethical_principle_weights = pickle.load(f)
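To see which ethical principles the loaded weights cover, we can inspect the keys (a small addition, assuming both loaded objects are dictionaries keyed by principle name, which is how they are used further below):
# Illustrative inspection of the loaded weights (assumed to be dicts keyed by principle name)
print("Relative weight keys:", list(aux_relative_weights.keys()))
print("User-based weight keys:", list(ethical_principle_weights.keys()))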
Let's load the dataframe with the transition samples.
data = pd.read_csv("Data/data_rl_samples_abstracted" + str(FEAT_SEL) + str(NUM_VALS_PER_FEATURE) + ".csv",
converters={'s0': eval, 's1': eval})
data_train = data.copy(deep=True)
And we load the data on all states (e.g., also states from session 1 from people with no data from session 2).
df_all_states = pd.read_csv("Data/all_abstract_states_with_session.csv",
converters={'state': eval})
And we load the previously computed reward and transition functions and Q-values.
with open(PATH + "_reward_func_" + path_to_save, "rb") as f:
reward_func = pickle.load(f)
with open(PATH + "_trans_func_" + path_to_save, "rb") as f:
trans_func = pickle.load(f)
with open(PATH + "_qvals_" + path_to_save, "rb") as f:
q_vals = pickle.load(f)
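A brief shape check can help confirm what was loaded (a hedged sketch, assuming the reward function is indexed by state and action and the transition function by state, action, and next state):
# Illustrative shape check of the loaded objects
print("Reward function shape:", np.shape(reward_func))
print("Transition function shape:", np.shape(trans_func))
print("Q-values shape:", np.shape(q_vals))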
Let's compute the fraction of people in each base state observed in the first session of our study. This is used to inform people's starting states in our simulation.
all_states_s1 = df_all_states[df_all_states['session'] == 1]
all_states_s1 = all_states_s1.reset_index(drop=True)
all_states_count = np.zeros(num_base_states)
for p in range(len(all_states_s1)):
state = list(np.take(all_states_s1.iloc[p]["state"], FEAT_SEL))
state_idx = base_states.index(state)
all_states_count[state_idx] += 1
all_base_states_frac = all_states_count/sum(all_states_count)
print("Fraction of people in each state in session 1:", np.round(all_base_states_frac, 2))
Fraction of people in each state in session 1: [0.34 0.17 0.06 0.05 0.07 0.05 0.02 0.04 0.03 0.07 0.02 0.07]
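As a small check (not part of the original analysis), these fractions should form a probability distribution over the 12 base states:
# The starting-state fractions should sum to one
print("Sum of fractions:", np.round(np.sum(all_base_states_frac), 4))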
To define a dropout rate for our simulation, we look at the return likelihood rating from our longitudinal research study. We assume that people with a negative return likelihood would have dropped out.
total = 0
for session in range(1, 5):
data_s = data[data["session"] == session]
data_return_negative_s = data_s[data_s["dropout_response"] < 0]
fraction = len(data_return_negative_s)/len(data_s)
total += fraction
print("Potential dropout after session", session, ":", len(data_return_negative_s), "out of", len(data_s), "->", round(fraction * 100, 2), "%")
dropout = total/4
print("\nAverage:", round(dropout * 100, 2), "%")
Potential dropout after session 1 : 112 out of 679 -> 16.49 %
Potential dropout after session 2 : 94 out of 599 -> 15.69 %
Potential dropout after session 3 : 90 out of 544 -> 16.54 %
Potential dropout after session 4 : 66 out of 504 -> 13.1 %
Average: 15.46 %
Here we define the function we use to simulate people in our live application setting.
def simulate(num_timesteps,
dropout,
capacity,
max_add_at_once,
max_timesteps,
num_starting_people,
opt_policy,
num_sessions_per_firstcome,
all_base_states_frac,
trans_func,
reward_func,
all_base_states,
all_states,
num_vals_feat_priority,
num_vals_feat_firstcome,
random_prob = 0):
"""
num_timesteps: max number of sessions per person
dropout: ratio of people dropping out randomly each session
capacity: max number of people that can be in the application at a time
max_add_at_once: max. number of new people we add at each time step to fill available spots
max_timesteps: max. number of timesteps we want to track the application for
num_starting_people: number of people starting in first session
opt_policy: policy to use
num_sessions_per_firstcome: number of sessions per waiting time feature value
all_base_states_frac: fraction of people in each base state at start
trans_func: transition function
reward_func: reward function
all_base_states: all possible base states
all_states: all possible states using all state features
num_vals_feat_priority: number of values for the priority state feature
num_vals_feat_firstcome: number of values for the waiting time state feature
random_prob: probability of choosing a random action instead of the policy's action
"""
random.seed(1)
# Create a starting population of participants
pop = random.choices(all_base_states, k = num_starting_people, weights = all_base_states_frac)
# Choose a random priority and set the waiting time feature to 0
for p in range(len(pop)):
pop[p] = deepcopy(pop[p]) + random.choices(np.arange(num_vals_feat_priority)) + [0]
pop_all = [pop]
sessions_curr = [0] * num_starting_people # keeps track of the current session of each person
firstcomes_curr = [0] * num_starting_people # keeps track of each person's waiting time since the last human support
action_sum = [] # number of human feedback actions per time step
reward_sum = [] # total reward per time step
num_people_prev = num_starting_people
t = 0 # time step
# while there are people left
while len(sessions_curr) > 0 and t < max_timesteps:
pop_new = []
action_sum.append(0)
reward_sum.append(0)
# add new people if there is capacity left and they can still complete all sessions before the simulation ends
if len(sessions_curr) < capacity and t < (max_timesteps - num_timesteps + 1):
num_to_add = min(capacity - len(sessions_curr), max_add_at_once)
pop_added = random.choices(all_base_states, k = num_to_add, weights = all_base_states_frac)
# Choose random priority and set waiting time state feature to 0
for p in range(len(pop_added)):
pop_added[p] = deepcopy(pop_added[p]) + random.choices(np.arange(num_vals_feat_priority)) + [0]
pop_all[t] += pop_added
# also keep track of current session and of waiting times for new people
sessions_curr = sessions_curr + [0 for i in range(num_to_add)]
firstcomes_curr = firstcomes_curr + [0 for i in range(num_to_add)]
num_people_prev += num_to_add
# get action and next state and reward for each person
for p in range(num_people_prev):
# Take random action with a certain probability
r = random.uniform(0, 1)
if r < random_prob:
action = random.choice([0, 1])
else:
action = opt_policy[all_states.index(pop_all[t][p])]
action_sum[t] += action
# get next state
state_new = random.choices(all_states, k = 1, weights = trans_func[all_states.index(pop_all[t][p])][action])
pop_new.append(deepcopy(state_new[0]))
reward_sum[t] += reward_func[all_states.index(pop_all[t][p])][action]
# set waiting time to 0 when human is involved
if action == 1:
firstcomes_curr[p] = 0
# increase waiting time when no human is involved
elif action == 0:
# move to the next waiting phase when the user is in the last session of a non-final waiting time value;
# this is needed because the transition function only models the probability of
# transitioning to the next waiting time value
if firstcomes_curr[p] in [i * num_sessions_per_firstcome - 1 for i in range(1, num_vals_feat_firstcome - 1)]:
pop_new[p] = deepcopy(pop_new[p])
pop_new[p][4] = min(pop_all[t][p][4] + 1, num_vals_feat_firstcome - 1)
firstcomes_curr[p] += 1
# increase number of sessions completed per person
sessions_curr = [i + 1 for i in sessions_curr]
# remove people who have had all sessions
pop_new = [pop_new[p_idx] for p_idx in range(len(pop_new)) if sessions_curr[p_idx] < num_timesteps]
firstcomes_curr = [firstcomes_curr[p_idx] for p_idx in range(len(firstcomes_curr)) if sessions_curr[p_idx] < num_timesteps]
sessions_curr = [p for p in sessions_curr if p < num_timesteps]
num_people_prev = len(sessions_curr)
# some people drop out after the session, i.e., they do not arrive at the next session
# number of people to drop out
num_people_dropout = round(num_people_prev * dropout)
# randomly choose which people drop out
dropout_ids = random.sample(range(0, len(pop_new)), num_people_dropout)
# remove indices of those people
pop_new = [pop_new[p_idx] for p_idx in range(len(pop_new)) if not p_idx in dropout_ids]
sessions_curr = [sessions_curr[p_idx] for p_idx in range(len(sessions_curr)) if not p_idx in dropout_ids]
firstcomes_curr = [firstcomes_curr[p_idx] for p_idx in range(len(firstcomes_curr)) if not p_idx in dropout_ids]
pop_all.append(pop_new)
num_people_prev = num_people_prev - num_people_dropout
t += 1
return action_sum, reward_sum, pop_all
We tune the human feedback cost for each of our policies such that they give a similar average amount of human feedback in our simulated live application.
First, we compute the average simulated amount of human feedback (referred to as the "mean cost" in the output below) for our base optimal policy.
For this we need the reward function for the full state space that includes the additional state features introduced for the auxiliary rewards.
reward_func_base_for_aux = pol_comp.compute_reward_func_base_to_aux_statespace(reward_func,
base_states,
len(FEAT_SEL),
states_aux,
NUM_ACTIONS,
cost_base)
And similarly, we need the transition function.
trans_func_aux = pol_comp.get_transitions_with_aux(trans_func,
base_states,
states_aux,
NUM_ACTIONS,
NUM_VALS_FEAT_PRIORITY,
NUM_VALS_FEAT_WAITING_TIME,
NUM_SESSIONS_PER_WAITING_TIME)
And we use the reward and transition functions to compute the optimal policy.
optimal_policy_base, trans_func_for_q_val_comp, V = pol_comp.get_optimal_policy_full(NUM_ACTIONS,
reward_func_base_for_aux,
trans_func_aux,
DISCOUNT_FACTOR)
print("Optimal policy:")
for state_idx, state in enumerate(states_aux):
print("State " + str(state) + ":", optimal_policy_base[state_idx])
q_vals_base = np.zeros((num_states_aux, NUM_ACTIONS))
for s_idx in range(num_states_aux):
for a in range(NUM_ACTIONS):
q_vals_base[s_idx, a] = reward_func_base_for_aux[s_idx, a] + DISCOUNT_FACTOR * sum(trans_func_for_q_val_comp[a][s_idx, :] * V)
Iteration V-variation 1 1.0584204928695233 2 0.4115871613904838 3 0.22750998959227742 4 0.125420585239173 5 0.11635497484240664 6 0.09129846556244603 7 0.06818863242884632 8 0.05049472661705634 9 0.039862937753591066 10 0.031141407569016177 11 0.027038624764119668 12 0.023391825425900237 13 0.020388233455100657 14 0.017653956525828907 15 0.013983656533793187 16 0.010935333453801377 17 0.008528058591252385 18 0.006646221696488208 19 0.005186218484121152 20 0.004057536891139968 21 0.0031853526781997488 22 0.0025102220268249376 23 0.0019860323519531597 24 0.0015774777421477815 25 0.0012576977494811459 26 0.0010063008288008213 27 0.0008097346797160121 28 0.0006542224671979291 29 0.0005297636978047748 30 0.0004298514317975233 31 0.00034941409144551017 32 0.00028480947113651567 33 0.00023244288370749544 34 0.00018991419526326325 35 0.00015531623004916284 Iterating stopped, epsilon-optimal policy found. Optimal policy: State [0, 0, 0, 0, 0]: 0 State [0, 0, 0, 0, 1]: 0 State [0, 0, 0, 0, 2]: 0 State [0, 0, 0, 1, 0]: 0 State [0, 0, 0, 1, 1]: 0 State [0, 0, 0, 1, 2]: 0 State [0, 0, 0, 2, 0]: 0 State [0, 0, 0, 2, 1]: 0 State [0, 0, 0, 2, 2]: 0 State [0, 0, 1, 0, 0]: 0 State [0, 0, 1, 0, 1]: 0 State [0, 0, 1, 0, 2]: 0 State [0, 0, 1, 1, 0]: 0 State [0, 0, 1, 1, 1]: 0 State [0, 0, 1, 1, 2]: 0 State [0, 0, 1, 2, 0]: 0 State [0, 0, 1, 2, 1]: 0 State [0, 0, 1, 2, 2]: 0 State [0, 1, 0, 0, 0]: 1 State [0, 1, 0, 0, 1]: 1 State [0, 1, 0, 0, 2]: 1 State [0, 1, 0, 1, 0]: 1 State [0, 1, 0, 1, 1]: 1 State [0, 1, 0, 1, 2]: 1 State [0, 1, 0, 2, 0]: 1 State [0, 1, 0, 2, 1]: 1 State [0, 1, 0, 2, 2]: 1 State [0, 1, 1, 0, 0]: 1 State [0, 1, 1, 0, 1]: 1 State [0, 1, 1, 0, 2]: 1 State [0, 1, 1, 1, 0]: 1 State [0, 1, 1, 1, 1]: 1 State [0, 1, 1, 1, 2]: 1 State [0, 1, 1, 2, 0]: 1 State [0, 1, 1, 2, 1]: 1 State [0, 1, 1, 2, 2]: 1 State [1, 0, 0, 0, 0]: 0 State [1, 0, 0, 0, 1]: 0 State [1, 0, 0, 0, 2]: 0 State [1, 0, 0, 1, 0]: 0 State [1, 0, 0, 1, 1]: 0 State [1, 0, 0, 1, 2]: 0 State [1, 0, 0, 2, 0]: 0 State [1, 0, 0, 2, 1]: 0 State [1, 0, 0, 2, 2]: 0 State [1, 0, 1, 0, 0]: 0 State [1, 0, 1, 0, 1]: 0 State [1, 0, 1, 0, 2]: 0 State [1, 0, 1, 1, 0]: 0 State [1, 0, 1, 1, 1]: 0 State [1, 0, 1, 1, 2]: 0 State [1, 0, 1, 2, 0]: 0 State [1, 0, 1, 2, 1]: 0 State [1, 0, 1, 2, 2]: 0 State [1, 1, 0, 0, 0]: 1 State [1, 1, 0, 0, 1]: 1 State [1, 1, 0, 0, 2]: 1 State [1, 1, 0, 1, 0]: 1 State [1, 1, 0, 1, 1]: 1 State [1, 1, 0, 1, 2]: 1 State [1, 1, 0, 2, 0]: 1 State [1, 1, 0, 2, 1]: 1 State [1, 1, 0, 2, 2]: 1 State [1, 1, 1, 0, 0]: 1 State [1, 1, 1, 0, 1]: 1 State [1, 1, 1, 0, 2]: 1 State [1, 1, 1, 1, 0]: 1 State [1, 1, 1, 1, 1]: 1 State [1, 1, 1, 1, 2]: 1 State [1, 1, 1, 2, 0]: 1 State [1, 1, 1, 2, 1]: 1 State [1, 1, 1, 2, 2]: 1 State [2, 0, 0, 0, 0]: 0 State [2, 0, 0, 0, 1]: 0 State [2, 0, 0, 0, 2]: 0 State [2, 0, 0, 1, 0]: 0 State [2, 0, 0, 1, 1]: 0 State [2, 0, 0, 1, 2]: 0 State [2, 0, 0, 2, 0]: 0 State [2, 0, 0, 2, 1]: 0 State [2, 0, 0, 2, 2]: 0 State [2, 0, 1, 0, 0]: 0 State [2, 0, 1, 0, 1]: 0 State [2, 0, 1, 0, 2]: 0 State [2, 0, 1, 1, 0]: 0 State [2, 0, 1, 1, 1]: 0 State [2, 0, 1, 1, 2]: 0 State [2, 0, 1, 2, 0]: 0 State [2, 0, 1, 2, 1]: 0 State [2, 0, 1, 2, 2]: 0 State [2, 1, 0, 0, 0]: 0 State [2, 1, 0, 0, 1]: 0 State [2, 1, 0, 0, 2]: 0 State [2, 1, 0, 1, 0]: 0 State [2, 1, 0, 1, 1]: 0 State [2, 1, 0, 1, 2]: 0 State [2, 1, 0, 2, 0]: 0 State [2, 1, 0, 2, 1]: 0 State [2, 1, 0, 2, 2]: 0 State [2, 1, 1, 0, 0]: 1 State [2, 1, 1, 0, 1]: 1 State [2, 1, 1, 0, 2]: 1 State [2, 1, 1, 1, 0]: 1 State [2, 1, 1, 1, 1]: 1 State [2, 1, 1, 1, 2]: 1 
State [2, 1, 1, 2, 0]: 1 State [2, 1, 1, 2, 1]: 1 State [2, 1, 1, 2, 2]: 1
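As a consistency check (an addition to the original notebook), acting greedily with respect to these recomputed Q-values should reproduce the optimal policy, except possibly in states where both actions have exactly the same value:
# Greedy policy w.r.t. the recomputed Q-values; differences can only arise at exact ties
greedy_policy = np.argmax(q_vals_base, axis=1)
num_differences = int(np.sum(greedy_policy != np.array(optimal_policy_base)))
print("States where greedy(Q) differs from the optimal policy:", num_differences)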
And now we compute the average amount of human feedback in our simulated live application.
action_sum, _, _ = simulate(max_sessions_per_person,
dropout,
capacity,
max_add_at_once,
max_timesteps,
num_starting_people,
optimal_policy_base,
NUM_SESSIONS_PER_WAITING_TIME,
all_base_states_frac,
trans_func_aux,
reward_func_base_for_aux,
base_states,
states_aux,
NUM_VALS_FEAT_PRIORITY,
NUM_VALS_FEAT_WAITING_TIME)
base_mean_cost = np.mean(action_sum[cutoff_for_avg:-cutoff_for_avg])
print("Base mean amount of human feedback:", base_mean_cost)
Base mean amount of human feedback: 58.19298245614035
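The same cost-tuning procedure is repeated below for each principle: the smoker-preferred weighting of all principles, first-come first-served (waiting time), priority, sickest-first (worst state), and autonomy. In each case we start at a cost of 0.2, move the cost up or down depending on whether the simulated amount of human feedback lies above or below the base policy's mean, and halve the step size whenever we cross that mean. The loops are kept written out below for transparency; the following is only a hedged sketch of how they could be factored into a single helper, relying on the variables defined earlier in this notebook and on a hypothetical callback compute_reward_for_cost that builds the principle-specific reward function for a given cost.
def tune_cost(compute_reward_for_cost, base_mean_cost, start_cost=0.2,
              start_increment=0.05, max_iterations=30):
    """Illustrative sketch of the cost-tuning loops below (not used by the notebook).

    compute_reward_for_cost: hypothetical callback mapping a cost value to a reward
    function over the full (auxiliary) state space.
    """
    cost = start_cost
    increment = start_increment
    above = []  # whether the simulated mean lies above the base policy's mean
    settings, mean_costs, policies, reward_funcs = [], [], [], []
    for it in range(max_iterations - 1):
        reward_func_aux_it = compute_reward_for_cost(cost)
        policy, _, _ = pol_comp.get_optimal_policy_full(NUM_ACTIONS,
                                                        reward_func_aux_it,
                                                        trans_func_aux,
                                                        DISCOUNT_FACTOR,
                                                        verbose=False)
        action_sum_it, _, _ = simulate(max_sessions_per_person, dropout, capacity,
                                       max_add_at_once, max_timesteps,
                                       num_starting_people, policy,
                                       NUM_SESSIONS_PER_WAITING_TIME,
                                       all_base_states_frac, trans_func_aux,
                                       reward_func_base_for_aux, base_states,
                                       states_aux, NUM_VALS_FEAT_PRIORITY,
                                       NUM_VALS_FEAT_WAITING_TIME)
        mean_cost = np.mean(action_sum_it[cutoff_for_avg:-cutoff_for_avg])
        settings.append(cost)
        mean_costs.append(mean_cost)
        policies.append(policy)
        reward_funcs.append(reward_func_aux_it)
        above.append(1 if mean_cost > base_mean_cost else 0)
        # Move the cost towards the base policy's mean amount of feedback
        cost += increment if mean_cost > base_mean_cost else -increment
        # Halve the step size once we have crossed the base policy's mean
        if it > 0 and above[-1] != above[-2]:
            increment /= 2
    best_idx = int(np.argmin([abs(m - base_mean_cost) for m in mean_costs]))
    return best_idx, settings, mean_costs, policies, reward_funcs
For example, the first tuning loop below could then be expressed as a single call of the form tune_cost(lambda c: pol_comp.compute_reward_func_user_weighted(..., cost=c), base_mean_cost).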
max_iterations = 30
cost_aux = 0.2
it = 0
increment = 0.05
above = [] # keep track of whether the mean cost is below or above the one for the base policy
all_cost_settings = []
all_mean_cost_results_userweighted = []
optimal_policies_aux_userweighted = []
reward_funcs_userweighted = []
while it < max_iterations - 1:
reward_func_userweighted = pol_comp.compute_reward_func_user_weighted(reward_func,
base_states,
len(FEAT_SEL),
states_aux,
NUM_ACTIONS,
FEAT_INDEX_PRIORITY,
NUM_VALS_FEAT_PRIORITY,
FEAT_INDEX_WANTING_HUMAN_SUPPORT,
NUM_VALS_WANTING_HUMAN_SUPPORT,
FEAT_INDEX_WAITING_TIME,
NUM_VALS_FEAT_WAITING_TIME,
ethical_principle_weights["prognosis"],
aux_weight_priority = ethical_principle_weights["priority"],
aux_weight_request = ethical_principle_weights["autonomy"],
aux_weight_waitingtime = ethical_principle_weights["equal"],
aux_weight_worststate = ethical_principle_weights["sickest-first"],
cost = cost_aux)
reward_funcs_userweighted.append(reward_func_userweighted)
optimal_policy_aux, _, _ = pol_comp.get_optimal_policy_full(NUM_ACTIONS,
reward_func_userweighted,
trans_func_aux,
DISCOUNT_FACTOR,
verbose=False)
optimal_policies_aux_userweighted.append(optimal_policy_aux)
action_sum, _, _ = simulate(max_sessions_per_person,
dropout,
capacity,
max_add_at_once,
max_timesteps,
num_starting_people,
optimal_policy_aux,
NUM_SESSIONS_PER_WAITING_TIME,
all_base_states_frac,
trans_func_aux,
reward_func_base_for_aux,
base_states,
states_aux,
NUM_VALS_FEAT_PRIORITY,
NUM_VALS_FEAT_WAITING_TIME,
random_prob = 0)
aux_mean_cost = np.mean(action_sum[cutoff_for_avg:-cutoff_for_avg])
print("Cost setting:", cost_aux, "Mean cost:", np.round(aux_mean_cost, 2))
all_cost_settings.append(cost_aux)
all_mean_cost_results_userweighted.append(aux_mean_cost)
if aux_mean_cost > base_mean_cost:
cost_aux += increment
above.append(1)
else:
cost_aux -= increment
above.append(0)
# Reduce the increment if we have switched to the other side of the base mean cost
if it > 0:
if above[-1] != above[-2]:
increment = increment/2
it += 1
mean_differences = [abs(all_mean_cost_results_userweighted[i] - base_mean_cost) for i in range(len(all_mean_cost_results_userweighted))]
best_cost_setting_idx_userweighted = np.argmin(mean_differences)
print("Best cost setting user weights:",
all_cost_settings[best_cost_setting_idx_userweighted],
"(mean cost:",
np.round(all_mean_cost_results_userweighted[best_cost_setting_idx_userweighted], 2), ")")
Cost setting: 0.2 Mean cost: 72.25 Cost setting: 0.25 Mean cost: 60.31 Cost setting: 0.3 Mean cost: 44.26 Cost setting: 0.25 Mean cost: 60.31 Cost setting: 0.275 Mean cost: 57.1 Cost setting: 0.2625 Mean cost: 59.72 Cost setting: 0.26875 Mean cost: 58.54 Cost setting: 0.271875 Mean cost: 57.47 Cost setting: 0.26875 Mean cost: 58.54 Cost setting: 0.2703125 Mean cost: 58.54 Cost setting: 0.27109375 Mean cost: 58.54 Cost setting: 0.27187500000000003 Mean cost: 57.47 Cost setting: 0.27109375 Mean cost: 58.54 Cost setting: 0.271484375 Mean cost: 57.86 Cost setting: 0.2712890625 Mean cost: 58.54 Cost setting: 0.27138671875000003 Mean cost: 58.54 Cost setting: 0.27143554687500004 Mean cost: 58.54 Cost setting: 0.27148437500000006 Mean cost: 57.86 Cost setting: 0.27143554687500004 Mean cost: 58.54 Cost setting: 0.2714599609375 Mean cost: 58.54 Cost setting: 0.27147216796875 Mean cost: 58.54 Cost setting: 0.271484375 Mean cost: 57.86 Cost setting: 0.27147216796875 Mean cost: 58.54 Cost setting: 0.27147827148437503 Mean cost: 58.54 Cost setting: 0.27148132324218754 Mean cost: 58.54 Cost setting: 0.27148437500000006 Mean cost: 57.86 Cost setting: 0.27148132324218754 Mean cost: 58.54 Cost setting: 0.27148284912109377 Mean cost: 58.54 Cost setting: 0.2714836120605469 Mean cost: 58.54 Best cost setting user weights: 0.271484375 (mean cost: 57.86 )
cost_aux = 0.2
it = 0
increment = 0.05
above = [] # keep track of whether the mean cost is below or above the one for the base policy
all_cost_settings = []
all_mean_cost_results_firstcome = []
optimal_policies_aux_firstcome = []
reward_funcs_firstcome = []
while it < max_iterations - 1:
reward_func_firstcome = pol_comp.compute_reward_func_waitingtime(reward_func,
base_states,
len(FEAT_SEL),
states_aux,
NUM_ACTIONS,
aux_relative_weights["equal"],
FEAT_INDEX_WAITING_TIME,
NUM_VALS_FEAT_WAITING_TIME,
cost_aux)
reward_funcs_firstcome.append(reward_func_firstcome)
optimal_policy_aux, _, _ = pol_comp.get_optimal_policy_full(NUM_ACTIONS,
reward_func_firstcome,
trans_func_aux,
DISCOUNT_FACTOR,
verbose=False)
optimal_policies_aux_firstcome.append(optimal_policy_aux)
action_sum, _, _ = simulate(max_sessions_per_person,
dropout,
capacity,
max_add_at_once,
max_timesteps,
num_starting_people,
optimal_policy_aux,
NUM_SESSIONS_PER_WAITING_TIME,
all_base_states_frac,
trans_func_aux,
reward_func_base_for_aux,
base_states,
states_aux,
NUM_VALS_FEAT_PRIORITY,
NUM_VALS_FEAT_WAITING_TIME)
aux_mean_cost = np.mean(action_sum[cutoff_for_avg:-cutoff_for_avg])
print("Cost setting:", cost_aux, "Mean cost:", np.round(aux_mean_cost, 2))
all_cost_settings.append(cost_aux)
all_mean_cost_results_firstcome.append(aux_mean_cost)
if aux_mean_cost > base_mean_cost:
cost_aux += increment
above.append(1)
else:
cost_aux -= increment
above.append(0)
# Reduce the increment if we have switched to the other side of the base mean cost
if it > 0:
if above[-1] != above[-2]:
increment = increment/2
it += 1
mean_differences = [abs(all_mean_cost_results_firstcome[i] - base_mean_cost) for i in range(len(all_mean_cost_results_firstcome))]
best_cost_setting_idx_firstcome = np.argmin(mean_differences)
print("Best cost setting waiting time:",
all_cost_settings[best_cost_setting_idx_firstcome],
"(mean cost:",
np.round(all_mean_cost_results_firstcome[best_cost_setting_idx_firstcome], 2), ")")
Cost setting: 0.2 Mean cost: 18.28 Cost setting: 0.15000000000000002 Mean cost: 20.05 Cost setting: 0.10000000000000002 Mean cost: 30.65 Cost setting: 0.05000000000000002 Mean cost: 36.62 Cost setting: 1.3877787807814457e-17 Mean cost: 42.85 Cost setting: -0.04999999999999999 Mean cost: 85.5 Cost setting: 1.3877787807814457e-17 Mean cost: 42.85 Cost setting: -0.024999999999999988 Mean cost: 78.59 Cost setting: -0.012499999999999987 Mean cost: 76.48 Cost setting: -0.0062499999999999865 Mean cost: 42.85 Cost setting: -0.012499999999999987 Mean cost: 76.48 Cost setting: -0.009374999999999988 Mean cost: 46.84 Cost setting: -0.010937499999999987 Mean cost: 46.84 Cost setting: -0.011718749999999988 Mean cost: 76.48 Cost setting: -0.010937499999999987 Mean cost: 46.84 Cost setting: -0.011328124999999988 Mean cost: 46.84 Cost setting: -0.011523437499999987 Mean cost: 76.48 Cost setting: -0.011328124999999988 Mean cost: 46.84 Cost setting: -0.011425781249999987 Mean cost: 46.84 Cost setting: -0.011474609374999988 Mean cost: 76.48 Cost setting: -0.011425781249999987 Mean cost: 46.84 Cost setting: -0.011450195312499988 Mean cost: 76.48 Cost setting: -0.011437988281249988 Mean cost: 46.84 Cost setting: -0.011444091796874988 Mean cost: 76.48 Cost setting: -0.011441040039062487 Mean cost: 76.48 Cost setting: -0.011439514160156237 Mean cost: 46.84 Cost setting: -0.011441040039062487 Mean cost: 76.48 Cost setting: -0.011440277099609363 Mean cost: 76.48 Cost setting: -0.0114398956298828 Mean cost: 46.84 Best cost setting waiting time: -0.009374999999999988 (mean cost: 46.84 )
cost_aux = 0.2
it = 0
increment = 0.05
above = [] # keep track of whether the mean cost is below or above the one for the base policy
all_cost_settings = []
all_mean_cost_results_priority = []
optimal_policies_aux_priority = []
reward_funcs_priority = []
while it < max_iterations - 1:
reward_func_priority = pol_comp.compute_reward_func_priority(reward_func,
base_states,
len(FEAT_SEL),
states_aux,
NUM_ACTIONS,
aux_relative_weights["priority"],
FEAT_INDEX_PRIORITY,
NUM_VALS_FEAT_PRIORITY,
cost_aux)
reward_funcs_priority.append(reward_func_priority)
optimal_policy_aux, _, _ = pol_comp.get_optimal_policy_full(NUM_ACTIONS,
reward_func_priority,
trans_func_aux,
DISCOUNT_FACTOR,
verbose=False)
optimal_policies_aux_priority.append(optimal_policy_aux)
action_sum, _, _ = simulate(max_sessions_per_person,
dropout,
capacity,
max_add_at_once,
max_timesteps,
num_starting_people,
optimal_policy_aux,
NUM_SESSIONS_PER_WAITING_TIME,
all_base_states_frac,
trans_func_aux,
reward_func_base_for_aux,
base_states,
states_aux,
NUM_VALS_FEAT_PRIORITY,
NUM_VALS_FEAT_WAITING_TIME)
aux_mean_cost = np.mean(action_sum[cutoff_for_avg:-cutoff_for_avg])
print("Cost setting:", cost_aux, "Mean cost:", np.round(aux_mean_cost, 2))
all_cost_settings.append(cost_aux)
all_mean_cost_results_priority.append(aux_mean_cost)
if aux_mean_cost > base_mean_cost:
cost_aux += increment
above.append(1)
else:
cost_aux -= increment
above.append(0)
# Reduce the increment if we have switched to the other side of the base mean cost
if it > 0:
if above[-1] != above[-2]:
increment = increment/2
it += 1
mean_differences = [abs(all_mean_cost_results_priority[i] - base_mean_cost) for i in range(len(all_mean_cost_results_priority))]
best_cost_setting_idx_priority = np.argmin(mean_differences)
print("Best cost setting priority:",
all_cost_settings[best_cost_setting_idx_priority],
"(mean cost:",
np.round(all_mean_cost_results_priority[best_cost_setting_idx_priority], 2), ")")
Cost setting: 0.2 Mean cost: 49.78 Cost setting: 0.15000000000000002 Mean cost: 84.12 Cost setting: 0.2 Mean cost: 49.78 Cost setting: 0.17500000000000002 Mean cost: 80.34 Cost setting: 0.18750000000000003 Mean cost: 74.16 Cost setting: 0.19375000000000003 Mean cost: 74.16 Cost setting: 0.20000000000000004 Mean cost: 49.78 Cost setting: 0.19375000000000003 Mean cost: 74.16 Cost setting: 0.19687500000000002 Mean cost: 62.95 Cost setting: 0.19843750000000002 Mean cost: 62.95 Cost setting: 0.2 Mean cost: 49.78 Cost setting: 0.19843750000000002 Mean cost: 62.95 Cost setting: 0.19921875000000003 Mean cost: 49.78 Cost setting: 0.19882812500000002 Mean cost: 62.95 Cost setting: 0.1990234375 Mean cost: 53.35 Cost setting: 0.19892578125000002 Mean cost: 62.95 Cost setting: 0.19897460937500003 Mean cost: 53.35 Cost setting: 0.19895019531250002 Mean cost: 53.35 Cost setting: 0.19893798828125003 Mean cost: 62.95 Cost setting: 0.19895019531250002 Mean cost: 53.35 Cost setting: 0.19894409179687503 Mean cost: 53.35 Cost setting: 0.19894104003906252 Mean cost: 62.95 Cost setting: 0.19894409179687503 Mean cost: 53.35 Cost setting: 0.19894256591796877 Mean cost: 53.35 Cost setting: 0.19894180297851566 Mean cost: 62.95 Cost setting: 0.19894256591796877 Mean cost: 53.35 Cost setting: 0.19894218444824222 Mean cost: 62.95 Cost setting: 0.1989423751831055 Mean cost: 53.35 Cost setting: 0.19894227981567386 Mean cost: 53.35 Best cost setting priority: 0.19687500000000002 (mean cost: 62.95 )
cost_aux = 0.2
it = 0
increment = 0.05
above = [] # keep track of whether the mean cost is below or above the one for the base policy
all_cost_settings = []
all_mean_cost_results_sickestfirst = []
optimal_policies_aux_sickestfirst = []
reward_funcs_sickestfirst = []
while it < max_iterations - 1:
reward_func_sickestfirst = pol_comp.compute_reward_func_worst_state(reward_func,
base_states,
len(FEAT_SEL),
states_aux,
NUM_ACTIONS,
aux_relative_weights["sickest-first"],
cost_aux)
reward_funcs_sickestfirst.append(reward_func_sickestfirst)
optimal_policy_aux, _, _ = pol_comp.get_optimal_policy_full(NUM_ACTIONS,
reward_func_sickestfirst,
trans_func_aux,
DISCOUNT_FACTOR,
verbose=False)
optimal_policies_aux_sickestfirst.append(optimal_policy_aux)
action_sum, _, _ = simulate(max_sessions_per_person,
dropout,
capacity,
max_add_at_once,
max_timesteps,
num_starting_people,
optimal_policy_aux,
NUM_SESSIONS_PER_WAITING_TIME,
all_base_states_frac,
trans_func_aux,
reward_func_base_for_aux,
base_states,
states_aux,
NUM_VALS_FEAT_PRIORITY,
NUM_VALS_FEAT_WAITING_TIME)
aux_mean_cost = np.mean(action_sum[cutoff_for_avg:-cutoff_for_avg])
print("Cost setting:", cost_aux, "Mean cost:", np.round(aux_mean_cost, 2))
all_cost_settings.append(cost_aux)
all_mean_cost_results_sickestfirst.append(aux_mean_cost)
if aux_mean_cost > base_mean_cost:
cost_aux += increment
above.append(1)
else:
cost_aux -= increment
above.append(0)
# Reduce the increment if we have switched to the other side of the base mean cost
if it > 0:
if above[-1] != above[-2]:
increment = increment/2
it += 1
mean_differences = [abs(all_mean_cost_results_sickestfirst[i] - base_mean_cost) for i in range(len(all_mean_cost_results_sickestfirst))]
best_cost_setting_idx_sickestfirst = np.argmin(mean_differences)
print("Best cost setting worst state:",
all_cost_settings[best_cost_setting_idx_sickestfirst],
"(mean cost:",
np.round(all_mean_cost_results_sickestfirst[best_cost_setting_idx_sickestfirst], 2), ")")
# Show resulting policy
print("Worst state policy:")
for state_idx, state in enumerate(states_aux):
print("State " + str(state) + ":", optimal_policies_aux_sickestfirst[best_cost_setting_idx_sickestfirst][state_idx])
Cost setting: 0.2 Mean cost: 93.58 Cost setting: 0.25 Mean cost: 69.15 Cost setting: 0.3 Mean cost: 69.15 Cost setting: 0.35 Mean cost: 56.99 Cost setting: 0.3 Mean cost: 69.15 Cost setting: 0.325 Mean cost: 56.99 Cost setting: 0.3125 Mean cost: 69.15 Cost setting: 0.31875 Mean cost: 69.15 Cost setting: 0.32187499999999997 Mean cost: 56.99 Cost setting: 0.31875 Mean cost: 69.15 Cost setting: 0.3203125 Mean cost: 62.06 Cost setting: 0.32109375 Mean cost: 62.06 Cost setting: 0.321875 Mean cost: 56.99 Cost setting: 0.32109375 Mean cost: 62.06 Cost setting: 0.321484375 Mean cost: 62.06 Cost setting: 0.3216796875 Mean cost: 56.99 Cost setting: 0.321484375 Mean cost: 62.06 Cost setting: 0.32158203125 Mean cost: 58.92 Cost setting: 0.321630859375 Mean cost: 56.99 Cost setting: 0.32158203125 Mean cost: 58.92 Cost setting: 0.3216064453125 Mean cost: 56.99 Cost setting: 0.32159423828125 Mean cost: 58.92 Cost setting: 0.321600341796875 Mean cost: 58.92 Cost setting: 0.32160339355468753 Mean cost: 58.92 Cost setting: 0.32160644531250004 Mean cost: 56.99 Cost setting: 0.32160339355468753 Mean cost: 58.92 Cost setting: 0.32160491943359376 Mean cost: 58.92 Cost setting: 0.3216056823730469 Mean cost: 58.92 Cost setting: 0.3216064453125 Mean cost: 56.99 Best cost setting worst state: 0.32158203125 (mean cost: 58.92 ) Worst state policy: State [0, 0, 0, 0, 0]: 1 State [0, 0, 0, 0, 1]: 1 State [0, 0, 0, 0, 2]: 1 State [0, 0, 0, 1, 0]: 1 State [0, 0, 0, 1, 1]: 1 State [0, 0, 0, 1, 2]: 1 State [0, 0, 0, 2, 0]: 1 State [0, 0, 0, 2, 1]: 1 State [0, 0, 0, 2, 2]: 1 State [0, 0, 1, 0, 0]: 0 State [0, 0, 1, 0, 1]: 0 State [0, 0, 1, 0, 2]: 0 State [0, 0, 1, 1, 0]: 0 State [0, 0, 1, 1, 1]: 0 State [0, 0, 1, 1, 2]: 0 State [0, 0, 1, 2, 0]: 0 State [0, 0, 1, 2, 1]: 0 State [0, 0, 1, 2, 2]: 0 State [0, 1, 0, 0, 0]: 1 State [0, 1, 0, 0, 1]: 1 State [0, 1, 0, 0, 2]: 1 State [0, 1, 0, 1, 0]: 1 State [0, 1, 0, 1, 1]: 1 State [0, 1, 0, 1, 2]: 1 State [0, 1, 0, 2, 0]: 1 State [0, 1, 0, 2, 1]: 1 State [0, 1, 0, 2, 2]: 1 State [0, 1, 1, 0, 0]: 0 State [0, 1, 1, 0, 1]: 0 State [0, 1, 1, 0, 2]: 0 State [0, 1, 1, 1, 0]: 0 State [0, 1, 1, 1, 1]: 0 State [0, 1, 1, 1, 2]: 0 State [0, 1, 1, 2, 0]: 0 State [0, 1, 1, 2, 1]: 0 State [0, 1, 1, 2, 2]: 0 State [1, 0, 0, 0, 0]: 1 State [1, 0, 0, 0, 1]: 1 State [1, 0, 0, 0, 2]: 1 State [1, 0, 0, 1, 0]: 1 State [1, 0, 0, 1, 1]: 1 State [1, 0, 0, 1, 2]: 1 State [1, 0, 0, 2, 0]: 1 State [1, 0, 0, 2, 1]: 1 State [1, 0, 0, 2, 2]: 1 State [1, 0, 1, 0, 0]: 0 State [1, 0, 1, 0, 1]: 0 State [1, 0, 1, 0, 2]: 0 State [1, 0, 1, 1, 0]: 0 State [1, 0, 1, 1, 1]: 0 State [1, 0, 1, 1, 2]: 0 State [1, 0, 1, 2, 0]: 0 State [1, 0, 1, 2, 1]: 0 State [1, 0, 1, 2, 2]: 0 State [1, 1, 0, 0, 0]: 0 State [1, 1, 0, 0, 1]: 0 State [1, 1, 0, 0, 2]: 0 State [1, 1, 0, 1, 0]: 0 State [1, 1, 0, 1, 1]: 0 State [1, 1, 0, 1, 2]: 0 State [1, 1, 0, 2, 0]: 0 State [1, 1, 0, 2, 1]: 0 State [1, 1, 0, 2, 2]: 0 State [1, 1, 1, 0, 0]: 0 State [1, 1, 1, 0, 1]: 0 State [1, 1, 1, 0, 2]: 0 State [1, 1, 1, 1, 0]: 0 State [1, 1, 1, 1, 1]: 0 State [1, 1, 1, 1, 2]: 0 State [1, 1, 1, 2, 0]: 0 State [1, 1, 1, 2, 1]: 0 State [1, 1, 1, 2, 2]: 0 State [2, 0, 0, 0, 0]: 0 State [2, 0, 0, 0, 1]: 1 State [2, 0, 0, 0, 2]: 1 State [2, 0, 0, 1, 0]: 0 State [2, 0, 0, 1, 1]: 1 State [2, 0, 0, 1, 2]: 1 State [2, 0, 0, 2, 0]: 0 State [2, 0, 0, 2, 1]: 1 State [2, 0, 0, 2, 2]: 1 State [2, 0, 1, 0, 0]: 0 State [2, 0, 1, 0, 1]: 0 State [2, 0, 1, 0, 2]: 0 State [2, 0, 1, 1, 0]: 0 State [2, 0, 1, 1, 1]: 0 State [2, 0, 1, 1, 2]: 0 State [2, 0, 1, 2, 0]: 0 State [2, 0, 
1, 2, 1]: 0 State [2, 0, 1, 2, 2]: 0 State [2, 1, 0, 0, 0]: 0 State [2, 1, 0, 0, 1]: 0 State [2, 1, 0, 0, 2]: 0 State [2, 1, 0, 1, 0]: 0 State [2, 1, 0, 1, 1]: 0 State [2, 1, 0, 1, 2]: 0 State [2, 1, 0, 2, 0]: 0 State [2, 1, 0, 2, 1]: 0 State [2, 1, 0, 2, 2]: 0 State [2, 1, 1, 0, 0]: 0 State [2, 1, 1, 0, 1]: 0 State [2, 1, 1, 0, 2]: 0 State [2, 1, 1, 1, 0]: 0 State [2, 1, 1, 1, 1]: 0 State [2, 1, 1, 1, 2]: 0 State [2, 1, 1, 2, 0]: 0 State [2, 1, 1, 2, 1]: 0 State [2, 1, 1, 2, 2]: 0
cost_aux = 0.2
it = 0
increment = 0.05
above = [] # keep track of whether the mean cost is below or above the one for the base policy
all_cost_settings = []
all_mean_cost_results_autonomy = []
optimal_policies_aux_autonomy = []
reward_funcs_autonomy = []
while it < max_iterations - 1:
reward_func_autonomy = pol_comp.compute_reward_func_request(reward_func,
base_states,
len(FEAT_SEL),
states_aux,
NUM_ACTIONS,
aux_relative_weights["autonomy"],
FEAT_INDEX_WANTING_HUMAN_SUPPORT,
NUM_VALS_WANTING_HUMAN_SUPPORT,
cost_aux)
reward_funcs_autonomy.append(reward_func_autonomy)
optimal_policy_aux, _, _ = pol_comp.get_optimal_policy_full(NUM_ACTIONS,
reward_func_autonomy,
trans_func_aux,
DISCOUNT_FACTOR,
verbose=False)
optimal_policies_aux_autonomy.append(optimal_policy_aux)
action_sum, _, _ = simulate(max_sessions_per_person,
dropout,
capacity,
max_add_at_once,
max_timesteps,
num_starting_people,
optimal_policy_aux,
NUM_SESSIONS_PER_WAITING_TIME,
all_base_states_frac,
trans_func_aux,
reward_func_base_for_aux,
base_states,
states_aux,
NUM_VALS_FEAT_PRIORITY,
NUM_VALS_FEAT_WAITING_TIME)
aux_mean_cost = np.mean(action_sum[cutoff_for_avg:-cutoff_for_avg])
print("Cost setting:", cost_aux, "Mean cost:", np.round(aux_mean_cost, 2))
all_cost_settings.append(cost_aux)
all_mean_cost_results_autonomy.append(aux_mean_cost)
if aux_mean_cost > base_mean_cost:
cost_aux += increment
above.append(1)
else:
cost_aux -= increment
above.append(0)
# Reduce the increment if we have switched to the other side of the base mean cost
if it > 0:
if above[-1] != above[-2]:
increment = increment/2
it += 1
mean_differences = [abs(all_mean_cost_results_autonomy[i] - base_mean_cost) for i in range(len(all_mean_cost_results_autonomy))]
best_cost_setting_idx_autonomy = np.argmin(mean_differences)
print("Best cost setting worst state:",
all_cost_settings[best_cost_setting_idx_autonomy],
"(mean cost:",
np.round(all_mean_cost_results_autonomy[best_cost_setting_idx_autonomy], 2), ")")
Cost setting: 0.2 Mean cost: 79.96 Cost setting: 0.25 Mean cost: 14.18 Cost setting: 0.2 Mean cost: 79.96 Cost setting: 0.225 Mean cost: 21.98 Cost setting: 0.2125 Mean cost: 21.98 Cost setting: 0.20625 Mean cost: 40.85 Cost setting: 0.19999999999999998 Mean cost: 79.96 Cost setting: 0.20625 Mean cost: 40.85 Cost setting: 0.203125 Mean cost: 49.48 Cost setting: 0.2015625 Mean cost: 79.96 Cost setting: 0.203125 Mean cost: 49.48 Cost setting: 0.20234375 Mean cost: 49.48 Cost setting: 0.20195312499999998 Mean cost: 79.96 Cost setting: 0.20234375 Mean cost: 49.48 Cost setting: 0.2021484375 Mean cost: 56.82 Cost setting: 0.20205078125 Mean cost: 79.96 Cost setting: 0.2021484375 Mean cost: 56.82 Cost setting: 0.202099609375 Mean cost: 79.96 Cost setting: 0.2021240234375 Mean cost: 79.96 Cost setting: 0.20213623046874998 Mean cost: 56.82 Cost setting: 0.2021240234375 Mean cost: 79.96 Cost setting: 0.202130126953125 Mean cost: 56.82 Cost setting: 0.20212707519531248 Mean cost: 79.96 Cost setting: 0.20212860107421873 Mean cost: 79.96 Cost setting: 0.20212936401367185 Mean cost: 79.96 Cost setting: 0.20213012695312496 Mean cost: 56.82 Cost setting: 0.20212936401367185 Mean cost: 79.96 Cost setting: 0.2021297454833984 Mean cost: 56.82 Cost setting: 0.2021295547485351 Mean cost: 79.96 Best cost setting worst state: 0.2021484375 (mean cost: 56.82 )
Now we want to compare the different policies.
One criterion we use to compare the policies is the degree to which they give human feedback to people in states that have the lowest reward when no human feedback is given.
NUM_VALS_WORST_STATE = 3
reward_func_base_for_aux_0 = [reward_func_base_for_aux[state_idx][0] for state_idx in range(num_states_aux)]
reward_func_cutoffs = np.nanpercentile(reward_func_base_for_aux_0, [j/NUM_VALS_WORST_STATE * 100 for j in range(1, NUM_VALS_WORST_STATE + 1)])
print("Reward function percentiles without human feedback:", np.round(reward_func_cutoffs, 2))
states_aux_reward_func_bins = []
for state_idx in range(num_states_aux):
for cutoff_idx, cutoff in enumerate(reward_func_cutoffs):
if reward_func_base_for_aux_0[state_idx] <= cutoff:
# We reverse the index here so that people in a worse state get a higher value, consistent with the other criteria
states_aux_reward_func_bins.append(NUM_VALS_WORST_STATE - 1 - cutoff_idx)
break
for state_idx, state in enumerate(states_aux):
print(state, ":", round(reward_func_base_for_aux_0[state_idx], 2),
"->", states_aux_reward_func_bins[state_idx])
Reward function percentiles without human feedback: [0.5 0.58 0.74] [0, 0, 0, 0, 0] : 0.42 -> 2 [0, 0, 0, 0, 1] : 0.42 -> 2 [0, 0, 0, 0, 2] : 0.42 -> 2 [0, 0, 0, 1, 0] : 0.42 -> 2 [0, 0, 0, 1, 1] : 0.42 -> 2 [0, 0, 0, 1, 2] : 0.42 -> 2 [0, 0, 0, 2, 0] : 0.42 -> 2 [0, 0, 0, 2, 1] : 0.42 -> 2 [0, 0, 0, 2, 2] : 0.42 -> 2 [0, 0, 1, 0, 0] : 0.51 -> 1 [0, 0, 1, 0, 1] : 0.51 -> 1 [0, 0, 1, 0, 2] : 0.51 -> 1 [0, 0, 1, 1, 0] : 0.51 -> 1 [0, 0, 1, 1, 1] : 0.51 -> 1 [0, 0, 1, 1, 2] : 0.51 -> 1 [0, 0, 1, 2, 0] : 0.51 -> 1 [0, 0, 1, 2, 1] : 0.51 -> 1 [0, 0, 1, 2, 2] : 0.51 -> 1 [0, 1, 0, 0, 0] : 0.46 -> 2 [0, 1, 0, 0, 1] : 0.46 -> 2 [0, 1, 0, 0, 2] : 0.46 -> 2 [0, 1, 0, 1, 0] : 0.46 -> 2 [0, 1, 0, 1, 1] : 0.46 -> 2 [0, 1, 0, 1, 2] : 0.46 -> 2 [0, 1, 0, 2, 0] : 0.46 -> 2 [0, 1, 0, 2, 1] : 0.46 -> 2 [0, 1, 0, 2, 2] : 0.46 -> 2 [0, 1, 1, 0, 0] : 0.52 -> 1 [0, 1, 1, 0, 1] : 0.52 -> 1 [0, 1, 1, 0, 2] : 0.52 -> 1 [0, 1, 1, 1, 0] : 0.52 -> 1 [0, 1, 1, 1, 1] : 0.52 -> 1 [0, 1, 1, 1, 2] : 0.52 -> 1 [0, 1, 1, 2, 0] : 0.52 -> 1 [0, 1, 1, 2, 1] : 0.52 -> 1 [0, 1, 1, 2, 2] : 0.52 -> 1 [1, 0, 0, 0, 0] : 0.49 -> 2 [1, 0, 0, 0, 1] : 0.49 -> 2 [1, 0, 0, 0, 2] : 0.49 -> 2 [1, 0, 0, 1, 0] : 0.49 -> 2 [1, 0, 0, 1, 1] : 0.49 -> 2 [1, 0, 0, 1, 2] : 0.49 -> 2 [1, 0, 0, 2, 0] : 0.49 -> 2 [1, 0, 0, 2, 1] : 0.49 -> 2 [1, 0, 0, 2, 2] : 0.49 -> 2 [1, 0, 1, 0, 0] : 0.57 -> 1 [1, 0, 1, 0, 1] : 0.57 -> 1 [1, 0, 1, 0, 2] : 0.57 -> 1 [1, 0, 1, 1, 0] : 0.57 -> 1 [1, 0, 1, 1, 1] : 0.57 -> 1 [1, 0, 1, 1, 2] : 0.57 -> 1 [1, 0, 1, 2, 0] : 0.57 -> 1 [1, 0, 1, 2, 1] : 0.57 -> 1 [1, 0, 1, 2, 2] : 0.57 -> 1 [1, 1, 0, 0, 0] : 0.61 -> 0 [1, 1, 0, 0, 1] : 0.61 -> 0 [1, 1, 0, 0, 2] : 0.61 -> 0 [1, 1, 0, 1, 0] : 0.61 -> 0 [1, 1, 0, 1, 1] : 0.61 -> 0 [1, 1, 0, 1, 2] : 0.61 -> 0 [1, 1, 0, 2, 0] : 0.61 -> 0 [1, 1, 0, 2, 1] : 0.61 -> 0 [1, 1, 0, 2, 2] : 0.61 -> 0 [1, 1, 1, 0, 0] : 0.7 -> 0 [1, 1, 1, 0, 1] : 0.7 -> 0 [1, 1, 1, 0, 2] : 0.7 -> 0 [1, 1, 1, 1, 0] : 0.7 -> 0 [1, 1, 1, 1, 1] : 0.7 -> 0 [1, 1, 1, 1, 2] : 0.7 -> 0 [1, 1, 1, 2, 0] : 0.7 -> 0 [1, 1, 1, 2, 1] : 0.7 -> 0 [1, 1, 1, 2, 2] : 0.7 -> 0 [2, 0, 0, 0, 0] : 0.49 -> 2 [2, 0, 0, 0, 1] : 0.49 -> 2 [2, 0, 0, 0, 2] : 0.49 -> 2 [2, 0, 0, 1, 0] : 0.49 -> 2 [2, 0, 0, 1, 1] : 0.49 -> 2 [2, 0, 0, 1, 2] : 0.49 -> 2 [2, 0, 0, 2, 0] : 0.49 -> 2 [2, 0, 0, 2, 1] : 0.49 -> 2 [2, 0, 0, 2, 2] : 0.49 -> 2 [2, 0, 1, 0, 0] : 0.6 -> 0 [2, 0, 1, 0, 1] : 0.6 -> 0 [2, 0, 1, 0, 2] : 0.6 -> 0 [2, 0, 1, 1, 0] : 0.6 -> 0 [2, 0, 1, 1, 1] : 0.6 -> 0 [2, 0, 1, 1, 2] : 0.6 -> 0 [2, 0, 1, 2, 0] : 0.6 -> 0 [2, 0, 1, 2, 1] : 0.6 -> 0 [2, 0, 1, 2, 2] : 0.6 -> 0 [2, 1, 0, 0, 0] : 0.54 -> 1 [2, 1, 0, 0, 1] : 0.54 -> 1 [2, 1, 0, 0, 2] : 0.54 -> 1 [2, 1, 0, 1, 0] : 0.54 -> 1 [2, 1, 0, 1, 1] : 0.54 -> 1 [2, 1, 0, 1, 2] : 0.54 -> 1 [2, 1, 0, 2, 0] : 0.54 -> 1 [2, 1, 0, 2, 1] : 0.54 -> 1 [2, 1, 0, 2, 2] : 0.54 -> 1 [2, 1, 1, 0, 0] : 0.74 -> 0 [2, 1, 1, 0, 1] : 0.74 -> 0 [2, 1, 1, 0, 2] : 0.74 -> 0 [2, 1, 1, 1, 0] : 0.74 -> 0 [2, 1, 1, 1, 1] : 0.74 -> 0 [2, 1, 1, 1, 2] : 0.74 -> 0 [2, 1, 1, 2, 0] : 0.74 -> 0 [2, 1, 1, 2, 1] : 0.74 -> 0 [2, 1, 1, 2, 2] : 0.74 -> 0
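For reference, we can count how many of the 108 full states fall into each worst-state bin (an illustrative check, not part of the original analysis):
# Number of full states per worst-state bin (0 = best, 2 = worst)
print("States per worst-state bin:",
      [states_aux_reward_func_bins.count(b) for b in range(NUM_VALS_WORST_STATE)])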
We also consider in which states giving human feedback leads to the largest increase in Q-values.
NUM_VALS_PROGNOSIS_CRITERION = 3
q_vals_diff = [q_vals_base[state_idx][1] - q_vals_base[state_idx][0] for state_idx in range(num_states_aux)]
q_vals_cutoffs = np.nanpercentile(q_vals_diff, [j/NUM_VALS_PROGNOSIS_CRITERION * 100 for j in range(1, NUM_VALS_PROGNOSIS_CRITERION + 1)])
print("Q-values difference percentiles:", np.round(q_vals_cutoffs, 2))
states_aux_q_vals_diff_bins = []
for state_idx in range(num_states_aux):
for cutoff_idx, cutoff in enumerate(q_vals_cutoffs):
if q_vals_diff[state_idx] <= cutoff:
states_aux_q_vals_diff_bins.append(cutoff_idx)
break
for state_idx, state in enumerate(states_aux):
print(state, ":", round(q_vals_diff[state_idx], 2),
"->", states_aux_q_vals_diff_bins[state_idx])
Q-values difference percentiles: [-0.04 0.02 0.11] [0, 0, 0, 0, 0] : -0.05 -> 0 [0, 0, 0, 0, 1] : -0.05 -> 0 [0, 0, 0, 0, 2] : -0.05 -> 0 [0, 0, 0, 1, 0] : -0.05 -> 0 [0, 0, 0, 1, 1] : -0.05 -> 0 [0, 0, 0, 1, 2] : -0.05 -> 0 [0, 0, 0, 2, 0] : -0.05 -> 0 [0, 0, 0, 2, 1] : -0.05 -> 0 [0, 0, 0, 2, 2] : -0.05 -> 0 [0, 0, 1, 0, 0] : -0.05 -> 0 [0, 0, 1, 0, 1] : -0.05 -> 0 [0, 0, 1, 0, 2] : -0.05 -> 0 [0, 0, 1, 1, 0] : -0.05 -> 0 [0, 0, 1, 1, 1] : -0.05 -> 0 [0, 0, 1, 1, 2] : -0.05 -> 0 [0, 0, 1, 2, 0] : -0.05 -> 0 [0, 0, 1, 2, 1] : -0.05 -> 0 [0, 0, 1, 2, 2] : -0.05 -> 0 [0, 1, 0, 0, 0] : 0.04 -> 2 [0, 1, 0, 0, 1] : 0.04 -> 2 [0, 1, 0, 0, 2] : 0.04 -> 2 [0, 1, 0, 1, 0] : 0.04 -> 2 [0, 1, 0, 1, 1] : 0.04 -> 2 [0, 1, 0, 1, 2] : 0.04 -> 2 [0, 1, 0, 2, 0] : 0.04 -> 2 [0, 1, 0, 2, 1] : 0.04 -> 2 [0, 1, 0, 2, 2] : 0.04 -> 2 [0, 1, 1, 0, 0] : 0.03 -> 2 [0, 1, 1, 0, 1] : 0.03 -> 2 [0, 1, 1, 0, 2] : 0.03 -> 2 [0, 1, 1, 1, 0] : 0.03 -> 2 [0, 1, 1, 1, 1] : 0.03 -> 2 [0, 1, 1, 1, 2] : 0.03 -> 2 [0, 1, 1, 2, 0] : 0.03 -> 2 [0, 1, 1, 2, 1] : 0.03 -> 2 [0, 1, 1, 2, 2] : 0.03 -> 2 [1, 0, 0, 0, 0] : -0.01 -> 1 [1, 0, 0, 0, 1] : -0.01 -> 1 [1, 0, 0, 0, 2] : -0.01 -> 1 [1, 0, 0, 1, 0] : -0.01 -> 1 [1, 0, 0, 1, 1] : -0.01 -> 1 [1, 0, 0, 1, 2] : -0.01 -> 1 [1, 0, 0, 2, 0] : -0.01 -> 1 [1, 0, 0, 2, 1] : -0.01 -> 1 [1, 0, 0, 2, 2] : -0.01 -> 1 [1, 0, 1, 0, 0] : -0.02 -> 1 [1, 0, 1, 0, 1] : -0.02 -> 1 [1, 0, 1, 0, 2] : -0.02 -> 1 [1, 0, 1, 1, 0] : -0.02 -> 1 [1, 0, 1, 1, 1] : -0.02 -> 1 [1, 0, 1, 1, 2] : -0.02 -> 1 [1, 0, 1, 2, 0] : -0.02 -> 1 [1, 0, 1, 2, 1] : -0.02 -> 1 [1, 0, 1, 2, 2] : -0.02 -> 1 [1, 1, 0, 0, 0] : 0.04 -> 2 [1, 1, 0, 0, 1] : 0.04 -> 2 [1, 1, 0, 0, 2] : 0.04 -> 2 [1, 1, 0, 1, 0] : 0.04 -> 2 [1, 1, 0, 1, 1] : 0.04 -> 2 [1, 1, 0, 1, 2] : 0.04 -> 2 [1, 1, 0, 2, 0] : 0.04 -> 2 [1, 1, 0, 2, 1] : 0.04 -> 2 [1, 1, 0, 2, 2] : 0.04 -> 2 [1, 1, 1, 0, 0] : 0.11 -> 2 [1, 1, 1, 0, 1] : 0.11 -> 2 [1, 1, 1, 0, 2] : 0.11 -> 2 [1, 1, 1, 1, 0] : 0.11 -> 2 [1, 1, 1, 1, 1] : 0.11 -> 2 [1, 1, 1, 1, 2] : 0.11 -> 2 [1, 1, 1, 2, 0] : 0.11 -> 2 [1, 1, 1, 2, 1] : 0.11 -> 2 [1, 1, 1, 2, 2] : 0.11 -> 2 [2, 0, 0, 0, 0] : -0.04 -> 1 [2, 0, 0, 0, 1] : -0.04 -> 1 [2, 0, 0, 0, 2] : -0.04 -> 1 [2, 0, 0, 1, 0] : -0.04 -> 1 [2, 0, 0, 1, 1] : -0.04 -> 1 [2, 0, 0, 1, 2] : -0.04 -> 1 [2, 0, 0, 2, 0] : -0.04 -> 1 [2, 0, 0, 2, 1] : -0.04 -> 1 [2, 0, 0, 2, 2] : -0.04 -> 1 [2, 0, 1, 0, 0] : -0.09 -> 0 [2, 0, 1, 0, 1] : -0.09 -> 0 [2, 0, 1, 0, 2] : -0.09 -> 0 [2, 0, 1, 1, 0] : -0.09 -> 0 [2, 0, 1, 1, 1] : -0.09 -> 0 [2, 0, 1, 1, 2] : -0.09 -> 0 [2, 0, 1, 2, 0] : -0.09 -> 0 [2, 0, 1, 2, 1] : -0.09 -> 0 [2, 0, 1, 2, 2] : -0.09 -> 0 [2, 1, 0, 0, 0] : -0.05 -> 0 [2, 1, 0, 0, 1] : -0.05 -> 0 [2, 1, 0, 0, 2] : -0.05 -> 0 [2, 1, 0, 1, 0] : -0.05 -> 0 [2, 1, 0, 1, 1] : -0.05 -> 0 [2, 1, 0, 1, 2] : -0.05 -> 0 [2, 1, 0, 2, 0] : -0.05 -> 0 [2, 1, 0, 2, 1] : -0.05 -> 0 [2, 1, 0, 2, 2] : -0.05 -> 0 [2, 1, 1, 0, 0] : 0.01 -> 1 [2, 1, 1, 0, 1] : 0.01 -> 1 [2, 1, 1, 0, 2] : 0.01 -> 1 [2, 1, 1, 1, 0] : 0.01 -> 1 [2, 1, 1, 1, 1] : 0.01 -> 1 [2, 1, 1, 1, 2] : 0.01 -> 1 [2, 1, 1, 2, 0] : 0.01 -> 1 [2, 1, 1, 2, 1] : 0.01 -> 1 [2, 1, 1, 2, 2] : 0.01 -> 1
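And analogously for the prognosis bins (here a higher bin means a larger Q-value gain from human feedback):
# Number of full states per prognosis bin (0 = smallest gain, 2 = largest gain)
print("States per prognosis bin:",
      [states_aux_q_vals_diff_bins.count(b) for b in range(NUM_VALS_PROGNOSIS_CRITERION)])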
Now we compute who is given feedback according to the different policies.
policies = [optimal_policy_base,
optimal_policies_aux_firstcome[best_cost_setting_idx_firstcome],
optimal_policies_aux_sickestfirst[best_cost_setting_idx_sickestfirst],
optimal_policies_aux_autonomy[best_cost_setting_idx_autonomy],
optimal_policies_aux_priority[best_cost_setting_idx_priority],
optimal_policies_aux_userweighted[best_cost_setting_idx_userweighted]]
policy_names = ["Prognosis",
"+ first-come, first-served",
"+ sickest-first",
"+ autonomy",
"+ priority",
"Smoker-preferred weighting"]
perc_states_priority_all_own_pop = []
perc_states_autonomy_all_own_pop = []
perc_states_firstcome_all_own_pop = []
perc_states_sickestfirst_all_own_pop = []
perc_states_qvalsdiff_all_own_pop = []
perc_states_userweighted_all_own_pop = []
for policy_idx, policy in enumerate(policies):
print("\n", policy_names[policy_idx])
perc_states_priority = []
perc_states_autonomy = []
perc_states_firstcome = []
perc_states_sickestfirst = []
perc_states_qval_diff = []
for priority in range(NUM_VALS_FEAT_PRIORITY):
states_with_support = [i for i in range(num_states_aux) if policy[i] == 1]
state_feat_val_with_support = [i for i in range(num_states_aux) if policy[i] == 1 and states_aux[i][FEAT_INDEX_PRIORITY] == priority]
perc_states_priority.append(len(state_feat_val_with_support)/len(states_with_support))
print("Priority:", np.round(perc_states_priority, 2))
for firstcome in range(NUM_VALS_FEAT_WAITING_TIME):
states_with_support = [i for i in range(num_states_aux) if policy[i] == 1]
state_feat_val_with_support = [i for i in range(num_states_aux) if policy[i] == 1 and states_aux[i][FEAT_INDEX_WAITING_TIME] == firstcome]
perc_states_firstcome.append(len(state_feat_val_with_support)/len(states_with_support))
print("First-come, first-served:", np.round(perc_states_firstcome, 2))
for autonomy in range(NUM_VALS_WANTING_HUMAN_SUPPORT):
states_with_support = [i for i in range(num_states_aux) if policy[i] == 1]
state_feat_val_with_support = [i for i in range(num_states_aux) if policy[i] == 1 and states_aux[i][FEAT_INDEX_WANTING_HUMAN_SUPPORT] == autonomy]
perc_states_autonomy.append(len(state_feat_val_with_support)/len(states_with_support))
print("Autonomy:", np.round(perc_states_autonomy, 2))
for sickestfirst in range(NUM_VALS_WORST_STATE):
states_with_support = [i for i in range(num_states_aux) if policy[i] == 1]
state_feat_val_with_support = [i for i in range(num_states_aux) if policy[i] == 1 and states_aux_reward_func_bins[i] == sickestfirst]
perc_states_sickestfirst.append(len(state_feat_val_with_support)/len(states_with_support))
print("Sickest first:", np.round(perc_states_sickestfirst, 2))
for prognosis in range(NUM_VALS_PROGNOSIS_CRITERION):
states_with_support = [i for i in range(num_states_aux) if policy[i] == 1]
state_feat_val_with_support = [i for i in range(num_states_aux) if policy[i] == 1 and states_aux_q_vals_diff_bins[i] == prognosis]
perc_states_qval_diff.append(len(state_feat_val_with_support)/len(states_with_support))
print("Prognosis:", np.round(perc_states_qval_diff, 2))
perc_states_priority_all_own_pop.append(perc_states_priority)
perc_states_autonomy_all_own_pop.append(perc_states_autonomy)
perc_states_firstcome_all_own_pop.append(perc_states_firstcome)
perc_states_sickestfirst_all_own_pop.append(perc_states_sickestfirst)
perc_states_qvalsdiff_all_own_pop.append(perc_states_qval_diff)
Prognosis Priority: [0.33 0.33 0.33] First-come, first-served: [0.33 0.33 0.33] Autonomy: [0.4 0.6] Sickest first: [0.6 0.2 0.2] Prognosis: [0. 0.2 0.8] + first-come, first-served Priority: [0.33 0.33 0.33] First-come, first-served: [0.14 0.33 0.52] Autonomy: [0.62 0.38] Sickest first: [0.29 0.24 0.48] Prognosis: [0.19 0.33 0.48] + sickest-first Priority: [0.33 0.33 0.33] First-come, first-served: [0.27 0.36 0.36] Autonomy: [1. 0.] Sickest first: [0. 0. 1.] Prognosis: [0.27 0.45 0.27] + autonomy Priority: [0.33 0.33 0.33] First-come, first-served: [0.29 0.35 0.35] Autonomy: [0.18 0.82] Sickest first: [0.29 0.53 0.18] Prognosis: [0.18 0.29 0.53] + priority Priority: [0. 0.4 0.6] First-come, first-served: [0.33 0.33 0.33] Autonomy: [0.6 0.4] Sickest first: [0.27 0.27 0.47] Prognosis: [0.2 0.27 0.53] Smoker-preferred weighting Priority: [0.24 0.32 0.44] First-come, first-served: [0.18 0.32 0.5 ] Autonomy: [0.79 0.21] Sickest first: [0. 0.21 0.79] Prognosis: [0.35 0.29 0.35]
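Since every state that receives human feedback falls into exactly one bin per criterion, the fractions per criterion should sum to one for each policy; a quick assertion (not in the original notebook) can confirm this before plotting:
# Each criterion partitions the states that receive human feedback, so the fractions sum to 1
for perc_all in [perc_states_priority_all_own_pop,
                 perc_states_autonomy_all_own_pop,
                 perc_states_firstcome_all_own_pop,
                 perc_states_sickestfirst_all_own_pop,
                 perc_states_qvalsdiff_all_own_pop]:
    assert all(abs(sum(percs) - 1) < 1e-9 for percs in perc_all)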
sns.set()
sns.set_style("white")
med_fontsize = 22
small_fontsize = 18
extrasmall_fontsize = 15
sns.set_context("paper", rc={"font.size":small_fontsize,"axes.titlesize":med_fontsize,"axes.labelsize":med_fontsize,
'xtick.labelsize':small_fontsize, 'ytick.labelsize':small_fontsize,
'legend.fontsize':extrasmall_fontsize,'legend.title_fontsize': extrasmall_fontsize})
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')
colors = ["#bbccee", "#cceeff", "#ccddaa", "#eeeebb", "#ffcccc", "#dddddd"]
patterns = ["", "/" , "-" , "x", ".", "\\"]
width = 0.1 # the width of the bars
And now we create the plot.
perc_per_criterion_own_pop = [perc_states_qvalsdiff_all_own_pop,
perc_states_firstcome_all_own_pop,
perc_states_sickestfirst_all_own_pop,
perc_states_autonomy_all_own_pop,
perc_states_priority_all_own_pop]
# Number of possible values per criterion
criterion_num_vals = [NUM_VALS_PROGNOSIS_CRITERION,
NUM_VALS_FEAT_WAITING_TIME,
NUM_VALS_WORST_STATE,
NUM_VALS_WANTING_HUMAN_SUPPORT,
NUM_VALS_FEAT_PRIORITY]
criterion_vals_names = {"Priority": ["low", "medium", "high"],
"Autonomy": ["low", "high"],
"First-come,\nfirst-served": ["short waiting time", "medium waiting time",
"long waiting time"],
"Sickest-first": ["low",
"medium",
"high"],
"Prognosis": ["low", "medium", 'high']}
criterion_names_plots = ["Prognosis",
"First-come,\nfirst-served",
"Sickest-first",
"Autonomy",
"Priority"]
fig, ax = plt.subplots(layout='constrained', nrows=1, ncols=5, figsize =(15, 10))
for criterion_idx, criterion_name in enumerate(criterion_names_plots):
x = np.arange(len(policies)) # the label locations
multiplier = 0
bottom_vals = np.zeros(len(policies))
for criterion_val in range(criterion_num_vals[criterion_idx]):
y_vals = np.array([perc_per_criterion_own_pop[criterion_idx][policy_idx][criterion_val]*100 for policy_idx in range(len(policies))])
color = colors[criterion_val]
hatch = patterns[criterion_val]
# Autonomy only has two values (low, high), so use the color/pattern of the other criteria's 'high' value for it
if criterion_name == "Autonomy" and criterion_val == 1:
color = colors[criterion_val + 1]
hatch = patterns[criterion_val + 1]
ax[criterion_idx].bar(x, y_vals,
color=color,
hatch = hatch,
bottom = bottom_vals,
label = criterion_vals_names[criterion_name][criterion_val])
bottom_vals += np.array(y_vals)
# Highlight bar of policy corresponding to current criterion
ax[criterion_idx].plot([x[criterion_idx] - 3.5 * width, x[criterion_idx] - 3.5 * width],
[0, 100],
color = 'dimgray',
linewidth = 3)
ax[criterion_idx].plot([x[criterion_idx] + 3.5 * width, x[criterion_idx] + 3.5 * width],
[0, 100],
color = 'dimgray',
linewidth = 3)
ax[criterion_idx].set_xticks(x, policy_names, rotation=90)
ax[criterion_idx].set_xlabel(criterion_names_plots[criterion_idx])
ax[criterion_idx].set_ylim((0, 100))
ax[0].set_ylabel("Percentage of human feedback\nallocated to subgroups")
plt.legend(bbox_to_anchor = (1, 0.6), title = "Criterion value")
plt.savefig("Figures/full_stacked_cost.pdf", dpi=1500, bbox_inches='tight', pad_inches=0)