Numba / Dev Reinforcement learning framework
This notebook gathers the functions that create the different kinds of environments used for foraging and target search in various scenarios.
Helpers
isBetween
isBetween_c_Vec_numba
isBetween_c_Vec_numba (a, b, c, r)
Checks whether the step from point a to point b crosses any of the targets c, i.e. whether the segment from a to b passes within a distance r of them.
| | Type | Details |
|---|---|---|
| a | tensor, shape = (1,2) | Previous position. |
| b | tensor, shape = (1,2) | Current position. |
| c | tensor, shape = (Nt,2) | Positions of all targets. |
| r | int/float | Target radius. |
| **Returns** | array of boolean values | True at the indices of found targets. |
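To make the geometry explicit, here is a minimal NumPy sketch of this kind of check (an illustrative reimplementation with a hypothetical name, not the Numba-compiled function itself): a target counts as found when its distance to the segment from a to b is at most r.

```python
import numpy as np

def is_between_sketch(a, b, c, r):
    # Illustrative helper: True where target c_i lies within distance r
    # of the segment going from a to b.
    ab = b - a
    t = np.clip(((c - a) @ ab) / (ab @ ab), 0.0, 1.0)  # projection onto the segment, clamped
    closest = a + t[:, None] * ab                      # closest point of the segment to each target
    return np.linalg.norm(c - closest, axis=1) <= r
```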
```python
isBetween_c_Vec_numba(np.array([0.1,1]), np.array([1,3]), np.random.rand(100,2), 0.00001) # compiling
```

4.48 µs ± 16.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

```python
from rl_opts.utils import isBetween_c_Vec as oldbetween
```

46.9 µs ± 442 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
Pareto sampling
pareto_sample
pareto_sample (alpha, xm, size=1)
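A Pareto(alpha, xm) variable can be obtained by inverse-transform sampling, which is presumably what this helper does; a minimal Numba-compatible sketch (hypothetical name):

```python
import numpy as np
from numba import njit

@njit
def pareto_sample_sketch(alpha, xm, size=1):
    # If u ~ U(0,1), then xm * (1 - u)**(-1/alpha) follows a Pareto(alpha, xm) law.
    u = np.random.rand(size)
    return xm * (1.0 - u) ** (-1.0 / alpha)
```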
Random sampling from an array with given probabilities
rand_choice_nb
rand_choice_nb (arr, prob)
:param arr: A 1D numpy array of values to sample from.
:param prob: A 1D numpy array of probabilities for the given samples.
:return: A random sample from the given array with a given probability.
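np.random.choice with probabilities is not supported inside Numba's nopython mode, which is why a helper like this is needed. A common way to implement it (a sketch of the likely approach, not necessarily the exact library code) is to invert the cumulative distribution:

```python
import numpy as np
from numba import njit

@njit
def rand_choice_sketch(arr, prob):
    # Draw u ~ U(0,1) and pick the first entry whose cumulative probability exceeds u.
    return arr[np.searchsorted(np.cumsum(prob), np.random.rand())]
```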
Environments
TargetEnv
TargetEnv
TargetEnv (*args, **kwargs)
Class defining the foraging environment. It includes the methods needed to place several agents in the world.
Runtime testing
```python
env = TargetEnv(Nt = 1000,
                L = 123,
                r = 50,
                lc = np.array([[0.1],[1]]),
                lc_distribution = 'pareto')
env.check_encounter() # compiling
```

15.5 µs ± 892 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

```python
env = TargetEnv(Nt = 1000,
                L = 123,
                r = 50,
                lc = np.array([[0.1, 0.3, 0.5, 0.5],[0.8, 0.1, 0.05, 0.05]]))
env.check_encounter() # compiling
```

12 µs ± 7.38 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
from rl_opts.rl_framework.legacy import TargetEnv as oldEnv
```python
oenv = oldEnv(Nt = 100,
              L = 123,
              r = 0.2,
              lc = 1)
```
/home/gorka/github/fork_rl_opts/rl_opts/utils.py:36: RuntimeWarning: invalid value encountered in divide
mask[np.argwhere(np.abs(np.cross(b-a, c-a))/np.linalg.norm(b-a) > r)] = False
232 µs ± 388 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
```python
@jitclass
class move():
    env : TargetEnv

    def __init__(self,
                 Nt = 1000,
                 L = 123.0,
                 r = 0.2,
                 lc = 1.0,
                 TIME_EP = 10):
        self.env = TargetEnv(Nt, L, r, np.array([[0.1, 0.3, 0.5, 0.5],[0.8, 0.1, 0.05, 0.05]]), 1, 1, False, 'power_law')

    def run(self, t):
        for time in range(t):
            self.env.update_pos(False)
            self.env.check_encounter()
            self.env.check_bc()

k = move()
onerun = k.run(5)
```
123 ms ± 61.2 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
from rl_opts.rl_framework.legacy import TargetEnv as oldEnv
```python
oenv = oldEnv(Nt = 1000,
              L = 123,
              r = 0.2,
              lc = 1)

def old_run(t):
    for t in range(t):
        oenv.update_pos(False)
        oenv.check_encounter()
        oenv.check_bc()
```
2.85 s ± 48.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Walk from policy
single_agent_walk
single_agent_walk (N_runs, time_ep, policy, env)
multi_agents_walk
multi_agents_walk (N_runs, time_ep, N_agents, Nt=100, L=100, r=0.5, lc=array([[1.], [1.]]), num_agents=1, agent_step=1, destructive_targets=False, lc_distribution='constant', policy=array([[1, 1, …, 1], [0, 0, …, 0]]))

(The default policy is a (2, 100) array with ones in the first row and zeros in the second.)
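For example, a walk of several agents following a custom policy could be launched as follows (a usage sketch; the policy rows are assumed to give, for each value of the step counter, the probabilities of the two actions, and the exact shape of the returned results depends on the implementation):

```python
import numpy as np

# Hypothetical usage sketch of multi_agents_walk with a non-default policy.
policy = np.vstack([np.full(100, 0.9), np.full(100, 0.1)])
results = multi_agents_walk(N_runs = 10, time_ep = 1000, N_agents = 4,
                            Nt = 100, L = 100, r = 0.5, policy = policy)
```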
ResetEnv
1D
ResetEnv_1D
ResetEnv_1D (*args, **kwargs)
Initialize self. See help(type(self)) for accurate signature.
multi_loop_exp
multi_loop_exp (T, rates, L, D)
multi_loop_constant
multi_loop_constant (T, resets, L, D)
reset_search_loop
reset_search_loop (T, reset_policy, env)
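The loops above share the same structure; as an orientation, this is a sketch of what such a loop computes under stated assumptions (a discrete-time 1D walker with diffusivity D that restarts at the origin according to reset_policy and is rewarded when it reaches L). It is not the library implementation:

```python
import numpy as np

def reset_search_loop_sketch(T, reset_policy, L, D=0.5):
    # reset_policy[k]: probability of resetting after k steps since the last reset.
    position, counter, rewards = 0.0, 0, 0
    sigma = np.sqrt(2 * D)                     # step scale of the diffusive walk
    for _ in range(T):
        if np.random.rand() < reset_policy[counter]:
            position, counter = 0.0, 0         # reset to the origin
        else:
            position += sigma * np.random.randn()
            counter = min(counter + 1, len(reset_policy) - 1)
        if position >= L:                      # target reached: collect reward and restart
            rewards += 1
            position, counter = 0.0, 0
    return rewards / T
```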
```python
L = 10.0; D = 1/2; T = int(1e3)
resets = np.linspace(70, 150, 40).astype(np.int64)
```
121 µs ± 46.6 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
112 µs ± 27.3 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
2D
ResetEnv_2D
ResetEnv_2D (*args, **kwargs)
Initialize self. See help(type(self)) for accurate signature.
multi_loop_policies_2D
multi_loop_policies_2D (T, reset_policies, dist_target, radius_target, D)
multi_loop_exp_2D
multi_loop_exp_2D (T, rates, dist_target, radius_target, D)
multi_loop_constant_2D
multi_loop_constant_2D (T, resets, dist_target, radius_target, D)
```python
# from rl_opts.rl_framework_numba import multi_loop_constant_2D
from rl_opts.rl_framework_numba import isBetween_c_Vec_numba, reset_search_loop

resets = (3*np.arange(10)+1).astype(int)
T = int(1e6); dist_target = 5; radius_target = 1; D = 1
plt.plot(resets, multi_loop_constant_2D(T, resets, dist_target, radius_target, D))
```
Loop saving position
reset_loop_saving_positions_2D
reset_loop_saving_positions_2D (n_agents, T, reset_policy, dist_target, radius_target, D)
```python
from rl_opts.rl_framework_numba import ResetEnv_2D

env = ResetEnv_2D(dist_target = 5, radius_target = 1, D = 1)

reset = 1000
reset_policy = np.zeros(reset)
reset_policy[reset-1] = 1
# reset_policy[int(reset/2)] = 0.5

dist_target = 2; radius_target = 0.5; D = 0.1
target_position = np.array([dist_target*np.cos(np.pi/4), dist_target*np.sin(np.pi/4)])

T = 50
positions = reset_loop_saving_positions_2D(n_agents = int(1e6), T = T, reset_policy = reset_policy,
                                           dist_target = dist_target, radius_target = radius_target, D = D)
```
```python
bins = np.linspace(-dist_target, dist_target, 200)

for t in range(T)[::10]:
    h, _ = np.histogram(pos[pos[:,t,0] != 0,t,0], bins = bins);
    plt.plot(bins[1:], h/h.max())
```
```python
# import matplotlib.patches as patches

bins = np.linspace(-7, 7, 100)
t = 3
# plt.scatter(target_position[0], target_position[1], s = 10, c = 'r', zorder = 10)
plt.hist2d(pos[pos[:,t,0] != 0,t,0], pos[pos[:,t,0] != 0,t,1], bins = bins, cmap = 'Oranges');

circle = patches.Circle(target_position, radius_target, edgecolor='r', facecolor='none')
plt.gca().add_patch(circle)

# Adjust the aspect ratio
plt.gca().set_aspect('equal', adjustable='box')
```
Animation
animate_positions_with_target
animate_positions_with_target (bins, positions, radius_target, target_position, cmap='Oranges')
```python
bins = np.linspace(-10, 10, 1000)

ani = animation(bins, positions, radius_target, target_position, cmap = 'Greys')
# Display the animation
HTML(ani.to_jshtml())
```
Move + Reset Env
1D
MoveResetEnv_1D
MoveResetEnv_1D (*args, **kwargs)
Initialize self. See help(type(self)) for accurate signature.
```python
env = MoveResetEnv_1D(L = 15, step_length = 1)

pos = []
for time in range(10000):
    pos.append(env.position)
    env.update_pos(False if np.random.rand() > 0.5 else True,
                   True if time % 2500 == 0 else False)

plt.plot(pos)
plt.axhline(env.L)
```
<matplotlib.lines.Line2D>
No-train search loops
multi_loop_MoveReset_allfixed
multi_loop_MoveReset_allfixed (T, resets, turns, L, step_length)
MoveReset_allfixed
MoveReset_allfixed (T, reset, turn, env)
```python
env = MoveResetEnv_1D(L = 5, step_length = 1)

T = int(1e7)
reset = 500
turns = np.arange(10, 20)
L = 5.0
step_length = 1.0

env = MoveResetEnv_1D(L = L, step_length = step_length)

#rewards = MoveReset_allfixed(T = T, reset = reset, turn = turns[0], env = env)
rews = multi_loop_MoveReset_allfixed(T = int(1e8), resets = np.array([500]), turns = np.arange(5, 10), L = 16.0, step_length = 1.0)

plt.plot(rews[0]/T)
```
2D
MoveResetEnv_2D
MoveResetEnv_2D (*args, **kwargs)
Open question: here we consider no boundaries, to properly replicate the 1D case. But will the MoveReset environment eventually need boundaries?
```python
L = 100
env = MoveResetEnv_2D(dist_target = 2)
T = 200
pos = np.zeros((2, T))
time_enc = []
for time in (range(T)):
    pos[:, time] = env.position
    rew = env.update_pos(False if np.random.rand() > 0.5 else True, True if time % 500 == 0 else False)
    if rew == 1:
        time_enc.append(time)
print(len(time_enc))
```
1
```python
fig, ax = plt.subplots()
ax.plot(pos[0], pos[1])

target = plt.Circle(env.target_position[0], env.r, color='C1')
ax.add_patch(target)

for t in time_enc:
    plt.plot(pos[0, t:t+2], pos[1, t:t+2], c = 'k', lw = 2)
```
Note that what we see highlighted here is the resetting step: the actual step at which the target was crossed is not recorded, because env.init_env() is called when the target is crossed, so the position returned by the function is already init_position.
Multi-target
MoveResetEnv_multiTarget_2D
MoveResetEnv_multiTarget_2D (*args, **kwargs)
Initialize self. See help(type(self)) for accurate signature.
```python
L = 100
env = MoveResetEnv_2D(L = L, Nt = 100, init_position=np.array([L/2, L/2]))
T = 200
pos = np.zeros((2, T))
target = []
for time in (range(T)):
    pos[:, time] = env.position[0]
    env.update_pos(False if np.random.rand() > 0.5 else True, True if time % 500 == 0 else False)
    if env.check_encounter() == 1: target.append(time)
    #env.check_encounter()
    env.check_bc()
```
If you want to consider cases with l_c, use MoveResetEnv_withlc_2D
```python
plt.plot(pos[0])
for t in target: plt.axvline(t+1, c = 'k', alpha = 0.3)
plt.axhline(50, c = 'k', alpha = 0.3)
```
<matplotlib.lines.Line2D>
With l_c
MoveResetEnv_withlc_2D
MoveResetEnv_withlc_2D (*args, **kwargs)
Class defining the foraging environment. It includes the methods needed to place several agents in the world.
```python
L = 100
env = MoveResetEnv_2D(L = L, Nt = 100, init_positions=np.array([[L/2, L/2]]), lc_distribution = 'none')
T = 200
pos = np.zeros((2, T))
target = []
for time in tqdm(range(T)):
    pos[:, time] = env.positions[0]
    env.update_pos(False if np.random.rand() > 0.5 else True, True if time % 500 == 0 else False)
    if env.check_encounter() == 1: target.append(time)
    #env.check_encounter()
    env.check_bc()
```
```python
plt.plot(pos[0])
for t in target: plt.axvline(t+1, c = 'k', alpha = 0.3)
plt.axhline(50, c = 'k', alpha = 0.3)
```
<matplotlib.lines.Line2D>
No-train search loops
Base MoveReset_2D
multi_loop_MoveReset2D_allfixed
multi_loop_MoveReset2D_allfixed (T, resets, turns, dist_target, radius_target, agent_step)
MoveReset2D_allfixed
MoveReset2D_allfixed (T, reset, turn, env)
```python
from rl_opts.rl_framework_numba import MoveResetEnv_2D

T = 1000; reset = 5; turn = 2;
env = MoveResetEnv_2D()
rews = MoveReset2D_allfixed(T, reset, turn, env)
```

```python
rews = multi_loop_MoveReset2D_allfixed(T = int(1e4), resets = np.array([500]), turns = np.arange(2, 10),
                                       dist_target = 5, radius_target = 1.5, agent_step = 1.3)
plt.plot(rews[0]/T)
```
Multi target
multi_loop_MoveReset2D_multitarget_allfixed
multi_loop_MoveReset2D_multitarget_allfixed (T, resets, turns, L, Nt, r, step_length, init_position)
MoveReset2D_multitarget_allfixed
MoveReset2D_multitarget_allfixed (T, reset, turn, env)
```python
T = 1000; reset = 5; turn = 2;
env = MoveResetEnv_2D()
rews = MoveReset2D_allfixed(T, reset, turn, env)
```

```python
rews = multi_loop_MoveReset2D_multitarget_allfixed(T = int(1e7), resets = np.array([500]), turns = np.arange(2, 10), L = L,
                                                   Nt = Nt, r = 1, init_position = np.array([L/2, L/2]), step_length = step_length)
plt.plot(rews[0]/T)
```
Projective Simulation agent
BASE agent
PSAgent
PSAgent (*args, **kwargs)
Base class of a Reinforcement Learning agent based on Projective Simulation, with a two-layered network. This class has been adapted from https://github.com/qic-ibk/projectivesimulation
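Schematically, a two-layered PS agent keeps an h-matrix with one column per percept and one row per action: deliberation normalizes the column of the current percept into a probability distribution and samples an action, while learning damps the h-matrix towards its initial value and adds the reward spread over the glow (g) matrix. A rough sketch of these two rules (not the class implementation):

```python
import numpy as np

def ps_deliberate_sketch(h_matrix, percept):
    # Sample an action with probability proportional to the h-values of this percept.
    probs = h_matrix[:, percept] / h_matrix[:, percept].sum()
    return np.random.choice(len(probs), p=probs)

def ps_learn_sketch(h_matrix, g_matrix, reward, gamma_damping):
    # Damp towards the initial value h0 = 1 and reinforce the glowing edges.
    return h_matrix - gamma_damping * (h_matrix - 1.0) + reward * g_matrix
```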
```python
ps = PSAgent(num_actions = 10, num_percepts_list = np.array([15]))
ps.percept_preprocess([0]*ps.num_percepts_list)
ps.probability_distr(0)
observation = [0]*ps.num_percepts_list[0]
ps.deliberate(np.array(observation))
ps.learn(1)
ps.reset_g()
ps.deliberate_fixed_policy(np.array(observation))
```
No fixed policy was given to the agent. The action will be selected randomly.
7
Forager
Forager
Forager (*args, **kwargs)
*Same as PSAGENT but: num_percepts_list -> state_space
state_space : list List where each entry is the state space of each perceptual feature. E.g. [state space of step counter, state space of density of successful neighbours].*
```python
agent = Forager(num_actions = 2, state_space = np.array([np.arange(100)]))
agent.percept_preprocess([0]*agent.num_percepts_list)
agent.probability_distr(0)
observation = [0]*agent.num_percepts_list[0]
agent.deliberate(np.array(observation))
agent.learn(1)
agent.reset_g()
agent.deliberate_fixed_policy(np.array(observation))
agent.act(0)
agent.get_state()
```
No fixed policy was given to the agent. The action will be selected randomly.
array([1])
Forager with efficient H update
We use the formula \(H_{t+i} = (1-\gamma)^i H_t + \gamma H_0 \sum_{j=1}^i(1-\gamma)^{j-1}\)
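A quick numerical check of this closed form, comparing it against i consecutive damping updates H ← (1-γ)H + γH₀ (which is what the efficient agent avoids doing step by step):

```python
import numpy as np

gamma, H0, i = 0.01, 1.0, 50
H = 1.0 + np.random.rand(2, 10)            # some h-matrix H_t

H_loop = H.copy()
for _ in range(i):                         # i explicit damping steps
    H_loop = (1 - gamma) * H_loop + gamma * H0

# closed form: H_{t+i} = (1-gamma)^i H_t + gamma H_0 sum_{j=1}^{i} (1-gamma)^{j-1}
H_closed = (1 - gamma)**i * H + gamma * H0 * np.sum((1 - gamma)**np.arange(i))

assert np.allclose(H_loop, H_closed)
```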
Forager_efficient_H
Forager_efficient_H (*args, **kwargs)
*Same as PSAGENT but: num_percepts_list -> state_space
state_space : list List where each entry is the state space of each perceptual feature. E.g. [state space of step counter, state space of density of successful neighbours].*
Testing
```python
def test_train_loop_Heff(efficient, agent, episodes):

    for i in range(episodes):

        if efficient:
            agent.counter_upd += 1

        state = np.array([i])

        if i % 2 == 0:
            action = 0
        else:
            action = 1

        # here is where glow matrix updates:
        agent.g_matrix = (1 - agent.eta_glow_damping) * agent.g_matrix
        agent.g_matrix[action, i] += 1  # record latest decision in g_matrix

        if i == 2 or i == 6:
            reward = 1
        else:
            reward = 0

        if efficient:
            if reward == 1:
                agent._learn_post_reward(reward)
        else:
            agent.learn(reward)

    return agent
```
Value testing
```python
eps = 100
agent_noopt = Forager(num_actions = 2,
                      state_space = np.array([np.arange(eps)]))
trained_noopt = test_train_loop_Heff(efficient = False, agent = agent_noopt, episodes = eps)

trained_noopt.h_matrix
```

```python
agent_opt = Forager_efficient_H(num_actions = 2,
                                state_space = np.array([np.arange(eps)]))
trained = test_train_loop_Heff(efficient = True, agent = agent_opt, episodes = eps)

f'comparison old and efficient: {(trained.h_matrix-trained_noopt.h_matrix).sum()} ||||| IF value != 0, something is wrong!!!'
```
'comparison old and efficient: 0.0 ||||| IF value != 0, something is wrong!!!'
Forager with efficient G and H update
This version is fully efficient, with closed-form updates of both the G- and the H-matrix.
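The same trick applies to the glow matrix: between updates the glow only decays by a factor (1-η) per step, so the accumulated decay can be applied in a single multiplication (compare the _G_upd_full source shown further below). A small check of the idea:

```python
import numpy as np

eta, steps = 0.1, 25
G = np.random.rand(2, 100)                 # some g-matrix

G_loop = G.copy()
for _ in range(steps):                     # step-by-step damping
    G_loop = (1 - eta) * G_loop

G_lazy = (1 - eta)**steps * G              # single accumulated update

assert np.allclose(G_loop, G_lazy)
```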
Forager_efficient
Forager_efficient (*args, **kwargs)
*Updated version of the FORAGER class, with an efficient update both for the H-matrix and the G-matrix.
size_state_space : np.array num of percepts for each feature*
Value testing
We replicate a training run with the original agent and the efficient one to check that the resulting h-matrices are equal. Note that because deliberation is random, the entry different from 1 in the h-matrix may end up in different rows (actions) for the two agents (but always in the same column, i.e. state).
from rl_opts.rl_framework_numba import Forager_efficient, Forager
```python
gamma, eta = 0.5, 0.5
steps = 5
size_state_space = 10

ag_og = Forager(num_actions = 2, state_space = np.array([np.arange(size_state_space)]),
                gamma_damping = gamma,
                eta_glow_damping = eta
                )

ag_ef = Forager_efficient(num_actions = 2, size_state_space = np.array([size_state_space]),
                          eta_glow_damping = eta, gamma_damping = gamma)
```

```python
for i in range(steps-1):
    ag_og.learn(0)
ag_og.deliberate(np.array([i]))
ag_og.learn(1)
for i in range(steps-1):
    ag_og.learn(0)

ag_ef.N_upd_H = steps-1
ag_ef.N_upd_G = steps-1
ag_ef._learn_post_reward(0)

ag_ef.N_upd_H += 1
ag_ef.N_upd_G += 1
ag_ef.deliberate(np.array([i]))
ag_ef._learn_post_reward(1)

ag_ef.N_upd_H = steps-1
ag_ef.N_upd_G = steps-1
ag_ef._learn_post_reward(0)

ag_og.h_matrix, ag_ef.h_matrix
```
(array([[1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,
1. , 1. ],
[1. , 1. , 1. , 1.0625, 1. , 1. , 1. , 1. ,
1. , 1. ]]),
array([[1. , 1. , 1. , 1. , 1. , 1. , 1. , 1. ,
1. , 1. ],
[1. , 1. , 1. , 1.0625, 1. , 1. , 1. , 1. ,
1. , 1. ]]))
Runtime testing
```python
eps = int(1e4); eta = 0.1
agent_noopt = Forager(num_actions = 2,
                      state_space = np.array([np.arange(eps)]), eta_glow_damping = eta)
agent_opt = Forager_efficient(num_actions = 2,
                              state_space = np.array([np.arange(eps)]), eta_glow_damping = eta)
```
22.7 ms ± 78.8 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
367 ms ± 7.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Forager with action dependent glow and damping
Forager_multi_param
Forager_multi_param (*args, **kwargs)
*Same as Forager_efficient but with different glows and damping factors for each action
size_state_space : np.array num of percepts for each feature*
Test prefactor calculation
```python
eps = int(1e4);
etas = np.array([0.99, 0.001])
gammas = np.array([0.001, 0.001])

agent = Forager_multi_param(num_actions = 2,
                            size_state_space = np.array([10]),
                            eta_glow_damping = etas, gamma_damping = gammas)

agent_og = Forager_efficient(num_actions = 2, size_state_space = np.array([10]),
                             eta_glow_damping = etas[0], gamma_damping = gammas[0])

assert (agent.prefactor_1[0,:] == agent_og.prefactor_1).all()
assert (agent.prefactor_2[0,:] == agent_og.prefactor_2).all()
```
Test update of h_matrix
```python
from rl_opts.rl_framework_numba import rand_choice_nb

etas = np.array([0.001]*2)
gammas = np.array([0.001]*2)

agent = Forager_multi_param(num_actions = 2,
                            size_state_space = np.array([10]),
                            eta_glow_damping = etas, gamma_damping = gammas)

agent_og = Forager_efficient(num_actions = 2, size_state_space = np.array([10]),
                             eta_glow_damping = etas[0], gamma_damping = gammas[0])

h_mat = np.zeros((2, agent.size_state_space[0]))
h_mat[0,:] = 1

# h_mat[0,:] = np.random.randint(2, size = agent.size_state_space[0])
# h_mat[1,:] = np.abs(h_mat[0,:]-1)

agent.h_matrix = h_mat.copy()
agent_og.h_matrix = h_mat.copy()
```
```python
acs = []
for a in [agent, agent_og]:
    a.N_upd_H = 0
    a.N_upd_G = 0
    ac = []
    for i in range(5):
        a.N_upd_H += 1
        a.N_upd_G += 1

        action = a.deliberate(np.array([i]))
        ac.append(action)

        if i == 2 or i == 6:
            reward = 1
        else:
            reward = 0

        if reward == 1:
            a._learn_post_reward(reward)
            a._G_upd_full()
    acs.append(ac)

if acs[0] == acs[1]:
    assert np.sum(agent_og.h_matrix - agent.h_matrix) < 1e-10
else:
    print('actions didnt match :( this may be because of luck')
```
Test different gammas
```python
etas = np.array([0.1]*2)
gammas = np.array([0.1, 0.001])

agent = Forager_multi_param(num_actions = 2,
                            size_state_space = np.array([10]),
                            eta_glow_damping = etas, gamma_damping = gammas)

agent.h_matrix *= 5

agent.N_upd_H = 10
agent.N_upd_G = 10
agent._learn_post_reward(0)

agent.h_matrix
```
array([[2.39471376, 2.39471376, 2.39471376, 2.39471376, 2.39471376,
2.39471376, 2.39471376, 2.39471376, 2.39471376, 2.39471376],
[4.96017952, 4.96017952, 4.96017952, 4.96017952, 4.96017952,
4.96017952, 4.96017952, 4.96017952, 4.96017952, 4.96017952]])
Launch multi agent learning
For TargetEnv
train_loop
train_loop (episodes, time_ep, agent, env)
train_loop_h_efficient
train_loop_h_efficient (episodes, time_ep, agent, env, h_mat_allT=False)
train_loop_full_efficient
train_loop_full_efficient (episodes, time_ep, agent, env, h_mat_allT=False)
run_agents
run_agents (episodes, time_ep, N_agents, Nt=100, L=100, r=0.5, lc=array([[1.], [1.]]), num_agents=1, agent_step=1, destructive_targets=False, lc_distribution='constant', num_actions=2, state_space=array([[0., 1., …, 99.]]), gamma_damping=1e-05, eta_glow_damping=0.1, initial_prob_distr=array([], shape=(2, 0), dtype=float64), policy_type='standard', beta_softmax=3, fixed_policy=array([], shape=(2, 0), dtype=float64), max_no_H_update=1000, efficient_agent=False, h_mat_allT=False)

(The default state_space is a single feature spanning the integers 0 to 99.)
Testing
```python
# Other similar exps: 'previous', 'previous_long'
# Current exp:
EXP = 'previous_pol_t'

# Training spec
TIME_EP = 20
EPISODES = 12
multiplier_agents = 5

# Environment
Nt = 100; L = 100; r = 0.5;
lc_distribution = 'constant'
lcs = [0.6]+np.linspace(1,10,10).tolist()

# Agent
max_counter = int(2e3)
state_space = np.array([np.arange(max_counter)])
gamma_damping = 0.00001
eta_glow_damping = 0.1
initial_prob_distr = (np.array([0.99, 0.01])*np.ones((2, max_counter)).transpose()).transpose()
```
```python
for lc_value in (lcs):

    lc = np.array([[lc_value],[1.0]])

    print(f'starting lc = {lc_value}')
    rews, mats = run_agents(episodes = EPISODES, time_ep = TIME_EP, N_agents = 1, #multiplier_agents*numba.get_num_threads(),
                            Nt = Nt, L = L, r = r,
                            lc = lc, lc_distribution = lc_distribution,
                            state_space = state_space,
                            gamma_damping = gamma_damping,
                            eta_glow_damping = eta_glow_damping,
                            initial_prob_distr = initial_prob_distr,
                            efficient_agent = True,
                            h_mat_allT = True
                            )
    print(f'saving lc = {lc_value}')
    # np.save(f'../../results/constant_lc/EXP_{EXP}/h_mats_lc_{lc_value}.npy', mats)
    # np.save(f'../../results/constant_lc/EXP_{EXP}/rewards_lc_{lc_value}.npy', rews)
```
starting lc = 0.6
saving lc = 0.6
starting lc = 1.0
saving lc = 1.0
starting lc = 2.0
saving lc = 2.0
starting lc = 3.0
saving lc = 3.0
starting lc = 4.0
saving lc = 4.0
starting lc = 5.0
saving lc = 5.0
starting lc = 6.0
saving lc = 6.0
starting lc = 7.0
saving lc = 7.0
starting lc = 8.0
saving lc = 8.0
starting lc = 9.0
saving lc = 9.0
starting lc = 10.0
saving lc = 10.0
Runtime testing
```python
from rl_opts.rl_framework.legacy import run_agents
import numba
import numpy as np

time_ep = 10
# For compiling and checking
run_agents(episodes = 10, time_ep = time_ep, N_agents = numba.get_num_threads(), state_space = np.array([np.linspace(0, time_ep-1, time_ep)]), efficient_agent=False);
```

```python
time_ep = 12000
```

9.86 s ± 44.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

```python
time_ep = 12000
```

2.33 s ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

```python
time_ep = 12000
```

1.06 s ± 14.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
For ResetEnv
1D
train_loop_reset
train_loop_reset (episodes, time_ep, agent, env, h_mat_allT=False, when_save_h_mat=1, reset_after_reward=True)
```python
# from rl_opts.rl_framework_numba import Forager_efficient, ResetEnv_1D, ResetEnv_2D
# import numpy as np

env = ResetEnv_1D(L = 5, D = 1/2)
agent = Forager_efficient(num_actions = 2,
                          size_state_space = np.array([100]))
res = train_loop_reset(100, 100, agent, env)
```

```python
res = train_loop_reset(100, 100, agent, env)
```
Test
```python
from rl_opts.rl_framework_numba import Forager_efficient, ResetEnv_1D, ResetEnv_2D
import numpy as np

time_ep = 10

env = ResetEnv_1D(L = 5, D = 1/2)
agent = Forager_efficient(num_actions = 2,
                          size_state_space = np.array([time_ep+1]))

#initialize environment and agent's counter and g matrix
env.init_env()
agent.agent_state = 0
agent.reset_g()

for t in range(time_ep):
    agent.N_upd_H += 1
    agent.N_upd_G += 1

    #get perception
    state = agent.get_state()

    action = 0 if t != 5 else 1

    percept = agent.percept_preprocess(state)
    agent._G_upd_single_percept(percept, action)

    #act (update counter)
    agent.act(action)

    reward = 0 if t < time_ep - 1 else 1

    agent._learn_post_reward(reward)

agent.g_matrix
```
array([[2., 2., 2., 2., 1., 0., 0., 0., 0., 0., 0.],
[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])
```python
def train_loop_reset(episodes, time_ep, agent, env, h_mat_allT = False, when_save_h_mat = 1, reset_after_reward = True):

    if h_mat_allT:
        policy_t = np.zeros((int(np.ceil(episodes/when_save_h_mat)),
                             agent.h_matrix.shape[-1]))
        idx_policy_save = 0

    save_rewards = np.zeros(episodes)
    gmats = []
    counters = []
    tuples = []
    for ep in range(episodes):

        #initialize environment and agent's counter and g matrix
        agent.agent_state = 0
        agent.reset_g()

        position = 0
        for t in range(time_ep):
            agent.N_upd_H += 1
            agent.N_upd_G += 1

            #get perception
            state = agent.get_state()

            # if we reached the maximum state space, we perform turn action
            if state == agent.h_matrix.shape[-1]:
                action = 1
            # else we do as normal
            else:
                action = agent.deliberate(state)

            #act (update counter)
            agent.act(action)

            #update positions
            if action == 0:
                position += 2*np.random.randint(2)-1
            else:
                position = 0

            # Checking if reward
            if position >= env.L:
                position = 0
                reward = 1
            else:
                reward = 0

            tuples.append([state[0].copy(), action, reward])

            if reward == 1 or agent.N_upd_H == agent.max_no_H_update-1:
                agent._learn_post_reward(reward)
                gmats.append(agent.g_matrix.copy())
                counters.append(state)

            if reset_after_reward == True and reward != 0:
                agent.agent_state = 0

        # Saving
        save_rewards[ep] += reward
        if h_mat_allT and ep % when_save_h_mat == 0:
            policy_t[idx_policy_save] = agent.h_matrix[0,:] / agent.h_matrix.sum(0)
            idx_policy_save += 1

    # return (save_rewards/time_ep, policy_t) if h_mat_allT else (save_rewards/time_ep, agent.h_matrix)
    return gmats, counters, tuples
```
agent._G_upd_full??
```
Signature: agent._G_upd_full()
Source:
    def _G_upd_full(self):
        '''Given the current number of steps without an update, updates the whole G-matrix.
        Then, resets all counters.'''
        self.g_matrix = (1 - self.eta_glow_damping)**(self.N_upd_G - self.last_upd_G) * self.g_matrix
        self.N_upd_G = 0
        self.last_upd_G = np.zeros((self.num_actions, self.num_percepts))
File: /tmp/ipykernel_2749/2916760148.py
Type: method
```
```python
idxl = 0
idxg, idxe = 9, 5
D = 1/2;

Ls = np.arange(5, 11)

gammas = np.logspace(-9, -5.5, 10)
# first round (not all L finished)
etas1 = np.linspace(0.05, 0.18, 10)
# second round
etas2 = np.linspace(0.18, 0.3, 10)
etas = np.append(etas1, etas2)

etas
```
array([0.05 , 0.06444444, 0.07888889, 0.09333333, 0.10777778,
0.12222222, 0.13666667, 0.15111111, 0.16555556, 0.18 ,
0.18 , 0.19333333, 0.20666667, 0.22 , 0.23333333,
0.24666667, 0.26 , 0.27333333, 0.28666667, 0.3 ])
```python
env = ResetEnv_1D(L = Ls[idxl], D = D)
agent = Forager_efficient(num_actions = 2,
                          size_state_space = np.array([50]),
                          gamma_damping = gammas[idxg],
                          eta_glow_damping = etas[idxe],
                          g_update = 'r')
```
```python
from rl_opts.rl_framework_numba import rand_choice_nb

gmats, counters, tuples = train_loop_reset(10, int(1e3), agent, env)
gmats = np.array(gmats)
tuples = np.array(tuples)

idx_reward = np.argwhere(tuples[:,-1] == 1).flatten()
round = tuples[:idx_reward[-1]+2,:]
round[-10:]
```
array([[0, 0, 0],
[1, 1, 0],
[0, 0, 0],
[1, 0, 0],
[2, 0, 0],
[3, 0, 0],
[4, 0, 0],
[5, 0, 0],
[6, 0, 1],
[0, 0, 0]])
```python
plt.plot(gmats[0, 1, :10], label = 'reset')
plt.plot(gmats[0, 0, :10], label = 'continue')
plt.legend()
```
<matplotlib.legend.Legend>
```python
plt.plot(np.bincount(np.array(counters).flatten()))
plt.plot(gmats[-100:,1].mean(0))
plt.plot(agent.h_matrix[1]/agent.h_matrix.sum(0))
```
End test
run_agents_reset
run_agents_reset (episodes, time_ep, N_agents, D=0.5, L=10.0, num_actions=2, size_state_space=array([100]), gamma_damping=1e-05, eta_glow_damping=0.1, g_update='s', initial_prob_distr=array([], shape=(2, 0), dtype=float64), policy_type='standard', beta_softmax=3, fixed_policy=array([], shape=(2, 0), dtype=float64), max_no_H_update=1000, h_mat_allT=False, reset_after_reward=True, num_runs=None)
| | Type | Default | Details |
|---|---|---|---|
| episodes | | | |
| time_ep | | | |
| N_agents | | | |
| D | float | 0.5 | |
| L | float | 10.0 | Environment props |
| num_actions | int | 2 | Agent props |
| size_state_space | ndarray | [100] | |
| gamma_damping | float | 1e-05 | |
| eta_glow_damping | float | 0.1 | |
| g_update | str | s | |
| initial_prob_distr | | [] | |
| policy_type | str | standard | |
| beta_softmax | int | 3 | |
| fixed_policy | | [] | |
| max_no_H_update | int | 1000 | |
| h_mat_allT | bool | False | |
| reset_after_reward | bool | True | |
| num_runs | NoneType | None | When we want N_agents != the number of available cores, we use this to make a few runs over the selected number of cores, given by N_agents. |
Run test
```python
from rl_opts.rl_framework_numba import Forager_efficient, ResetEnv_1D, train_loop_reset

rews, mats = run_agents_reset(5, 100, 5, L = 2, num_runs=2, eta_glow_damping=0);
rews
```
array([[0.01, 0.06, 0.04, 0.04, 0.03],
[0. , 0.01, 0.02, 0.03, 0.04],
[0.03, 0.06, 0.05, 0.07, 0.02],
[0.05, 0. , 0.02, 0.02, 0.04],
[0.03, 0.03, 0. , 0.01, 0.01],
[0.06, 0.06, 0.02, 0.02, 0.03],
[0.02, 0.01, 0.03, 0.01, 0.03],
[0.01, 0.03, 0.03, 0.06, 0.03],
[0.01, 0.02, 0.01, 0.02, 0.04],
[0.05, 0.04, 0.03, 0. , 0.02]])
2D
We need to have the 2D case as a separate function because of a type-unification problem in Numba. You can't have the following, because it fails to compile properly: env_1d and env_2d have different attributes and are therefore different types for the compiler.

```python
if dim == 1:
    env = env_1d
else:
    env = env_2d
```
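A sketch of the workaround used here (with hypothetical names): keep one compiled loop per environment type and branch in plain Python, so Numba never has to unify the two jitclass types.

```python
from numba import njit

@njit
def _loop_1d(env, T):          # compiled against the 1D environment type only
    rewards = 0
    for _ in range(T):
        rewards += env.update_pos()   # assumes the env exposes an update_pos() returning the reward
    return rewards

@njit
def _loop_2d(env, T):          # compiled against the 2D environment type only
    rewards = 0
    for _ in range(T):
        rewards += env.update_pos()
    return rewards

def run(env, T, dim):
    # Dispatch in pure Python: each njit function sees a single concrete type.
    return _loop_1d(env, T) if dim == 1 else _loop_2d(env, T)
```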
run_agents_reset_2D
run_agents_reset_2D (episodes, time_ep, N_agents, dist_target=10.0, radius_target=1.0, D=0.5, num_actions=2, size_state_space=array([100]), gamma_damping=1e-05, eta_glow_damping=0.1, initial_prob_distr=array([], shape=(2, 0), dtype=float64), policy_type='standard', beta_softmax=3, fixed_policy=array([], shape=(2, 0), dtype=float64), max_no_H_update=1000, h_mat_allT=False, when_save_h_mat=1, reset_after_reward=True, g_update='s')
| | Type | Default | Details |
|---|---|---|---|
| episodes | | | |
| time_ep | | | |
| N_agents | | | |
| dist_target | float | 10.0 | |
| radius_target | float | 1.0 | |
| D | float | 0.5 | Environment props |
| num_actions | int | 2 | Agent props |
| size_state_space | ndarray | [100] | |
| gamma_damping | float | 1e-05 | |
| eta_glow_damping | float | 0.1 | |
| initial_prob_distr | | [] | |
| policy_type | str | standard | |
| beta_softmax | int | 3 | |
| fixed_policy | | [] | |
| max_no_H_update | int | 1000 | |
| h_mat_allT | bool | False | |
| when_save_h_mat | int | 1 | |
| reset_after_reward | bool | True | |
| g_update | str | s | |
Testing
```python
from rl_opts.rl_framework_numba import Forager_efficient, ResetEnv_2D

r, h = run_agents_reset_2D(int(1e2), int(1e2), 15, dist_target = 10, radius_target = 1, D = 1,
                           size_state_space = np.array([3]),
                           h_mat_allT = True, when_save_h_mat = 5);
h.shape
```
(15, 20, 3)
```python
np.unique(r.flatten(), return_counts=True)
```
(array([0. , 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008,
0.009]),
array([47569, 53276, 31957, 12400, 3745, 844, 173, 32, 3,
1]))
```python
pol = []
for ha in h:
    pol.append(ha[0]/ha.sum(0))
pol = np.array(pol)

plt.plot(r.mean(0))
plt.plot(r.mean(0))
```
For MoveResetEnv
2D
For base env
train_loop_MoveReset2D
train_loop_MoveReset2D (episodes, time_ep, agent, env, h_mat_allT=False, turn_0_when_reset=False)
```python
from rl_opts.rl_framework_numba import Forager_efficient, MoveResetEnv_2D

agent = Forager_efficient(num_actions = 3, size_state_space = np.array([100, 100]),
                          gamma_damping = np.array([1e-5]*3),
                          eta_glow_damping = np.array([0.1,0.1,0.1]),
                          max_no_H_update = 10)
env = MoveResetEnv_2D(dist_target = 5.1,   # Distance from init position and target
                      radius_target = 1.0, # Radius of the target
                      agent_step = 1)

rews, hmat = train_loop_MoveReset2D(episodes = 100, time_ep = int(1e4), agent = agent, env = env, h_mat_allT = False, turn_0_when_reset=True)
# hmat
```
Normal h_matrix update
run_agents_MoveReset2D
run_agents_MoveReset2D (episodes, time_ep, N_agents, dist_target=5.1, radius_target=1.0, agent_step=1.0, num_actions=3, size_state_space=array([100, 100]), gamma_damping=1e-05, eta_glow_damping=0.1, initial_prob_distr=array([], shape=(2, 0), dtype=float64), policy_type='standard', beta_softmax=3, fixed_policy=array([], shape=(2, 0), dtype=float64), max_no_H_update=1000, efficient_agent=False, h_mat_allT=False)
```python
r, h = run_agents_MoveReset2D(episodes = 1000, time_ep = 1000, N_agents = 10)
h.shape
```
(10, 3, 10000)
```python
mat = h.mean(0)
size_state_space = (100, 100)

fig, ax = plt.subplots(1, 3, figsize = (9, 3), tight_layout = True)
_, ax2 = plt.subplots(1, 4, figsize = (9, 3), tight_layout = True)
for i, action in enumerate(['continue', 'reset', 'turn']):
    ax[i].matshow((mat[i]/mat.sum(0)).reshape(size_state_space).transpose())
    ax[i].set_ylabel('Rotate counter')
    ax[i].set_title(action)

    ax2[i].plot((mat[i]/mat.sum(0))[:30])
    ax2[i].axhline(1/3, c = 'k', ls = '--', alpha = 0.2, zorder = -1)
    ax2[-1].plot((mat[i]/mat.sum(0))[:30])

ax[-1].set_xlabel('Reset counter')
```
Text(0.5, 0, 'Reset counter')
```python
plt.plot(r.mean(0))
```
Multi param h_matrix update
```python
# instead of a single gamma_damping = 0.00001, eta_glow_damping = 0.1,
# we now give one value per action:
gamma_damping = np.array([1e-5, 2e-5, 3e-5])
eta_glow_damping = np.array([0.1, 0.2, 0.3])

assert gamma_damping.shape[0] == 3 and eta_glow_damping.shape[0] == 3, "
```
run_agents_MoveReset2D_multiparam
run_agents_MoveReset2D_multiparam (episodes, time_ep, N_agents, dist_target=5.1, radius_target=1.0, agent_step=1.0, num_actions=3, size_state_space=array([100, 100]), gamma_damping=array([1.e-05, 2.e-05, 3.e-05]), eta_glow_damping=array([0.1, 0.2, 0.3]), initial_prob_distr=array([], shape=(2, 0), dtype=float64), policy_type='standard', beta_softmax=3, fixed_policy=array([], shape=(2, 0), dtype=float64), max_no_H_update=1000, efficient_agent=False, h_mat_allT=False, turn_0_when_reset=False)
```python
from rl_opts.rl_framework_numba import Forager_multi_param, train_loop_MoveReset2D

r, h = run_agents_MoveReset2D_multiparam(episodes = 1000, time_ep = 1000, N_agents = 10)
h.shape
```
(10, 3, 10000)
```python
mat = h.mean(0)
size_state_space = (100, 100)

fig, ax = plt.subplots(1, 3, figsize = (9, 3), tight_layout = True)
_, ax2 = plt.subplots(1, 4, figsize = (9, 3), tight_layout = True)
for i, action in enumerate(['continue', 'reset', 'turn']):
    ax[i].matshow((mat[i]/mat.sum(0)).reshape(size_state_space).transpose())
    ax[i].set_ylabel('Rotate counter')
    ax[i].set_title(action)

    ax2[i].plot((mat[i]/mat.sum(0))[:30])
    ax2[i].axhline(1/3, c = 'k', ls = '--', alpha = 0.2, zorder = -1)
    ax2[-1].plot((mat[i]/mat.sum(0))[:30])

ax[-1].set_xlabel('Reset counter')
```
Text(0.5, 0, 'Reset counter')
```python
plt.plot(r.mean(0))
```
For others
train_loop_MoveReset
train_loop_MoveReset (episodes, time_ep, agent, env, h_mat_allT=False)
run_agents_MoveReset
run_agents_MoveReset (episodes, time_ep, N_agents, Nt=100, L=100, r=0.5, lc=array([[1.], [1.]]), num_agents=1, agent_step=1, destructive_targets=False, lc_distribution='constant', init_positions=array([[0., 0.]]), num_actions=2, size_state_space=array([100, 100]), gamma_damping=1e-05, eta_glow_damping=0.1, initial_prob_distr=array([], shape=(2, 0), dtype=float64), policy_type='standard', beta_softmax=3, fixed_policy=array([], shape=(2, 0), dtype=float64), max_no_H_update=1000, efficient_agent=False, h_mat_allT=False)
```python
#### For compiling and checking
rews, mats = run_agents_MoveReset(episodes = int(1e2), time_ep = int(1e3), num_actions = 3,
                                  N_agents = 1, size_state_space = np.array([10, 10]),
                                  efficient_agent = False, init_positions = np.array([[10, 10.1]]));
rews
```
array([[ 4., 0., 1., 16., 1., 0., 0., 1., 0., 4., 0., 4., 0.,
13., 0., 1., 2., 0., 0., 0., 11., 30., 6., 0., 12., 7.,
11., 4., 6., 2., 2., 5., 4., 4., 5., 0., 0., 29., 2.,
6., 2., 2., 10., 0., 4., 2., 0., 0., 15., 46., 2., 3.,
0., 0., 15., 0., 1., 6., 3., 4., 3., 13., 2., 2., 4.,
4., 0., 8., 0., 2., 2., 3., 4., 5., 4., 2., 3., 14.,
2., 32., 0., 17., 7., 26., 3., 2., 2., 0., 2., 6., 0.,
4., 8., 0., 3., 1., 26., 3., 3., 5.]])
How to read the h-matrix:
```python
agent = Forager_efficient(num_actions = 3, size_state_space = np.array([100, 100]))
```
Convoluted way of doing it (see below for better option):
```python
size_state_space = np.array([100, 100])
mat_2d = np.zeros(size_state_space)
for c_rotate in range(size_state_space[0]):
    for c_reset in range(size_state_space[1]):
        mat_2d[c_rotate, c_reset] = (mat[0]/mat.sum(0))[agent.percept_preprocess(np.array([c_rotate, c_reset]))]

plt.matshow(mat_2d[:10, :10])
plt.ylabel('Rotate counter')
plt.xlabel('Reset counter')
```
Text(0.5, 0, 'Reset counter')
Better way: reshape + transpose!
```python
mat = mats[0]

fig, ax = plt.subplots(1, 3, figsize = (9, 3))
for i, action in enumerate(['turn', 'reset', 'continue']):
    ax[i].matshow((mat[i]/mat.sum(0)).reshape(10,10).transpose()[:10,:10])
    ax[i].set_ylabel('Rotate counter')
    ax[i].set_title(action)

ax[-1].set_xlabel('Reset counter')
```
Text(0.5, 0, 'Reset counter')