Imitation learning

In this brief tutorial we show how to reproduce Figures 6 and 7c of the Appendix of our paper. The goal is to show that projective simulation (PS) can imitate the policy of an agent whose step lengths follow a given distribution. We focus on two distributions: Lévy and bi-exponential.

First, let's load the needed libraries and functions. Note that rl_opts needs to be already installed (see the instructions in the README file).

```python
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from rl_opts.imitation import PS_imitation
from rl_opts.analytics import pdf_powerlaw, pdf_multimode, pdf_discrete_sample, get_policy_from_dist
```
Lévy distributions (Fig. 6a)
We consider distributions of the type \(P(L)=L^{-1-\beta}\), for various \(\beta\). To understand how the following code works, you can check the example shown in the documentation of PS_imitation. Here we do the same, but looping over several values of \(\beta\).
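For intuition, sampling from such a power law on a finite support amounts to discretizing \(P(L) \propto L^{-1-\beta}\), normalizing it, and drawing from the resulting categorical distribution. Below is a minimal sketch of that idea; sample_powerlaw_steps is a hypothetical helper of our own, not part of rl_opts, whose role is played in the tutorial by pdf_powerlaw together with pdf_discrete_sample.

```python
import numpy as np

def sample_powerlaw_steps(beta, L_values, num_samples, rng=None):
    """Sample discrete step lengths from P(L) ~ L^(-1-beta) on the given support."""
    rng = np.random.default_rng() if rng is None else rng
    pdf = L_values.astype(float) ** (-1 - beta)   # unnormalized power law
    pdf /= pdf.sum()                              # normalize on the finite support
    flat = rng.choice(L_values, size=int(np.prod(num_samples)), p=pdf)
    return flat.reshape(num_samples)              # e.g. shape (EPOCHS, NUM_STEPS)

# Example: 5 epochs of 10 steps each, with beta = 1
steps_example = sample_powerlaw_steps(beta=1.0, L_values=np.arange(1, 100),
                                      num_samples=(5, 10))
```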
```python
NUM_STATES = 100   # size of the state space
EPOCHS = 1000      # number of epochs
NUM_STEPS = 10000  # number of learning steps per episode

betas = [0.5, 1, 1.5, 2]
hmatrix_pw = np.zeros((len(betas), NUM_STATES))

for idxb, beta in enumerate(tqdm(betas)):

    # For every beta, we sample steps from the corresponding powerlaw (Levy dist.)
    steps = pdf_discrete_sample(beta = beta,
                                pdf_func = pdf_powerlaw,
                                L = np.arange(1, NUM_STATES),
                                num_samples = (EPOCHS, NUM_STEPS))

    # We define the imitator and train it
    imitator = PS_imitation(num_states = NUM_STATES,
                            eta = int(1e-7),
                            gamma = 0)
    for e in range(EPOCHS):
        imitator.reset()
        for s in steps[e]:
            imitator.update(length = s)

    # We only save the turn probability
    hmatrix_pw[idxb] = imitator.h_matrix[1] / imitator.h_matrix.sum(0)
```
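A way to picture the update performed inside the loop: in the counter state space of the paper, an expert step of length L corresponds to choosing to continue at counters n = 1, ..., L-1 and to turn at counter n = L, and the imitator reinforces exactly those state-action pairs in its h-matrix. The toy class below only illustrates this bookkeeping under that assumption; it is not the actual PS_imitation implementation (in particular, it ignores eta and gamma).

```python
import numpy as np

class TinyImitator:
    """Toy two-action (continue/turn) h-matrix updated from expert step lengths."""
    def __init__(self, num_states):
        # h_matrix[0, n]: weight of "continue" at counter n; h_matrix[1, n]: weight of "turn"
        self.h_matrix = np.ones((2, num_states))

    def update(self, length):
        # An expert step of length L implies "continue" at counters 1..L-1 ...
        self.h_matrix[0, 1:length] += 1
        # ... and "turn" at counter L.
        self.h_matrix[1, length] += 1

imitator_toy = TinyImitator(num_states=100)
for s in [3, 7, 3, 12]:                      # a few expert step lengths
    imitator_toy.update(s)
turn_prob = imitator_toy.h_matrix[1] / imitator_toy.h_matrix.sum(0)
```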
Now hmatrix_pw contains the turn probability of the imitator agent for every \(\beta\). We can plot this and compare it with the theoretical prediction, calculated by means of the get_policy_from_dist function.
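The theoretical policy follows from the step-length distribution through the discrete hazard function, \(\pi_s(\Rsh|n) = P(L = n)/P(L \geq n)\). Assuming this is what get_policy_from_dist encodes (it appears to return the continue probability, which is why the plot below uses 1 - theory), here is a minimal sketch of the computation with a hypothetical helper of our own:

```python
import numpy as np

def turn_probability_from_pdf(pdf_values):
    """Discrete hazard: probability of turning at counter n given no turn before,
    i.e. P(L = n) / P(L >= n), computed from a (possibly unnormalized) pdf."""
    pdf_values = np.asarray(pdf_values, dtype=float)
    survival = np.cumsum(pdf_values[::-1])[::-1]   # P(L >= n), up to normalization
    return pdf_values / survival

# Example: power law with beta = 1 on L = 1..999
L_values = np.arange(1, 1000)
pdf = L_values.astype(float) ** (-2.0)
pi_turn = turn_probability_from_pdf(pdf)          # compare with 1 - theory in the plots
```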
```python
fig, ax_pw = plt.subplots(figsize = (3,3))
color = plt.cm.plasma(np.linspace(0,1,len(betas)+1))

for idx, (h, beta) in enumerate(zip(hmatrix_pw, betas)):
    #---- Analytical solution ----#
    theory = get_policy_from_dist(n_max = NUM_STATES,
                                  func = pdf_powerlaw,
                                  beta = beta)
    ax_pw.plot(np.arange(2, NUM_STATES+1), 1-np.array(theory[1:]), c = color[idx])

    #---- Numerical solution ----#
    ax_pw.plot(np.arange(2, NUM_STATES+2), h, 'o',
               c = color[idx], label = fr'$\beta$ = {beta}', alpha = 0.8,
               markeredgecolor='None', lw = 0.05)

#---- Plot features ----#
plt.setp(ax_pw, xlim = (1.8, 30), ylim = (0.0, 1.01),
         xlabel = r'$n$', ylabel = r'$\pi_s(\Rsh|n)$',
         yticks = np.round(np.arange(0.2, 1.01, 0.2),1),
         yticklabels = np.round(np.arange(0.2, 1.01, 0.2),1).astype(str),
         xscale = 'log')
ax_pw.plot(10, 10, label = 'Theory', c = 'k')
ax_pw.legend()
```
Bi-exponentials (Fig. 6b)
We consider here distributions of the form \[ \Pr(L) = \sum_{i=1,2} \omega_i (1-e^{-1/\lambda_i}) e^{-(L-1)/\lambda_i} \, , \] with \(\omega = [0.94, 0.06]\), \(\lambda_2 = 5000\) and varying \(\lambda_1\).
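Written out explicitly, the distribution above can be evaluated with a few lines of NumPy. The helper below is only an illustration of the formula (in the tutorial this role is played by the library's pdf_multimode):

```python
import numpy as np

def pdf_biexponential(L, omegas, lambdas):
    """P(L) = sum_i omega_i * (1 - exp(-1/lambda_i)) * exp(-(L-1)/lambda_i), for integer L >= 1."""
    L = np.asarray(L, dtype=float)
    pdf = np.zeros_like(L)
    for omega, lam in zip(omegas, lambdas):
        pdf += omega * (1 - np.exp(-1 / lam)) * np.exp(-(L - 1) / lam)
    return pdf

# Example with the parameters used below: omega = [0.94, 0.06], lambda_1 = 0.6, lambda_2 = 5000
p = pdf_biexponential(np.arange(1, 100), omegas=[0.94, 0.06], lambdas=[0.6, 5000.0])
```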
```python
NUM_STATES = 100  # size of the state space
EPOCHS = 100      # number of epochs
NUM_STEPS = 1000  # number of learning steps per episode

# Bi-exponential parameters (lambda_1 will vary)
omegas = np.array([0.94, 0.06])
lambdas = np.array([0, 5000]).astype(float)
lambdas_1 = [0.6, 0.6*2, 0.6*8, 0.6*16]

# Array saving the results
hmatrix_bi = np.zeros((len(lambdas_1), NUM_STATES))

for idx_l, lambda_1 in enumerate(tqdm(lambdas_1)):
    lambdas[0] = lambda_1
    steps = pdf_discrete_sample(pdf_func = pdf_multimode,
                                lambdas = lambdas,
                                probs = omegas,
                                L = np.arange(1, NUM_STATES),
                                num_samples = (EPOCHS, NUM_STEPS))

    imitator = PS_imitation(num_states = NUM_STATES,
                            eta = int(1e-7),
                            gamma = 0)

    for e in range(EPOCHS):
        imitator.reset()
        for s in steps[e]:
            imitator.update(length = s)

    # We only save the turn probability
    hmatrix_bi[idx_l] = imitator.h_matrix[1] / imitator.h_matrix.sum(0)
```
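As an optional sanity check, one can compare the empirical distribution of the steps sampled in the last iteration with the target bi-exponential, truncated to the sampling support and renormalized. The sketch below reuses the illustrative pdf_biexponential helper defined above, together with the steps and lambdas arrays left over from the loop:

```python
# Optional sanity check: empirical step-length distribution vs. the target pdf
# (truncated to the sampling support L = 1..NUM_STATES-1 and renormalized).
lengths, counts = np.unique(steps, return_counts=True)
support = np.arange(1, NUM_STATES)
target = pdf_biexponential(support, omegas, lambdas)
target /= target.sum()                      # renormalize on the finite support

fig_check, ax_check = plt.subplots(figsize=(3, 3))
ax_check.loglog(lengths, counts / counts.sum(), 'o', label='sampled')
ax_check.loglog(support, target, '-', label='target pdf')
ax_check.set(xlabel=r'$L$', ylabel=r'$P(L)$')
ax_check.legend()
```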
```python
fig, ax_bi = plt.subplots(figsize = (4,4))
color = plt.cm.inferno(np.linspace(0,1,len(lambdas_1)+1))

############# Bi-exponential #################
for idx, (lambda_1, h) in enumerate(zip(lambdas_1, hmatrix_bi)):
    #---- Analytical solution ----#
    lambdas[0] = lambda_1
    theory = get_policy_from_dist(n_max = NUM_STATES,
                                  func = pdf_multimode,
                                  lambdas = lambdas,
                                  probs = omegas)
    ax_bi.plot(np.arange(2, NUM_STATES+1), 1-np.array(theory[1:]), c = color[idx])

    #---- Numerical solution ----#
    ax_bi.plot(np.arange(2, NUM_STATES+2), h, 'o',
               c = color[idx],
               label = fr'$d_1$ = {np.round(lambda_1,1)}',
               alpha = 0.8, markeredgecolor='None')

#---- Plot features ----#
plt.setp(ax_bi, xlim = (1.8, 30), ylim = (0.0, 1.01),
         xlabel = r'$n$', ylabel = r'$\pi_s(\Rsh|n)$',
         yticks = np.round(np.arange(0.2, 1.01, 0.2),1),
         yticklabels = np.round(np.arange(0.2, 1.01, 0.2),1).astype(str),
         xscale = 'log')
ax_bi.plot(10, 10, label = 'Theory', c = 'k')
ax_bi.legend()
```
Effect of a cutoff \(L_{max}\) (Fig. 7c)
In our paper we explain that introducing a cutoff in the distribution used by the expert in the imitation scheme affects the resulting policy of the agent. Here we show this effect, using one of the previous bi-exponential policies as an example.
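The effect can be anticipated before running anything: truncating the distribution at \(L_{max}\) and renormalizing modifies the survival probability \(P(L \geq n)\), and hence the turn probability (the discrete hazard), most strongly for counters close to the cutoff, where it is forced up to 1. A minimal sketch of this argument, reusing the illustrative pdf_biexponential and turn_probability_from_pdf helpers from above (our own names, independent of the code below):

```python
# Turn probability implied by the bi-exponential, with and without a hard cutoff at L_max.
support = np.arange(1, 1000)
pdf_full = pdf_biexponential(support, omegas=[0.94, 0.06], lambdas=[0.6, 5000.0])

L_max_example = 150
pdf_cut = np.where(support <= L_max_example, pdf_full, 0.0)   # hard cutoff
pdf_cut /= pdf_cut.sum()                                      # renormalize

pi_no_cutoff = turn_probability_from_pdf(pdf_full)
pi_cutoff = turn_probability_from_pdf(pdf_cut[:L_max_example])  # reaches 1 at the cutoff
```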
```python
# Simulation parameters
NUM_STATES = 1000  # size of the state space
EPOCHS = 100       # number of epochs
NUM_STEPS = 1000   # number of learning steps per episode

# Distribution parameters
omegas = np.array([0.94, 0.06])
lambdas = np.array([0.6, 5000])

# Get theoretical policy (without cutoff)
theory_nocutoff = get_policy_from_dist(n_max = NUM_STATES,
                                       func = pdf_multimode,
                                       lambdas = lambdas,
                                       probs = omegas)

# Setting a max step length
L_cutoffs = [30, 150, 500, 2000, 10000]

# To make the loop more efficient, we sample now all steps,
# which we will then cut if they are bigger than L_cutoff
steps_og = pdf_discrete_sample(pdf_func = pdf_multimode,
                               lambdas = lambdas,
                               probs = omegas,
                               L_max = NUM_STATES,
                               num_samples = EPOCHS*NUM_STEPS)

hmatrix_co = np.zeros((len(L_cutoffs), NUM_STATES))
for idx_c, L_cutoff in enumerate(tqdm(L_cutoffs)):
    # Copy steps we sampled above and apply cutoff.
    # We re-generate the cut steps
    steps = steps_og.copy()
    while np.max(steps) > L_cutoff:
        steps[steps > L_cutoff] = pdf_discrete_sample(pdf_func = pdf_multimode,
                                                      lambdas = lambdas,
                                                      probs = omegas,
                                                      L_max = NUM_STATES,
                                                      num_samples = len(steps[steps > L_cutoff]))
    steps = steps.reshape(EPOCHS, NUM_STEPS)

    # Define PS imitator
    imitator = PS_imitation(num_states = NUM_STATES,
                            eta = int(1e-7),
                            gamma = 0)
    # Training
    for e in range(EPOCHS):
        imitator.reset()
        for s in steps[e]:
            imitator.update(length = s)
    # Saving
    hmatrix_co[idx_c] = imitator.h_matrix[1] / imitator.h_matrix.sum(0)
```
```python
fig, ax_co = plt.subplots(figsize = (6, 3))
color = plt.cm.cividis(np.linspace(0,1,len(L_cutoffs)+1))

for idx, (h, L_cutoff) in enumerate(zip(hmatrix_co, L_cutoffs)):
    #---- Numerical solutions ----#
    ax_co.plot(np.arange(2, NUM_STATES+2), h, 'o',
               c = color[idx], label = r'$L_{max}$ = '+f'{L_cutoff}',
               alpha = 0.8, markeredgecolor='None', rasterized=True)
    ax_co.plot(np.arange(2, NUM_STATES+2), h,
               c = color[idx], alpha = 0.2)

#---- Analytical solution ----#
ax_co.plot(np.arange(2, NUM_STATES+1), 1-theory_nocutoff[1:], '-',
           c = 'k', alpha = 0.8, label = r'$L_{max}\rightarrow \infty$')

#---- Plot features ----#
ax_co.axhline(0.5, c = 'k', ls = '--', alpha = 0.5, zorder = -1)
ax_co.text(1.5, 0.52, r'$\pi_0$', alpha = 0.5)
plt.legend(loc = 'upper right', fontsize = 8)
plt.setp(ax_co, xlabel = r'$n$', ylabel = r'$\pi(\Rsh|n)$', xscale = 'log')
```