import torch as th
from torch import nn
from torch.optim import Adam, RMSprop
import numpy as np
from MARL.common.Memory import ReplayMemory
from MARL.common.Model import QNetwork
from MARL.common.utils import identity, to_tensor_var
class DQN:
"""
DQN agent
using pytorch model approximation based method
- take exploration action, expect epsilon value or use decay_epsilon()
- save experiences to replay memory
- train model and update values on batch sample
- save model
"""
def __init__(self, state_dim, action_dim,
memory_capacity=10000, batch_size=100,
reward_gamma=0.99, reward_scale=1.,
target_update_freq=30,
actor_hidden_size=128, actor_output_act=identity,
critic_loss="mse", actor_lr=0.001,
optimizer_type="rmsprop", max_grad_norm=0.5,
epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=0.998,
use_cuda=True):
self.state_dim = state_dim
self.action_dim = action_dim
self.reward_gamma = reward_gamma
self.reward_scale = reward_scale
self.memory = ReplayMemory(memory_capacity)
self.actor_hidden_size = actor_hidden_size
self.actor_output_act = actor_output_act
self.critic_loss = critic_loss
self.actor_lr = actor_lr
self.optimizer_type = optimizer_type
self.max_grad_norm = max_grad_norm
self.batch_size = batch_size
self.target_update_freq = target_update_freq
# params for epsilon greedy
self.epsilon = epsilon_start
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.train_count = 0
self.use_cuda = use_cuda and th.cuda.is_available()
self.actor = QNetwork(self.state_dim, self.actor_hidden_size,
self.action_dim)
self.target = QNetwork(self.state_dim, self.actor_hidden_size,
self.action_dim)
self.update_target()
if self.optimizer_type == "adam":
self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
elif self.optimizer_type == "rmsprop":
self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr)
if self.use_cuda:
self.actor.cuda()
self.target.cuda()
# train on a sample batch
def learn(self):
batch = self.memory.sample(self.batch_size)
states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
actions_var = to_tensor_var(batch.actions, self.use_cuda, "long").view(-1, 1)
rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
next_states_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)
dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)
# compute Q(s_t, a): the model computes Q(s_t), then we select the
# columns of actions taken
current_q = self.actor(states_var).gather(1, actions_var)
# compute V(s_{t+1}) for all next states and all actions,
# and we then take max_a { V(s_{t+1}) }
next_state_action_values = self.target(next_states_var).detach()
next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)
# compute target q by: r + gamma * max_a { V(s_{t+1}) }
target_q = self.reward_scale * rewards_var + self.reward_gamma * next_q * (1. - dones_var)
# update value network
self.actor_optimizer.zero_grad()
if self.critic_loss == "huber":
loss = th.nn.functional.smooth_l1_loss(current_q, target_q)
else:
loss = th.nn.MSELoss()(current_q, target_q)
loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
self.actor_optimizer.step()
self.train_count += 1
if self.train_count % self.target_update_freq == 0:
self.update_target()
def shared_learning(self, n_agents, agent_index, shared_batch_sample):
states_var = to_tensor_var(shared_batch_sample.states, self.use_cuda).view(-1, self.state_dim * n_agents)
actions_var = to_tensor_var(shared_batch_sample.actions, self.use_cuda, "long").view(-1, n_agents, 1)
rewards_var = to_tensor_var(shared_batch_sample.rewards, self.use_cuda).view(-1, n_agents, 1)
next_states_var = to_tensor_var(shared_batch_sample.next_states, self.use_cuda).view(-1, self.state_dim * n_agents)
dones_var = to_tensor_var(shared_batch_sample.dones, self.use_cuda).view(-1, n_agents, 1)
# compute Q(s_t, a): the model computes Q(s_t), then we select the
# columns of the actions taken by this agent
current_q = self.actor(states_var).gather(1, actions_var[:, agent_index, :])
# compute V(s_{t+1}) for all next states and all actions,
# and we then take max_a { Q(s_{t+1}, a) }
next_state_action_values = self.target(next_states_var).detach()
next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)
# compute target q by: r + gamma * max_a { Q(s_{t+1}, a) }, indexed for this agent
target_q = self.reward_scale * rewards_var[:, agent_index, :] + self.reward_gamma * next_q * (1. - dones_var[:, agent_index, :])
# update value network
self.actor_optimizer.zero_grad()
if self.critic_loss == "huber":
loss = th.nn.functional.smooth_l1_loss(current_q, target_q)
else:
loss = th.nn.MSELoss()(current_q, target_q)
loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
self.actor_optimizer.step()
self.train_count += 1
if self.train_count % self.target_update_freq == 0:
self.update_target()
def update_target(self):
self.target.load_state_dict(self.actor.state_dict())
# choose an action based on state with random noise added for exploration in training
def exploration_action(self, state, epsilon=None):
if epsilon is not None:
self.epsilon = epsilon
else:
self.decay_epsilon()
if np.random.rand() < self.epsilon:
action = np.random.choice(self.action_dim)
else:
action = self.action(state)
return action
# choose an action based on state for execution
def action(self, state):
state_var = to_tensor_var([state], self.use_cuda)
state_action_value_var = self.actor(state_var)
if self.use_cuda:
state_action_value = state_action_value_var.data.cpu().numpy()[0]
else:
state_action_value = state_action_value_var.data.numpy()[0]
action = np.argmax(state_action_value)
return action
def remember(self, state, action, reward, new_state, done):
self.memory.push(state, action, reward, new_state, done)
def decay_epsilon(self):
# decrease the exploration rate epsilon over time
if self.epsilon > self.epsilon_end:
self.epsilon *= self.epsilon_decay
self.epsilon = max(self.epsilon_end, self.epsilon)
def save(self, global_step, out_dir='/model'):
actor_file_path = out_dir + "/actor_.pt"
th.save({'global_step': global_step,
'model_state_dict': self.actor.state_dict(),
'optimizer_state_dict': self.actor_optimizer.state_dict()},
actor_file_path)
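# --- usage sketch (illustration only, not part of the original API) ---
# A minimal training loop for the DQN agent above, driven by a toy random "environment"
# so it stays self-contained; state_dim/action_dim and the step count are arbitrary assumptions.
if __name__ == "__main__":
    agent = DQN(state_dim=4, action_dim=2, batch_size=32, use_cuda=False)
    state = np.random.rand(4)
    for step in range(500):
        action = agent.exploration_action(state)       # epsilon-greedy action
        next_state = np.random.rand(4)                  # stand-in for env.step()
        reward = float(np.random.rand())
        done = (step % 100 == 99)
        agent.remember(state, action, reward, next_state, done)
        state = np.random.rand(4) if done else next_state
        if len(agent.memory) >= agent.batch_size:
            agent.learn()                               # sample a batch and do a DQN update
    # save(), then restore from the checkpoint dict it writes (keys match those used above)
    agent.save(global_step=500, out_dir=".")
    checkpoint = th.load("./actor_.pt")
    agent.actor.load_state_dict(checkpoint['model_state_dict'])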
import random
import numpy as np
import torch as th
from torch import nn
from torch.optim import Adam, RMSprop
from MARL.common.Memory import ReplayMemory
from MARL.common.Model import CriticNetwork, ActorNet
from MARL.common.utils import entropy, index_to_one_hot, to_tensor_var
class A2C:
"""
A2C agent
using pytorch model approximation learning based method
- take exploration action, expect epsilon value or use decay_epsilon()
- save experiences to replay memory
- train actor critic model
- Actor takes state as input
- Critic takes both state and action as input
- save model
"""
def __init__(self, state_dim, action_dim,
memory_capacity=10000,
reward_gamma=0.99, reward_scale=1.,
actor_hidden_size=32, critic_hidden_size=32,
actor_output_act=nn.functional.softmax, critic_loss="mse",
actor_lr=0.001, critic_lr=0.001,
optimizer_type="rmsprop", entropy_reg=0.01,
max_grad_norm=0.5, batch_size=100,
epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=0.003,
marl_training_strategy="concurrent", use_cuda=True):
self.state_dim = state_dim
self.action_dim = action_dim
self.reward_gamma = reward_gamma
self.reward_scale = reward_scale
self.memory = ReplayMemory(memory_capacity)
self.actor_hidden_size = actor_hidden_size
self.critic_hidden_size = critic_hidden_size
self.actor_output_act = actor_output_act
self.critic_loss = critic_loss
self.actor_lr = actor_lr
self.critic_lr = critic_lr
self.optimizer_type = optimizer_type
self.entropy_reg = entropy_reg
self.max_grad_norm = max_grad_norm
self.batch_size = batch_size
self.target_tau = 0.01
# params for epsilon greedy
self.epsilon = epsilon_start
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.use_cuda = use_cuda and th.cuda.is_available()
self.marl_training_strategy = marl_training_strategy
self.actor = ActorNet(self.state_dim, actor_hidden_size, self.action_dim)
self.critic = CriticNetwork(self.state_dim, self.action_dim, self.critic_hidden_size, 1)
if self.optimizer_type == "adam":
self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
elif self.optimizer_type == "rmsprop":
self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr)
self.critic_optimizer = RMSprop(self.critic.parameters(), lr=self.critic_lr)
if self.use_cuda:
self.actor.cuda()
self.critic.cuda()
# train on a roll_out batch
def learn(self):
"""
Note:
- use learn() after pushing some experiences to the replay memory
- if the environment is multi-agent with centralized training, consider using shared_learning()
"""
assert self.marl_training_strategy == "concurrent"
batch = self.memory.sample(self.batch_size)
states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)
rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
# update actor network
self.actor_optimizer.zero_grad()
action_log_probs = self.actor(states_var)
entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
action_log_probs = th.sum(action_log_probs * actions_var, 1)
values = self.critic(states_var, actions_var)
advantages = rewards_var - values.detach()
pg_loss = -th.mean(action_log_probs * advantages)
actor_loss = pg_loss - entropy_loss * self.entropy_reg
actor_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
self.actor_optimizer.step()
# update critic network
self.critic_optimizer.zero_grad()
target_values = rewards_var
if self.critic_loss == "huber":
critic_loss = nn.functional.smooth_l1_loss(values, target_values)
else:
critic_loss = nn.MSELoss()(values, target_values)
critic_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
self.critic_optimizer.step()
def shared_learning(self,
n_agents,
agent_index,
shared_critic,
shared_critic_optim,
shared_batch_sample):
"""
centralized learning for N agents
update and synchronize the shared critic network parameters during the learning process.
@mahmoudtaouti
"""
assert self.marl_training_strategy == "centralized"
states_var = to_tensor_var(shared_batch_sample.states, self.use_cuda).view(-1, n_agents, self.state_dim)
one_hot_actions = [index_to_one_hot(a, self.action_dim) for a in shared_batch_sample.actions]
actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, n_agents, self.action_dim)
rewards_var = to_tensor_var(shared_batch_sample.rewards, self.use_cuda).view(-1, n_agents, 1)
whole_states_var = states_var.view(-1, n_agents * self.state_dim)
whole_actions_var = actions_var.view(-1, n_agents * self.action_dim)
# update actor network
self.actor_optimizer.zero_grad()
action_log_probs = self.actor(states_var[:, agent_index, :])
entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
action_log_probs = th.sum(action_log_probs * actions_var[:, agent_index, :], 1)
values = shared_critic(whole_states_var, whole_actions_var)
# calculate advantages
advantages = rewards_var[:, agent_index, :] - values.detach()
pg_loss = -th.mean(action_log_probs * advantages)
actor_loss = pg_loss - entropy_loss * self.entropy_reg
actor_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
self.actor_optimizer.step()
# update critic network
shared_critic_optim.zero_grad()
target_values = rewards_var[:, agent_index, :]
critic_loss = nn.MSELoss()(values, target_values)
critic_loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm_(shared_critic.parameters(), self.max_grad_norm)
shared_critic_optim.step()
# predict softmax action based on state
def _softmax_action(self, state):
state_var = to_tensor_var([state], self.use_cuda)
softmax_action_var = th.exp(self.actor(state_var))
# dist = th.distributions.Categorical(probs=state_var)
# dist = dist.sample()
# action = dist.detach().data.numpy()[0]
if self.use_cuda:
softmax_action = softmax_action_var.data.cpu().numpy()[0]
else:
softmax_action = softmax_action_var.data.numpy()[0]
return softmax_action
# choose an action based on state with random noise added for exploration in training
def exploration_action(self, state, epsilon=None):
if epsilon is not None:
self.epsilon = epsilon
else:
self.decay_epsilon()
softmax_action = self._softmax_action(state)
# epsilon-greedy: with probability epsilon pick a random action instead of the greedy one
if np.random.rand() < self.epsilon:
action = np.random.choice(self.action_dim)
else:
action = np.argmax(softmax_action)
return action
# choose an action based on state for execution
def action(self, state):
softmax_action = self._softmax_action(state)
action = np.argmax(softmax_action)
return action
def remember(self, state, action, reward, new_state, done):
self.memory.push(state, action, reward, new_state, done)
def decay_epsilon(self):
# decrease the exploration rate epsilon over time
if self.epsilon > self.epsilon_end:
self.epsilon *= self.epsilon_decay
self.epsilon = max(self.epsilon_end, self.epsilon)
# evaluate value for a state-action pair
def value(self, state, action):
state_var = to_tensor_var([state], self.use_cuda)
action = index_to_one_hot(action, self.action_dim)
action_var = to_tensor_var([action], self.use_cuda)
value_var = self.critic(state_var, action_var)
if self.use_cuda:
value = value_var.data.cpu().numpy()[0]
else:
value = value_var.data.numpy()[0]
return value
def save(self, global_step, out_dir='/model'):
actor_file_path = out_dir + "/actor_.pt"
critic_file_path = out_dir + "/critic_.pt"
th.save({'global_step': global_step,
'model_state_dict': self.actor.state_dict(),
'optimizer_state_dict': self.actor_optimizer.state_dict()},
actor_file_path)
th.save({'global_step': global_step,
'model_state_dict': self.critic.state_dict(),
'optimizer_state_dict': self.critic_optimizer.state_dict()},
critic_file_path)
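# --- usage sketch (illustration only) ---
# A concurrent (independent) A2C update cycle using only the methods defined above;
# the toy random transitions are placeholders for a real environment, and the dimensions
# are arbitrary assumptions.
if __name__ == "__main__":
    agent = A2C(state_dim=4, action_dim=2, batch_size=32, use_cuda=False,
                marl_training_strategy="concurrent")
    state = np.random.rand(4)
    for step in range(200):
        action = agent.exploration_action(state)
        next_state, reward, done = np.random.rand(4), float(np.random.rand()), False
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if len(agent.memory) >= agent.batch_size:
            agent.learn()                   # one actor update and one critic update
            agent.memory.clear_memory()     # keep the updates close to on-policy
    print("state-action value estimate:", agent.value(state, action))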
import keras.models as models
from keras.layers import Dense
from keras.optimizers import Adam
import numpy as np
import random
from collections import deque
from util.ModifiedTensorBoard import ModifiedTensorBoard
seed = 10
np.random.seed(seed)
random.seed(seed)
class DQN_TF:
"""
DQN agent
using tensorflow model approximation based method
- take exploration action, expect epsilon value or use decay_epsilon()
- save experiences to replay memory
- train model and update values on batch sample
- save model
@mahmoudtaouti
"""
def __init__(self, state_dim, action_dim,
memory_capacity=10000, batch_size=100,
reward_gamma=0.99, reward_scale=1.,
target_update_freq=30,
actor_hidden_size=128, actor_lr=0.001,
optimizer_type="rmsprop",
epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=0.001,
load_model=None):
self.state_dim = state_dim
self.action_dim = action_dim
self.reward_gamma = reward_gamma
self.reward_scale = reward_scale
self.actor_hidden_size = actor_hidden_size
self.actor_lr = actor_lr
self.optimizer_type = optimizer_type
self.batch_size = batch_size
self.target_update_freq = target_update_freq
# params for epsilon greedy
self.epsilon = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.memory = deque(maxlen=memory_capacity)
self.loaded_model = load_model
self.actor = self._load_model() if load_model else self._build_model()
self.target = self._build_model()
self.target.set_weights(self.actor.get_weights())
# the target network is periodically synchronized with the main network's weights via update_target()
def _build_model(self):
# neural network for deep-q learning model
model = models.Sequential()
model.add(Dense(self.actor_hidden_size, input_dim=self.state_dim, activation='relu'))
model.add(Dense(self.action_dim, activation='linear'))
model.compile(loss='mse', optimizer=Adam(learning_rate=self.actor_lr), metrics=['accuracy'])
return model
def _load_model(self):
return models.load_model(self.loaded_model)
def learn(self):
# Start training only if certain number of samples is already saved
if len(self.memory) < self.batch_size:
return
batch = random.sample(self.memory, self.batch_size)
# Get current states from minibatch, then query NN model for Q values
states = np.array([transition[0] for transition in batch])
qs_list = self.actor.predict(states)
# Get future states from batch, then query NN model for Q values
new_states = np.array([transition[3] for transition in batch])
future_qs_list = self.target.predict(new_states)
# train the model on the batch of experiences
for index, (state, action, reward, next_state, done) in enumerate(batch):
# compute target q by: r + gamma * max_a { V(s_{t+1}) }
new_q = self.reward_scale * reward + self.reward_gamma * max(future_qs_list[index]) * (1 – int(done))
# Update Q value for given state
current_qs = qs_list[index]
current_qs[action] = new_q
qs_list[index] = current_qs
X = np.array(states)
Y = np.array(qs_list)
self.actor.fit(X, Y, batch_size=self.batch_size, epochs=3, verbose=0, shuffle=False)
def shared_learning(self, n_agents, agent_index, shared_batch_sample):
self.learn()
def update_target(self):
self.target.set_weights(self.actor.get_weights())
# predict or get random action
def exploration_action(self, state, epsilon=None):
self.epsilon = epsilon if epsilon is not None else self.epsilon
if np.random.rand() > self.epsilon:
return self.action(state)
else:
return random.randrange(0, self.action_dim)
def action(self, state):
X = np.array([state])
qs = self.actor.predict(X)
return np.argmax(qs)
# Adds a step's experience to the replay memory
# (observation space, action, reward, new observation space, done)
def remember(self, state, action, reward, new_state, done):
self.memory.append((state, action, reward, new_state, done))
def decay_epsilon(self):
# decrease the exploration rate epsilon over time
if self.epsilon > self.epsilon_end:
self.epsilon *= self.epsilon_decay
self.epsilon = max(self.epsilon_end, self.epsilon)
def save(self, global_step=None, out_dir='/model'):
actor_file_path = out_dir + f"/actor_eps{global_step}.model"
self.actor.save(filepath=actor_file_path)
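# --- usage sketch (illustration only) ---
# The same training cycle as the torch DQN, but with the Keras-based agent above.
# Requires tensorflow/keras to be installed; dimensions and the fixed epsilon are assumptions.
if __name__ == "__main__":
    agent = DQN_TF(state_dim=4, action_dim=2, batch_size=32)
    state = np.random.rand(4)
    for step in range(200):
        action = agent.exploration_action(state, epsilon=0.1)
        next_state, reward, done = np.random.rand(4), float(np.random.rand()), False
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        agent.learn()                       # no-op until the memory holds a full batch
        if step % agent.target_update_freq == 0:
            agent.update_target()           # target sync is left to the caller here
    agent.save(global_step=200, out_dir=".")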
import torch as th
import os
from torch import nn
from MARL.common.Model import QNetwork, QMIXNet
from MARL.common.utils import to_tensor_var
class QMIX_Agent:
def __init__(self, args):
self.n_actions = args.n_actions
self.n_agents = args.n_agents
self.state_shape = args.state_shape
self.obs_shape = args.obs_shape
input_shape = self.obs_shape
if args.last_action:
input_shape += self.n_actions
if args.reuse_network:
input_shape += self.n_agents
# the args fields used below (hidden_size, qmix_hidden_dim, hyper_hidden_dim, hyper_layers_num)
# are assumptions; the original left these constructor arguments incomplete
self.eval_q = QNetwork(input_shape, args.hidden_size, self.n_actions)
self.target_q = QNetwork(input_shape, args.hidden_size, self.n_actions)
self.eval_qmix_net = QMIXNet(self.n_agents, self.state_shape, args.qmix_hidden_dim,
args.hyper_hidden_dim, args.hyper_layers_num)
self.target_q.load_state_dict(self.eval_q.state_dict())
self.eval_parameters = list(self.eval_qmix_net.parameters()) + list(self.eval_q.parameters())
if args.optimizer == "RMS":
self.optimizer = th.optim.RMSprop(self.eval_parameters, lr=args.lr)
def shared_learning(self, batch, n_agents, qmix_net):
batch = self.memory.sample(self.batch_size)
states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)
actions_var = to_tensor_var(batch.actions, self.use_cuda, "long").view(-1, 1)
rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)
next_states_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)
dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)
# Compute Q(s_t, a) using the actor network
current_q = self.actor(states_var).gather(1, actions_var)
# Compute Q_tot using the mixing network
q_tot = self.mixing_network.compute_Q_tot(batch.states, batch.actions, self.actor, self.target)
# Compute the target Q values
next_state_action_values = self.target(next_states_var).detach()
next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)
target_q = self.reward_scale * rewards_var + self.reward_gamma * next_q * (1. - dones_var)
# Update actor network
self.actor_optimizer.zero_grad()
loss = th.nn.functional.smooth_l1_loss(current_q, q_tot) # Using QMIX loss
loss.backward()
if self.max_grad_norm is not None:
nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
self.actor_optimizer.step()
self.train_count += 1
if self.train_count % self.target_update_freq == 0:
self.update_target()
def save_model(self, train_step):
num = str(train_step // self.args.save_cycle)
if not os.path.exists(self.model_dir):
os.makedirs(self.model_dir)
th.save(self.eval_qmix_net.state_dict(), self.model_dir + '/' + num + '_qmix_net_params.pkl')
th.save(self.eval_q.state_dict(), self.model_dir + '/' + num + '_q_net_params.pkl')  # assuming eval_q is the per-agent net (original referenced an undefined eval_rnn)
import math
import torch as th
from torch.autograd import Variable
import numpy as np
def identity(x):
return x
def entropy(p):
return -th.sum(p * th.log(p), 1)
def kl_log_probs(log_p1, log_p2):
return -th.sum(th.exp(log_p1) * (log_p2 - log_p1), 1)
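# A quick check of the two helpers above (illustrative values only): both expect inputs of
# shape (batch, n_actions); entropy() takes probabilities, kl_log_probs() takes log-probabilities.
if __name__ == "__main__":
    probs = th.tensor([[0.7, 0.3], [0.5, 0.5]])
    print(entropy(probs))                              # per-row Shannon entropy
    print(kl_log_probs(th.log(probs), th.log(probs)))  # KL(p || p) is zero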
def index_to_one_hot(index, dim):
if isinstance(index, (int, np.integer)):  # np.int was removed in newer NumPy versions
one_hot = np.zeros(dim)
one_hot[index] = 1.
else:
one_hot = np.zeros((len(index), dim))
one_hot[np.arange(len(index)), index] = 1.
return one_hot
def to_tensor_var(x, use_cuda=False, dtype=”float”):
FloatTensor = th.cuda.FloatTensor if use_cuda else th.FloatTensor
LongTensor = th.cuda.LongTensor if use_cuda else th.LongTensor
ByteTensor = th.cuda.ByteTensor if use_cuda else th.ByteTensor
if dtype == "float":
x = np.array(x, dtype=np.float64).tolist()
return Variable(FloatTensor(x))
elif dtype == "long":
x = np.array(x, dtype=np.int64).tolist()  # np.long was removed in newer NumPy versions
return Variable(LongTensor(x))
elif dtype == "byte":
x = np.array(x, dtype=np.uint8).tolist()
return Variable(ByteTensor(x))
else:
x = np.array(x, dtype=np.float64).tolist()
return Variable(FloatTensor(x))
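# Small illustration of the conversion helpers above; the values are arbitrary.
if __name__ == "__main__":
    print(index_to_one_hot(2, 4))               # [0. 0. 1. 0.]
    print(index_to_one_hot([0, 3], 4))          # 2 x 4 one-hot matrix
    batch_states = [[0.1, 0.2], [0.3, 0.4]]
    print(to_tensor_var(batch_states).shape)    # torch.Size([2, 2])
    print(to_tensor_var([1, 0], dtype="long"))  # LongTensor([1, 0])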
def agg_stat_list(agg_list):
"""
Aggregate per-episode results, then compute statistics.
agg_list: [ [...], [...], [...] ]
agg_list[i]: result of each step in the i-th episode
Returns:
mean, std, max, min of the per-episode sums
"""
s = [np.sum(np.array(l_i)) for l_i in agg_list]
s_mu = np.mean(np.array(s))
s_std = np.std(np.array(s))
s_max = np.max(np.array(s))
s_min = np.min(np.array(s))
return s_mu, s_std, s_max, s_min
def exponential_epsilon_decay(epsilon_start, epsilon_end, decay_rate, episode):
"""
Exponential epsilon decay function.
Args:
epsilon_start (float): Starting value of epsilon.
epsilon_end (float): Minimum value of epsilon.
decay_rate (float): Decay rate of epsilon, e.g. 0.01, 0.001, 0.0001.
episode (int): Current episode number.
Returns:
float: Decayed epsilon value for the given episode.
"""
epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-decay_rate * episode)
return epsilon
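# Sample the exponential schedule above to see how fast epsilon anneals for a given
# decay_rate; the episode numbers and hyperparameters here are only illustrative.
if __name__ == "__main__":
    for ep in (0, 100, 500, 1000):
        eps = exponential_epsilon_decay(epsilon_start=0.9, epsilon_end=0.01,
                                        decay_rate=0.003, episode=ep)
        print(f"episode {ep}: epsilon = {eps:.3f}")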
def greedy_epsilon_decay(epsilon, epsilon_end, decay_rate):
"""
Decrease the epsilon over time
Args:
epsilon (float): Current value of epsilon.
epsilon_end (float): Minimum value of epsilon.
decay_rate (float): Decay rate of epsilon, e.g. 0.99, 0.995, 0.98.
Returns:
float: Decayed epsilon value.
"""
if epsilon > epsilon_end:
epsilon *= decay_rate
return max(epsilon_end, epsilon)
return epsilon
import random
from collections import namedtuple
Experience = namedtuple("Experience",
("states", "actions", "rewards", "next_states", "dones"))
class ReplayMemory(object):
"""
Replay memory buffer
"""
def __init__(self, capacity):
self.capacity = capacity
self.memory = []
self.position = 0
def _push_one(self, state, action, reward, next_state=None, done=None):
if len(self.memory) < self.capacity:
self.memory.append(None)
self.memory[self.position] = Experience(state, action, reward, next_state, done)
self.position = (self.position + 1) % self.capacity
def push(self, states, actions, rewards, next_states=None, dones=None):
if isinstance(states, list):
if next_states is not None and len(next_states) > 0:
for s, a, r, n_s, d in zip(states, actions, rewards, next_states, dones):
self._push_one(s, a, r, n_s, d)
else:
for s, a, r in zip(states, actions, rewards):
self._push_one(s, a, r)
else:
self._push_one(states, actions, rewards, next_states, dones)
def sample(self, batch_size):
if batch_size > len(self.memory):
batch_size = len(self.memory)
transitions = random.sample(self.memory, batch_size)
batch = Experience(*zip(*transitions))
return batch
def get_experiences(self):
batch = Experience(*zip(*self.memory))
return batch
def clear_memory(self):
self.memory = []
self.position = 0
def __len__(self):
return len(self.memory)
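# Minimal demonstration of the ReplayMemory API above: push a single transition
# (non-list state) or a list of per-agent transitions, then sample a named-tuple batch.
if __name__ == "__main__":
    memory = ReplayMemory(capacity=100)
    memory.push((0.0, 1.0), 1, 0.5, (1.0, 0.0), False)           # one transition
    memory.push(states=[[0.1, 0.2], [0.3, 0.4]],                 # batched push (e.g. two agents)
                actions=[0, 1], rewards=[1.0, -1.0],
                next_states=[[0.2, 0.1], [0.4, 0.3]], dones=[False, True])
    batch = memory.sample(batch_size=2)
    print(len(memory), batch.states, batch.actions)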
import torch as th
from torch import nn
import torch.nn.functional as F
class ActorNet(nn.Module):
def __init__(self, state_dim, hidden_size, output_size):
super().__init__()
self.model = nn.Sequential(
nn.Linear(state_dim, hidden_size),
nn.Tanh(),
nn.Linear(hidden_size, hidden_size),
nn.Tanh(),
nn.Linear(hidden_size, output_size),
# the A2C agent treats the actor output as log-probabilities (it calls th.exp on it),
# so LogSoftmax keeps the network and the agent code consistent
nn.LogSoftmax(dim=1)
)
def forward(self, state):
return self.model(state)
class ActorNetwork(nn.Module):
"""
A network for actor
"""
def __init__(self, state_dim, hidden_size, output_size, output_activation):
super(ActorNetwork, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_size)
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.fc3 = nn.Linear(hidden_size, output_size)
# activation function for the output
self.output_activation = output_activation
def forward(self, state):
out = nn.functional.relu(self.fc1(state))
out = nn.functional.relu(self.fc2(out))
out = self.output_activation(self.fc3(out))
return out
class CriticNetwork(nn.Module):
"""
A network for critic
"""
def __init__(self, state_dim, action_dim, hidden_size, output_size=1):
super(CriticNetwork, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_size)
self.fc2 = nn.Linear(hidden_size + action_dim, hidden_size)
self.fc3 = nn.Linear(hidden_size, output_size)
def forward(self, state, action):
out = nn.functional.relu(self.fc1(state))
out = th.cat([out, action], 1)
out = nn.functional.relu(self.fc2(out))
out = self.fc3(out)
return out
class ActorCriticNetwork(nn.Module):
"""
An actor-critic network that shares lower-layer representations but
has distinct output layers
"""
def __init__(self, state_dim, action_dim, hidden_size,
actor_output_act, critic_output_size=1):
super(ActorCriticNetwork, self).__init__()
self.fc1 = nn.Linear(state_dim, hidden_size)
self.fc2 = nn.Linear(hidden_size, hidden_size)
self.actor_linear = nn.Linear(hidden_size, action_dim)
self.critic_linear = nn.Linear(hidden_size, critic_output_size)
self.actor_output_act = actor_output_act
def forward(self, state):
out = nn.functional.relu(self.fc1(state))
out = nn.functional.relu(self.fc2(out))
act = self.actor_output_act(self.actor_linear(out))
val = self.critic_linear(out)
return act, val
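# Shape check for the shared-body actor-critic above; the softmax output activation and
# the sizes are just one possible choice for a discrete action space.
if __name__ == "__main__":
    net = ActorCriticNetwork(state_dim=4, action_dim=2, hidden_size=16,
                             actor_output_act=lambda x: nn.functional.softmax(x, dim=-1))
    dummy_state = th.rand(5, 4)            # batch of 5 states
    act_out, value_out = net(dummy_state)
    print(act_out.shape, value_out.shape)  # (5, 2) and (5, 1)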
class QNetwork(nn.Module):
"""
A network for the Q-function in DQN
"""
def __init__(self, state_dim, hidden_size, action_dim):
super().__init__()
self.model = nn.Sequential(
nn.Linear(state_dim, hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, hidden_size),
nn.ReLU(),
nn.Linear(hidden_size, action_dim),
)
def forward(self, state):
return self.model(state)
class QMIXNet(nn.Module):
def __init__(self, N, state_dim, hidden_size, hyper_hidden_dim, hyper_layers_num):
super(QMIXNet, self).__init__()
self.N = N
self.state_dim = state_dim
self.qmix_hidden_dim = hidden_size
self.hyper_hidden_dim = hyper_hidden_dim
self.hyper_layers_num = hyper_layers_num
"""
w1: (N, qmix_hidden_dim)
b1: (1, qmix_hidden_dim)
w2: (qmix_hidden_dim, 1)
b2: (1, 1)
The hypernetwork has to produce a weight matrix, but a linear layer can only output a vector,
so it outputs a vector of length rows * columns and then reshapes it into the matrix.
"""
if self.hyper_layers_num == 2:
self.hyper_w1 = nn.Sequential(nn.Linear(self.state_dim, self.hyper_hidden_dim),
nn.ReLU(),
nn.Linear(self.hyper_hidden_dim, self.N * self.qmix_hidden_dim))
self.hyper_w2 = nn.Sequential(nn.Linear(self.state_dim, self.hyper_hidden_dim),
nn.ReLU(),
nn.Linear(self.hyper_hidden_dim, self.qmix_hidden_dim * 1))
elif self.hyper_layers_num == 1:
self.hyper_w1 = nn.Linear(self.state_dim, self.N * self.qmix_hidden_dim)
self.hyper_w2 = nn.Linear(self.state_dim, self.qmix_hidden_dim * 1)
self.hyper_b1 = nn.Linear(self.state_dim, self.qmix_hidden_dim)
self.hyper_b2 = nn.Sequential(nn.Linear(self.state_dim, self.qmix_hidden_dim),
nn.ReLU(),
nn.Linear(self.qmix_hidden_dim, 1))
def forward(self, q, s):
# q.shape: (batch_size, max_episode_len, N)
# s.shape: (batch_size, max_episode_len, state_dim)
batch_size = q.size(0)
q = q.view(-1, 1, self.N)  # (batch_size * max_episode_len, 1, N)
s = s.reshape(-1, self.state_dim)  # (batch_size * max_episode_len, state_dim)
w1 = th.abs(self.hyper_w1(s)) # (batch_size * max_episode_len, N * qmix_hidden_dim)
b1 = self.hyper_b1(s) # (batch_size * max_episode_len, qmix_hidden_dim)
w1 = w1.view(-1, self.N, self.qmix_hidden_dim) # (batch_size * max_episode_len, N, qmix_hidden_dim)
b1 = b1.view(-1, 1, self.qmix_hidden_dim) # (batch_size * max_episode_len, 1, qmix_hidden_dim)
# torch.bmm: 3 dimensional tensor multiplication
q_hidden = F.elu(th.bmm(q, w1) + b1) # (batch_size * max_episode_len, 1, qmix_hidden_dim)
w2 = th.abs(self.hyper_w2(s)) # (batch_size * max_episode_len, qmix_hidden_dim * 1)
b2 = self.hyper_b2(s) # (batch_size * max_episode_len,1)
w2 = w2.view(-1, self.qmix_hidden_dim, 1) # (batch_size * max_episode_len, qmix_hidden_dim, 1)
b2 = b2.view(-1, 1, 1) # (batch_size * max_episode_len, 1, 1)
q_total = th.bmm(q_hidden, w2) + b2 # (batch_size * max_episode_len, 1, 1)
q_total = q_total.view(batch_size, -1, 1)  # (batch_size, max_episode_len, 1)
return q_total
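# Shape check for the mixing network above: per-agent Q values of shape
# (batch, episode_len, N) and global states of shape (batch, episode_len, state_dim)
# are mixed into one Q_tot per timestep. All sizes below are arbitrary.
if __name__ == "__main__":
    mixer = QMIXNet(N=3, state_dim=10, hidden_size=32, hyper_hidden_dim=64, hyper_layers_num=1)
    agent_qs = th.rand(8, 5, 3)   # batch of 8 episodes, 5 steps, 3 agents
    states = th.rand(8, 5, 10)
    q_tot = mixer(agent_qs, states)
    print(q_tot.shape)            # torch.Size([8, 5, 1])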
import os
import numpy as np
import torch.nn as nn
import torch as th
from MARL.agent.DQN import DQN
from MARL.common.Memory import ReplayMemory
from MARL.common.utils import exponential_epsilon_decay
from util.ModifiedTensorBoard import ModifiedTensorBoard
class QMIX:
"""
multi-agent Deep Q-Network
- training with concurrent or centralized learning
- two model options: torch or tensor (tensorflow)
- uses a target model for stable learning
- exploration action with exponential epsilon-greedy decay
@mahmoudtaouti
"""
def __init__(self, state_dim, action_dim, n_agents, memory_capacity=10000,
reward_gamma=0.99, reward_scale=1.,
actor_hidden_size=128, target_update_freq=50,
actor_output_act=nn.functional.tanh, critic_loss="mse",
actor_lr=0.001, optimizer_type="adam",
max_grad_norm=0.5, batch_size=64, epsilon_start=0.9,
epsilon_end=0.01, epsilon_decay=0.003,
training_strategy="concurrent", use_cuda=False, model_type="torch", outputs_dir="logs/"):
assert training_strategy in ["concurrent", "centralized"]
assert model_type in ["torch", "tensor"]
self.n_agents = n_agents
self.state_dim = state_dim
self.action_dim = action_dim
self.batch_size = batch_size
self.training_strategy = training_strategy
self.use_cuda = use_cuda and th.cuda.is_available()
self.epsilon = epsilon_start
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.model_type = model_type
self.tensorboard = ModifiedTensorBoard(outputs_dir)
self.actor_output_act = actor_output_act
self.agents = []
if self.training_strategy == 'concurrent':
for _ in range(self.n_agents):
agent = DQN(state_dim=state_dim,
action_dim=action_dim,
memory_capacity=memory_capacity,
reward_gamma=reward_gamma,
reward_scale=reward_scale,
actor_hidden_size=actor_hidden_size,
critic_loss=critic_loss,
actor_lr=actor_lr,
target_update_freq=target_update_freq,
optimizer_type=optimizer_type,
batch_size=batch_size,
epsilon_start=epsilon_start,
epsilon_end=epsilon_end,
epsilon_decay=epsilon_decay,
max_grad_norm=max_grad_norm,
use_cuda=use_cuda)
self.agents.append(agent)
elif self.training_strategy == "centralized":
# parameter sharing with a shared replay memory
self.shared_memory = ReplayMemory(capacity=memory_capacity)
self.agents = [DQN(state_dim=state_dim * n_agents,
action_dim=action_dim,
memory_capacity=memory_capacity,
reward_gamma=reward_gamma,
reward_scale=reward_scale,
actor_hidden_size=actor_hidden_size,
critic_loss=critic_loss,
actor_lr=actor_lr,
target_update_freq=target_update_freq,
optimizer_type=optimizer_type,
batch_size=batch_size,
epsilon_start=epsilon_start,
epsilon_end=epsilon_end,
epsilon_decay=epsilon_decay,
max_grad_norm=max_grad_norm,
use_cuda=use_cuda)] * n_agents
def learn(self):
"""
train each agent on its gathered experiences
"""
if self.training_strategy == "concurrent":
for agent in self.agents:
agent.learn()
elif self.training_strategy == "centralized":
batch = self.shared_memory.sample(self.batch_size)
for index, agent in enumerate(self.agents):
agent.shared_learning(n_agents=self.n_agents, agent_index=index, shared_batch_sample=batch)
def remember(self, states, actions, rewards, new_states, dones):
"""
push experiences to replay memory
"""
dones = dones if isinstance(dones, list) else [dones] * self.n_agents
if self.training_strategy == 'concurrent':
for agent, s, a, r, n_s, d in zip(self.agents, states, actions, rewards, new_states, dones):
agent.remember(s, a, r, n_s, d)
elif self.training_strategy == 'centralized':
self.shared_memory.push(states, actions, rewards, new_states, dones)
def exploration_act(self, states, n_episodes):
"""
for each agent make exploration action with exponential epsilon greedy,
and return a tuple of all agents actions
Returns:
tuple(actions)
"""
self.epsilon = exponential_epsilon_decay(
epsilon_start=self.epsilon_start,
epsilon_end=self.epsilon_end,
decay_rate=self.epsilon_decay,
episode=n_episodes)
actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.exploration_action(state, epsilon=self.epsilon)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.exploration_action(np.array(states).reshape(-1, self.n_agents * self.state_dim),
epsilon=self.epsilon)
actions.append(action)
return tuple(actions)
def update_targets(self):
"""
update target model weights for each agent
"""
for agent in self.agents:
agent.update_target()
def act(self, states):
"""
for each agent predict action,
Returns:
tuple(actions)
"""
actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.action(state)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.action(np.array(states).reshape(-1, self.n_agents * self.state_dim))
actions.append(action)
return tuple(actions)
def save(self, out_dir, checkpoint_num, global_step):
"""
create checkpoint directory,
and save models of all RL agents
"""
os.makedirs(out_dir + "/models", exist_ok=True)
checkpoint_dir = out_dir + f"/models/checkpoint-{checkpoint_num}"
os.makedirs(checkpoint_dir, exist_ok=True)
for index, agent in enumerate(self.agents):
if self.model_type == "torch":
actor_file_path = checkpoint_dir + f"/actor_{index}.pt"
th.save({'global_step': global_step,
'model_state_dict': agent.actor.state_dict(),
'optimizer_state_dict': agent.actor_optimizer.state_dict()},
actor_file_path)
else:
actor_file_path = checkpoint_dir + f"/eps{global_step}_actor_{index}.model"
agent.actor.save(actor_file_path)
def load(self, directory, check_point=None):
"""
load saved models
"""
if not os.path.exists(directory):
raise FileNotFoundError(f"The directory '{directory}' does not exist.")
checkpoint_dir = os.path.join(directory, f"checkpoint-{check_point}") if check_point else directory
for index, agent in enumerate(self.agents):
actor_file_path = os.path.join(checkpoint_dir, f"actor_{index}.pt")
checkpoint = th.load(actor_file_path)
agent.actor.load_state_dict(checkpoint['model_state_dict'])
agent.actor_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
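# --- usage sketch (illustration only) ---
# Driving the multi-agent wrapper above with toy per-agent observations; a real
# multi-agent environment would supply states/rewards/dones per agent instead, and
# all dimensions below are arbitrary assumptions.
if __name__ == "__main__":
    n_agents = 2
    marl = QMIX(state_dim=4, action_dim=3, n_agents=n_agents,
                batch_size=32, training_strategy="concurrent", use_cuda=False)
    states = [np.random.rand(4) for _ in range(n_agents)]
    for episode in range(50):
        actions = marl.exploration_act(states, n_episodes=episode)
        next_states = [np.random.rand(4) for _ in range(n_agents)]
        rewards = [float(np.random.rand()) for _ in range(n_agents)]
        marl.remember(states, actions, rewards, next_states, dones=False)
        states = next_states
        marl.learn()
    print("greedy joint action:", marl.act(states))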
import os
import numpy as np
import torch.nn as nn
import torch as th
from MARL.agent.DQN import DQN
from MARL.common.Memory import ReplayMemory
from MARL.common.Model import QMIXNet
from MARL.common.utils import exponential_epsilon_decay
from util.ModifiedTensorBoard import ModifiedTensorBoard
class MADQN:
"""
multi-agent Deep Q-Network with an optional QMIX mixing network
- training with concurrent or centralized learning
- two model options: torch or tensor (tensorflow)
- uses a target model for stable learning
- exploration action with exponential epsilon-greedy decay
@mahmoudtaouti
"""
def __init__(self, state_dim, action_dim, n_agents, memory_capacity=10000,
reward_gamma=0.99, reward_scale=1.,
actor_hidden_size=128, target_update_freq=50,
actor_output_act=nn.functional.tanh, critic_loss="mse",
actor_lr=0.001, optimizer_type="adam",
max_grad_norm=0.5, batch_size=64, epsilon_start=0.9,
epsilon_end=0.01, epsilon_decay=0.003,
qmix_hidden_size=128, training_strategy="concurrent",
use_cuda=False, model_type="torch", outputs_dir="logs/"):
assert training_strategy in ["concurrent", "centralized"]
assert model_type in ["torch", "tensor"]
self.n_agents = n_agents
self.state_dim = state_dim
self.action_dim = action_dim
self.batch_size = batch_size
self.training_strategy = training_strategy
self.use_cuda = use_cuda and th.cuda.is_available()
self.epsilon = epsilon_start
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.model_type = model_type
self.tensorboard = ModifiedTensorBoard(outputs_dir)
self.actor_output_act = actor_output_act
self.agents = []
self.agents = [DQN(state_dim=state_dim * n_agents,
action_dim=action_dim,
memory_capacity=memory_capacity,
reward_gamma=reward_gamma,
reward_scale=reward_scale,
actor_hidden_size=actor_hidden_size,
critic_loss=critic_loss,
actor_lr=actor_lr,
target_update_freq=target_update_freq,
optimizer_type=optimizer_type,
batch_size=batch_size,
epsilon_start=epsilon_start,
epsilon_end=epsilon_end,
epsilon_decay=epsilon_decay,
max_grad_norm=max_grad_norm,
use_cuda=use_cuda)] * n_agents
if self.training_strategy == "centralized":
# parameter sharing with a shared replay memory
self.shared_memory = ReplayMemory(capacity=memory_capacity)
# hyper-network sizes below are assumptions; the original referenced undefined
# hyper_hidden_dim / hyper_layers_num variables
self.qmix_net = QMIXNet(n_agents, state_dim, qmix_hidden_size,
hyper_hidden_dim=qmix_hidden_size, hyper_layers_num=1)
def learn(self):
"""
train each agent on its gathered experiences
"""
if self.training_strategy == "concurrent":
for agent in self.agents:
agent.learn()
elif self.training_strategy == "centralized":
batch = self.shared_memory.sample(self.batch_size)
for index, agent in enumerate(self.agents):
# note: the underlying DQN.shared_learning would need to accept a qmix_net argument for true QMIX mixing
agent.shared_learning(n_agents=self.n_agents, agent_index=index, shared_batch_sample=batch, qmix_net=self.qmix_net)
def remember(self, states, actions, rewards, new_states, dones):
"""
push experiences to replay memory
"""
dones = dones if isinstance(dones, list) else [dones] * self.n_agents
if self.training_strategy == 'concurrent':
for agent, s, a, r, n_s, d in zip(self.agents, states, actions, rewards, new_states, dones):
agent.remember(s, a, r, n_s, d)
elif self.training_strategy == 'centralized':
self.shared_memory.push(states, actions, rewards, new_states, dones)
def exploration_act(self, states, n_episodes):
"""
for each agent make exploration action with exponential epsilon greedy,
and return a tuple of all agents actions
Returns:
tuple(actions)
"""
self.epsilon = exponential_epsilon_decay(
epsilon_start=self.epsilon_start,
epsilon_end=self.epsilon_end,
decay_rate=self.epsilon_decay,
episode=n_episodes)
actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.exploration_action(state, epsilon=self.epsilon)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.exploration_action(np.array(states).reshape(-1, self.n_agents * self.state_dim),
epsilon=self.epsilon)
actions.append(action)
return tuple(actions)
def update_targets(self):
"""
update target model weights for each agent
"""
for agent in self.agents:
agent.update_target()
def act(self, states):
"""
for each agent predict action,
Returns:
tuple(actions)
"""
actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.action(state)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.action(np.array(states).reshape(-1, self.n_agents * self.state_dim))
actions.append(action)
return tuple(actions)
def save(self, out_dir, checkpoint_num, global_step):
"""
create checkpoint directory,
and save models of all RL agents
"""
os.makedirs(out_dir + "/models", exist_ok=True)
checkpoint_dir = out_dir + f"/models/checkpoint-{checkpoint_num}"
os.makedirs(checkpoint_dir, exist_ok=True)
for index, agent in enumerate(self.agents):
if self.model_type == "torch":
actor_file_path = checkpoint_dir + f"/actor_{index}.pt"
th.save({'global_step': global_step,
'model_state_dict': agent.actor.state_dict(),
'optimizer_state_dict': agent.actor_optimizer.state_dict()},
actor_file_path)
else:
actor_file_path = checkpoint_dir + f"/eps{global_step}_actor_{index}.model"
agent.actor.save(actor_file_path)
def load(self, directory, check_point=None):
"""
load saved models
"""
if not os.path.exists(directory):
raise FileNotFoundError(f"The directory '{directory}' does not exist.")
checkpoint_dir = os.path.join(directory, f"checkpoint-{check_point}") if check_point else directory
for index, agent in enumerate(self.agents):
actor_file_path = os.path.join(checkpoint_dir, f"actor_{index}.pt")
checkpoint = th.load(actor_file_path)
agent.actor.load_state_dict(checkpoint['model_state_dict'])
agent.actor_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
import os
import torch as th
from torch.optim import Adam
from MAA2C_config import SHARED_CRITIC_HIDDEN_SIZE
from MARL.agent.A2C import A2C
from MARL.common.Memory import ReplayMemory
from MARL.common.Model import CriticNetwork
from MARL.common.utils import exponential_epsilon_decay
from util.ModifiedTensorBoard import ModifiedTensorBoard
class MAA2C:
"""
multi-agent advantage actor-critic
@mahmoudtaouti
"""
def __init__(self, state_dim, action_dim, n_agents, memory_capacity=10000,
reward_gamma=0.99, reward_scale=1.,
actor_hidden_size=128, critic_hidden_size=128,
critic_loss="mse", actor_lr=0.001, critic_lr=0.001,
optimizer_type="rmsprop", entropy_reg=0.01,
max_grad_norm=0.5, batch_size=64, epsilon_start=0.9,
epsilon_end=0.01, epsilon_decay=0.003,
training_strategy="concurrent", use_cuda=False, outputs_dir="logs/", is_evaluation=False):
assert training_strategy in ["concurrent", "centralized"]
self.n_agents = n_agents
self.state_dim = state_dim
self.action_dim = action_dim
self.batch_size = batch_size
self.training_strategy = training_strategy
self.use_cuda = use_cuda and th.cuda.is_available()
self.epsilon = epsilon_start
self.epsilon_start = epsilon_start
self.epsilon_end = epsilon_end
self.epsilon_decay = epsilon_decay
self.is_evaluation = is_evaluation
self.tensorboard = ModifiedTensorBoard(outputs_dir)
self.shared_critic = None
self.shared_memory = None
self.shared_critic_optimizer = None
if training_strategy == "centralized" and not is_evaluation:
self.shared_memory = ReplayMemory(capacity=memory_capacity)
self.shared_critic = CriticNetwork(state_dim * n_agents, action_dim * n_agents,
SHARED_CRITIC_HIDDEN_SIZE, 1)
self.shared_critic_optimizer = Adam(self.shared_critic.parameters(), lr=critic_lr)
# Create N agents that share the same model
self.agents = [A2C(state_dim=state_dim,
action_dim=action_dim,
memory_capacity=memory_capacity,
reward_gamma=reward_gamma,
reward_scale=reward_scale,
actor_hidden_size=actor_hidden_size,
critic_hidden_size=critic_hidden_size,
critic_loss=critic_loss,
actor_lr=actor_lr,
critic_lr=critic_lr,
optimizer_type=optimizer_type,
batch_size=batch_size,
epsilon_start=epsilon_start,
epsilon_end=epsilon_end,
epsilon_decay=epsilon_decay,
entropy_reg=entropy_reg,
max_grad_norm=max_grad_norm,
marl_training_strategy=training_strategy,
use_cuda=use_cuda)] * n_agents
def learn(self):
"""
train each agent on the gathered experiences
agent.shared_learning: centralized learning strategy that shares the same critic network
agent.learn: concurrent (independent) learning
"""
for index, agent in enumerate(self.agents):
if self.training_strategy == "centralized":
shared_batch = self.shared_memory.sample(self.batch_size)
agent.shared_learning(n_agents=self.n_agents,
agent_index=index,
shared_batch_sample=shared_batch,
shared_critic=self.shared_critic,
shared_critic_optim=self.shared_critic_optimizer)
else:
agent.learn()
def remember(self, states, actions, rewards, new_states, dones):
"""
push experiences to replay memory
"""
dones = dones if isinstance(dones, list) else [dones] * self.n_agents
if self.training_strategy == "centralized":
self.shared_memory.push(states, actions, rewards, new_states, dones)
else:
for agent, s, a, r, n_s, d in zip(self.agents, states, actions, rewards, new_states, dones):
agent.remember(s, a, r, n_s, d)
def exploration_act(self, states, n_episodes):
"""
for each agent make exploration action,
and return a tuple of all agents actions
using exponential epsilon decay
Returns: tuple(actions)
"""
self.epsilon = exponential_epsilon_decay(
epsilon_start=self.epsilon_start,
epsilon_end=self.epsilon_end,
decay_rate=self.epsilon_decay,
episode=n_episodes)
actions = []
for agent, state in zip(self.agents, states):
action = agent.exploration_action(state, self.epsilon)
actions.append(action)
return tuple(actions)
def act(self, states):
"""
for each agent predict action
and return a tuple of all agents actions
"""
actions = []
for agent, state in zip(self.agents, states):
action = agent.action(state)
actions.append(action)
return tuple(actions)
def save(self, out_dir, checkpoint_num, global_step):
"""
save models of all MAA2C agents
Args:
out_dir (str): Directory path where to save.
checkpoint_num (int): checkpoint number during the training
global_step (int): Global training step at the time of saving.
Raises:
FileNotFoundError: If the specified output directory does not exist.
"""
if not os.path.exists(out_dir):
raise FileNotFoundError(f"The directory '{out_dir}' does not exist.")
os.makedirs(out_dir + "/models", exist_ok=True)
checkpoint_dir = out_dir + f"/models/checkpoint-{checkpoint_num}"
os.makedirs(checkpoint_dir, exist_ok=True)
for index, agent in enumerate(self.agents):
actor_file_path = checkpoint_dir + f"/actor_{index}.pt"
critic_file_path = checkpoint_dir + f"/critic_{index}.pt"
shared_critic_path = checkpoint_dir + "/shared_critic.pt"
th.save({'global_step': global_step,
'model_state_dict': agent.actor.state_dict(),
'optimizer_state_dict': agent.actor_optimizer.state_dict()},
actor_file_path)
if self.training_strategy == "centralized":
th.save({'global_step': global_step,
'model_state_dict': self.shared_critic.state_dict(),
'optimizer_state_dict': self.shared_critic_optimizer.state_dict()},
shared_critic_path)
else:
th.save({'global_step': global_step,
'model_state_dict': agent.critic.state_dict(),
'optimizer_state_dict': agent.critic_optimizer.state_dict()},
critic_file_path)
def load(self, directory, check_point=None):
"""
Load saved models
Args:
directory (str): Directory path where the saved models are located.
check_point (int): Checkpoint number to load (optional).
Raises:
FileNotFoundError: If the specified directory or checkpoint does not exist.
"""
if not os.path.exists(directory):
raise FileNotFoundError(f"The directory '{directory}' does not exist.")
checkpoint_dir = os.path.join(directory, f"checkpoint-{check_point}") if check_point else directory
for index, agent in enumerate(self.agents):
actor_file_path = os.path.join(checkpoint_dir, f"actor_{index}.pt")
critic_file_path = os.path.join(checkpoint_dir, f"critic_{index}.pt")
shared_critic_file_path = os.path.join(checkpoint_dir, "shared_critic.pt")
checkpoint = th.load(actor_file_path)
agent.actor.load_state_dict(checkpoint['model_state_dict'])
agent.actor_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# TODO: correct load for centralized case
if not self.is_evaluation:
if self.training_strategy == "centralized":
critic_checkpoint = th.load(shared_critic_file_path)
self.shared_critic.load_state_dict(critic_checkpoint['model_state_dict'])
self.shared_critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
else:
critic_checkpoint = th.load(critic_file_path)
agent.critic.load_state_dict(critic_checkpoint['model_state_dict'])
agent.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
print("Models loaded successfully.")
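# --- usage sketch (illustration only) ---
# A centralized training loop for the MAA2C wrapper above: every agent keeps its own actor
# while sharing one critic and one replay memory. Observations here are random placeholders
# and the dimensions are arbitrary assumptions.
if __name__ == "__main__":
    import numpy as np
    n_agents = 2
    maa2c = MAA2C(state_dim=4, action_dim=3, n_agents=n_agents, batch_size=64,
                  training_strategy="centralized", use_cuda=False)
    states = [np.random.rand(4) for _ in range(n_agents)]
    for episode in range(100):
        actions = maa2c.exploration_act(states, n_episodes=episode)
        next_states = [np.random.rand(4) for _ in range(n_agents)]
        rewards = [float(np.random.rand()) for _ in range(n_agents)]
        maa2c.remember(states, actions, rewards, next_states, dones=False)
        states = next_states
        if len(maa2c.shared_memory) >= maa2c.batch_size:
            maa2c.learn()   # per-agent actor updates plus a shared-critic update each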