import torch as th

from torch import nn

from torch.optim import Adam, RMSprop

 

import numpy as np

from MARL.common.Memory import ReplayMemory

from MARL.common.Model import QNetwork

from MARL.common.utils import identity, to_tensor_var

 

 

class DQN:

"""
DQN agent
using pytorch model approximation based method
- take exploration action, expect epsilon value or use decay_epsilon()
- save experiences to replay memory
- train model and update values on batch sample
- save model
"""

 

def __init__(self, state_dim, action_dim,

memory_capacity=10000, batch_size=100,

reward_gamma=0.99, reward_scale=1.,

target_update_freq=30,

actor_hidden_size=128, actor_output_act=identity,

critic_loss="mse", actor_lr=0.001,
optimizer_type="rmsprop", max_grad_norm=0.5,

epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=0.998,

use_cuda=True):

 

self.state_dim = state_dim

self.action_dim = action_dim

 

self.reward_gamma = reward_gamma

self.reward_scale = reward_scale

 

self.memory = ReplayMemory(memory_capacity)

self.actor_hidden_size = actor_hidden_size

self.actor_output_act = actor_output_act

self.critic_loss = critic_loss

self.actor_lr = actor_lr

self.optimizer_type = optimizer_type

self.max_grad_norm = max_grad_norm

self.batch_size = batch_size

self.target_update_freq = target_update_freq

 

# params for epsilon greedy

self.epsilon = epsilon_start

self.epsilon_start = epsilon_start

self.epsilon_end = epsilon_end

self.epsilon_decay = epsilon_decay

self.train_count = 0

 

self.use_cuda = use_cuda and th.cuda.is_available()

 

self.actor = QNetwork(self.state_dim, self.actor_hidden_size,

self.action_dim)

 

self.target = QNetwork(self.state_dim, self.actor_hidden_size,

self.action_dim)

self.update_target()

 

if self.optimizer_type == "adam":
self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
elif self.optimizer_type == "rmsprop":

self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr)

if self.use_cuda:

self.actor.cuda()

self.target.cuda()

 

# train on a sample batch

def learn(self):

 

batch = self.memory.sample(self.batch_size)

states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)

actions_var = to_tensor_var(batch.actions, self.use_cuda, "long").view(-1, 1)

rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

next_states_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)

dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

 

# compute Q(s_t, a) - the model computes Q(s_t), then we select the

# columns of actions taken

current_q = self.actor(states_var).gather(1, actions_var)

 

# compute V(s_{t+1}) for all next states and all actions,

# and we then take max_a { V(s_{t+1}) }

next_state_action_values = self.target(next_states_var).detach()

next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)

# compute target q by: r + gamma * max_a { V(s_{t+1}) }

target_q = self.reward_scale * rewards_var + self.reward_gamma * next_q * (1. - dones_var)

 

# update value network

self.actor_optimizer.zero_grad()

 

if self.critic_loss == "huber":

loss = th.nn.functional.smooth_l1_loss(current_q, target_q)

else:

loss = th.nn.MSELoss()(current_q, target_q)

loss.backward()

if self.max_grad_norm is not None:

nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)

 

self.actor_optimizer.step()

 

self.train_count += 1

if self.train_count % self.target_update_freq == 0:

self.update_target()

 

def shared_learning(self, n_agents, agent_index, shared_batch_sample):

 

states_var = to_tensor_var(shared_batch_sample.states, self.use_cuda).view(-1, self.state_dim * n_agents)

actions_var = to_tensor_var(shared_batch_sample.actions, self.use_cuda, "long").view(-1, n_agents, 1)

rewards_var = to_tensor_var(shared_batch_sample.rewards, self.use_cuda).view(-1, n_agents, 1)

next_states_var = to_tensor_var(shared_batch_sample.next_states, self.use_cuda).view(-1, self.state_dim * n_agents)

dones_var = to_tensor_var(shared_batch_sample.dones, self.use_cuda).view(-1, n_agents)

 

# compute Q(s_t, a) - the model computes Q(s_t), then we select the

# columns of actions taken

current_q = self.actor(states_var).gather(1, actions_var[:, agent_index, :])

 

# compute V(s_{t+1}) for all next states and all actions,

# and we then take max_a { V(s_{t+1}) }

next_state_action_values = self.target(next_states_var).detach()

next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)

# compute target q by: r + gamma * max_a { V(s_{t+1}) }

target_q = self.reward_scale * rewards_var[:, agent_index, :] + self.reward_gamma * next_q * (1. - dones_var[:, agent_index, :])

# update value network

self.actor_optimizer.zero_grad()

 

if self.critic_loss == "huber":

loss = th.nn.functional.smooth_l1_loss(current_q, target_q)

else:

loss = th.nn.MSELoss()(current_q, target_q)

loss.backward()

if self.max_grad_norm is not None:

nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)

 

self.actor_optimizer.step()

 

self.train_count += 1

if self.train_count % self.target_update_freq == 0:

self.update_target()

 

def update_target(self):

self.target.load_state_dict(self.actor.state_dict())

 

# choose an action based on state with random noise added for exploration in training

def exploration_action(self, state, epsilon=None):

if epsilon is not None:

self.epsilon = epsilon

else:

self.decay_epsilon()

 

if np.random.rand() < self.epsilon:

action = np.random.choice(self.action_dim)

else:

action = self.action(state)

return action

 

# choose an action based on state for execution

def action(self, state):

state_var = to_tensor_var([state], self.use_cuda)

state_action_value_var = self.actor(state_var)

if self.use_cuda:

state_action_value = state_action_value_var.data.cpu().numpy()[0]

else:

state_action_value = state_action_value_var.data.numpy()[0]

action = np.argmax(state_action_value)

return action

 

def remember(self, state, action, reward, new_state, done):

self.memory.push(state, action, reward, new_state, done)

 

def decay_epsilon(self):

# decrease the exploration rate epsilon over time

if self.epsilon > self.epsilon_end:
self.epsilon *= self.epsilon_decay
self.epsilon = max(self.epsilon_end, self.epsilon)

 

def save(self, global_step, out_dir='/model'):
actor_file_path = out_dir + f"/actor_.pt"
th.save({'global_step': global_step,
'model_state_dict': self.actor.state_dict(),
'optimizer_state_dict': self.actor_optimizer.state_dict()},
actor_file_path)
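# --- Usage sketch (minimal, assuming a Gym-style `env` with reset()/step() returning
# --- flat observations of length `state_dim`; all names here are placeholders) ---
def _dqn_training_loop_sketch(env, state_dim, action_dim, n_episodes=200):
    """Sketch: act with epsilon-greedy, store transitions, train once enough samples exist."""
    agent = DQN(state_dim=state_dim, action_dim=action_dim, use_cuda=False)
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.exploration_action(state)  # epsilon-greedy, decays epsilon internally
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
        if len(agent.memory) >= agent.batch_size:
            agent.learn()  # samples a batch and periodically syncs the target network
    return agent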

 

 

 

 

 

import random

 

import numpy as np

import torch as th

from torch import nn

from torch.optim import Adam, RMSprop

 

from MARL.common.Memory import ReplayMemory

from MARL.common.Model import CriticNetwork, ActorNet

from MARL.common.utils import entropy, index_to_one_hot, to_tensor_var

 

 

class A2C:

"""
A2C agent
using pytorch model approximation based learning method
- take exploration action, expect epsilon value or use decay_epsilon()
- save experiences to replay memory
- train actor critic model
- Actor takes state as input
- Critic takes both state and action as input
- save model
"""

 

def __init__(self, state_dim, action_dim,

memory_capacity=10000,

reward_gamma=0.99, reward_scale=1.,

actor_hidden_size=32, critic_hidden_size=32,

actor_output_act=nn.functional.softmax, critic_loss="mse",

actor_lr=0.001, critic_lr=0.001,

optimizer_type="rmsprop", entropy_reg=0.01,

max_grad_norm=0.5, batch_size=100,

epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=0.003,

marl_training_strategy="concurrent", use_cuda=True):

 

self.state_dim = state_dim

self.action_dim = action_dim

self.reward_gamma = reward_gamma

self.reward_scale = reward_scale

 

self.memory = ReplayMemory(memory_capacity)

self.actor_hidden_size = actor_hidden_size

self.critic_hidden_size = critic_hidden_size

self.actor_output_act = actor_output_act

self.critic_loss = critic_loss

self.actor_lr = actor_lr

self.critic_lr = critic_lr

self.optimizer_type = optimizer_type

self.entropy_reg = entropy_reg

self.max_grad_norm = max_grad_norm

self.batch_size = batch_size

self.target_tau = 0.01

 

# params for epsilon greedy

self.epsilon = epsilon_start

self.epsilon_start = epsilon_start

self.epsilon_end = epsilon_end

self.epsilon_decay = epsilon_decay

 

self.use_cuda = use_cuda and th.cuda.is_available()

self.marl_training_strategy = marl_training_strategy

 

self.actor = ActorNet(self.state_dim, actor_hidden_size, self.action_dim)

self.critic = CriticNetwork(self.state_dim, self.action_dim, self.critic_hidden_size, 1)

 

if self.optimizer_type == "adam":
self.actor_optimizer = Adam(self.actor.parameters(), lr=self.actor_lr)
self.critic_optimizer = Adam(self.critic.parameters(), lr=self.critic_lr)
elif self.optimizer_type == "rmsprop":

self.actor_optimizer = RMSprop(self.actor.parameters(), lr=self.actor_lr)

self.critic_optimizer = RMSprop(self.critic.parameters(), lr=self.critic_lr)

if self.use_cuda:
self.actor.cuda()
self.critic.cuda()

 

# train on a roll_out batch

def learn(self):

"""
Note:
- use learn() after pushing some experiences to the replay memory
- if the environment is multi-agent with centralized training, consider using shared_learning()
"""

assert self.marl_training_strategy == "concurrent"

 

batch = self.memory.sample(self.batch_size)

states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)

one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)

actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, self.action_dim)

rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

 

# update actor network

self.actor_optimizer.zero_grad()

action_log_probs = self.actor(states_var)

entropy_loss = th.mean(entropy(th.exp(action_log_probs)))

action_log_probs = th.sum(action_log_probs * actions_var, 1)

values = self.critic(states_var, actions_var)

advantages = rewards_var - values.detach()

pg_loss = -th.mean(action_log_probs * advantages)

actor_loss = pg_loss - entropy_loss * self.entropy_reg

actor_loss.backward()

if self.max_grad_norm is not None:

nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)

self.actor_optimizer.step()

 

# update critic network

self.critic_optimizer.zero_grad()

target_values = rewards_var

 

if self.critic_loss == "huber":

critic_loss = nn.functional.smooth_l1_loss(values, target_values)

else:

critic_loss = nn.MSELoss()(values, target_values)

 

critic_loss.backward()

if self.max_grad_norm is not None:

nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)

self.critic_optimizer.step()

 

def shared_learning(self,

n_agents,

agent_index,

shared_critic,

shared_critic_optim,

shared_batch_sample):

"""
centralized learning for N agents
update and synchronize the shared critic network parameters during the learning process.
@mahmoudtaouti
"""

assert self.marl_training_strategy == "centralized"

states_var = to_tensor_var(shared_batch_sample.states, self.use_cuda).view(-1, n_agents, self.state_dim)

one_hot_actions = [index_to_one_hot(a, self.action_dim) for a in shared_batch_sample.actions]

actions_var = to_tensor_var(one_hot_actions, self.use_cuda).view(-1, n_agents, self.action_dim)

rewards_var = to_tensor_var(shared_batch_sample.rewards, self.use_cuda).view(-1, n_agents, 1)

 

whole_states_var = states_var.view(-1, n_agents * self.state_dim)

whole_actions_var = actions_var.view(-1, n_agents * self.action_dim)

 

# update actor network

self.actor_optimizer.zero_grad()

action_log_probs = self.actor(states_var[:, agent_index, :])

entropy_loss = th.mean(entropy(th.exp(action_log_probs)))

action_log_probs = th.sum(action_log_probs * actions_var[:, agent_index, :], 1)

 

values = shared_critic(whole_states_var, whole_actions_var)

 

# calculate advantages

advantages = rewards_var[:, agent_index, :] - values.detach()

pg_loss = -th.mean(action_log_probs * advantages)

actor_loss = pg_loss - entropy_loss * self.entropy_reg

actor_loss.backward()

 

if self.max_grad_norm is not None:

nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)

self.actor_optimizer.step()

 

# update critic network

shared_critic_optim.zero_grad()

target_values = rewards_var[:, agent_index, :]

 

critic_loss = nn.MSELoss()(values, target_values)

 

critic_loss.backward()

if self.max_grad_norm is not None:

nn.utils.clip_grad_norm_(shared_critic.parameters(), self.max_grad_norm)

shared_critic_optim.step()

 

# predict softmax action based on state

def _softmax_action(self, state):

state_var = to_tensor_var([state], self.use_cuda)

softmax_action_var = th.exp(self.actor(state_var))

# dist = th.distributions.Categorical(probs=state_var)

# dist = dist.sample()

# action = dist.detach().data.numpy()[0]

if self.use_cuda:

softmax_action = softmax_action_var.data.cpu().numpy()[0]

else:

softmax_action = softmax_action_var.data.numpy()[0]

 

return softmax_action

 

# choose an action based on state with random noise added for exploration in training

def exploration_action(self, state, epsilon=None):

 

if epsilon is not None:

self.epsilon = epsilon

else:

self.decay_epsilon()

 

softmax_action = self._softmax_action(state)

# epsilon-greedy: epsilon may be supplied by the MARL wrapper, otherwise it is decayed locally

if np.random.rand() < self.epsilon:

action = np.random.choice(self.action_dim)

else:

action = np.argmax(softmax_action)

return action

 

# choose an action based on state for execution

def action(self, state):

softmax_action = self._softmax_action(state)

action = np.argmax(softmax_action)

return action

 

def remember(self, state, action, reward, new_state, done):

self.memory.push(state, action, reward, new_state, done)

 

def decay_epsilon(self):

# decrease the exploration rate epsilon over time

if self.epsilon > self.epsilon_end:

self.epsilon *= self.epsilon_decay

self.epsilon = max(self.epsilon_end, self.epsilon)

 

# evaluate value for a state-action pair

def value(self, state, action):

state_var = to_tensor_var([state], self.use_cuda)

action = index_to_one_hot(action, self.action_dim)

action_var = to_tensor_var([action], self.use_cuda)

value_var = self.critic(state_var, action_var)

if self.use_cuda:

value = value_var.data.cpu().numpy()[0]

else:

value = value_var.data.numpy()[0]

return value

 

def save(self, global_step, out_dir='/model'):
actor_file_path = out_dir + f"/actor_.pt"
critic_file_path = out_dir + f"/critic_.pt"

th.save({'global_step': global_step,
'model_state_dict': self.actor.state_dict(),
'optimizer_state_dict': self.actor_optimizer.state_dict()},
actor_file_path)
th.save({'global_step': global_step,
'model_state_dict': self.critic.state_dict(),
'optimizer_state_dict': self.critic_optimizer.state_dict()},
critic_file_path)
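# --- Usage sketch (minimal, concurrent single-agent A2C loop; `env` is a hypothetical
# --- Gym-style environment and all hyperparameters are placeholders) ---
def _a2c_training_loop_sketch(env, state_dim, action_dim, n_episodes=200):
    """Sketch: roll out an episode, store it, then update actor and critic with learn()."""
    agent = A2C(state_dim=state_dim, action_dim=action_dim,
                marl_training_strategy="concurrent", use_cuda=False)
    for _ in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.exploration_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
        agent.learn()
    return agent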

 

 

import keras.models as models

from keras.layers import Dense

from keras.optimizers import Adam

 

import numpy as np

import random

from collections import deque

from util.ModifiedTensorBoard import ModifiedTensorBoard

 

seed = 10

np.random.seed(seed)

random.seed(seed)

 

 

class DQN_TF:

"""
DQN agent
using tensorflow model approximation based method
- take exploration action, expect epsilon value or use decay_epsilon()
- save experiences to replay memory
- train model and update values on batch sample
- save model
@mahmoudtaouti
"""

def __init__(self, state_dim, action_dim,

memory_capacity=10000, batch_size=100,

reward_gamma=0.99, reward_scale=1.,

target_update_freq=30,

actor_hidden_size=128, actor_lr=0.001,

optimizer_type=”rmsprop”,

epsilon_start=0.9, epsilon_end=0.01, epsilon_decay=0.001,

load_model=None):

 

self.state_dim = state_dim

self.action_dim = action_dim

 

self.reward_gamma = reward_gamma

self.reward_scale = reward_scale

 

self.actor_hidden_size = actor_hidden_size

self.actor_lr = actor_lr

self.optimizer_type = optimizer_type

self.batch_size = batch_size

self.target_update_freq = target_update_freq

 

# params for epsilon greedy

self.epsilon = epsilon_start

self.epsilon_end = epsilon_end

self.epsilon_decay = epsilon_decay

 

self.memory = deque(maxlen=memory_capacity)

 

self.loaded_model = load_model

 

self.actor = self._load_model() if load_model else self._build_model()

self.target = self._build_model()

 

self.target.set_weights(self.actor.get_weights())

 

# Used to count when to update target network with main network's weights

def _build_model(self):

# neural network for deep-q learning model

model = models.Sequential()

model.add(Dense(self.actor_hidden_size, input_dim=self.state_dim, activation='relu'))
model.add(Dense(self.action_dim, activation='linear'))
model.compile(loss='mse', optimizer=Adam(learning_rate=self.actor_lr), metrics=['accuracy'])

return model

 

def _load_model(self):

return models.load_model(self.loaded_model)

 

def learn(self):

 

# Start training only if certain number of samples is already saved

if len(self.memory) < self.batch_size:

return

 

batch = random.sample(self.memory, self.batch_size)

# Get current states from minibatch, then query NN model for Q values

states = np.array([transition[0] for transition in batch])

qs_list = self.actor.predict(states)

 

# Get future states from batch, then query NN model for Q values

new_states = np.array([transition[3] for transition in batch])

future_qs_list = self.target.predict(new_states)

 

# train the model on the batch of experiences

for index, (state, action, reward, next_state, done) in enumerate(batch):

# compute target q by: r + gamma * max_a { V(s_{t+1}) }

new_q = self.reward_scale * reward + self.reward_gamma * max(future_qs_list[index]) * (1 – int(done))

 

# Update Q value for given state

current_qs = qs_list[index]

current_qs[action] = new_q

qs_list[index] = current_qs

 

X = np.array(states)

Y = np.array(qs_list)

self.actor.fit(X, Y, batch_size=self.batch_size, epochs=3, verbose=0, shuffle=False)

 

def shared_learning(self, n_agents, agent_index, shared_batch_sample):

self.learn()

 

def update_target(self):

self.target.set_weights(self.actor.get_weights())

 

# predict or get random action

def exploration_action(self, state, epsilon=None):

 

self.epsilon = epsilon if epsilon else self.epsilon

if np.random.rand() > self.epsilon:

return self.action(state)

else:

return random.randrange(0, self.action_dim)

 

def action(self, state):

X = np.array([state])

qs = self.actor.predict(X)

return np.argmax(qs)

 

# Adds step's experience to a memory replay

# (observation space, action, reward, new observation space, done)

def remember(self, state, action, reward, new_state, done):

self.memory.append((state, action, reward, new_state, done))

 

def decay_epsilon(self):

# decrease the exploration rate epsilon over time

if self.epsilon > self.epsilon_end:

self.epsilon *= self.epsilon_decay

self.epsilon = max(self.epsilon_end, self.epsilon)

 

def save(self, global_step=None, out_dir=’/model’):

actor_file_path = out_dir + f"/actor_eps{global_step}.model"

self.actor.save(filepath=actor_file_path)
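# --- Usage sketch (minimal, assuming a Gym-style `env`; placeholders only). Unlike the
# --- torch DQN above, the target network here is synced explicitly via update_target(). ---
def _dqn_tf_training_loop_sketch(env, state_dim, action_dim, n_episodes=100, target_every=10):
    agent = DQN_TF(state_dim=state_dim, action_dim=action_dim)
    for episode in range(n_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.exploration_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            state = next_state
        agent.learn()
        agent.decay_epsilon()
        if episode % target_every == 0:
            agent.update_target()
    return agent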

 

 

 

 

 

 

 

 

 

 

import torch as th

import os

from MARL.common.Model import QNetwork, QMIXNet

from MARL.common.utils import to_tensor_var

 

 

class QMIX_Agent:

def __init__(self, args):

self.n_actions = args.n_actions

self.n_agents = args.n_agents

self.state_shape = args.state_shape

self.obs_shape = args.obs_shape

input_shape = self.obs_shape

 

if args.last_action:

input_shape += self.n_actions

if args.reuse_network:

input_shape += self.n_agents

 

# NOTE: the hidden-size attributes below are assumptions; the original call left these arguments blank
self.eval_q = QNetwork(input_shape, args.hidden_size, self.n_actions)
self.target_q = QNetwork(input_shape, args.hidden_size, self.n_actions)

self.eval_qmix_net = QMIXNet(self.n_agents, self.state_shape, args.qmix_hidden_dim,
args.hyper_hidden_dim, args.hyper_layers_num)

 

self.target_q.load_state_dict(self.eval_q.state_dict())

 

self.eval_parameters = list(self.eval_qmix_net.parameters()) + list(self.eval_q.parameters())

if args.optimizer == "RMS":
self.optimizer = th.optim.RMSprop(self.eval_parameters, lr=args.lr)

 

 

def shared_learning(self, batch, n_agents, qmix_net):

 

batch = self.memory.sample(self.batch_size)

states_var = to_tensor_var(batch.states, self.use_cuda).view(-1, self.state_dim)

actions_var = to_tensor_var(batch.actions, self.use_cuda, "long").view(-1, 1)

rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

next_states_var = to_tensor_var(batch.next_states, self.use_cuda).view(-1, self.state_dim)

dones_var = to_tensor_var(batch.dones, self.use_cuda).view(-1, 1)

 

# Compute Q(s_t, a) using the actor network

current_q = self.actor(states_var).gather(1, actions_var)

 

# Compute Q_tot using the mixing network

q_tot = self.mixing_network.compute_Q_tot(batch.states, batch.actions, self.actor, self.target)

 

# Compute the target Q values

next_state_action_values = self.target(next_states_var).detach()

next_q = th.max(next_state_action_values, 1)[0].view(-1, 1)

target_q = self.reward_scale * rewards_var + self.reward_gamma * next_q * (1. - dones_var)

 

# Update actor network

self.actor_optimizer.zero_grad()

loss = th.nn.functional.smooth_l1_loss(current_q, q_tot)  # Using QMIX loss

loss.backward()

if self.max_grad_norm is not None:

th.nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)

self.actor_optimizer.step()

 

self.train_count += 1

if self.train_count % self.target_update_freq == 0:

self.update_target()

 

 

def save_model(self, train_step):

num = str(train_step // self.args.save_cycle)
if not os.path.exists(self.model_dir):
os.makedirs(self.model_dir)
th.save(self.eval_qmix_net.state_dict(), self.model_dir + '/' + num + '_qmix_net_params.pkl')
th.save(self.eval_rnn.state_dict(), self.model_dir + '/' + num + '_rnn_net_params.pkl')

 

 

 

 

 

 

 

 

 

 

 

 

 

import math

import torch as th

from torch.autograd import Variable

import numpy as np

 

 

def identity(x):

return x

 

 

def entropy(p):

return -th.sum(p * th.log(p), 1)

 

 

def kl_log_probs(log_p1, log_p2):

return -th.sum(th.exp(log_p1) * (log_p2 - log_p1), 1)

 

 

def index_to_one_hot(index, dim):

if isinstance(index, (int, np.integer)):

one_hot = np.zeros(dim)

one_hot[index] = 1.

else:

one_hot = np.zeros((len(index), dim))

one_hot[np.arange(len(index)), index] = 1.

return one_hot
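# Worked examples:
#   index_to_one_hot(2, 4)      -> array([0., 0., 1., 0.])
#   index_to_one_hot([0, 2], 3) -> array([[1., 0., 0.],
#                                          [0., 0., 1.]])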

 

 

def to_tensor_var(x, use_cuda=False, dtype=”float”):

FloatTensor = th.cuda.FloatTensor if use_cuda else th.FloatTensor

LongTensor = th.cuda.LongTensor if use_cuda else th.LongTensor

ByteTensor = th.cuda.ByteTensor if use_cuda else th.ByteTensor

if dtype == "float":
x = np.array(x, dtype=np.float64).tolist()
return Variable(FloatTensor(x))
elif dtype == "long":
x = np.array(x, dtype=np.int64).tolist()
return Variable(LongTensor(x))
elif dtype == "byte":

x = np.array(x, dtype=np.byte).tolist()

return Variable(ByteTensor(x))

else:

x = np.array(x, dtype=np.float64).tolist()

return Variable(FloatTensor(x))
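# Worked examples:
#   to_tensor_var([[0.1, 0.2]], use_cuda=False)            -> 1x2 FloatTensor variable
#   to_tensor_var([1, 0, 2], use_cuda=False, dtype="long") -> LongTensor, as used by
#   DQN.learn() to build the gather() index of taken actions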

 

 

def agg_stat_list(agg_list):

"""
Aggregate per-episode results, then compute statistics.
agg_list: [ [...], [...], [...] ]
agg_list[i]: result of each step in the i-th episode
Returns:
mean, std, max, min
"""

s = [np.sum(np.array(l_i)) for l_i in agg_list]

s_mu = np.mean(np.array(s))

s_std = np.std(np.array(s))

s_max = np.max(np.array(s))

s_min = np.min(np.array(s))

return s_mu, s_std, s_max, s_min

 

 

def exponential_epsilon_decay(epsilon_start, epsilon_end, decay_rate, episode):

"""
Exponential epsilon decay function.

Args:
epsilon_start (float): Starting value of epsilon.
epsilon_end (float): Minimum value of epsilon.
decay_rate (float): Decay rate of epsilon, e.g. 0.01, 0.001, 0.0001.
episode (int): Current episode number.

Returns:
float: Decayed epsilon value for the given episode.
"""

epsilon = epsilon_end + (epsilon_start - epsilon_end) * math.exp(-decay_rate * episode)

return epsilon
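# Worked example: with epsilon_start=0.9, epsilon_end=0.01 and decay_rate=0.003,
# epsilon stays close to 0.9 early on and approaches 0.01 for large episode numbers:
#   episode 0    -> 0.01 + 0.89 * exp(-0.0)  = 0.90
#   episode 500  -> 0.01 + 0.89 * exp(-1.5) ~= 0.21
#   episode 2000 -> 0.01 + 0.89 * exp(-6.0) ~= 0.012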

 

 

def greedy_epsilon_decay(epsilon, epsilon_end, decay_rate):

"""
Decrease the epsilon over time
Args:
epsilon (float): Current value of epsilon.
epsilon_end (float): Minimum value of epsilon.
decay_rate (float): Decay rate of epsilon, e.g. 0.99, 0.995, 0.98.

Returns:
float: Decayed epsilon value.
"""

if epsilon > epsilon_end:

epsilon *= decay_rate

return max(epsilon_end, epsilon)

return epsilon

 

 

 

 

 

 

 

import random

from collections import namedtuple

 

 

Experience = namedtuple("Experience",
("states", "actions", "rewards", "next_states", "dones"))

 

 

class ReplayMemory(object):

"""
Replay memory buffer
"""

def __init__(self, capacity):

self.capacity = capacity

self.memory = []

self.position = 0

 

def _push_one(self, state, action, reward, next_state=None, done=None):

if len(self.memory) < self.capacity:

self.memory.append(None)

self.memory[self.position] = Experience(state, action, reward, next_state, done)

self.position = (self.position + 1) % self.capacity

 

def push(self, states, actions, rewards, next_states=None, dones=None):

if isinstance(states, list):

if next_states is not None and len(next_states) > 0:

for s, a, r, n_s, d in zip(states, actions, rewards, next_states, dones):

self._push_one(s, a, r, n_s, d)

else:

for s, a, r in zip(states, actions, rewards):

self._push_one(s, a, r)

else:

self._push_one(states, actions, rewards, next_states, dones)

 

def sample(self, batch_size):

if batch_size > len(self.memory):

batch_size = len(self.memory)

transitions = random.sample(self.memory, batch_size)

batch = Experience(*zip(*transitions))

return batch

 

def get_experiences(self):

batch = Experience(*zip(*self.memory))

return batch

 

def clear_memory(self):

self.memory = []

self.position = 0

 

def __len__(self):

return len(self.memory)
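# --- Usage sketch: push() accepts a single transition or parallel lists of transitions;
# --- sample() returns an Experience namedtuple of tuples (values below are placeholders) ---
def _replay_memory_sketch():
    memory = ReplayMemory(capacity=1000)
    # single transition (non-list state, so it is stored as one experience)
    memory.push((0.0, 1.0), 1, 0.5, (0.1, 0.9), False)
    # batched push: one experience per agent / per step
    memory.push(states=[[0.2, 0.8], [0.3, 0.7]],
                actions=[0, 1],
                rewards=[1.0, -1.0],
                next_states=[[0.4, 0.6], [0.5, 0.5]],
                dones=[False, True])
    batch = memory.sample(batch_size=2)
    return batch.states, batch.actions, batch.rewards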

 

 

 

 

 

 

 

import torch as th

from torch import nn

import torch.nn.functional as F

 

 

class ActorNet(nn.Module):

def __init__(self, state_dim, hidden_size, output_size):

super().__init__()

self.model = nn.Sequential(

nn.Linear(state_dim, hidden_size),

nn.Tanh(),

nn.Linear(hidden_size, hidden_size),

nn.Tanh(),

nn.Linear(hidden_size, output_size),

nn.LogSoftmax(dim=1)  # A2C treats the actor output as log-probabilities (it applies th.exp to it)

)

 

def forward(self, state):

return self.model(state)

 

 

class ActorNetwork(nn.Module):

"""
A network for actor
"""

 

def __init__(self, state_dim, hidden_size, output_size, output_activation):

super(ActorNetwork, self).__init__()

self.fc1 = nn.Linear(state_dim, hidden_size)

self.fc2 = nn.Linear(hidden_size, hidden_size)

self.fc3 = nn.Linear(hidden_size, output_size)

# activation function for the output

self.output_activation = output_activation

 

def forward(self, state):

out = nn.functional.relu(self.fc1(state))

out = nn.functional.relu(self.fc2(out))

out = self.output_activation(self.fc3(out))

return out

 

 

class CriticNetwork(nn.Module):

"""
A network for critic
"""

 

def __init__(self, state_dim, action_dim, hidden_size, output_size=1):

super(CriticNetwork, self).__init__()

self.fc1 = nn.Linear(state_dim, hidden_size)

self.fc2 = nn.Linear(hidden_size + action_dim, hidden_size)

self.fc3 = nn.Linear(hidden_size, output_size)

 

def forward(self, state, action):

out = nn.functional.relu(self.fc1(state))

out = th.cat([out, action], 1)

out = nn.functional.relu(self.fc2(out))

out = self.fc3(out)

return out

 

 

class ActorCriticNetwork(nn.Module):

"""
An actor-critic network that shares lower-layer representations but
has distinct output layers
"""

 

def __init__(self, state_dim, action_dim, hidden_size,

actor_output_act, critic_output_size=1):

super(ActorCriticNetwork, self).__init__()

self.fc1 = nn.Linear(state_dim, hidden_size)

self.fc2 = nn.Linear(hidden_size, hidden_size)

self.actor_linear = nn.Linear(hidden_size, action_dim)

self.critic_linear = nn.Linear(hidden_size, critic_output_size)

self.actor_output_act = actor_output_act

 

def forward(self, state):

out = nn.functional.relu(self.fc1(state))

out = nn.functional.relu(self.fc2(out))

act = self.actor_output_act(self.actor_linear(out))

val = self.critic_linear(out)

return act, val

 

 

class QNetwork(nn.Module):

"""
A network for the Q-function in DQN
"""

 

def __init__(self, state_dim, hidden_size, action_dim):

super().__init__()

self.model = nn.Sequential(

nn.Linear(state_dim, hidden_size),

nn.ReLU(),

nn.Linear(hidden_size, hidden_size),

nn.ReLU(),

nn.Linear(hidden_size, action_dim),

)

 

def forward(self, state):

return self.model(state)

 

 

class QMIXNet(nn.Module):

def __init__(self, N, state_dim, hidden_size, hyper_hidden_dim, hyper_layers_num):

super(QMIXNet, self).__init__()

self.N = N

self.state_dim = state_dim

self.qmix_hidden_dim = hidden_size

self.hyper_hidden_dim = hyper_hidden_dim

self.hyper_layers_num = hyper_layers_num

"""
w1: (N, qmix_hidden_dim)
b1: (1, qmix_hidden_dim)
w2: (qmix_hidden_dim, 1)
b2: (1, 1)
Because the generated hyper_w1 needs to be a matrix and a linear layer can only output a
vector, we first output a vector of length rows * columns and then reshape it into a matrix.
"""

if self.hyper_layers_num == 2:

self.hyper_w1 = nn.Sequential(nn.Linear(self.state_dim, self.hyper_hidden_dim),

nn.ReLU(),

nn.Linear(self.hyper_hidden_dim, self.N * self.qmix_hidden_dim))

self.hyper_w2 = nn.Sequential(nn.Linear(self.state_dim, self.hyper_hidden_dim),

nn.ReLU(),

nn.Linear(self.hyper_hidden_dim, self.qmix_hidden_dim * 1))

elif self.hyper_layers_num == 1:

self.hyper_w1 = nn.Linear(self.state_dim, self.N * self.qmix_hidden_dim)

self.hyper_w2 = nn.Linear(self.state_dim, self.qmix_hidden_dim * 1)

 

self.hyper_b1 = nn.Linear(self.state_dim, self.qmix_hidden_dim)

self.hyper_b2 = nn.Sequential(nn.Linear(self.state_dim, self.qmix_hidden_dim),

nn.ReLU(),

nn.Linear(self.qmix_hidden_dim, 1))

 

def forward(self, q, s):
# q.shape: (batch_size, max_episode_len, N)
# s.shape: (batch_size, max_episode_len, state_dim)
batch_size = q.size(0)  # remember the batch dimension before flattening
q = q.view(-1, 1, self.N)  # (batch_size * max_episode_len, 1, N)
s = s.reshape(-1, self.state_dim)  # (batch_size * max_episode_len, state_dim)

 

w1 = th.abs(self.hyper_w1(s))  # (batch_size * max_episode_len, N * qmix_hidden_dim)

b1 = self.hyper_b1(s)  # (batch_size * max_episode_len, qmix_hidden_dim)

w1 = w1.view(-1, self.N, self.qmix_hidden_dim)  # (batch_size * max_episode_len, N,  qmix_hidden_dim)

b1 = b1.view(-1, 1, self.qmix_hidden_dim)  # (batch_size * max_episode_len, 1, qmix_hidden_dim)

 

# torch.bmm: 3 dimensional tensor multiplication

q_hidden = F.elu(th.bmm(q, w1) + b1)  # (batch_size * max_episode_len, 1, qmix_hidden_dim)

 

w2 = th.abs(self.hyper_w2(s))  # (batch_size * max_episode_len, qmix_hidden_dim * 1)

b2 = self.hyper_b2(s)  # (batch_size * max_episode_len,1)

w2 = w2.view(-1, self.qmix_hidden_dim, 1)  # (batch_size * max_episode_len, qmix_hidden_dim, 1)

b2 = b2.view(-1, 1, 1)  # (batch_size * max_episode_len, 1, 1)

 

q_total = th.bmm(q_hidden, w2) + b2  # (batch_size * max_episode_len, 1, 1)

q_total = q_total.view(batch_size, -1, 1)  # (batch_size, max_episode_len, 1)

return q_total
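# --- Shape-check sketch: QMIXNet mixes per-agent Q-values of shape (batch, episode_len, N)
# --- with the global state (batch, episode_len, state_dim) into Q_tot of shape
# --- (batch, episode_len, 1); all sizes below are arbitrary placeholders ---
def _qmix_net_shape_sketch():
    n_agents, state_dim, batch, ep_len = 3, 12, 4, 5
    net = QMIXNet(N=n_agents, state_dim=state_dim, hidden_size=32,
                  hyper_hidden_dim=64, hyper_layers_num=1)
    q = th.rand(batch, ep_len, n_agents)
    s = th.rand(batch, ep_len, state_dim)
    q_tot = net(q, s)
    assert q_tot.shape == (batch, ep_len, 1)
    return q_tot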

 

 

 

 

 

 

 

import os

import numpy as np

import torch.nn as nn

import torch as th

 

from MARL.agent.DQN import DQN

from MARL.common.Memory import ReplayMemory

from MARL.common.utils import exponential_epsilon_decay

from util.ModifiedTensorBoard import ModifiedTensorBoard

 

 

class QMIX:

"""
multi-agent Deep Q-Network
- training with concurrent or centralized learning
- two model options: torch or tensor
- uses a target model for stable learning
- exploration action with exponential epsilon greedy
@mahmoudtaouti
"""

 

def __init__(self, state_dim, action_dim, n_agents, memory_capacity=10000,

reward_gamma=0.99, reward_scale=1.,

actor_hidden_size=128, target_update_freq=50,

actor_output_act=nn.functional.tanh, critic_loss=”mse”,

actor_lr=0.001, optimizer_type=”adam”,

max_grad_norm=0.5, batch_size=64, epsilon_start=0.9,

epsilon_end=0.01, epsilon_decay=0.003,

training_strategy=”concurrent”, use_cuda=False, model_type=”torch”, outputs_dir=”logs/”):

 

assert training_strategy in [“concurrent”, “centralized”]

assert model_type in [“torch”, “tensor”]

 

self.n_agents = n_agents

self.state_dim = state_dim

self.action_dim = action_dim

 

self.batch_size = batch_size

 

self.training_strategy = training_strategy

self.use_cuda = use_cuda and th.cuda.is_available()

self.epsilon = epsilon_start

self.epsilon_start = epsilon_start

self.epsilon_end = epsilon_end

self.epsilon_decay = epsilon_decay

 

self.model_type = model_type

self.tensorboard = ModifiedTensorBoard(outputs_dir)

self.actor_output_act = actor_output_act

 

self.agents = []

if self.training_strategy == 'concurrent':

for _ in range(self.n_agents):

agent = DQN(state_dim=state_dim,

action_dim=action_dim,

memory_capacity=memory_capacity,

reward_gamma=reward_gamma,

reward_scale=reward_scale,

actor_hidden_size=actor_hidden_size,

critic_loss=critic_loss,

actor_lr=actor_lr,

target_update_freq=target_update_freq,

optimizer_type=optimizer_type,

batch_size=batch_size,

epsilon_start=epsilon_start,

epsilon_end=epsilon_end,

epsilon_decay=epsilon_decay,

max_grad_norm=max_grad_norm,

use_cuda=use_cuda)

self.agents.append(agent)

elif self.training_strategy == "centralized":

# parameters sharing with shared replay memory

self.shared_memory = ReplayMemory(capacity=memory_capacity)

self.agents = [DQN(state_dim=state_dim * n_agents,

action_dim=action_dim,

memory_capacity=memory_capacity,

reward_gamma=reward_gamma,

reward_scale=reward_scale,

actor_hidden_size=actor_hidden_size,

critic_loss=critic_loss,

actor_lr=actor_lr,

target_update_freq=target_update_freq,

optimizer_type=optimizer_type,

batch_size=batch_size,

epsilon_start=epsilon_start,

epsilon_end=epsilon_end,

epsilon_decay=epsilon_decay,

max_grad_norm=max_grad_norm,

use_cuda=use_cuda)] * n_agents

 

def learn(self):

"""
train each agent on the gathered experiences
"""
if self.training_strategy == "concurrent":
for agent in self.agents:
agent.learn()
elif self.training_strategy == "centralized":
batch = self.shared_memory.sample(self.batch_size)
for index, agent in enumerate(self.agents):
agent.shared_learning(n_agents=self.n_agents, agent_index=index, shared_batch_sample=batch)

 

def remember(self, states, actions, rewards, new_states, dones):

"""
push experiences to replay memory
"""
dones = dones if isinstance(dones, list) else [dones] * self.n_agents
if self.training_strategy == 'concurrent':
for agent, s, a, r, n_s, d in zip(self.agents, states, actions, rewards, new_states, dones):
agent.remember(s, a, r, n_s, d)
elif self.training_strategy == 'centralized':
self.shared_memory.push(states, actions, rewards, new_states, dones)

 

def exploration_act(self, states, n_episodes):

"""
for each agent make an exploration action with exponential epsilon greedy,
and return a tuple of all agents' actions
Returns:
tuple(actions)
"""
self.epsilon = exponential_epsilon_decay(
epsilon_start=self.epsilon_start,
epsilon_end=self.epsilon_end,
decay_rate=self.epsilon_decay,
episode=n_episodes)

actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.exploration_action(state, epsilon=self.epsilon)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.exploration_action(np.array(states).reshape(-1, self.n_agents * self.state_dim),
epsilon=self.epsilon)
actions.append(action)
return tuple(actions)

 

def update_targets(self):

"""
update target model weights for each agent
"""

for agent in self.agents:

agent.update_target()

 

def act(self, states):

"""
for each agent predict action,
Returns:
tuple(actions)
"""
actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.action(state)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.action(np.array(states).reshape(-1, self.n_agents * self.state_dim))
actions.append(action)
return tuple(actions)

 

def save(self, out_dir, checkpoint_num, global_step):

"""
create checkpoint directory,
and save models of all RL agents
"""
os.makedirs(out_dir + "/models", exist_ok=True)
checkpoint_dir = out_dir + f"/models/checkpoint-{checkpoint_num}"
os.makedirs(checkpoint_dir, exist_ok=True)

for index, agent in enumerate(self.agents):
if self.model_type == "torch":
actor_file_path = checkpoint_dir + f"/actor_{index}.pt"
th.save({'global_step': global_step,
'model_state_dict': agent.actor.state_dict(),
'optimizer_state_dict': agent.actor_optimizer.state_dict()},
actor_file_path)
else:
actor_file_path = checkpoint_dir + f"/eps{global_step}_actor_{index}.model"
agent.actor.save(actor_file_path)

 

def load(self, directory, check_point=None):

"""
load saved models
"""
if not os.path.exists(directory):
raise FileNotFoundError(f"The directory '{directory}' does not exist.")

checkpoint_dir = os.path.join(directory, f"checkpoint-{check_point}") if check_point else directory

for index, agent in enumerate(self.agents):
actor_file_path = os.path.join(checkpoint_dir, f"actor_{index}.pt")

checkpoint = th.load(actor_file_path)
agent.actor.load_state_dict(checkpoint['model_state_dict'])
agent.actor_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
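# --- Usage sketch (concurrent multi-agent training; `multi_env` is a hypothetical
# --- environment returning one observation/reward/done per agent) ---
def _qmix_concurrent_training_sketch(multi_env, state_dim, action_dim, n_agents, n_episodes=100):
    marl = QMIX(state_dim=state_dim, action_dim=action_dim, n_agents=n_agents,
                training_strategy="concurrent", use_cuda=False)
    for episode in range(n_episodes):
        states = multi_env.reset()
        done = False
        while not done:
            actions = marl.exploration_act(states, n_episodes=episode)
            new_states, rewards, dones, _ = multi_env.step(actions)
            marl.remember(states, actions, rewards, new_states, dones)
            states = new_states
            done = all(dones) if isinstance(dones, (list, tuple)) else dones
        marl.learn()
    return marl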

 

 

 

 

 

 

 

import os

 

import numpy as np

import torch.nn as nn

import torch as th

 

from MARL.agent.DQN import DQN

from MARL.common.Memory import ReplayMemory

from MARL.common.Model import QMIXNet

from MARL.common.utils import exponential_epsilon_decay

from util.ModifiedTensorBoard import ModifiedTensorBoard

 

 

class MADQN:

"""
multi-agent Deep Q-Network
- training with concurrent or centralized learning
- two model options: torch or tensor
- uses a target model for stable learning
- exploration action with exponential epsilon greedy
@mahmoudtaouti
"""

 

def __init__(self, state_dim, action_dim, n_agents, memory_capacity=10000,

reward_gamma=0.99, reward_scale=1.,

actor_hidden_size=128, target_update_freq=50,

actor_output_act=nn.functional.tanh, critic_loss=”mse”,

actor_lr=0.001, optimizer_type=”adam”,

max_grad_norm=0.5, batch_size=64, epsilon_start=0.9,

epsilon_end=0.01, epsilon_decay=0.003,

qmix_hidden_size=128, training_strategy=”concurrent”,

use_cuda=False, model_type=”torch”, outputs_dir=”logs/”):

 

assert training_strategy in [“concurrent”, “centralized”]

assert model_type in [“torch”, “tensor”]

 

self.n_agents = n_agents

self.state_dim = state_dim

self.action_dim = action_dim

 

self.batch_size = batch_size

 

self.training_strategy = training_strategy

self.use_cuda = use_cuda and th.cuda.is_available()

self.epsilon = epsilon_start

self.epsilon_start = epsilon_start

self.epsilon_end = epsilon_end

self.epsilon_decay = epsilon_decay

 

self.model_type = model_type

self.tensorboard = ModifiedTensorBoard(outputs_dir)

self.actor_output_act = actor_output_act

 

self.agents = []

 

self.agents = [DQN(state_dim=state_dim * n_agents,

action_dim=action_dim,

memory_capacity=memory_capacity,

reward_gamma=reward_gamma,

reward_scale=reward_scale,

actor_hidden_size=actor_hidden_size,

critic_loss=critic_loss,

actor_lr=actor_lr,

target_update_freq=target_update_freq,

optimizer_type=optimizer_type,

batch_size=batch_size,

epsilon_start=epsilon_start,

epsilon_end=epsilon_end,

epsilon_decay=epsilon_decay,

max_grad_norm=max_grad_norm,

use_cuda=use_cuda)] * n_agents

 

if self.training_strategy == "centralized":
# parameters sharing with shared replay memory
self.shared_memory = ReplayMemory(capacity=memory_capacity)
# NOTE: hypernetwork sizes are not constructor arguments here; the values below are assumptions
self.qmix_net = QMIXNet(n_agents, state_dim, qmix_hidden_size, hyper_hidden_dim=qmix_hidden_size, hyper_layers_num=1)

 

def learn(self):

"""
train each agent on the gathered experiences
"""
if self.training_strategy == "concurrent":
for agent in self.agents:
agent.learn()
elif self.training_strategy == "centralized":
batch = self.shared_memory.sample(self.batch_size)
for index, agent in enumerate(self.agents):
agent.shared_learning(n_agents=self.n_agents, agent_index=index, shared_batch_sample=batch, qmix_net=self.qmix_net)

 

def remember(self, states, actions, rewards, new_states, dones):

"""
push experiences to replay memory
"""
dones = dones if isinstance(dones, list) else [dones] * self.n_agents
if self.training_strategy == 'concurrent':
for agent, s, a, r, n_s, d in zip(self.agents, states, actions, rewards, new_states, dones):
agent.remember(s, a, r, n_s, d)
elif self.training_strategy == 'centralized':
self.shared_memory.push(states, actions, rewards, new_states, dones)

 

def exploration_act(self, states, n_episodes):

"""
for each agent make an exploration action with exponential epsilon greedy,
and return a tuple of all agents' actions
Returns:
tuple(actions)
"""
self.epsilon = exponential_epsilon_decay(
epsilon_start=self.epsilon_start,
epsilon_end=self.epsilon_end,
decay_rate=self.epsilon_decay,
episode=n_episodes)

actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.exploration_action(state, epsilon=self.epsilon)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.exploration_action(np.array(states).reshape(-1, self.n_agents * self.state_dim),
epsilon=self.epsilon)
actions.append(action)
return tuple(actions)

 

def update_targets(self):

"""
update target model weights for each agent
"""

for agent in self.agents:

agent.update_target()

 

def act(self, states):

"""
for each agent predict action,
Returns:
tuple(actions)
"""
actions = []
if self.training_strategy == 'concurrent':
for agent, state in zip(self.agents, states):
action = agent.action(state)
actions.append(action)
elif self.training_strategy == 'centralized':
for agent, _ in zip(self.agents, states):
action = agent.action(np.array(states).reshape(-1, self.n_agents * self.state_dim))
actions.append(action)
return tuple(actions)

 

def save(self, out_dir, checkpoint_num, global_step):

"""
create checkpoint directory,
and save models of all RL agents
"""
os.makedirs(out_dir + "/models", exist_ok=True)
checkpoint_dir = out_dir + f"/models/checkpoint-{checkpoint_num}"
os.makedirs(checkpoint_dir, exist_ok=True)

for index, agent in enumerate(self.agents):
if self.model_type == "torch":
actor_file_path = checkpoint_dir + f"/actor_{index}.pt"
th.save({'global_step': global_step,
'model_state_dict': agent.actor.state_dict(),
'optimizer_state_dict': agent.actor_optimizer.state_dict()},
actor_file_path)
else:
actor_file_path = checkpoint_dir + f"/eps{global_step}_actor_{index}.model"
agent.actor.save(actor_file_path)

 

def load(self, directory, check_point=None):

"""
load saved models
"""
if not os.path.exists(directory):
raise FileNotFoundError(f"The directory '{directory}' does not exist.")

checkpoint_dir = os.path.join(directory, f"checkpoint-{check_point}") if check_point else directory

for index, agent in enumerate(self.agents):
actor_file_path = os.path.join(checkpoint_dir, f"actor_{index}.pt")

checkpoint = th.load(actor_file_path)
agent.actor.load_state_dict(checkpoint['model_state_dict'])
agent.actor_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

 

 

 

 

 

 

 

 

import os

 

import torch as th

from torch.optim import Adam

 

from MAA2C_config import SHARED_CRITIC_HIDDEN_SIZE

from MARL.agent.A2C import A2C

from MARL.common.Memory import ReplayMemory

from MARL.common.Model import CriticNetwork

from MARL.common.utils import exponential_epsilon_decay

from util.ModifiedTensorBoard import ModifiedTensorBoard

 

 

class MAA2C:

"""
multi-agent advantage actor-critic
@mahmoudtaouti
"""

 

def __init__(self, state_dim, action_dim, n_agents, memory_capacity=10000,

reward_gamma=0.99, reward_scale=1.,

actor_hidden_size=128, critic_hidden_size=128,

critic_loss=”mse”, actor_lr=0.001, critic_lr=0.001,

optimizer_type=”rmsprop”, entropy_reg=0.01,

max_grad_norm=0.5, batch_size=64, epsilon_start=0.9,

epsilon_end=0.01, epsilon_decay=0.003,

training_strategy=”concurrent”, use_cuda=False, outputs_dir=”logs/”, is_evaluation=False):

 

assert training_strategy in [“concurrent”, “centralized”]

 

self.n_agents = n_agents

self.state_dim = state_dim

self.action_dim = action_dim

 

self.batch_size = batch_size

 

self.training_strategy = training_strategy

self.use_cuda = use_cuda and th.cuda.is_available()

self.epsilon = epsilon_start

self.epsilon_start = epsilon_start

self.epsilon_end = epsilon_end

self.epsilon_decay = epsilon_decay

self.is_evaluation = is_evaluation

self.tensorboard = ModifiedTensorBoard(outputs_dir)

 

self.shared_critic = None

self.shared_memory = None

self.shared_critic_optimizer = None

 

if training_strategy == "centralized" and not is_evaluation:

self.shared_memory = ReplayMemory(capacity=memory_capacity)

self.shared_critic = CriticNetwork(state_dim * n_agents, action_dim * n_agents,

SHARED_CRITIC_HIDDEN_SIZE, 1)

self.shared_critic_optimizer = Adam(self.shared_critic.parameters(), lr=critic_lr)

 

# Create N agents that share the same model

self.agents = [A2C(state_dim=state_dim,

action_dim=action_dim,

memory_capacity=memory_capacity,

reward_gamma=reward_gamma,

reward_scale=reward_scale,

actor_hidden_size=actor_hidden_size,

critic_hidden_size=critic_hidden_size,

critic_loss=critic_loss,

actor_lr=actor_lr,

critic_lr=critic_lr,

optimizer_type=optimizer_type,

batch_size=batch_size,

epsilon_start=epsilon_start,

epsilon_end=epsilon_end,

epsilon_decay=epsilon_decay,

entropy_reg=entropy_reg,

max_grad_norm=max_grad_norm,

marl_training_strategy=training_strategy,

use_cuda=use_cuda)] * n_agents

 

def learn(self):

"""
train each agent on the gathered experiences
agent.shared_learning: centralized learning strategy that shares the same critic network
agent.learn: concurrent (independent) learning
"""
for index, agent in enumerate(self.agents):
if self.training_strategy == "centralized":

shared_batch = self.shared_memory.sample(self.batch_size)

agent.shared_learning(n_agents=self.n_agents,

agent_index=index,

shared_batch_sample=shared_batch,

shared_critic=self.shared_critic,

shared_critic_optim=self.shared_critic_optimizer)

else:

agent.learn()

 

def remember(self, states, actions, rewards, new_states, dones):

"""
push experiences to replay memory
"""
dones = dones if isinstance(dones, list) else [dones] * self.n_agents
if self.training_strategy == "centralized":
self.shared_memory.push(states, actions, rewards, new_states, dones)

else:

for agent, s, a, r, n_s, d in zip(self.agents, states, actions, rewards, new_states, dones):

agent.remember(s, a, r, n_s, d)

 

def exploration_act(self, states, n_episodes):

"""
for each agent make an exploration action,
and return a tuple of all agents' actions
using exponential epsilon decay
Returns: tuple(actions)
"""

self.epsilon = exponential_epsilon_decay(

epsilon_start=self.epsilon_start,

epsilon_end=self.epsilon_end,

decay_rate=self.epsilon_decay,

episode=n_episodes)

 

actions = []

for agent, state in zip(self.agents, states):

action = agent.exploration_action(state, self.epsilon)

actions.append(action)

return tuple(actions)

 

def act(self, states):

"""
for each agent predict action
and return a tuple of all agents' actions
"""

actions = []

for agent, state in zip(self.agents, states):

action = agent.action(state)

actions.append(action)

return tuple(actions)

 

def save(self, out_dir, checkpoint_num, global_step):

"""
save models of all MAA2C agents
Args:
out_dir (str): Directory path where to save.
checkpoint_num (int): checkpoint number during the training
global_step (int): Global step to record in the saved checkpoint.

Raises:
FileNotFoundError: If the specified output directory does not exist.
"""

if not os.path.exists(out_dir):
raise FileNotFoundError(f"The directory '{out_dir}' does not exist.")

os.makedirs(out_dir + "/models", exist_ok=True)
checkpoint_dir = out_dir + f"/models/checkpoint-{checkpoint_num}"
os.makedirs(checkpoint_dir, exist_ok=True)

 

for index, agent in enumerate(self.agents):
actor_file_path = checkpoint_dir + f"/actor_{index}.pt"
critic_file_path = checkpoint_dir + f"/critic_{index}.pt"
shared_critic_path = checkpoint_dir + f"/shared_critic.pt"

th.save({'global_step': global_step,
'model_state_dict': agent.actor.state_dict(),
'optimizer_state_dict': agent.actor_optimizer.state_dict()},
actor_file_path)

if self.training_strategy == "centralized":
th.save({'global_step': global_step,
'model_state_dict': self.shared_critic.state_dict(),
'optimizer_state_dict': self.shared_critic_optimizer.state_dict()},
shared_critic_path)
else:
th.save({'global_step': global_step,
'model_state_dict': agent.critic.state_dict(),
'optimizer_state_dict': agent.critic_optimizer.state_dict()},
critic_file_path)

 

def load(self, directory, check_point=None):

"""
Load saved models
Args:
directory (str): Directory path where the saved models are located.
check_point (int): Global step or checkpoint number to load (optional).
Raises:
FileNotFoundError: If the specified directory or checkpoint does not exist.
"""
if not os.path.exists(directory):
raise FileNotFoundError(f"The directory '{directory}' does not exist.")

checkpoint_dir = os.path.join(directory, f"checkpoint-{check_point}") if check_point else directory

 

for index, agent in enumerate(self.agents):

actor_file_path = os.path.join(checkpoint_dir, f"actor_{index}.pt")
critic_file_path = os.path.join(checkpoint_dir, f"critic_{index}.pt")
shared_critic_file_path = os.path.join(checkpoint_dir, f"shared_critic.pt")

checkpoint = th.load(actor_file_path)
agent.actor.load_state_dict(checkpoint['model_state_dict'])
agent.actor_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# TODO : correct load for centralized case
if not self.is_evaluation:
if self.training_strategy == "centralized":
critic_checkpoint = th.load(shared_critic_file_path)
self.shared_critic.load_state_dict(critic_checkpoint['model_state_dict'])
self.shared_critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])
else:
critic_checkpoint = th.load(critic_file_path)
agent.critic.load_state_dict(critic_checkpoint['model_state_dict'])
agent.critic_optimizer.load_state_dict(critic_checkpoint['optimizer_state_dict'])

print("Models loaded successfully.")
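# --- Usage sketch (centralized MAA2C training; `multi_env` is a hypothetical environment
# --- returning one observation/reward/done per agent, and all sizes are placeholders) ---
def _maa2c_centralized_training_sketch(multi_env, state_dim, action_dim, n_agents, n_episodes=100):
    marl = MAA2C(state_dim=state_dim, action_dim=action_dim, n_agents=n_agents,
                 training_strategy="centralized", use_cuda=False)
    for episode in range(n_episodes):
        states = multi_env.reset()
        done = False
        while not done:
            actions = marl.exploration_act(states, n_episodes=episode)
            new_states, rewards, dones, _ = multi_env.step(actions)
            marl.remember(states, actions, rewards, new_states, dones)
            states = new_states
            done = all(dones) if isinstance(dones, (list, tuple)) else dones
        marl.learn()
    return marl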