classdef MySACAgent < rl.agent.CustomAgent
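% MySACAgent  Custom Soft Actor-Critic (SAC) agent built on rl.agent.CustomAgent.
% The agent keeps a tanh-squashed Gaussian actor, two Q-critics with target
% copies, a trainable entropy temperature (log_alpha), and a circular replay
% buffer. The losses from the latest update are exposed through public
% properties so they can be logged during training.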
properties (Access = private)
% Actor and critic networks (dlnetwork objects)
actor
critic1
critic2
critic_target1
critic_target2
% Optimizers for the actor, the two critics, and the entropy temperature
actorOptimizer
criticOptimizer_1
criticOptimizer_2
entWgtOptimizer
% Log of the entropy temperature alpha
log_alpha
% Agent configuration and dimensions
options
numObs
numAct
Ts
% Circular replay buffer and bookkeeping
obsBuffer
actionBuffer
rewardBuffer
nextObsBuffer
isDoneBuffer
bufferIdx
bufferLen
counter
end

properties
% Latest losses, public so the training logger can read them
cLoss = 0
aLoss = 0
eLoss = 0
end

methods
function obj = MySACAgent(numObs,numAct,obsInfo,actInfo,hid_dim,Ts,options)
obj = obj@rl.agent.CustomAgent();
obj.ObservationInfo = obsInfo;
obj.ActionInfo = actInfo;
obj.numObs = numObs;
obj.numAct = numAct;
obj.Ts = Ts;
obj.options = options;
obj.actor = CreateActor(obj,numObs,numAct,hid_dim,obsInfo,actInfo);
[obj.critic1,obj.critic2,obj.critic_target1,obj.critic_target2] = CreateCritic(obj,numObs,numAct,hid_dim,obsInfo,actInfo);
assert(options.WarmUpSteps>=options.MiniBatchSize,...
'options.WarmUpSteps must not be less than options.MiniBatchSize');
obj.actorOptimizer = rlOptimizer(options.OptimizerOptions{1});
obj.criticOptimizer_1 = rlOptimizer(options.OptimizerOptions{2});
obj.criticOptimizer_2 = rlOptimizer(options.OptimizerOptions{3});
obj.entWgtOptimizer = rlOptimizer(options.OptimizerOptions{4});
obj.log_alpha = dlarray(log(obj.options.EntropyLossWeight));
obj.counter = 0;
resetBuffer(obj);
end
function resetBuffer(obj)
obj.obsBuffer = dlarray(...
zeros(obj.numObs,obj.options.MaxBufferLen),'CB');
obj.actionBuffer = dlarray(...
zeros(obj.numAct,obj.options.MaxBufferLen),'CB');
obj.rewardBuffer = dlarray(zeros(1,obj.options.MaxBufferLen),'CB');
obj.nextObsBuffer = dlarray(...
zeros(obj.numObs,obj.options.MaxBufferLen),'CB');
obj.isDoneBuffer = dlarray(zeros(1,obj.options.MaxBufferLen),'CB');
obj.bufferIdx = 0;
obj.bufferLen = 0;
end
function actor = CreateActor(obj,numObs,numAct,hid_dim,obsInfo,actInfo)
% Common body shared by the mean and standard-deviation heads
commonPath = [
featureInputLayer(numObs,Name="obsInLyr")
fullyConnectedLayer(hid_dim)
fullyConnectedLayer(hid_dim)
reluLayer(Name="comPathOutLyr")
];
% Mean head
meanPath = fullyConnectedLayer(numAct,Name="meanOutLyr");
% Standard-deviation head; softplus keeps sigma positive
stdPath = [
fullyConnectedLayer(numAct,Name="stdInLyr")
softplusLayer(Name="stdOutLyr")
];
actorNetwork = layerGraph(commonPath);
actorNetwork = addLayers(actorNetwork,meanPath);
actorNetwork = addLayers(actorNetwork,stdPath);
actorNetwork = connectLayers(actorNetwork,"comPathOutLyr","meanOutLyr/in");
actorNetwork = connectLayers(actorNetwork,"comPathOutLyr","stdInLyr/in");
actordlnet = dlnetwork(actorNetwork,'Initialize',false);
actor = initialize(actordlnet);
end
function [critic1,critic2,critic_target1,critic_target2] = CreateCritic(obj,numObs,numAct,hid_dim,obsInfo,actInfo)
% The Q-network takes the concatenated observation and action as input
criticNet = [
featureInputLayer(numObs+numAct,Name="obsInLyr")
fullyConnectedLayer(hid_dim)
fullyConnectedLayer(hid_dim)
fullyConnectedLayer(1,Name="QValueOutLyr")
];
criticNet = layerGraph(criticNet);
criticDLnet = dlnetwork(criticNet,'Initialize',false);
% Two independently initialized critics; target networks start as exact copies
critic1 = initialize(criticDLnet);
critic2 = initialize(criticDLnet);
critic_target1 = initialize(criticDLnet);
critic_target1.Learnables = critic1.Learnables;
critic_target1.State = critic1.State;
critic_target2 = initialize(criticDLnet);
critic_target2.Learnables = critic2.Learnables;
critic_target2.State = critic2.State;
end
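% Log-probability of a bounded (tanh-squashed) action a = tanh(u), u ~ N(mu,sigma):
%   log pi(a|s) = log N(atanh(a) | mu, sigma) - sum_i log(1 - a_i^2),
% with atanh(a) = 0.5*log((1+a)/(1-a)). The eps terms below guard against
% log(0) and division by zero when |a| approaches 1.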
function logP = logProbBoundedAction(obj,boundedAction,mu,sigma)
logP = sum(log(1/sqrt(2*pi)./sigma.*exp(-0.5*(0.5*...
log((1+boundedAction+eps)./(1-boundedAction+eps))-mu).^2./sigma.^2).*1./(1-boundedAction.^2+eps)),1);
end
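% Critic loss: both Q-networks regress onto the soft Bellman target
%   y = r + gamma*(1-d)*( min(Q1'(s',a'),Q2'(s',a')) - alpha*log pi(a'|s') ),
% where a' is sampled from the current policy at s'. Gradients are returned so
% the optimizer update can be applied outside of dlfeval.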
function [vLoss_1, vLoss_2, criticGrad_1, criticGrad_2] = criticLoss(obj,batchExperiences,c1,c2)
batchObs = batchExperiences{1};
batchAction = batchExperiences{2};
batchReward = batchExperiences{3};
batchNextObs = batchExperiences{4};
batchIsDone = batchExperiences{5};
gamma = obj.options.DiscountFactor;
% Sample the next action from the current policy and evaluate its log-probability
[mu,sigma] = predict(obj.actor,batchNextObs);
actionNext = tanh(mu + sigma.*randn(size(sigma)));
logP = logProbBoundedAction(obj,actionNext,mu,sigma);
% Clipped double-Q target with entropy bonus; the reward is always included,
% while the bootstrapped term is masked out on terminal steps
Qt1 = predict(obj.critic_target1,cat(1,batchNextObs,actionNext));
Qt2 = predict(obj.critic_target2,cat(1,batchNextObs,actionNext));
y = batchReward + (1 - batchIsDone).*(gamma*(min(cat(1,Qt1,Qt2),[],1) - exp(obj.log_alpha)*logP));
critic_input = cat(1,batchObs,batchAction);
Q1 = forward(c1,critic_input);
Q2 = forward(c2,critic_input);
vLoss_1 = 1/2*mean((y - Q1).^2,'all');
vLoss_2 = 1/2*mean((y - Q2).^2,'all');
criticGrad_1 = dlgradient(vLoss_1,c1.Learnables);
criticGrad_2 = dlgradient(vLoss_2,c2.Learnables);
end
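% Actor loss: maximize the entropy-regularized soft Q-value, i.e. minimize
%   L_pi = E[ alpha*log pi(a|s) - min(Q1(s,a),Q2(s,a)) ],  a ~ pi(.|s),
% using the reparameterization a = tanh(mu + sigma.*randn) so gradients flow
% through the sampled action.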
function [aLoss,actorGrad] = actorLoss(obj,batchExperiences,actor)
batchObs = batchExperiences{1};
batchSize = size(batchObs,2);
[mu,sigma] = forward(actor,batchObs);
curr_action = tanh(mu + sigma.*randn(size(sigma)));
critic_input = cat(1,batchObs,curr_action);
Q1=forward(obj.critic1,critic_input);
Q2=forward(obj.critic2,critic_input);
logP = logProbBoundedAction(obj,curr_action,mu,sigma);
aLoss = mean(-min(cat(1,Q1,Q2),[],1) + exp(obj.log_alpha) * logP,'all');
actorGrad = dlgradient(aLoss,actor.Learnables);
end
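% Temperature loss: alpha = exp(log_alpha) is adapted so the policy entropy
% tracks options.TargetEntropy. When the current entropy exceeds the target,
% the gradient step shrinks alpha, and vice versa.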
function [eLoss,entGrad] = entropyLoss(obj,batchExperiences,logAlpha)
batchObs = batchExperiences{1};
[mu,sigma] = predict(obj.actor,batchObs);
curr_action = tanh(mu + sigma.*randn(size(sigma)));
ent = mean(-logProbBoundedAction(obj,curr_action,mu,sigma));
eLoss = exp(logAlpha) * (ent - obj.options.TargetEntropy);
entGrad = dlgradient(eLoss,logAlpha);
end
end % methods
methods(Access=protected)
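% rl.agent.CustomAgent requires getActionImpl, getActionWithExplorationImpl,
% and learnImpl to be implemented; getSampleTime_ returns the agent sample time.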
function ts = getSampleTime_(obj)
ts = obj.Ts;
end
function action = getActionImpl(obj,obs)
obs = dlarray(obs{1},'CB');
[mu,~] = predict(obj.actor,obs);
% Greedy action: tanh of the mean keeps the action within bounds
action = {tanh(extractdata(mu))};
end
function action = getActionWithExplorationImpl(obj,obs)
% Accept either a cell array (as passed by the RL framework) or a raw vector
if iscell(obs)
obs = obs{1};
end
if ~isa(obs,'dlarray')
obs = dlarray(obs,'CB');
end
[mu,sigma] = predict(obj.actor,obs);
mu = extractdata(mu);
sigma = extractdata(sigma);
% Sample from the Gaussian policy and squash with tanh
action = {tanh(mu + sigma .* randn(size(sigma)))};
end
function action = getActionWithExploration_dlarray(obj,obs)
[mu,sigma] = predict(obj.actor,obs);
action = {tanh(mu + sigma .* randn(size(sigma)))};
end
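% learnImpl stores each experience in the circular replay buffer and, once the
% warm-up threshold is reached, samples minibatches, updates both critics, and
% (at the configured frequency) updates the actor, the entropy temperature, and
% the target critics via soft updates. It returns the action for the next step.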
function action = learnImpl(obj,Experience)
% Experience = {obs, action, reward, nextObs, isDone}
obs = Experience{1};
action = Experience{2};
reward = Experience{3};
nextObs = Experience{4};
isDone = logical(Experience{5});
obj.obsBuffer(:,obj.bufferIdx+1) = obs{1};
obj.actionBuffer(:,obj.bufferIdx+1) = action{1};
obj.rewardBuffer(:,obj.bufferIdx+1) = reward;
obj.nextObsBuffer(:,obj.bufferIdx+1) = nextObs{1};
obj.isDoneBuffer(:,obj.bufferIdx+1) = isDone;
obj.bufferLen = max(obj.bufferLen,obj.bufferIdx+1);
obj.bufferIdx = mod(obj.bufferIdx+1,obj.options.MaxBufferLen);
if obj.bufferLen>=max(obj.options.WarmUpSteps,obj.options.MiniBatchSize)
obj.counter = obj.counter + 1;
if (obj.options.LearningFrequency==-1 && isDone) || ...
(obj.options.LearningFrequency>0 && mod(obj.counter,obj.options.LearningFrequency)==0)
for gstep = 1:obj.options.NumGradientStepsPerUpdate
batchSize = obj.options.MiniBatchSize;
batchInd = randperm(obj.bufferLen,batchSize);
% Assemble the sampled minibatch in the same order as an Experience tuple
batchExperience = {...
obj.obsBuffer(:,batchInd),...
obj.actionBuffer(:,batchInd),...
obj.rewardBuffer(:,batchInd),...
obj.nextObsBuffer(:,batchInd),...
obj.isDoneBuffer(:,batchInd)...
};
[cLoss1,cLoss2,criticGrad_1,criticGrad_2] = dlfeval(@(x,c1,c2)obj.criticLoss(x,c1,c2),batchExperience,obj.critic1,obj.critic2);
obj.cLoss = min(extractdata(cLoss1),extractdata(cLoss2));
[obj.critic1.Learnables.Value,obj.criticOptimizer_1] = update(obj.criticOptimizer_1,obj.critic1.Learnables.Value,criticGrad_1.Value);
[obj.critic2.Learnables.Value,obj.criticOptimizer_2] = update(obj.criticOptimizer_2,obj.critic2.Learnables.Value,criticGrad_2.Value);
if (mod(obj.counter,obj.options.PolicyUpdateFrequency)==0 && obj.options.LearningFrequency==-1) ||...
(mod(obj.counter,obj.options.LearningFrequency * obj.options.PolicyUpdateFrequency)==0 ...
&& obj.options.LearningFrequency>0)
[aloss,actorGrad] = dlfeval(...
@(x,actor)obj.actorLoss(x,actor),...
batchExperience,obj.actor);
obj.aLoss = extractdata(aloss);
[obj.actor.Learnables.Value,obj.actorOptimizer] = update(obj.actorOptimizer,obj.actor.Learnables.Value,actorGrad.Value);
[eloss,entGrad] = dlfeval(@(x,alpha)obj.entropyLoss(x,alpha),batchExperience,obj.log_alpha);
obj.eLoss = extractdata(eloss);
[obj.log_alpha,obj.entWgtOptimizer] = update(obj.entWgtOptimizer,{obj.log_alpha},{entGrad});
obj.log_alpha = obj.log_alpha{1};
% Soft (Polyak) update of both target critics
critic1_params = obj.critic1.Learnables.Value;
critic_target1_params = obj.critic_target1.Learnables.Value;
for i = 1:size(critic1_params,1)
obj.critic_target1.Learnables.Value{i} = obj.options.TargetSmoothFactor * critic1_params{i}...
+ (1 - obj.options.TargetSmoothFactor) * critic_target1_params{i};
end
critic2_params = obj.critic2.Learnables.Value;
critic_target2_params = obj.critic_target2.Learnables.Value;
for i = 1:size(critic2_params,1)
obj.critic_target2.Learnables.Value{i} = obj.options.TargetSmoothFactor * critic2_params{i}...
+ (1 - obj.options.TargetSmoothFactor) * critic_target2_params{i};
end
end % delayed actor/alpha/target update
end % for gstep
end % learning trigger
end % warm-up check
% Return an exploratory action for the next environment step
action = getActionWithExplorationImpl(obj,nextObs{1});
end % learnImpl
end % methods (Access = protected)
end % classdef MySACAgent
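% Training script (kept in a separate file from MySACAgent.m): load the model
% parameters, build the Simulink environment, construct the agent, set the
% training options, and train while logging the agent losses.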
run('init_car_params.m');
obsInfo = rlNumericSpec([numObs 1]);
actInfo = rlNumericSpec([numAct 1]);
env = rlSimulinkEnv(mdl,blk,obsInfo,actInfo);
params=struct('rw_radius',rw_radius,'a',a,'b',b,'init_vx',init_vx,'init_yaw_rate',init_yaw_rate);
env.ResetFcn = @(in) PriusResetFcn(in,params,mdl);
agent = createNetworks(rnd_seed,numObs,numAct,obsInfo,actInfo,Ts);
options=getDWMLAgentOptions();
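% getDWMLAgentOptions is expected to return a struct (or object) exposing the
% fields read by MySACAgent: WarmUpSteps, MiniBatchSize, MaxBufferLen,
% OptimizerOptions (cell of four rlOptimizerOptions: actor, critic 1, critic 2,
% entropy weight), EntropyLossWeight, DiscountFactor, TargetEntropy,
% LearningFrequency, NumGradientStepsPerUpdate, PolicyUpdateFrequency, and
% TargetSmoothFactor.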
agent = MySACAgent(numObs,numAct,obsInfo,actInfo,hid_dim,Ts,options);
saveAgentDir = ['savedAgents/',algorithm,'/',num2str(run_idx)];
trainOpts = rlTrainingOptions(...
MaxEpisodes=maxEpisodes, ...
MaxStepsPerEpisode=maxSteps, ...
ScoreAveragingWindowLength=100, ...
Plots="training-progress", ...
StopTrainingCriteria="AverageReward", ...
UseParallel=useParallel,...
SaveAgentCriteria='EpisodeReward',...
SaveAgentDirectory=saveAgentDir);
set_param(mdl,"FastRestart","off");
set_param(mdl, SimMechanicsOpenEditorOnUpdate="off");
set_param(mdl, SimMechanicsOpenEditorOnUpdate="on");
monitor = trainingProgressMonitor();
logger = rlDataLogger(monitor);
logger.EpisodeFinishedFcn = @myEpisodeLoggingFcn;
trainResult = train(agent,env,trainOpts,Logger=logger);
function dataToLog = myEpisodeLoggingFcn(data)
% Forward the losses stored on the custom agent to the training monitor
dataToLog.criticLoss = data.Agent.cLoss;
dataToLog.actorLoss = data.Agent.aLoss;
dataToLog.entLoss = data.Agent.eLoss;
end