% Stop training once the average reward reaches the target value
stoptrainingcriteria = "AverageReward";
stoptrainingvalue = 2000000;
% Observation specification: a numObs-by-1 continuous signal
obsInfo = rlNumericSpec([numObs 1]);
obsInfo.Name = 'observations';
% Action specification: a scalar continuous action named 'alfa'
% (add 'LowerLimit'/'UpperLimit' name-value pairs here if the action is bounded)
ActionInfo = rlNumericSpec([1 1]);
ActionInfo.Name = 'alfa';
% Create the Simulink environment and attach the reset function
env = rlSimulinkEnv(mdl,blk,obsInfo,ActionInfo);
env.ResetFcn = @(in) resetfunction(in, mdl);  % resetfunction is defined elsewhere (a minimal sketch is at the end of this script)
% Create a default DDPG agent; each hidden layer in the default networks has 32 units
initOpts = rlAgentInitializationOptions('NumHiddenUnit',32);
agent = rlDDPGAgent(obsInfo, ActionInfo, initOpts);
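% Optional sanity check (not part of the original workflow): the default actor
% and critic networks created by the initialization options can be inspected
% with the standard getActor/getCritic and getModel calls.
actorNet = getModel(getActor(agent));
criticNet = getModel(getCritic(agent));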
% Ornstein-Uhlenbeck exploration noise and n-step lookahead settings
agent.AgentOptions.NoiseOptions.MeanAttractionConstant = 1/30;
agent.AgentOptions.NoiseOptions.StandardDeviation = 41;
agent.AgentOptions.NoiseOptions.StandardDeviationDecayRate = 0.00001;
agent.AgentOptions.NumStepsToLookAhead = 32;
% Critic and actor optimizer settings
agent.AgentOptions.CriticOptimizerOptions.LearnRate = 1e-03;
agent.AgentOptions.CriticOptimizerOptions.GradientThreshold = 1;
agent.AgentOptions.ActorOptimizerOptions.LearnRate = 1e-04;
agent.AgentOptions.ActorOptimizerOptions.GradientThreshold = 1;
% Training options
opt = rlTrainingOptions(...
    'MaxEpisodes', epochs,...
    'MaxStepsPerEpisode', 1000,...
    'StopTrainingCriteria', stoptrainingcriteria,...
    'StopTrainingValue', stoptrainingvalue,...
    'Plots', "training-progress");
% Train the agent, then generate a standalone policy evaluation function
trainResults = train(agent,env,opt);
generatePolicyFunction(agent);
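% Rough usage sketch: generatePolicyFunction writes evaluatePolicy.m and
% agentData.mat to the current folder; the generated function can then be
% called with an observation column vector (the zero vector below is only a
% placeholder observation).
exampleObs = zeros(numObs, 1);
exampleAction = evaluatePolicy(exampleObs);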
% Extract the deterministic (greedy) and noisy (exploration) policies from the trained agent
policy1 = getGreedyPolicy(agent);
policy2 = getExplorationPolicy(agent);
% Evaluate both policies over a grid of observations (x_values is assumed to be defined earlier)
actions1 = zeros(length(x_values), 1);
actions2 = zeros(length(x_values), 1);
for i = 1:length(x_values)
    actions1(i) = cell2mat(getAction(policy1, x_values(i)));
    actions2(i) = cell2mat(getAction(policy2, x_values(i)));
end
% Plot the exploration and greedy action profiles on the same axes
figure; hold on
plot(x_values, actions2);
plot(x_values, actions1, 'LineWidth', 2);
hold off
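% The environment reset above relies on a resetfunction helper that is not
% shown in this script. The local function below is only a minimal sketch of
% what it might look like, assuming the reset randomizes a single
% initial-condition variable (hypothetically named 'x0') in the model
% workspace; adapt the variable name and range to the actual model.
function in = resetfunction(in, mdl)
    % 'in' is the Simulink.SimulationInput object provided by the environment
    x0 = -1 + 2*rand;                                  % placeholder range [-1, 1]
    in = setVariable(in, 'x0', x0, 'Workspace', mdl);  % write into the model workspace
end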