Custom deep reinforcement learning training loop: the code reports no errors, but training never makes progress, as if the experience data cannot be read.
rng(0)
env = rlPredefinedEnv('CartPoleSimscapeModel-Continuous');
%Extract the observation and action specifications from the environment.
obsInfo = getObservationInfo(env);
actInfo = getActionInfo(env);
%Obtain the number of observations (numObs) and actions (numAct).
numObs = obsInfo.Dimension(1);
numAct = actInfo.Dimension(1);
%Set the agent sample time and total simulation time
Ts = 0.01;
Tf = 20;
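% (Optional check, not required for training: display the specs that the
% network input sizes and the actor scaling layer below are derived from.)
disp(obsInfo.Dimension)    % observation dimensions reported by the environment
disp(actInfo.UpperLimit)   % action upper bound used by the scaling layer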
%%
%network
criticLayerSizes = [128 200];
actorLayerSizes = [128 200];
%createNetworkWeights;
statePath = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(128,'Name','CriticStateFC1')
    reluLayer('Name','CriticRelu1')
    fullyConnectedLayer(200,'Name','CriticStateFC2')];
actionPath = [
    featureInputLayer(1,'Normalization','none','Name','action')
    fullyConnectedLayer(200,'Name','CriticActionFC1','BiasLearnRateFactor',0)];
commonPath = [
    additionLayer(2,'Name','add')
    reluLayer('Name','CriticCommonRelu')
    fullyConnectedLayer(1,'Name','CriticOutput')];
criticNetwork = layerGraph(statePath);
criticNetwork = addLayers(criticNetwork,actionPath);
criticNetwork = addLayers(criticNetwork,commonPath);
criticNetwork = connectLayers(criticNetwork,'CriticStateFC2','add/in1');
criticNetwork = connectLayers(criticNetwork,'CriticActionFC1','add/in2');
criticNetwork = dlnetwork(criticNetwork);
% Create the critic function approximator.
criticOptions = rlOptimizerOptions('LearnRate',1e-03,'GradientThreshold',1);
critic = rlQValueFunction(criticNetwork,obsInfo,actInfo);
criticOptimizer = rlOptimizer(criticOptions);
%ActorNetwork
actorNetwork = [
    featureInputLayer(numObs,'Normalization','none','Name','observation')
    fullyConnectedLayer(128,'Name','ActorFC1')
    reluLayer('Name','ActorRelu1')
    fullyConnectedLayer(200,'Name','ActorFC2')
    reluLayer('Name','ActorRelu2')
    fullyConnectedLayer(1,'Name','ActorFC3')
    tanhLayer('Name','ActorTanh1')
    scalingLayer('Name','ActorScaling','Scale',max(actInfo.UpperLimit))];
actorNetwork = dlnetwork(actorNetwork);
% Create the actor function approximator.
actorOptions = rlOptimizerOptions('LearnRate',5e-04,'GradientThreshold',1);
actor = rlContinuousDeterministicActor(actorNetwork,obsInfo,actInfo);
actorOptimizer  = rlOptimizer(actorOptions);
policy = rlDeterministicActorPolicy(actor);
agentOptions = rlDDPGAgentOptions(...
    'SampleTime',Ts,...
    'ActorOptimizerOptions',actorOptions,...
    'CriticOptimizerOptions',criticOptions,...
    'ExperienceBufferLength',1e6,...
    'MiniBatchSize',128);
agentOptions.NoiseOptions.Variance = 0.4;
agentOptions.NoiseOptions.VarianceDecayRate = 1e-5;
agent = rlDDPGAgent(actor,critic,agentOptions);
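% (Quick shape check before the custom loop, a minimal sketch that is not part
% of the training flow: getAction/getValue expect cell-array inputs shaped
% like the specs, which is also how the mini-batches are passed later on.)
obsSample = {rand(obsInfo.Dimension)};
actSample = {rand(actInfo.Dimension)};
getAction(actor,obsSample)              % 1x1 cell containing the scalar action
getValue(critic,obsSample,actSample)    % scalar Q-value estimate
getAction(agent,obsSample)              % action from the agent's current policy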
%%
%Create the experience buffer
myBuffer.bufferSize = 500;
myBuffer.bufferIndex = 0;
myBuffer.currentBufferLength = 0;
myBuffer.observations = zeros(numObs,1,myBuffer.bufferSize);
myBuffer.nextObservation = zeros(numObs,1,myBuffer.bufferSize);
myBuffer.action = zeros(numAct,1,myBuffer.bufferSize);
myBuffer.reward = zeros(1,myBuffer.bufferSize);
myBuffer.isDone = zeros(1,myBuffer.bufferSize);
%processExpData structure
processExpData.Critic = critic;
processExpData.TargetCritic = critic;
processExpData.Actor = actor;
processExpData.TargetActor = actor;
processExpData.MyBuffer = myBuffer;
processExpData.CriticOptimizer = criticOptimizer;
processExpData.ActorOptimizer = actorOptimizer;
processExpData.MiniBatchSize = 128;
processExpData.DiscountFactor = 0.99;
processExpData.TargetSmoothFactor = 1e-3;
maxEpisodes = 1000;
maxSteps = ceil(Tf/Ts);
trainingTerminationValue = 480;
aveWindowSize = 100;                  % moving-average window used by movmean below
episodeCumulativeRewardVector = [];   % accumulates episode rewards for plotting
[trainingPlot,lineReward,lineAveReward] = hBuildFigure; % build the training progress figure
% Enable the training visualization plot.
set(trainingPlot,'Visible','on');
%%
%train
doTraining = true;
if doTraining
    % Training loop
    for i = 1:maxEpisodes
        % update actor, critic
        agent = setActor(agent,actor);
        agent = setCritic(agent,critic);
        out=sim(agent, env);
        myBuffer.observations=out.Observation.observations.Data(:,:,1:myBuffer.bufferSize-1);
        myBuffer.nextObservation = out.Observation.observations.Data(:,:,2:myBuffer.bufferSize);
        myBuffer.action = out.Action.force.Data;
        myBuffer.reward = out.Reward.Data'; % transpose to a row vector
        myBuffer.isDone = out.IsDone.Data';
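        % (Sanity check of the logged data, assuming the signal names
        % "observations" and "force" above match the Simulink model: the
        % observation log holds one sample more than the action/reward logs,
        % which is why it is indexed with 1:bufferSize-1 and 2:bufferSize above.)
        % size(out.Observation.observations.Data)   % numObs x 1 x (nSteps+1)
        % size(out.Action.force.Data)               % numAct x 1 x nSteps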
        %miniBatch 
        BatchSize.observations=myBuffer.observations(:,:,1:processExpData.MiniBatchSize);
        BatchSize.nextObservation=myBuffer.nextObservation(:,:,1:processExpData.MiniBatchSize);
        BatchSize.action=myBuffer.action(:,:,1:processExpData.MiniBatchSize);
        BatchSize.reward = myBuffer.reward(:,1:processExpData.MiniBatchSize);
        BatchSize.isDone = myBuffer.isDone(:,1:processExpData.MiniBatchSize);
        BatchSize.nextObs{1}=BatchSize.nextObservation;
        BatchSize.obs{1}=BatchSize.observations;
        for epoch=1:maxSteps
            if ~isempty(BatchSize)
            % Update network parameters using the mini-batch.
            [processExpData,actorParams] = learnFcn(processExpData,BatchSize);
            % Update the policy parameters using the actor parameters.
            policy = setLearnableParameters(policy,actorParams);
            end
        end
        % Extract the critic and actor networks from processExpData.
        critic = processExpData.Critic;
        actor  = processExpData.Actor;
        % Extract the cumulative reward and calculate average reward 
        % per step for this episode.
        episodeCumulativeReward = sum(BatchSize.reward);
        episodeCumulativeRewardVector = cat(2,...
          episodeCumulativeRewardVector,episodeCumulativeReward);
        movingAveReward = movmean(episodeCumulativeRewardVector,...
         aveWindowSize,2);
        addpoints(lineReward,i,episodeCumulativeReward);
        addpoints(lineAveReward,i,movingAveReward(end));
        drawnow;
        if max(movingAveReward) > trainingTerminationValue
           break
        end
    end
end
% %plot env
% obs = reset(env);
% plot(env);
% for stepCt = 1:maxSteps
%     
%     % Select action according to trained policy
%     action = getAction(actor,{obs});
%         
%     % Step the environment
%     [nextObs,reward,isdone] = step(env,action{1});
%     
%     % Check for terminal condition
%     if isdone
%         break
%     end
%     
%     obs = nextObs;
%     
% end
%%
function [processExpData,actorParams] = learnFcn(processExpData,BatchSize)
% Find the terminal experiences.
doneidx = (BatchSize.isDone == 1);
% Compute target next actions against the next observations.
nextAction = evaluate(processExpData.TargetActor,BatchSize.nextObs); % (observations must be passed as a cell array)
% compute qtarget = reward + gamma*Q(nextObservation,nextAction)
%                 = reward + gamma*expectedFutureReturn
targetq = BatchSize.reward;
% Bootstrap the target at nonterminal experiences.
expectedFutureReturn = ...
    getValue(processExpData.TargetCritic,BatchSize.nextObs,nextAction);
targetq(~doneidx) = targetq(~doneidx) + ...
    processExpData.DiscountFactor.*expectedFutureReturn(~doneidx);
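% (Worked example of the bootstrap above with DiscountFactor = 0.99:
%  nonterminal sample: reward = 1, Q(s',a') = 50  ->  targetq = 1 + 0.99*50 = 50.5
%  terminal sample:    reward = 1                 ->  targetq = 1)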
% Compute critic gradient using deepCriticLoss function.
criticGradient = gradient(processExpData.Critic,@deepCriticLoss,...
    [BatchSize.obs,BatchSize.action],targetq);
% Update the critic parameters.
[processExpData.Critic,processExpData.CriticOptimizer] = update(...
    processExpData.CriticOptimizer,processExpData.Critic,...
    criticGradient);
% Compute the actor gradient using the deepActorGradient function. To
% accelerate the deepActorGradient function, the critic network is
% extracted outside the function and is passed in as a field to the
% actorGradData input struct.
actorGradData.CriticNet = getModel(processExpData.Critic);
actorGradData.MiniBatchSize = processExpData.MiniBatchSize;
actorGradient = gradient(processExpData.Actor,@deepActorGradient,...
    BatchSize.obs,actorGradData);
% Update the actor parameters.
[processExpData.Actor,processExpData.ActorOptimizer] = update(...
    processExpData.ActorOptimizer,processExpData.Actor,...
    actorGradient);
actorParams = getLearnableParameters(processExpData.Actor);
% Update targets using the given TargetSmoothFactor hyperparameter.
processExpData.TargetCritic = syncParameters(processExpData.TargetCritic,...
    processExpData.Critic,processExpData.TargetSmoothFactor);
processExpData.TargetActor  = syncParameters(processExpData.TargetActor ,...
    processExpData.Actor ,processExpData.TargetSmoothFactor);
end
function loss = deepCriticLoss(q,targetq)
q = q{1};
% Loss is the half mean-squared error of q = Q(observation,action)
% against targetq.
loss = mse(q,reshape(targetq,size(q)));
end
function dQdTheta = deepActorGradient(actorNet,observation,gradData)
% Evaluate actions from current observations.
action = forward(actorNet,observation{:});
% Compute: q = Q(s,a)
q = predict(gradData.CriticNet,observation{:},action);
% Compute: qsum = -sum(q)/N to maximize q
qsum = -sum(q,"all")/gradData.MiniBatchSize;
% Compute: d(-sum(q)/N)/dActorParams
dQdTheta = dlgradient(qsum,actorNet.Learnables);
end
function [trainingPlot, lineReward, lineAveReward] = hBuildFigure()
    plotRatio = 16/9;
    trainingPlot = figure(...
                'Visible','off',...
                'HandleVisibility','off', ...
                'NumberTitle','off',...
                'Name','Cart Pole Custom Training');
    trainingPlot.Position(3) = plotRatio * trainingPlot.Position(4);
    ax = gca(trainingPlot);
    lineReward = animatedline(ax);
    lineAveReward = animatedline(ax,'Color','r','LineWidth',3);
    xlabel(ax,'Episode');
    ylabel(ax,'Reward');
    legend(ax,'Cumulative Reward','Average Reward','Location','northwest')
    title(ax,'Training Progress');
end
When the training appears stuck and I stop it manually, MATLAB prints the following stack trace:

Operation terminated by user during deep.internal.recording.convert.tapeToFunction
In deep.AcceleratedFunction>iGenerateBackwardFunctionNoCleanup (line 637)
[backwardFun, backwardFileName] = deep.internal.recording.convert.tapeToFunction(tape, backwardInputIDs, gradIDs);
In deep.AcceleratedFunction>iGenerateBackwardFunction (line 603)
[backwardFun, backwardFileName] = iGenerateBackwardFunctionNoCleanup(args,numIntermediateAdjointsToDrop);
In deep.AcceleratedFunction/augmentWithBackwardFunctions (line 467)
                [fullBackwardFun, fullBackwardFileName] = iGenerateBackwardFunction(args, 0);
In deep.AcceleratedFunction/generateForward (line 442)
                    fun = augmentWithBackwardFunctions(obj, args, numIntermediates, generatedCode);
In  ()  (line 262)
            [cacheData, varargout, illegalOutputs] = generateForward(obj, varargout, inputNodes, tm, priorTapeCount, isTracing);
In nnet.internal.cnn.layer.CodegenFusedLayer/evaluate (line 153)
                [Z{1:nout}] = trainingFun(X, this.Learnables, this.State);
In nnet.internal.cnn.layer.CodegenFusedLayer/predict (line 75)
            [varargout{1:nargout}] = evaluate(this, X, @predictPropagate, this.PredictTrainingFcn, this.PredictInferenceCache);
In nnet.internal.cnn.layer.GraphExecutor>iPredictWithoutState (line 407)
    out = predict(layer, in);
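The run seems to stop inside the accelerated-function generation for the fused network layers (the tapeToFunction call at the top of the trace). As a next step I would time a single batched critic evaluation outside the loop (a minimal sketch using the critic and specs defined above; the batch size 128 matches MiniBatchSize) to see whether the very first call is only slow while MATLAB generates the accelerated code, rather than truly stuck:

batchObs = {rand([obsInfo.Dimension 128])};
batchAct = {rand([actInfo.Dimension 128])};
tic; getValue(critic,batchObs,batchAct); toc   % first call includes code generation
tic; getValue(critic,batchObs,batchAct); toc   % later calls should be much faster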