When I use the Reinforcement Learning Toolbox to train an SAC agent, the policy keeps oscillating between the upper and lower action limits and never explores effectively, whereas DDPG and PPO agents are able to explore reasonably well. What is the reason for this?

14 views (last 30 days)
希
2024-8-31
Answered: halleyhit, 2024-9-2
% main
% Define the observation and action spaces
% numObs = 11; % observation space dimension
% numAct = 4;  % action space dimension
numObs1 = 7; % observation space dimension
numAct1 = 3; % action space dimension
% Action order: BS, EB, CL
actLowerLimit = [-100 ;-200 ; -50];
actUpperLimit = [100 ; 200 ; 50];
obsInfo = rlNumericSpec([numObs1 1]);
obsInfo.Name = 'ObservationSac1';
% Continuous action space
actInfo = rlNumericSpec([numAct1 1],...
'LowerLimit',actLowerLimit,...
'UpperLimit',actUpperLimit);
actInfo.Name = 'ActionSac';
% Create the reinforcement learning environment
env = rlFunctionEnv(obsInfo, actInfo, 'sacStepFunction', 'sacResetFunction');
% Network size parameters
criticLayerSizes = [64 32];
actorLayerSizes = [64 32];
% Critic networks: an SAC critic takes both the observation and the action as inputs and outputs a single Q-value, so each critic network has two input paths.
% Observation input path
obsPath = [
featureInputLayer(numObs1, Name="obsPathInLyr")
fullyConnectedLayer(criticLayerSizes(1))
reluLayer
fullyConnectedLayer(criticLayerSizes(1),Name="obsout")
];
% Action input path
actPath = [
featureInputLayer(numAct1, Name="actPathInLyr")
fullyConnectedLayer(criticLayerSizes(1))
reluLayer
fullyConnectedLayer(criticLayerSizes(1),Name="actout")
];
% Merged (common) path
comPath = [
concatenationLayer(1,2,Name="cct")
fullyConnectedLayer(criticLayerSizes(2))
reluLayer
fullyConnectedLayer(1, Name="output")
];
% Assemble the critic network
criticNetwork = dlnetwork();
criticNetwork = addLayers(criticNetwork,obsPath);
criticNetwork = addLayers(criticNetwork,actPath);
criticNetwork = addLayers(criticNetwork,comPath);
criticNetwork = connectLayers(criticNetwork,"obsout","cct/in1");
criticNetwork = connectLayers(criticNetwork,"actout","cct/in2");
critic11 = rlQValueFunction(criticNetwork,obsInfo,actInfo, ...
ActionInputNames="actPathInLyr", ...
ObservationInputNames="obsPathInLyr");
critic12 = rlQValueFunction(criticNetwork,obsInfo,actInfo, ...
ActionInputNames="actPathInLyr", ...
ObservationInputNames="obsPathInLyr");
% Actor network: the SAC actor takes the observation as input and outputs the action mean and standard deviation, i.e. a single-input, two-output network
% Input path
inPath = [
featureInputLayer( ...
numObs1, ...
Name="netOin")
reluLayer
fullyConnectedLayer( ...
actorLayerSizes(1), ...
Name="nethid")
reluLayer
fullyConnectedLayer( ...
actorLayerSizes(2), ...
Name="infc")
];
meanPath = [
% tanhLayer(Name="tanhMean");
fullyConnectedLayer(numAct1,Name="FCMean");
% scalingLayer(Name="scale", ...
% Scale=actUpperLimit),
];
sdevPath = [
reluLayer(Name="reluStdv");
fullyConnectedLayer(numAct1,Name="FCStdv");
softplusLayer(Name="splus")
];
actorNetwork = dlnetwork();
actorNetwork = addLayers(actorNetwork,inPath);
actorNetwork = addLayers(actorNetwork,meanPath);
actorNetwork = addLayers(actorNetwork,sdevPath);
% actorNetwork = connectLayers(actorNetwork,"infc","tanhMean/in");
% actorNetwork = connectLayers(actorNetwork,"infc","tanhStdv/in");
actorNetwork = connectLayers(actorNetwork,"infc","FCMean/in");
actorNetwork = connectLayers(actorNetwork,"infc","reluStdv/in");
% Create the stochastic (Gaussian-sampling) actor
actor1 = rlContinuousGaussianActor(actorNetwork, obsInfo, actInfo, ...
ActionMeanOutputNames="FCMean",...
ActionStandardDeviationOutputNames="splus",...
ObservationInputNames="netOin");
act = getAction(actor1,{rand(obsInfo.Dimension)});
dist = evaluate(actor1,{rand(obsInfo.Dimension)});
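% --- Added sketch (not in the original post): dist above holds the raw network
% outputs of the untrained actor; inspecting them can help diagnose the
% bound-saturation issue described in the question. The output order (mean first,
% standard deviation second) is assumed here to follow the network output order.
% disp(dist{1})  % action mean per channel
% disp(dist{2})  % action standard deviation per channel; compare against the
%                % action ranges (200, 400, 100) to judge the exploration scale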
% Critic optimizer settings
criticOptions = rlOptimizerOptions( ...
Optimizer="adam", ...
LearnRate=1e-3,...
GradientThreshold=1, ...
L2RegularizationFactor=2e-4);
% Actor optimizer settings
actorOptions = rlOptimizerOptions( ...
Optimizer="adam", ...
LearnRate=1e-3,...
GradientThreshold=1, ...
L2RegularizationFactor=1e-5);
% SAC agent options
sacOptions = rlSACAgentOptions(...
'TargetSmoothFactor',1e-3,... % target network smoothing factor
'ExperienceBufferLength',5000,... % experience buffer length
'MiniBatchSize',256,... % mini-batch size
'DiscountFactor',0.99,... % discount factor
'SampleTime',1,... % sample time
'CriticOptimizerOptions', criticOptions,...
'ActorOptimizerOptions',actorOptions);
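% --- Added sketch (not in the original post): SAC exploration is driven by the
% entropy term, so the entropy-weight options are worth reviewing when the policy
% sits on the action limits. The property names follow rlSACAgentOptions /
% EntropyWeightOptions; the values below are illustrative assumptions, not tuned
% recommendations.
% sacOptions.EntropyWeightOptions.EntropyWeight = 1;        % initial entropy weight
% sacOptions.EntropyWeightOptions.TargetEntropy = -numAct1; % common heuristic: -dim(action)
% sacOptions.EntropyWeightOptions.LearnRate     = 3e-4;     % entropy-weight learning rate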
% Create the SAC agent
agent1 = rlSACAgent(actor1,[critic11 critic12],sacOptions);
% Training options
trainOpts = rlTrainingOptions(...
'MaxEpisodes',500,... % maximum number of training episodes
'MaxStepsPerEpisode',96,... % maximum steps per episode
'Verbose',true,... % print detailed training information at the command line
'Plots','training-progress',... % show the training progress plot
'StopTrainingCriteria','AverageReward',... % stop-training criterion
'StopTrainingValue',0,... % average reward at which training stops
'ScoreAveragingWindowLength',10,... % window length for averaging the reward
'SaveAgentCriteria',"EpisodeReward",... % agent-saving criterion
'SaveAgentValue',0); % episode reward at which the agent is saved
% % Alternative single-agent training options
% trainOpts = rlTrainingOptions(...
% Plots='training-progress',...
% MaxEpisodes=500,...
% MaxStepsPerEpisode=96,...
% ScoreAveragingWindowLength=10,...
% StopTrainingCriteria="AverageReward", ...
% StopTrainingValue=0);
% %"LearningStrategy","decentralized",...
% % 'Verbose',true, ...
% Train the agent
result = train(agent1,env,trainOpts);
%% Test
agent=agent_Trained; % agent_Trained is not defined in this script; it is assumed to be a previously trained/saved agent
%agent=agent1;
simSteps = 200;
simOptions = rlSimulationOptions('MaxSteps',simSteps);
experience = sim(env,agent,simOptions);
simActionSeries = experience.Action.ActionSac.Data;
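% --- Added sketch (not in the original post): plot each simulated action channel
% against its limits to see how often the policy sits on a bound. This assumes
% simActionSeries has size [numAct1 1 T], as logged above.
% figure
% for k = 1:numAct1
%     subplot(numAct1,1,k)
%     plot(squeeze(simActionSeries(k,1,:)))            % k-th action over the run
%     yline(actLowerLimit(k)); yline(actUpperLimit(k)) % action bounds
% end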
%STEPFUNCTION
function [NextObs,Reward,IsDone,LoggedSignals] = sacStepFunction(Action,LoggedSignals)
Q_BS=500;
% Load the one-day electrical load demand of each of the three integrated energy systems
load('load_e.mat');
LOAD_EE = 0.6*transpose(load_e);
LOAD_EE1 = LOAD_EE(1:96);
LOAD_EE2 = LOAD_EE(97:192);
LOAD_EE3 = LOAD_EE(193:288);
% Time-of-use electricity purchase price, expanded to 96 points
Power_B1=[0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77 0.77 0.77];
Power_Buy=zeros(1,96);
for n=1:24
Power_Buy(4*n-3:4*n)=Power_B1(n);
end
% PV output, 15-minute resolution
load('PV.mat');
PV1_96 = transpose(PV(:,1));
PV2_96 = transpose(PV(:,2));
PV3_96 = transpose(PV(:,3));
%% EB and CHP
load('PSO_data.mat')
CHP1 = PSO_data(:,1);
CHP2 = PSO_data(:,2);
CHP3 = PSO_data(:,3);
EB1 = PSO_data(:,4);
EB2 = PSO_data(:,5);
EB3 = PSO_data(:,6);
% Command delay (s), 96x3
delay11 = 2*ones(96,1);
delay12 = 2*ones(96,1);
delay13 = 2*ones(96,1);
% BS, EB, BUY, CL, TR
%action_space = rlNumericSpec([5 1], 'LowerLimit', action_lowerlimits, 'UpperLimit', action_upperlimits);
% Current state (per agent): T_solt; LOAD_E; Power_buy; PV; CHP; EB; delay
State = LoggedSignals.State;
% Debug information
% disp(class(Action));
% disp(size(Action));
disp(Action);
% Update the state of each agent
NextObs = zeros(size(State));
Reward = 0;
for agent_i = 1:1
T_solt = State(1,agent_i);
LOAD_E = State(2,agent_i);
Power_buy = State(3,agent_i);
PV = State(4,agent_i);
CHP = State(5,agent_i);
EB = State(6,agent_i);
delay = State(7,agent_i);
T_next = T_solt + 1;
LOAD_E_next = LOAD_EE1(T_next);
Power_buy_next = Power_Buy(T_next);
PV_next = PV1_96(T_next);
CHP_next = CHP1(T_next);
EB_next = EB1(T_next);
delay_next = delay11(T_next);
% Update the state
NextObs(:, agent_i) = [T_next; LOAD_E_next; Power_buy_next; PV_next; CHP_next; EB_next; delay_next];
% Reward
% Regulation cost
% [BS EB CL]
LOAD_real= PV(1) + 0.9 * CHP -(EB/0.95 - Action(2)) + Action(3) + Action(1);
% Power-balance term
BUY = LOAD_E(1) - LOAD_real;
COST = (0.5 * abs(Action(1)) + 0.2 * abs(Action(2)) + Power_buy * BUY + 0.5 * abs(Action(3)) );
% Delay-deviation cost (disabled below)
% if BUY<=500 && BUY>0
% Penalty_local = 0.5*BUY;
% elseif BUY>500 && BUY<=1000
% Penalty_local = 2*BUY;
% else
% Penalty_local = 5*BUY;
% end
%Penalty_local = LOAD_E(1) - PV(1) + Action(1) + Action(2) - Action(3) - Action(4) - 0.8 * CHP + EB / 0.95; % positive means net consumption
% if abs(Penalty_local)<=80
% Penalty_local = 1*abs(Penalty_local);
% elseif abs(Penalty_local)<=120
% Penalty_local = 2*abs(Penalty_local);
% elseif abs(Penalty_local)>120
% Penalty_local = 3*abs(Penalty_local);
% else
% Penalty_local = 0;
% end
%Penalty_local
% % Global power-balance constraint
% Penalty_global = 100*(sum(LOAD_E) -sum(PV) + sum(Action(1)) + sum(Action(2)) - sum(Action(3)) - sum(Action(4)) - sum(Action(5))) ;
% if Penalty_global>300 || Penalty_local<-300
% Penalty_global=100;
% else
% Penalty_global = 0;
% end
% Power-exchange constraint
% Penalty_Pt = 100*sum(Action(5,:));
% Reward = -COST- Penalty_local;
Reward = -COST;
end
LoggedSignals.State = NextObs;
LoggedSignals.action=Action;
NextObs = mat2cell(NextObs, 7, 1);
% Check whether the episode has finished
%T_next
IsDone=(T_next >= 96);
end
%RESET
% Environment reset function
function [InitialObservation, LoggedSignal] = sacResetFunction() % reset the RL environment
% Load the one-day electrical load demand of each of the three integrated energy systems
load('load_e.mat');
LOAD_E = 0.6*transpose(load_e);
LOAD_E1 = LOAD_E(1:96);
LOAD_E2 = LOAD_E(97:192);
LOAD_E3 = LOAD_E(193:288);
% Time-of-use electricity purchase price, expanded to 96 points
Power_B=[0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.29 0.77 0.77 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 1.19 0.77 0.77 0.77 0.77 0.77];
Power_buy=zeros(1,96);
for n=1:24
Power_buy(4*n-3:4*n)=Power_B(n);
end
% PV output, 15-minute resolution
load('PV.mat');
PV_1 = transpose(PV(:,1));
PV_2 = transpose(PV(:,2));
PV_3 = transpose(PV(:,3));
%% Command delay (s), 96x3
delay1 = 2*ones(96,1);
delay2 = 2*ones(96,1);
delay3 = 2*ones(96,1);
%% EB and CHP
load('PSO_data.mat')
CHP1 = PSO_data(:,1);
CHP2 = PSO_data(:,2);
CHP3 = PSO_data(:,3);
EB1 = PSO_data(:,4);
EB2 = PSO_data(:,5);
EB3 = PSO_data(:,6);
% Initialization
T_solt = 1;
LOAD_E1 = LOAD_E1(1);
LOAD_E2 = LOAD_E2(1);
LOAD_E3 = LOAD_E3(1);
Power_buy = Power_buy(1);
PV_1 = PV_1(1);
PV_2 = PV_2(1);
PV_3 = PV_3(1);
delay1 = delay1(1);
delay2 = delay2(1);
delay3 = delay3(1);
CHP1 = CHP1(1);
CHP2 = CHP2(1);
CHP3 = CHP3(1);
EB1=EB1(1);
EB2=EB2(1);
EB3=EB3(1);
% Reset the initial observation of the agent state(s); only agent 1 is active here
LoggedSignal.State(:,1) = [T_solt;LOAD_E1;Power_buy;PV_1;CHP1;EB1;delay1;];
% LoggedSignal.State(:,2) = [T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% LoggedSignal.State(:,3) = [T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% LoggedSignal.State=[T_solt;LOAD_E1;LOAD_E2;LOAD_E3;Power_buy;PV_1;PV_2;PV_3;delay1;delay2;delay3];
% Return the initial environment state as the logged signal LoggedSignal
% InitialObservation = {LoggedSignal.Agent1State, LoggedSignal.Agent2State, LoggedSignal.Agent3State};
InitialObservation = {LoggedSignal.State(:,1)};
end

Answers (1)

halleyhit, 2024-9-2
Different agent types are suited to different problems; if one agent could handle every problem, there would be no need for the others.

Release

R2024a
