强化学习---Boltzmann选择
下面的强化学习代码中的 epsilon_greedy 选择怎么改成 Boltzmann 选择?
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
// Forward declarations for the helpers defined after main().

// Applies action a to the position (x, y) in place (both are passed by
// reference) and returns the state index of the new position.
int move(int a, int &x, int &y, int x_size);
// Converts a grid position to a flat state index: x + y * x_size.
int xy2s(int x, int y, int x_size);
// Picks an action for state s from the Q table (used as the greedy choice by
// epsilon_greedy and by the validation loop in main).
int select_action(int s, int num_a, double** Qtable);
// Returns the largest Q value among the num_a actions of state s.
double max_Qval(int s, int num_a, double** Qtable);
// Returns a uniformly random action when epsilon > rand()%100, otherwise the
// action chosen by select_action().
int epsilon_greedy(int epsilon,int s, int num_a, double** Qtable);
// Q-learning demo on a small grid maze.
//
// Builds a maze and a Q table, runs num_trial learning episodes of at most
// num_step steps each using epsilon-greedy action selection, periodically
// validates the learned table with select_action(), and finally prints the
// preferred direction per cell together with success counters.
//
// NOTE(review): array subscripts appear to have been stripped from this
// listing (e.g. "Qtable=0.1;", "maze=-1;", "suc_sum++"), presumably by the
// forum software it was posted through. As shown, these statements do not
// type-check; the original indices (such as Qtable[s][a] or maze[x][y]) must
// be restored before the program can be built. The notes below mark the
// affected places; the intended indices are assumptions to be confirmed.
int main()
{
int x_size;//maze size along the x axis (when x_size=10, x ranges over 0..9)
int y_size;
double alpha, gamma;
int x, y, x_init, y_init;
int **maze;
int num_step;//number of Q-value updates (steps) per trial
int num_trial;//number of trials
int i,j,k;
int a,s,sd;
int num_a;
int num_s;
double **Qtable;
int reward;
double Qmax;
int epsilon;
//initial values
alpha=0.5;
gamma=0.9;
// epsilon is compared against rand()%100 in epsilon_greedy(), i.e. it is a percentage.
epsilon=10;
x_size=10;
y_size=9;
x_init=1;
y_init=1;
num_step=100;
num_trial=500;
num_a=4;
num_s=x_size*y_size;
// One success counter per block of 100 trials.
// NOTE(review): neither suc_sum nor suc_sum2 is released with delete[] below.
int *suc_sum;
int *suc_sum2;
suc_sum= new int [(int)(num_trial/100)];
suc_sum2= new int [(int)(num_trial/100)];
//initialization: seed the random number generator from the clock
srand( (unsigned)time( NULL ) );
//Q-table
// NOTE(review): presumably "new double*[num_s]" and "Qtable[i]=new double[num_a]"
// before the subscripts were lost - confirm.
Qtable=new double*;
for(i=0;i<num_s;i++){
Qtable=new double;
}
//Q-table initialization
// NOTE(review): presumably Qtable[i][j]=0.1 - confirm.
for(i=0;i<num_s;i++){
for(j=0;j<num_a;j++){
Qtable=0.1;
}
}
//maze
// NOTE(review): presumably "new int*[x_size]" and "maze[i]=new int[y_size]" - confirm.
maze=new int*;
for(i=0;i<x_size;i++){
maze=new int;
}
//maze initialization (set the maze walls)
// Border cells are marked -1 and their Q values are zeroed; all other cells get 0.
for(i=0;i<x_size;i++){
for(j=0;j<y_size;j++){
if(i==0 || j==0 || i==(x_size-1) || j==(y_size-1)){
maze=-1;
for(k=0;k<num_a;k++){
Qtable=0.0;
}
}
else{
maze=0;
}
}
}
//set the interior walls
// NOTE(review): twelve wall cells were assigned here; their coordinates are
// missing from this listing and cannot be recovered from the visible code.
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
maze=-1;
// Zero the Q values of every cell marked -1.
for(i=0;i<x_size;i++){
for(j=0;j<y_size;j++){
if (maze==-1){
for(k=0;k<num_a;k++){
Qtable=0.0;
}
}
}
}
//set the maze reward
// NOTE(review): presumably the goal cell receiving reward 10; its coordinates
// are missing from this listing.
maze=10;
// Print the maze layout.
for(i=0;i<x_size;i++){
for(j=0;j<y_size;j++){
printf("%3d",maze);
}
printf("\n");
}
//initial setup: place the agent on the start cell
x=x_init;
y=y_init;
s=xy2s(x,y,x_size);
// Clear the success counters.
for(i=0;i<num_trial/100;i++){
suc_sum=0;
}
for(i=0;i<num_trial/100;i++){
suc_sum2=0;
}
//start learning
for(i=0;i<num_trial;i++){
printf("trial=%d\n",i);
for(j=0;j<num_step;j++){
a=epsilon_greedy(epsilon,s,num_a,Qtable);//<-- to be changed to Boltzmann selection
sd = move(a,x,y,x_size);
reward=maze;
Qmax=max_Qval(sd,num_a,Qtable);
// Q-learning update: blend the old value with reward + gamma * best next value.
// NOTE(review): presumably Qtable[s][a] on both sides - confirm.
Qtable=(1 - alpha) * Qtable + alpha * ((double)reward + gamma * Qmax);
if(reward<0){
//failure: negative reward, restart from the initial cell
x=x_init;
y=y_init;
s=xy2s(x,y,x_size);
printf("失败\n");
break;
}
else if(reward>0){
//success: positive reward, restart from the initial cell
x=x_init;
y=y_init;
s=xy2s(x,y,x_size);
printf("成功\n");
suc_sum++;
break;
}
else{
//continue from the new state
s=sd;
}
}
//validation: every 100th trial, run 100 episodes using select_action() only
if (i%100==0){
for(k=0;k<100;k++){
//initial setup
x=x_init;
y=y_init;
s=xy2s(x,y,x_size);
for(j=0;j<num_step;j++){
a=select_action(s,num_a,Qtable);
sd = move(a,x,y,x_size);
reward=maze;
if(reward<0){
//failure
x=x_init;
y=y_init;
s=xy2s(x,y,x_size);
// (failure message is not printed during validation)
break;
}
else if(reward>0){
//success
x=x_init;
y=y_init;
s=xy2s(x,y,x_size);
suc_sum2++;
// (success message is not printed during validation)
break;
}
else{
s=sd;
}
}
}
}
}
//added section
//direction: print, per cell, the maze value when its best Q value is 0,
//otherwise an arrow for the action chosen by select_action()
for(x=0;x<x_size;x++){
for(y=0;y<y_size;y++){
s=xy2s(x,y,x_size);
Qmax=max_Qval(s,num_a,Qtable);
if(Qmax==0){
printf("%3d",maze);
}
else{
a=select_action(s,num_a,Qtable);
if(a==0){
printf(" →");
}
else if(a==1){
printf(" ↓");
}
else if(a==2){
printf(" ←");
}
else{
printf(" ↑");
}
}
}
printf("\n");
}
// Report the success counters for each block of 100 trials.
for(i=0;i<num_trial/100;i++){
printf("%d-%d epsilon_greedy:%d/100, select_action:%d/100\n",i*100,i*100+99,suc_sum,suc_sum2);
}
//end of added section
// Release the Q table and the maze.
// NOTE(review): presumably "delete[] Qtable[i]" / "delete[] maze[i]" inside the loops - confirm.
for(i=0;i<num_s;i++){
delete[] Qtable;
}
delete[] Qtable;
for(i=0;i<x_size;i++){
delete[] maze;
}
delete[] maze;
return 0;
}
// Moves the agent one cell in the direction encoded by a, updating x and y in
// place, and returns the state index of the resulting position.
//   a == 0 : y + 1
//   a == 1 : x + 1
//   a == 2 : y - 1
//   other  : x - 1
// No bounds checking is performed here.
int move(int a, int &x, int &y, int x_size){
    switch (a) {
    case 0:
        ++y;
        break;
    case 1:
        ++x;
        break;
    case 2:
        --y;
        break;
    default:
        --x;
        break;
    }
    return xy2s(x, y, x_size);
}
// Flattens a grid coordinate (x, y) into a single state index, using x_size
// as the stride between consecutive y values.
int xy2s(int x, int y, int x_size){
    return y * x_size + x;
}
// Greedy action selection: returns the index of an action with the largest
// Q value in state s.
//
// Parameters:
//   s      - state index (row of Qtable)
//   num_a  - number of actions (columns of Qtable); must be at least 1
//   Qtable - Q values, indexed as Qtable[state][action]
//
// When several actions share the maximum value, one of them is chosen
// uniformly at random with rand(), so that no direction is systematically
// favoured while the table still holds equal initial values.
//
// The listing this was taken from had lost its array subscripts and allocated
// a tie buffer with new that was never freed; this version indexes
// Qtable[s][i] explicitly and needs no heap allocation.
int select_action(int s, int num_a, double** Qtable){
    const double *row = Qtable[s];
    double best = row[0];
    int ties = 1;
    int i;

    // First pass: find the maximum and count how many actions attain it.
    for (i = 1; i < num_a; i++) {
        if (row[i] > best) {
            best = row[i];
            ties = 1;
        }
        else if (row[i] == best) {
            ties++;
        }
    }

    // Second pass: return the pick-th action whose value equals the maximum.
    int pick = (ties > 1) ? rand() % ties : 0;
    for (i = 0; i < num_a; i++) {
        if (row[i] == best) {
            if (pick == 0) {
                return i;
            }
            pick--;
        }
    }

    // Only reachable when the comparisons above never match (e.g. NaN values).
    return 0;
}
// Returns the largest Q value among the num_a actions of state s.
//
// Parameters:
//   s      - state index (row of Qtable)
//   num_a  - number of actions (columns of Qtable); must be at least 1
//   Qtable - Q values, indexed as Qtable[state][action]
//
// The listing this was taken from had lost its array subscripts (it compared
// the table pointer itself against a double); this version indexes
// Qtable[s][i] explicitly.
double max_Qval(int s, int num_a, double** Qtable){
    const double *row = Qtable[s];
    double best = row[0];
    int i;
    for (i = 1; i < num_a; i++) {
        if (row[i] > best) {
            best = row[i];
        }
    }
    return best;
}
// Epsilon-greedy action selection.
//
// Draws rand()%100 and, when epsilon is greater than that draw, returns a
// uniformly random action in [0, num_a); otherwise returns the action chosen
// by select_action() for state s. epsilon therefore acts as a percentage.
int epsilon_greedy(int epsilon, int s, int num_a, double** Qtable){
    const int roll = rand() % 100;
    if (roll < epsilon) {
        // explore: random action
        return rand() % num_a;
    }
    // exploit: action with the largest Q value
    return select_action(s, num_a, Qtable);
}
[ 本帖最后由 风花雪月 于 2008-1-18 10:39 编辑 ]
这个和算法好像没有太大关系,
估计搞清楚 epsilon_greedy 和 Boltzmann 两种选择的原理,改起来应该很容易。
页:
[1]