
Commit fb617ae

Fix REINFORCE continuous action (#146)
* add cont test for all algos
* fix reinforce cont action
1 parent 5ec2a0f commit fb617ae

File tree: 8 files changed, +1619 −14 lines

slm_lab/agent/algorithm/reinforce.py

Lines changed: 2 additions & 0 deletions
@@ -114,6 +114,8 @@ def calc_pdparam(self, x, evaluate=True, net=None):
         else:
             net.train()
             pdparam = net(x)
+        if (not self.body.is_discrete) and len(pdparam) == 1:
+            pdparam = pdparam[0]
         logger.debug(f'pdparam: {pdparam}')
         return pdparam
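To see what the added guard does in isolation, here is a minimal standalone sketch (hypothetical names, not part of the commit): for a continuous action space the policy net may return its distribution parameters wrapped in a list, and a singleton list is unwrapped so downstream code receives the bare tensor.

    import torch

    def unwrap_pdparam(pdparam, is_discrete):
        # Mirror of the guard above: unwrap a one-element list of pdparams
        # for continuous action spaces; leave everything else untouched.
        if (not is_discrete) and len(pdparam) == 1:
            return pdparam[0]
        return pdparam

    # A single-head continuous output comes back as a plain tensor:
    out = [torch.tensor([[0.0, 1.0]])]  # e.g. a packed [mean, std] head
    print(unwrap_pdparam(out, is_discrete=False))  # tensor([[0., 1.]])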

slm_lab/spec/a2c.json

Lines changed: 334 additions & 0 deletions
@@ -333,6 +333,340 @@
       }]
     }
   },
+  "a2c_mlp_shared_pendulum": {
+    "agent": [{
+      "name": "A2C",
+      "algorithm": {
+        "name": "ActorCritic",
+        "action_pdtype": "default",
+        "action_policy": "default",
+        "action_policy_update": "no_update",
+        "explore_var_start": null,
+        "explore_var_end": null,
+        "explore_anneal_epi": null,
+        "gamma": 0.99,
+        "use_gae": true,
+        "lam": 1.0,
+        "use_nstep": false,
+        "num_step_returns": 100,
+        "add_entropy": true,
+        "entropy_coef": 0.01,
+        "policy_loss_coef": 1.0,
+        "val_loss_coef": 0.01,
+        "continuous_action_clip": 2.0,
+        "training_frequency": 1,
+        "training_epoch": 8
+      },
+      "memory": {
+        "name": "OnPolicyReplay"
+      },
+      "net": {
+        "type": "MLPNetShared",
+        "hid_layers": [64],
+        "hid_layers_activation": "relu",
+        "clip_grad": false,
+        "clip_grad_val": 1.0,
+        "use_same_optim": false,
+        "actor_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "critic_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "lr_decay": "rate_decay",
+        "lr_decay_frequency": 500,
+        "lr_decay_min_timestep": 1000,
+        "lr_anneal_timestep": 100000,
+        "gpu": true
+      }
+    }],
+    "env": [{
+      "name": "Pendulum-v0",
+      "max_timestep": null,
+      "max_episode": 500,
+      "save_epi_frequency": 1000
+    }],
+    "body": {
+      "product": "outer",
+      "num": 1
+    },
+    "meta": {
+      "max_session": 4,
+      "max_trial": 100,
+      "search": "RandomSearch",
+      "max_generation": null
+    },
+    "search": {
+      "agent": [{
+        "algorithm": {
+          "gamma__uniform": [0.90, 0.99],
+          "lam__uniform": [0.1, 1.0]
+        },
+        "net": {
+          "actor_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          },
+          "critic_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          }
+        }
+      }]
+    }
+  },
+  "a2c_mlp_separate_pendulum": {
+    "agent": [{
+      "name": "A2C",
+      "algorithm": {
+        "name": "ActorCritic",
+        "action_pdtype": "default",
+        "action_policy": "default",
+        "action_policy_update": "no_update",
+        "explore_var_start": null,
+        "explore_var_end": null,
+        "explore_anneal_epi": null,
+        "gamma": 0.99,
+        "use_gae": true,
+        "lam": 1.0,
+        "use_nstep": false,
+        "num_step_returns": 100,
+        "add_entropy": true,
+        "entropy_coef": 0.01,
+        "policy_loss_coef": 1.0,
+        "val_loss_coef": 0.01,
+        "continuous_action_clip": 2.0,
+        "training_frequency": 1,
+        "training_epoch": 8
+      },
+      "memory": {
+        "name": "OnPolicyReplay"
+      },
+      "net": {
+        "type": "MLPNetSeparate",
+        "hid_layers": [64],
+        "hid_layers_activation": "relu",
+        "clip_grad": false,
+        "clip_grad_val": 1.0,
+        "use_same_optim": false,
+        "actor_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "critic_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "lr_decay": "rate_decay",
+        "lr_decay_frequency": 500,
+        "lr_decay_min_timestep": 1000,
+        "lr_anneal_timestep": 100000,
+        "gpu": true
+      }
+    }],
+    "env": [{
+      "name": "Pendulum-v0",
+      "max_timestep": null,
+      "max_episode": 500,
+      "save_epi_frequency": 1000
+    }],
+    "body": {
+      "product": "outer",
+      "num": 1
+    },
+    "meta": {
+      "max_session": 4,
+      "max_trial": 100,
+      "search": "RandomSearch",
+      "max_generation": null
+    },
+    "search": {
+      "agent": [{
+        "algorithm": {
+          "gamma__uniform": [0.90, 0.99],
+          "lam__uniform": [0.1, 1.0]
+        },
+        "net": {
+          "actor_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          },
+          "critic_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          }
+        }
+      }]
+    }
+  },
+  "a2c_rnn_shared_pendulum": {
+    "agent": [{
+      "name": "A2C",
+      "algorithm": {
+        "name": "ActorCritic",
+        "action_pdtype": "default",
+        "action_policy": "default",
+        "action_policy_update": "no_update",
+        "explore_var_start": null,
+        "explore_var_end": null,
+        "explore_anneal_epi": null,
+        "gamma": 0.99,
+        "use_gae": true,
+        "lam": 1.0,
+        "use_nstep": false,
+        "num_step_returns": 100,
+        "add_entropy": true,
+        "entropy_coef": 0.01,
+        "policy_loss_coef": 1.0,
+        "val_loss_coef": 0.01,
+        "continuous_action_clip": 2.0,
+        "training_frequency": 1,
+        "training_epoch": 8
+      },
+      "memory": {
+        "name": "OnPolicySeqReplay"
+      },
+      "net": {
+        "type": "RecurrentNetShared",
+        "hid_layers": [],
+        "hid_layers_activation": "relu",
+        "rnn_hidden_size": 64,
+        "rnn_num_layers": 1,
+        "seq_len": 4,
+        "clip_grad": false,
+        "clip_grad_val": 1.0,
+        "use_same_optim": false,
+        "actor_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "critic_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "lr_decay": "rate_decay",
+        "lr_decay_frequency": 500,
+        "lr_decay_min_timestep": 1000,
+        "lr_anneal_timestep": 100000,
+        "gpu": true
+      }
+    }],
+    "env": [{
+      "name": "Pendulum-v0",
+      "max_timestep": null,
+      "max_episode": 500,
+      "save_epi_frequency": 1000
+    }],
+    "body": {
+      "product": "outer",
+      "num": 1
+    },
+    "meta": {
+      "max_session": 4,
+      "max_trial": 100,
+      "search": "RandomSearch",
+      "max_generation": null
+    },
+    "search": {
+      "agent": [{
+        "algorithm": {
+          "gamma__uniform": [0.90, 0.99],
+          "lam__uniform": [0.1, 1.0]
+        },
+        "net": {
+          "actor_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          },
+          "critic_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          }
+        }
+      }]
+    }
+  },
+  "a2c_rnn_separate_pendulum": {
+    "agent": [{
+      "name": "A2C",
+      "algorithm": {
+        "name": "ActorCritic",
+        "action_pdtype": "default",
+        "action_policy": "default",
+        "action_policy_update": "no_update",
+        "explore_var_start": null,
+        "explore_var_end": null,
+        "explore_anneal_epi": null,
+        "gamma": 0.99,
+        "use_gae": true,
+        "lam": 1.0,
+        "use_nstep": false,
+        "num_step_returns": 100,
+        "add_entropy": true,
+        "entropy_coef": 0.01,
+        "policy_loss_coef": 1.0,
+        "val_loss_coef": 0.01,
+        "continuous_action_clip": 2.0,
+        "training_frequency": 1,
+        "training_epoch": 8
+      },
+      "memory": {
+        "name": "OnPolicySeqReplay"
+      },
+      "net": {
+        "type": "RecurrentNetSeparate",
+        "hid_layers": [],
+        "hid_layers_activation": "relu",
+        "rnn_hidden_size": 64,
+        "rnn_num_layers": 1,
+        "seq_len": 4,
+        "clip_grad": false,
+        "clip_grad_val": 1.0,
+        "use_same_optim": false,
+        "actor_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "critic_optim_spec": {
+          "name": "Adam",
+          "lr": 0.02
+        },
+        "lr_decay": "rate_decay",
+        "lr_decay_frequency": 500,
+        "lr_decay_min_timestep": 1000,
+        "lr_anneal_timestep": 100000,
+        "gpu": true
+      }
+    }],
+    "env": [{
+      "name": "Pendulum-v0",
+      "max_timestep": null,
+      "max_episode": 500,
+      "save_epi_frequency": 1000
+    }],
+    "body": {
+      "product": "outer",
+      "num": 1
+    },
+    "meta": {
+      "max_session": 4,
+      "max_trial": 100,
+      "search": "RandomSearch",
+      "max_generation": null
+    },
+    "search": {
+      "agent": [{
+        "algorithm": {
+          "gamma__uniform": [0.90, 0.99],
+          "lam__uniform": [0.1, 1.0]
+        },
+        "net": {
+          "actor_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          },
+          "critic_optim_spec": {
+            "lr__uniform": [0.0001, 0.1]
+          }
+        }
+      }]
+    }
+  },
   "a2c_conv_shared_breakout": {
     "agent": [{
       "name": "A2C",
