playGame_tensorflow.py
import numpy as np
np.random.seed(1337)
from gym_torcs import TorcsEnv
import random
import argparse
import tensorflow as tf
from my_config import *
from ddpg import *
import gc
gc.enable()
import timeit
import math
print(is_training)
print(total_explore)
print(max_eps)
print(max_steps_eps)
print(epsilon_start)
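# The hyperparameters printed above (is_training, total_explore, max_eps, max_steps_eps,
# epsilon_start) are assumed to come from the `from my_config import *` wildcard import.
# A hypothetical my_config.py sketch, purely for illustration; the repo's actual values
# may differ:
#
#     is_training   = 1        # 1 = train, 0 = just run the saved policy
#     total_explore = 100000.  # steps over which the exploration noise is annealed
#     max_eps       = 2000     # number of episodes
#     max_steps_eps = 10000    # step cap per episode
#     epsilon_start = 1.0      # initial exploration noise scale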


def playGame(train_indicator=is_training):  # 1 means Train, 0 means simply Run
    action_dim = 3   # Steering / Acceleration / Brake
    state_dim = 29   # number of sensor inputs
    env_name = 'Torcs_Env'
    agent = DDPG(env_name, state_dim, action_dim)

    # Generate a TORCS environment
    vision = False
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
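    # Assumed DDPG agent interface, inferred from the calls used below (not a definitive spec):
    #   agent.noise_action(s, epsilon) -> action with exploration noise (training)
    #   agent.action(s)                -> deterministic actor output (testing)
    #   agent.perceive(s, a, r, s1, d) -> store the transition / run a training step
    #   agent.replay_buffer.count()    -> number of stored transitions
    #   agent.saveNetwork()            -> checkpoint the current networks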
    EXPLORE = total_explore
    episode_count = max_eps
    max_steps = max_steps_eps
    epsilon = epsilon_start
    done = False

    step = 0
    best_reward = -100000

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        # Occasional testing: every 10th episode (after episode 20) is run without exploration noise
        if ((np.mod(i, 10) == 0) and (i > 20)):
            train_indicator = 0
        else:
            train_indicator = is_training

        # Relaunch TORCS every 3 episodes because of its memory leak
        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)
        else:
            ob = env.reset()

        # Early-episode annealing for off-track driving and small progress:
        # during early training, off-track and slow driving are tolerated (a margin of
        # error, much as for a human learner); as the agent learns to drive, early
        # stopping is enabled more often and the constraints become stricter.
        random_number = random.random()
        eps_early = max(epsilon, 0.10)
        if (random_number < (1.0 - eps_early)) and (train_indicator == 1):
            early_stop = 1
        else:
            early_stop = 0
        print("Episode : " + str(i) + " Replay Buffer " + str(agent.replay_buffer.count()) +
              ' Early Stopping: ' + str(early_stop) + ' Epsilon: ' + str(eps_early) +
              ' RN: ' + str(random_number))
        # Initialize the first state
        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
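        # The 29-dimensional state is assumed to decompose as: angle (1) + track range
        # sensors (19) + trackPos (1) + speedX/Y/Z (3) + wheelSpinVel (4) + rpm (1) = 29,
        # matching the usual gym_torcs observation; wheel spin is scaled down by 100.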
        # Count the total reward and total steps in the current episode
        total_reward = 0.
        step_eps = 0.

        for j in range(max_steps):

            # Take noisy actions during training
            if (train_indicator):
                epsilon -= 1.0 / EXPLORE
                epsilon = max(epsilon, 0.1)
                a_t = agent.noise_action(s_t, epsilon)  # actor output for state s_t plus exploration noise
            else:
                a_t = agent.action(s_t)

            # ob, r_t, done, info = env.step(a_t[0], early_stop)
            ob, r_t, done, info = env.step(a_t, early_stop)  # environment feedback for the chosen action
            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            # Add to the replay buffer only when training (is this necessary? - probably not)
            if (train_indicator):
                agent.perceive(s_t, a_t, r_t, s_t1, done)

            # Check for NaN rewards
            if (math.isnan(r_t)):
                r_t = 0.0
                for bad_r in range(50):
                    print('Bad Reward Found')

            total_reward += r_t
            s_t = s_t1

            # Display progress every 15 steps
            if ((np.mod(step, 15) == 0)):
                print("Episode", i, "Step", step_eps, "Epsilon", epsilon, "Action", a_t, "Reward", r_t)

            step += 1
            step_eps += 1
            if done:
                break
        # Save the best model
        if total_reward >= best_reward:
            if (train_indicator == 1):
                print("Now we save model with reward " + str(total_reward) +
                      " previous best reward was " + str(best_reward))
                best_reward = total_reward
                agent.saveNetwork()

        print("TOTAL REWARD @ " + str(i) + "-th Episode : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")


if __name__ == "__main__":
    playGame()
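# Usage (a sketch, assuming a working TORCS + gym_torcs installation and that ddpg.py and
# my_config.py sit next to this script):
#
#     python playGame_tensorflow.py
#
# Training vs. evaluation is controlled by `is_training` in my_config; the argparse import
# above is currently unused, so the script takes no command-line flags.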