# mountain_RBF_FA.py

import gym
import numpy as np
import matplotlib.pyplot as plt

# Initializations
env = gym.make('MountainCar-v0')
num_actions = env.action_space.n  # number of discrete actions
dim = env.observation_space.high.size  # length of an observation vector

# Parameters
# one set which converges in around 1200 episodes
# 4 rows, 4 cols, eps = 0.1, Lambda = 0.5, alpha = 0.008, gamma = 0.99
# NOTE: alpha below is 0.01, not the 0.008 quoted in the line above.
num_rows = 4  # RBF centers along state dimension 0
num_cols = 4  # RBF centers along state dimension 1
# Grid shape per state dimension. Derived from num_rows/num_cols so the two
# descriptions of the grid (both are used for reshaping elsewhere in the
# script) cannot drift apart.
num_rbf = np.array([num_rows, num_cols], dtype=int)
# Spacing between neighbouring centers in the normalized [0, 1] state space.
width = 1. / (num_rbf - 1.)
# A single sigma, taken from dimension 0, is shared by every basis function.
rbf_sigma = width[0] / 2.
epsilon = 0.1  # exploration rate passed to the epsilon-greedy policy
epsilon_final = 0.1  # used to derive the per-episode epsilon decay factor
# Trace-decay parameter; currently only referenced from a commented-out line
# in the training loop.
Lambda = 0.5
alpha = 0.01  # step size of the weight update
gamma = 0.99  # discount factor applied to the bootstrapped value
num_episodes = 2000
num_timesteps = 200  # upper bound on steps per episode

def writeAve(filename, array):
    """Write the elements of ``array`` to ``filename``, one per line.

    Each element is serialized with ``repr`` and followed by a newline. An
    existing file is overwritten.

    Args:
        filename: Path of the text file to create.
        array: Iterable of values to write (e.g. per-episode returns).
    """
    # The context manager closes the file even if a write raises.
    with open(filename, 'w') as fout:
        # Iterate over the data itself instead of a global episode count, so
        # the function works for sequences of any length.
        fout.writelines(repr(value) + '\n' for value in array)

# Observation bounds: row 0 holds the lower limits, row 1 the upper limits.
xbar = np.zeros((2, dim))
xbar[0, :] = env.observation_space.low
xbar[1, :] = env.observation_space.high
num_ind = np.prod(num_rbf)  # total number of basis functions
# Placeholders; both are overwritten by phi() output in the training loop.
activations = np.zeros(num_ind)
new_activations = np.zeros(num_ind)
# Linear weights: one column of RBF weights per action.
theta = np.zeros((num_ind, num_actions))
rbf_den = 2 * rbf_sigma ** 2  # denominator of the Gaussian exponent
# Per-episode multiplicative decay that takes epsilon to epsilon_final after
# num_episodes episodes. The previous formula,
# (epsilon - epsilon_final) ** (1 / num_episodes), evaluates to 0 when the two
# values are equal and therefore switched exploration off after episode 0.
if epsilon > 0:
    epsilon_coefficient = (epsilon_final / epsilon) ** (1. / num_episodes)
else:
    # epsilon is already 0 and stays 0 under any multiplicative factor.
    epsilon_coefficient = 1.
ep_length = np.zeros(num_episodes)  # filled in once per episode
np.set_printoptions(precision=2)


# Construct ndarray of rbf centers: a regular grid in the normalized state
# space, one row per center, ordered with the dimension-1 index varying
# fastest (row index = i * num_rbf[1] + j).
c = np.zeros((num_ind, dim))
for i in range(num_rbf[0]):
    for j in range(num_rbf[1]):
        # Each coordinate uses the spacing of its own dimension. The indices
        # were previously swapped, which only worked for a square grid.
        c[i * num_rbf[1] + j, :] = (i * width[0], j * width[1])


# Returns the state scaled between 0 and 1
def normalize_state(_s):
    """Rescale a raw observation using the bounds stored in ``xbar``.

    Component ``k`` is mapped to ``(_s[k] - low[k]) / (high[k] - low[k])``,
    where ``low`` and ``high`` are rows 0 and 1 of ``xbar``.

    Args:
        _s: Sequence of raw state components.

    Returns:
        A new float ndarray with one rescaled value per component of ``_s``.
    """
    _count = len(_s)
    _low = xbar[0, :_count]
    _span = xbar[1, :_count] - _low
    # One vectorized expression replaces the per-component loop.
    return (np.asarray(_s, dtype=float) - _low) / _span


# Returns an ndarray of radial basis function activations
def phi(_state):
    """Evaluate every radial basis function at ``_state``.

    The activation of center ``k`` is
    ``exp(-||_state - c[k]||**2 / rbf_den)``.

    Args:
        _state: State vector expressed in the same coordinates as the rows
            of ``c``.

    Returns:
        A float ndarray holding one activation per center (``num_ind``
        entries).
    """
    _sq_dists = (
        np.linalg.norm(_state - c[_idx, :]) ** 2 for _idx in range(num_ind)
    )
    return np.array([np.exp(-_d / rbf_den) for _d in _sq_dists], dtype=float)


# Returns an action following an epsilon-greedy policy
def epsilon_greedy(_epsilon, _vals):
    """Pick an action with an epsilon-greedy rule.

    A uniform draw from ``[0, 1)`` is compared with ``1 - _epsilon``: below
    the threshold the index of the largest entry of ``_vals`` is chosen,
    otherwise an action is sampled from ``env.action_space``.

    Args:
        _epsilon: Probability-like exploration rate.
        _vals: ndarray of action values for the current state.

    Returns:
        The selected action as a plain ``int``.
    """
    _exploit = np.random.random() < 1. - _epsilon
    _choice = _vals.argmax() if _exploit else env.action_space.sample()
    return int(_choice)


# Returns the value of each action at some state
def action_values(_activations, _theta):
    """Compute the value of every action for one activation vector.

    Args:
        _activations: Feature activations, one entry per row of ``_theta``.
        _theta: Weight matrix with one column per action.

    Returns:
        The product ``_theta.T . _activations`` (one value per action).
    """
    return _theta.T.dot(_activations)


# Returns the value of an action at some state
def action_value(_activations, _action, _theta):
    """Compute the value of a single action for one activation vector.

    Args:
        _activations: Feature activations, one entry per row of ``_theta``.
        _action: Column index of the action in ``_theta``.
        _theta: Weight matrix with one column per action.

    Returns:
        The dot product of the action's weight column with ``_activations``.
    """
    _weights = _theta[:, _action]
    return np.dot(_weights, _activations)

# Learning rule: "0" selects Sarsa (bootstrap from the action that will be
# taken next); any other answer selects Q-learning (bootstrap from the greedy
# action). Surrounding whitespace is stripped so that e.g. "0 " still means
# Sarsa.
method = input("0 for Sarsa,1 for Q-Learning :").strip()

tot_rews = []  # sum of rewards collected in each episode
steps = []  # number of environment steps taken in each episode

for ep in range(num_episodes):

    e = np.zeros((num_ind, num_actions))
    state = normalize_state(env.reset())
    activations = phi(state)
    vals = action_values(activations, theta)
    action = epsilon_greedy(epsilon, vals)
    tot = 0
    s = 0
    # Each episode
    for t in range(num_timesteps):

        # env.render()
        new_state, reward, done, info = env.step(action)
        new_state = normalize_state(new_state)
        new_activations = phi(new_state)
        new_vals = action_values(new_activations, theta)
        # The next action is drawn before theta is updated and is the action
        # actually executed on the following step for both learning rules.
        new_action = epsilon_greedy(epsilon, new_vals)
        Q = action_value(activations, action, theta)
        tot += reward
        s += 1
        if method == "0":
            Q_new = action_value(new_activations, new_action, theta)
        else:
            Q_new = action_value(new_activations, np.argmax(new_vals), theta)

        # `target` holds the TD error; no bootstrapping once `done` is set.
        if done:
            target = reward - Q
        else:
            target = reward + gamma * Q_new - Q
        # e[:, action] += activations  # accumulating traces
        e[:, action] = activations  # replacing traces

        # Vectorized form of: theta[k, a] += alpha * target * e[k, a]
        theta += alpha * target * e
        # e *= gamma * Lambda
        # The trace is cleared every step instead of being decayed, so only
        # the column of the action just taken is ever updated (one-step
        # update; Lambda is not used).
        e = np.zeros((num_ind, num_actions))

        # Verbose dump of every step of each 200th episode.
        if ep % 200 == 0:
            print("t = ", t)
            print("new_state = ", new_state)
            print("new_activations = ", np.reshape(new_activations.ravel(order='F'), (num_rows, num_cols)))
            print("new_vals", new_vals)
            print("Q = ", Q)
            print("Q_new = ", Q_new)
            print("action = ", action)
            print("target = ", target)
            print("e =", e)
            print("theta = \n", np.reshape(theta.ravel(order='F'), (num_actions, num_rows, num_cols)))
            print("---------------------------------------------------------------------------")

        state = new_state.copy()
        activations = new_activations.copy()
        action = new_action
        if done:
            break
    steps.append(s)
    tot_rews.append(tot)
    # Record the number of steps taken. The loop index `t` is zero-based and
    # therefore one less than the episode length.
    ep_length[ep] = s
    epsilon *= epsilon_coefficient

writeAve("Return_RBF_FA", tot_rews)
writeAve("Step_RBF_FA", steps)

value_left = np.zeros(num_ind)
value_nothing = np.zeros(num_ind)
value_right = np.zeros(num_ind)

# Evaluate each action's learned value at every RBF center.
for h in range(num_ind):
    current_activations = phi(c[h, :])
    value_left[h] += action_value(current_activations, 0, theta)
    value_nothing[h] += action_value(current_activations, 1, theta)
    value_right[h] += action_value(current_activations, 2, theta)

# Display each action-value as a heatmap
plt.close('all')
fig, axes = plt.subplots(ncols=3, sharey=True)
# 'box-forced' is no longer accepted by current Matplotlib releases; 'box' is
# the supported spelling and works with shared axes.
plt.setp(axes.flat, aspect=1.0, adjustable='box')
panels = (
    ('Action = left', value_left),
    ('Action = nothing', value_nothing),
    ('Action = right', value_right),
)
for ax, (title, values) in zip(axes, panels):
    im = ax.imshow(values.reshape((num_rbf[0], num_rbf[1])), cmap='hot')
    ax.set_title(title)
axes[0].set_ylabel('Position')
axes[0].set_xlabel('Velocity')
fig.subplots_adjust(bottom=0.2)
cbar_ax = fig.add_axes([0.15, 0.15, 0.7, 0.05])
# The colorbar is built from `im`, i.e. the last image drawn above.
cbar = fig.colorbar(im, cax=cbar_ax, orientation='horizontal')
# NOTE(review): plt.axis acts on the current axes, which at this point is
# likely the colorbar axes rather than the heatmaps -- confirm intent.
plt.axis([0, 1, 0, 1])

plt.figure(2)
plt.plot(ep_length)
plt.title('Episode Length')
plt.ylabel('Completion Time')
plt.xlabel('Episode')
plt.show()