LilTwo
diff --git a/‎DQNfromDemo/DQfD.py
+139 b/‎DQNfromDemo/DQfD.py
+139
diff --git a/‎DQNfromDemo/Test/CartPole.py
+144 b/‎DQNfromDemo/Test/CartPole.py
+144
diff --git a/‎DQNfromDemo/Test/CartPoleDemo.txt
+1 b/‎DQNfromDemo/Test/CartPoleDemo.txt
+1
diff --git a/‎DQN_NoisyNet/__init__.py ‎DQNfromDemo/__init__.py b/‎DQN_NoisyNet/__init__.py ‎DQNfromDemo/__init__.py
diff --git a/‎DQNfromDemo/__init__.pyc
121 Bytes b/‎DQNfromDemo/__init__.pyc
121 Bytes
diff --git a/‎DQNfromDemo/__pycache__/DQfD.cpython-37.pyc
4.94 KB b/‎DQNfromDemo/__pycache__/DQfD.cpython-37.pyc
4.94 KB
diff --git a/‎DQNfromDemo/__pycache__/__init__.cpython-37.pyc
160 Bytes b/‎DQNfromDemo/__pycache__/__init__.cpython-37.pyc
160 Bytes
diff --git a/‎DQN_NoisyNet/DQN_NoisyNet.py ‎DQNwithNoisyNet/DQN_NoisyNet.py
+11-8 b/‎DQN_NoisyNet/DQN_NoisyNet.py ‎DQNwithNoisyNet/DQN_NoisyNet.py
+11-8
diff --git a/‎DQN_NoisyNet/NoisyLayer.py ‎DQNwithNoisyNet/NoisyLayer.py
+3-3 b/‎DQN_NoisyNet/NoisyLayer.py ‎DQNwithNoisyNet/NoisyLayer.py
+3-3
diff --git a/‎DQN_NoisyNet/SumTree.py ‎DQNwithNoisyNet/SumTree.py b/‎DQN_NoisyNet/SumTree.py ‎DQNwithNoisyNet/SumTree.py
diff --git a/‎DQN_NoisyNet/Test/CartPole.py ‎DQNwithNoisyNet/Test/CartPole.py
+11-6 b/‎DQN_NoisyNet/Test/CartPole.py ‎DQNwithNoisyNet/Test/CartPole.py
+11-6
diff --git a/‎DQNwithNoisyNet/Test/CartPoleExpert.txt
3.4 KB b/‎DQNwithNoisyNet/Test/CartPoleExpert.txt
3.4 KB
@@ -0,0 +1,139 @@
+import sys
+from os import path
+
+local = path.abspath(__file__)
+root = path.dirname(path.dirname(local))
+if root not in sys.path:
+    sys.path.append(root)
+
+from DQNwithNoisyNet import DQN_NoisyNet
+import torch
+
+
+class DeepQL(DQN_NoisyNet.DeepQL):
+    """Deep Q-learning from Demonstrations for the (s, a) => Q(s, a) agent.
+
+    Extends ``DQN_NoisyNet.DeepQL`` with a per-transition demonstration
+    flag, a priority bonus that depends on that flag, and a supervised
+    large-margin loss on demonstration samples.
+
+    Args:
+        *args: Positional arguments forwarded to ``DQN_NoisyNet.DeepQL``.
+        lambda1: Weight reserved for an n-step return loss; stored but not
+            used anywhere in this class.
+        lambda2: Weight of the supervised margin loss computed by ``JE``.
+        lambda3: L2 coefficient, forwarded to the parent as ``L2``.
+        **kwargs: Keyword arguments forwarded to the parent. Must not
+            contain ``L2`` because that is supplied from ``lambda3``.
+    """
+
+    def __init__(self, *args, lambda1=1.0, lambda2=1.0, lambda3=1e-5, **kwargs):
+        super().__init__(*args, **kwargs, L2=lambda3)
+        self.ed = 1.0  # priority bonus for demonstration transitions
+        self.ea = 0.001  # priority bonus for agent-generated transitions
+        self.margin = 0.8  # margin added to non-expert actions in JE
+        self.lambda1 = lambda1  # n-step return
+        self.lambda2 = lambda2  # supervised loss
+        self.lambda3 = lambda3  # L2
+        # The bonuses above are added explicitly when storing/updating.
+        # NOTE(review): presumably `e` is the memory's own priority offset - confirm.
+        self.replay.e = 0
+
+    def storeTransition(self, s, a, r, s_, done, isdemo):
+        """Store one transition with priority ``error + bonus``.
+
+        Args:
+            s: Current state (converted with ``torch.Tensor``).
+            a: Action taken.
+            r: Reward received.
+            s_: Next state (converted with ``torch.Tensor``).
+            done: Whether the episode ended on this transition.
+            isdemo: True for a demonstration transition (bonus ``self.ed``),
+                False for an agent transition (bonus ``self.ea``).
+        """
+        s = torch.Tensor(s)
+        s_ = torch.Tensor(s_)
+        error = self.calcError((s, a, r, s_, done))
+        e = self.ed if isdemo else self.ea
+        self.store((s, a, r, s_, done, isdemo), error + e)
+
+    def JE(self, samples):
+        """Return the supervised large-margin loss over demonstration samples.
+
+        For every demonstration transition the term is
+        ``max_a[Q(s, a) + l(a_E, a)] - Q(s, a_E)`` where ``l`` is
+        ``self.margin`` for ``a != a_E`` and 0 for the expert action. The
+        sum is scaled by ``self.lambda2`` and divided by ``self.mbsize``.
+
+        Args:
+            samples: Iterable of ``(s, a, ..., isdemo)`` tuples.
+
+        Returns:
+            A scalar tensor; zero when no sample is a demonstration.
+        """
+        loss = torch.tensor(0.0)
+        for s, a, *_, isdemo in samples:
+            if not isdemo:
+                continue
+            QE = self.net(s, torch.Tensor(a))[0]
+            # Start from the expert action, whose margin is zero, so the
+            # term can never go negative.
+            best = QE
+            # NOTE(review): assumes the parent keeps the constructor's
+            # `actionFinder` as `self.actionFinder` and that it returns the
+            # candidate actions for a state - confirm against the parent.
+            for cand in self.actionFinder(s):
+                if list(cand) == list(a):
+                    continue
+                Q = self.net(s, torch.Tensor(cand))[0] + self.margin
+                if Q > best:
+                    best = Q
+            loss += self.lambda2 * (best - QE)
+        return loss / self.mbsize
+
+    def update(self):
+        """Run one optimisation step on a minibatch from the replay memory.
+
+        The loss is ``self.loss`` on the one-step target plus the
+        supervised term from ``JE``. The priority of every sampled
+        transition is then refreshed, and the target network ``net2`` is
+        reloaded from ``net`` once the counter ``self.c`` reaches ``self.C``.
+        """
+        self.opt.zero_grad()
+        samples, idxs, IS = self.sample()
+        if self.noisy:
+            self.net.sample()  # for choosing action
+        maxA = [self.findMaxA(s[3]) for s in samples]
+        maxA = torch.Tensor(maxA)
+        s, a, *_, isdemo = zip(*samples)
+        s = torch.stack(s)
+        a = torch.Tensor(a)
+        if self.noisy:
+            self.net.sample()  # for prediction
+            self.net2.sample()  # for estimating Q
+        predict = self.net(s, a)[:, 0]
+        # One-step target: r for terminal transitions, otherwise
+        # r + gamma * Q_target(s', argmax action chosen above).
+        look_ahead = [r if done else r + self.gamma * self.net2(s_, maxA[i])
+                      for i, (_, _, r, s_, done, _) in enumerate(samples)]
+        target = torch.Tensor(look_ahead)
+
+        errors, ls = self.loss(predict, target, IS)
+        if self.noisy:
+            self.net.sample()
+        ls += self.JE(samples)
+        ls.backward()
+        # Refresh priorities with the same demo/agent bonus used on insert.
+        for idx, error, demo in zip(idxs, errors, isdemo):
+            e = self.ed if demo else self.ea
+            self.replay.update(idx, error + e)
+
+        self.opt.step()
+        if self.c >= self.C:
+            self.c = 0
+            self.net2.load_state_dict(self.net.state_dict())
+            self.net2.eval()
+        else:
+            self.c += 1
+
+
+class DeepQLv2(DQN_NoisyNet.DeepQLv2):
+    """Deep Q-learning from Demonstrations for the s => Q[s, a1], Q[s, a2]... agent.
+
+    Extends ``DQN_NoisyNet.DeepQLv2`` with a per-transition demonstration
+    flag, a priority bonus that depends on that flag, and a supervised
+    large-margin loss on demonstration samples.
+
+    Args:
+        *args: Positional arguments forwarded to ``DQN_NoisyNet.DeepQLv2``.
+        lambda1: Weight reserved for an n-step return loss; stored but not
+            used anywhere in this class.
+        lambda2: Weight of the supervised margin loss computed by ``JE``.
+        lambda3: L2 coefficient, forwarded to the parent as ``L2``.
+        **kwargs: Keyword arguments forwarded to the parent. Must not
+            contain ``L2`` because that is supplied from ``lambda3``.
+    """
+
+    def __init__(self, *args, lambda1=1.0, lambda2=1.0, lambda3=1e-5, **kwargs):
+        super().__init__(*args, **kwargs, L2=lambda3)
+        self.ed = 1.0  # priority bonus for demonstration transitions
+        self.ea = 0.001  # priority bonus for agent-generated transitions
+        self.margin = 0.8  # margin added to non-expert actions in JE
+        self.lambda1 = lambda1  # n-step return
+        self.lambda2 = lambda2  # supervised loss
+        self.lambda3 = lambda3  # L2
+        # The bonuses above are added explicitly when storing/updating.
+        # NOTE(review): presumably `e` is the memory's own priority offset - confirm.
+        self.replay.e = 0
+
+    def storeTransition(self, s, a, r, s_, done, isdemo):
+        """Store one transition with priority ``error + bonus``.
+
+        Args:
+            s: Current state (converted with ``torch.Tensor``).
+            a: Action taken; ``a[0]`` is used as the action index elsewhere.
+            r: Reward received.
+            s_: Next state (converted with ``torch.Tensor``).
+            done: Whether the episode ended on this transition.
+            isdemo: True for a demonstration transition (bonus ``self.ed``),
+                False for an agent transition (bonus ``self.ea``).
+        """
+        s = torch.Tensor(s)
+        s_ = torch.Tensor(s_)
+        error = self.calcError((s, a, r, s_, done))
+        e = self.ed if isdemo else self.ea
+        self.store((s, a, r, s_, done, isdemo), error + e)
+
+    def JE(self, samples):
+        """Return the supervised large-margin loss over demonstration samples.
+
+        For every demonstration transition the term is
+        ``max_a[Q(s, a) + l(a_E, a)] - Q(s, a_E)`` where ``l`` is
+        ``self.margin`` for ``a != a_E`` and 0 for the expert action. The
+        sum is scaled by ``self.lambda2`` and divided by ``self.mbsize``.
+
+        Args:
+            samples: Iterable of ``(s, a, ..., isdemo)`` tuples.
+
+        Returns:
+            A scalar tensor; zero when no sample is a demonstration.
+        """
+        loss = torch.tensor(0.0)
+        for s, a, *_, isdemo in samples:
+            if not isdemo:
+                continue
+            q = self.net(s)  # one forward pass yields every action's Q-value
+            expert = a[0]
+            QE = q[expert]
+            # Margin applies to every action except the demonstrated one.
+            margins = torch.full_like(q, self.margin)
+            margins[expert] = 0.0
+            loss += self.lambda2 * ((q + margins).max() - QE)
+        return loss / self.mbsize
+
+    def update(self):
+        """Run one optimisation step on a minibatch from the replay memory.
+
+        The loss is ``self.loss`` on the one-step target plus the
+        supervised term from ``JE``. The priority of every sampled
+        transition is then refreshed, and the target network ``net2`` is
+        reloaded from ``net`` once the counter ``self.c`` reaches ``self.C``.
+        """
+        self.opt.zero_grad()
+
+        samples, idxs, IS = self.sample()
+        if self.noisy:
+            self.net.sample()  # for choosing action
+        maxA = [self.findMaxA(s[3]) for s in samples]
+        s, a, *_, isdemo = zip(*samples)
+        s = torch.stack(s)
+        if self.noisy:
+            self.net.sample()  # for prediction
+            self.net2.sample()  # for estimating Q
+        predict = [self.net(s[i])[a[i][0]] for i in range(self.mbsize)]
+        # One-step target: r for terminal transitions, otherwise
+        # r + gamma * Q_target(s')[argmax action chosen above].
+        look_ahead = [r if done else r + self.gamma * self.net2(s_)[maxA[i][0]]
+                      for i, (_, _, r, s_, done, _) in enumerate(samples)]
+        target = torch.Tensor(look_ahead)
+
+        errors, ls = self.loss(predict, target, IS)
+        if self.noisy:
+            self.net.sample()
+        ls += self.JE(samples)
+        ls.backward()
+        # Refresh priorities with the same demo/agent bonus used on insert.
+        for idx, error, demo in zip(idxs, errors, isdemo):
+            e = self.ed if demo else self.ea
+            self.replay.update(idx, error + e)
+
+        self.opt.step()
+        if self.c >= self.C:
+            self.c = 0
+            self.net2.load_state_dict(self.net.state_dict())
+            self.net2.eval()
+        else:
+            self.c += 1
@@ -0,0 +1,144 @@
+from os import path
+import sys
+local=path.abspath(__file__)
+root=path.dirname(path.dirname(path.dirname(local)))
+if root not in sys.path:
+    sys.path.append(root)
+
+import gym
+import torch
+import matplotlib.pyplot as plt
+import math
+import torch.nn as nn
+import torch.nn.functional as F
+from DQNwithNoisyNet.NoisyLayer import NoisyLinear
+from DQNfromDemo import DQfD
+from operator import methodcaller
+import json
+
+
+class Net(nn.Module):
+    """Q-network that scores a (state, action) pair with a single value.
+
+    The 4-feature state and the 1-feature action are each projected to 40
+    hidden units, summed, passed through ReLU and reduced to one output.
+    """
+
+    def __init__(self):
+        super(Net, self).__init__()
+        # Separate input projections for state and action, shared head.
+        self.fc1_s = nn.Linear(4, 40)
+        self.fc1_a = nn.Linear(1, 40)
+        self.fc2 = nn.Linear(40, 1)
+
+    def forward(self, s, a):
+        """Return Q(s, a) for state tensor ``s`` and action tensor ``a``."""
+        hidden = F.relu(self.fc1_s(s) + self.fc1_a(a))
+        return self.fc2(hidden)
+
+
+class Net2(nn.Module):
+    """Q-network that maps a 4-feature state to one value per action (2 outputs)."""
+
+    def __init__(self):
+        super(Net2, self).__init__()
+        self.fc1 = nn.Linear(4, 40)
+        self.fc2 = nn.Linear(40, 2)
+
+    def forward(self, s):
+        """Return the vector of Q-values for state tensor ``s``."""
+        hidden = F.relu(self.fc1(s))
+        return self.fc2(hidden)
+
+
+class NoisyNet(nn.Module):
+    """(state, action) => Q-value network built from ``NoisyLinear`` layers.
+
+    Same layout as the plain two-input network: 4-feature state and
+    1-feature action are each projected to 40 units, summed, passed
+    through ReLU and reduced to one output.
+    """
+
+    def __init__(self):
+        super(NoisyNet, self).__init__()
+        self.fc1_s = NoisyLinear(4, 40)
+        self.fc1_a = NoisyLinear(1, 40)
+        self.fc2 = NoisyLinear(40, 1)
+
+    def forward(self, s, a):
+        """Return Q(s, a) for state tensor ``s`` and action tensor ``a``."""
+        hidden = F.relu(self.fc1_s(s) + self.fc1_a(a))
+        return self.fc2(hidden)
+
+    def sample(self):
+        """Call ``sample()`` on every direct child layer that provides it."""
+        samplable = (child for child in self.children() if hasattr(child, "sample"))
+        for child in samplable:
+            child.sample()
+
+
+class NoisyNet2(nn.Module):
+    """State => per-action Q-values network built from ``NoisyLinear`` layers.
+
+    Maps a 4-feature state through 40 hidden units (ReLU) to 2 outputs.
+    """
+
+    def __init__(self):
+        super(NoisyNet2, self).__init__()
+        self.fc1 = NoisyLinear(4, 40)
+        self.fc2 = NoisyLinear(40, 2)
+
+    def forward(self, s):
+        """Return the vector of Q-values for state tensor ``s``."""
+        hidden = F.relu(self.fc1(s))
+        return self.fc2(hidden)
+
+    def sample(self):
+        """Call ``sample()`` on every direct child layer that provides it."""
+        samplable = (child for child in self.children() if hasattr(child, "sample"))
+        for child in samplable:
+            child.sample()
+
+
+if __name__ == "__main__":
+    # Train a DQfD agent on CartPole: load demonstrations, pre-train on
+    # them, train against the environment, then render the policy.
+    env = gym.make('CartPole-v1')
+    s = env.reset()
+    A = [[0], [1]]
+    dqn = DQfD.DeepQL(Net, noisy=False, lr=0.005, gamma=1, actionFinder=lambda x: A, N=5000)
+    process = []  # total reward of every finished training episode
+    epoch = 100
+    # NOTE(review): the schedule raises dqn.eps from eps_start towards
+    # eps_end and evaluation sets it to 1, so eps is presumably the
+    # probability of acting greedily - confirm against dqn.act.
+    eps_start = 0.05
+    eps_end = 0.95
+    N = 1 - eps_start
+    lam = -math.log((1 - eps_end) / N) / epoch
+    total = 0
+    count = 0  # successful count
+
+    # Resolve the demo file next to this script so the run does not depend
+    # on the current working directory.
+    demo_path = path.join(path.dirname(local), "CartPoleDemo.txt")
+    with open(demo_path, "r") as file:
+        data = json.load(file)
+    # Each value is a list of [s, a, r, s_, done] records. The loop uses
+    # its own names so the live observation `s` from env.reset() is kept.
+    for episode in data.values():
+        for demo_s, demo_a, demo_r, demo_s_, demo_done in episode:
+            dqn.storeTransition(demo_s, demo_a, demo_r, demo_s_, demo_done, True)
+
+    # Pre-training phase: only demonstration data is in the replay memory.
+    for i in range(1000):
+        if i % 100 == 0:
+            print("pretraining:", i)
+        dqn.update()
+
+    for i in range(epoch):
+        print(i)
+        dqn.eps = 1 - N * math.exp(-lam * i)
+        # Stop early after two consecutive episodes that reach 500.
+        count = count + 1 if total >= 500 else 0
+        if count >= 2:
+            dqn.eps = 1
+            break
+        total = 0
+        while True:
+            a = dqn.act(s)
+            s_, r, done, _ = env.step(a[0])
+            total += r
+            # Shaped reward: penalise ending before 500, small bonus otherwise.
+            r = -1 if done and total < 500 else 0.002
+            dqn.storeTransition(s, a, r, s_, done, False)
+            dqn.update()
+            s = s_
+            if done:
+                s = env.reset()
+                print('total:', total)
+                process.append(total)
+                break
+
+    total = 0
+    s = env.reset()
+    dqn.eps = 1
+    # Render the trained policy until interrupted; the finally clause makes
+    # sure the environment is closed, which the bare endless loop never did.
+    try:
+        while True:
+            a = dqn.act(s)[0]
+            s, r, done, _ = env.step(a)
+            total += 1
+            env.render()
+            if done:
+                s = env.reset()
+                print(total)
+                total = 0
+    except KeyboardInterrupt:
+        pass
+    finally:
+        env.close()
@@ -3,9 +3,14 @@
 from torch import optim
 import torch
 import math
-from .prioritized_memory import Memory, WeightedMSE
 
-#(s,a) => Q(s,a)
+if __package__:
+    from .prioritized_memory import Memory, WeightedMSE
+else:
+    from prioritized_memory import Memory, WeightedMSE
+
+
+# (s,a) => Q(s,a)
 class DeepQL:
     def __init__(self, Net, noisy=True, eps=0.9, lr=5e-3, gamma=0.9, mbsize=20, C=100, N=500, L2=0, actionFinder=None):
         self.exp = []
@@ -17,7 +22,7 @@ def __init__(self, Net, noisy=True, eps=0.9, lr=5e-3, gamma=0.9, mbsize=20, C=10
         self.net2 = Net()
         self.net2.load_state_dict(self.net.state_dict())
         self.net2.eval()
-        self.C = C #for target replacement
+        self.C = C  # for target replacement
         self.c = 0
         self.replay = Memory(capacity=N)
         self.loss = WeightedMSE()
@@ -108,7 +113,7 @@ def update(self):
             self.c += 1
 
 
-#s => Q[s,a1], Q[s,a2]...
+# s => Q[s,a1], Q[s,a2]...
 class DeepQLv2:
     def __init__(self, Net, noisy=True, eps=0.9, lr=5e-3, gamma=0.9, mbsize=20, C=100, N=500, L2=0, actionFinder=None):
         self.exp = []
@@ -126,8 +131,8 @@ def __init__(self, Net, noisy=True, eps=0.9, lr=5e-3, gamma=0.9, mbsize=20, C=10
         self.eps = eps
         self.noisy = noisy
         self.actionFinder = actionFinder
-        self.A = []
-        # (state:tensor => Action :List[List])
+        *_,last=self.net.children()
+        self.A = list(range(last.out_features))
 
     def act(self, state):
         # state:list[float] A:list[list]
@@ -152,8 +157,6 @@ def findMaxA(self, state):
         net = self.net
         net.eval()
         Q = net(state)
-        if not self.A:
-            self.A = list(range(len(Q))) #[0,1,2,3...]
         net.train()
         return [int(Q.argmax())]
 
 
@@ -39,11 +39,11 @@ def reset_parameters(self,sig0):
             self.bias_sig.data.zero_()
             self.bias_sig.data = self.bias_sig.data.zero_() + sig0 / self.weight_mu.shape[1]
 
-    def sample(self, zero=1):
+    def sample(self):
         size_in = self.in_features
         size_out = self.out_features
-        noise_in = f(self.dist.sample((1, size_in))) * zero
-        noise_out = f(self.dist.sample((1, size_out))) * zero
+        noise_in = f(self.dist.sample((1, size_in)))
+        noise_out = f(self.dist.sample((1, size_out)))
         self.weight = self.weight_mu + self.weight_sig * torch.mm(noise_out.t(), noise_in)
         self.bias = (self.bias_mu + self.bias_sig * noise_out).squeeze()
 
 
@@ -1,12 +1,18 @@
-import numpy as np
+from os import path
+import sys
+local=path.abspath(__file__)
+root=path.dirname(path.dirname(path.dirname(local)))
+if root not in sys.path:
+    sys.path.append(root)
+
 import gym
 import torch
 import matplotlib.pyplot as plt
 import math
 import torch.nn as nn
 import torch.nn.functional as F
-from NoisyLayer import NoisyLinear
-import DQN_NoisyNet
+from DQNwithNoisyNet.NoisyLayer import NoisyLinear
+from DQNwithNoisyNet import DQN_NoisyNet
 from operator import methodcaller
 
 
@@ -79,7 +85,6 @@ def sample(self):
     s = env.reset()
     A = [[0], [1]]
     dqn = DQN_NoisyNet.DeepQLv2(NoisyNet2, noisy=True, lr=0.002, gamma=1, actionFinder=lambda x: A)
-
     process = []
     randomness = []
     epoch = 200
@@ -121,10 +126,10 @@ def sample(self):
         plt.show()
     env.close()
 
-    # torch.save(dqn.net.state_dict(),"./model.txt")
+    #torch.save(dqn.net.state_dict(),"./CartPoleExpert.txt")
     # dqn.eps=1
     total = 0
-    # dqn.net.load_state_dict(torch.load("./model.txt"))
+    #dqn.net.load_state_dict(torch.load("./CartPoleExpert.txt"))
     s = env.reset()
     s = torch.Tensor(s)
     while True: