Commit f426c30

Author: poet-libai (committed)
hw_submission(吴振锋): add hw4_20230316
1 parent c609bf7 commit f426c30

File tree: 6 files changed, +363 −0 lines changed

chapter4_reward/q1.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# RND

When the model is too small, it tends to underfit: the training error is larger and the intrinsic reward fluctuates over a wider range, although it is a bit odd that the small model fluctuates more than the little one. When the model is too large, it tends to overfit and generalizes poorly: the large and very_large models follow nearly identical curves, fit the training data completely (noise included), and converge more slowly.

![q1](q1.png)
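
To make the size comparison concrete, here is a rough sketch (not part of the submission) that counts the predictor parameters for each configuration in q1.py below, following the FCEncoder layout there (one Linear layer from the 2835-dim flat observation to the first hidden size, then one Linear per remaining hidden size); the helper name `predictor_param_count` is made up for illustration.

```python
# Rough parameter counts (weights + biases of the Linear layers only) for the
# predictor MLP defined by FCEncoder in q1.py, one line per RND config.
def predictor_param_count(obs_shape, hidden_size_list):
    sizes = [obs_shape] + list(hidden_size_list)
    return sum(n_in * n_out + n_out for n_in, n_out in zip(sizes[:-1], sizes[1:]))

configs = {
    "little": [32, 16],
    "small": [64, 64],
    "standard": [128, 64],
    "large": [256, 256],
    "very_large": [512, 512],
}
for name, hidden in configs.items():
    print(f"{name:>10}: {predictor_param_count(2835, hidden):,} parameters")
```

Since the intrinsic reward is the predictor–target MSE, the smallest predictors cannot drive that error down even on training states, while the largest ones have enough capacity to match the target closely wherever they have seen data, which is consistent with the curves in q1.png.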

chapter4_reward/q1.png

190 KB

chapter4_reward/q1.py

Lines changed: 324 additions & 0 deletions
@@ -0,0 +1,324 @@
# pip install minigrid
from typing import Union, Tuple, Dict, List, Optional
from multiprocessing import Process
import multiprocessing as mp
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import minigrid
import gymnasium as gym
from torch.optim.lr_scheduler import ExponentialLR, MultiStepLR
from tensorboardX import SummaryWriter
from minigrid.wrappers import FlatObsWrapper

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

train_config = dict(
    train_iter=1024,
    train_data_count=128,
    test_data_count=4096,
)

little_RND_net_config = dict(
    exp_name="little_rnd_network",
    observation_shape=2835,
    hidden_size_list=[32, 16],
    learning_rate=1e-3,
    batch_size=64,
    update_per_collect=100,
    obs_norm=True,
    obs_norm_clamp_min=-1,
    obs_norm_clamp_max=1,
    reward_mse_ratio=1e5,
)

small_RND_net_config = dict(
    exp_name="small_rnd_network",
    observation_shape=2835,
    hidden_size_list=[64, 64],
    learning_rate=1e-3,
    batch_size=64,
    update_per_collect=100,
    obs_norm=True,
    obs_norm_clamp_min=-1,
    obs_norm_clamp_max=1,
    reward_mse_ratio=1e5,
)

standard_RND_net_config = dict(
    exp_name="standard_rnd_network",
    observation_shape=2835,
    hidden_size_list=[128, 64],
    learning_rate=1e-3,
    batch_size=64,
    update_per_collect=100,
    obs_norm=True,
    obs_norm_clamp_min=-1,
    obs_norm_clamp_max=1,
    reward_mse_ratio=1e5,
)

large_RND_net_config = dict(
    exp_name="large_RND_network",
    observation_shape=2835,
    hidden_size_list=[256, 256],
    learning_rate=1e-3,
    batch_size=64,
    update_per_collect=100,
    obs_norm=True,
    obs_norm_clamp_min=-1,
    obs_norm_clamp_max=1,
    reward_mse_ratio=1e5,
)

very_large_RND_net_config = dict(
    exp_name="very_large_RND_network",
    observation_shape=2835,
    hidden_size_list=[512, 512],
    learning_rate=1e-3,
    batch_size=64,
    update_per_collect=100,
    obs_norm=True,
    obs_norm_clamp_min=-1,
    obs_norm_clamp_max=1,
    reward_mse_ratio=1e5,
)

class FCEncoder(nn.Module):

    def __init__(
        self,
        obs_shape: int,
        hidden_size_list,
        activation: Optional[nn.Module] = nn.ReLU(),
    ) -> None:
        super(FCEncoder, self).__init__()
        self.obs_shape = obs_shape
        self.act = activation
        self.init = nn.Linear(obs_shape, hidden_size_list[0])

        layers = []
        for i in range(len(hidden_size_list) - 1):
            layers.append(nn.Linear(hidden_size_list[i], hidden_size_list[i + 1]))
            layers.append(self.act)
        self.main = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.act(self.init(x))
        x = self.main(x)
        return x


class RndNetwork(nn.Module):

    def __init__(self, obs_shape: Union[int, list], hidden_size_list: list) -> None:
        super(RndNetwork, self).__init__()
        self.target = FCEncoder(obs_shape, hidden_size_list)
        self.predictor = FCEncoder(obs_shape, hidden_size_list)

        # The randomly initialized target network stays fixed; only the predictor is trained.
        for param in self.target.parameters():
            param.requires_grad = False

    def forward(self, obs: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        predict_feature = self.predictor(obs)
        with torch.no_grad():
            target_feature = self.target(obs)
        return predict_feature, target_feature


class RunningMeanStd(object):

    def __init__(self, epsilon=1e-4, shape=(), device=torch.device('cpu')):
        self._epsilon = epsilon
        self._shape = shape
        self._device = device
        self.reset()

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]

        new_count = batch_count + self._count
        mean_delta = batch_mean - self._mean
        new_mean = self._mean + mean_delta * batch_count / new_count
        # Note: this way of computing the new variance may be numerically unstable
        m_a = self._var * self._count
        m_b = batch_var * batch_count
        m2 = m_a + m_b + np.square(mean_delta) * self._count * batch_count / new_count
        new_var = m2 / new_count
        self._mean = new_mean
        self._var = new_var
        self._count = new_count

    def reset(self):
        if len(self._shape) > 0:
            self._mean = np.zeros(self._shape, 'float32')
            self._var = np.ones(self._shape, 'float32')
        else:
            self._mean, self._var = 0., 1.
        self._count = self._epsilon

    @property
    def mean(self) -> np.ndarray:
        if np.isscalar(self._mean):
            return self._mean
        else:
            return torch.FloatTensor(self._mean).to(self._device)

    @property
    def std(self) -> np.ndarray:
        std = np.sqrt(self._var + 1e-8)
        if np.isscalar(std):
            return std
        else:
            return torch.FloatTensor(std).to(self._device)

class RndRewardModel():

    def __init__(self, config) -> None:  # noqa
        super(RndRewardModel, self).__init__()
        self.cfg = config

        self.tb_logger = SummaryWriter(config["exp_name"])
        self.reward_model = RndNetwork(
            obs_shape=config["observation_shape"], hidden_size_list=config["hidden_size_list"]
        ).to(device)

        self.opt = optim.Adam(self.reward_model.predictor.parameters(), config["learning_rate"])
        self.scheduler = ExponentialLR(self.opt, gamma=0.997)

        self.estimate_cnt_rnd = 0
        if self.cfg["obs_norm"]:
            self._running_mean_std_rnd_obs = RunningMeanStd(epsilon=1e-4, device=device)

    def __del__(self):
        self.tb_logger.flush()
        self.tb_logger.close()

    def train(self, data) -> None:
        for _ in range(self.cfg["update_per_collect"]):
            train_data: list = random.sample(data, self.cfg["batch_size"])
            train_data: torch.Tensor = torch.stack(train_data).to(device)
            if self.cfg["obs_norm"]:
                # Note: observation normalization: transform obs to mean 0, std 1
                self._running_mean_std_rnd_obs.update(train_data.cpu().numpy())
                train_data = (train_data - self._running_mean_std_rnd_obs.mean) / self._running_mean_std_rnd_obs.std
                train_data = torch.clamp(
                    train_data, min=self.cfg["obs_norm_clamp_min"], max=self.cfg["obs_norm_clamp_max"]
                )

            predict_feature, target_feature = self.reward_model(train_data)
            loss = F.mse_loss(predict_feature, target_feature.detach())
            self.opt.zero_grad()
            loss.backward()
            self.opt.step()
        self.scheduler.step()

    def estimate(self, data: list) -> Tuple[torch.Tensor, ...]:
        """
        Estimate the RND intrinsic reward.
        """
        obs = torch.stack(data).to(device)
        if self.cfg["obs_norm"]:
            # Note: observation normalization: transform obs to mean 0, std 1
            obs = (obs - self._running_mean_std_rnd_obs.mean) / self._running_mean_std_rnd_obs.std
            obs = torch.clamp(obs, min=self.cfg["obs_norm_clamp_min"], max=self.cfg["obs_norm_clamp_max"])

        with torch.no_grad():
            self.estimate_cnt_rnd += 1
            predict_feature, target_feature = self.reward_model(obs)
            mse = F.mse_loss(predict_feature, target_feature, reduction='none').mean(dim=1)
            self.tb_logger.add_scalar('rnd_reward/mse', mse.cpu().numpy().mean(), self.estimate_cnt_rnd)

            # Note: scale the per-sample MSE into the intrinsic reward; the commented-out
            # alternative is a min-max normalization to [0, 1].
            rnd_reward = mse * self.cfg["reward_mse_ratio"]  # (mse - mse.min()) / (mse.max() - mse.min() + 1e-11)

            self.tb_logger.add_scalar('rnd_reward/rnd_reward_max', rnd_reward.max(), self.estimate_cnt_rnd)
            self.tb_logger.add_scalar('rnd_reward/rnd_reward_mean', rnd_reward.mean(), self.estimate_cnt_rnd)
            self.tb_logger.add_scalar('rnd_reward/rnd_reward_min', rnd_reward.min(), self.estimate_cnt_rnd)

            rnd_reward = torch.chunk(rnd_reward, rnd_reward.shape[0], dim=0)
        return rnd_reward

def training(config, train_data, test_data):
    rnd_reward_model = RndRewardModel(config=config)
    for i in range(train_config["train_iter"]):
        rnd_reward_model.train([torch.Tensor(item["last_observation"]) for item in train_data[i]])
        rnd_reward_model.estimate([torch.Tensor(item["last_observation"]) for item in test_data])


def main():
    env = gym.make("MiniGrid-Empty-8x8-v0")
    env_obs = FlatObsWrapper(env)

    train_data = []
    test_data = []

    # Collect one batch of random-policy transitions for every training iteration.
    for i in range(train_config["train_iter"]):

        train_data_per_iter = []

        while len(train_data_per_iter) < train_config["train_data_count"]:
            last_observation, _ = env_obs.reset()
            terminated = False
            while not terminated and len(train_data_per_iter) < train_config["train_data_count"]:
                action = env_obs.action_space.sample()
                observation, reward, terminated, truncated, info = env_obs.step(action)
                train_data_per_iter.append(
                    {
                        "last_observation": last_observation,
                        "action": action,
                        "reward": reward,
                        "observation": observation
                    }
                )
                last_observation = observation

        train_data.append(train_data_per_iter)

    # Collect a fixed test set of random-policy transitions.
    while len(test_data) < train_config["test_data_count"]:
        last_observation, _ = env_obs.reset()
        terminated = False
        while not terminated and len(test_data) < train_config["test_data_count"]:
            action = env_obs.action_space.sample()
            observation, reward, terminated, truncated, info = env_obs.step(action)
            test_data.append(
                {
                    "last_observation": last_observation,
                    "action": action,
                    "reward": reward,
                    "observation": observation
                }
            )
            last_observation = observation
    env_obs.close()

    p0 = Process(target=training, args=(little_RND_net_config, train_data, test_data))
    p0.start()

    p1 = Process(target=training, args=(small_RND_net_config, train_data, test_data))
    p1.start()

    p2 = Process(target=training, args=(standard_RND_net_config, train_data, test_data))
    p2.start()

    p3 = Process(target=training, args=(large_RND_net_config, train_data, test_data))
    p3.start()

    p4 = Process(target=training, args=(very_large_RND_net_config, train_data, test_data))
    p4.start()

    p0.join()
    p1.join()
    p2.join()
    p3.join()
    p4.join()


if __name__ == "__main__":
    mp.set_start_method('spawn')
    main()

chapter4_reward/q2.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Please install the latest DI-engine main branch first
from ding.bonus import PPOF


def acrobot():
    # Please install the acrobot env first: `pip3 install gym`
    # You can refer to the env doc (https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/acrobot_zh.html) for more details
    agent = PPOF(env='acrobot', exp_name='./acrobot_demo')
    agent.train(step=int(1e5))
    # Classic RL interaction loop and save replay video
    agent.deploy(enable_save_replay=True)


def metadrive():
    # Please install the metadrive env first: `pip install metadrive-simulator`
    # You can refer to the env doc (https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/metadrive_zh.html) for more details
    agent = PPOF(env='metadrive', exp_name='./metadrive_demo')
    agent.train(step=int(1e6), context='spawn')
    # Classic RL interaction loop and save replay video
    agent.deploy(enable_save_replay=True)


def minigrid_fourroom():
    # Please install the minigrid env first: `pip install gym-minigrid`
    # Note: the minigrid env doesn't support the Windows platform
    # You can refer to the env doc (https://di-engine-docs.readthedocs.io/zh_CN/latest/13_envs/minigrid_zh.html) for more details
    agent = PPOF(env='minigrid_fourroom', exp_name='./minigrid_fourroom_demo')
    agent.train(step=int(3e6))
    # Classic RL interaction loop and save replay video
    agent.deploy(enable_save_replay=True)


if __name__ == "__main__":
    # acrobot()
    metadrive()
    # minigrid_fourroom()

chapter4_reward/q2_1.png

186 KB

chapter4_reward/q2_2.png

84.8 KB
