2 changes: 1 addition & 1 deletion .gitignore
@@ -1,2 +1,2 @@
__pycache__/

.idea/
92 changes: 92 additions & 0 deletions Cattelle/ExperienceReplay.py
@@ -0,0 +1,92 @@
from collections import deque

import numpy as np

from utils import process_screen


class ExperienceReplay:
"""
This class defines a handy structure to store and handle the experience replay memory

It provides the following actions:
* Process the screen (convert to grayscale, downscale, crop)
* Update the underlying experience replay array
* Randomly sample the array to yield a minibatch
* Append new samples to the array
"""

def __init__(self, size, history_length=4, minibatch_size=32):
"""
Initialise the underlying array holding the experience replay memory

One sample is defined as a (s,a,r,s',d) tuple where one state (s) corresponds to the history_length last
frames stacked together. Each state is implemented as a deque, which avoids having to handle the maximum
length and speeds up access time to both ends of the queue.

The memory is implemented as a deque as well, and is filled from left to right. The right-most sample is
thus always the newest one.

Args:
size (int): Total number of samples to keep in the memory
history_length (int): Number of frames to keep in one state (stacked together). Default is 4
minibatch_size (int): Number of samples in a minibatch
"""
self.memory = deque(maxlen=size)
self.history_length = history_length
self.size = size
self.minibatch_size = minibatch_size

def append_sample(self, s, a, r, s_new, d):
"""
Append a new sample to the ER memory.

The screen states will be processed and appended to the correct stacks
Args:
s (np.ndarray): Raw (unprocessed) screen state
a (int): Action taken at state s leading to state s_new
r (float): Reward for taking action a at state s
s_new (np.ndarray): Raw (unprocessed) screen state
d (bool): True if the game is in a terminal state (game over), False otherwise
"""

if len(self.memory) == 0 or self.memory[-1][4] is True:
# We handle the initial insertion or the first one after a terminal differently
# The initial state in this case is 4 times the same frame
s = process_screen(s)
state = deque([s] * self.history_length, maxlen=self.history_length) # state = [s, s, ..., s]

state_new = state.copy()
state_new.append(process_screen(s_new)) # state_new = [s, s, ..., s_new]

self.memory.append((state, a, r, state_new, d))

else:
# Grab the last sample recorded
last_sample = self.memory[-1]

# Build the new state (stack)
new_state = last_sample[3].copy()
new_state.append(process_screen(s_new))

# And append to the memory
self.memory.append((last_sample[3], a, r, new_state, d))

def minibatch(self):
"""
Randomly samples a minibatch of size minibatch_size and returns it
Returns:
minibatch (np.ndarray): Randomly sampled minibatch, transposed so that each row gathers one field of the (s, a, r, s', d) tuples
"""

# Get the current size of the memory
size = len(self.memory)

if size < self.minibatch_size:
raise IndexError(f'minibatch_size ({self.minibatch_size}) is larger than the current size of the ER '
f'memory ({size})')

# self.memory is not 1D so we cannot sample it directly; instead we sample indices and build back an array
indices = np.random.choice(size, self.minibatch_size)

return np.array([self.memory[i] for i in indices]).T
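
A quick way to read the class above is to see how it would be driven in isolation. The sketch below is not part of the diff: it fills the replay memory with a few random transitions using the same PLE setup as retrainer.py and then draws one minibatch; the random action policy is purely illustrative.

import numpy as np
from ple import PLE
from ple.games import FlappyBird

from ExperienceReplay import ExperienceReplay
from config import Config as config

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
p.init()

er = ExperienceReplay(config.ER_SIZE, config.HISTORY_LENGTH, config.MINIBATCH_SIZE)

screen = p.getScreenRGB()
for _ in range(config.MINIBATCH_SIZE):
    # Random action for illustration only: no-op (0) or flap (119)
    action = np.random.choice([0, 119], p=[1 - config.PROB_FLAP, config.PROB_FLAP])
    reward = p.act(action)
    er.append_sample(screen, action, reward, p.getScreenRGB(), p.game_over())
    if p.game_over():
        p.reset_game()
    screen = p.getScreenRGB()

# minibatch() returns the transposed sample array, so each row is one field
# of the (s, a, r, s', d) tuples
states, actions, rewards, new_states, dones = er.minibatch()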
28 changes: 28 additions & 0 deletions Cattelle/FlappyAgent.py
@@ -0,0 +1,28 @@
import numpy as np
from keras import models

from config import Config as config
from utils import StateHolder

# Load the trained DQN from disk
dqn = models.load_model(config.MODEL_FILENAME)

stateholder = StateHolder()


def FlappyPolicy(_, screen):
"""
Main game policy, defines the behaviour of the agent
Args:
_ (dict) : The state vector of the simulator, ignored here
screen (numpy.ndarray): Current state of the screen (RGB matrix)

Returns:
action (int): The action to take
"""
stateholder.append(screen)
state = stateholder.get_dqn_input()

Q = dqn.predict(state) # Expects a (n_samples, history_length, 84, 84) input

return np.argmax(Q) * 119 # argmax is either 0 or 1 with convention 0: no-op; 1: flap
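
FlappyPolicy is the entry point consumed by the evaluation script (run.py, excerpted at the end of this diff). The full evaluation loop is not shown here, so the following is an assumed sketch of how it is typically driven: the simulator's state dict is passed but ignored, and only the raw screen is used.

from ple import PLE
from ple.games import FlappyBird

from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
p.init()

p.reset_game()
cumulated = 0
while not p.game_over():
    screen = p.getScreenRGB()
    action = FlappyPolicy(game.getGameState(), screen)  # returns 0 (no-op) or 119 (flap)
    cumulated += p.act(action)
print(cumulated)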
37 changes: 37 additions & 0 deletions Cattelle/config.py
@@ -0,0 +1,37 @@
class Config:
# Experience replay settings
ER_SIZE = 20000 # Total number of samples to keep in the Experience Replay memory
HISTORY_LENGTH = 4 # Number of frames to keep in each state
MINIBATCH_SIZE = 32 # Number of samples in a single minibatch, must be < MIN_ER_SIZE

# Network settings
OPTIMISER = 'rmsprop'
LEARNING_RATE = 1e-6
DECAY = 0.9
MOMENTUM = 0.95

# Learning settings
INITIALISATION_STD = 0.1 # Standard deviation used for initialising weights of the conv2d layers
TIMESTEPS = 100000 # Number of timesteps used for learning, one action is taken per timestep
INITIAL_EPS = 1.0 # Initial value for the exploration parameter epsilon
DISCOUNT_RATE = 0.95 # Gamma discount rate used in the Q-learning update
MIN_ER_SIZE = 3000 # Minimum number of samples in the ER to begin learning
TEST_DELTA = 10000 # Number of timesteps between two successive tests of the network
NUM_TEST_TRIALS = 10 # Number of trials to conduct during each test session
PROB_FLAP = 1 / 4 # Probability of action "flap" (119) when taking a random action during the exploration

# Simulator settings
REWARD_ALIVE = 0.1 # Reward granted for each timestep where the player remains alive (except if it passes a pipe)

# Misc. settings
MODEL_FILENAME = 'dqn.h5'
SAVE_DELTA = 5000 # Number of timesteps between two successive saves of the network, must be > MIN_ER_SIZE


class DebugConfig(Config):
TIMESTEPS = 100
MIN_ER_SIZE = 10
MINIBATCH_SIZE = 5
ER_SIZE = 50
SAVE_DELTA = 50
TEST_DELTA = 25
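
Since retrainer.py and FlappyAgent.py only import Config under the alias config, DebugConfig can be swapped in for a quick smoke test by changing that single import. The snippet below is an assumption about how one might do this; the checks simply restate the constraints noted in the comments above.

from config import DebugConfig as config

assert config.MINIBATCH_SIZE < config.MIN_ER_SIZE   # 5 < 10, matches the comment on MINIBATCH_SIZE
assert config.SAVE_DELTA > config.MIN_ER_SIZE       # 50 > 10, matches the comment on SAVE_DELTA
print(config.TIMESTEPS, config.ER_SIZE)             # 100 50: values overridden for debugging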
Binary file added Cattelle/dqn.h5
Binary file not shown.
222 changes: 222 additions & 0 deletions Cattelle/retrainer.py
@@ -0,0 +1,222 @@
import keras
import numpy as np
from ple import PLE
from ple.games import FlappyBird
from tqdm import trange

from ExperienceReplay import ExperienceReplay
from config import Config as config
from utils import StateHolder


class Retrainer:
"""
Retraining class, used to further train an existing Keras model
"""

def __init__(self, model_file):
"""
Load the existing model file
Args:
model_file (str): Path to the model file (h5 file)
"""

self.model = keras.models.load_model(model_file)
self.er = ExperienceReplay(config.ER_SIZE, config.HISTORY_LENGTH, config.MINIBATCH_SIZE)
self.stateholder = StateHolder()

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for
# learning, just for display purposes.
p.init()

self.game = game
self.p = p

self._gen_er_samples()

def train_network(self):
""""
Train the DQN according to the settings defined in config.py
"""
print(f'ER memory primed with {len(self.er.memory)} samples')
print('Starting training')

p = self.p
screen = p.getScreenRGB()

# Main training loop, runs for config.TIMESTEPS iterations
for i in trange(config.TIMESTEPS):
action = self.eps_greedy()

reward = p.act(action)
# Shape the reward to include REWARD_ALIVE
# This is awarded when the agent survives for one timestep (without passing a pipe)
if reward == 0.0:
reward = config.REWARD_ALIVE
# Clip negative rewards so that the reward space remains in [-1,1]
if reward < 0:
reward = -1.0

new_screen = p.getScreenRGB()
done = p.game_over()

self.er.append_sample(screen, action, reward, new_screen, done)

state, a, r, new_state, D = self.er.minibatch()

state = self._unpack_state(state)
new_state = self._unpack_state(new_state)

Q = self.model.predict(state) # shape (minibatch_size, 2)
new_Q = self.model.predict(new_state) # shape (minibatch_size, 2)

# row-wise maximum, shape (minibatch_size, )
max_new_Q = new_Q.max(1).reshape((config.MINIBATCH_SIZE,))

update = r + (1 - D) * (config.DISCOUNT_RATE * max_new_Q)

# Update only the column corresponding to the action taken in each sample (per-row fancy indexing)
Q[np.arange(config.MINIBATCH_SIZE), (a // 119).astype(int)] = update

# Incremental training
self.model.train_on_batch(x=state, y=Q)

if i % config.TEST_DELTA == 0 and i > 0:
print('Testing the network...')
mean_score, max_score = self.eval_network(config.NUM_TEST_TRIALS)
print('Current scores for the network:\n',
f'\tmean -> {mean_score}\n'
f'\tmax -> {max_score}')

if i % config.SAVE_DELTA == 0 and i > config.MIN_ER_SIZE:
print('Saving network...')
self._write_network(config.MODEL_FILENAME)

if done:
p.reset_game()

screen = p.getScreenRGB()

print('Training done, saving final weights')
self._write_network(config.MODEL_FILENAME)

def eps_greedy(self):
"""
Epsilon-greedy exploration. Takes a random action with probability epsilon, otherwise the greedy
action from the current Q-network. During retraining epsilon is kept fixed at 0.1 rather than
decaying linearly from 1.0 to 0.1 as during the initial training.
Returns:
action (int): The next action to take
"""
epsilon = 0.1 # Fixed epsilon during retraining

if np.random.rand() <= epsilon:
# Take a random action, either no-op (0) or flap (119)
action = np.random.choice([0, 119], p=[1 - config.PROB_FLAP, config.PROB_FLAP])
else:
state = self.er.memory[-1][3]
state = self._unpack_state(state).reshape((1, config.HISTORY_LENGTH, 84, 84))
# reshape necessary since model.predict expects a batch of samples (here a batch with a single sample)
action_array = self.model.predict(state)
action = action_array.argmax()
action *= 119 # the argmax is either 0 or 1, whereas the correct actions are either 0 or 119

return action

def eval_network(self, trials=20):
"""
Evaluate the current performance of the network.
Args:
trials (int): Number of trials to perform. One trial is one full game, from initialisation to game over

Returns:
results (tuple): Tuple of (mean score, max score). The mean score is averaged over all trials
"""

scores = np.zeros(trials)

# Create a local copy of the simulator to prevent messing up the training simulator
game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for
# learning, just for display purposes.
p.init()

for i in range(trials):
p.reset_game()
screen = p.getScreenRGB()
holder = StateHolder()
holder.append(screen)

while not p.game_over():
action_array = self.model.predict(holder.get_dqn_input())
action = action_array.argmax() * 119

scores[i] += p.act(action)
holder.append(p.getScreenRGB())

return scores.mean(), scores.max()

def _write_network(self, filename='weights.dqn'):
"""
Save the full model (architecture + weights + status of the optimiser) to the HDF5 archive located at
"filename"
Args:
filename (str): Location of saved model (path)
"""
self.model.save(filename)

def _unpack_state(self, state):
"""
Unroll the state array (array of deques) into a dense array by converting each deque into an ndarray

Args:
state (np.ndarray): State of deques to unpack, shape (n,)

Returns:
unpacked (np.ndarray): Unpacked state, ready to be fed to the DQN, shape (n, history_length, 84, 84)
"""
return np.array([np.array(elt) for elt in state])

def _gen_er_samples(self):
"""
Use the existing model to generate enough samples to start training (i.e. MIN_ER_SIZE samples according to
the config file)
"""
print(f"Generating {config.MIN_ER_SIZE} samples using the existing model")

self.stateholder.append(self.p.getScreenRGB())

for i in trange(config.MIN_ER_SIZE):
screen = self.p.getScreenRGB()

action = self.model.predict(self.stateholder.get_dqn_input())
action = action.argmax() * 119

reward = self.p.act(action)

# Shape reward exactly as we do during training
if reward == 0.0:
reward = config.REWARD_ALIVE
if reward < 0.0:
reward = -1.0

new_screen = self.p.getScreenRGB()
done = self.p.game_over()

# Append to the stateholder
self.stateholder.append(new_screen)

# Append to the ER
self.er.append_sample(screen, action, reward, new_screen, done)

if done:
self.p.reset_game()

print(f'Successfully generated {config.MIN_ER_SIZE} samples')


if __name__ == '__main__':
retrainer = Retrainer(config.MODEL_FILENAME)
retrainer.train_network()
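
The core of train_network is the Bellman target written into the Q matrix before train_on_batch. The standalone numpy sketch below uses made-up values and shows the intended per-sample update, matching the per-row assignment in train_network: each row of Q is overwritten only in the column of the action actually taken, using r + gamma * max_a' Q(s', a') masked by the terminal flag.

import numpy as np

gamma = 0.95                                   # config.DISCOUNT_RATE
a = np.array([0, 119, 0, 119])                 # actions taken (0 = no-op, 119 = flap)
r = np.array([0.1, 0.1, -1.0, 1.0])            # shaped rewards
D = np.array([0, 0, 1, 0])                     # terminal flags
Q = np.zeros((4, 2))                           # stands in for model.predict(state)
new_Q = np.array([[0.2, 0.5],
                  [0.1, 0.4],
                  [0.3, 0.0],
                  [0.6, 0.2]])                 # stands in for model.predict(new_state)

target = r + (1 - D) * gamma * new_Q.max(axis=1)
Q[np.arange(len(a)), a // 119] = target        # per-row assignment, one column per sample
print(Q)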
2 changes: 1 addition & 1 deletion RandomBird/run.py → Cattelle/run.py
@@ -26,4 +26,4 @@
cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
max_score = np.max(cumulated)