2 changes: 1 addition & 1 deletion .gitignore
@@ -1,2 +1,2 @@
__pycache__/

.idea/
92 changes: 92 additions & 0 deletions Cattelle/ExperienceReplay.py
@@ -0,0 +1,92 @@
from collections import deque

import numpy as np

from utils import process_screen


class ExperienceReplay:
"""
This class defines a handy structure to store and handle the experience replay memory

It provides the following actions:
* Process the screen (convert to grayscale, downscale, crop)
* Update the underlying experience replay array
* Randomly sample the array to yield a minibatch
* Append new samples to the array
"""

def __init__(self, size, history_length=4, minibatch_size=32):
"""
Initialise the underlying array holding the experience replay memory

One sample is defined as a (s,a,r,s',d) tuple where one state (s) corresponds to the history_length last
frames stacked together. Each state is implemented as a deque, which avoids having to handle the maximum
length and speeds up access time to both ends of the queue.

The memory is implemented as a deque as well, and is filled from left to right. The right-most sample is
thus always the newest one.

Args:
size (int): Total number of samples to keep in the memory
history_length (int): Number of frames to keep in one state (stacked together). Default is 4
minibatch_size (int): Number of samples in a minibatch
"""
self.memory = deque(maxlen=size)
self.history_length = history_length
self.size = size
self.minibatch_size = minibatch_size

def append_sample(self, s, a, r, s_new, d):
"""
Append a new sample to the ER memory.

The screen states will be processed and appended to the correct stacks
Args:
s (np.ndarray): Raw (unprocessed) screen state
a (int): Action taken at state s leading to state s_new
r (float): Reward for taking action a at state s
s_new (np.ndarray): Raw (unprocessed) screen state
d (bool): True if the game is in a terminal state (game over), False otherwise
"""

if len(self.memory) == 0 or self.memory[-1][4] is True:
# We handle the initial insertion or the first one after a terminal differently
# The initial state in this case is 4 times the same frame
s = process_screen(s)
state = deque([s] * self.history_length, maxlen=self.history_length) # state = [s, s, ..., s]

state_new = state.copy()
state_new.append(process_screen(s_new)) # state_new = [s, s, ..., s_new]

self.memory.append((state, a, r, state_new, d))

else:
# Grab the last sample recorded
last_sample = self.memory[-1]

# Build the new state (stack)
new_state = last_sample[3].copy()
new_state.append(process_screen(s_new))

# And append to the memory
self.memory.append((last_sample[3], a, r, new_state, d))

def minibatch(self):
"""
Randomly samples a minibatch of size minibatch_size and returns it
Returns:
minibatch (np.ndarray): Randomly sampled minibatch, transposed so that each row gathers one field of the (s, a, r, s', d) tuples
"""

# Get the current size of the memory
size = len(self.memory)

if size < self.minibatch_size:
raise IndexError(f'minibatch_size ({self.minibatch_size}) is larger than the current size of the ER '
f'memory ({size})')

# self.memory is not 1D so we cannot sample it directly; instead we sample indices and build back an array
indices = np.random.choice(size, self.minibatch_size)

return np.array([self.memory[i] for i in indices]).T
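
A quick way to read the class above is to see how it would be driven in isolation. The sketch below is not part of the diff: it fills the replay memory with a few random transitions using the same PLE setup as retrainer.py and then draws one minibatch; the random action policy is purely illustrative.

import numpy as np
from ple import PLE
from ple.games import FlappyBird

from ExperienceReplay import ExperienceReplay
from config import Config as config

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=False)
p.init()

er = ExperienceReplay(config.ER_SIZE, config.HISTORY_LENGTH, config.MINIBATCH_SIZE)

screen = p.getScreenRGB()
for _ in range(config.MINIBATCH_SIZE):
    # Random action for illustration only: no-op (0) or flap (119)
    action = np.random.choice([0, 119], p=[1 - config.PROB_FLAP, config.PROB_FLAP])
    reward = p.act(action)
    er.append_sample(screen, action, reward, p.getScreenRGB(), p.game_over())
    if p.game_over():
        p.reset_game()
    screen = p.getScreenRGB()

# minibatch() returns the transposed sample array, so each row is one field
# of the (s, a, r, s', d) tuples
states, actions, rewards, new_states, dones = er.minibatch()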
28 changes: 28 additions & 0 deletions Cattelle/FlappyAgent.py
@@ -0,0 +1,28 @@
import numpy as np
from keras import models

from config import Config as config
from utils import StateHolder

# Load the trained DQN from disk
dqn = models.load_model(config.MODEL_FILENAME)

stateholder = StateHolder()


def FlappyPolicy(_, screen):
"""
Main game policy, defines the behaviour of the agent
Args:
_ (dict) : The state vector of the simulator, ignored here
screen (numpy.ndarray): Current state of the screen (RGB matrix)

Returns:
action (int): The action to take
"""
stateholder.append(screen)
state = stateholder.get_dqn_input()

Q = dqn.predict(state) # Expects a (n_samples, history_length, 84, 84) input

return np.argmax(Q) * 119 # argmax is either 0 or 1 with convention 0: no-op; 1: flap
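
FlappyPolicy is the entry point consumed by the evaluation script (run.py, excerpted at the end of this diff). The full evaluation loop is not shown here, so the following is an assumed sketch of how it is typically driven: the simulator's state dict is passed but ignored, and only the raw screen is used.

from ple import PLE
from ple.games import FlappyBird

from FlappyAgent import FlappyPolicy

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
p.init()

p.reset_game()
cumulated = 0
while not p.game_over():
    screen = p.getScreenRGB()
    action = FlappyPolicy(game.getGameState(), screen)  # returns 0 (no-op) or 119 (flap)
    cumulated += p.act(action)
print(cumulated)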
37 changes: 37 additions & 0 deletions Cattelle/config.py
@@ -0,0 +1,37 @@
class Config:
# Experience replay settings
ER_SIZE = 20000 # Total number of samples to keep in the Experience Replay memory
HISTORY_LENGTH = 4 # Number of frames to keep in each state
MINIBATCH_SIZE = 32 # Number of samples in a single minibatch, must be < MIN_ER_SIZE

# Network settings
OPTIMISER = 'rmsprop'
LEARNING_RATE = 1e-6
DECAY = 0.9
MOMENTUM = 0.95

# Learning settings
INITIALISATION_STD = 0.1 # Standard deviation used for initialising weights of the conv2d layers
TIMESTEPS = 100000 # Number of timesteps used for learning, one action is taken per timestep
INITIAL_EPS = 1.0 # Initial value for the exploration parameter epsilon
DISCOUNT_RATE = 0.95 # Gamma discount rate used in the Q-learning update
MIN_ER_SIZE = 3000 # Minimum number of samples in the ER to begin learning
TEST_DELTA = 10000 # Number of timesteps between two successive tests of the network
NUM_TEST_TRIALS = 10 # Number of trials to conduct during each test session
PROB_FLAP = 1 / 4 # Probability of action "flap" (119) when taking a random action during the exploration

# Simulator settings
REWARD_ALIVE = 0.1 # Reward granted for each timestep where the player remains alive (except if it passes a pipe)

# Misc. settings
MODEL_FILENAME = 'dqn.h5'
SAVE_DELTA = 5000 # Number of timesteps between two successive saves of the network, must be > MIN_ER_SIZE


class DebugConfig(Config):
TIMESTEPS = 100
MIN_ER_SIZE = 10
MINIBATCH_SIZE = 5
ER_SIZE = 50
SAVE_DELTA = 50
TEST_DELTA = 25
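
Since retrainer.py and FlappyAgent.py only import Config under the alias config, DebugConfig can be swapped in for a quick smoke test by changing that single import. The snippet below is an assumption about how one might do this; the checks simply restate the constraints noted in the comments above.

from config import DebugConfig as config

assert config.MINIBATCH_SIZE < config.MIN_ER_SIZE   # 5 < 10, matches the comment on MINIBATCH_SIZE
assert config.SAVE_DELTA > config.MIN_ER_SIZE       # 50 > 10, matches the comment on SAVE_DELTA
print(config.TIMESTEPS, config.ER_SIZE)             # 100 50: values overridden for debugging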
Binary file added Cattelle/dqn.h5
Binary file not shown.
222 changes: 222 additions & 0 deletions Cattelle/retrainer.py
@@ -0,0 +1,222 @@
import keras
import numpy as np
from ple import PLE
from ple.games import FlappyBird
from tqdm import trange

from ExperienceReplay import ExperienceReplay
from config import Config as config
from utils import StateHolder


class Retrainer:
"""
Retraining class, used to further train an existing Keras model
"""

def __init__(self, model_file):
"""
Load the existing model file
Args:
model_file (str): Path to the model file (h5 file)
"""

self.model = keras.models.load_model(model_file)
self.er = ExperienceReplay(config.ER_SIZE, config.HISTORY_LENGTH, config.MINIBATCH_SIZE)
self.stateholder = StateHolder()

game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for
# learning, just for display purposes.
p.init()

self.game = game
self.p = p

self._gen_er_samples()

def train_network(self):
""""
Train the DQN according to the settings defined in config.py
"""
print(f'ER memory primed with {len(self.er.memory)} samples')
print('Starting training')

p = self.p
screen = p.getScreenRGB()

# Main training loop, runs for config.TIMESTEPS iterations
for i in trange(config.TIMESTEPS):
action = self.eps_greedy()

reward = p.act(action)
# Shape the reward to include REWARD_ALIVE
# This is awarded when the agent survives for one timestep (without passing a pipe)
if reward == 0.0:
reward = config.REWARD_ALIVE
# Clip negative rewards so that the reward space remains in [-1,1]
if reward < 0:
reward = -1.0

new_screen = p.getScreenRGB()
done = p.game_over()

self.er.append_sample(screen, action, reward, new_screen, done)

state, a, r, new_state, D = self.er.minibatch()

state = self._unpack_state(state)
new_state = self._unpack_state(new_state)

Q = self.model.predict(state) # shape (minibatch_size, 2)
new_Q = self.model.predict(new_state) # shape (minibatch_size, 2)

# row-wise maximum, shape (minibatch_size, )
max_new_Q = new_Q.max(1).reshape((config.MINIBATCH_SIZE,))

update = r + (1 - D) * (config.DISCOUNT_RATE * max_new_Q)

# Update only the column corresponding to the action taken in each sample (per-row fancy indexing)
Q[np.arange(config.MINIBATCH_SIZE), (a // 119).astype(int)] = update

# Incremental training
self.model.train_on_batch(x=state, y=Q)

if i % config.TEST_DELTA == 0 and i > 0:
print('Testing the network...')
mean_score, max_score = self.eval_network(config.NUM_TEST_TRIALS)
print('Current scores for the network:\n',
f'\tmean -> {mean_score}\n'
f'\tmax -> {max_score}')

if i % config.SAVE_DELTA == 0 and i > config.MIN_ER_SIZE:
print('Saving network...')
self._write_network(config.MODEL_FILENAME)

if done:
p.reset_game()

screen = p.getScreenRGB()

print('Training done, saving final weights')
self._write_network(config.MODEL_FILENAME)

def eps_greedy(self):
"""
Epsilon-greedy exploration. Takes a random action with probability epsilon, otherwise the greedy
action from the current Q-network. During retraining epsilon is kept fixed at 0.1 rather than
decaying linearly from 1.0 to 0.1 as during the initial training.
Returns:
action (int): The next action to take
"""
epsilon = 0.1 # Fixed epsilon during retraining

if np.random.rand() <= epsilon:
# Take a random action, either no-op (0) or flap (119)
action = np.random.choice([0, 119], p=[1 - config.PROB_FLAP, config.PROB_FLAP])
else:
state = self.er.memory[-1][3]
state = self._unpack_state(state).reshape((1, config.HISTORY_LENGTH, 84, 84))
# reshape necessary since model.predict expects a batch of samples (here a batch with a single sample)
action_array = self.model.predict(state)
action = action_array.argmax()
action *= 119 # the argmax is either 0 or 1, whereas the correct actions are either 0 or 119

return action

def eval_network(self, trials=20):
"""
Evaluate the current performance of the network.
Args:
trials (int): Number of trials to perform. One trial is one full game, from initialisation to game over

Returns:
results (tuple): Tuple of (mean score, max score). The mean score is averaged over all trials
"""

scores = np.zeros(trials)

# Create a local copy of the simulator to prevent messing up the training simulator
game = FlappyBird(graphics="fixed")
p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)
# Note: if you want to see your agent act in real time, set force_fps to False. But don't use this setting for
# learning, just for display purposes.
p.init()

for i in range(trials):
p.reset_game()
screen = p.getScreenRGB()
holder = StateHolder()
holder.append(screen)

while not p.game_over():
action_array = self.model.predict(holder.get_dqn_input())
action = action_array.argmax() * 119

scores[i] += p.act(action)
holder.append(p.getScreenRGB())

return scores.mean(), scores.max()

def _write_network(self, filename='weights.dqn'):
"""
Save the full model (architecture + weights + status of the optimiser) to the HDF5 archive located at
"filename"
Args:
filename (str): Location of saved model (path)
"""
self.model.save(filename)

def _unpack_state(self, state):
"""
Unroll the state array (array of deques) into a dense array by converting each deque into an ndarray

Args:
state (np.ndarray): State of deques to unpack, shape (n,)

Returns:
unpacked (np.ndarray): Unpacked state, ready to be fed to the DQN, shape (n, history_length, 84, 84)
"""
return np.array([np.array(elt) for elt in state])

def _gen_er_samples(self):
"""
Use the existing model to generate enough samples to start training (i.e. MIN_ER_SIZE samples according to
the config file)
"""
print(f"Generating {config.MIN_ER_SIZE} samples using the existing model")

self.stateholder.append(self.p.getScreenRGB())

for i in trange(config.MIN_ER_SIZE):
screen = self.p.getScreenRGB()

action = self.model.predict(self.stateholder.get_dqn_input())
action = action.argmax() * 119

reward = self.p.act(action)

# Shape reward exactly as we do during training
if reward == 0.0:
reward = config.REWARD_ALIVE
if reward < 0.0:
reward = -1.0

new_screen = self.p.getScreenRGB()
done = self.p.game_over()

# Append to the stateholder
self.stateholder.append(new_screen)

# Append to the ER
self.er.append_sample(screen, action, reward, new_screen, done)

if done:
self.p.reset_game()

print(f'Successfully generated {config.MIN_ER_SIZE} samples')


if __name__ == '__main__':
retrainer = Retrainer(config.MODEL_FILENAME)
retrainer.train_network()
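
The core of train_network is the Bellman target written into the Q matrix before train_on_batch. The standalone numpy sketch below uses made-up values and shows the intended per-sample update, matching the per-row assignment in train_network: each row of Q is overwritten only in the column of the action actually taken, using r + gamma * max_a' Q(s', a') masked by the terminal flag.

import numpy as np

gamma = 0.95                                   # config.DISCOUNT_RATE
a = np.array([0, 119, 0, 119])                 # actions taken (0 = no-op, 119 = flap)
r = np.array([0.1, 0.1, -1.0, 1.0])            # shaped rewards
D = np.array([0, 0, 1, 0])                     # terminal flags
Q = np.zeros((4, 2))                           # stands in for model.predict(state)
new_Q = np.array([[0.2, 0.5],
                  [0.1, 0.4],
                  [0.3, 0.0],
                  [0.6, 0.2]])                 # stands in for model.predict(new_state)

target = r + (1 - D) * gamma * new_Q.max(axis=1)
Q[np.arange(len(a)), a // 119] = target        # per-row assignment, one column per sample
print(Q)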
2 changes: 1 addition & 1 deletion RandomBird/run.py → Cattelle/run.py
@@ -26,4 +26,4 @@
cumulated[i] = cumulated[i] + reward

average_score = np.mean(cumulated)
max_score = np.max(cumulated)
max_score = np.max(cumulated)