9 changes: 0 additions & 9 deletions RandomBird/FlappyAgent.py

This file was deleted.

60 changes: 60 additions & 0 deletions Rocamora-Ardevol/FlappyAgent.py
@@ -0,0 +1,60 @@
import numpy as np

Qlearning = dict()
Qsarsa = dict()

def FlappyPolicy(state, screen):
    """
    Returns an action for each timestep depending on the game state:
    0 for doing nothing, 119 for jumping.
    """
    action = actTDLambda(state)

    return 119 if action else 0



def actQlearning(state):

    global Qlearning

    # Lazy-load the Q table learned with Q-learning (a pickled dict wrapped in a 0-d array)
    if not bool(Qlearning):
        Qlearning = np.load('Qlearning.npy', allow_pickle=True)

    s1, s2, s3 = toDiscreteRef(state)
    key = str(s1) + '|' + str(s2) + '|' + str(s3)

    # Unvisited state: do nothing
    if Qlearning[()].get(key) is None:
        return 0

    # Jump if the estimated value of jumping exceeds that of doing nothing
    return Qlearning[()][key][0] < Qlearning[()][key][1]


def actTDLambda(state):

    global Qsarsa

    # Lazy-load the Q table learned with SARSA(lambda) (a pickled dict wrapped in a 0-d array)
    if not bool(Qsarsa):
        Qsarsa = np.load('Qsarsa.npy', allow_pickle=True)

    s1, s2, s3 = toDiscreteRef(state)
    key = str(s1) + '|' + str(s2) + '|' + str(s3)

    # Unvisited state: do nothing
    if Qsarsa[()].get(key) is None:
        return 0

    # Jump if the estimated value of jumping exceeds that of doing nothing
    return Qsarsa[()][key][0] < Qsarsa[()][key][1]


def toDiscreteRef(state):
    """
    Converts the game state variables into the custom discrete state used as a key
    into the Q tables.
    """

    s1 = state['next_pipe_bottom_y'] - state['player_y']
    s2 = state['next_pipe_dist_to_player']
    s3 = state['player_vel']

    # Bin: vertical gap offset in steps of 10, pipe distance in steps of 20, velocity in steps of 2
    return int(s1 - s1 % 10), int(s2 - s2 % 20), int(s3 - s3 % 2)
Binary file added Rocamora-Ardevol/Qlearning.npy
Binary file not shown.
Binary file added Rocamora-Ardevol/Qsarsa.npy
Binary file not shown.
19 changes: 19 additions & 0 deletions Rocamora-Ardevol/README.md
@@ -0,0 +1,19 @@
# Implementations
## Q-learning approach with temporal differences TD(0)
This algorithm approximates the Q matrix associated with the optimal policy by choosing actions greedily and updating each estimate using the best Q value of the next state, independently of the policy actually followed (off-policy).
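
For a transition $(s, a, r, s')$ this corresponds to the standard TD(0) Q-learning update, stated here generically for reference (with learning rate $\alpha$ and discount factor $\gamma$; this is not code from the repository):

$$Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]$$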

## SARSA approach with temporal differences TD($\lambda$)
The SARSA algorithm infers the Q matrix of the problem's optimal policy by choosing actions greedily and updating Q with the value of the action actually taken under the evaluated policy (on-policy).

Being on-policy makes it possible to use the TD($\lambda$) value estimator, which propagates information much faster along visited trajectories and thus converges faster.
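
With accumulating eligibility traces, every previously visited state-action pair is updated in proportion to its trace (a generic statement of SARSA($\lambda$), not code from the repository):

$$e_t(s,a) = \gamma\lambda\,e_{t-1}(s,a) + \mathbf{1}\{s = s_t, a = a_t\}, \qquad \delta_t = r_t + \gamma Q(s_{t+1},a_{t+1}) - Q(s_t,a_t), \qquad Q(s,a) \leftarrow Q(s,a) + \alpha\,\delta_t\,e_t(s,a)$$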

## Q-learning using a neural network as an approximating function
It infers the Q matrix values through a neural network. This implementation uses a replay memory.

**This case does not work at the moment** due to a lack of appropriate hyperparameter tuning. Once it works properly, it will extend naturally to using all of PLE's state variables.
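
The network weights $\theta$ are fitted by regressing the predicted Q values onto bootstrapped targets built from transitions sampled out of the replay memory, i.e. roughly minimizing the DQN-style objective (stated here generically for reference):

$$L(\theta) = \mathbb{E}_{(s,a,r,s') \sim \text{replay}} \left[ \left( r + \gamma\,(1-\text{done})\,\max_{a'} Q_\theta(s',a') - Q_\theta(s,a) \right)^2 \right]$$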

# Acknowledgements
The theoretical foundations for this work are based on Emmanuel Rachelson's course on machine learning.

Some implementation details and hyperparameters are based on the work of:
https://github.com/chncyhn/flappybird-qlearning-bot
287 changes: 287 additions & 0 deletions Rocamora-Ardevol/Train/Neural network training.ipynb
@@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from ple.games.flappybird import FlappyBird\n",
"from ple import PLE\n",
"\n",
"import numpy as np\n",
"#from FlappyAgent import FlappyPolicy\n",
"\n",
"import matplotlib.pyplot as plt\n",
"#%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from keras.models import Sequential\n",
"from keras.layers.core import Dense, Dropout, Activation\n",
"from keras.optimizers import RMSprop, sgd, Adam\n",
"from keras.layers.recurrent import LSTM\n",
"import numpy as np\n",
"import random\n",
"import h5py\n",
"from IPython.display import clear_output\n",
"from collections import deque"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"game = FlappyBird(graphics=\"fixed\") # use \"fancy\" for full background, random bird color and random pipe color, use \"fixed\" (default) for black background and constant bird and pipe colors.\n",
"p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n",
"# Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Declare functions\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convstate(state):\n",
" \"\"\"\n",
" Calculate new state variables from game state\n",
" \"\"\"\n",
" s = np.zeros((3))\n",
" s[0] = state['next_pipe_bottom_y'] - state['player_y']\n",
" s[1] = state['next_pipe_dist_to_player']\n",
" s[2] = state['player_vel']\n",
" \n",
" s[0] = (s[0] - (210 - 40)/2) / ((210 + 40)/2)\n",
" s[1] = (s[1] - (420 - 420)/2) / ((420 + 420)/2) \n",
" s[2] = (s[2] - (10 - 10)/2) / ((10 + 10)/2)\n",
" \n",
" return s.reshape((1,3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def epsilon_greedy(s):\n",
" \n",
" if(np.random.rand()<=epsilon): # random action\n",
" return np.random.choice([0,1], p=[0.9,.1])\n",
" \n",
" else: \n",
" qval = model.predict(s)\n",
" return np.argmax(qval)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ReplayMemory:\n",
" \"\"\"\n",
" self.memory contains the old state, the action, the reward, the new state and wether it is a final status, \n",
" concatenated in an array.\n",
" \"\"\"\n",
" def __init__ (self, size):\n",
" self.size = size\n",
" self.index = 0\n",
" self.currentsize = 0\n",
" self.memory = np.zeros((size,9))\n",
" \n",
" def insert (self, state):\n",
" if self.currentsize < self.size:\n",
" self.currentsize += 1\n",
" self.memory[self.index,:] = state[:]\n",
" self.index += 1\n",
" self.index = self.index % self.size\n",
" \n",
" def sample (self, batchSize):\n",
" batchSize = min(self.currentsize, batchSize)\n",
" ind = np.random.choice(self.currentsize, size=batchSize, replace=False)\n",
" return self.memory[ind,:]\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Declare model\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Sequential()\n",
"\n",
"model.add(Dense(100, kernel_initializer='lecun_uniform', input_shape=(3,)))\n",
"model.add(Activation('relu'))\n",
"#model.add(Dropout(0.5)) \n",
"model.add(Dense(100, kernel_initializer='lecun_uniform'))\n",
"model.add(Activation('relu'))\n",
"#model.add(Dropout(0.5))\n",
"model.add(Dense(2, kernel_initializer='lecun_uniform'))\n",
"model.add(Activation('linear'))\n",
"#model.compile(loss='mse', optimizer=\"rmsprop\")\n",
"adam = Adam(lr=1e-2)\n",
"model.compile(loss='mse', optimizer=adam)\n",
"\n",
"model.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Hyperparameters\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nb_games = 1000\n",
"gamma = .99 # discount factor\n",
"epsilon = .1 # epsilon-greddy\n",
"batchSize = 32\n",
"replay = ReplayMemory(10000)\n",
"replay_pos = ReplayMemory(10000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Train network\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Some control variables\n",
"cumulated = np.zeros((nb_games))\n",
"\n",
"# Start the game\n",
"p.init()\n",
"r = 0\n",
"step = 0\n",
"\n",
"for i in range(nb_games):\n",
" p.reset_game()\n",
" \n",
" # Control print\n",
" if i%100 == 0:\n",
" print(i, epsilon, np.mean(cumulated[i-50:i]))\n",
" \n",
" # Decrease exploration ratio\n",
" epsilon *= 0.98\n",
" \n",
" # 0) Retrieve initial state\n",
" \n",
" s = convstate(game.getGameState())\n",
" \n",
" while(not p.game_over()):\n",
" \n",
" # 1) Choose action greedily\n",
" a = epsilon_greedy(s)\n",
" action = 119 if a else None\n",
" \n",
" # Execute \n",
" r = p.act(action)\n",
" cumulated[i] += r\n",
" \n",
" clipped_r = max( min( r, 1 ), -1 ) # Clip the reward values\n",
" \n",
" ss = convstate(game.getGameState())\n",
"\n",
" replay.insert(np.concatenate((s,[[a]],[[r]],ss,[[p.game_over()]]),axis=1))\n",
" \n",
" # 2) Update Q \n",
" \n",
" if step > 1000: # and step % 100 == 99:\n",
" \n",
" train_x = np.zeros((batchSize,3))\n",
" train_y = np.zeros((batchSize,2))\n",
" for idx,entry in enumerate(replay.sample(batchSize)):\n",
" currentS = entry[0:3].copy().reshape(1,3)\n",
" nextS = entry[5:8].copy().reshape(1,3)\n",
" act = entry[3]\n",
" rew = entry[4]\n",
" ending = entry[8]\n",
"\n",
" currentQ = model.predict(currentS)\n",
" nextQmax = np.max(model.predict(nextS))\n",
" currentQ[0][a] = rew + gamma * nextQmax * (1-ending)\n",
"\n",
" train_x[idx,:] = currentS[0,:]\n",
" train_y[idx,:] = currentQ[0,:]\n",
"\n",
" model.fit(train_x, train_y, batch_size=1, nb_epoch=1, verbose=0)\n",
" \n",
" \n",
" # 3) Redeclare state\n",
" s = ss\n",
" \n",
" step += 1\n",
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}