9 changes: 0 additions & 9 deletions RandomBird/FlappyAgent.py

This file was deleted.

60 changes: 60 additions & 0 deletions Rocamora-Ardevol/FlappyAgent.py
@@ -0,0 +1,60 @@
import numpy as np

Qlearning = dict()
Qsarsa = dict()

def FlappyPolicy(state, screen):
    """
    Returns an action for each timestep depending on the game state:
    0 for doing nothing, 119 for jumping.
    """
    action = actTDLambda(state)

    return 119 if action else 0



def actQlearning(state):

    global Qlearning

    # Lazy-load the Q table learned with Q-learning (a pickled dict wrapped in a 0-d array)
    if not bool(Qlearning):
        Qlearning = np.load('Qlearning.npy', allow_pickle=True)

    s1, s2, s3 = toDiscreteRef(state)
    key = str(s1) + '|' + str(s2) + '|' + str(s3)

    # Unvisited state: do nothing
    if Qlearning[()].get(key) is None:
        return 0

    # Jump if the estimated value of jumping exceeds that of doing nothing
    return Qlearning[()][key][0] < Qlearning[()][key][1]


def actTDLambda(state):

    global Qsarsa

    # Lazy-load the Q table learned with SARSA(lambda) (a pickled dict wrapped in a 0-d array)
    if not bool(Qsarsa):
        Qsarsa = np.load('Qsarsa.npy', allow_pickle=True)

    s1, s2, s3 = toDiscreteRef(state)
    key = str(s1) + '|' + str(s2) + '|' + str(s3)

    # Unvisited state: do nothing
    if Qsarsa[()].get(key) is None:
        return 0

    # Jump if the estimated value of jumping exceeds that of doing nothing
    return Qsarsa[()][key][0] < Qsarsa[()][key][1]


def toDiscreteRef(state):
    """
    Converts the game state variables into the custom discrete state used as a key
    into the Q tables.
    """

    s1 = state['next_pipe_bottom_y'] - state['player_y']
    s2 = state['next_pipe_dist_to_player']
    s3 = state['player_vel']

    # Bin: vertical gap offset in steps of 10, pipe distance in steps of 20, velocity in steps of 2
    return int(s1 - s1 % 10), int(s2 - s2 % 20), int(s3 - s3 % 2)
Binary file added Rocamora-Ardevol/Qlearning.npy
Binary file not shown.
Binary file added Rocamora-Ardevol/Qsarsa.npy
Binary file not shown.
19 changes: 19 additions & 0 deletions Rocamora-Ardevol/README.md
@@ -0,0 +1,19 @@
# Implementations
## Q-learning approach with temporal differences TD(0)
This algorithm approximates the Q matrix associated with the optimal policy by choosing actions greedily and updating each estimate using the best Q value of the next state, independently of the policy actually followed (off-policy).
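
For a transition $(s, a, r, s')$ this corresponds to the standard TD(0) Q-learning update, stated here generically for reference (with learning rate $\alpha$ and discount factor $\gamma$; this is not code from the repository):

$$Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]$$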

## SARSA approach with temporal differences TD($\lambda$)
The SARSA algorithm infers the Q matrix of the problem's optimal policy by choosing actions greedily and updating Q with the value of the action actually taken under the evaluated policy (on-policy).

Being on-policy makes it possible to use the TD($\lambda$) value estimator, which propagates information much faster along visited trajectories and thus converges faster.
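
With accumulating eligibility traces, every previously visited state-action pair is updated in proportion to its trace (a generic statement of SARSA($\lambda$), not code from the repository):

$$e_t(s,a) = \gamma\lambda\,e_{t-1}(s,a) + \mathbf{1}\{s = s_t, a = a_t\}, \qquad \delta_t = r_t + \gamma Q(s_{t+1},a_{t+1}) - Q(s_t,a_t), \qquad Q(s,a) \leftarrow Q(s,a) + \alpha\,\delta_t\,e_t(s,a)$$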

## Q-learning using a neural network as an approximating function
It infers the Q matrix values through a neural network. This implementation uses a replay memory.

**This case does not work at the moment** due to a lack of appropriate hyperparameter tuning. Once it works properly, it will extend naturally to using all of PLE's state variables.
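
The network weights $\theta$ are fitted by regressing the predicted Q values onto bootstrapped targets built from transitions sampled out of the replay memory, i.e. roughly minimizing the DQN-style objective (stated here generically for reference):

$$L(\theta) = \mathbb{E}_{(s,a,r,s') \sim \text{replay}} \left[ \left( r + \gamma\,(1-\text{done})\,\max_{a'} Q_\theta(s',a') - Q_\theta(s,a) \right)^2 \right]$$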

# Acknowledgements
The theoretical foundations for this work are based on Emmanuel Rachelson's course on machine learning.

Some implementation details and hyperparameters are based on the work of:
https://github.com/chncyhn/flappybird-qlearning-bot
287 changes: 287 additions & 0 deletions Rocamora-Ardevol/Train/Neural network training.ipynb
@@ -0,0 +1,287 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from ple.games.flappybird import FlappyBird\n",
"from ple import PLE\n",
"\n",
"import numpy as np\n",
"#from FlappyAgent import FlappyPolicy\n",
"\n",
"import matplotlib.pyplot as plt\n",
"#%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from keras.models import Sequential\n",
"from keras.layers.core import Dense, Dropout, Activation\n",
"from keras.optimizers import RMSprop, sgd, Adam\n",
"from keras.layers.recurrent import LSTM\n",
"import numpy as np\n",
"import random\n",
"import h5py\n",
"from IPython.display import clear_output\n",
"from collections import deque"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"game = FlappyBird(graphics=\"fixed\") # use \"fancy\" for full background, random bird color and random pipe color, use \"fixed\" (default) for black background and constant bird and pipe colors.\n",
"p = PLE(game, fps=30, frame_skip=1, num_steps=1, force_fps=True, display_screen=True)\n",
"# Note: if you want to see you agent act in real time, set force_fps to False. But don't use this setting for learning, just for display purposes."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Declare functions\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convstate(state):\n",
" \"\"\"\n",
" Calculate new state variables from game state\n",
" \"\"\"\n",
" s = np.zeros((3))\n",
" s[0] = state['next_pipe_bottom_y'] - state['player_y']\n",
" s[1] = state['next_pipe_dist_to_player']\n",
" s[2] = state['player_vel']\n",
" \n",
" s[0] = (s[0] - (210 - 40)/2) / ((210 + 40)/2)\n",
" s[1] = (s[1] - (420 - 420)/2) / ((420 + 420)/2) \n",
" s[2] = (s[2] - (10 - 10)/2) / ((10 + 10)/2)\n",
" \n",
" return s.reshape((1,3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def epsilon_greedy(s):\n",
" \n",
" if(np.random.rand()<=epsilon): # random action\n",
" return np.random.choice([0,1], p=[0.9,.1])\n",
" \n",
" else: \n",
" qval = model.predict(s)\n",
" return np.argmax(qval)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ReplayMemory:\n",
" \"\"\"\n",
" self.memory contains the old state, the action, the reward, the new state and wether it is a final status, \n",
" concatenated in an array.\n",
" \"\"\"\n",
" def __init__ (self, size):\n",
" self.size = size\n",
" self.index = 0\n",
" self.currentsize = 0\n",
" self.memory = np.zeros((size,9))\n",
" \n",
" def insert (self, state):\n",
" if self.currentsize < self.size:\n",
" self.currentsize += 1\n",
" self.memory[self.index,:] = state[:]\n",
" self.index += 1\n",
" self.index = self.index % self.size\n",
" \n",
" def sample (self, batchSize):\n",
" batchSize = min(self.currentsize, batchSize)\n",
" ind = np.random.choice(self.currentsize, size=batchSize, replace=False)\n",
" return self.memory[ind,:]\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Declare model\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Sequential()\n",
"\n",
"model.add(Dense(100, kernel_initializer='lecun_uniform', input_shape=(3,)))\n",
"model.add(Activation('relu'))\n",
"#model.add(Dropout(0.5)) \n",
"model.add(Dense(100, kernel_initializer='lecun_uniform'))\n",
"model.add(Activation('relu'))\n",
"#model.add(Dropout(0.5))\n",
"model.add(Dense(2, kernel_initializer='lecun_uniform'))\n",
"model.add(Activation('linear'))\n",
"#model.compile(loss='mse', optimizer=\"rmsprop\")\n",
"adam = Adam(lr=1e-2)\n",
"model.compile(loss='mse', optimizer=adam)\n",
"\n",
"model.summary()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Hyperparameters\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nb_games = 1000\n",
"gamma = .99 # discount factor\n",
"epsilon = .1 # epsilon-greddy\n",
"batchSize = 32\n",
"replay = ReplayMemory(10000)\n",
"replay_pos = ReplayMemory(10000)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Train network\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Some control variables\n",
"cumulated = np.zeros((nb_games))\n",
"\n",
"# Start the game\n",
"p.init()\n",
"r = 0\n",
"step = 0\n",
"\n",
"for i in range(nb_games):\n",
" p.reset_game()\n",
" \n",
" # Control print\n",
" if i%100 == 0:\n",
" print(i, epsilon, np.mean(cumulated[i-50:i]))\n",
" \n",
" # Decrease exploration ratio\n",
" epsilon *= 0.98\n",
" \n",
" # 0) Retrieve initial state\n",
" \n",
" s = convstate(game.getGameState())\n",
" \n",
" while(not p.game_over()):\n",
" \n",
" # 1) Choose action greedily\n",
" a = epsilon_greedy(s)\n",
" action = 119 if a else None\n",
" \n",
" # Execute \n",
" r = p.act(action)\n",
" cumulated[i] += r\n",
" \n",
" clipped_r = max( min( r, 1 ), -1 ) # Clip the reward values\n",
" \n",
" ss = convstate(game.getGameState())\n",
"\n",
" replay.insert(np.concatenate((s,[[a]],[[r]],ss,[[p.game_over()]]),axis=1))\n",
" \n",
" # 2) Update Q \n",
" \n",
" if step > 1000: # and step % 100 == 99:\n",
" \n",
" train_x = np.zeros((batchSize,3))\n",
" train_y = np.zeros((batchSize,2))\n",
" for idx,entry in enumerate(replay.sample(batchSize)):\n",
" currentS = entry[0:3].copy().reshape(1,3)\n",
" nextS = entry[5:8].copy().reshape(1,3)\n",
" act = entry[3]\n",
" rew = entry[4]\n",
" ending = entry[8]\n",
"\n",
" currentQ = model.predict(currentS)\n",
" nextQmax = np.max(model.predict(nextS))\n",
" currentQ[0][a] = rew + gamma * nextQmax * (1-ending)\n",
"\n",
" train_x[idx,:] = currentS[0,:]\n",
" train_y[idx,:] = currentQ[0,:]\n",
"\n",
" model.fit(train_x, train_y, batch_size=1, nb_epoch=1, verbose=0)\n",
" \n",
" \n",
" # 3) Redeclare state\n",
" s = ss\n",
" \n",
" step += 1\n",
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}