karpathy.py
import numpy as np

# preprocessing used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def prepro(I):
""" prepro 210x160x3 uint8 frame into 6400 (80x80) 1D float vector """
I = I[35:195] # crop
I = I[::2,::2,0] # downsample by factor of 2
I[I == 144] = 0 # erase background (background type 1)
I[I == 109] = 0 # erase background (background type 2)
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
    return I.astype(np.float64).ravel()  # np.float was removed in NumPy 1.24+; float64 matches the old alias
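
# Illustrative usage (not part of the original gist; assumes a Gym-style
# Atari Pong observation of shape (210, 160, 3)):
#   frame = np.zeros((210, 160, 3), dtype=np.uint8)
#   x = prepro(frame)  # x.shape == (6400,), entries in {0.0, 1.0}
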
# reward discount used by Karpathy (cf. https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5)
def discount_rewards(r, gamma):
""" take 1D float array of rewards and compute discounted reward """
    r = np.asarray(r, dtype=np.float64)  # cast to float so discounting and normalization don't truncate integer rewards
    discounted_r = np.zeros_like(r)
running_add = 0
    # iterate from the last reward back to the first so each return is a single multiply-add instead of repeated exponentiation
for t in reversed(range(0, r.size)):
        if r[t] != 0: running_add = 0 # a nonzero reward marks a game boundary in Pong, so reset the running sum
running_add = running_add * gamma + r[t] # the point here is to use Horner's method to compute those rewards efficiently
discounted_r[t] = running_add
    discounted_r -= np.mean(discounted_r)  # standardize the returns: zero mean...
    discounted_r /= np.std(discounted_r)   # ...and unit variance
return discounted_r
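
if __name__ == "__main__":
    # Minimal usage sketch (not part of the original gist): run both helpers
    # on synthetic data to show the expected shapes and discounting behavior.
    frame = np.random.randint(0, 256, size=(210, 160, 3), dtype=np.uint8)
    x = prepro(frame)
    print(x.shape)  # (6400,): an 80x80 binary mask, flattened

    # Pong-style reward trace: 0 until a point ends, then +1 or -1.
    print(discount_rewards([0.0, 0.0, 1.0, 0.0, -1.0], gamma=0.99))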