from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import numpy as np
from deck import Deck, Card
from player import Player, Dealer, MonteCarloPlayer, SarsaPlayer, HumanPlayer
from utils import value, is_bust, policy_eval, cards_to_str, display_cards
from matplotlib import cm, gridspec
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
import mayavi
from mayavi import mlab
from tqdm import tqdm
from easy21 import Easy21, plot_q
import pickle
mlab.init_notebook()
The game is played according to the following rules:

- The game is played with an infinite deck of cards, i.e. cards are sampled with replacement.
- Each draw from the deck results in a value between 1 and 10 (uniformly distributed) with a colour of red (probability 1/3) or black (probability 2/3); there are no aces or picture cards.
- At the start of the game both the player and the dealer draw one black card (fully observed).
- Each turn the player may either stick or hit. If the player hits, they draw another card; if they stick, they receive no further cards.
- The values of black cards are added to the player's sum and the values of red cards are subtracted.
- If the player's sum exceeds 21 or becomes less than 1, the player goes bust and loses (reward $-1$).
- If the player sticks, the dealer takes its turns. The dealer always sticks on any sum of 17 or greater and hits otherwise. If the dealer goes bust the player wins (reward $+1$); otherwise the player with the larger sum wins, with a draw giving reward $0$.
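Purely as an illustration of the card distribution (not the actual Deck or Card implementation imported above), a single draw could be sampled like this:

import numpy as np

def draw_card():
    # Value 1-10 uniform; black (adds) with probability 2/3, red (subtracts) with probability 1/3.
    value = np.random.randint(1, 11)
    return value if np.random.rand() < 2.0 / 3.0 else -value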
Interact with the environment by playing the game in the following cell:
run easy21.py -a human
The second task is to apply Monte Carlo Control to the Easy21 game. It is suggested to use a time-varying scalar step size of $\alpha_{t} = 1/N(s_{t}, a_{t})$ and an $\epsilon$-greedy exploration strategy with $\epsilon_{t} = N_{0}/(N_{0} + N(s_{t}))$, where $N_{0} = 100$ is a constant, $N(s)$ is the number of times that state $s$ has been visited, and $N(s, a)$ is the number of times that action $a$ has been selected from state $s$.
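For reference, here is a minimal tabular sketch of those two rules; the names are illustrative rather than the actual attributes of MonteCarloPlayer.

import numpy as np
from collections import defaultdict

N0 = 100.0
Q = defaultdict(float)    # action-value estimates Q(s, a)
N_sa = defaultdict(int)   # visit counts N(s, a)
N_s = defaultdict(int)    # visit counts N(s)

def choose_action(state, actions=("hit", "stick")):
    # Epsilon-greedy with epsilon_t = N0 / (N0 + N(s_t)).
    eps = N0 / (N0 + N_s[state])
    if np.random.rand() < eps:
        return actions[np.random.randint(len(actions))]
    return max(actions, key=lambda a: Q[(state, a)])

def mc_update(episode, total_return):
    # Every-visit Monte Carlo control update with step size alpha_t = 1 / N(s_t, a_t).
    for state, action in episode:
        N_s[state] += 1
        N_sa[(state, action)] += 1
        Q[(state, action)] += (total_return - Q[(state, action)]) / N_sa[(state, action)]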
run easy21.py -a mc -e 1000000 -p mpl
This is not very satisfying as an optimal value function: it is too noisy, and the policy does not look optimal at all given the symmetry of the problem. It seems that epsilon may be decaying to zero too quickly, so a less-than-optimal policy freezes in too soon. I tried changing epsilon to $\epsilon_{t} = \sqrt{N_{0}/(N_{0} + N(s_{t}))}$ below, which seemed to help a little.
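For a sense of how much more slowly the square-root schedule decays, compare the two at a few visit counts (with $N_{0} = 100$):

N0 = 100.0
for n_s in (100, 1000, 10000, 100000):
    print(n_s, N0 / (N0 + n_s), np.sqrt(N0 / (N0 + n_s)))
# After 10000 visits the original schedule gives eps of roughly 0.01, the square-root version roughly 0.10.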
run easy21.py -a mc -e 1000000 -p mpl
That still doesn't look like an optimal policy, though, and there is still noise, so I decided to take the plunge and calculate the optimal value function and policy without using Reinforcement Learning. For any given policy we can define a stochastic matrix of probabilities for turning a starting state into a final state by playing the game. For the Dealer, whose policy is fixed, let's call that matrix $\mathbf{D}$; for a player policy $\pi$, we will write the player's stochastic matrix as $\mathbf{P}_\pi$. If we then define a returns matrix $\mathbf{R}$ whose rows enumerate the player's final values and whose columns enumerate the dealer's final values, the value function is the expected return under the policy, $v_\pi = \mathbf{E}_\pi(R) = \mathbf{P}_\pi^{T}\mathbf{R}\mathbf{D}$. Here $\mathbf{P}_\pi$ and $\mathbf{D}$ have final values along their rows and starting states along their columns, so $v_\pi$ comes out with player starting sums along its rows and dealer starting cards along its columns.
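As a toy sanity check on the orientation of these matrices (made-up numbers, a game with only three final values where index 0 means bust, and a single starting state each for player and dealer):

# Hypothetical 3-value game, purely to check shapes and signs.
P_pi = np.array([[0.2], [0.3], [0.5]])  # player final-value distribution, one column per starting state
D = np.array([[0.1], [0.4], [0.5]])     # dealer final-value distribution
R = np.sign(np.subtract.outer(np.arange(3), np.arange(3)))  # +1 player higher, -1 lower, 0 draw
R[0, 0] = -1                            # simultaneous "bust" still counts as a loss for the player
v_pi = np.matmul(np.matmul(P_pi.T, R), D)
v_pi                                    # 1x1 matrix, about -0.07 for these numbers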
dprobs = policy_eval((1, 17), 100000)  # the dealer's stochastic matrix D, estimated by simulation
Let's run this again to check that we are taking a large enough sample to get a good estimate of the probability.
dprobs2 = policy_eval((1, 17), 100000)
ds = dprobs[(dprobs != 1.0) * (dprobs != 0.0)]  # Only compare entries that are not exactly zero or one.
ds2 = dprobs2[(dprobs2 != 1.0) * (dprobs2 != 0.0)]
print("RMS relative difference:", np.sqrt(((2 * (ds - ds2) / (ds + ds2))**2).mean()))
print("RMS absolute difference:", np.sqrt(((ds - ds2)**2).mean()))
Create the returns matrix.
returns = np.zeros((22, 22))
for i in range(22):
    for j in range(22):
        returns[i, j] = np.sign(i - j)
# The one asymmetry between the player and the dealer: if both "final values" are 0 (bust),
# the player still loses, since the player busts before the dealer ever plays.
returns[0, 0] = -1.0
returns
pstick = policy_eval((22, 11), 100000)  # stochastic matrix for the policy of always sticking
pstick
Policies that are likely to be among the best:
policies = [[i, j] for j in range(18, 11, -1) for i in range(6, 11)]
[tup for tup in enumerate(policies)]
We have 35 policies, plus the policy of always sticking.
probs = []
for pol in policies:
    print(pol)
    probs.append(policy_eval(pol, 100000))
probs.append(pstick)
op = np.matmul(returns, dprobs)  # R D: expected return for each player final value, per dealer starting card
Let's take a look at the value function for a policy of always sticking.
expecteds = np.matmul(pstick.T, op)  # P_stick^T R D: value of always sticking, by player start and dealer start
mlab.clf()
X = np.arange(1, 11, 1)
Y = np.arange(1, 22, 1)
X, Y = np.meshgrid(X, Y)
mlab.mesh(X, Y, 5*expecteds[1:22, 1:11])
expected_returns = [np.matmul(prob.T, op) for prob in probs]
best_pols = []
for i in range(1, 11):
    best_pols.append(np.argmax([np.mean(rtn[1:11, i]) for rtn in expected_returns]))
best_pols
best_returns = np.array([expected_returns[col][1:, idx+1] for (idx, col) in enumerate(best_pols)]).T
Now we can plot the value function for the optimal policy along with the value function for the always sticking policy and see how much better we do.
mlab.clf()
X = np.arange(1, 11, 1)
Y = np.arange(1, 22, 1)
X, Y = np.meshgrid(X, Y)
mlab.mesh(X, Y, 5*expecteds[1:22, 1:11])
mlab.mesh(X, Y, 5*best_returns)
Now we can also find the hit surface: the value of hitting from a given state and then following our optimal policy afterwards. (A more explicit version of this calculation is sketched after the next cell.)
p_card_value = np.array(10*[1/30.0] + [0] + 10*[2/30.0])  # probability of each change to the player's sum: -10..-1 (red, 1/30 each), 0 (impossible), +1..+10 (black, 2/30 each)
hit_values = np.zeros((21, 10))
# Note: optimal_values here is best_returns, which is saved and re-loaded as optimal_values.txt a few cells below.
for j in range(10):
    for i in range(21):
        hit_values[i, j] = (np.sum(optimal_values[(i-10)*(i-10>0):11+i, j] * p_card_value[(10-i)*(10-i>0):31-i])
                            + np.sum(-1 * p_card_value[((41-i)%31-10)*((41-i)%31-10>0):(42-i)%32]))
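The slicing above is fairly dense, so here is a more explicit sketch of the same one-step lookahead, assuming optimal_values[i, j] holds the value of playing on optimally from player sum i+1 against dealer card j+1; it should reproduce hit_values if the index arithmetic above is correct:

hit_values_check = np.zeros((21, 10))
for j in range(10):                      # dealer showing j + 1
    for i in range(21):                  # player sum i + 1
        q = 0.0
        for change in range(-10, 11):    # change to the player's sum from the drawn card
            if change == 0:
                continue
            p = 1/30.0 if change < 0 else 2/30.0   # red with prob 1/3, black with prob 2/3, values uniform on 1..10
            new_sum = (i + 1) + change
            if 1 <= new_sum <= 21:
                q += p * optimal_values[new_sum - 1, j]   # continue under the optimal policy
            else:
                q += p * -1.0                             # bust: lose immediately
        hit_values_check[i, j] = q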
qvalues = np.concatenate((np.array([hit_values]), np.array([stick_values])), axis=0).T
optimal_vals = qvalues.max(axis=2).T
mlab.clf()
X = np.arange(1, 11, 1)
Y = np.arange(1, 22, 1)
X, Y = np.meshgrid(X, Y)
mlab.mesh(X, Y, 5*hit_values)
mlab.mesh(X, Y, 5*stick_values)
mlab.mesh(X, Y, 5*optimal_values)
The mean return for the optimal policy across all possible starting states is
np.mean(best_returns[:10, :])
So with the optimal policy it is possible to do about 6.2% better than the dealer. We will save that so we have a benchmark for how well our Reinforcement Learning agents are doing.
np.savetxt("hit_values.txt", hit_values)
np.savetxt("optimal_values.txt", best_returns)
np.savetxt("stick_values.txt", expecteds[1:22, 1:11])
hit_values = np.loadtxt("hit_values.txt")
optimal_values = np.loadtxt("optimal_values.txt")
stick_values = np.loadtxt("stick_values.txt")
qvalues = np.concatenate((np.array([stick_values]), np.array([hit_values])), axis=0).T  # axis 2 after the transpose: index 0 = stick, index 1 = hit
best_returns = optimal_values
expecteds = np.zeros((22, 11))
expecteds[1:22, 1:11] = stick_values
qvals = np.zeros((11, 22, 2))
qvals[1:11, 1:22, :] = qvalues
from mpl_toolkits.mplot3d import Axes3D
X = np.arange(1, 22, 1)
Y = np.arange(1, 11, 1)
X, Y = np.meshgrid(X, Y)
fig = plt.figure(figsize=(15, 10))
ax = fig.add_subplot(111, projection='3d', azim=-40, elev=45)
#ax1 = fig.add_subplot(gs[0], projection='3d', azim=-40, elev=45)
ax.plot_surface(Y, X, qvals[1:, 1:, 1], linewidth=0.5, shade=True,
                cmap=cm.coolwarm, edgecolor=".7")
# ax.plot_surface(Y, X, qvals[1:, 1:, 0], linewidth=0.5, shade=True,
# cmap=cm.coolwarm, edgecolor=".7")
ax.set_zlim3d(bottom=-1.0, top=1.0)
ax.set_xticks([1, 4, 7, 10])
ax.set_yticks([1, 5, 9, 13, 17, 21])
ax.set_zticks([-1.0, -0.5, 0.0, 0.5, 1.0])
ax.set_xlabel('Dealer Card')
ax.set_ylabel('Player Cards Value')
ax.set_zlabel('Expected Return')
ax.set_title('Hit Value Surface')
fig = plot_q(qvals)
run easy21.py -a mc -e 1000000 -d
import pickle
with open("q_mc_lam1.0_eps1000000.pkl", 'r') as f:
mc_values = pickle.load(f)
mlab.clf()
X = np.arange(1, 11, 1)
Y = np.arange(1, 22, 1)
X, Y = np.meshgrid(X, Y)
mlab.mesh(X, Y, 5*mc_values.max(axis=2)[1:11, 1:22].T, opacity=.8)
mlab.mesh(X, Y, 5*optimal_values, representation='wireframe', colormap="bone")
from IPython.display import Image
Image("mc_optimal_compare.png")
We see that the Monte Carlo estimate of the optimal values is consistently low in the regions where the optimal policy is to hit. Let's continue and see whether other methods improve on this and whether we can understand how MC falls short.
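To put a rough number on that gap, we can difference the two surfaces directly (the indexing mirrors the comparison plot above):

gap = mc_values.max(axis=2)[1:11, 1:22].T - optimal_values
print("Mean gap:", gap.mean())
print("Largest underestimate:", gap.min())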
run easy21.py -a mc -e 100000 -c 1000 -p mpl
run easy21.py -a sarsa -e 100000 -c 1000 -p mpl
run easy21.py -a sarsa -e 1000 -m 10
run easy21.py -a sarsa -e 10000 -l 0 -m 100
run easy21.py -a sarsa -e 10000 -l 0 -c 100
run easy21.py -a sarsa -e 10000 -l 1.0 -c 100