# Tutorial - Deep Q-Learning 

Deep Q-Learning uses a neural network to approximate $Q$ functions. Hence, we usually refer to this algorithm as DQN (for *deep Q network*).

The parameters of the neural network are denoted by $\theta$. 
*   As input, the network takes a state $s$,
*   As output, the network returns $Q(s, a, \theta)$, the value of each action $a$ in state $s$, according to the parameters $\theta$.


The goal of Deep Q-Learning is to learn the parameters $\theta$ so that $Q(s, a, \theta)$ approximates well the optimal $Q$-function $Q^*(s, a)$. 

In addition to the network with parameters $\theta$, the algorithm keeps another network with the same architecture and parameters $\theta^-$, called **target network**.

The algorithm works as follows:

1.   At each time $t$, the agent is in state $s_t$ and has observed the transitions $(s_i, a_i, r_i, s_i')_{i=1}^{t-1}$, which are stored in a **replay buffer**.

2.  Choose action $a_t = \arg\max_a Q(s_t, a)$ with probability $1-\varepsilon_t$, and $a_t$=random action with probability $\varepsilon_t$. 

3. Take action $a_t$, observe reward $r_t$ and next state $s_t'$.

4. Add transition $(s_t, a_t, r_t, s_t')$ to the **replay buffer**.

4.  Sample a minibatch $\mathcal{B}$ containing $B$ transitions from the replay buffer. Using this minibatch, we define the loss:

$$
L(\theta) = \sum_{(s_i, a_i, r_i, s_i') \in \mathcal{B}}
\left[
Q(s_i, a_i, \theta) -  y_i
\right]^2
$$
where the $y_i$ are the **targets** computed with the **target network** $\theta^-$:

$$
y_i = r_i + \gamma \max_{a'} Q(s_i', a', \theta^-).
$$

5. Update the parameters $\theta$ to minimize the loss, e.g., with gradient descent (**keeping $\theta^-$ fixed**): 
$$
\theta \gets \theta - \eta \nabla_\theta L(\theta)
$$
where $\eta$ is the optimization learning rate. 

6. Every $N$ transitions ($t\mod N$ = 0), update target parameters: $\theta^- \gets \theta$.

7. $t \gets t+1$. Stop if $t = T$, otherwise go to step 2.

# Setup

In [1]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy
from gym.wrappers import Monitor
import gym

In [2]:
# Create directory for saving videos
!mkdir videos > /dev/null 2>&1

# The following code is will be used to visualize the environments.
import base64
from IPython import display as ipythondisplay
from IPython.display import clear_output
from pathlib import Path

def show_video(filename=None, directory='./videos'):
    """
    Either show all videos in a directory (if filename is None) or 
    show video corresponding to filename.
    """
    html = []
    if filename is not None:
        files = Path('./').glob(filename)
    else:
        files = Path(directory).glob("*.mp4")
    for mp4 in files:
        print(mp4)
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append('''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>'''.format(mp4, video_b64.decode('ascii')))
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))

The system cannot find the path specified.


In [3]:
# Random number generator
import rlberry.seeding as seeding 
seeder = seeding.Seeder(456)
rng = seeder.rng

# 1. Define the parameters

In [4]:
# Environment
env = gym.make("CartPole-v0")

# Discount factor
GAMMA = 0.99

# Batch size
BATCH_SIZE = 128
# Capacity of the replay buffer
BUFFER_CAPACITY = 10000
# Update target net every ... episodes
UPDATE_TARGET_EVERY = 20

# Initial value of epsilon
EPSILON_START = 1.0
# Parameter to decrease epsilon
DECREASE_EPSILON = 200
# Minimum value of epislon
EPSILON_MIN = 0.05

# Number of training episodes
N_EPISODES = 500

# Learning rate
LEARNING_RATE = 0.1

# 2. Define the replay buffer

In [5]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return rng.choice(self.memory, batch_size).tolist()


    def __len__(self):
        return len(self.memory)

# create instance of replay buffer
replay_buffer = ReplayBuffer(BUFFER_CAPACITY)

# 3. Define the neural network architecture, objective and optimizer

In [6]:
class Net(nn.Module):
    """
    Basic neural net.
    """
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )

    def forward(self, x):
        return self.net(x)

Describe the shape of that network.

In [11]:
# create network and target network
hidden_size = 128
obs_size = env.observation_space.shape[0]
n_actions = env.action_space.n

q_net = Net(obs_size, hidden_size, n_actions)
target_net = Net(obs_size, hidden_size, n_actions)

# objective and optimizer
objective = nn.MSELoss()  # Mean Square Error loss
#objective = nn.SmoothL1Loss()
optimizer = optim.Adam(params=q_net.parameters(), lr=LEARNING_RATE)  # Optimizer for the parameters of q_net using Adam

# 4. Implement Deep Q-Learning

In [12]:
#
#  Some useful functions
#

def get_q(states):
    """
    Compute Q function for a list of states, for q_net
    """
    with torch.no_grad():
        states_v = torch.FloatTensor([states])
        output = q_net.forward(states_v).data.numpy()  # shape (1, len(states), n_actions)
    return output[0, :, :]  # shape (len(states), n_actions)

def eval_dqn(n_sim=5):
    """   
    Monte Carlo evaluation of DQN agent.

    Repeat n_sim times:
        * Run the DQN policy until the environment reaches a terminal state (= one episode)
        * Compute the sum of rewards in this episode
        * Store the sum of rewards in the episode_rewards array.
    """
    env_copy = deepcopy(env)
    episode_rewards = np.zeros(n_sim)

    for ii in range(n_sim):
        state = env_copy.reset()
        done = False 
        while not done:
            action = choose_action(state, 0.0)
            next_state, reward, done, _ = env_copy.step(action)
            episode_rewards[ii] += reward
            state = next_state
    return episode_rewards

In [13]:
def choose_action(state, epsilon):
    """
    ** TO BE IMPLEMENTED **
    
    Return action according to an epsilon-greedy exploration policy
    """
    return 0
    

def update(state, action, reward, next_state, done):
    """
    ** TO BE COMPLETED **
    """
    
    # add data to replay buffer
    replay_buffer.push(state, action, reward, next_state, done)
    
    if len(replay_buffer) < BATCH_SIZE:
        return np.inf
    
    # get batch
    transitions = replay_buffer.sample(BATCH_SIZE)
    
    """
    Compute loss - TO BE IMPLEMENTED!
    
    Do not use get_q here: it contains a "no_grad" instruction which disables gradient propagation... but
    here we will want to compute gradients to optimize the model.
    Potentially useful methods: torch.tensor, torch.gather, requires_grad
    """
    values  = torch.zeros(BATCH_SIZE, requires_grad = True)   # to be computed using batch
    targets = torch.zeros(BATCH_SIZE, requires_grad = True)   # to be computed using batch
    
    loss = objective(values, targets)
     
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Here the gradient of all parameters in the network have been computed. If your implementation is correct,
    # the following line should print something which is not None:
    # print(q_net.net[2].weight.grad)
    optimizer.step()
    
    return loss.data.numpy()

In [14]:

#
# Train
# 

EVAL_EVERY = 5
REWARD_THRESHOLD = 199

def train():
    state = env.reset()
    epsilon = EPSILON_START
    ep = 0
    total_time = 0
    while ep < N_EPISODES:
        action = choose_action(state, epsilon)

        # take action and update replay buffer and networks
        next_state, reward, done, _ = env.step(action)
        loss = update(state, action, reward, next_state, done)

        # update state
        state = next_state

        # end episode if done
        if done:
            state = env.reset()
            ep   += 1
            if ( (ep+1)% EVAL_EVERY == 0):
                rewards = eval_dqn()
                print("episode =", ep+1, ", reward = ", np.mean(rewards))
                if np.mean(rewards) >= REWARD_THRESHOLD:
                    break

            # update target network
            if ep % UPDATE_TARGET_EVERY == 0:
                target_net.load_state_dict(q_net.state_dict())
                # For debugging, it might be useful to print the parameters of the network once in a while
                #print(target_net.state_dict())
            # decrease epsilon
            epsilon = EPSILON_MIN + (EPSILON_START - EPSILON_MIN) * \
                            np.exp(-1. * ep / DECREASE_EPSILON )    

        total_time += 1

# Run the training loop
train()

# Evaluate the final policy
rewards = eval_dqn(20)
print("")
print("mean reward after training = ", np.mean(rewards))

  return rng.choice(self.memory, batch_size).tolist()


episode = 5 , reward =  9.4
episode = 10 , reward =  8.8
episode = 15 , reward =  9.2
episode = 20 , reward =  8.8
episode = 25 , reward =  9.6
episode = 30 , reward =  9.4
episode = 35 , reward =  8.8
episode = 40 , reward =  9.8
episode = 45 , reward =  10.0
episode = 50 , reward =  9.8
episode = 55 , reward =  9.4
episode = 60 , reward =  9.6
episode = 65 , reward =  112.4
episode = 70 , reward =  26.0
episode = 75 , reward =  9.8
episode = 80 , reward =  53.0
episode = 85 , reward =  116.0
episode = 90 , reward =  123.8
episode = 95 , reward =  97.4
episode = 100 , reward =  105.6
episode = 105 , reward =  149.0
episode = 110 , reward =  161.6
episode = 115 , reward =  177.6
episode = 120 , reward =  162.4
episode = 125 , reward =  175.0
episode = 130 , reward =  193.2
episode = 135 , reward =  198.8
episode = 140 , reward =  199.8

mean reward after training =  199.75


# Visualize the DQN policy

In [15]:
def render_env(env):
  env = deepcopy(env)
  env = Monitor(env, './videos', force=True, video_callable=lambda episode: True)
  for episode in range(1):
    done = False
    state = env.reset()
    env.render()
    while not done:
        action = action = choose_action(state, 0.0)
        state, reward, done, info = env.step(action)
        env.render()
    env.close()
    show_video()

render_env(env)



videos\openaigym.video.0.12108.video000000.mp4
