import numpy as np
import gymnasium as gym
import random
import imageio
import os
import timeit
import pickle  # import pickle5 as pickle

from tqdm.notebook import tqdm

# We create our environment with gym.make("FrozenLake-v1").
# `is_slippery=False`: the agent always moves in the intended direction
# (the frozen lake is non-slippery, so the environment is deterministic).
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False, render_mode="rgb_array")

state_space = env.observation_space.n
action_space = env.action_space.n

# print("_____OBSERVATION SPACE_____ \n")
# print("Observation Space", env.observation_space)
# print("Sample observation", env.observation_space.sample())  # Get a random observation
# print("There are ", state_space, " possible states")

# print("\n _____ACTION SPACE_____ \n")
# print("Action Space Shape", env.action_space.n)
# print("Action Space Sample", env.action_space.sample())  # Take a random action
# print("There are ", action_space, " possible actions")


# Let's create our Q-table of size (state_space, action_space) and initialize each value to 0 using np.zeros
def initialize_q_table(state_space, action_space):
    Qtable = np.zeros((state_space, action_space))
    return Qtable


Qtable_frozenlake = initialize_q_table(state_space, action_space)
# print("foo look at\n", Qtable_frozenlake)


def greedy_policy(Qtable, state):
    # Exploitation: take the action with the highest state-action value
    action = np.argmax(Qtable[state][:])
    return action


def epsilon_greedy_policy(Qtable, state, epsilon):
    # Randomly generate a number between 0 and 1
    random_num = random.uniform(0, 1)
    # If random_num > epsilon --> exploitation
    if random_num > epsilon:
        # Take the action with the highest value given a state
        # np.argmax can be useful here
        action = greedy_policy(Qtable, state)
    # else --> exploration
    else:
        action = env.action_space.sample()
    return action


# Training parameters
n_training_episodes = 10000  # Total training episodes
learning_rate = 0.7          # Learning rate

# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes

# Environment parameters
env_id = "FrozenLake-v1"     # Name of the environment
max_steps = 99               # Max steps per episode
gamma = 0.95                 # Discounting rate
eval_seed = []               # The evaluation seeds of the environment

# Exploration parameters
max_epsilon = 1.0            # Exploration probability at start
min_epsilon = 0.05           # Minimum exploration probability
decay_rate = 0.0005          # Exponential decay rate for exploration prob


def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    for episode in tqdm(range(n_training_episodes)):
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
        # Reset the environment
        state, info = env.reset()
        step = 0
        terminated = False
        truncated = False

        # repeat
        for step in range(max_steps):
            # Choose the action At using the epsilon-greedy policy
            action = epsilon_greedy_policy(Qtable, state, epsilon)

            # Take action At and observe Rt+1 and St+1
            # Take the action (a) and observe the outcome state (s') and reward (r)
            new_state, reward, terminated, truncated, info = env.step(action)

            # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
            Qtable[state][action] = Qtable[state][action] + learning_rate * (
                reward + gamma * np.max(Qtable[new_state]) - Qtable[state][action]
            )

            # If terminated or truncated, finish the episode
            if terminated or truncated:
                break

            # Our next state is the new state
            state = new_state
    return Qtable
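
# Illustrative sketch (not part of the original training loop): the epsilon schedule used in
# train() decays exponentially from max_epsilon toward min_epsilon. Printing a few values is
# a quick sanity check of the chosen decay_rate; the episode checkpoints below are arbitrary.
for ep in (0, 1000, 5000, 9999):
    eps = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * ep)
    print(f"episode {ep}: epsilon ~ {eps:.3f}")
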
start_time = timeit.default_timer()
Qtable_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_frozenlake)
end_time = timeit.default_timer()
execution_time = end_time - start_time
print(f"\nTrained in {execution_time:.2f} seconds\n")


def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
    """
    Evaluate the agent for ``n_eval_episodes`` episodes and return the average reward and std of reward.

    :param env: The evaluation environment
    :param max_steps: Maximum number of steps per episode
    :param n_eval_episodes: Number of episodes to evaluate the agent
    :param Q: The Q-table
    :param seed: The evaluation seed array (for Taxi-v3)
    """
    episode_rewards = []
    for episode in tqdm(range(n_eval_episodes)):
        if seed:
            state, info = env.reset(seed=seed[episode])
        else:
            state, info = env.reset()
        step = 0
        truncated = False
        terminated = False
        total_rewards_ep = 0

        for step in range(max_steps):
            # Take the action (index) that has the maximum expected future reward given that state
            action = greedy_policy(Q, state)
            new_state, reward, terminated, truncated, info = env.step(action)
            total_rewards_ep += reward

            if terminated or truncated:
                break
            state = new_state
        episode_rewards.append(total_rewards_ep)
    mean_reward = np.mean(episode_rewards)
    std_reward = np.std(episode_rewards)

    return mean_reward, std_reward


# Evaluate our Agent
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_frozenlake, eval_seed)
print(f"\nMean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")
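

# Optional follow-up sketch using the `pickle` and `imageio` imports above: persist the learned
# Q-table and record one greedy-policy rollout as a GIF (the environment was created with
# render_mode="rgb_array", so env.render() returns an image array). Assumptions: the output
# filenames "qtable_frozenlake.pkl" and "replay.gif" are arbitrary, and the `fps` keyword
# behaves as in the imageio v2 API.

# Save the learned Q-table to disk
with open("qtable_frozenlake.pkl", "wb") as f:
    pickle.dump(Qtable_frozenlake, f)

# Record one greedy-policy episode as a GIF
frames = []
state, info = env.reset()
frames.append(env.render())
for step in range(max_steps):
    action = greedy_policy(Qtable_frozenlake, state)
    state, reward, terminated, truncated, info = env.step(action)
    frames.append(env.render())
    if terminated or truncated:
        break
imageio.mimsave("replay.gif", [np.array(frame) for frame in frames], fps=1)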