From 0781d4fd05b8fc017eb7cee6b54c81d94b81735b Mon Sep 17 00:00:00 2001
From: ved1
Date: Mon, 30 Jan 2023 17:57:49 +0100
Subject: [PATCH] Changing main file

---
 icm mario.py | 485 +++++++++++++--------------------------------------
 1 file changed, 126 insertions(+), 359 deletions(-)

diff --git a/icm mario.py b/icm mario.py
index df5d085..7feb963 100644
--- a/icm mario.py
+++ b/icm mario.py
@@ -1,319 +1,144 @@
+import gym
+import numpy as np
+
 import torch
 from torch import nn
+import torch.nn.functional as F
 from torchvision import transforms as T
-from PIL import Image
-import numpy as np
-from pathlib import Path
-from collections import deque
-import random, datetime, os, copy
-from torch.distributions import Categorical
-import collections
-import cv2
-import torch.nn.functional as f
+
+from models import Actor, Critic, Encoder, InverseModel, ForwardModel
+from mario_env import create_mario_env
+
 from torch.utils.tensorboard import SummaryWriter
 writer = SummaryWriter()
-# Gym is an OpenAI toolkit for RL
-import gym
-from gym.spaces import Box
-from gym.wrappers import FrameStack
-
-# NES Emulator for OpenAI Gym
-from nes_py.wrappers import JoypadSpace
-
-# Super Mario environment for OpenAI Gym
-import gym_super_mario_bros
-from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+# Make environment
+env = gym.make('SuperMarioBros-1-1-v0')
+env = create_mario_env(env)

-class SkipFrame(gym.Wrapper):
-    def __init__(self, env, skip):
-        """Return only every `skip`-th frame"""
-        super().__init__(env)
-        self._skip = skip
+# Models
+encoder = Encoder(channels=4, encoded_state_size=256).to(device)
+inverse_model = InverseModel(encoded_state_size=256, action_size=env.action_space.n).to(device)
+forward_model = ForwardModel(encoded_state_size=256, action_size=env.action_space.n).to(device)
+actor = Actor(encoded_state_size=256, action_size=env.action_space.n).to(device)
+critic = Critic(encoded_state_size=256).to(device)

-    def step(self, action):
-        """Repeat action, and sum reward"""
-        total_reward = 0.0
-        for i in range(self._skip):
-            # Accumulate reward and repeat the same action
-            obs, reward, done, trunk, info = self.env.step(action)
-            total_reward += reward
-            if done:
-                break
-        return obs, total_reward, done, trunk, info
+# Optimizers
+actor_optim = torch.optim.Adam(actor.parameters(), lr=0.0001)
+critic_optim = torch.optim.Adam(critic.parameters(), lr=0.001)
+icm_params = list(encoder.parameters()) + list(forward_model.parameters()) + list(inverse_model.parameters())
+icm_optim = torch.optim.Adam(icm_params, lr=0.0001)
+
+# Loss functions
+ce = nn.CrossEntropyLoss().to(device)
+mse = nn.MSELoss().to(device)
+
+# Hyperparameters
+beta = 0.2
+alpha = 100
+gamma = 0.99
+lamda = 0.1
+
+# Training Parameters
+render = False
+num_episodes = 1000
+
+# Training
+def train():
+    t = 0
+    for episode in range(num_episodes):
+        observation = env.reset()
+        total_reward = 0
+        done = False
+        while not done:
+            #env.render()
+            state = torch.tensor(observation).to(device).unsqueeze(0) if observation.ndim == 3 else torch.tensor(observation).to(device)
+            action_probs = actor(state)
+            action = action_probs.sample()
+            action_one_hot = F.one_hot(action, num_classes=env.action_space.n).float()
+
+            next_observation, reward, done, info = env.step(action.item())
+            next_state = torch.tensor(next_observation).to(device).unsqueeze(0) if next_observation.ndim == 3 else torch.tensor(next_observation).to(device)
+
+            encoded_state = encoder(state)
+            next_encoded_state = encoder(next_state)
+            predicted_next_state = forward_model(encoded_state, action_one_hot)
+            predicted_action = inverse_model(encoded_state, next_encoded_state)
+
+            intrinsic_reward = alpha * mse(predicted_next_state, next_encoded_state.detach())
+            extrinsic_reward = torch.tensor(reward).to(device)
+            reward = intrinsic_reward + extrinsic_reward
+
+            forward_loss = mse(predicted_next_state, next_encoded_state.detach())
+            inverse_loss = ce(predicted_action.probs, action)
+            icm_loss = beta * forward_loss + (1 - beta) * inverse_loss
+
+            delta = reward + gamma * (critic(next_state) * (1 - done)) - critic(state)
+            actor_loss = -(action_probs.log_prob(action) + 1e-6) * delta
+            critic_loss = delta ** 2
+            ac_loss = actor_loss + critic_loss
+
+            loss = lamda * ac_loss + icm_loss
+
+            actor_optim.zero_grad()
+            critic_optim.zero_grad()
+            icm_optim.zero_grad()
+            loss.backward()
+            actor_optim.step()
+            critic_optim.step()
+            icm_optim.step()
+
+            observation = next_observation
+
+            total_reward += reward.item()
+
+            t += 1
+            writer.add_scalar('Loss/Actor Loss', actor_loss.item(), t)
+            writer.add_scalar('Loss/Critic Loss', critic_loss.item(), t)
+            writer.add_scalar('Loss/Forward Loss', forward_loss.item(), t)
+            writer.add_scalar('Loss/Inverse Loss', inverse_loss.item(), t)
+
+        writer.add_scalar('Reward/Episodic Reward', total_reward, episode)
+
+        if episode % 50 == 0:
+            torch.save(actor.state_dict(), 'saved_models/actor.pth')
+            torch.save(critic.state_dict(), 'saved_models/critic.pth')
+            torch.save(encoder.state_dict(), 'saved_models/encoder.pth')
+            torch.save(inverse_model.state_dict(), 'saved_models/inverse_model.pth')
+            torch.save(forward_model.state_dict(), 'saved_models/forward_model.pth')
+    env.close()
+
+def test():
+    actor.load_state_dict(torch.load('saved_models/actor.pth'))
+    critic.load_state_dict(torch.load('saved_models/critic.pth'))
+    encoder.load_state_dict(torch.load('saved_models/encoder.pth'))
+    inverse_model.load_state_dict(torch.load('saved_models/inverse_model.pth'))
+    forward_model.load_state_dict(torch.load('saved_models/forward_model.pth'))
+
+    observation = env.reset()
+
+    while True:
+        env.render()
+        state = torch.tensor(observation).to(device).unsqueeze(0) if observation.ndim == 3 else torch.tensor(observation).to(device)
+        action_probs = actor(state)
+        action = action_probs.sample()
+        observation, reward, done, info = env.step(action.item())
+        if done:
+            observation = env.reset()
+
+if __name__ == '__main__':
+    train()

-class GrayScaleObservation(gym.ObservationWrapper):
-    def __init__(self, env):
-        super().__init__(env)
-        obs_shape = self.observation_space.shape[:2]
-        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
-
-    def permute_orientation(self, observation):
-        # permute [H, W, C] array to [C, H, W] tensor
-        observation = np.transpose(observation, (2, 0, 1))
-        observation = torch.tensor(observation.copy(), dtype=torch.float)
-        return observation
-
-    def observation(self, observation):
-        observation = self.permute_orientation(observation)
-        transform = T.Grayscale()
-        observation = transform(observation)
-        return observation
-
-
-class ResizeObservation(gym.ObservationWrapper):
-    def __init__(self, env, shape):
-        super().__init__(env)
-        if isinstance(shape, int):
-            self.shape = (shape, shape)
-        else:
-            self.shape = tuple(shape)
-
-        obs_shape = self.shape + self.observation_space.shape[2:]
-        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
-
-    def observation(self, observation):
-        transforms = T.Compose(
-            [T.Resize(self.shape), T.Normalize(0, 255)]
-        )
-        observation = transforms(observation).squeeze(0)
-        return observation
-class MaxAndSkipEnv(gym.Wrapper):
-    """
-    Each action of the agent is repeated over skip frames
-    return only every `skip`-th frame
-    """
-    def __init__(self, env=None, skip=4):
-        super(MaxAndSkipEnv, self).__init__(env)
-        # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = collections.deque(maxlen=2)
-        self._skip = skip
-
-    def step(self, action):
-        total_reward = 0.0
-        done = None
-        for _ in range(self._skip):
-            obs, reward, done, info = self.env.step(action)
-            self._obs_buffer.append(obs)
-            total_reward += reward
-            if done:
-                break
-        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
-        return max_frame, total_reward, done, info
-
-    def reset(self):
-        """Clear past frame buffer and init to first obs"""
-        self._obs_buffer.clear()
-        obs = self.env.reset()
-        self._obs_buffer.append(obs)
-        return obs
-
-
-class MarioRescale84x84(gym.ObservationWrapper):
-    """
-    Downsamples/Rescales each frame to size 84x84 with greyscale
-    """
-    def __init__(self, env=None):
-        super(MarioRescale84x84, self).__init__(env)
-        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
-
-    def observation(self, obs):
-        return MarioRescale84x84.process(obs)
-
-    @staticmethod
-    def process(frame):
-        if frame.size == 240 * 256 * 3:
-            img = np.reshape(frame, [240, 256, 3]).astype(np.float32)
-        else:
-            assert False, "Unknown resolution."
-        # image normalization on RBG
-        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
-        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
-        x_t = resized_screen[18:102, :]
-        x_t = np.reshape(x_t, [84, 84, 1])
-        return x_t.astype(np.uint8)
-
-
-class ImageToPyTorch(gym.ObservationWrapper):
-    """
-    Each frame is converted to PyTorch tensors
-    """
-    def __init__(self, env):
-        super(ImageToPyTorch, self).__init__(env)
-        old_shape = self.observation_space.shape
-        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.float32)
-
-    def observation(self, observation):
-        return np.moveaxis(observation, 2, 0)
-
-
-class BufferWrapper(gym.ObservationWrapper):
-    """
-    Only every k-th frame is collected by the buffer
-    """
-    def __init__(self, env, n_steps, dtype=np.float32):
-        super(BufferWrapper, self).__init__(env)
-        self.dtype = dtype
-        old_space = env.observation_space
-        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
-                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)
-
-    def reset(self):
-        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
-        return self.observation(self.env.reset())
-
-    def observation(self, observation):
-        self.buffer[:-1] = self.buffer[1:]
-        self.buffer[-1] = observation
-        return self.buffer
-
-
-class PixelNormalization(gym.ObservationWrapper):
-    """
-    Normalize pixel values in frame --> 0 to 1
-    """
-    def observation(self, obs):
-        return np.array(obs).astype(np.float32) / 255.0
-
-
-def create_mario_env(env):
-    env = MaxAndSkipEnv(env)
-    env = MarioRescale84x84(env)
-    env = ImageToPyTorch(env)
-    env = BufferWrapper(env, 4)
-    env = PixelNormalization(env)
-    return JoypadSpace(env, COMPLEX_MOVEMENT)
-
-class ActorCritic(nn.Module):
-    def __init__(self, input_size, action_size=2):
-        super(ActorCritic, self).__init__()
-        self.input_size = input_size
-        self.action_size = action_size
-
-        self.feature = nn.Sequential(
-            nn.Conv2d(in_channels=self.input_size[0], out_channels=32, kernel_size=8, stride=4),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
-            nn.LeakyReLU(),
-            nn.Flatten(),
-            nn.Linear(in_features=7*7*64, out_features=512),
-            nn.LeakyReLU(),
-        )
-
-    def actor(self,state):
-        policy = nn.Sequential(
-            nn.Linear(in_features=state.shape[1], out_features=state.shape[1]),
-            nn.LeakyReLU(),
-            nn.Linear(in_features=state.shape[1], out_features=self.action_size),
-            nn.Softmax(dim=-1)
-        ).to(device)
-        return policy(state)
-
-    def critic(self,state):
-        value = nn.Sequential(
-            nn.Linear(in_features=state.shape[1], out_features=state.shape[1]),
-            nn.LeakyReLU(),
-            nn.Linear(in_features=state.shape[1], out_features=1)
-        ).to(device)
-        return value(state)
-
-
-    def forward(self, state):
-        if state.dim() == 3:
-            state = state.unsqueeze(0)
-        state = self.feature(state)
-        value = self.critic(state)
-        policy = self.actor(state)
-        action_probs = Categorical(policy)
-        log_action_probs = torch.log(action_probs.probs)
-        return value, action_probs, log_action_probs
-
-
-class Encoder(nn.Module):
-    def __init__(self, input_size, action_size=2):
-        super(Encoder, self).__init__()
-        self.input_size = input_size[0]
-        self.action_size = action_size
-
-        self.feature_encoder = nn.Sequential(
-            nn.Conv2d(in_channels=self.input_size, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Flatten(),
-            nn.Linear(in_features=32*4*4, out_features=256),
-        ).to(device)
-
-    def forward(self, state):
-        if state.dim() == 3:
-            state = state.unsqueeze(0)
-        state = self.feature_encoder(state)
-        return state
-
-
-class InverseModel(nn.Module):
-    def __init__(self, input_size, action_size=2):
-        super(InverseModel, self).__init__()
-        self.input_size = input_size[0]
-        self.action_size = action_size
-        self.feature_encoder = Encoder(input_size, action_size)
-
-        self.model = nn.Sequential(
-            nn.Linear(in_features=256*2, out_features=256),
-            nn.LeakyReLU(),
-            nn.Linear(in_features=256, out_features=self.action_size),
-            nn.Softmax(dim=-1)
-        ).to(device)
-
-    def forward(self, state, next_state):
-
-        encoded_state, next_encoded_state = torch.unsqueeze(state, dim=0), torch.unsqueeze(next_state, dim=0)
-        encoded_state, next_encoded_state = self.feature_encoder(encoded_state), self.feature_encoder(next_encoded_state)
-        encoded_states = torch.cat((encoded_state, next_encoded_state), dim=-1)
-        actions = Categorical(self.model(encoded_states))
-        a = float(np.array(actions.sample().cpu())[0])
-        action = torch.FloatTensor([a])
-        one_hot_action = f.one_hot(action.to(torch.int64), self.action_size)
-        return one_hot_action, encoded_state, next_encoded_state
-
-
-class ForwardModel(nn.Module):
-    def __init__(self, encoded_state_size, action_size):
-        super(ForwardModel, self).__init__()
-        self.state_size = encoded_state_size
-        self.action_size = action_size
-
-        self.model = nn.Sequential(
-            nn.Linear(self.state_size + 1, 256),
-            nn.LeakyReLU(),
-            nn.Linear(256, encoded_state_size)
-        ).to(device)
-
-    def forward(self, state, action):
-        if state.dim() == 3:
-            state = state.unsqueeze(0)
-        if action.dim() == 1:
-            action = action.unsqueeze(0)
-        state = torch.cat((state, action), dim=-1)
-        return self.model(state)
-
+exit()
 class ICM(nn.Module):
-    def __init__(self, state_size, action_size, encoded_state_size=256):
+    def __init__(self, state_size, action_size, inverse_model, forward_model, encoded_state_size=256):
         super(ICM, self).__init__()
         self.state_size = state_size
         self.action_size = action_size
-        self.inverse_model = InverseModel(state_size, action_size)
-        self.forward_model = ForwardModel(encoded_state_size, action_size)
+
         self.loss = nn.MSELoss().to(device)

         self.feature_encoder = nn.Sequential(
@@ -345,62 +170,4 @@ class ICM(nn.Module):
         predicted_next_state = self.forward_model(encoded_state, action)

         intrinsic_reward = 0.5 * self.loss(predicted_next_state, next_encoded_state.detach())
-        return intrinsic_reward
-
-#env = gym.make('SuperMarioBros-1-1-v0')
-#env = GrayScaleObservation(env)
-#env = ResizeObservation(env, shape=84)
-#env = FrameStack(env, num_stack=4)
-env = gym.make('SuperMarioBros-1-1-v0')
-env = create_mario_env(env)
-
-ce = nn.CrossEntropyLoss().to(device)
-mse = nn.MSELoss().to(device)
-icm = ICM(env.observation_space.shape, env.action_space.n).to(device)
-ac = ActorCritic(env.observation_space.shape, env.action_space.n).to(device)
-optimizer = torch.optim.Adam(list(icm.parameters()) + list(ac.parameters()), lr=0.001)
-done = False
-t = 0
-gamma = 0.99
-for episode in range(1000):
-    observation = env.reset()
-    total_reward = 0
-    t_init = t
-    while not done:
-        #env.render()
-        value, actions, log_action_probs = ac(torch.FloatTensor(np.array(observation)).to(device))
-        action = actions.sample().item()
-
-        next_observation, reward, done, info = env.step(action) # feedback from environment
-        observation_array, next_observation_array = torch.FloatTensor(np.array(observation)).to(device), torch.FloatTensor(np.array(next_observation)).to(device)
-
-        int_reward = icm(observation_array, next_observation_array, action)
-
-        delta = torch.squeeze(int_reward + gamma * (ac(next_observation_array)[0]*(1-int(done))) - ac(observation_array)[0])
-        actor_loss = -log_action_probs[0,action] * int_reward
-        critic_loss = delta**2
-
-        reward = torch.FloatTensor([reward]).to(device)
-        reward = int_reward
-
-        one_hot_action = icm.inverse_model(observation_array, next_observation_array)[0].to(device)
-        inverse_loss = ce(one_hot_action.float(), actions.probs)
-
-
-        loss = actor_loss + critic_loss + inverse_loss
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-        writer.add_scalar("loss", loss, t)
-
-
-        observation = next_observation
-
-        total_reward += reward
-        t += 1
-        #print("timestep: ", t, "reward: ", reward, "loss: ", loss)
-        if done:
-            done = False
-            break
-    writer.add_scalar("reward", total_reward/(t-t_init), t)
-env.close()
\ No newline at end of file
+        return intrinsic_reward
\ No newline at end of file
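
Note: the rewritten main file imports Actor, Critic, Encoder, InverseModel and ForwardModel from a models module that is not included in this patch. The sketch below is only an assumed set of interfaces, inferred from the call sites above (Encoder(channels, encoded_state_size), actor(state).sample(), critic(state), forward_model(encoded_state, action_one_hot), inverse_model(phi, phi_next).probs); the real models.py may differ.

# models.py -- minimal sketch of the assumed interfaces (not the author's code)
import torch
from torch import nn
from torch.distributions import Categorical

class Encoder(nn.Module):
    """Maps a stack of 4 grayscale 84x84 frames to a 256-d feature vector."""
    def __init__(self, channels=4, encoded_state_size=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(channels, 32, kernel_size=3, stride=2), nn.LeakyReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2), nn.LeakyReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2), nn.LeakyReLU(),
            nn.Conv2d(32, 32, kernel_size=3, stride=2), nn.LeakyReLU(),
            nn.Flatten(),
            nn.Linear(32 * 4 * 4, encoded_state_size),  # 4x4 spatial size for 84x84 inputs
        )

    def forward(self, state):
        return self.net(state)

class InverseModel(nn.Module):
    """Predicts a distribution over the action taken between two encoded states."""
    def __init__(self, encoded_state_size=256, action_size=12):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(encoded_state_size * 2, 256), nn.LeakyReLU(),
            nn.Linear(256, action_size), nn.Softmax(dim=-1),
        )

    def forward(self, phi_s, phi_next_s):
        return Categorical(probs=self.net(torch.cat((phi_s, phi_next_s), dim=-1)))

class ForwardModel(nn.Module):
    """Predicts the next encoded state from the current encoding and a one-hot action."""
    def __init__(self, encoded_state_size=256, action_size=12):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(encoded_state_size + action_size, 256), nn.LeakyReLU(),
            nn.Linear(256, encoded_state_size),
        )

    def forward(self, phi_s, action_one_hot):
        return self.net(torch.cat((phi_s, action_one_hot), dim=-1))

class Actor(nn.Module):
    """Policy: called on raw observations in train()/test(), so it embeds its own encoder."""
    def __init__(self, encoded_state_size=256, action_size=12):
        super().__init__()
        self.encoder = Encoder(encoded_state_size=encoded_state_size)
        self.head = nn.Sequential(nn.Linear(encoded_state_size, action_size), nn.Softmax(dim=-1))

    def forward(self, state):
        return Categorical(probs=self.head(self.encoder(state)))

class Critic(nn.Module):
    """State-value function, also called directly on raw observations."""
    def __init__(self, encoded_state_size=256):
        super().__init__()
        self.encoder = Encoder(encoded_state_size=encoded_state_size)
        self.head = nn.Linear(encoded_state_size, 1)

    def forward(self, state):
        return self.head(self.encoder(state)).squeeze(-1)  # shape (B,)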
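
Note: mario_env.create_mario_env is likewise imported but not part of this patch. A plausible reading is that the wrapper classes deleted from "icm mario.py" above were moved into mario_env.py unchanged; under that assumption, the factory would look like this sketch.

# mario_env.py -- assumed factory; MaxAndSkipEnv, MarioRescale84x84, ImageToPyTorch,
# BufferWrapper and PixelNormalization are assumed to be defined earlier in this module,
# exactly as they appear in the code removed from "icm mario.py" by this patch.
from nes_py.wrappers import JoypadSpace
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

def create_mario_env(env):
    env = MaxAndSkipEnv(env)        # repeat each action over 4 frames, max-pool consecutive frames
    env = MarioRescale84x84(env)    # grayscale and crop/resize to 84x84x1
    env = ImageToPyTorch(env)       # HWC -> CHW
    env = BufferWrapper(env, 4)     # stack the last 4 frames -> (4, 84, 84)
    env = PixelNormalization(env)   # scale pixels to [0, 1]
    return JoypadSpace(env, COMPLEX_MOVEMENT)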
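
For reference, the curiosity terms combined in train() follow the usual ICM recipe (forward-model prediction error as intrinsic reward, inverse-dynamics loss against the action actually executed, weighted by beta). The toy snippet below illustrates the two terms in isolation with made-up tensors; names and values here are illustrative only, not taken from the patch apart from alpha = 100 and beta = 0.2.

# Toy illustration of the ICM loss terms (standalone, made-up tensors).
import torch
import torch.nn.functional as F
from torch import nn

ce = nn.CrossEntropyLoss()

# Hypothetical single step with 12 discrete actions.
predicted_action_logits = torch.randn(1, 12, requires_grad=True)  # inverse-model output (unnormalized scores)
action_taken = torch.tensor([3])                                  # action actually executed

# Inverse loss: make the inverse model assign high probability to the executed action.
inverse_loss = ce(predicted_action_logits, action_taken)

# Forward loss: prediction error in feature space; its detached value is the intrinsic reward.
phi_next = torch.randn(1, 256)                       # encoder output for s_{t+1}
phi_next_pred = torch.randn(1, 256, requires_grad=True)  # forward-model prediction
forward_loss = F.mse_loss(phi_next_pred, phi_next.detach())
intrinsic_reward = 100 * forward_loss.detach()       # alpha = 100 as in this patch

icm_loss = 0.2 * forward_loss + (1 - 0.2) * inverse_loss  # beta = 0.2 as in this patch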