Changing main file

2023-01-30 17:57:49 +01:00 · 2023-01-30 17:57:49 +01:00 · 0781d4fd05
commit 0781d4fd05
parent 6441759199
1 changed file with 126 additions and 359 deletions
--- a/mario.py
+++ b/mario.py
@@ -1,319 +1,144 @@
+import gym
+import numpy as np
+
 import torch
 from torch import nn
+import torch.nn.functional as F
 from torchvision import transforms as T
-from PIL import Image
-import numpy as np
-from pathlib import Path
-from collections import deque
-import random, datetime, os, copy
-from torch.distributions import Categorical
-import collections
-import cv2
-import torch.nn.functional as f
+
+from models import Actor, Critic, Encoder, InverseModel, ForwardModel
+from mario_env import create_mario_env
+
 from torch.utils.tensorboard import SummaryWriter
 writer = SummaryWriter()

-# Gym is an OpenAI toolkit for RL
-import gym
-from gym.spaces import Box
-from gym.wrappers import FrameStack
-
-# NES Emulator for OpenAI Gym
-from nes_py.wrappers import JoypadSpace
-
-# Super Mario environment for OpenAI Gym
-import gym_super_mario_bros
-from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

+# Make environment
+env = gym.make('SuperMarioBros-1-1-v0')
+env = create_mario_env(env)

-class SkipFrame(gym.Wrapper):
-    def __init__(self, env, skip):
-        """Return only every `skip`-th frame"""
-        super().__init__(env)
-        self._skip = skip
+# Models
+encoder = Encoder(channels=4, encoded_state_size=256).to(device)
+inverse_model = InverseModel(encoded_state_size=256, action_size=env.action_space.n).to(device)
+forward_model = ForwardModel(encoded_state_size=256, action_size=env.action_space.n).to(device)
+actor = Actor(encoded_state_size=256, action_size=env.action_space.n).to(device)
+critic = Critic(encoded_state_size=256).to(device)

-    def step(self, action):
-        """Repeat action, and sum reward"""
-        total_reward = 0.0
-        for i in range(self._skip):
-            # Accumulate reward and repeat the same action
-            obs, reward, done, trunk, info = self.env.step(action)
-            total_reward += reward
-            if done:
-                break
-        return obs, total_reward, done, trunk, info
+# Optimizers
+actor_optim = torch.optim.Adam(actor.parameters(), lr=0.0001)
+critic_optim = torch.optim.Adam(critic.parameters(), lr=0.001)
+icm_params = list(encoder.parameters()) + list(forward_model.parameters()) + list(inverse_model.parameters())
+icm_optim = torch.optim.Adam(icm_params, lr=0.0001)
+
+# Loss functions
+ce = nn.CrossEntropyLoss().to(device)
+mse = nn.MSELoss().to(device)
+
+# Hyperparameters
+beta = 0.2
+alpha = 100
+gamma = 0.99
+lamda = 0.1
+
+# Training Parameters
+render = False
+num_episodes = 1000
+
+# Training
+def train():
+    t = 0
+    for episode in range(num_episodes):
+        observation = env.reset()
+        total_reward = 0
+        done = False
+        while not done:
+            #env.render()
+            state = torch.tensor(observation).to(device).unsqueeze(0) if observation.ndim == 3 else torch.tensor(observation).to(device)
+            action_probs = actor(state)
+            action = action_probs.sample()
+            action_one_hot = F.one_hot(action, num_classes=env.action_space.n).float()
+
+            next_observation, reward, done, info = env.step(action.item())
+            next_state = torch.tensor(next_observation).to(device).unsqueeze(0) if next_observation.ndim == 3 else torch.tensor(next_observation).to(device)
+
+            encoded_state = encoder(state)
+            next_encoded_state = encoder(next_state)
+            predicted_next_state = forward_model(encoded_state, action_one_hot)
+            predicted_action = inverse_model(encoded_state, next_encoded_state)
+
+            intrinsic_reward = alpha * mse(predicted_next_state, next_encoded_state.detach())
+            extrinsic_reward = torch.tensor(reward).to(device)
+            reward = intrinsic_reward + extrinsic_reward
+
+            forward_loss = mse(predicted_next_state, next_encoded_state.detach())
+            inverse_loss = ce(action_probs.probs,predicted_action.probs) 
+            icm_loss = beta * forward_loss + (1-beta) * inverse_loss
+
+            delta = reward + gamma * (critic(next_state)*(1-done)) - critic(state) 
+            actor_loss = -(action_probs.log_prob(action) +1e-6) * delta
+            critic_loss = delta ** 2
+            ac_loss = actor_loss + critic_loss
+
+            loss = lamda * ac_loss + icm_loss
+            
+            actor_optim.zero_grad()
+            critic_optim.zero_grad()
+            icm_optim.zero_grad()
+            loss.backward()
+            actor_optim.step()
+            critic_optim.step()
+            icm_optim.step()
+
+            observation = next_observation
+
+            total_reward += reward.item()
+            
+            t +=1
+            writer.add_scalar('Loss/Actor Loss', actor_loss.item(), t)
+            writer.add_scalar('Loss/Critic Loss', critic_loss.item(), t)
+            writer.add_scalar('Loss/Forward Loss', forward_loss.item(), t)
+            writer.add_scalar('Loss/Inverse Loss', inverse_loss.item(), t)
+
+        writer.add_scalar('Reward/Episodic Reward', total_reward, episode)
+
+        if episode % 50 == 0:
+            torch.save(actor.state_dict(), 'saved_models/actor.pth')
+            torch.save(critic.state_dict(), 'saved_models/critic.pth')
+            torch.save(encoder.state_dict(), 'saved_models/encoder.pth')
+            torch.save(inverse_model.state_dict(), 'saved_models/inverse_model.pth')
+            torch.save(forward_model.state_dict(), 'saved_models/forward_model.pth')
+    env.close()
+
+def test():
+    actor.load_state_dict(torch.load('saved_models/actor.pth'))
+    critic.load_state_dict(torch.load('saved_models/critic.pth'))
+    encoder.load_state_dict(torch.load('saved_models/encoder.pth'))
+    inverse_model.load_state_dict(torch.load('saved_models/inverse_model.pth'))
+    forward_model.load_state_dict(torch.load('saved_models/forward_model.pth'))
+
+    observation = env.reset()
+
+    while True:
+        env.render()
+        state = torch.tensor(observation).to(device).unsqueeze(0) if observation.ndim == 3 else torch.tensor(observation).to(device)
+        action_probs = actor(state)
+        action = action_probs.sample()
+        observation, reward, done, info = env.step(action.item())
+        if done:
+            observation = env.reset()
+
+if __name__ == '__main__':
+    train()


-class GrayScaleObservation(gym.ObservationWrapper):
-    def __init__(self, env):
-        super().__init__(env)
-        obs_shape = self.observation_space.shape[:2]
-        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
-
-    def permute_orientation(self, observation):
-        # permute [H, W, C] array to [C, H, W] tensor
-        observation = np.transpose(observation, (2, 0, 1))
-        observation = torch.tensor(observation.copy(), dtype=torch.float)
-        return observation
-
-    def observation(self, observation):
-        observation = self.permute_orientation(observation)
-        transform = T.Grayscale()
-        observation = transform(observation)
-        return observation
-
-
-class ResizeObservation(gym.ObservationWrapper):
-    def __init__(self, env, shape):
-        super().__init__(env)
-        if isinstance(shape, int):
-            self.shape = (shape, shape)
-        else:
-            self.shape = tuple(shape)
-
-        obs_shape = self.shape + self.observation_space.shape[2:]
-        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
-
-    def observation(self, observation):
-        transforms = T.Compose(
-            [T.Resize(self.shape), T.Normalize(0, 255)]
-        )
-        observation = transforms(observation).squeeze(0)
-        return observation
-class MaxAndSkipEnv(gym.Wrapper):
-    """
-        Each action of the agent is repeated over skip frames
-        return only every `skip`-th frame
-    """
-    def __init__(self, env=None, skip=4):
-        super(MaxAndSkipEnv, self).__init__(env)
-        # most recent raw observations (for max pooling across time steps)
-        self._obs_buffer = collections.deque(maxlen=2)
-        self._skip = skip
-
-    def step(self, action):
-        total_reward = 0.0
-        done = None
-        for _ in range(self._skip):
-            obs, reward, done, info = self.env.step(action)
-            self._obs_buffer.append(obs)
-            total_reward += reward
-            if done:
-                break
-        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
-        return max_frame, total_reward, done, info
-
-    def reset(self):
-        """Clear past frame buffer and init to first obs"""
-        self._obs_buffer.clear()
-        obs = self.env.reset()
-        self._obs_buffer.append(obs)
-        return obs
-
-
-class MarioRescale84x84(gym.ObservationWrapper):
-    """
-    Downsamples/Rescales each frame to size 84x84 with greyscale
-    """
-    def __init__(self, env=None):
-        super(MarioRescale84x84, self).__init__(env)
-        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
-
-    def observation(self, obs):
-        return MarioRescale84x84.process(obs)
-
-    @staticmethod
-    def process(frame):
-        if frame.size == 240 * 256 * 3:
-            img = np.reshape(frame, [240, 256, 3]).astype(np.float32)
-        else:
-            assert False, "Unknown resolution." 
-        # image normalization on RBG
-        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
-        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
-        x_t = resized_screen[18:102, :]
-        x_t = np.reshape(x_t, [84, 84, 1])
-        return x_t.astype(np.uint8)
-
-
-class ImageToPyTorch(gym.ObservationWrapper):
-    """
-    Each frame is converted to PyTorch tensors
-    """
-    def __init__(self, env):
-        super(ImageToPyTorch, self).__init__(env)
-        old_shape = self.observation_space.shape
-        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.float32)
-
-    def observation(self, observation):
-        return np.moveaxis(observation, 2, 0)
-
-    
-class BufferWrapper(gym.ObservationWrapper):
-    """
-    Only every k-th frame is collected by the buffer
-    """
-    def __init__(self, env, n_steps, dtype=np.float32):
-        super(BufferWrapper, self).__init__(env)
-        self.dtype = dtype
-        old_space = env.observation_space
-        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
-                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)
-
-    def reset(self):
-        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
-        return self.observation(self.env.reset())
-
-    def observation(self, observation):
-        self.buffer[:-1] = self.buffer[1:]
-        self.buffer[-1] = observation
-        return self.buffer
-
-
-class PixelNormalization(gym.ObservationWrapper):
-    """
-    Normalize pixel values in frame --> 0 to 1
-    """
-    def observation(self, obs):
-        return np.array(obs).astype(np.float32) / 255.0
-
-
-def create_mario_env(env):
-    env = MaxAndSkipEnv(env)
-    env = MarioRescale84x84(env)
-    env = ImageToPyTorch(env)
-    env = BufferWrapper(env, 4)
-    env = PixelNormalization(env)
-    return JoypadSpace(env, COMPLEX_MOVEMENT)
-
-class ActorCritic(nn.Module):
-    def __init__(self, input_size, action_size=2):
-        super(ActorCritic, self).__init__()
-        self.input_size = input_size
-        self.action_size = action_size
-
-        self.feature = nn.Sequential(
-            nn.Conv2d(in_channels=self.input_size[0], out_channels=32, kernel_size=8, stride=4),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
-            nn.LeakyReLU(),
-            nn.Flatten(),
-            nn.Linear(in_features=7*7*64, out_features=512),
-            nn.LeakyReLU(),
-        )
-
-    def actor(self,state):
-        policy = nn.Sequential(
-            nn.Linear(in_features=state.shape[1], out_features=state.shape[1]),
-            nn.LeakyReLU(),
-            nn.Linear(in_features=state.shape[1], out_features=self.action_size),
-            nn.Softmax(dim=-1)
-        ).to(device)
-        return policy(state)
-
-    def critic(self,state):
-        value = nn.Sequential(
-            nn.Linear(in_features=state.shape[1], out_features=state.shape[1]),
-            nn.LeakyReLU(),
-            nn.Linear(in_features=state.shape[1], out_features=1)
-        ).to(device)
-        return value(state)
-
-
-    def forward(self, state):
-        if state.dim() == 3:
-            state = state.unsqueeze(0)
-        state = self.feature(state)
-        value = self.critic(state)
-        policy = self.actor(state)
-        action_probs = Categorical(policy)
-        log_action_probs = torch.log(action_probs.probs)
-        return value, action_probs, log_action_probs
-
-
-class Encoder(nn.Module):
-    def __init__(self, input_size, action_size=2):
-        super(Encoder, self).__init__()
-        self.input_size = input_size[0]
-        self.action_size = action_size
-
-        self.feature_encoder = nn.Sequential(
-            nn.Conv2d(in_channels=self.input_size, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
-            nn.LeakyReLU(),
-            nn.Flatten(),
-            nn.Linear(in_features=32*4*4, out_features=256),  
-        ).to(device)
-
-    def forward(self, state):
-        if state.dim() == 3:
-            state = state.unsqueeze(0)
-        state = self.feature_encoder(state)
-        return state
-
-
-class InverseModel(nn.Module):
-    def __init__(self, input_size, action_size=2):
-        super(InverseModel, self).__init__()
-        self.input_size = input_size[0]
-        self.action_size = action_size
-        self.feature_encoder = Encoder(input_size, action_size)
-
-        self.model = nn.Sequential(
-            nn.Linear(in_features=256*2, out_features=256),
-            nn.LeakyReLU(),
-            nn.Linear(in_features=256, out_features=self.action_size),
-            nn.Softmax(dim=-1)
-        ).to(device)
-
-    def forward(self, state, next_state):
-
-        encoded_state, next_encoded_state = torch.unsqueeze(state, dim=0), torch.unsqueeze(next_state, dim=0)
-        encoded_state, next_encoded_state = self.feature_encoder(encoded_state), self.feature_encoder(next_encoded_state)
-        encoded_states = torch.cat((encoded_state, next_encoded_state), dim=-1)
-        actions = Categorical(self.model(encoded_states))
-        a = float(np.array(actions.sample().cpu())[0])
-        action = torch.FloatTensor([a])
-        one_hot_action = f.one_hot(action.to(torch.int64), self.action_size)
-        return one_hot_action, encoded_state, next_encoded_state
-
-
-class ForwardModel(nn.Module):
-    def __init__(self, encoded_state_size, action_size):
-        super(ForwardModel, self).__init__()
-        self.state_size = encoded_state_size
-        self.action_size = action_size
-  
-        self.model = nn.Sequential(
-            nn.Linear(self.state_size + 1, 256),
-            nn.LeakyReLU(),
-            nn.Linear(256, encoded_state_size)
-        ).to(device)
-
-    def forward(self, state, action):
-        if state.dim() == 3:
-            state = state.unsqueeze(0)
-        if action.dim() == 1:
-            action = action.unsqueeze(0)
-        state = torch.cat((state, action), dim=-1)
-        return self.model(state)
-
+exit()
 class ICM(nn.Module):
-    def __init__(self, state_size, action_size, encoded_state_size=256):
+    def __init__(self, state_size, action_size, inverse_model, forward_model, encoded_state_size=256):
        super(ICM, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
-        self.inverse_model = InverseModel(state_size, action_size)
-        self.forward_model = ForwardModel(encoded_state_size, action_size)
+        
        self.loss = nn.MSELoss().to(device)

        self.feature_encoder = nn.Sequential(
@@ -345,62 +170,4 @@ class ICM(nn.Module):
        predicted_next_state = self.forward_model(encoded_state, action)
        
        intrinsic_reward = 0.5 * self.loss(predicted_next_state, next_encoded_state.detach())
-        return intrinsic_reward
-
-#env = gym.make('SuperMarioBros-1-1-v0')
-#env = GrayScaleObservation(env)
-#env = ResizeObservation(env, shape=84)
-#env = FrameStack(env, num_stack=4)
-env = gym.make('SuperMarioBros-1-1-v0')
-env = create_mario_env(env)
-
-ce = nn.CrossEntropyLoss().to(device)
-mse = nn.MSELoss().to(device)
-icm = ICM(env.observation_space.shape, env.action_space.n).to(device)
-ac = ActorCritic(env.observation_space.shape, env.action_space.n).to(device)
-optimizer = torch.optim.Adam(list(icm.parameters()) + list(ac.parameters()), lr=0.001)
-done = False
-t = 0
-gamma = 0.99    
-for episode in range(1000):
-    observation = env.reset()
-    total_reward = 0
-    t_init = t
-    while not done:
-        #env.render()
-        value, actions, log_action_probs = ac(torch.FloatTensor(np.array(observation)).to(device))    
-        action = actions.sample().item()
-
-        next_observation, reward, done, info = env.step(action)  # feedback from environment    
-        observation_array, next_observation_array = torch.FloatTensor(np.array(observation)).to(device), torch.FloatTensor(np.array(next_observation)).to(device)
-
-        int_reward = icm(observation_array, next_observation_array, action)
-
-        delta = torch.squeeze(int_reward + gamma * (ac(next_observation_array)[0]*(1-int(done))) - ac(observation_array)[0])
-        actor_loss = -log_action_probs[0,action] * int_reward
-        critic_loss = delta**2
-
-        reward = torch.FloatTensor([reward]).to(device)
-        reward = int_reward
-        
-        one_hot_action = icm.inverse_model(observation_array, next_observation_array)[0].to(device)
-        inverse_loss = ce(one_hot_action.float(), actions.probs)
-
-
-        loss = actor_loss + critic_loss + inverse_loss
-        optimizer.zero_grad()
-        loss.backward()
-        optimizer.step()
-        writer.add_scalar("loss", loss, t)
-        
-
-        observation = next_observation
-
-        total_reward += reward
-        t += 1
-        #print("timestep: ", t, "reward: ", reward, "loss: ", loss)
-        if done:
-            done = False
-            break
-    writer.add_scalar("reward", total_reward/(t-t_init), t)
-env.close()
+        return intrinsic_reward