Changing main file

ved1 2023-01-30 17:57:49 +01:00
parent 6441759199
commit 0781d4fd05


@@ -1,319 +1,144 @@
import gym
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms as T
from PIL import Image
from pathlib import Path
from collections import deque
import random, datetime, os, copy
from torch.distributions import Categorical
import collections
import cv2
import torch.nn.functional as f
from models import Actor, Critic, Encoder, InverseModel, ForwardModel
from mario_env import create_mario_env
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
# Gym is an OpenAI toolkit for RL
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
# NES Emulator for OpenAI Gym
from nes_py.wrappers import JoypadSpace
# Super Mario environment for OpenAI Gym
import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Make environment
env = gym.make('SuperMarioBros-1-1-v0')
env = create_mario_env(env)
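# create_mario_env is imported from mario_env.py; judging by the version of
# create_mario_env kept further down in this diff, it applies frame skipping
# with max-pooling, 84x84 grayscale rescaling, channel-first conversion,
# 4-frame stacking and pixel normalization, and restricts the controller to
# COMPLEX_MOVEMENT.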
class SkipFrame(gym.Wrapper):
def __init__(self, env, skip):
"""Return only every `skip`-th frame"""
super().__init__(env)
self._skip = skip
# Models
encoder = Encoder(channels=4, encoded_state_size=256).to(device)
inverse_model = InverseModel(encoded_state_size=256, action_size=env.action_space.n).to(device)
forward_model = ForwardModel(encoded_state_size=256, action_size=env.action_space.n).to(device)
actor = Actor(encoded_state_size=256, action_size=env.action_space.n).to(device)
critic = Critic(encoded_state_size=256).to(device)
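# ICM components (architectures defined in models.py): the Encoder maps the
# stacked 4x84x84 observation to a 256-d feature phi(s), the InverseModel
# predicts a distribution over the taken action from (phi(s), phi(s')), and the
# ForwardModel predicts phi(s') from phi(s) and a one-hot action, as used in
# train() below. Actor and Critic are called directly on the preprocessed
# observation.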
    def step(self, action):
        """Repeat action, and sum reward"""
        total_reward = 0.0
        for i in range(self._skip):
            # Accumulate reward and repeat the same action
            obs, reward, done, trunk, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, trunk, info
# Optimizers
actor_optim = torch.optim.Adam(actor.parameters(), lr=0.0001)
critic_optim = torch.optim.Adam(critic.parameters(), lr=0.001)
icm_params = list(encoder.parameters()) + list(forward_model.parameters()) + list(inverse_model.parameters())
icm_optim = torch.optim.Adam(icm_params, lr=0.0001)
# Loss functions
ce = nn.CrossEntropyLoss().to(device)
mse = nn.MSELoss().to(device)
# Hyperparameters
beta = 0.2
alpha = 100
gamma = 0.99
lamda = 0.1
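# beta trades off the forward- vs inverse-model terms in the ICM loss, alpha
# scales the intrinsic (curiosity) reward, gamma is the discount factor and
# lamda weights the actor-critic loss against the ICM loss, following the usual
# curiosity-driven exploration (ICM) formulation.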
# Training Parameters
render = False
num_episodes = 1000
# Training
def train():
t = 0
for episode in range(num_episodes):
observation = env.reset()
total_reward = 0
done = False
while not done:
#env.render()
state = torch.tensor(observation).to(device).unsqueeze(0) if observation.ndim == 3 else torch.tensor(observation).to(device)
action_probs = actor(state)
action = action_probs.sample()
action_one_hot = F.one_hot(action, num_classes=env.action_space.n).float()
next_observation, reward, done, info = env.step(action.item())
next_state = torch.tensor(next_observation).to(device).unsqueeze(0) if next_observation.ndim == 3 else torch.tensor(next_observation).to(device)
encoded_state = encoder(state)
next_encoded_state = encoder(next_state)
predicted_next_state = forward_model(encoded_state, action_one_hot)
predicted_action = inverse_model(encoded_state, next_encoded_state)
            # ICM forward loss: prediction error on the next encoded state
            forward_loss = mse(predicted_next_state, next_encoded_state.detach())
            # curiosity reward; detached so the RL losses do not backpropagate into the ICM
            intrinsic_reward = alpha * forward_loss.detach()
            extrinsic_reward = torch.tensor(reward, dtype=torch.float32).to(device)
            reward = intrinsic_reward + extrinsic_reward
            # ICM inverse loss: negative log-likelihood of the action actually taken
            # under the inverse model's predicted action distribution
            inverse_loss = -predicted_action.log_prob(action).mean()
            icm_loss = beta * forward_loss + (1 - beta) * inverse_loss
            # one-step TD error; the bootstrap value is masked at episode end
            delta = reward + gamma * critic(next_state) * (1 - int(done)) - critic(state)
            # the advantage is detached so the policy gradient does not update the critic
            actor_loss = -action_probs.log_prob(action) * delta.detach()
            critic_loss = delta ** 2
            ac_loss = (actor_loss + critic_loss).mean()
            loss = lamda * ac_loss + icm_loss
actor_optim.zero_grad()
critic_optim.zero_grad()
icm_optim.zero_grad()
loss.backward()
actor_optim.step()
critic_optim.step()
icm_optim.step()
observation = next_observation
total_reward += reward.item()
t +=1
writer.add_scalar('Loss/Actor Loss', actor_loss.item(), t)
writer.add_scalar('Loss/Critic Loss', critic_loss.item(), t)
writer.add_scalar('Loss/Forward Loss', forward_loss.item(), t)
writer.add_scalar('Loss/Inverse Loss', inverse_loss.item(), t)
writer.add_scalar('Reward/Episodic Reward', total_reward, episode)
if episode % 50 == 0:
torch.save(actor.state_dict(), 'saved_models/actor.pth')
torch.save(critic.state_dict(), 'saved_models/critic.pth')
torch.save(encoder.state_dict(), 'saved_models/encoder.pth')
torch.save(inverse_model.state_dict(), 'saved_models/inverse_model.pth')
torch.save(forward_model.state_dict(), 'saved_models/forward_model.pth')
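            # note: torch.save does not create directories, so saved_models/ must
            # already exist (e.g. created once with os.makedirs('saved_models', exist_ok=True))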
env.close()
def test():
actor.load_state_dict(torch.load('saved_models/actor.pth'))
critic.load_state_dict(torch.load('saved_models/critic.pth'))
encoder.load_state_dict(torch.load('saved_models/encoder.pth'))
inverse_model.load_state_dict(torch.load('saved_models/inverse_model.pth'))
forward_model.load_state_dict(torch.load('saved_models/forward_model.pth'))
observation = env.reset()
while True:
env.render()
state = torch.tensor(observation).to(device).unsqueeze(0) if observation.ndim == 3 else torch.tensor(observation).to(device)
action_probs = actor(state)
action = action_probs.sample()
observation, reward, done, info = env.step(action.item())
if done:
break
if __name__ == '__main__':
train()
class GrayScaleObservation(gym.ObservationWrapper):
def __init__(self, env):
super().__init__(env)
obs_shape = self.observation_space.shape[:2]
self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
def permute_orientation(self, observation):
# permute [H, W, C] array to [C, H, W] tensor
observation = np.transpose(observation, (2, 0, 1))
observation = torch.tensor(observation.copy(), dtype=torch.float)
return observation
def observation(self, observation):
observation = self.permute_orientation(observation)
transform = T.Grayscale()
observation = transform(observation)
return observation
class ResizeObservation(gym.ObservationWrapper):
def __init__(self, env, shape):
super().__init__(env)
if isinstance(shape, int):
self.shape = (shape, shape)
else:
self.shape = tuple(shape)
obs_shape = self.shape + self.observation_space.shape[2:]
self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)
def observation(self, observation):
transforms = T.Compose(
[T.Resize(self.shape), T.Normalize(0, 255)]
)
observation = transforms(observation).squeeze(0)
return observation
class MaxAndSkipEnv(gym.Wrapper):
"""
    Each agent action is repeated for `skip` frames and only every `skip`-th
    frame is returned, taking the pixel-wise max over the two most recent frames
"""
def __init__(self, env=None, skip=4):
super(MaxAndSkipEnv, self).__init__(env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = collections.deque(maxlen=2)
self._skip = skip
def step(self, action):
total_reward = 0.0
done = None
for _ in range(self._skip):
obs, reward, done, info = self.env.step(action)
self._obs_buffer.append(obs)
total_reward += reward
if done:
break
max_frame = np.max(np.stack(self._obs_buffer), axis=0)
return max_frame, total_reward, done, info
def reset(self):
"""Clear past frame buffer and init to first obs"""
self._obs_buffer.clear()
obs = self.env.reset()
self._obs_buffer.append(obs)
return obs
class MarioRescale84x84(gym.ObservationWrapper):
"""
Downsamples/Rescales each frame to size 84x84 with greyscale
"""
def __init__(self, env=None):
super(MarioRescale84x84, self).__init__(env)
self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
def observation(self, obs):
return MarioRescale84x84.process(obs)
@staticmethod
def process(frame):
if frame.size == 240 * 256 * 3:
img = np.reshape(frame, [240, 256, 3]).astype(np.float32)
else:
assert False, "Unknown resolution."
        # convert RGB to grayscale using the standard luminance weights
        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
        # resize to 84x110, then crop 84 rows to get the final 84x84 frame
        x_t = resized_screen[18:102, :]
x_t = np.reshape(x_t, [84, 84, 1])
return x_t.astype(np.uint8)
class ImageToPyTorch(gym.ObservationWrapper):
"""
Each frame is converted to PyTorch tensors
"""
def __init__(self, env):
super(ImageToPyTorch, self).__init__(env)
old_shape = self.observation_space.shape
self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.float32)
def observation(self, observation):
return np.moveaxis(observation, 2, 0)
class BufferWrapper(gym.ObservationWrapper):
"""
    Keeps a rolling buffer of the last n_steps observations (frame stacking)
"""
def __init__(self, env, n_steps, dtype=np.float32):
super(BufferWrapper, self).__init__(env)
self.dtype = dtype
old_space = env.observation_space
self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
old_space.high.repeat(n_steps, axis=0), dtype=dtype)
def reset(self):
self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
return self.observation(self.env.reset())
def observation(self, observation):
self.buffer[:-1] = self.buffer[1:]
self.buffer[-1] = observation
return self.buffer
class PixelNormalization(gym.ObservationWrapper):
"""
Normalize pixel values in frame --> 0 to 1
"""
def observation(self, obs):
return np.array(obs).astype(np.float32) / 255.0
def create_mario_env(env):
env = MaxAndSkipEnv(env)
env = MarioRescale84x84(env)
env = ImageToPyTorch(env)
env = BufferWrapper(env, 4)
env = PixelNormalization(env)
return JoypadSpace(env, COMPLEX_MOVEMENT)
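# After this wrapper chain each observation is a float32 array of shape
# (4, 84, 84) with values in [0, 1]: ImageToPyTorch makes frames channel-first,
# BufferWrapper stacks the last 4 frames along the channel axis and
# PixelNormalization divides by 255.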
class ActorCritic(nn.Module):
def __init__(self, input_size, action_size=2):
super(ActorCritic, self).__init__()
self.input_size = input_size
self.action_size = action_size
self.feature = nn.Sequential(
nn.Conv2d(in_channels=self.input_size[0], out_channels=32, kernel_size=8, stride=4),
nn.LeakyReLU(),
nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
nn.LeakyReLU(),
nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
nn.LeakyReLU(),
nn.Flatten(),
nn.Linear(in_features=7*7*64, out_features=512),
nn.LeakyReLU(),
)
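        # with 84x84 inputs the convolutions give 20x20 -> 9x9 -> 7x7 feature
        # maps, which is where the 7*7*64 flattened size comes from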
        # policy and value heads; registering them here keeps their weights in
        # the module's parameters so they are trained together with the features
        self.policy_head = nn.Sequential(
            nn.Linear(in_features=512, out_features=512),
            nn.LeakyReLU(),
            nn.Linear(in_features=512, out_features=self.action_size),
            nn.Softmax(dim=-1)
        )
        self.value_head = nn.Sequential(
            nn.Linear(in_features=512, out_features=512),
            nn.LeakyReLU(),
            nn.Linear(in_features=512, out_features=1)
        )

    def actor(self, state):
        return self.policy_head(state)

    def critic(self, state):
        return self.value_head(state)
def forward(self, state):
if state.dim() == 3:
state = state.unsqueeze(0)
state = self.feature(state)
value = self.critic(state)
policy = self.actor(state)
action_probs = Categorical(policy)
log_action_probs = torch.log(action_probs.probs)
return value, action_probs, log_action_probs
class Encoder(nn.Module):
def __init__(self, input_size, action_size=2):
super(Encoder, self).__init__()
self.input_size = input_size[0]
self.action_size = action_size
self.feature_encoder = nn.Sequential(
nn.Conv2d(in_channels=self.input_size, out_channels=32, kernel_size=3, stride=2),
nn.LeakyReLU(),
nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
nn.LeakyReLU(),
nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
nn.LeakyReLU(),
nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3, stride=2),
nn.LeakyReLU(),
nn.Flatten(),
nn.Linear(in_features=32*4*4, out_features=256),
).to(device)
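        # with 84x84 inputs the four stride-2 convolutions give 41x41, 20x20,
        # 9x9 and 4x4 feature maps, hence the 32*4*4 flattened size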
def forward(self, state):
if state.dim() == 3:
state = state.unsqueeze(0)
state = self.feature_encoder(state)
return state
class InverseModel(nn.Module):
def __init__(self, input_size, action_size=2):
super(InverseModel, self).__init__()
self.input_size = input_size[0]
self.action_size = action_size
self.feature_encoder = Encoder(input_size, action_size)
self.model = nn.Sequential(
nn.Linear(in_features=256*2, out_features=256),
nn.LeakyReLU(),
nn.Linear(in_features=256, out_features=self.action_size),
nn.Softmax(dim=-1)
).to(device)
def forward(self, state, next_state):
encoded_state, next_encoded_state = torch.unsqueeze(state, dim=0), torch.unsqueeze(next_state, dim=0)
encoded_state, next_encoded_state = self.feature_encoder(encoded_state), self.feature_encoder(next_encoded_state)
encoded_states = torch.cat((encoded_state, next_encoded_state), dim=-1)
actions = Categorical(self.model(encoded_states))
        # sample an action from the predicted distribution and one-hot encode it
        sampled_action = actions.sample()
        one_hot_action = F.one_hot(sampled_action.to(torch.int64), self.action_size)
return one_hot_action, encoded_state, next_encoded_state
class ForwardModel(nn.Module):
def __init__(self, encoded_state_size, action_size):
super(ForwardModel, self).__init__()
self.state_size = encoded_state_size
self.action_size = action_size
self.model = nn.Sequential(
nn.Linear(self.state_size + 1, 256),
nn.LeakyReLU(),
nn.Linear(256, encoded_state_size)
).to(device)
def forward(self, state, action):
if state.dim() == 3:
state = state.unsqueeze(0)
if action.dim() == 1:
action = action.unsqueeze(0)
state = torch.cat((state, action), dim=-1)
return self.model(state)
class ICM(nn.Module):
    def __init__(self, state_size, action_size, encoded_state_size=256):
super(ICM, self).__init__()
self.state_size = state_size
self.action_size = action_size
self.inverse_model = InverseModel(state_size, action_size)
self.forward_model = ForwardModel(encoded_state_size, action_size)
self.loss = nn.MSELoss().to(device)
self.feature_encoder = nn.Sequential(
@@ -346,61 +171,3 @@ class ICM(nn.Module):
intrinsic_reward = 0.5 * self.loss(predicted_next_state, next_encoded_state.detach())
return intrinsic_reward
#env = gym.make('SuperMarioBros-1-1-v0')
#env = GrayScaleObservation(env)
#env = ResizeObservation(env, shape=84)
#env = FrameStack(env, num_stack=4)
env = gym.make('SuperMarioBros-1-1-v0')
env = create_mario_env(env)
ce = nn.CrossEntropyLoss().to(device)
mse = nn.MSELoss().to(device)
icm = ICM(env.observation_space.shape, env.action_space.n).to(device)
ac = ActorCritic(env.observation_space.shape, env.action_space.n).to(device)
optimizer = torch.optim.Adam(list(icm.parameters()) + list(ac.parameters()), lr=0.001)
done = False
t = 0
gamma = 0.99
for episode in range(1000):
observation = env.reset()
total_reward = 0
t_init = t
while not done:
#env.render()
value, actions, log_action_probs = ac(torch.FloatTensor(np.array(observation)).to(device))
action = actions.sample().item()
next_observation, reward, done, info = env.step(action) # feedback from environment
observation_array, next_observation_array = torch.FloatTensor(np.array(observation)).to(device), torch.FloatTensor(np.array(next_observation)).to(device)
int_reward = icm(observation_array, next_observation_array, action)
delta = torch.squeeze(int_reward + gamma * (ac(next_observation_array)[0]*(1-int(done))) - ac(observation_array)[0])
actor_loss = -log_action_probs[0,action] * int_reward
critic_loss = delta**2
        # only the intrinsic (curiosity) reward is used as the learning signal here
        reward = int_reward
one_hot_action = icm.inverse_model(observation_array, next_observation_array)[0].to(device)
inverse_loss = ce(one_hot_action.float(), actions.probs)
loss = actor_loss + critic_loss + inverse_loss
optimizer.zero_grad()
loss.backward()
optimizer.step()
writer.add_scalar("loss", loss, t)
observation = next_observation
total_reward += reward
t += 1
#print("timestep: ", t, "reward: ", reward, "loss: ", loss)
if done:
done = False
break
writer.add_scalar("reward", total_reward/(t-t_init), t)
env.close()