diff --git a/rl_games/common/env_configurations.py b/rl_games/common/env_configurations.py index d8b335e3..a407496d 100644 --- a/rl_games/common/env_configurations.py +++ b/rl_games/common/env_configurations.py @@ -9,7 +9,10 @@ import numpy as np import math - +try: + import bark_ml.environments.gym +except ImportError: + pass class HCRewardEnv(gym.RewardWrapper): def __init__(self, env): diff --git a/rl_games/common/player.py b/rl_games/common/player.py index 98be6501..33c848cc 100644 --- a/rl_games/common/player.py +++ b/rl_games/common/player.py @@ -12,6 +12,7 @@ from rl_games.common import env_configurations from rl_games.algos_torch import model_builder +import pandas as pd class BasePlayer(object): @@ -271,6 +272,9 @@ def init_rnn(self): )[2]), dtype=torch.float32).to(self.device) for s in rnn_states] def run(self): + # create pandas dataframe with fields: game_index, observation, action, reward and done + df = pd.DataFrame(columns=['game_index', 'observation', 'action', 'reward', 'done']) + n_games = self.games_num render = self.render_env n_game_life = self.n_game_life @@ -313,6 +317,8 @@ def run(self): print_game_res = False + game_indices = torch.arange(0, batch_size).to(self.device) + cur_games = batch_size for n in range(self.max_steps): if self.evaluation and n % self.update_checkpoint_freq == 0: self.maybe_load_new_checkpoint() @@ -324,7 +330,11 @@ def run(self): else: action = self.get_action(obses, is_deterministic) + prev_obses = obses obses, r, done, info = self.env_step(self.env, action) + + for i in range(batch_size): + df.loc[len(df)] = [game_indices[i].cpu().numpy().item(), prev_obses[i].cpu().numpy(), action[i].cpu().numpy(), r[i].cpu().numpy().item(), done[i].cpu().numpy().item()] cr += r steps += 1 @@ -337,6 +347,9 @@ def run(self): done_count = len(done_indices) games_played += done_count + for bid in done_indices: + game_indices[bid] = cur_games + cur_games += 1 if done_count > 0: if self.is_rnn: for s in self.states: @@ -379,6 +392,8 @@ def 
run(self): else: print('av reward:', sum_rewards / games_played * n_game_life, 'av steps:', sum_steps / games_played * n_game_life) + + df.to_parquet('game_data.parquet') def get_batch_size(self, obses, batch_size): obs_shape = self.obs_shape diff --git a/rl_games/configs/mujoco/ant_envpool.yaml b/rl_games/configs/mujoco/ant_envpool.yaml index da769e45..54eb015f 100644 --- a/rl_games/configs/mujoco/ant_envpool.yaml +++ b/rl_games/configs/mujoco/ant_envpool.yaml @@ -62,4 +62,7 @@ params: #flat_observation: True player: - render: False \ No newline at end of file + render: False + num_actors: 64 + games_num: 1000 + use_vecenv: True \ No newline at end of file