Merge branch 'development'
weidler committed Sep 14, 2022
2 parents 50c1287 + a27bf95 commit 61782d8
Showing 36 changed files with 804 additions and 257 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -28,7 +28,7 @@ AngoraPy is available on PyPI.
pip install angorapy
```

### MuJoCo and MuJoCo-Py
### MuJoCo
To train on any MuJoCo-based environment, you will need MuJoCo. As of late 2021, MuJoCo is free and can be [downloaded here](https://mujoco.org/download).
As an interface to Python, we use mujoco-py, [available here](https://github.com/openai/mujoco-py). To install both, follow their respective instructions.

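A quick way to confirm that MuJoCo and mujoco-py are wired up correctly is a small smoke test like the one below (a minimal sketch, not part of this commit; the MJCF model and step count are arbitrary, and it assumes mujoco-py was installed per its own instructions):

```python
# Minimal sanity check for a mujoco-py installation (illustrative sketch).
import mujoco_py

# A throwaway MJCF model: a single box attached to the world by a free joint.
MODEL_XML = """
<mujoco>
  <worldbody>
    <body name="box" pos="0 0 1">
      <freejoint/>
      <geom type="box" size="0.1 0.1 0.1"/>
    </body>
  </worldbody>
</mujoco>
"""

model = mujoco_py.load_model_from_xml(MODEL_XML)
sim = mujoco_py.MjSim(model)

for _ in range(100):
    sim.step()

print(sim.data.qpos)  # joint positions after 100 physics steps
```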
25 changes: 13 additions & 12 deletions angorapy/agent/ppo_agent.py
@@ -25,7 +25,7 @@
from angorapy.agent.gather import Gatherer, evaluate, EpsilonGreedyGatherer
from angorapy.agent.ppo.optim import learn_on_batch
from angorapy.common import policies, const
from angorapy.common.const import COLORS, BASE_SAVE_PATH, PRETRAINED_COMPONENTS_PATH, STORAGE_DIR
from angorapy.common.const import COLORS, BASE_SAVE_PATH, PRETRAINED_COMPONENTS_PATH, STORAGE_DIR, PATH_TO_EXPERIMENTS
from angorapy.common.const import MIN_STAT_EPS
from angorapy.common.mpi_optim import MpiAdam
from angorapy.common.policies import BasePolicyDistribution, CategoricalPolicyDistribution, GaussianPolicyDistribution
@@ -68,14 +68,6 @@


class PPOAgent:
"""Agent using the Proximal Policy Optimization Algorithm for learning.
The default is an implementation using two independent models for the critic and the actor. This is of course more
expensive than using shared parameters because we need two forward and backward calculations
per batch however this is what is used in the original paper and most implementations. During development this also
turned out to be beneficial for performance relative to episodes seen in easy tasks (e.g. CartPole) and crucial
to make any significant progress in more difficult environments such as LunarLander.
"""
policy: tf.keras.Model
value: tf.keras.Model
joint: tf.keras.Model
@@ -102,6 +94,14 @@ def __init__(self,
pretrained_components: list = None):
""" Initialize the PPOAgent with given hyperparameters. Policy and value network will be freshly initialized.
Agent using the Proximal Policy Optimization Algorithm for learning.
The default is an implementation using two independent models for the critic and the actor. This is of course more
expensive than using shared parameters because we need two forward and backward calculations
per batch however this is what is used in the original paper and most implementations. During development this also
turned out to be beneficial for performance relative to episodes seen in easy tasks (e.g. CartPole) and crucial
to make any significant progress in more difficult environments such as LunarLander.
Args:
model_builder: a function creating a policy, value and joint model
environment (gym.Env): the environment in which the agent will learn
@@ -219,6 +219,7 @@ def __init__(self,
self.model_export_dir = "storage/saved_models/exports/"
self.agent_id = mpi_comm.bcast(f"{round(time.time())}{random.randint(int(1e5), int(1e6) - 1)}", root=0)
self.agent_directory = f"{BASE_SAVE_PATH}/{self.agent_id}/"
self.experiment_directory = f"{PATH_TO_EXPERIMENTS}/{self.agent_id}/"
if _make_dirs:
os.makedirs(self.model_export_dir, exist_ok=True)
os.makedirs(self.agent_directory, exist_ok=True)
@@ -771,9 +772,9 @@ def report(self, total_iterations):
current_lr = self.lr_schedule

# losses
pi_loss = "-" if len(self.policy_loss_history) == 0 else f"{round(self.policy_loss_history[-1], 2):6.2f}"
v_loss = "-" if len(self.value_loss_history) == 0 else f"{round(self.value_loss_history[-1], 2):8.2f}"
ent = "-" if len(self.entropy_history) == 0 else f"{round(self.entropy_history[-1], 2):6.2f}"
pi_loss = " pi " if len(self.policy_loss_history) == 0 else f"{round(self.policy_loss_history[-1], 2):6.2f}"
v_loss = " v " if len(self.value_loss_history) == 0 else f"{round(self.value_loss_history[-1], 2):8.2f}"
ent = " ent " if len(self.entropy_history) == 0 else f"{round(self.entropy_history[-1], 2):6.2f}"

# tbptt underflow
underflow = f"w: {nc}{self.underflow_history[-1]}{ec}; " if self.underflow_history[-1] is not None else ""
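For context on the independent actor/critic design described in the docstring above, a toy model builder could look roughly like the sketch below (illustrative only; the function name, signature, and layer sizes are assumptions, and AngoraPy's actual builders live in angorapy.models):

```python
import tensorflow as tf


def toy_model_builder(env):
    """Build independent policy and value networks plus a joint wrapper (sketch).

    Assumes Box observation and action spaces; not AngoraPy's real builder API.
    """
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    inputs = tf.keras.Input(shape=(obs_dim,), name="state")

    # Actor: its own hidden stack; no parameters are shared with the critic.
    pi_hidden = tf.keras.layers.Dense(64, activation="tanh")(inputs)
    pi_out = tf.keras.layers.Dense(n_actions, name="action_logits")(pi_hidden)
    policy = tf.keras.Model(inputs, pi_out, name="policy")

    # Critic: an independent hidden stack producing a scalar state value.
    v_hidden = tf.keras.layers.Dense(64, activation="tanh")(inputs)
    v_out = tf.keras.layers.Dense(1, name="state_value")(v_hidden)
    value = tf.keras.Model(inputs, v_out, name="value")

    # Joint model bundles both heads; with no shared layers it is purely a
    # convenience wrapper, at the cost of two full passes per batch.
    joint = tf.keras.Model(inputs, [pi_out, v_out], name="joint")

    return policy, value, joint
```

The two extra passes per batch are the price of this independence, which is the trade-off the docstring refers to.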
11 changes: 4 additions & 7 deletions angorapy/analysis/inspect_camera.py
@@ -1,24 +1,21 @@
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

import tensorflow as tf
from environments import *

from agent.ppo_agent import PPOAgent
from common.wrappers import make_env
from angorapy.models import get_model_builder
from angorapy.common.wrappers import make_env

import matplotlib.pyplot as plt

tf.get_logger().setLevel('INFO')

env = make_env("ReachAbsoluteVisual-v0")
agent = PPOAgent(get_model_builder("shadow", "gru"), env, 1024, 8)
env = make_env("HumanoidVisualManipulateBlock-v0")

state = env.reset()

for i in range(100):
state, r, dd, info = env.step(env.action_space.sample())

plt.imshow(state.vision / 255)
plt.show()
plt.show()
111 changes: 111 additions & 0 deletions angorapy/analysis/plotting/plot_benchmark_groups.py
@@ -0,0 +1,111 @@
import json
import os
import re
from json import JSONDecodeError

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

from angorapy.common.const import PATH_TO_EXPERIMENTS, BASE_SAVE_PATH

matplotlib.use('TkAgg')

group_names = ["benchmark-performance-pdl", "benchmark-performance-llc", "benchmark-performance-cp",
"benchmark-performance-ab"]
titles = ["Pendulum", "LunarLanderContinuous", "CartPole", "Acrobot"]

# group_names = [
# "benchmark-performance-ant",
# "benchmark-performance-walker2d",
# "benchmark-performance-swimmer",
# "benchmark-performance-reacher",
# "benchmark-performance-humanoidstandup",
# "benchmark-performance-humanoid",
# "benchmark-performance-hopper",
# "benchmark-performance-halfcheetah"
# ]

# group_names = [
# "benchmark-beta-reach",
# "benchmark-beta-freereach"
# ]

# titles = [n.split("-")[-1].capitalize() for n in group_names]

exp_dir = "../../../" + PATH_TO_EXPERIMENTS
experiment_paths = [os.path.join(exp_dir, p) for p in os.listdir(exp_dir)]

environments = {}
reward_thresholds = {}
experiments_by_groups = {}
envs_available = set()

for exp_path in experiment_paths:

eid_m = re.match("[0-9]+", str(exp_path.split("/")[-1]))
if eid_m:
eid = eid_m.group(0)
model_path = os.path.join(BASE_SAVE_PATH, eid)

if os.path.isfile(os.path.join(exp_path, "progress.json")):
with open(os.path.join(exp_path, "progress.json"), "r") as f:
progress = json.load(f)

with open(os.path.join(exp_path, "meta.json"), "r") as f:
try:
meta = json.load(f)
except JSONDecodeError as jserr:
continue

exp_group = meta.get("experiment_group", "n/a")

if exp_group not in group_names:
continue

reward_threshold = None if meta["environment"]["reward_threshold"] == "None" else float(
meta["environment"]["reward_threshold"])

if not exp_group in experiments_by_groups.keys():
experiments_by_groups[exp_group] = {}
reward_thresholds[exp_group] = reward_threshold
environments[exp_group] = meta["environment"]["name"]

envs_available.add(meta["environment"]["name"])

experiments_by_groups[exp_group].update({
eid: progress
})

n_rows, n_cols = 1, 4
fig, axs = plt.subplots(n_rows, n_cols)
fig.set_size_inches(16, 3 * n_rows)

if not isinstance(axs[0], list):
axs = [axs]

for i, name in enumerate(group_names):
data = experiments_by_groups[name]
reward_trajectories = list(map(lambda x: x["rewards"]["mean"], data.values()))
max_length = max([len(x) for x in reward_trajectories])
padded_reward_trajectories = list(map(lambda x: np.pad(x, (0, max_length - len(x)),
mode="constant",
constant_values=np.nan), reward_trajectories))
mean_reward = np.ma.mean(np.ma.array(padded_reward_trajectories, mask=np.isnan(padded_reward_trajectories)), axis=0)
std_reward = np.ma.std(np.ma.array(padded_reward_trajectories, mask=np.isnan(padded_reward_trajectories)), axis=0)

ax = axs[i // n_cols][i % n_cols]

ax.plot(mean_reward)
ax.fill_between(range(mean_reward.shape[0]), mean_reward - std_reward, mean_reward + std_reward, alpha=.2)

ax.set_xlim(0, mean_reward.shape[0] - 1)
ax.set_ylim(np.min(mean_reward - std_reward), np.max(mean_reward + std_reward))
ax.set_xlabel("Cycle")
ax.set_title(titles[i])

if i % n_cols == 0:
ax.set_ylabel("Episode Return")

plt.savefig(f"../../../docs/figures/benchmarks/{'_'.join(titles)}_benchmark.pdf", format="pdf", bbox_inches='tight')
plt.show()
139 changes: 139 additions & 0 deletions angorapy/analysis/plotting/plot_benchmark_groups_with_beta.py
@@ -0,0 +1,139 @@
import itertools
import json
import os
import re
from json import JSONDecodeError
from typing import Iterable

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.axes import Axes

from angorapy.common.const import PATH_TO_EXPERIMENTS, BASE_SAVE_PATH, QUALITATIVE_COLOR_PALETTE

matplotlib.use('TkAgg')

group_names = {
"gaussian": [
"benchmark-performance-ant",
"benchmark-performance-walker2d",
"benchmark-performance-swimmer",
"benchmark-gaussian-reach",
"benchmark-performance-reacher",
"benchmark-performance-hopper",
"benchmark-performance-halfcheetah",
"benchmark-gaussian-freereach"
], "beta": [
"benchmark-beta-ant",
"benchmark-beta-walker2d",
"benchmark-beta-swimmer",
"benchmark-beta-reach",
"benchmark-beta-reacher",
"benchmark-beta-hopper",
"benchmark-beta-halfcheetah",
"benchmark-beta-freereach"
]
}

titles = [n.split("-")[-1].capitalize() for n in group_names[list(group_names.keys())[0]]]

# group_names = {"any": ["benchmark-performance-pdl", "benchmark-performance-llc", "benchmark-performance-cp",
# "benchmark-performance-ab"]}
# titles = ["Pendulum", "LunarLanderContinuous", "CartPole", "Acrobot"]

exp_dir = "../../../" + PATH_TO_EXPERIMENTS
experiment_paths = [os.path.join(exp_dir, p) for p in os.listdir(exp_dir)]

environments = {category: {} for category in group_names.keys()}
reward_thresholds = {category: {} for category in group_names.keys()}
experiments_by_groups = {category: {} for category in group_names.keys()}
envs_available = set()

for exp_path in experiment_paths:

eid_m = re.match("[0-9]+", str(exp_path.split("/")[-1]))
if eid_m:
eid = eid_m.group(0)
model_path = os.path.join(BASE_SAVE_PATH, eid)

if os.path.isfile(os.path.join(exp_path, "progress.json")):
with open(os.path.join(exp_path, "progress.json"), "r") as f:
progress = json.load(f)

with open(os.path.join(exp_path, "meta.json"), "r") as f:
try:
meta = json.load(f)
except JSONDecodeError as jserr:
continue

exp_group = meta.get("experiment_group", "n/a")

if exp_group not in itertools.chain(*group_names.values()):
continue

reward_threshold = None if meta["environment"]["reward_threshold"] == "None" else float(
meta["environment"]["reward_threshold"])

for category in group_names.keys():
if exp_group in group_names[category] and exp_group not in experiments_by_groups[category].keys():
experiments_by_groups[category][exp_group] = {}
reward_thresholds[category][exp_group] = reward_threshold
environments[category][exp_group] = meta["environment"]["name"]

envs_available.add(meta["environment"]["name"])

for category in group_names.keys():
if exp_group in group_names[category]:
experiments_by_groups[category][exp_group].update({
eid: progress
})

n_rows, n_cols = 2, 4
fig, axs = plt.subplots(n_rows, n_cols)
fig.set_size_inches(16, 4 * n_rows)

if not isinstance(axs[0], Iterable):
axs = [axs]


for i_cat, category in enumerate(group_names.keys()):
for i, name in enumerate(group_names[category]):
data = experiments_by_groups[category][name]
reward_trajectories = list(map(lambda x: x["rewards"]["mean"], data.values()))
max_length = max([len(x) for x in reward_trajectories])
padded_reward_trajectories = list(map(lambda x: np.pad(x, (0, max_length - len(x)),
mode="constant",
constant_values=np.nan), reward_trajectories))
mean_reward = np.ma.mean(np.ma.array(padded_reward_trajectories, mask=np.isnan(padded_reward_trajectories)),
axis=0)
std_reward = np.ma.std(np.ma.array(padded_reward_trajectories, mask=np.isnan(padded_reward_trajectories)),
axis=0)

ax: Axes = axs[i // n_cols][i % n_cols]

if reward_thresholds[category][name] is not None:
ax.axhline(reward_thresholds[category][name], color=QUALITATIVE_COLOR_PALETTE[2], ls="--")
ax.plot(mean_reward, label=category, color=QUALITATIVE_COLOR_PALETTE[i_cat])
ax.fill_between(range(mean_reward.shape[0]), mean_reward - std_reward, mean_reward + std_reward, alpha=.2)

ax.set_xlim(0, mean_reward.shape[0] - 1)
ax.set_ylim(min(np.min(mean_reward - std_reward), ax.get_ylim()[0]),
max(np.max(mean_reward + std_reward) * 1.1, ax.get_ylim()[1]))
ax.set_xlabel("Cycle")
ax.set_title(titles[i])

if titles[i] in ["Reach", "Freereach"]:
ax.set_title(titles[i], fontstyle="italic")

if i % n_cols == 0:
ax.set_ylabel("Episode Return")

if len(group_names.keys()) > 1:
ax.legend(loc="lower right")

plt.subplots_adjust(top=0.8, bottom=0.2, hspace=0.35, wspace=0.2)

plt.savefig(f"../../../docs/figures/benchmarks/{'_'.join(titles)}_benchmark_comparison.pdf", format="pdf",
bbox_inches='tight')
plt.show()
22 changes: 13 additions & 9 deletions angorapy/analysis/plotting/plot_experiment_comparison.py
@@ -3,9 +3,9 @@

from matplotlib import pyplot as plt

from common.const import PATH_TO_EXPERIMENTS
from angorapy.common.const import PATH_TO_EXPERIMENTS, QUALITATIVE_COLOR_PALETTE

experiment_ids = [ 1626374994 ]
experiment_ids = ['1653053413', '1655284851', '1654708464']

reward_developments = {}
for id in experiment_ids:
@@ -14,16 +14,20 @@
with open(os.path.join("../../../", PATH_TO_EXPERIMENTS, str(id), "progress.json")) as f:
progress = json.load(f)

exp_name = meta["hyperparameters"]["model"].upper()
exp_name = meta["hyperparameters"]["distribution"]
reward_developments[exp_name] = progress["rewards"]["mean"]

for name, rewards in reward_developments.items():
plt.plot(rewards, label=name)
for i, (name, rewards) in enumerate(reward_developments.items()):
plt.plot(rewards[:800], label=name, color=QUALITATIVE_COLOR_PALETTE[i])

plt.title("In-Hand Object Manipulation")
plt.xlabel("Cycle")
plt.ylabel("Reward")
# plt.legend()
plt.ylabel("Avg. Episode Return")
plt.legend()

plt.gcf().set_size_inches(8, 4)
plt.xlim(0, 800)
plt.ylim(0)

plt.gcf().set_size_inches(16, 4)
# plt.show()
plt.savefig("manipulate-progress.pdf", format="pdf", bbox_inches="tight")
plt.savefig("../../../docs/figures/manipulate-progress.pdf", format="pdf", bbox_inches="tight")