if __name__ == "__main__":
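    # When run as a script, make the repo root importable and the working
    # directory, so `equi_diffpo.*` imports and relative paths resolve.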
import sys
import os
import pathlib
ROOT_DIR = str(pathlib.Path(__file__).parent.parent.parent)
sys.path.append(ROOT_DIR)
os.chdir(ROOT_DIR)
import os
import hydra
import torch
import dill
from omegaconf import OmegaConf
import pathlib
from torch.utils.data import DataLoader
import copy
import random
import wandb
import tqdm
import numpy as np
from termcolor import cprint
import shutil
import time
import threading
from hydra.core.hydra_config import HydraConfig
from equi_diffpo.policy.dp3 import DP3
from equi_diffpo.dataset.base_dataset import BaseImageDataset
from equi_diffpo.env_runner.base_image_runner import BaseImageRunner
from equi_diffpo.common.checkpoint_util import TopKCheckpointManager
from equi_diffpo.common.pytorch_util import dict_apply, optimizer_to
from equi_diffpo.model.diffusion.ema_model import EMAModel
from equi_diffpo.model.common.lr_scheduler import get_scheduler
OmegaConf.register_new_resolver("eval", eval, replace=True)
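# save_checkpoint(use_thread=True) below references _copy_to_cpu, which this
# file never defines. A minimal sketch, assumed equivalent to the helper in
# diffusion_policy's BaseWorkspace: recursively detach tensors to CPU so a
# background thread can serialize them safely.
def _copy_to_cpu(x):
    if isinstance(x, torch.Tensor):
        return x.detach().to('cpu')
    elif isinstance(x, dict):
        return {k: _copy_to_cpu(v) for k, v in x.items()}
    elif isinstance(x, list):
        return [_copy_to_cpu(v) for v in x]
    else:
        return copy.deepcopy(x)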
class TrainDP3Workspace:
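    """Training workspace for DP3 (3D Diffusion Policy).

    Owns the policy, its EMA copy, and the optimizer, and drives the training
    loop: per-epoch training, periodic env rollouts, validation, diffusion
    sampling MSE, wandb logging, and top-k / latest checkpointing.
    """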
include_keys = ['global_step', 'epoch']
exclude_keys = tuple()
def __init__(self, cfg: OmegaConf, output_dir=None):
self.cfg = cfg
self._output_dir = output_dir
self._saving_thread = None
# set seed
seed = cfg.training.seed
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
# configure model
self.model: DP3 = hydra.utils.instantiate(cfg.policy)
self.ema_model: DP3 = None
if cfg.training.use_ema:
try:
self.ema_model = copy.deepcopy(self.model)
            except Exception:  # Minkowski Engine modules cannot be deep-copied; recreate instead
self.ema_model = hydra.utils.instantiate(cfg.policy)
# configure training state
self.optimizer = hydra.utils.instantiate(
cfg.optimizer, params=self.model.parameters())
# configure training state
self.global_step = 0
self.epoch = 0
def run(self):
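        """Train for cfg.training.num_epochs, interleaving rollout, validation,
        sampling, logging, and checkpointing at their configured intervals."""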
cfg = copy.deepcopy(self.cfg)
if cfg.training.debug:
cfg.training.num_epochs = 100
cfg.training.max_train_steps = 10
cfg.training.max_val_steps = 3
cfg.training.rollout_every = 20
cfg.training.checkpoint_every = 1
cfg.training.val_every = 1
cfg.training.sample_every = 1
RUN_ROLLOUT = True
RUN_CKPT = False
verbose = True
else:
RUN_ROLLOUT = True
RUN_CKPT = True
verbose = False
RUN_VALIDATION = False # reduce time cost
# resume training
if cfg.training.resume:
            latest_ckpt_path = self.get_checkpoint_path()
            if latest_ckpt_path.is_file():
                print(f"Resuming from checkpoint {latest_ckpt_path}")
                self.load_checkpoint(path=latest_ckpt_path)
# configure dataset
dataset: BaseImageDataset
dataset = hydra.utils.instantiate(cfg.task.dataset)
        assert isinstance(dataset, BaseImageDataset), \
            f"dataset must be a BaseImageDataset, got {type(dataset)}"
train_dataloader = DataLoader(dataset, **cfg.dataloader)
normalizer = dataset.get_normalizer()
# configure validation dataset
val_dataset = dataset.get_validation_dataset()
val_dataloader = DataLoader(val_dataset, **cfg.val_dataloader)
self.model.set_normalizer(normalizer)
if cfg.training.use_ema:
self.ema_model.set_normalizer(normalizer)
# configure lr scheduler
lr_scheduler = get_scheduler(
cfg.training.lr_scheduler,
optimizer=self.optimizer,
num_warmup_steps=cfg.training.lr_warmup_steps,
num_training_steps=(
len(train_dataloader) * cfg.training.num_epochs) \
// cfg.training.gradient_accumulate_every,
# pytorch assumes stepping LRScheduler every epoch
# however huggingface diffusers steps it every batch
last_epoch=self.global_step-1
)
# configure ema
ema: EMAModel = None
if cfg.training.use_ema:
ema = hydra.utils.instantiate(
cfg.ema,
model=self.ema_model)
# configure env
env_runner: BaseImageRunner
env_runner = hydra.utils.instantiate(
cfg.task.env_runner,
output_dir=self.output_dir)
if env_runner is not None:
assert isinstance(env_runner, BaseImageRunner)
cfg.logging.name = str(cfg.logging.name)
cprint("-----------------------------", "yellow")
cprint(f"[WandB] group: {cfg.logging.group}", "yellow")
cprint(f"[WandB] name: {cfg.logging.name}", "yellow")
cprint("-----------------------------", "yellow")
# configure logging
wandb_run = wandb.init(
dir=str(self.output_dir),
config=OmegaConf.to_container(cfg, resolve=True),
**cfg.logging
)
wandb.config.update(
{
"output_dir": self.output_dir,
}
)
# configure checkpoint
topk_manager = TopKCheckpointManager(
save_dir=os.path.join(self.output_dir, 'checkpoints'),
**cfg.checkpoint.topk
)
# device transfer
device = torch.device(cfg.training.device)
self.model.to(device)
if self.ema_model is not None:
self.ema_model.to(device)
optimizer_to(self.optimizer, device)
# save batch for sampling
train_sampling_batch = None
# training loop
log_path = os.path.join(self.output_dir, 'logs.json.txt')
while self.epoch < cfg.training.num_epochs:
step_log = dict()
# ========= train for this epoch ==========
train_losses = list()
with tqdm.tqdm(train_dataloader, desc=f"Training epoch {self.epoch}",
leave=False, mininterval=cfg.training.tqdm_interval_sec) as tepoch:
for batch_idx, batch in enumerate(tepoch):
t1 = time.time()
# device transfer
batch = dict_apply(batch, lambda x: x.to(device, non_blocking=True))
if train_sampling_batch is None:
train_sampling_batch = batch
# compute loss
t1_1 = time.time()
raw_loss, loss_dict = self.model.compute_loss(batch)
loss = raw_loss / cfg.training.gradient_accumulate_every
loss.backward()
t1_2 = time.time()
# step optimizer
if self.global_step % cfg.training.gradient_accumulate_every == 0:
self.optimizer.step()
self.optimizer.zero_grad()
lr_scheduler.step()
t1_3 = time.time()
# update ema
if cfg.training.use_ema:
ema.step(self.model)
t1_4 = time.time()
# logging
raw_loss_cpu = raw_loss.item()
tepoch.set_postfix(loss=raw_loss_cpu, refresh=False)
train_losses.append(raw_loss_cpu)
step_log = {
'train_loss': raw_loss_cpu,
'global_step': self.global_step,
'epoch': self.epoch,
'lr': lr_scheduler.get_last_lr()[0]
}
t1_5 = time.time()
step_log.update(loss_dict)
t2 = time.time()
if verbose:
print(f"total one step time: {t2-t1:.3f}")
print(f" compute loss time: {t1_2-t1_1:.3f}")
print(f" step optimizer time: {t1_3-t1_2:.3f}")
print(f" update ema time: {t1_4-t1_3:.3f}")
print(f" logging time: {t1_5-t1_4:.3f}")
is_last_batch = (batch_idx == (len(train_dataloader)-1))
if not is_last_batch:
# log of last step is combined with validation and rollout
wandb_run.log(step_log, step=self.global_step)
self.global_step += 1
if (cfg.training.max_train_steps is not None) \
and batch_idx >= (cfg.training.max_train_steps-1):
break
# at the end of each epoch
# replace train_loss with epoch average
train_loss = np.mean(train_losses)
step_log['train_loss'] = train_loss
# ========= eval for this epoch ==========
policy = self.model
if cfg.training.use_ema:
policy = self.ema_model
policy.eval()
# run rollout
if (self.epoch % cfg.training.rollout_every) == 0 and RUN_ROLLOUT and env_runner is not None:
t3 = time.time()
# runner_log = env_runner.run(policy, dataset=dataset)
runner_log = env_runner.run(policy)
t4 = time.time()
# print(f"rollout time: {t4-t3:.3f}")
# log all
step_log.update(runner_log)
# run validation
if (self.epoch % cfg.training.val_every) == 0 and RUN_VALIDATION:
with torch.no_grad():
val_losses = list()
with tqdm.tqdm(val_dataloader, desc=f"Validation epoch {self.epoch}",
leave=False, mininterval=cfg.training.tqdm_interval_sec) as tepoch:
for batch_idx, batch in enumerate(tepoch):
batch = dict_apply(batch, lambda x: x.to(device, non_blocking=True))
loss, loss_dict = self.model.compute_loss(batch)
val_losses.append(loss)
if (cfg.training.max_val_steps is not None) \
and batch_idx >= (cfg.training.max_val_steps-1):
break
if len(val_losses) > 0:
val_loss = torch.mean(torch.tensor(val_losses)).item()
# log epoch average validation loss
step_log['val_loss'] = val_loss
# run diffusion sampling on a training batch
if (self.epoch % cfg.training.sample_every) == 0:
with torch.no_grad():
# sample trajectory from training set, and evaluate difference
batch = dict_apply(train_sampling_batch, lambda x: x.to(device, non_blocking=True))
obs_dict = batch['obs']
gt_action = batch['action']
result = policy.predict_action(obs_dict)
pred_action = result['action_pred']
mse = torch.nn.functional.mse_loss(pred_action, gt_action)
step_log['train_action_mse_error'] = mse.item()
del batch
del obs_dict
del gt_action
del result
del pred_action
del mse
            if env_runner is None:
                # no simulator available: use negative train loss as a proxy
                # score so top-k checkpointing still has a metric to rank on
                step_log['test_mean_score'] = -train_loss
# checkpoint
if (self.epoch % cfg.training.checkpoint_every) == 0 and cfg.checkpoint.save_ckpt:
# checkpointing
if cfg.checkpoint.save_last_ckpt:
self.save_checkpoint()
if cfg.checkpoint.save_last_snapshot:
self.save_snapshot()
# sanitize metric names
metric_dict = dict()
for key, value in step_log.items():
new_key = key.replace('/', '_')
metric_dict[new_key] = value
                # We can't copy the last checkpoint here,
                # because save_checkpoint may write from a background thread,
                # so the file could still be empty at this point.
topk_ckpt_path = topk_manager.get_ckpt_path(metric_dict)
if topk_ckpt_path is not None:
self.save_checkpoint(path=topk_ckpt_path)
# ========= eval end for this epoch ==========
policy.train()
# end of epoch
# log of last step is combined with validation and rollout
wandb_run.log(step_log, step=self.global_step)
self.global_step += 1
self.epoch += 1
del step_log
def eval(self):
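        """Load the latest checkpoint and report env-rollout metrics."""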
# load the latest checkpoint
cfg = copy.deepcopy(self.cfg)
        latest_ckpt_path = self.get_checkpoint_path(tag="latest")
        if latest_ckpt_path.is_file():
            cprint(f"Resuming from checkpoint {latest_ckpt_path}", 'magenta')
            self.load_checkpoint(path=latest_ckpt_path)
# configure env
env_runner: BaseImageRunner
env_runner = hydra.utils.instantiate(
cfg.task.env_runner,
output_dir=self.output_dir)
assert isinstance(env_runner, BaseImageRunner)
policy = self.model
if cfg.training.use_ema:
policy = self.ema_model
policy.eval()
policy.cuda()
runner_log = env_runner.run(policy)
cprint(f"---------------- Eval Results --------------", 'magenta')
for key, value in runner_log.items():
if isinstance(value, float):
cprint(f"{key}: {value:.4f}", 'magenta')
@property
def output_dir(self):
output_dir = self._output_dir
if output_dir is None:
output_dir = HydraConfig.get().runtime.output_dir
return output_dir
def save_checkpoint(self, path=None, tag='latest',
exclude_keys=None,
include_keys=None,
use_thread=False):
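        """Serialize the workspace to `path` (default: checkpoints/<tag>.ckpt).

        The payload holds the config, a state_dict for every attribute that
        exposes state_dict()/load_state_dict() (model, EMA model, optimizer),
        and dill pickles of `include_keys` plus '_output_dir'. With
        use_thread=True, tensors are copied to CPU first and written from a
        background thread.
        """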
if path is None:
path = pathlib.Path(self.output_dir).joinpath('checkpoints', f'{tag}.ckpt')
else:
path = pathlib.Path(path)
if exclude_keys is None:
exclude_keys = tuple(self.exclude_keys)
if include_keys is None:
include_keys = tuple(self.include_keys) + ('_output_dir',)
path.parent.mkdir(parents=False, exist_ok=True)
payload = {
'cfg': self.cfg,
'state_dicts': dict(),
'pickles': dict()
}
for key, value in self.__dict__.items():
if hasattr(value, 'state_dict') and hasattr(value, 'load_state_dict'):
# modules, optimizers and samplers etc
if key not in exclude_keys:
if use_thread:
payload['state_dicts'][key] = _copy_to_cpu(value.state_dict())
else:
payload['state_dicts'][key] = value.state_dict()
elif key in include_keys:
payload['pickles'][key] = dill.dumps(value)
if use_thread:
self._saving_thread = threading.Thread(
target=lambda : torch.save(payload, path.open('wb'), pickle_module=dill))
self._saving_thread.start()
else:
torch.save(payload, path.open('wb'), pickle_module=dill)
del payload
torch.cuda.empty_cache()
return str(path.absolute())
def get_checkpoint_path(self, tag='latest'):
if tag=='latest':
return pathlib.Path(self.output_dir).joinpath('checkpoints', f'{tag}.ckpt')
elif tag=='best':
# the checkpoints are saved as format: epoch={}-test_mean_score={}.ckpt
# find the best checkpoint
checkpoint_dir = pathlib.Path(self.output_dir).joinpath('checkpoints')
all_checkpoints = os.listdir(checkpoint_dir)
            best_ckpt = None
            best_score = -1e10
            for ckpt in all_checkpoints:
                # skip 'latest.ckpt' and any file that does not carry a score
                if 'test_mean_score=' not in ckpt:
                    continue
                score = float(ckpt.split('test_mean_score=')[1].split('.ckpt')[0])
                if score > best_score:
                    best_ckpt = ckpt
                    best_score = score
            assert best_ckpt is not None, \
                f"no scored checkpoint found in {checkpoint_dir}"
            return pathlib.Path(self.output_dir).joinpath('checkpoints', best_ckpt)
else:
raise NotImplementedError(f"tag {tag} not implemented")
def load_payload(self, payload, exclude_keys=None, include_keys=None, **kwargs):
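        """Restore state_dicts and dill-pickled attributes from a checkpoint payload."""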
if exclude_keys is None:
exclude_keys = tuple()
if include_keys is None:
include_keys = payload['pickles'].keys()
for key, value in payload['state_dicts'].items():
if key not in exclude_keys:
self.__dict__[key].load_state_dict(value, **kwargs)
for key in include_keys:
if key in payload['pickles']:
self.__dict__[key] = dill.loads(payload['pickles'][key])
def load_checkpoint(self, path=None, tag='latest',
exclude_keys=None,
include_keys=None,
**kwargs):
if path is None:
path = self.get_checkpoint_path(tag=tag)
else:
path = pathlib.Path(path)
payload = torch.load(path.open('rb'), pickle_module=dill, map_location='cpu')
self.load_payload(payload,
exclude_keys=exclude_keys,
include_keys=include_keys)
return payload
@classmethod
def create_from_checkpoint(cls, path,
exclude_keys=None,
include_keys=None,
**kwargs):
payload = torch.load(open(path, 'rb'), pickle_module=dill)
instance = cls(payload['cfg'])
instance.load_payload(
payload=payload,
exclude_keys=exclude_keys,
include_keys=include_keys,
**kwargs)
return instance
def save_snapshot(self, tag='latest'):
"""
Quick loading and saving for reserach, saves full state of the workspace.
However, loading a snapshot assumes the code stays exactly the same.
Use save_checkpoint for long-term storage.
"""
path = pathlib.Path(self.output_dir).joinpath('snapshots', f'{tag}.pkl')
path.parent.mkdir(parents=False, exist_ok=True)
torch.save(self, path.open('wb'), pickle_module=dill)
return str(path.absolute())
@classmethod
def create_from_snapshot(cls, path):
return torch.load(open(path, 'rb'), pickle_module=dill)
@hydra.main(
    version_base=None,
    # this file lives in equi_diffpo/workspace/, so the config directory
    # sits one package level up at equi_diffpo/config
    config_path=str(pathlib.Path(__file__).parent.parent.joinpath('config'))
)
def main(cfg):
workspace = TrainDP3Workspace(cfg)
workspace.run()
if __name__ == "__main__":
main()
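# Example launch (hypothetical config/task names -- use whatever YAML actually
# lives in equi_diffpo/config; `--config-name` is required since no default
# config_name is set above):
#   python equi_diffpo/workspace/train_dp3_workspace.py --config-name=dp3 \
#       task=<your_task> training.seed=42 training.device=cuda:0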