# Roll out a pretrained value-guided diffusion planner in a gym environment and
# report the accumulated reward / normalized score step by step.

# Kept although unused by name (hence the noqa); presumably imported for its
# side effects on gym's environment registry -- confirm before removing.
import d4rl  # noqa
import gym
import tqdm
from diffusers.experimental import ValueGuidedRLPipeline


# Sampling settings for the planner. Only `n_samples`, `horizon`,
# `n_guide_steps` and `scale` are forwarded to the pipeline call in `main()`;
# NOTE(review): the remaining keys are not read anywhere in this script.
config = dict(
    n_samples=64,
    horizon=32,
    num_inference_steps=20,
    n_guide_steps=2,  # can set to 0 for faster sampling, does not use value network
    scale_grad_by_std=True,
    scale=0.1,
    eta=0.0,
    t_grad_cutoff=2,
    device="cpu",
)


def main():
    """Run one episode with the value-guided pipeline as the policy.

    Builds the environment and pipeline, then steps the environment for at
    most ``max_steps`` steps (stopping early when the environment reports
    termination or the user presses Ctrl-C) and prints the total reward.
    """
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    max_steps = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(max_steps)):
            # call the policy, driven by `config` rather than hard-coded values
            denorm_actions = pipeline(
                obs,
                batch_size=config["n_samples"],
                planning_horizon=config["horizon"],
                n_guide_steps=config["n_guide_steps"],
                scale=config["scale"],
            )

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)

            # update return first so the score reflects this step's reward too
            total_reward += reward
            score = env.get_normalized_score(total_reward)
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            # do not keep stepping an environment that has already finished
            if terminal:
                break

            obs = next_observation
    except KeyboardInterrupt:
        # Deliberate: Ctrl-C ends the rollout early but still reports the
        # reward collected so far.
        pass

    print(f"Total reward: {total_reward}")


if __name__ == "__main__":
    main()