Update README.md
README.md
CHANGED
@@ -1,25 +1,27 @@
- ---
- library_name: stable-baselines3
- tags:
- - BipedalWalker-v3
- - deep-reinforcement-learning
- - reinforcement-learning
- - stable-baselines3
- name:
+ ---
+ library_name: stable-baselines3
+ tags:
+ - BipedalWalker-v3
+ - deep-reinforcement-learning
+ - reinforcement-learning
+ - stable-baselines3
+ - Bipedal
+ - OpenAI
+ model-index:
+ - name: PPO
+   results:
+   - task:
+       type: reinforcement-learning
+       name: reinforcement-learning
+     dataset:
+       name: BipedalWalker-v3
+       type: BipedalWalker-v3
+     metrics:
+     - type: mean_reward
+       value: '-58.54 +/- 39.24'
+       name: mean_reward
+       verified: false
+ ---

# **PPO** Agent playing **BipedalWalker-v3**
This is a trained model of a **PPO** agent playing **BipedalWalker-v3**

@@ -33,5 +35,250 @@ TODO: Add your code
from stable_baselines3 import ...
from huggingface_sb3 import load_from_hub

"""# **1. Setup**

### **Install Packages**
"""

# Install necessary packages
!apt install swig cmake ffmpeg xvfb python3-opengl
!pip install stable-baselines3==2.0.0a5 gymnasium[box2d] huggingface_sb3 pyvirtualdisplay imageio[ffmpeg]

"""The next cell forces the notebook runtime to restart, so that all the newly installed libraries are picked up."""

import os
os.kill(os.getpid(), 9)

"""### **Start Virtual Display**"""

from pyvirtualdisplay import Display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

"""### **Setup Environment**"""

import gymnasium as gym
env = gym.make("BipedalWalker-v3", hardcore=True)
env.reset()

"""### **Observation Space**

The observation is a vector of size 24 (shape `(24,)`), where each value carries a different piece of information about the walker:

- **Hull Angle Speed**: The speed at which the main body (hull) of the walker is rotating.
- **Angular Velocity**: The rate of change of the hull's angular position.
- **Horizontal Speed**: The speed at which the walker is moving horizontally.
- **Vertical Speed**: The speed at which the walker is moving vertically.
- **Position of Joints**: The positions (angles) of the walker's joints. With 4 joints, this takes up 4 values.
- **Joints Angular Speed**: The rate of change of the angular position of each joint. Again, 4 values for the 4 joints.
- **Legs Contact with Ground**: Flags indicating whether each leg is in contact with the ground. With two legs, this contributes 2 values.
- **10 Lidar Rangefinder Measurements**: Distance measurements used to detect obstacles or terrain features around the walker, contributing 10 values.
"""

print("_____OBSERVATION SPACE_____ \n")
print("Observation Space Shape", env.observation_space.shape)
print("Sample observation", env.observation_space.sample())  # Get a random observation

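# A rough illustration of the layout described above: slice one sampled
# observation into the groups listed in the docstring. The exact index order
# is an assumption based on the Gymnasium BipedalWalker documentation, not
# something the environment reports itself.
sample = env.observation_space.sample()
hull_state = sample[0:4]    # hull angle, hull angular velocity, horizontal and vertical speed
joint_state = sample[4:14]  # joint angles/speeds plus the two leg-contact flags
lidar = sample[14:24]       # the 10 lidar rangefinder readings
print("hull:", hull_state, "\njoints/contacts:", joint_state, "\nlidar:", lidar)
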
"""### **Action Space**

Actions are motor speed values in the [-1, 1] range for each of the 4 joints (both hips and knees).
"""

print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.shape)
print("Action Space Sample", env.action_space.sample())  # Take a random action

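# A minimal sketch of one interaction step with the (non-vectorized) env,
# assuming the standard Gymnasium API: step() takes the 4-dimensional action
# and returns the next observation, the reward and the termination flags.
action = env.action_space.sample()
obs, reward, terminated, truncated, info = env.step(action)
print("reward:", reward, "terminated:", terminated, "truncated:", truncated)
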
"""### **Vectorized Environment**

Create a vectorized environment (multiple independent environments stacked into a single one) with 16 copies, so the agent collects more diverse experience at each step.
"""

from stable_baselines3.common.env_util import make_vec_env
env = make_vec_env('BipedalWalker-v3', n_envs=16)

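# Quick sanity check (a sketch, not required for training): resetting the
# vectorized env returns one observation per copy, i.e. an array of shape (16, 24).
vec_obs = env.reset()
print("Vectorized observation batch shape:", vec_obs.shape)
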
"""# **2. Building the Model**"""

from stable_baselines3 import PPO
model = PPO(
    policy='MlpPolicy',  # multi-layer perceptron policy for the 24-dim vector observation
    env=env,
    n_steps=2048,        # rollout length per environment before each update
    batch_size=128,      # minibatch size for each gradient step
    n_epochs=6,          # optimization epochs per rollout
    gamma=0.999,         # discount factor
    gae_lambda=0.98,     # GAE smoothing factor
    ent_coef=0.01,       # entropy bonus to encourage exploration
    verbose=1)

"""# **3. Video Generation**"""

from wasabi import Printer
import numpy as np
from stable_baselines3.common.base_class import BaseAlgorithm
from pathlib import Path
import tempfile
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import (
    DummyVecEnv,
    VecEnv,
    VecVideoRecorder,
)

msg = Printer()

def generate_replay(
    model: BaseAlgorithm,
    eval_env: VecEnv,
    video_length: int,
    is_deterministic: bool,
    local_path: Path,
):
    """
    Generate a replay video of the agent

    :param model: trained model
    :param eval_env: environment used to evaluate the agent
    :param video_length: length of the video (in timesteps)
    :param is_deterministic: use deterministic or stochastic actions
    :param local_path: path of the local repository
    """
    # This is another temporary directory for video outputs.
    # SB3 creates -step-0-to-... meta files as well as other
    # artifacts which we don't want in the repo.
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Step 1: Create the VecVideoRecorder
        env = VecVideoRecorder(
            eval_env,
            tmpdirname,
            record_video_trigger=lambda x: x == 0,
            video_length=video_length,
            name_prefix="",
        )

        obs = env.reset()
        lstm_states = None
        episode_starts = np.ones((env.num_envs,), dtype=bool)

        try:
            for _ in range(video_length):
                action, lstm_states = model.predict(
                    obs,
                    state=lstm_states,
                    episode_start=episode_starts,
                    deterministic=is_deterministic,
                )
                obs, _, episode_starts, _ = env.step(action)

            # Save the video
            env.close()

            # Convert the video with the x264 codec
            inp = env.video_recorder.path
            out = local_path
            os.system(f"ffmpeg -y -i {inp} -vcodec h264 {out}")
            print(f"Video saved to: {out}")
        except KeyboardInterrupt:
            pass
        except Exception as e:
            msg.fail(str(e))
            # Let the user know the replay could not be generated
            msg.fail(
                "We are unable to generate a replay of your agent"
            )

"""# **4. Training, Saving and Recording Videos**"""

import os

# Create a directory to save the videos
video_dir = "/content/videos"
if not os.path.exists(video_dir):
    os.makedirs(video_dir)

env_id = "BipedalWalker-v3"
# Train and generate a video every 100000 steps; adjust the timesteps to your liking
for i in range(0, 2000000, 100000):
    model.learn(total_timesteps=100000)
    # Save the model
    model_name = "ppo-BipedalWalker-v3"
    model.save(model_name)
    video_name = f"replay_{i + 100000}.mp4"
    generate_replay(
        model=model,
        eval_env=DummyVecEnv([lambda: Monitor(gym.make(env_id, hardcore=True, render_mode="rgb_array"))]),
        video_length=1000,
        is_deterministic=True,
        local_path=os.path.join(video_dir, video_name)
    )

model_name = "ppo-BipedalWalker-v3"
model.save(model_name)

with open(os.path.join(video_dir, "filelist.txt"), "w") as f:
    for i in range(0, 2000000, 100000):
        video_name = f"replay_{i + 100000}.mp4"
        f.write(f"file '{os.path.join(video_dir, video_name)}'\n")
# Concatenate all the videos into one
os.system(f"ffmpeg -f concat -safe 0 -i {os.path.join(video_dir, 'filelist.txt')} -c copy {os.path.join(video_dir, 'replay_all.mp4')}")

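# Colab sessions can disconnect during long runs. Because the loop above saves
# "ppo-BipedalWalker-v3" after every chunk of training, a fresh session can pick
# the checkpoint back up. A minimal sketch, assuming the saved zip is present:
model = PPO.load("ppo-BipedalWalker-v3", env=env)
# Training could then continue with model.learn(..., reset_num_timesteps=False).
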
"""# **5. Visualize Final Video**"""

from IPython.display import HTML
from base64 import b64encode
mp4 = open(os.path.join(video_dir, 'replay_all.mp4'), 'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=600 controls>
  <source src="%s" type="video/mp4">
</video>
""" % data_url)

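# Optional, assuming a Colab runtime: download the stitched replay to your machine.
from google.colab import files
files.download(os.path.join(video_dir, "replay_all.mp4"))
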
"""# **6. Evaluate the Model**"""

from stable_baselines3.common.evaluation import evaluate_policy

eval_env = Monitor(gym.make("BipedalWalker-v3"))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

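# The mean_reward metric in the YAML header above has the form 'mean +/- std'.
# A small sketch composing the same string from this evaluation (the card value
# itself is written by package_to_hub during the upload step below).
card_metric = f"{mean_reward:.2f} +/- {std_reward:.2f}"
print("mean_reward metric for the model card:", card_metric)
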
"""# **7. Upload to HuggingFace**"""

from huggingface_sb3 import load_from_hub, package_to_hub
from huggingface_hub import notebook_login  # To log in to our Hugging Face account so we can upload models to the Hub

notebook_login()
!git config --global credential.helper store

env_id = "BipedalWalker-v3"
model_name = "ppo-BipedalWalker-v3"
model_architecture = "PPO"

repo_id = "Mahanthesh0r/BipedalWalker-RL"  # Change this to your repo id

# Define the commit message
commit_message = "Upload PPO BipedalWalker-v3 trained agent"

# Create the evaluation env and set render_mode="rgb_array"
eval_env = DummyVecEnv([lambda: gym.make(env_id, hardcore=True, render_mode="rgb_array")])

package_to_hub(model=model,  # trained model
               model_name=model_name,  # the name of our trained model
               model_architecture=model_architecture,  # the model architecture we used: in our case PPO
               env_id=env_id,  # name of the environment
               eval_env=eval_env,
               repo_id=repo_id,
               commit_message=commit_message)

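# After the upload, the model page lives at https://huggingface.co/<repo_id>.
# A small convenience sketch printing that URL:
print(f"Model pushed to: https://huggingface.co/{repo_id}")
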
"""# **8. Load Models from HuggingFace (Optional)**"""

from huggingface_sb3 import load_from_hub
repo_id = "Mahanthesh0r/BipedalWalker-RL"  # The repo id
filename = "ppo-BipedalWalker-v3.zip"  # The model filename (.zip)

checkpoint = load_from_hub(repo_id, filename)
model = PPO.load(checkpoint, print_system_info=True)

eval_env = Monitor(gym.make("BipedalWalker-v3", hardcore=True))
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

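# Optional sketch: roll the loaded policy out for a few steps to sanity-check it,
# using the eval_env defined just above (Gymnasium API, deterministic actions).
obs, _ = eval_env.reset()
for _ in range(100):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = eval_env.step(action)
    if terminated or truncated:
        obs, _ = eval_env.reset()
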
...
```
|