arad1367 committed on
Commit 9227b81 · verified · 1 Parent(s): ca323d4

Update index.html

Files changed (1)
  1. index.html +1373 -18
index.html CHANGED
@@ -1,19 +1,1374 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
  </html>
 
1
+ <!-- PPO Simulation By Pejman Ebrahimi -->
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+ <head>
5
+ <meta charset="UTF-8" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>PPO Reinforcement Learning Simulation</title>
8
+ <style>
9
+ body {
10
+ font-family: Arial, sans-serif;
11
+ margin: 0;
12
+ padding: 20px;
13
+ line-height: 1.6;
14
+ color: #333;
15
+ background-color: #f8f9fa;
16
+ }
17
+ .container {
18
+ max-width: 1000px;
19
+ margin: 0 auto;
20
+ background-color: white;
21
+ padding: 20px;
22
+ border-radius: 8px;
23
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
24
+ }
25
+ h1,
26
+ h2,
27
+ h3 {
28
+ color: #2c3e50;
29
+ }
30
+ h1 {
31
+ text-align: center;
32
+ margin-bottom: 30px;
33
+ border-bottom: 2px solid #3498db;
34
+ padding-bottom: 10px;
35
+ }
36
+ .grid-container {
37
+ display: grid;
38
+ grid-template-columns: repeat(10, 1fr);
39
+ gap: 2px;
40
+ margin: 20px 0;
41
+ }
42
+ .cell {
43
+ width: 100%;
44
+ aspect-ratio: 1;
45
+ background-color: #ecf0f1;
46
+ display: flex;
47
+ align-items: center;
48
+ justify-content: center;
49
+ cursor: pointer;
50
+ position: relative;
51
+ transition: all 0.3s;
52
+ }
53
+ .agent {
54
+ background-color: #3498db;
55
+ border-radius: 50%;
56
+ width: 80%;
57
+ height: 80%;
58
+ position: absolute;
59
+ }
60
+ .goal {
61
+ background-color: #2ecc71;
62
+ width: 80%;
63
+ height: 80%;
64
+ position: absolute;
65
+ }
66
+ .obstacle {
67
+ background-color: #e74c3c;
68
+ width: 80%;
69
+ height: 80%;
70
+ position: absolute;
71
+ }
72
+ .panel {
73
+ background-color: #f5f7f9;
74
+ padding: 15px;
75
+ border-radius: 5px;
76
+ margin-bottom: 20px;
77
+ border: 1px solid #ddd;
78
+ }
79
+ .controls {
80
+ display: flex;
81
+ gap: 10px;
82
+ flex-wrap: wrap;
83
+ margin: 20px 0;
84
+ }
85
+ button {
86
+ padding: 8px 15px;
87
+ background-color: #3498db;
88
+ color: white;
89
+ border: none;
90
+ border-radius: 4px;
91
+ cursor: pointer;
92
+ transition: background-color 0.3s;
93
+ }
94
+ button:hover {
95
+ background-color: #2980b9;
96
+ }
97
+ button:disabled {
98
+ background-color: #95a5a6;
99
+ cursor: not-allowed;
100
+ }
101
+ .sliders {
102
+ display: flex;
103
+ flex-direction: column;
104
+ gap: 10px;
105
+ margin: 15px 0;
106
+ }
107
+ .slider-container {
108
+ display: flex;
109
+ align-items: center;
110
+ }
111
+ .slider-container label {
112
+ flex: 1;
113
+ min-width: 180px;
114
+ }
115
+ .slider-container input {
116
+ flex: 2;
117
+ }
118
+ .slider-value {
119
+ flex: 0 0 50px;
120
+ text-align: right;
121
+ }
122
+ #log-container {
123
+ max-height: 200px;
124
+ overflow-y: auto;
125
+ background-color: #2c3e50;
126
+ color: #ecf0f1;
127
+ padding: 10px;
128
+ border-radius: 4px;
129
+ margin-top: 20px;
130
+ font-family: monospace;
131
+ }
132
+ .log-entry {
133
+ margin: 5px 0;
134
+ }
135
+ .tab-container {
136
+ margin-top: 20px;
137
+ }
138
+ .tab-buttons {
139
+ display: flex;
140
+ border-bottom: 1px solid #ddd;
141
+ }
142
+ .tab-button {
143
+ padding: 10px 20px;
144
+ background-color: #f1f1f1;
145
+ border: none;
146
+ cursor: pointer;
147
+ transition: background-color 0.3s;
148
+ }
149
+ .tab-button.active {
150
+ background-color: #3498db;
151
+ color: white;
152
+ }
153
+ .tab-content {
154
+ display: none;
155
+ padding: 15px;
156
+ border: 1px solid #ddd;
157
+ border-top: none;
158
+ animation: fadeIn 0.5s;
159
+ }
160
+ .tab-content.active {
161
+ display: block;
162
+ }
163
+ #policy-display {
164
+ width: 100%;
165
+ height: 300px;
166
+ overflow: auto;
167
+ margin-top: 10px;
168
+ }
169
+ .policy-grid {
170
+ display: grid;
171
+ grid-template-columns: repeat(10, 1fr);
172
+ gap: 2px;
173
+ }
174
+ .policy-cell {
175
+ aspect-ratio: 1;
176
+ border: 1px solid #ddd;
177
+ padding: 2px;
178
+ font-size: 10px;
179
+ display: flex;
180
+ flex-direction: column;
181
+ align-items: center;
182
+ justify-content: center;
183
+ }
184
+ .arrow {
185
+ width: 0;
186
+ height: 0;
187
+ border-style: solid;
188
+ margin: 2px;
189
+ }
190
+ .arrow-up {
191
+ border-width: 0 4px 8px 4px;
192
+ border-color: transparent transparent #3498db transparent;
193
+ }
194
+ .arrow-right {
195
+ border-width: 4px 0 4px 8px;
196
+ border-color: transparent transparent transparent #3498db;
197
+ }
198
+ .arrow-down {
199
+ border-width: 8px 4px 0 4px;
200
+ border-color: #3498db transparent transparent transparent;
201
+ }
202
+ .arrow-left {
203
+ border-width: 4px 8px 4px 0;
204
+ border-color: transparent #3498db transparent transparent;
205
+ }
206
+ .progress-container {
207
+ margin-top: 10px;
208
+ background-color: #f1f1f1;
209
+ border-radius: 5px;
210
+ height: 20px;
211
+ position: relative;
212
+ }
213
+ .progress-bar {
214
+ height: 100%;
215
+ background-color: #3498db;
216
+ border-radius: 5px;
217
+ width: 0%;
218
+ transition: width 0.3s;
219
+ }
220
+ .chart-container {
221
+ height: 300px;
222
+ margin: 15px 0;
223
+ }
224
+ @keyframes fadeIn {
225
+ from {
226
+ opacity: 0;
227
+ }
228
+ to {
229
+ opacity: 1;
230
+ }
231
+ }
232
+ .popup {
233
+ display: none;
234
+ position: fixed;
235
+ top: 50%;
236
+ left: 50%;
237
+ transform: translate(-50%, -50%);
238
+ background-color: white;
239
+ padding: 20px;
240
+ border-radius: 8px;
241
+ box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
242
+ z-index: 1000;
243
+ max-width: 80%;
244
+ max-height: 80%;
245
+ overflow-y: auto;
246
+ }
247
+ .popup-overlay {
248
+ display: none;
249
+ position: fixed;
250
+ top: 0;
251
+ left: 0;
252
+ width: 100%;
253
+ height: 100%;
254
+ background-color: rgba(0, 0, 0, 0.5);
255
+ z-index: 999;
256
+ }
257
+ .reward-display {
258
+ font-weight: bold;
259
+ font-size: 1.2em;
260
+ text-align: center;
261
+ margin: 10px 0;
262
+ }
263
+ .explanation {
264
+ background-color: #e8f4fc;
265
+ padding: 15px;
266
+ border-radius: 5px;
267
+ margin: 10px 0;
268
+ border-left: 4px solid #3498db;
269
+ }
270
+ .highlight {
271
+ background-color: #fffacd;
272
+ padding: 2px 4px;
273
+ border-radius: 3px;
274
+ }
275
+ .concept-box {
276
+ border: 1px solid #ddd;
277
+ margin: 15px 0;
278
+ border-radius: 5px;
279
+ overflow: hidden;
280
+ }
281
+ .concept-title {
282
+ background-color: #3498db;
283
+ color: white;
284
+ padding: 10px;
285
+ margin: 0;
286
+ }
287
+ .concept-content {
288
+ padding: 15px;
289
+ }
290
+ </style>
291
+ </head>
292
+ <body>
293
+ <div class="container">
294
+ <h1>Proximal Policy Optimization (PPO) Simulation</h1>
295
+
296
+ <div class="explanation">
297
+ <p>
298
+ This simulation demonstrates how an agent learns to navigate to a goal
299
+ using <strong>Proximal Policy Optimization (PPO)</strong>. PPO is an
300
+ on-policy reinforcement learning algorithm that uses a "clipping"
301
+ mechanism to prevent large policy updates, making training more stable
302
+ and efficient.
303
+ </p>
304
+ </div>
305
+
306
+ <div class="tab-container">
307
+ <div class="tab-buttons">
308
+ <button class="tab-button active" onclick="openTab('simulation-tab')">
309
+ Simulation
310
+ </button>
311
+ <button class="tab-button" onclick="openTab('concepts-tab')">
312
+ PPO Concepts
313
+ </button>
314
+ <button class="tab-button" onclick="openTab('metrics-tab')">
315
+ Training Metrics
316
+ </button>
317
+ </div>
318
+
319
+ <div id="simulation-tab" class="tab-content active">
320
+ <div class="panel">
321
+ <h3>Environment</h3>
322
+ <p>
323
+ The agent (blue) must navigate to the goal (green) while avoiding
324
+ obstacles (red).
325
+ </p>
326
+ <div class="grid-container" id="grid"></div>
327
+ <div class="reward-display">
328
+ Total Reward: <span id="reward-value">0</span>
329
+ </div>
330
+ </div>
331
+
332
+ <div class="controls">
333
+ <button id="start-btn" onclick="startTraining()">
334
+ Start Training
335
+ </button>
336
+ <button id="reset-btn" onclick="resetEnvironment()">
337
+ Reset Environment
338
+ </button>
339
+ <button id="step-btn" onclick="stepTraining()" disabled>
340
+ Step Forward
341
+ </button>
342
+ <button id="place-obstacle-btn" onclick="toggleObstaclePlacement()">
343
+ Place Obstacles
344
+ </button>
345
+ <button id="animation-speed-btn" onclick="toggleAnimationSpeed()">
346
+ Animation Speed: Normal
347
+ </button>
348
+ </div>
349
+
350
+ <div class="panel">
351
+ <h3>PPO Parameters</h3>
352
+ <div class="sliders">
353
+ <div class="slider-container">
354
+ <label for="clip-ratio">Clip Ratio (ε):</label>
355
+ <input
356
+ type="range"
357
+ id="clip-ratio"
358
+ min="0.05"
359
+ max="0.5"
360
+ step="0.05"
361
+ value="0.2"
362
+ oninput="updateSliderValue('clip-ratio')"
363
+ />
364
+ <span class="slider-value" id="clip-ratio-value">0.2</span>
365
+ </div>
366
+ <div class="slider-container">
367
+ <label for="learning-rate">Learning Rate:</label>
368
+ <input
369
+ type="range"
370
+ id="learning-rate"
371
+ min="0.01"
372
+ max="1"
373
+ step="0.01"
374
+ value="0.1"
375
+ oninput="updateSliderValue('learning-rate')"
376
+ />
377
+ <span class="slider-value" id="learning-rate-value">0.1</span>
378
+ </div>
379
+ <div class="slider-container">
380
+ <label for="epochs">PPO Epochs per Update:</label>
381
+ <input
382
+ type="range"
383
+ id="epochs"
384
+ min="1"
385
+ max="10"
386
+ step="1"
387
+ value="4"
388
+ oninput="updateSliderValue('epochs')"
389
+ />
390
+ <span class="slider-value" id="epochs-value">4</span>
391
+ </div>
392
+ </div>
393
+ </div>
394
+
395
+ <div class="panel">
396
+ <h3>Policy Visualization</h3>
397
+ <p>
398
+ This shows the current policy of the agent (arrows indicate
399
+ preferred actions in each state).
400
+ </p>
401
+ <div id="policy-display">
402
+ <div class="policy-grid" id="policy-grid"></div>
403
+ </div>
404
+ </div>
405
+
406
+ <div id="log-container"></div>
407
+ </div>
408
+
409
+ <div id="concepts-tab" class="tab-content">
410
+ <div class="concept-box">
411
+ <h3 class="concept-title">What is PPO?</h3>
412
+ <div class="concept-content">
413
+ <p>
414
+ Proximal Policy Optimization (PPO) is a policy gradient method
415
+ for reinforcement learning developed by OpenAI in 2017. It has
416
+ become one of the most popular RL algorithms due to its
417
+ simplicity and effectiveness.
418
+ </p>
419
+ <p>PPO aims to balance two objectives:</p>
420
+ <ul>
421
+ <li>Improving the agent's policy to maximize rewards</li>
422
+ <li>
423
+ Preventing large policy updates that could destabilize
424
+ training
425
+ </li>
426
+ </ul>
427
+ </div>
428
+ </div>
429
+
430
+ <div class="concept-box">
431
+ <h3 class="concept-title">Key Innovations in PPO</h3>
432
+ <div class="concept-content">
433
+ <p>
434
+ The central innovation in PPO is the
435
+ <strong>clipped surrogate objective function</strong>:
436
+ </p>
437
+ <p style="text-align: center">
438
+ L<sup>CLIP</sup>(θ) = E[min(r<sub>t</sub>(θ)A<sub>t</sub>,
439
+ clip(r<sub>t</sub>(θ), 1-ε, 1+ε)A<sub>t</sub>)]
440
+ </p>
441
+ <p>where:</p>
442
+ <ul>
443
+ <li>
444
+ <strong>r<sub>t</sub>(θ)</strong> is the ratio of
445
+ probabilities under new and old policies
446
+ </li>
447
+ <li>
448
+ <strong>A<sub>t</sub></strong> is the advantage estimate
449
+ </li>
450
+ <li>
451
+ <strong>ε</strong> is the clipping parameter (usually 0.1 or
452
+ 0.2)
453
+ </li>
454
+ </ul>
455
+ <p>
456
+ The clipping mechanism ensures that the policy update stays
457
+ within a "trust region" by limiting how much the new policy can
458
+ deviate from the old one.
459
+ </p>
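+ <p>
+ As a concrete illustration (with made-up numbers, not values taken from the
+ simulation), the snippet below evaluates a single clipped-surrogate term the
+ same way the training code on the Simulation tab does:
+ </p>
+ <pre><code>
+ // Illustrative values only
+ const epsilon = 0.2;                  // clip ratio
+ const oldProb = 0.25, newProb = 0.45; // action probability under old / new policy
+ const advantage = 1.3;                // advantage estimate A_t
+
+ const ratio = newProb / oldProb;                                     // r_t = 1.8
+ const clipped = Math.min(Math.max(ratio, 1 - epsilon), 1 + epsilon); // clipped to 1.2
+ const objective = Math.min(ratio * advantage, clipped * advantage);  // 1.56: the clip caps the update
+ </code></pre>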
460
+ </div>
461
+ </div>
462
+
463
+ <div class="concept-box">
464
+ <h3 class="concept-title">How PPO Works in This Simulation</h3>
465
+ <div class="concept-content">
466
+ <ol>
467
+ <li>
468
+ The agent collects experience by interacting with the
469
+ environment using its current policy
470
+ </li>
471
+ <li>Advantages are computed for each state-action pair</li>
472
+ <li>
473
+ The policy is updated using the clipped surrogate objective
474
+ </li>
475
+ <li>
476
+ Multiple optimization epochs are performed on the same batch
477
+ of data
478
+ </li>
479
+ <li>The process repeats with the new policy</li>
480
+ </ol>
481
+ <p>
482
+ You can observe these steps in action in the simulation tab by
483
+ watching the policy visualization and training metrics; the sketch just
+ below walks through the same loop in miniature.
484
+ </p>
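+ <p>
+ To make these five steps concrete outside the grid world, here is a small
+ self-contained toy (a two-armed bandit; its names and constants are
+ illustrative and not part of the simulation above). Action 0 always pays
+ +1, and the single parameter <code>p</code> is the probability of
+ choosing it:
+ </p>
+ <pre><code>
+ let p = 0.5;
+ const epsilon = 0.2, lr = 0.05, epochs = 4;
+ for (let iter = 0; iter < 30; iter++) {
+   const pOld = p; // freeze the old policy
+   // 1. collect a batch of experience with the old policy
+   const batch = Array.from({ length: 32 }, () => {
+     const a = Math.random() < pOld ? 0 : 1;
+     return { a, r: a === 0 ? 1 : 0 };
+   });
+   const baseline = batch.reduce((s, t) => s + t.r, 0) / batch.length;
+   // 3./4. several clipped-update epochs on the same batch
+   for (let k = 0; k < epochs; k++) {
+     for (const t of batch) {
+       const adv = t.r - baseline;            // 2. a simple advantage estimate
+       const prob = t.a === 0 ? p : 1 - p;
+       const probOld = t.a === 0 ? pOld : 1 - pOld;
+       const ratio = prob / probOld;
+       const clipped = Math.min(Math.max(ratio, 1 - epsilon), 1 + epsilon);
+       if (ratio * adv <= clipped * adv) {    // update only while the unclipped term is active
+         p += lr * adv * (t.a === 0 ? 1 : -1) / Math.max(probOld, 1e-8);
+         p = Math.min(0.99, Math.max(0.01, p));
+       }
+     }
+   }
+ } // 5. repeat, treating the new policy as the next "old" policy
+ console.log("P(action 0) after training:", p.toFixed(2)); // climbs toward the 0.99 cap
+ </code></pre>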
485
+ </div>
486
+ </div>
487
+
488
+ <div class="concept-box">
489
+ <h3 class="concept-title">PPO vs. Other RL Algorithms</h3>
490
+ <div class="concept-content">
491
+ <p>PPO improves upon earlier algorithms in several ways:</p>
492
+ <ul>
493
+ <li>
494
+ <strong>vs. REINFORCE:</strong> More stable training due to
495
+ advantage estimation and clipping
496
+ </li>
497
+ <li>
498
+ <strong>vs. TRPO:</strong> Simpler implementation while
499
+ maintaining similar performance
500
+ </li>
501
+ <li>
502
+ <strong>vs. A2C/A3C:</strong> Better sample efficiency and
503
+ more stable policy updates
504
+ </li>
505
+ <li>
506
+ <strong>vs. Off-policy algorithms (DQN, DDPG):</strong> Less
507
+ sensitive to hyperparameters and often more stable
508
+ </li>
509
+ </ul>
510
+ </div>
511
+ </div>
512
+ </div>
513
+
514
+ <div id="metrics-tab" class="tab-content">
515
+ <div class="panel">
516
+ <h3>Training Progress</h3>
517
+ <div class="progress-container">
518
+ <div class="progress-bar" id="training-progress"></div>
519
+ </div>
520
+ <p id="episode-counter">Episodes: 0 / 100</p>
521
+ </div>
522
+
523
+ <div class="panel">
524
+ <h3>Reward Over Time</h3>
525
+ <div class="chart-container" id="reward-chart"></div>
526
+ </div>
527
+
528
+ <div class="panel">
529
+ <h3>Policy Loss</h3>
530
+ <div class="chart-container" id="policy-loss-chart"></div>
531
+ </div>
532
+
533
+ <div class="panel">
534
+ <h3>Value Loss</h3>
535
+ <div class="chart-container" id="value-loss-chart"></div>
536
+ </div>
537
+ </div>
538
+ </div>
539
+ </div>
540
+
541
+ <div class="popup-overlay" id="popup-overlay"></div>
542
+ <div class="popup" id="popup">
543
+ <h2 id="popup-title">Title</h2>
544
+ <div id="popup-content">Content</div>
545
+ <button onclick="closePopup()">Close</button>
546
+ </div>
547
+
548
+ <script>
549
+ // Environment configuration
550
+ const GRID_SIZE = 10;
551
+ let grid = [];
552
+ let agentPos = { x: 0, y: 0 };
553
+ let goalPos = { x: 9, y: 9 };
554
+ let obstacles = [];
555
+ let placingObstacles = false;
556
+
557
+ // Agent and PPO parameters
558
+ let policyNetwork = {};
559
+ let valueNetwork = {};
560
+ let clipRatio = 0.2;
561
+ let learningRate = 0.1; // Default learning rate (0-1 range)
562
+ let ppoEpochs = 4;
563
+ let gamma = 0.99; // Discount factor
564
+ let lambda = 0.95; // GAE parameter
565
+
566
+ // Training state
567
+ let isTraining = false;
568
+ let episode = 0;
569
+ let maxEpisodes = 100;
570
+ let episodeSteps = 0;
571
+ let maxStepsPerEpisode = 100; // Increased max steps to allow more exploration
572
+ let totalReward = 0;
573
+ let episodeRewards = [];
574
+ let policyLosses = [];
575
+ let valueLosses = [];
576
+
577
+ // Tracking for visualization
578
+ let trajectories = [];
579
+ let oldPolicy = {};
580
+
581
+ // Exploration parameters
582
+ let explorationRate = 0.2; // Probability of taking a random action (exploration)
583
+
584
+ // Initialize the environment
585
+ function initializeEnvironment() {
586
+ grid = [];
587
+ obstacles = [];
588
+
589
+ // Create the grid UI
590
+ const gridContainer = document.getElementById("grid");
591
+ gridContainer.innerHTML = "";
592
+
593
+ for (let y = 0; y < GRID_SIZE; y++) {
594
+ for (let x = 0; x < GRID_SIZE; x++) {
595
+ const cell = document.createElement("div");
596
+ cell.classList.add("cell");
597
+ cell.dataset.x = x;
598
+ cell.dataset.y = y;
599
+ cell.addEventListener("click", handleCellClick);
600
+ gridContainer.appendChild(cell);
601
+ }
602
+ }
603
+
604
+ // Place agent and goal
605
+ agentPos = { x: 0, y: 0 };
606
+ goalPos = { x: 9, y: 9 };
607
+ renderGrid();
608
+
609
+ // Initialize policy and value networks
610
+ initializeNetworks();
611
+ renderPolicy();
612
+ updateReward(0);
613
+ }
614
+
615
+ // Initialize policy and value networks
616
+ function initializeNetworks() {
617
+ policyNetwork = {};
618
+ valueNetwork = {};
619
+
620
+ // Initialize learning rate
621
+ learningRate = parseFloat(
622
+ document.getElementById("learning-rate").value
623
+ );
624
+
625
+ // Initialize policy and value for each state (cell)
626
+ for (let y = 0; y < GRID_SIZE; y++) {
627
+ for (let x = 0; x < GRID_SIZE; x++) {
628
+ const stateKey = `${x},${y}`;
629
+
630
+ // Initialize the policy with a uniform distribution over the four actions
631
+ policyNetwork[stateKey] = {
632
+ up: 0.25,
633
+ right: 0.25,
634
+ down: 0.25,
635
+ left: 0.25,
636
+ };
637
+
638
+ // Initialize value to zero
639
+ valueNetwork[stateKey] = 0;
640
+ }
641
+ }
642
+ }
643
+
644
+ function renderGrid() {
645
+ // Clear all cells
646
+ const cells = document.querySelectorAll(".cell");
647
+ cells.forEach((cell) => {
648
+ cell.innerHTML = "";
649
+ });
650
+
651
+ // Place agent
652
+ const agentCell = document.querySelector(
653
+ `.cell[data-x="${agentPos.x}"][data-y="${agentPos.y}"]`
654
+ );
655
+ const agentElement = document.createElement("div");
656
+ agentElement.classList.add("agent");
657
+ agentCell.appendChild(agentElement);
658
+
659
+ // Place goal
660
+ const goalCell = document.querySelector(
661
+ `.cell[data-x="${goalPos.x}"][data-y="${goalPos.y}"]`
662
+ );
663
+ const goalElement = document.createElement("div");
664
+ goalElement.classList.add("goal");
665
+ goalCell.appendChild(goalElement);
666
+
667
+ // Place obstacles
668
+ obstacles.forEach((obstacle) => {
669
+ const obstacleCell = document.querySelector(
670
+ `.cell[data-x="${obstacle.x}"][data-y="${obstacle.y}"]`
671
+ );
672
+ const obstacleElement = document.createElement("div");
673
+ obstacleElement.classList.add("obstacle");
674
+ obstacleCell.appendChild(obstacleElement);
675
+ });
676
+ }
677
+
678
+ function renderPolicy() {
679
+ const policyGrid = document.getElementById("policy-grid");
680
+ policyGrid.innerHTML = "";
681
+
682
+ for (let y = 0; y < GRID_SIZE; y++) {
683
+ for (let x = 0; x < GRID_SIZE; x++) {
684
+ const cell = document.createElement("div");
685
+ cell.classList.add("policy-cell");
686
+
687
+ const stateKey = `${x},${y}`;
688
+ const policy = policyNetwork[stateKey];
689
+
690
+ // Skip rendering policy for obstacles
691
+ if (isObstacle(x, y)) {
692
+ cell.style.backgroundColor = "#e74c3c";
693
+ policyGrid.appendChild(cell);
694
+ continue;
695
+ }
696
+
697
+ // If it's the goal, mark it green
698
+ if (x === goalPos.x && y === goalPos.y) {
699
+ cell.style.backgroundColor = "#2ecc71";
700
+ policyGrid.appendChild(cell);
701
+ continue;
702
+ }
703
+
704
+ // Create arrows for each action probability
705
+ for (const [action, prob] of Object.entries(policy)) {
706
+ if (prob > 0.2) {
707
+ // Only show significant probabilities
708
+ const arrow = document.createElement("div");
709
+ arrow.classList.add("arrow", `arrow-${action}`);
710
+ arrow.style.opacity = Math.min(1, prob * 2); // Scale opacity with probability
711
+ cell.appendChild(arrow);
712
+ }
713
+ }
714
+
715
+ // Add state value indication using background color intensity
716
+ const value = valueNetwork[stateKey];
717
+ const normalizedValue = (value + 10) / 20; // Normalize to [0,1] range assuming values between -10 and 10
718
+ const intensity = Math.max(
719
+ 0,
720
+ Math.min(255, Math.floor(normalizedValue * 255))
721
+ );
722
+ cell.style.backgroundColor = `rgba(236, 240, 241, ${normalizedValue})`;
723
+
724
+ policyGrid.appendChild(cell);
725
+ }
726
+ }
727
+ }
728
+
729
+ function handleCellClick(event) {
730
+ const x = parseInt(event.currentTarget.dataset.x);
731
+ const y = parseInt(event.currentTarget.dataset.y);
732
+
733
+ if (placingObstacles) {
734
+ // Don't allow obstacles on agent or goal
735
+ if (
736
+ (x === agentPos.x && y === agentPos.y) ||
737
+ (x === goalPos.x && y === goalPos.y)
738
+ ) {
739
+ return;
740
+ }
741
+
742
+ const obstacleIndex = obstacles.findIndex(
743
+ (o) => o.x === x && o.y === y
744
+ );
745
+ if (obstacleIndex === -1) {
746
+ obstacles.push({ x, y });
747
+ } else {
748
+ obstacles.splice(obstacleIndex, 1);
749
+ }
750
+ renderGrid();
751
+ renderPolicy();
752
+ }
753
+ }
754
+
755
+ function toggleObstaclePlacement() {
756
+ placingObstacles = !placingObstacles;
757
+ const btn = document.getElementById("place-obstacle-btn");
758
+ btn.textContent = placingObstacles ? "Done Placing" : "Place Obstacles";
759
+ btn.style.backgroundColor = placingObstacles ? "#e74c3c" : "#3498db";
760
+ }
761
+
762
+ function isObstacle(x, y) {
763
+ return obstacles.some((o) => o.x === x && o.y === y);
764
+ }
765
+
766
+ function resetEnvironment() {
767
+ initializeEnvironment();
768
+ episodeRewards = [];
769
+ policyLosses = [];
770
+ valueLosses = [];
771
+ episode = 0;
772
+ updateEpisodeCounter();
773
+ updateReward(0);
774
+
775
+ // Reset training state
776
+ isTraining = false;
777
+ document.getElementById("start-btn").textContent = "Start Training";
778
+ document.getElementById("step-btn").disabled = true;
779
+
780
+ // Clear charts
781
+ // In a real implementation, you would update the charts here
782
+
783
+ logMessage("Environment reset. Ready for training!");
784
+ }
785
+
786
+ function startTraining() {
787
+ if (isTraining) {
788
+ // Stop training
789
+ isTraining = false;
790
+ document.getElementById("start-btn").textContent = "Start Training";
791
+ document.getElementById("step-btn").disabled = true;
792
+ } else {
793
+ // Start training
794
+ isTraining = true;
795
+ document.getElementById("start-btn").textContent = "Stop Training";
796
+ document.getElementById("step-btn").disabled = false;
797
+
798
+ // If we're at the end of training, reset first
799
+ if (episode >= maxEpisodes) {
800
+ resetEnvironment();
801
+ }
802
+
803
+ runTrainingLoop();
804
+ }
805
+ }
806
+
807
+ function stepTraining() {
808
+ if (episode < maxEpisodes) {
809
+ runEpisode();
810
+ updateTrainingProgress();
811
+ } else {
812
+ logMessage("Training complete! Reset to train again.");
813
+ }
814
+ }
815
+
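+ // Note: this runTrainingLoop is redefined (with animation-speed control)
+ // near the end of the script; the later declaration is the one that runs.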
816
+ async function runTrainingLoop() {
817
+ while (isTraining && episode < maxEpisodes) {
818
+ await runEpisode();
819
+ updateTrainingProgress();
820
+
821
+ // Add a small delay to visualize the process
822
+ await new Promise((resolve) => setTimeout(resolve, 200));
823
+ }
824
+
825
+ if (episode >= maxEpisodes) {
826
+ logMessage("Training complete!");
827
+ isTraining = false;
828
+ document.getElementById("start-btn").textContent = "Start Training";
829
+ }
830
+ }
831
+
832
+ async function runEpisode() {
833
+ // Reset agent position and episodic variables
834
+ agentPos = { x: 0, y: 0 };
835
+ episodeSteps = 0;
836
+ totalReward = 0;
837
+ trajectories = [];
838
+
839
+ // Decay exploration rate over time (important for improving policy)
840
+ explorationRate = Math.max(0.05, 0.2 * Math.pow(0.99, episode));
841
+
842
+ renderGrid();
843
+ updateReward(totalReward);
844
+
845
+ // Save old policy for PPO ratio calculation
846
+ oldPolicy = JSON.parse(JSON.stringify(policyNetwork));
847
+
848
+ // Run episode until termination
849
+ let done = false;
850
+ while (!done && episodeSteps < maxStepsPerEpisode) {
851
+ done = await executeStep();
852
+ episodeSteps++;
853
+
854
+ // Small delay for visualization
855
+ await new Promise((resolve) =>
856
+ setTimeout(resolve, animationSpeeds[animationSpeed] / 2)
857
+ );
858
+ }
859
+
860
+ // Add episode reward to history
861
+ episodeRewards.push(totalReward);
862
+
863
+ // Run PPO update if we have enough steps
864
+ if (trajectories.length > 0) {
865
+ const [policyLoss, valueLoss] = updatePPO();
866
+ policyLosses.push(policyLoss);
867
+ valueLosses.push(valueLoss);
868
+ }
869
+
870
+ // Update UI
871
+ renderPolicy();
872
+ episode++;
873
+ updateEpisodeCounter();
874
+
875
+ logMessage(
876
+ `Episode ${episode}: Reward=${totalReward.toFixed(
877
+ 2
878
+ )}, Steps=${episodeSteps}, Exploration=${explorationRate.toFixed(2)}`
879
+ );
880
+
881
+ return new Promise((resolve) => setTimeout(resolve, 10));
882
+ }
883
+
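+ // Note: this executeStep is also redefined near the end of the script; the
+ // later version (which passes movementSuccessful to calculateReward) is the
+ // one that runs.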
884
+ async function executeStep() {
885
+ const stateKey = `${agentPos.x},${agentPos.y}`;
886
+ const policy = policyNetwork[stateKey];
887
+
888
+ // Choose action based on policy
889
+ const action = sampleAction(policy);
890
+
891
+ // Store old position
892
+ const oldPos = { ...agentPos };
893
+
894
+ // Move agent
895
+ moveAgent(action);
896
+
897
+ // Calculate reward
898
+ const reward = calculateReward(oldPos);
899
+ totalReward += reward;
900
+ updateReward(totalReward);
901
+
902
+ // Check if episode is done
903
+ const done =
904
+ (agentPos.x === goalPos.x && agentPos.y === goalPos.y) ||
905
+ isObstacle(agentPos.x, agentPos.y);
906
+
907
+ // If agent hit obstacle, move it back for visualization
908
+ if (isObstacle(agentPos.x, agentPos.y)) {
909
+ agentPos = { ...oldPos };
910
+ }
911
+
912
+ // Render the grid
913
+ renderGrid();
914
+
915
+ // Store trajectory
916
+ const newStateKey = `${agentPos.x},${agentPos.y}`;
917
+ trajectories.push({
918
+ state: stateKey,
919
+ action,
920
+ reward,
921
+ nextState: newStateKey,
922
+ done,
923
+ });
924
+
925
+ return done;
926
+ }
927
+
928
+ function sampleAction(policy) {
929
+ // Use exploration rate to decide whether to take random action or follow policy
930
+ if (Math.random() < explorationRate) {
931
+ // Take random action with exploration probability
932
+ const actions = Object.keys(policy);
933
+ const randomIndex = Math.floor(Math.random() * actions.length);
934
+ return actions[randomIndex];
935
+ }
936
+
937
+ // Otherwise sample from policy distribution
938
+ const actions = Object.keys(policy);
939
+ const probs = actions.map((a) => policy[a]);
940
+
941
+ const rand = Math.random();
942
+ let cumProb = 0;
943
+
944
+ for (let i = 0; i < actions.length; i++) {
945
+ cumProb += probs[i];
946
+ if (rand < cumProb) {
947
+ return actions[i];
948
+ }
949
+ }
950
+
951
+ return actions[actions.length - 1];
952
+ }
953
+
954
+ function moveAgent(action) {
955
+ // Save previous position
956
+ const prevPos = { ...agentPos };
957
+
958
+ // Attempt to move agent
959
+ switch (action) {
960
+ case "up":
961
+ agentPos.y = Math.max(0, agentPos.y - 1);
962
+ break;
963
+ case "right":
964
+ agentPos.x = Math.min(GRID_SIZE - 1, agentPos.x + 1);
965
+ break;
966
+ case "down":
967
+ agentPos.y = Math.min(GRID_SIZE - 1, agentPos.y + 1);
968
+ break;
969
+ case "left":
970
+ agentPos.x = Math.max(0, agentPos.x - 1);
971
+ break;
972
+ }
973
+
974
+ // Check if new position is an obstacle
975
+ if (isObstacle(agentPos.x, agentPos.y)) {
976
+ // Revert to previous position if it hit an obstacle
977
+ agentPos.x = prevPos.x;
978
+ agentPos.y = prevPos.y;
979
+ return false; // Indicate movement was blocked
980
+ }
981
+
982
+ return true; // Movement successful
983
+ }
984
+
985
+ function calculateReward(oldPos, movementSuccessful) {
986
+ // Reward for reaching goal
987
+ if (agentPos.x === goalPos.x && agentPos.y === goalPos.y) {
988
+ return 10;
989
+ }
990
+
991
+ // Penalty for attempting to move into an obstacle (but not actually moving into it)
992
+ if (!movementSuccessful) {
993
+ return -1; // Reduced penalty to avoid too much negative learning
994
+ }
995
+
996
+ // Small penalty for each step to encourage efficiency
997
+ let stepPenalty = -0.1;
998
+
999
+ // Small reward for getting closer to goal (using Manhattan distance)
1000
+ const oldDistance =
1001
+ Math.abs(oldPos.x - goalPos.x) + Math.abs(oldPos.y - goalPos.y);
1002
+ const newDistance =
1003
+ Math.abs(agentPos.x - goalPos.x) + Math.abs(agentPos.y - goalPos.y);
1004
+ const proximityReward = oldDistance > newDistance ? 0.3 : -0.1; // Stronger reward for progress
1005
+
1006
+ return stepPenalty + proximityReward;
1007
+ }
1008
+
1009
+ function updatePPO() {
1010
+ // Get parameters from sliders
1011
+ clipRatio = parseFloat(document.getElementById("clip-ratio").value);
1012
+ learningRate = parseFloat(
1013
+ document.getElementById("learning-rate").value
1014
+ );
1015
+ ppoEpochs = parseInt(document.getElementById("epochs").value);
1016
+
1017
+ // Compute returns and advantages
1018
+ const returns = [];
1019
+ const advantages = [];
1020
+
1021
+ // Compute returns (discounted sum of future rewards)
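+ // G_t = r_t + gamma * G_{t+1}, computed backwards over the trajectory (G = 0 past a terminal step)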
1022
+ let discountedReturn = 0;
1023
+ for (let i = trajectories.length - 1; i >= 0; i--) {
1024
+ const transition = trajectories[i];
1025
+ discountedReturn =
1026
+ transition.reward +
1027
+ gamma * (transition.done ? 0 : discountedReturn);
1028
+ returns.unshift(discountedReturn);
1029
+ }
1030
+
1031
+ // Compute advantages using Generalized Advantage Estimation (GAE)
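+ // delta_t = r_t + gamma * V(s_{t+1}) - V(s_t); A_t = delta_t + gamma * lambda * A_{t+1} (A = 0 at episode end)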
1032
+ let lastGaeAdvantage = 0;
1033
+ for (let i = trajectories.length - 1; i >= 0; i--) {
1034
+ const transition = trajectories[i];
1035
+ const stateKey = transition.state;
1036
+ const nextStateKey = transition.nextState;
1037
+
1038
+ const currentValue = valueNetwork[stateKey];
1039
+ const nextValue = transition.done ? 0 : valueNetwork[nextStateKey];
1040
+
1041
+ // TD error
1042
+ const delta = transition.reward + gamma * nextValue - currentValue;
1043
+
1044
+ // GAE
1045
+ lastGaeAdvantage = delta + gamma * lambda * lastGaeAdvantage;
1046
+ advantages.unshift(lastGaeAdvantage);
1047
+ }
1048
+
1049
+ // Normalize advantages for more stable learning
1050
+ const meanAdvantage =
1051
+ advantages.reduce((a, b) => a + b, 0) / advantages.length;
1052
+ const stdAdvantage =
1053
+ Math.sqrt(
1054
+ advantages.reduce((a, b) => a + Math.pow(b - meanAdvantage, 2), 0) /
1055
+ advantages.length
1056
+ ) || 1; // Avoid division by zero
1057
+
1058
+ for (let i = 0; i < advantages.length; i++) {
1059
+ advantages[i] =
1060
+ (advantages[i] - meanAdvantage) / (stdAdvantage + 1e-8);
1061
+ }
1062
+
1063
+ // Store losses for metrics
1064
+ let totalPolicyLoss = 0;
1065
+ let totalValueLoss = 0;
1066
+
1067
+ // Backup old policy for PPO ratio calculation
1068
+ const oldPolicyBackup = JSON.parse(JSON.stringify(policyNetwork));
1069
+
1070
+ // Multiple epochs of optimization on the same data (key PPO feature)
1071
+ for (let epoch = 0; epoch < ppoEpochs; epoch++) {
1072
+ // Update policy and value networks for each step in the trajectory
1073
+ for (let i = 0; i < trajectories.length; i++) {
1074
+ const transition = trajectories[i];
1075
+ const stateKey = transition.state;
1076
+ const action = transition.action;
1077
+
1078
+ // Get old action probability
1079
+ const oldActionProb = oldPolicy[stateKey][action];
1080
+
1081
+ // Get current action probability
1082
+ const currentActionProb = policyNetwork[stateKey][action];
1083
+
1084
+ // Compute probability ratio (crucial for PPO)
1085
+ const ratio = currentActionProb / Math.max(oldActionProb, 1e-8);
1086
+
1087
+ // Get advantage for this action
1088
+ const advantage = advantages[i];
1089
+
1090
+ // Compute unclipped and clipped surrogate objectives
1091
+ const unclippedObjective = ratio * advantage;
1092
+ const clippedRatio = Math.max(
1093
+ Math.min(ratio, 1 + clipRatio),
1094
+ 1 - clipRatio
1095
+ );
1096
+ const clippedObjective = clippedRatio * advantage;
1097
+
1098
+ // PPO's clipped surrogate objective (core of PPO)
1099
+ const surrogateObjective = Math.min(
1100
+ unclippedObjective,
1101
+ clippedObjective
1102
+ );
1103
+
1104
+ // Compute policy gradient
1105
+ // Note: In PPO, we maximize the objective, so negative for gradient ascent
1106
+ const policyLoss = -surrogateObjective;
1107
+ totalPolicyLoss += policyLoss;
1108
+
1109
+ // Value loss (using returns as targets)
1110
+ const valueTarget = returns[i];
1111
+ const valuePrediction = valueNetwork[stateKey];
1112
+ const valueLoss = 0.5 * Math.pow(valueTarget - valuePrediction, 2);
1113
+ totalValueLoss += valueLoss;
1114
+
1115
+ // Update value network with gradient descent
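+ // Tabular move toward the return target: V(s) <- V(s) + lr * (G_t - V(s))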
1116
+ valueNetwork[stateKey] +=
1117
+ learningRate * (valueTarget - valuePrediction);
1118
+
1119
+ // Compute policy update based on whether we're using clipped or unclipped objective
1120
+ const useClippedObjective = unclippedObjective > clippedObjective;
1121
+ const policyGradient =
1122
+ learningRate * advantage * (useClippedObjective ? 0 : 1);
1123
+
1124
+ // Apply policy gradient update
1125
+ // Increase probability of the taken action if it was good (positive advantage)
1126
+ // Decrease probability if it was bad (negative advantage)
1127
+ let newProb = policyNetwork[stateKey][action] + policyGradient;
1128
+
1129
+ // Ensure probability stays positive (important for ratio calculation)
1130
+ newProb = Math.max(newProb, 0.01);
1131
+ policyNetwork[stateKey][action] = newProb;
1132
+
1133
+ // Normalize probabilities to ensure they sum to 1
1134
+ const sumProb = Object.values(policyNetwork[stateKey]).reduce(
1135
+ (a, b) => a + b,
1136
+ 0
1137
+ );
1138
+ for (const a in policyNetwork[stateKey]) {
1139
+ policyNetwork[stateKey][a] /= sumProb;
1140
+ }
1141
+
1142
+ // Add some exploration (entropy bonus)
1143
+ // This is crucial for avoiding local optima
1144
+ if (i % 5 === 0) {
1145
+ // Apply periodically to maintain some exploration
1146
+ for (const a in policyNetwork[stateKey]) {
1147
+ // Slightly nudge probabilities toward uniform
1148
+ policyNetwork[stateKey][a] =
1149
+ 0.95 * policyNetwork[stateKey][a] + 0.05 * 0.25;
1150
+ }
1151
+ // Re-normalize
1152
+ const sumProb = Object.values(policyNetwork[stateKey]).reduce(
1153
+ (a, b) => a + b,
1154
+ 0
1155
+ );
1156
+ for (const a in policyNetwork[stateKey]) {
1157
+ policyNetwork[stateKey][a] /= sumProb;
1158
+ }
1159
+ }
1160
+ }
1161
+ }
1162
+
1163
+ // Calculate average losses
1164
+ const avgPolicyLoss =
1165
+ totalPolicyLoss / (trajectories.length * ppoEpochs);
1166
+ const avgValueLoss = totalValueLoss / (trajectories.length * ppoEpochs);
1167
+
1168
+ // Log progress periodically
1169
+ if (episode % 5 === 0) {
1170
+ logMessage(
1171
+ `Episode ${episode}: Average Policy Loss = ${avgPolicyLoss.toFixed(
1172
+ 4
1173
+ )}, Value Loss = ${avgValueLoss.toFixed(4)}`
1174
+ );
1175
+ }
1176
+
1177
+ return [avgPolicyLoss, avgValueLoss];
1178
+ }
1179
+
1180
+ function updateReward(reward) {
1181
+ document.getElementById("reward-value").textContent = reward.toFixed(2);
1182
+ }
1183
+
1184
+ function updateEpisodeCounter() {
1185
+ document.getElementById(
1186
+ "episode-counter"
1187
+ ).textContent = `Episodes: ${episode} / ${maxEpisodes}`;
1188
+ document.getElementById("training-progress").style.width = `${
1189
+ (episode / maxEpisodes) * 100
1190
+ }%`;
1191
+ }
1192
+
1193
+ function updateTrainingProgress() {
1194
+ // Update charts with the latest data
1195
+ // In a real implementation, you would update charts here
1196
+
1197
+ // Show progress
1198
+ updateEpisodeCounter();
1199
+ }
1200
+
1201
+ function updateSliderValue(id) {
1202
+ const slider = document.getElementById(id);
1203
+ const valueDisplay = document.getElementById(`${id}-value`);
1204
+ valueDisplay.textContent = slider.value;
1205
+
1206
+ // Update corresponding variables
1207
+ if (id === "clip-ratio") clipRatio = parseFloat(slider.value);
1208
+ if (id === "learning-rate") learningRate = parseFloat(slider.value);
1209
+ if (id === "epochs") ppoEpochs = parseInt(slider.value);
1210
+ }
1211
+
1212
+ function logMessage(message) {
1213
+ const logContainer = document.getElementById("log-container");
1214
+ const logEntry = document.createElement("div");
1215
+ logEntry.classList.add("log-entry");
1216
+ logEntry.textContent = message;
1217
+ logContainer.appendChild(logEntry);
1218
+ logContainer.scrollTop = logContainer.scrollHeight;
1219
+ }
1220
+
1221
+ function openTab(tabId) {
1222
+ // Hide all tab contents
1223
+ const tabContents = document.getElementsByClassName("tab-content");
1224
+ for (let i = 0; i < tabContents.length; i++) {
1225
+ tabContents[i].classList.remove("active");
1226
+ }
1227
+
1228
+ // Remove active class from tab buttons
1229
+ const tabButtons = document.getElementsByClassName("tab-button");
1230
+ for (let i = 0; i < tabButtons.length; i++) {
1231
+ tabButtons[i].classList.remove("active");
1232
+ }
1233
+
1234
+ // Show selected tab content and mark button as active
1235
+ document.getElementById(tabId).classList.add("active");
1236
+ const activeButton = document.querySelector(
1237
+ `.tab-button[onclick="openTab('${tabId}')"]`
1238
+ );
1239
+ activeButton.classList.add("active");
1240
+ }
1241
+
1242
+ function showPopup(title, content) {
1243
+ document.getElementById("popup-title").textContent = title;
1244
+ document.getElementById("popup-content").innerHTML = content;
1245
+ document.getElementById("popup-overlay").style.display = "block";
1246
+ document.getElementById("popup").style.display = "block";
1247
+ }
1248
+
1249
+ function closePopup() {
1250
+ document.getElementById("popup-overlay").style.display = "none";
1251
+ document.getElementById("popup").style.display = "none";
1252
+ }
1253
+
1254
+ // Initialize the environment when the page loads
1255
+ window.onload = function () {
1256
+ initializeEnvironment();
1257
+ logMessage('Environment initialized. Click "Start Training" to begin!');
1258
+
1259
+ // Show concept popup with a delay
1260
+ setTimeout(() => {
1261
+ showPopup(
1262
+ "Welcome to PPO Simulation",
1263
+ `
1264
+ <p>This simulation demonstrates Proximal Policy Optimization (PPO), a reinforcement learning algorithm.</p>
1265
+ <p>In this grid world:</p>
1266
+ <ul>
1267
+ <li>The agent (blue circle) must learn to navigate to the goal (green square)</li>
1268
+ <li>You can place obstacles (red squares) by clicking the "Place Obstacles" button</li>
1269
+ <li>The agent receives rewards for approaching the goal and penalties for hitting obstacles</li>
1270
+ <li>PPO helps the agent learn efficiently by preventing large policy updates</li>
1271
+ </ul>
1272
+ <p>Try experimenting with different parameters to see how they affect learning!</p>
1273
+ `
1274
+ );
1275
+ }, 1000);
1276
+ };
1277
+ // Animation speed control
1278
+ let animationSpeed = "normal";
1279
+ const animationSpeeds = {
1280
+ slow: 300,
1281
+ normal: 100,
1282
+ fast: 20,
1283
+ };
1284
+
1285
+ function toggleAnimationSpeed() {
1286
+ const speedBtn = document.getElementById("animation-speed-btn");
1287
+
1288
+ if (animationSpeed === "slow") {
1289
+ animationSpeed = "normal";
1290
+ speedBtn.textContent = "Animation Speed: Normal";
1291
+ } else if (animationSpeed === "normal") {
1292
+ animationSpeed = "fast";
1293
+ speedBtn.textContent = "Animation Speed: Fast";
1294
+ } else {
1295
+ animationSpeed = "slow";
1296
+ speedBtn.textContent = "Animation Speed: Slow";
1297
+ }
1298
+ }
1299
+
1300
+ // Update animation speed in relevant functions
1301
+ async function runTrainingLoop() {
1302
+ while (isTraining && episode < maxEpisodes) {
1303
+ await runEpisode();
1304
+ updateTrainingProgress();
1305
+
1306
+ // Use dynamic animation speed
1307
+ await new Promise((resolve) =>
1308
+ setTimeout(resolve, animationSpeeds[animationSpeed])
1309
+ );
1310
+ }
1311
+
1312
+ if (episode >= maxEpisodes) {
1313
+ logMessage("Training complete!");
1314
+ isTraining = false;
1315
+ document.getElementById("start-btn").textContent = "Start Training";
1316
+ }
1317
+ }
1318
+
1319
+ async function executeStep() {
1320
+ const stateKey = `${agentPos.x},${agentPos.y}`;
1321
+ const policy = policyNetwork[stateKey];
1322
+
1323
+ // Choose action based on policy
1324
+ const action = sampleAction(policy);
1325
+
1326
+ // Store old position
1327
+ const oldPos = { ...agentPos };
1328
+
1329
+ // Move agent
1330
+ const movementSuccessful = moveAgent(action);
1331
+
1332
+ // Calculate reward
1333
+ const reward = calculateReward(oldPos, movementSuccessful);
1334
+ totalReward += reward;
1335
+ updateReward(totalReward);
1336
+
1337
+ // Check if episode is done
1338
+ const done = agentPos.x === goalPos.x && agentPos.y === goalPos.y;
1339
+
1340
+ // Render the grid
1341
+ renderGrid();
1342
+
1343
+ // Store trajectory
1344
+ const newStateKey = `${agentPos.x},${agentPos.y}`;
1345
+ trajectories.push({
1346
+ state: stateKey,
1347
+ action,
1348
+ reward,
1349
+ nextState: newStateKey,
1350
+ done,
1351
+ });
1352
+
1353
+ // Use dynamic animation speed
1354
+ await new Promise((resolve) =>
1355
+ setTimeout(resolve, animationSpeeds[animationSpeed] / 2)
1356
+ );
1357
+
1358
+ return done;
1359
+ }
1360
+ </script>
1361
+
1362
+ <footer
1363
+ style="
1364
+ text-align: center;
1365
+ margin-top: 30px;
1366
+ padding: 15px;
1367
+ background-color: #f8f9fa;
1368
+ border-top: 1px solid #ddd;
1369
+ "
1370
+ >
1371
+ &copy; 2025 Pejman Ebrahimi - All Rights Reserved
1372
+ </footer>
1373
+ </body>
1374
  </html>