puffer 1.0 port

thatguy11325 · Jun 20, 2024 · 48dbc3d · 48dbc3d
1 parent afe63df
commit 48dbc3d
Show file tree

Hide file tree

Showing 8 changed files with 788 additions and 700 deletions.
diff --git a/config.yaml b/config.yaml
@@ -15,10 +15,11 @@ debug:
     device: cpu
     compile: False
     compile_mode: default
-    num_envs: 1
-    envs_per_worker: 1
-    envs_per_batch: 1
-    batch_size: 16
+    num_envs: 2
+    num_workers: 1
+    env_batch_size: 16
+    env_pool: True
+    zero_copy: False
     batch_rows: 4
     bptt_horizon: 2
     total_timesteps: 100_000_000
@@ -73,6 +74,7 @@ train:
   float32_matmul_precision: "high"
   total_timesteps: 100_000_000_000
   batch_size: 65536 
+  minibatch_size: 32768
   learning_rate: 2.0e-4
   anneal_lr: False
   gamma: 0.998
@@ -91,9 +93,10 @@ train:
   vf_clip_coef: 0.1
 
   num_envs: 96
-  envs_per_worker: 1
-  envs_per_batch: 32
+  num_workers: 96
+  env_batch_size: 32
   env_pool: True
+  zero_copy: False
 
   verbose: True
   data_dir: runs
@@ -104,6 +107,7 @@ train:
   cpu_offload: True
   pool_kernel: [0]
   load_optimizer_state: False
+  use_rnn: True
 
   # swarm_frequency: 500
   # swarm_keep_pct: .8
@@ -126,6 +130,7 @@ wrappers:
         forgetting_frequency: 10
     - exploration.OnResetExplorationWrapper:
         full_reset_frequency: 1
+        jitter: 0
 
   finite_coords:
     - stream_wrapper.StreamWrapper:
@@ -224,9 +229,10 @@ policies:
     policy:
       hidden_size: 512
 
-    recurrent:
+    rnn:
       # Assumed to be in the same module as the policy
-      name: RecurrentMultiConvolutionalWrapper
-      input_size: 512
-      hidden_size: 512
-      num_layers: 1
+      name: MultiConvolutionalRNN
+      args:
+        input_size: 512
+        hidden_size: 512
+        num_layers: 1
diff --git a/pokemonred_puffer/c_gae.pyx b/pokemonred_puffer/c_gae.pyx
@@ -0,0 +1,33 @@
+# distutils: define_macros=NPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION
+# cython: language_level=3
+# cython: boundscheck=False
+# cython: initializedcheck=False
+# cython: wraparound=False
+# cython: nonecheck=False
+
+import numpy as np
+cimport numpy as cnp
+
+def compute_gae(cnp.ndarray dones, cnp.ndarray values,
+        cnp.ndarray rewards, float gamma, float gae_lambda):
+    '''Fast Cython implementation of Generalized Advantage Estimation (GAE); returns a float32 array of len(rewards) advantages.'''
+    cdef int num_steps = len(rewards)  # rewards sets the length; boundscheck is off file-wide, so shorter dones/values are not caught
+    cdef cnp.ndarray advantages = np.zeros(num_steps, dtype=np.float32)
+    cdef float[:] c_advantages = advantages  # view onto the result: writes below land in the returned array
+    cdef float[:] c_dones = dones  # float[:] views accept only 1-D C-float (float32) buffers
+    cdef float[:] c_values = values
+    cdef float[:] c_rewards = rewards
+
+    cdef float lastgaelam = 0  # running advantage carried from step t_next back to step t_cur
+    cdef float nextnonterminal, delta
+    cdef int t, t_cur, t_next
+    for t in range(num_steps-1):  # walks backward: t_cur runs from num_steps - 2 down to 0
+        t_cur = num_steps - 2 - t
+        t_next = num_steps - 1 - t
+        nextnonterminal = 1.0 - c_dones[t_next]  # 0.0 when dones[t_next] == 1: drops the bootstrap value and the carried advantage
+        delta = c_rewards[t_next] + gamma * c_values[t_next] * nextnonterminal - c_values[t_cur]  # note: the reward is read at t_next, not t_cur
+        lastgaelam = delta + gamma * gae_lambda * nextnonterminal * lastgaelam
+        c_advantages[t_cur] = lastgaelam
+
+    return advantages  # advantages[num_steps - 1] is never written by the loop, so it stays 0
+