
Commit

0.6
jsuarez5341 committed Jan 13, 2024
1 parent 09e69bb commit b7d2384
Showing 6 changed files with 50 additions and 6 deletions.
11 changes: 10 additions & 1 deletion pufferlib/environments/ocean/bandit.py
@@ -13,7 +13,16 @@ def init(self,
):
'''Pufferlib Bandit environment
Simulates a variety of classic bandit problems
Simulates a classic multi-armed bandit problem.
Observation space: Box(0, 1, (1,)). The observation is always 1.
Action space: Discrete(num_actions). Which arm to pull.
Args:
num_actions: The number of bandit arms
reward_scale: The scale of the reward
reward_noise: The standard deviation of the reward signal
hard_fixed_seed: All instances of the environment should share the same seed.
'''
return namespace(self,
num_actions=num_actions,
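A minimal sketch of the kind of reward generation this docstring describes may help; the helper name sample_bandit_reward, the single seed-chosen best arm, and the 0/reward_scale payouts are illustrative assumptions, not pufferlib's actual implementation.

import numpy as np

def sample_bandit_reward(action, num_actions=4, reward_scale=1.0,
                         reward_noise=0.0, hard_fixed_seed=42):
    # Hypothetical sketch: the fixed seed picks one best arm, pulling it
    # pays reward_scale, and every pull is corrupted by Gaussian noise
    # with standard deviation reward_noise (when reward_noise > 0).
    rng = np.random.default_rng(hard_fixed_seed)
    best_arm = rng.integers(num_actions)
    noise = np.random.normal(0.0, reward_noise) if reward_noise > 0 else 0.0
    return (reward_scale if action == best_arm else 0.0) + noise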
11 changes: 10 additions & 1 deletion pufferlib/environments/ocean/memory.py
@@ -11,7 +11,16 @@ def init(self,
):
'''Pufferlib Memory environment
Repeat the provided sequence back
Repeat the observed sequence after a delay. The sequence is randomly generated on every reset. This is a test of memory length and capacity; it also starts to require credit assignment if the sequence is long enough.
The sequence is presented one digit at a time, followed by a string of 0s. The agent should output 0s for the first mem_length + mem_delay steps, then output the sequence.
Observation space: Box(0, 1, (1,)). The current digit.
Action space: Discrete(2). Your guess for the next digit.
Args:
mem_length: The length of the sequence
mem_delay: The number of 0s between the sequence and the agent's response
'''
return namespace(self,
mem_length=mem_length,
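To make the schedule concrete, the sketch below lays out the observation stream and the actions a perfect agent would emit; the helper name, the binary digits, and the 2 * mem_length + mem_delay horizon are illustrative assumptions rather than the environment's code.

import numpy as np

def memory_episode_schedule(mem_length=2, mem_delay=2, seed=None):
    # Hypothetical sketch: the sequence is shown one digit at a time and
    # then zeros; an oracle agent answers 0 for the first
    # mem_length + mem_delay steps and then repeats the sequence.
    rng = np.random.default_rng(seed)
    sequence = rng.integers(0, 2, size=mem_length)
    horizon = 2 * mem_length + mem_delay
    observations = np.zeros(horizon, dtype=int)
    observations[:mem_length] = sequence
    oracle_actions = np.zeros(horizon, dtype=int)
    oracle_actions[mem_length + mem_delay:] = sequence
    return observations, oracle_actions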
11 changes: 10 additions & 1 deletion pufferlib/environments/ocean/password.py
@@ -8,7 +8,16 @@
def init(self, password_length=5, hard_fixed_seed=42):
'''Pufferlib Password environment
Guess the password. This is a test for reward sparsity.
Guess the password, which is a static binary string. Your policy has to
avoid collapsing to a deterministic output before it happens upon the reward,
and it also has to latch onto the reward within a few instances of receiving it.
Observation space: Box(0, 1, (password_length,)). A binary vector containing your guesses so far, so that the environment will be solvable without memory.
Action space: Discrete(2). Your guess for the next digit.
Args:
password_length: The number of binary digits in the password.
hard_fixed_seed: A fixed seed for the environment. It should be the same for all instances. This environment does not make sense when randomly generated.
'''
return namespace(self,
password_length=password_length,
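To illustrate the sparsity, here is a sketch of the terminal reward check the docstring implies; the helper name and the 0/1 reward value are assumptions, and only the exact match of the full binary string comes from the text above.

import numpy as np

def password_reward(guesses, password):
    # Hypothetical sketch: no reward until every digit has been guessed,
    # then 1 only if the guesses exactly match the hidden binary password.
    guesses = np.asarray(guesses)
    password = np.asarray(password)
    if len(guesses) < len(password):
        return 0.0
    return float(np.array_equal(guesses[:len(password)], password))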
10 changes: 9 additions & 1 deletion pufferlib/environments/ocean/squared.py
@@ -18,13 +18,21 @@ def init(self,
distance_to_target=1,
num_targets=-1,
):
'''Pufferlib Diamond environment
'''Pufferlib Squared environment
Agent starts at the center of a square grid.
Targets are placed on the perimeter of the grid.
Reward is 1 minus the L-inf distance to the closest target.
This means that reward varies from -1 to 1.
Reward is not given for targets that have already been hit.
Observation space: Box(-1, 1, (grid_size, grid_size)). The map.
Action space: Discrete(8). Which direction to move.
Args:
distance_to_target: The distance from the center to the closest target.
num_targets: The number of targets to randomly generate.
'''
grid_size = 2 * distance_to_target + 1
if num_targets == -1:
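The reward rule above (1 minus the L-inf distance to the closest remaining target) can be written out directly; in the sketch below, normalizing the Chebyshev distance by distance_to_target so the value spans [-1, 1] and returning 0 once every target has been hit are assumptions for illustration.

import numpy as np

def squared_reward(agent_pos, remaining_targets, distance_to_target=1):
    # Hypothetical sketch: Chebyshev (L-inf) distance to the closest
    # target that has not been hit yet, normalized so reward lies in [-1, 1].
    if len(remaining_targets) == 0:
        return 0.0
    agent = np.asarray(agent_pos)
    targets = np.asarray(remaining_targets)  # (n, 2) grid coordinates
    dists = np.abs(targets - agent).max(axis=1)
    return 1.0 - dists.min() / distance_to_target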
11 changes: 10 additions & 1 deletion pufferlib/environments/ocean/stochastic.py
@@ -11,7 +11,16 @@ def init(self,
):
'''Pufferlib Stochastic environment
Rewarded for playing action 0 < p % of the time and action 1 < (1 - p) %
The optimal policy is to play action 0 p% of the time and action 1 (1 - p)% of the time.
This is a test of whether your algorithm can learn a nontrivial stochastic policy.
Do not use a policy with memory, as that will trivialize the problem.
Observation space: Box(0, 1, (1,)). The observation is always 0.
Action space: Discrete(2). Select action 0 or action 1.
Args:
p: The optimal probability for action 0
horizon: How often the environment should reset
'''
return namespace(self,
p=p,
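Since the docstring says the optimal memoryless policy mixes actions with probability p, a short sketch of how such a policy samples actions follows; the function name and the default p are placeholders, not values taken from the environment.

import numpy as np

def optimal_stochastic_policy(observation, p=0.75, rng=None):
    # Hypothetical sketch: ignore the constant observation and play
    # action 0 with probability p and action 1 with probability (1 - p),
    # matching the optimal memoryless policy described above.
    rng = np.random.default_rng() if rng is None else rng
    return 0 if rng.random() < p else 1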
2 changes: 1 addition & 1 deletion pufferlib/version.py
@@ -1 +1 @@
__version__ = '0.5.1'
__version__ = '0.6.0'

