forget about triton, optimize for clarity and education
lucidrains committed Apr 4, 2022
1 parent f5332eb commit 0832087
Showing 3 changed files with 4 additions and 9 deletions.
4 changes: 0 additions & 4 deletions README.md
````diff
@@ -29,10 +29,6 @@
 tokens = torch.randint(0, 20000, (1, 2048))
 logits = palm(tokens) # (1, 2048, 20000)
 ```
 
-## Todo
-
-- [ ] use Triton to add bias-less Layernorm + stable softmax
-
 ## Citations
 
 ```bibtex
````
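For context, the README instantiates the model just above the lines shown in this hunk. A minimal runnable sketch is below; the `PaLM(...)` hyperparameter values are illustrative assumptions, since the diff only pins down the vocab size of 20000 and the `(1, 2048)` token shape.

```python
# illustrative usage sketch -- the PaLM(...) hyperparameters below are
# assumptions for demonstration; only num_tokens = 20000 and the
# (1, 2048) token shape are fixed by the README lines in the diff
import torch
from palm_pytorch import PaLM

palm = PaLM(
    num_tokens = 20000, # vocab size, matches the torch.randint range below
    dim = 512,          # model width (assumed)
    depth = 12,         # number of layers (assumed)
    heads = 8,          # attention heads (assumed)
    dim_head = 64       # dimension per head (assumed)
)

tokens = torch.randint(0, 20000, (1, 2048))
logits = palm(tokens) # (1, 2048, 20000)
```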
7 changes: 3 additions & 4 deletions palm_pytorch/palm_pytorch.py
```diff
@@ -12,12 +12,11 @@ class LayerNorm(nn.Module):
     def __init__(self, dim, eps = 1e-5):
         super().__init__()
         self.eps = eps
-        self.g = nn.Parameter(torch.ones(dim))
+        self.gamma = nn.Parameter(torch.ones(dim))
+        self.register_buffer('beta', torch.zeros(dim))
 
     def forward(self, x):
-        var = torch.var(x, dim = -1, unbiased = False, keepdim = True)
-        mean = torch.mean(x, dim = -1, keepdim = True)
-        return (x - mean) / (var + self.eps).sqrt() * self.g
+        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
 
 # parallel with residual
 # discovered by Wang et al + EleutherAI from GPT-J fame
```
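The net effect is a bias-less LayerNorm: `gamma` is learned while `beta` stays a frozen zero buffer, so the fused `F.layer_norm` call never applies a trainable bias, replacing the hand-rolled mean/variance computation. A minimal self-contained sketch follows, with an equivalence check against the removed manual code; the check itself is illustrative and not part of the repository.

```python
import torch
import torch.nn.functional as F
from torch import nn

class LayerNorm(nn.Module):
    def __init__(self, dim, eps = 1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(dim))     # learnable scale
        self.register_buffer('beta', torch.zeros(dim)) # frozen zero bias, never trained

    def forward(self, x):
        # fused layer norm; beta is all zeros, so no bias is ever applied
        return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)

# illustrative check: the fused call matches the manual computation it replaced
x = torch.randn(2, 16, 512)
norm = LayerNorm(512)

var = torch.var(x, dim = -1, unbiased = False, keepdim = True)
mean = torch.mean(x, dim = -1, keepdim = True)
manual = (x - mean) / (var + norm.eps).sqrt() * norm.gamma

assert torch.allclose(norm(x), manual, atol = 1e-6)
```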
2 changes: 1 addition & 1 deletion setup.py
```diff
@@ -3,7 +3,7 @@
 setup(
   name = 'PaLM-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.4',
+  version = '0.0.5',
   license='MIT',
   description = 'PaLM: Scaling Language Modeling with Pathways - Pytorch',
   author = 'Phil Wang',
```
