diff --git a/the_annotated_transformer.py b/the_annotated_transformer.py
index 4aa1d46..5342fc3 100644
--- a/the_annotated_transformer.py
+++ b/the_annotated_transformer.py
@@ -433,7 +433,7 @@ def forward(self, x, memory, src_mask, tgt_mask):
 #
 # We also modify the self-attention sub-layer in the decoder stack to
 # prevent positions from attending to subsequent positions. This
-# masking, combined with fact that the output embeddings are offset by
+# masking, combined with the fact that the output embeddings are offset by
 # one position, ensures that the predictions for position $i$ can
 # depend only on the known outputs at positions less than $i$.
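
For context, a minimal sketch of the subsequent-position mask the comment describes. It mirrors the `subsequent_mask` helper assumed to exist elsewhere in the_annotated_transformer.py; the torch-based code below is illustrative only and is not part of this diff:

```python
# Illustrative sketch (not part of the diff): build the decoder
# self-attention mask that hides subsequent positions.
import torch


def subsequent_mask(size):
    "Return a (1, size, size) boolean mask over subsequent positions."
    attn_shape = (1, size, size)
    # Ones strictly above the diagonal mark the "future" positions to block.
    future = torch.triu(torch.ones(attn_shape), diagonal=1).type(torch.uint8)
    return future == 0  # True where attention is allowed


# Position i may attend only to positions <= i; combined with the
# one-position offset of the output embeddings, the prediction for
# position i then depends only on known outputs at positions < i.
print(subsequent_mask(3))
# tensor([[[ True, False, False],
#          [ True,  True, False],
#          [ True,  True,  True]]])
```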