diff --git a/src/layers.ts b/src/layers.ts
index b3edd24..9f80795 100644
--- a/src/layers.ts
+++ b/src/layers.ts
@@ -92,24 +92,22 @@ export class Module implements ModuleInterface {
 
 // Standard Layers:
 
-/**
- * Simple linear layer, with weight matrix and optional bias. Does not contain nonlinearity.
- *
- * @param {number} in_size - size of the last dimention of the input array.
- * @param {number} out_size - size of the last dimention of the output array.
- * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
- * @param {boolean} bias - wether to include a bias term.
- * @param {boolean} xavier - Wether to use xavier initialization (divide by square root of first input dimension).
- */
 export class Linear extends Module {
   public W: Tensor;
   public b: Tensor;
   public has_bias: boolean;
-
+  /**
+   * Simple linear layer, with weight matrix and optional bias. Does not contain nonlinearity.
+   *
+   * @param {number} in_size - size of the last dimension of the input array.
+   * @param {number} out_size - size of the last dimension of the output array.
+   * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
+   * @param {boolean} bias - whether to include a bias term.
+   * @param {boolean} xavier - whether to use Xavier initialization (divide by the square root of the first input dimension).
+   */
   constructor(in_size: number, out_size: number, device = 'cpu', bias = true, xavier = true) {
     super();
-    this.W = randn([in_size, out_size], true, xavier);
-    this.W.device = device;
+    this.W = randn([in_size, out_size], true, device, xavier);
     this.b = zeros([out_size], true);
     this.has_bias = bias;
   }
@@ -128,16 +126,6 @@ export class Linear extends Module {
   }
 }
 
-/**
- * Full transformer Layer implementation.
- *
- * @param {number} in_size - size of the last dimention of the input array.
- * @param {number} out_size - size of the last dimention of the output array.
- * @param {number} n_heads - number of parallel heads to be computed (must equally divide in_size).
- * @param {number} n_timesteps - length of text sequence to be processed bt Transformer.
- * @param {number} dropout_prob - probability of zeroing each activation in dropout Layer.
- * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
- */
 export class MultiHeadSelfAttention extends Module {
   public Wk: Linear;
   public Wq: Linear;
@@ -149,6 +137,16 @@ export class MultiHeadSelfAttention extends Module {
   public softmax: Softmax;
   public H: number;
 
+  /**
+   * Multi Head Self Attention layer implementation.
+   *
+   * @param {number} in_size - size of the last dimension of the input array.
+   * @param {number} out_size - size of the last dimension of the output array.
+   * @param {number} n_heads - number of parallel heads to be computed (must equally divide in_size).
+   * @param {number} n_timesteps - length of the text sequence to be processed by the Transformer.
+   * @param {number} dropout_prob - probability of zeroing each activation in the dropout Layer.
+   * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
+   */
   constructor(
     in_size: number,
     out_size: number,
@@ -221,21 +219,20 @@ export class MultiHeadSelfAttention extends Module {
   }
 }
 
-/**
- * Small block composed of two Linear layers, a ReLU non-linearity and a Dropout layer.
- *
- * @param {number} in_size - size of the last dimention of the input array.
- * @param {number} out_size - size of the last dimention of the output array.
- * @param {number} dropout_prob - probability of zeroing each activation in dropout Layer.
- * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
- * @param {boolean} bias - wether to include a bias term.
- */
 export class FullyConnected extends Module {
   public l1: Linear;
   public relu: ReLU;
   public l2: Linear;
   public dropout: Dropout;
-
+  /**
+   * Small block composed of two Linear layers, a ReLU non-linearity and a Dropout layer.
+   *
+   * @param {number} in_size - size of the last dimension of the input array.
+   * @param {number} out_size - size of the last dimension of the output array.
+   * @param {number} dropout_prob - probability of zeroing each activation in the dropout Layer.
+   * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
+   * @param {boolean} bias - whether to include a bias term.
+   */
   constructor(in_size: number, out_size: number, dropout_prob = 0, device: string = 'cpu', bias: boolean = true) {
     super();
 
@@ -259,22 +256,22 @@ export class FullyConnected extends Module {
   }
 }
 
-/**
- * Full transformer decoder block. Composed of Multi Head Self Attention, Fully connected layers and Layer Norms.
- *
- * @param {number} in_size - size of the last dimention of the input array.
- * @param {number} out_size - size of the last dimention of the output array.
- * @param {number} n_heads - number of parallel heads to be computed (must equally divide in_size).
- * @param {number} n_timesteps - length of text sequence to be processed bt Transformer.
- * @param {number} dropout_prob - probability of zeroing each activation in dropout Layer.
- * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
- */
 export class Block extends Module {
   public att: MultiHeadSelfAttention;
   public ln1: LayerNorm;
   public fcc: FullyConnected;
   public ln2: LayerNorm;
 
+  /**
+   * Full transformer decoder block. Composed of Multi Head Self Attention, Fully Connected layers and Layer Norms.
+   *
+   * @param {number} in_size - size of the last dimension of the input array.
+   * @param {number} out_size - size of the last dimension of the output array.
+   * @param {number} n_heads - number of parallel heads to be computed (must equally divide in_size).
+   * @param {number} n_timesteps - length of the text sequence to be processed by the Transformer.
+   * @param {number} dropout_prob - probability of zeroing each activation in the dropout Layer.
+   * @param {string} device - Device to perform Tensor operations. Either "gpu" or "cpu".
+   */
   constructor(
     in_size: number,
     out_size: number,
@@ -313,18 +310,18 @@ export class Block extends Module {
 
 // Embedding Layers
 
-/**
- * Embedding class, turns indexes into vectors.
- *
- * @param {number} in_size - number of different indexes (vocabulary size).
- * @param {number} out_size - size of the embedding vector generated.
- */
 export class Embedding extends Module {
   public E: Tensor;
 
+  /**
+   * Embedding class, turns indexes into vectors.
+   *
+   * @param {number} in_size - number of different indexes (vocabulary size).
+   * @param {number} embed_size - size of the embedding vector generated.
+   */
   constructor(in_size: number, embed_size: number) {
     super();
-    this.E = randn([in_size, embed_size], true, false);
+    this.E = randn([in_size, embed_size], true, 'cpu', false);
   }
 
   /**
@@ -345,18 +342,18 @@ export class Embedding extends Module {
   }
 }
 
-/**
- * Embedding class, turns indexes into vectors.
- *
- * @param {number} n_timesteps - number of different embeddings (number of timesteps in each instance in batch).
- * @param {number} embed_size - size of the embedding vector generated.
- */
 export class PositionalEmbedding extends Module {
   public E: Tensor;
 
+  /**
+   * Positional Embedding class, turns timestep indexes into vectors.
+   *
+   * @param {number} n_timesteps - number of different embeddings (number of timesteps in each instance in batch).
+   * @param {number} embed_size - size of the embedding vector generated.
+   */
   constructor(n_timesteps: number, embed_size: number) {
     super();
-    this.E = randn([n_timesteps, embed_size], true, false);
+    this.E = randn([n_timesteps, embed_size], true, 'cpu', false);
  }
 
   /**
@@ -376,10 +373,10 @@ export class PositionalEmbedding extends Module {
 
 // Non-linearity Layers:
 
-/**
- * Rectified Linear Unit nonlinearity. Returns z if z>0 else 0.
- */
 export class ReLU extends Module {
+  /**
+   * Rectified Linear Unit nonlinearity. Returns z if z > 0, else 0.
+   */
   constructor() {
     super();
   }
@@ -413,10 +410,10 @@ export class ReLU extends Module {
   }
 }
 
-/**
- * Softmax nonlinearity class. Returns distribution of values (sum=1).
- */
 export class Softmax extends Module {
+  /**
+   * Softmax nonlinearity class. Returns a distribution of values (sum = 1).
+   */
   constructor() {
     super();
   }
@@ -436,14 +433,14 @@ export class Softmax extends Module {
 
 // Regularization Layers:
 
-/**
- * Dropout class, added usually after other layers, to drop values to zero with given probability
- *
- * @param {number} drop_prob - probability to drop each value in input.
- */
 export class Dropout extends Module {
   public p: number;
 
+  /**
+   * Dropout class, usually added after other layers to drop values to zero with a given probability.
+   *
+   * @param {number} drop_prob - probability to drop each value in the input.
+   */
   constructor(drop_prob: number) {
     super();
     this.p = drop_prob;
@@ -473,15 +470,15 @@ export class Dropout extends Module {
   }
 }
 
-/**
- * Layer Norm class, added usually after other layers to normalize across all of the output.
- *
- * @param {number} n_embed - size of the last dimention of the input.
- */
 export class LayerNorm extends Module {
   public gamma: Tensor;
   public beta: Tensor;
 
+  /**
+   * Layer Norm class, usually added after other layers to normalize across all of the output.
+   *
+   * @param {number} n_embed - size of the last dimension of the input.
+   */
   constructor(n_embed: number) {
     super();
     this.gamma = ones([n_embed], true);
@@ -498,10 +495,10 @@ export class LayerNorm extends Module {
 
 // Loss layers:
 
-/**
- * Cross Entropy Loss class, returns the loss given the output and the expected indexes.
- */
 export class CrossEntropyLoss extends Module {
+  /**
+   * Cross Entropy Loss class, returns the loss given the output and the expected indexes.
+   */
   constructor() {
     super();
   }
diff --git a/src/optim.ts b/src/optim.ts
index 9687d5a..37aeb90 100644
--- a/src/optim.ts
+++ b/src/optim.ts
@@ -1,13 +1,5 @@
 import { Parameter, Tensor, zeros } from "./tensor";
 
-/**
- * Adam optimizer class.
- * @param {(Parameter | Tensor)[]} params - List of all Parameter or Tensor (with requires_grad = True) to be optimized by Adam. "params" is usually set to nn.Module.parameters(), which automatically returns all parameters in a list form.
- * @param {number} lr - Scalar multiplying each learning step, controls speed of learning.
- * @param {number} reg - Scalar controling strength l2 regularization.
- * @param {(number)[]} betas - Two scalar floats controling how slowly the optimizer changes the "m" and "v" attributes.
- * @param {number} eps - Scalar added to denominator to stop it from ever going to zero.
- */
 export class Adam {
   // Declare Adam's types:
   params: (Parameter | Tensor)[];
@@ -16,7 +8,15 @@ export class Adam {
   b1: number;
   b2: number;
   eps: number;
-
+
+  /**
+   * Adam optimizer class.
+   * @param {(Parameter | Tensor)[]} params - List of all Parameter or Tensor (with requires_grad = true) objects to be optimized by Adam. "params" is usually set to nn.Module.parameters(), which automatically returns all parameters in a list.
+   * @param {number} lr - Scalar multiplying each learning step, controls the speed of learning.
+   * @param {number} reg - Scalar controlling the strength of L2 regularization.
+   * @param {(number)[]} betas - Two scalar floats controlling how slowly the optimizer changes the "m" and "v" attributes.
+   * @param {number} eps - Scalar added to the denominator to stop it from ever reaching zero.
+   */
   constructor(
     params: (Parameter | Tensor)[],
     lr = 1e-3,
diff --git a/src/tensor.ts b/src/tensor.ts
index e19bcf6..698d5e2 100644
--- a/src/tensor.ts
+++ b/src/tensor.ts
@@ -398,11 +398,11 @@ export class Tensor {
 
 // <<< Parameter class, tensor that always tracks gradients >>> //
 
-/**
- * Creates new Parameter (an instance of the Tensor class that always tracks gradients).
- * @param {object} data - Iterable containing the data to be stored in the Tensor.
- */
 export class Parameter extends Tensor {
+  /**
+   * Creates a new Parameter (an instance of the Tensor class that always tracks gradients).
+   * @param {object} data - Iterable containing the data to be stored in the Tensor.
+   */
   constructor(data: Array<any> | number) {
     super(data, true);
   }
@@ -681,8 +681,6 @@ class MatMul {
     }
   }
 }
-// ================================ NUEVO ================================ //
-// ================================ NUEVO ================================ //
 
 export class Pow {
   cache: any;
@@ -1956,8 +1954,8 @@ export function rand(shape: Array<number>, requires_grad = false, device = 'cpu'
 export function randn(
   shape: Array<number>,
   requires_grad = false,
-  xavier = false,
-  device = 'cpu'
+  device = 'cpu',
+  xavier = false
 ): Tensor {
   return new Tensor(
     _tensorInitializer(shape, () => {
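Call-site impact of the randn reorder above, as a minimal sketch only (not part of the diff; the import paths and example sizes are illustrative assumptions for code living alongside src/):

    import { randn } from "./tensor";
    import { Linear } from "./layers";

    // randn(shape, requires_grad, device, xavier): device now comes before xavier.
    const W = randn([64, 128], true, 'cpu', true);

    // Linear's public constructor (in_size, out_size, device, bias, xavier) is unchanged;
    // only its internal randn call was updated to the new argument order.
    const layer = new Linear(64, 128, 'cpu', true, true);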