Add demo for comparison with libtorch (and pytorch)
vaithak committed Jul 23, 2024
1 parent f03fc98 commit 6c6dbbf
Showing 4 changed files with 316 additions and 0 deletions.
10 changes: 10 additions & 0 deletions demos/PytorchComparison/CMakeLists.txt
@@ -0,0 +1,10 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(torch-basic-nn)

find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
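# Note: find_package(Torch) exports TORCH_LIBRARIES and TORCH_CXX_FLAGS
# (notably the C++ ABI setting the prebuilt libtorch was built with), which
# are consumed below.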

add_executable(torch-basic-nn torch-basic-nn.cpp)
target_link_libraries(torch-basic-nn "${TORCH_LIBRARIES}")
set_property(TARGET torch-basic-nn PROPERTY CXX_STANDARD 17)

153 changes: 153 additions & 0 deletions demos/PytorchComparison/clad-basic-nn.cpp
@@ -0,0 +1,153 @@
#include <chrono>
#include <iostream>
#include <random>
#include "clad/Differentiator/Differentiator.h"

/*
The code below implements a simple neural network with 3 hidden nodes and 1
output node. The neural network is trained using gradient descent to minimize
the mean squared error loss. The code uses Clad to automatically generate the
gradient computation code for the loss function.
To compile the code, use the following command (the -DBENCHMARK_MATRIX_MULTIPLY
flag is optional and enables the matrix-multiplication benchmark):
clang++ -std=c++14 -I<path-to-clad-include-dir> -fplugin=<path-to-clad.so>
clad-basic-nn.cpp -DBENCHMARK_MATRIX_MULTIPLY
*/

#define M 100 // num of data points
#define N 100 // num of features
#define N_ITER 5000 // num of iterations

void relu(double* arr, unsigned int n) {
for (unsigned int i = 0; i < n; ++i)
arr[i] = (arr[i] > 0) ? arr[i] : 0;
}

// Create a model of a neural network with 3 hidden nodes and 1 output node.
// It takes in input data x and weight vectors for each hidden node and the
// output node.
double model_fn(double x[N], // input data
double w1[N], // weight vector for hidden node 1
double w2[N], // weight vector for hidden node 2
double w3[N], // weight vector for hidden node 3
double w4[3] // weight vector for output node
) {
// Layer 1 - matrix multiplication
double h1 = 0, h2 = 0, h3 = 0;
for (int i = 0; i < N; ++i) {
h1 += x[i] * w1[i];
h2 += x[i] * w2[i];
h3 += x[i] * w3[i];
}
// Computing non-linear activation function
double h[3] = {h1, h2, h3};
relu(h, 3);

// Layer 2 - matrix multiplication
double o = 0;
for (int i = 0; i < 3; ++i)
o += h[i] * w4[i];
return o;
}

double loss_fn(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3]) {
double loss = 0;
for (int i = 0; i < M; ++i) {
double y_hat = model_fn(x + i * N, w1, w2, w3, w4);
loss += (y_hat - y[i]) * (y_hat - y[i]);
}
return loss / M;
}

// Perform a single gradient descent step.
// x and y form the generated dataset, w1-w4 are the weight vectors, and
// clad_grad is the gradient function generated by Clad.
template <typename T>
void performStep(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3], T clad_grad,
double learning_rate = 1e-2) {
// Gradient computation: execute() runs the Clad-generated gradient and fills
// grad_w1..grad_w4 with the partial derivatives of the loss with respect to
// each weight vector.
double grad_w1[N], grad_w2[N], grad_w3[N], grad_w4[3];
clad_grad.execute(x, y, w1, w2, w3, w4, grad_w1, grad_w2, grad_w3, grad_w4);

// Update weights for hidden nodes
for (int i = 0; i < N; ++i) {
w1[i] -= learning_rate * grad_w1[i];
w2[i] -= learning_rate * grad_w2[i];
w3[i] -= learning_rate * grad_w3[i];
}

// Update weights for output node
for (int i = 0; i < 3; ++i)
w4[i] -= learning_rate * grad_w4[i];
}

// Perform gradient descent to optimize the weights.
void optimize(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3], double lr, int num_iters) {
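// clad::gradient asks Clad (at compile time, via the plugin) to generate the
// reverse-mode gradient of loss_fn with respect to the listed parameters;
// the returned function object is created once and reused for every step.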
auto grad = clad::gradient(loss_fn, "w1,w2,w3,w4");
for (int i = 0; i < num_iters; ++i)
performStep(x, y, w1, w2, w3, w4, grad, lr);
}

// Benchmark the time for 100 matrix multiplications.
void benchmark_matrix_multiply(double x[M * N], double y[M]) {
double z[M * N];
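// x is treated as an M x N matrix with M == N, so the square x * x is
// well-defined; z holds the M x N product.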
auto start_bench = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; ++i) {
for (int j = 0; j < M; ++j) {
for (int k = 0; k < N; ++k) {
z[j * N + k] = 0;
for (int l = 0; l < N; ++l)
z[j * N + k] += x[j * N + l] * x[l * N + k];
}
}
}
auto end_bench = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
std::cout << "Time taken for 100 matrix multiplications: "
<< elapsed_bench.count() << " seconds" << std::endl;
}

int main() {
// Initialize random number generator.
std::mt19937 gen(42);

// Generate random input data and labels.
double x[M * N], y[M];
std::uniform_real_distribution<double> dist(0, 1);
for (int i = 0; i < M * N; ++i)
x[i] = dist(gen);
for (int i = 0; i < M; ++i)
y[i] = dist(gen);

#ifdef BENCHMARK_MATRIX_MULTIPLY
benchmark_matrix_multiply(x, y);
#endif

// Initialize weights from a random distribution.
double w1[N], w2[N], w3[N], w4[3];
for (int i = 0; i < N; ++i) {
w1[i] = dist(gen);
w2[i] = dist(gen);
w3[i] = dist(gen);
}
for (int i = 0; i < 3; ++i)
w4[i] = dist(gen);

// Calculate the loss before optimization.
double loss = loss_fn(x, y, w1, w2, w3, w4);
std::cout << "Initial loss before optimization: " << loss << std::endl;

// Optimize the weights and compute the time taken.
auto start = std::chrono::high_resolution_clock::now();
optimize(x, y, w1, w2, w3, w4, 1e-2, N_ITER);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;

// Calculate the loss after optimization.
loss = loss_fn(x, y, w1, w2, w3, w4);
std::cout << "Final loss after optimization: " << loss << std::endl;
std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
return 0;
}
85 changes: 85 additions & 0 deletions demos/PytorchComparison/torch-basic-nn.cpp
@@ -0,0 +1,85 @@
// Do the same thing as clad-basic-nn.cpp, but using libtorch instead of
// Clad.

/*
To build and run
1. Create a build directory: mkdir build
2. Change to the build directory: cd build
3. Run cmake: cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
4. Run make: make
5. Run the executable: ./torch-basic-nn
*/

#include <chrono>
#include <iostream>
#include <random>
#include <torch/torch.h>

#define M 100 // num of data points
#define N 100 // num of features
#define N_ITER 5000 // num of iterations

// C++ neural network model defined with value semantics
struct TwoLayerNet : torch::nn::Module {
// Constructor with submodules registered in the initializer list so that
// their parameters are exposed through parameters().
TwoLayerNet(int64_t D_in, int64_t D_out, int64_t H)
: linear1(register_module("linear1", torch::nn::Linear(D_in, H))),
linear2(register_module("linear2", torch::nn::Linear(H, D_out))) {}

torch::Tensor forward(torch::Tensor x) {
x = torch::relu(linear1->forward(x));
x = linear2->forward(x);
return x;
}
torch::nn::Linear linear1;
torch::nn::Linear linear2;
};

int main() {
// Generate random data
torch::Tensor x = torch::randn({M, N});
torch::Tensor y = torch::randn({M, 1});

// Benchmark the time for 100 matrix multiplications
torch::Tensor z;
auto start_bench = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; ++i)
z = torch::matmul(x, x);
auto end_bench = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
std::cout << "Time taken for 100 matrix multiplications: "
<< elapsed_bench.count() << " seconds" << std::endl;

// Construct the model: N inputs, 1 output, 3 hidden units.
TwoLayerNet model(N, /*D_out=*/1, /*H=*/3);

// Create the optimizer and loss function
torch::optim::SGD optimizer(model.parameters(), 0.01);
torch::nn::MSELoss loss_fn;
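// model.parameters() hands the registered weights and biases of both Linear
// layers to the optimizer; MSELoss (mean reduction) matches the
// sum-of-squares / M loss used in the Clad version.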

// Calculate the loss before optimization
torch::Tensor output = model.forward(x);
std::cout << "Initial loss before optimization: "
<< loss_fn(output, y).item<double>() << std::endl;

// Optimize: each iteration clears the accumulated gradients, runs the
// forward pass, computes the MSE loss, backpropagates, and applies an SGD
// update.
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N_ITER; ++i) {
optimizer.zero_grad();
torch::Tensor output = model.forward(x);
torch::Tensor loss = loss_fn(output, y);
loss.backward();
optimizer.step();
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;

// Calculate the loss after optimization
output = model.forward(x);
std::cout << "Final loss after optimization: "
<< loss_fn(output, y).item<double>() << std::endl;
std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
return 0;
}
68 changes: 68 additions & 0 deletions demos/PytorchComparison/torch-basic-nn.py
@@ -0,0 +1,68 @@
# Do the same as torch-basic-nn.cpp but in Python using PyTorch
# To run: python torch-basic-nn.py

import time

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

M = 100 # Number of samples
N = 100 # Number of features
N_ITER = 5000 # Number of iterations

# Create a simple neural network with 3 hidden nodes and 1 output node
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(N, 3)
self.fc2 = nn.Linear(3, 1)
def forward(self, x):
x = torch.relu(self.fc1(x))
x = self.fc2(x)
return x

def main():
# Create a simple dataset
x = np.random.randn(M, N).astype(np.float32)
y = np.random.randn(M, 1).astype(np.float32)

# Convert the dataset to PyTorch tensors
x = torch.from_numpy(x)
y = torch.from_numpy(y)

# Create the neural network
net = Net()

# Create the loss function and the optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)
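# SGD with lr=0.01 and mean-squared-error loss mirror the setup in the
# Clad and libtorch versions above.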

# Print the initial loss
output = net(x)
loss = criterion(output, y)
print("Initial loss: ", loss.item())

# Start the timer
start = time.time()

# Train the neural network
for i in range(N_ITER):
optimizer.zero_grad()
output = net(x)
loss = criterion(output, y)
loss.backward()
optimizer.step()

# Stop the timer
end = time.time()

# Print the final loss
output = net(x)
loss = criterion(output, y)
print("Final loss: ", loss.item())
print("Time: ", end - start, " seconds")


if __name__ == "__main__":
main()
