Add demo for comparison with libtorch (and pytorch)
vaithak committed Jul 23, 2024
1 parent f03fc98 commit 6c6dbbf
Showing 4 changed files with 316 additions and 0 deletions.
10 changes: 10 additions & 0 deletions demos/PytorchComparison/CMakeLists.txt
@@ -0,0 +1,10 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
project(torch-basic-nn)

find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
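# Note: find_package(Torch) exports TORCH_LIBRARIES and TORCH_CXX_FLAGS
# (notably the C++ ABI setting the prebuilt libtorch was built with), which
# are consumed below.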

add_executable(torch-basic-nn torch-basic-nn.cpp)
target_link_libraries(torch-basic-nn "${TORCH_LIBRARIES}")
set_property(TARGET torch-basic-nn PROPERTY CXX_STANDARD 17)

153 changes: 153 additions & 0 deletions demos/PytorchComparison/clad-basic-nn.cpp
@@ -0,0 +1,153 @@
#include <chrono>
#include <iostream>
#include <random>
#include "clad/Differentiator/Differentiator.h"

/*
The code below implements a simple neural network with 3 hidden nodes and 1
output node. The neural network is trained using gradient descent to minimize
the mean squared error loss. The code uses Clad to automatically generate the
gradient computation code for the loss function.
To compile the code, use the following command (the -DBENCHMARK_MATRIX_MULTIPLY
flag is optional and enables the matrix-multiplication benchmark):
clang++ -std=c++14 -I<path-to-clad-include-dir> -fplugin=<path-to-clad.so>
clad-basic-nn.cpp -DBENCHMARK_MATRIX_MULTIPLY
*/

#define M 100 // num of data points
#define N 100 // num of features
#define N_ITER 5000 // num of iterations

void relu(double* arr, unsigned int n) {
for (unsigned int i = 0; i < n; ++i)
arr[i] = (arr[i] > 0) ? arr[i] : 0;
}

// Create a model of a neural network with 3 hidden nodes and 1 output node.
// It takes in input data x and weight vectors for each hidden node and the
// output node.
double model_fn(double x[N], // input data
double w1[N], // weight vector for hidden node 1
double w2[N], // weight vector for hidden node 2
double w3[N], // weight vector for hidden node 3
double w4[3] // weight vector for output node
) {
// Layer 1 - matrix multiplication
double h1 = 0, h2 = 0, h3 = 0;
for (int i = 0; i < N; ++i) {
h1 += x[i] * w1[i];
h2 += x[i] * w2[i];
h3 += x[i] * w3[i];
}
// Computing non-linear activation function
double h[3] = {h1, h2, h3};
relu(h, 3);

// Layer 2 - matrix multiplication
double o = 0;
for (int i = 0; i < 3; ++i)
o += h[i] * w4[i];
return o;
}

double loss_fn(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3]) {
double loss = 0;
for (int i = 0; i < M; ++i) {
double y_hat = model_fn(x + i * N, w1, w2, w3, w4);
loss += (y_hat - y[i]) * (y_hat - y[i]);
}
return loss / M;
}

// Perform a single gradient descent step.
// x and y form the generated dataset, w1-w4 are the weight vectors, and
// clad_grad is the gradient function generated by Clad.
template <typename T>
void performStep(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3], T clad_grad,
double learning_rate = 1e-2) {
// Gradient computation: execute() runs the Clad-generated gradient and fills
// grad_w1..grad_w4 with the partial derivatives of the loss with respect to
// each weight vector.
double grad_w1[N], grad_w2[N], grad_w3[N], grad_w4[3];
clad_grad.execute(x, y, w1, w2, w3, w4, grad_w1, grad_w2, grad_w3, grad_w4);

// Update weights for hidden nodes
for (int i = 0; i < N; ++i) {
w1[i] -= learning_rate * grad_w1[i];
w2[i] -= learning_rate * grad_w2[i];
w3[i] -= learning_rate * grad_w3[i];
}

// Update weights for output node
for (int i = 0; i < 3; ++i)
w4[i] -= learning_rate * grad_w4[i];
}

// Perform gradient descent to optimize the weights.
void optimize(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3], double lr, int num_iters) {
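// clad::gradient asks Clad (at compile time, via the plugin) to generate the
// reverse-mode gradient of loss_fn with respect to the listed parameters;
// the returned function object is created once and reused for every step.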
auto grad = clad::gradient(loss_fn, "w1,w2,w3,w4");
for (int i = 0; i < num_iters; ++i)
performStep(x, y, w1, w2, w3, w4, grad, lr);
}

// Benchmark the time for 100 matrix multiplications.
void benchmark_matrix_multiply(double x[M * N], double y[M]) {
double z[M * N];
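// x is treated as an M x N matrix with M == N, so the square x * x is
// well-defined; z holds the M x N product.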
auto start_bench = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; ++i) {
for (int j = 0; j < M; ++j) {
for (int k = 0; k < N; ++k) {
z[j * N + k] = 0;
for (int l = 0; l < N; ++l)
z[j * N + k] += x[j * N + l] * x[l * N + k];
}
}
}
auto end_bench = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
std::cout << "Time taken for 100 matrix multiplications: "
<< elapsed_bench.count() << " seconds" << std::endl;
}

int main() {
// Initialize random number generator.
std::mt19937 gen(42);

// Generate random input data and labels.
double x[M * N], y[M];
std::uniform_real_distribution<double> dist(0, 1);
for (int i = 0; i < M * N; ++i)
x[i] = dist(gen);
for (int i = 0; i < M; ++i)
y[i] = dist(gen);

#ifdef BENCHMARK_MATRIX_MULTIPLY
benchmark_matrix_multiply(x, y);
#endif

// Initialize weights from a random distribution.
double w1[N], w2[N], w3[N], w4[3];
for (int i = 0; i < N; ++i) {
w1[i] = dist(gen);
w2[i] = dist(gen);
w3[i] = dist(gen);
}
for (int i = 0; i < 3; ++i)
w4[i] = dist(gen);

// Calculate the loss before optimization.
double loss = loss_fn(x, y, w1, w2, w3, w4);
std::cout << "Initial loss before optimization: " << loss << std::endl;

// Optimize the weights and compute the time taken.
auto start = std::chrono::high_resolution_clock::now();
optimize(x, y, w1, w2, w3, w4, 1e-2, N_ITER);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;

// Calculate the loss after optimization.
loss = loss_fn(x, y, w1, w2, w3, w4);
std::cout << "Final loss after optimization: " << loss << std::endl;
std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
return 0;
}
85 changes: 85 additions & 0 deletions demos/PytorchComparison/torch-basic-nn.cpp
@@ -0,0 +1,85 @@
// Do the same thing as clad-basic-nn.cpp, but using libtorch instead of
// Clad.

/*
To build and run
1. Create a build directory: mkdir build
2. Change to the build directory: cd build
3. Run cmake: cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
4. Run make: make
5. Run the executable: ./torch-basic-nn
*/

#include <chrono>
#include <iostream>
#include <random>
#include <torch/torch.h>

#define M 100 // num of data points
#define N 100 // num of features
#define N_ITER 5000 // num of iterations

// C++ neural network model defined with value semantics
struct TwoLayerNet : torch::nn::Module {
// Constructor with submodules registered in the initializer list so that
// their parameters are exposed through parameters().
TwoLayerNet(int64_t D_in, int64_t D_out, int64_t H)
: linear1(register_module("linear1", torch::nn::Linear(D_in, H))),
linear2(register_module("linear2", torch::nn::Linear(H, D_out))) {}

torch::Tensor forward(torch::Tensor x) {
x = torch::relu(linear1->forward(x));
x = linear2->forward(x);
return x;
}
torch::nn::Linear linear1;
torch::nn::Linear linear2;
};

int main() {
// Generate random data
torch::Tensor x = torch::randn({M, N});
torch::Tensor y = torch::randn({M, 1});

// Benchmark the time for 100 matrix multiplications
torch::Tensor z;
auto start_bench = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; ++i)
z = torch::matmul(x, x);
auto end_bench = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
std::cout << "Time taken for 100 matrix multiplications: "
<< elapsed_bench.count() << " seconds" << std::endl;

// Construct the model: N inputs, 1 output, 3 hidden units.
TwoLayerNet model(N, /*D_out=*/1, /*H=*/3);

// Create the optimizer and loss function
torch::optim::SGD optimizer(model.parameters(), 0.01);
torch::nn::MSELoss loss_fn;
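// model.parameters() hands the registered weights and biases of both Linear
// layers to the optimizer; MSELoss (mean reduction) matches the
// sum-of-squares / M loss used in the Clad version.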

// Calculate the loss before optimization
torch::Tensor output = model.forward(x);
std::cout << "Initial loss before optimization: "
<< loss_fn(output, y).item<double>() << std::endl;

// Optimize: each iteration clears the accumulated gradients, runs the
// forward pass, computes the MSE loss, backpropagates, and applies an SGD
// update.
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N_ITER; ++i) {
optimizer.zero_grad();
torch::Tensor output = model.forward(x);
torch::Tensor loss = loss_fn(output, y);
loss.backward();
optimizer.step();
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;

// Calculate the loss after optimization
output = model.forward(x);
std::cout << "Final loss after optimization: "
<< loss_fn(output, y).item<double>() << std::endl;
std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
return 0;
}
68 changes: 68 additions & 0 deletions demos/PytorchComparison/torch-basic-nn.py
@@ -0,0 +1,68 @@
# Do the same as torch-basic-nn.cpp but in Python using PyTorch
# To run: python torch-basic-nn.py

import time

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

M = 100 # Number of samples
N = 100 # Number of features
N_ITER = 5000 # Number of iterations

# Create a simple neural network with 3 hidden nodes and 1 output node
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(N, 3)
self.fc2 = nn.Linear(3, 1)
def forward(self, x):
x = torch.relu(self.fc1(x))
x = self.fc2(x)
return x

def main():
# Create a simple dataset
x = np.random.randn(M, N).astype(np.float32)
y = np.random.randn(M, 1).astype(np.float32)

# Convert the dataset to PyTorch tensors
x = torch.from_numpy(x)
y = torch.from_numpy(y)

# Create the neural network
net = Net()

# Create the loss function and the optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)
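# SGD with lr=0.01 and mean-squared-error loss mirror the setup in the
# Clad and libtorch versions above.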

# Print the initial loss
output = net(x)
loss = criterion(output, y)
print("Initial loss: ", loss.item())

# Start the timer
start = time.time()

# Train the neural network
for i in range(N_ITER):
optimizer.zero_grad()
output = net(x)
loss = criterion(output, y)
loss.backward()
optimizer.step()

# Stop the timer
end = time.time()

# Print the final loss
output = net(x)
loss = criterion(output, y)
print("Final loss: ", loss.item())
print("Time: ", end - start, " seconds")


if __name__ == "__main__":
main()
