
Add demo for comparison with libtorch (and pytorch) #1002

Status: Open. Wants to merge 1 commit into base: master.

10 changes: 10 additions & 0 deletions demos/PytorchComparison/CMakeLists.txt
@@ -0,0 +1,10 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
Owner commented: Can we add a bot to run this as part of the CI?

project(torch-basic-nn)

find_package(Torch REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

add_executable(torch-basic-nn torch-basic-nn.cpp)
target_link_libraries(torch-basic-nn "${TORCH_LIBRARIES}")
set_property(TARGET torch-basic-nn PROPERTY CXX_STANDARD 17)

153 changes: 153 additions & 0 deletions demos/PytorchComparison/clad-basic-nn.cpp
@@ -0,0 +1,153 @@
#include <chrono>
#include <iostream>
#include <random>
#include "clad/Differentiator/Differentiator.h"

/*
The code below implements a simple neural network with 3 hidden nodes and 1
output node. The neural network is trained using gradient descent to minimize
the mean squared error loss. The code uses Clad to automatically generate the
gradient computation code for the loss function.

To compile the code, use the following command:
clang++ -std=c++14 -I<path-to-clad-include-dir> -fplugin=<path-to-clad.so>
clad-basic-nn.cpp -DBENCHMARK_MATRIX_MULTIPLY
*/
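
// In equation form (restating the description above), the model computes
//   h_k  = relu( sum_i x[i] * wk[i] ),   k = 1, 2, 3
//   o    = sum_k h[k] * w4[k]
// and the training objective is the mean squared error
//   loss = (1/M) * sum_j (o(x_j) - y_j)^2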

#define M 100 // num of data points
#define N 100 // num of features
#define N_ITER 5000 // num of iterations

void relu(double* arr, unsigned int n) {
for (int i = 0; i < n; ++i)
arr[i] = (arr[i] > 0) ? arr[i] : 0;
}

// Create a model of a neural network with 3 hidden nodes and 1 output node.
// It takes in input data x and weight vectors for each hidden node and the
// output node.
double model_fn(double x[N], // input data
double w1[N], // weight vector for hidden node 1
double w2[N], // weight vector for hidden node 2
double w3[N], // weight vector for hidden node 3
double w4[3] // weight vector for output node
) {
// Layer 1 - matrix multiplication
double h1 = 0, h2 = 0, h3 = 0;
for (int i = 0; i < N; ++i) {
h1 += x[i] * w1[i];
h2 += x[i] * w2[i];
h3 += x[i] * w3[i];
}
// Computing non-linear activation function
double h[3] = {h1, h2, h3};
relu(h, 3);

// Layer 2 - matrix multiplication
double o = 0;
for (int i = 0; i < 3; ++i)
o += h[i] * w4[i];
return o;
}

double loss_fn(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3]) {
double loss = 0;
for (int i = 0; i < M; ++i) {
double y_hat = model_fn(x + i * N, w1, w2, w3, w4);
loss += (y_hat - y[i]) * (y_hat - y[i]);
}
return loss / M;
}

// Perform a single gradient descent step.
// w1..w4 are the weight vectors, x and y form the generated dataset, and
// clad_grad is the gradient function generated by Clad.
template <typename T>
void performStep(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3], T clad_grad,
double learning_rate = 1e-2) {
// Gradient computation
double grad_w1[N], grad_w2[N], grad_w3[N], grad_w4[3];
clad_grad.execute(x, y, w1, w2, w3, w4, grad_w1, grad_w2, grad_w3, grad_w4);

// Update weights for hidden nodes
for (int i = 0; i < N; ++i) {
w1[i] -= learning_rate * grad_w1[i];
w2[i] -= learning_rate * grad_w2[i];
w3[i] -= learning_rate * grad_w3[i];
}

// Update weights for output node
for (int i = 0; i < 3; ++i)
w4[i] -= learning_rate * grad_w4[i];
}

// Perform gradient descent to optimize the weights.
void optimize(double x[M * N], double y[M], double w1[N], double w2[N],
double w3[N], double w4[3], double lr, int num_iters) {
auto grad = clad::gradient(loss_fn, "w1,w2,w3,w4");
for (int i = 0; i < num_iters; ++i)
performStep(x, y, w1, w2, w3, w4, grad, lr);
}

// Benchmark the time for 100 matrix multiplications.
void benchmark_matrix_multiply(double x[M + N], double y[M]) {
Collaborator commented: I don't think it matters, but shouldn't x's size be [M * N] here?
Collaborator commented: Also y seems to be unused here.

double z[M * N];
auto start_bench = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; ++i) {
for (int j = 0; j < M; ++j) {
for (int k = 0; k < N; ++k) {
z[j * N + k] = 0;
for (int l = 0; l < N; ++l)
z[j * N + k] += x[j * N + l] * x[l * N + k];
}
}
}
auto end_bench = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
std::cout << "Time taken for 100 matrix multiplications: "
<< elapsed_bench.count() << " seconds" << std::endl;
}

int main() {
// Initialize random number generator.
std::mt19937 gen(42);

// Generate random input data and labels.
double x[M * N], y[M];
Collaborator commented: Both of the demos seem to benchmark the multiplication XX, which I believe only makes sense if M=N. Maybe transpose one of them so that it's always valid?
(A sketch of that transposed-product variant follows at the end of this file.)

for (int i = 0; i < M * N; ++i)
x[i] = std::uniform_real_distribution<double>(0, 1)(gen);
for (int i = 0; i < M; ++i)
y[i] = std::uniform_real_distribution<double>(0, 1)(gen);

#ifdef BENCHMARK_MATRIX_MULTIPLY
benchmark_matrix_multiply(x, y);
#endif

// Initialize weights from a random distribution.
double w1[N], w2[N], w3[N], w4[3];
for (int i = 0; i < N; ++i) {
w1[i] = std::uniform_real_distribution<double>(0, 1)(gen);
w2[i] = std::uniform_real_distribution<double>(0, 1)(gen);
w3[i] = std::uniform_real_distribution<double>(0, 1)(gen);
}
for (int i = 0; i < 3; ++i)
w4[i] = std::uniform_real_distribution<double>(0, 1)(gen);

// Calculate the loss before optimization.
double loss = loss_fn(x, y, w1, w2, w3, w4);
std::cout << "Initial loss before optimization: " << loss << std::endl;

// Optimize the weights and compute the time taken.
auto start = std::chrono::high_resolution_clock::now();
optimize(x, y, w1, w2, w3, w4, 1e-2, N_ITER);
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;

// Calculate the loss after optimization.
loss = loss_fn(x, y, w1, w2, w3, w4);
std::cout << "Final loss after optimization: " << loss << std::endl;
std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
return 0;
}
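
To illustrate the collaborator's suggestion quoted above, here is a minimal sketch (not part of this PR) of a benchmark that multiplies X by its transpose instead of X by X, so the product is well defined even when M != N. It reuses the M and N macros defined in this file; the helper name benchmark_xxt is hypothetical.

// Sketch only: z = x * x^T, an M x M result that is valid for any M and N.
void benchmark_xxt(const double x[M * N], double z[M * M]) {
  for (int j = 0; j < M; ++j) {
    for (int k = 0; k < M; ++k) {
      z[j * M + k] = 0;
      for (int l = 0; l < N; ++l)
        z[j * M + k] += x[j * N + l] * x[k * N + l];
    }
  }
}
// The libtorch demo could compute the same product with:
//   torch::Tensor z = torch::matmul(x, x.t());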
85 changes: 85 additions & 0 deletions demos/PytorchComparison/torch-basic-nn.cpp
@@ -0,0 +1,85 @@
// Do the same thing as is done in clad-basic-nn.cpp, but using libtorch
// instead of Clad.

/*
To build and run
1. Create a build directory: mkdir build
2. Change to the build directory: cd build
3. Run cmake: cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
4. Run make: make
5. Run the executable: ./torch-basic-nn
*/

#include <chrono>
#include <iostream>
#include <random>
#include <torch/torch.h>

#define M 100 // num of data points
#define N 100 // num of features
#define N_ITER 5000 // num of iterations

// C++ neural network model defined with value semantics
struct TwoLayerNet : torch::nn::Module {
// constructor with submodules registered in the initializer list
TwoLayerNet(int64_t D_in, int64_t D_out, int64_t H)
: linear1(register_module("linear1", torch::nn::Linear(D_in, H))),
linear2(register_module("linear2", torch::nn::Linear(H, D_out))) {}

torch::Tensor forward(torch::Tensor x) {
x = torch::relu(linear1->forward(x));
x = linear2->forward(x);
return x;
}
torch::nn::Linear linear1;
torch::nn::Linear linear2;
};

int main() {
// Generate random data
// std::default_random_engine generator;
// std::normal_distribution<double> distribution(0.0, 1.0);
torch::Tensor x = torch::randn({M, N});
torch::Tensor y = torch::randn({M, 1});

// Benchmark the time for 100 matrix multiplications
torch::Tensor z;
auto start_bench = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 100; ++i)
z = torch::matmul(x, x);
auto end_bench = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_bench = end_bench - start_bench;
std::cout << "Time taken for 100 matrix multiplications: "
<< elapsed_bench.count() << " seconds" << std::endl;

// Initialize model weights
TwoLayerNet model(N, 1, 3);

// Create the optimizer and loss function
torch::optim::SGD optimizer(model.parameters(), 0.01);
torch::nn::MSELoss loss_fn;

// Calculate the loss before optimization
torch::Tensor output = model.forward(x);
std::cout << "Initial loss before optimization: "
<< loss_fn(output, y).item<double>() << std::endl;

// Optimize
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < N_ITER; ++i) {
optimizer.zero_grad();
torch::Tensor output = model.forward(x);
torch::Tensor loss = loss_fn(output, y);
loss.backward();
optimizer.step();
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = end - start;

// Calculate the loss after optimization
output = model.forward(x);
std::cout << "Final loss after optimization: "
<< loss_fn(output, y).item<double>() << std::endl;
std::cout << "Time taken: " << elapsed.count() << " seconds" << std::endl;
return 0;
}
68 changes: 68 additions & 0 deletions demos/PytorchComparison/torch-basic-nn.py
@@ -0,0 +1,68 @@
# Do the same as torch-basic-nn.cpp but in Python using PyTorch
# To run: python torch-basic-nn.py

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

M = 100 # Number of samples
N = 100 # Number of features
N_ITER = 5000 # Number of iterations

# Create a simple neural network with 3 hidden nodes and 1 output node
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.fc1 = nn.Linear(N, 3)
self.fc2 = nn.Linear(3, 1)
def forward(self, x):
x = torch.relu(self.fc1(x))
x = self.fc2(x)
return x

def main():
# Create a simple dataset
x = np.random.randn(M, N).astype(np.float32)
y = np.random.randn(M, 1).astype(np.float32)

# Convert the dataset to PyTorch tensors
x = torch.from_numpy(x)
y = torch.from_numpy(y)

# Create the neural network
net = Net()

# Create the loss function and the optimizer
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Print the initial loss
output = net(x)
loss = criterion(output, y)
print("Initial loss: ", loss.item())

# Calculate start time
import time
start = time.time()

# Train the neural network
for i in range(N_ITER):
optimizer.zero_grad()
output = net(x)
loss = criterion(output, y)
loss.backward()
optimizer.step()

# Stop the timer
end = time.time()

# Print the final loss
output = net(x)
loss = criterion(output, y)
print("Final loss: ", loss.item())
print("Time: ", end - start, " seconds")


if __name__ == "__main__":
main()