diff --git a/.github/workflows/Ubuntu.yml b/.github/workflows/Ubuntu.yml
index c618db96..68b51fd4 100644
--- a/.github/workflows/Ubuntu.yml
+++ b/.github/workflows/Ubuntu.yml
@@ -8,7 +8,9 @@ jobs:
id: cuda-toolkit
with:
cuda: '11.2.2'
- linux-local-args: '["--toolkit"]'
+ linux-local-args: '["--toolkit"]'
+ - run: sudo apt-get update
+ - run: sudo apt-get install -y xorg-dev libglu1-mesa-dev freeglut3-dev mesa-common-dev
- run: nvcc -V
- name: Checkout
uses: actions/checkout@v2
@@ -19,12 +21,7 @@ jobs:
run: cmake ../
- name: Run make
working-directory: ${{github.workspace}}/build
- run: |
- make RXMesh_test -j 99
- make Geodesic -j 99
- make MCF -j 99
- make VertexNormal -j 99
- # make Filtering -j 99
+ run: make -j 4
#- name: Run Test
# working-directory: ${{github.workspace}}/build
# run: ctest --no-compress-output -T Test -C Release --output-on-failure
diff --git a/.github/workflows/Windows.yml b/.github/workflows/Windows.yml
index 575fbc68..08ad1b7a 100644
--- a/.github/workflows/Windows.yml
+++ b/.github/workflows/Windows.yml
@@ -2,7 +2,7 @@ name: Windows
on: [push, pull_request, workflow_dispatch]
jobs:
WindowsRun:
- runs-on: windows-latest
+ runs-on: windows-2019
steps:
- uses: Jimver/cuda-toolkit@v0.2.4
id: cuda-toolkit
@@ -18,12 +18,7 @@ jobs:
working-directory: ${{github.workspace}}/build
run: cmake ../
- name: Run VS
- run: |
- cmake --build ${{github.workspace}}/build --target RXMesh_test --config Release -j 99
- cmake --build ${{github.workspace}}/build --target Geodesic --config Release -j 99
- cmake --build ${{github.workspace}}/build --target MCF --config Release -j 99
- cmake --build ${{github.workspace}}/build --target VertexNormal --config Release -j 99
- # cmake --build ${{github.workspace}}/build --target Filtering --config Release -j 99
+ run: cmake --build ${{github.workspace}}/build --clean-first --config Release -j 4
#- name: Run Test
# working-directory: ${{github.workspace}}/build
# run: ctest --no-compress-output -T Test -C Release --output-on-failure
diff --git a/.gitignore b/.gitignore
index 6b9aad91..ba62169c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
output/
-input/
+input/*
+!input/dragon.obj
+!input/diamond.obj
build/
include/rxmesh/util/git_sha1.cpp
.vscode/
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea45e707..d8292893 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,9 +5,17 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
endif()
project(RXMesh
- VERSION 0.2.0
+ VERSION 0.2.1
LANGUAGES C CXX CUDA)
+if (WIN32)
+ set(USE_POLYSCOPE "ON" CACHE BOOL "Enable Ployscope for visualization")
+ message(STATUS "Polyscope is enabled")
+else()
+ set(USE_POLYSCOPE "OFF" CACHE BOOL "Enable Ployscope for visualization")
+ message(STATUS "Polyscope is disabled")
+endif()
+
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
@@ -49,6 +57,15 @@ FetchContent_Declare(spdlog
)
FetchContent_Populate(spdlog)
+# polyscope
+if(USE_POLYSCOPE)
+FetchContent_Declare(polyscope
+ GIT_REPOSITORY https://github.com/Ahdhn/polyscope.git
+ GIT_TAG glm_patch #v1.3.0 with patch for glm
+)
+FetchContent_MakeAvailable(polyscope)
+endif()
+
# Auto-detect GPU architecture, sets ${CUDA_ARCHS}
include("cmake/AutoDetectCudaArch.cmake")
@@ -73,6 +90,9 @@ target_compile_definitions(RXMesh_header_lib
INTERFACE INPUT_DIR=${CMAKE_CURRENT_SOURCE_DIR}/input/
INTERFACE OUTPUT_DIR=${CMAKE_CURRENT_SOURCE_DIR}/output/
)
+if (USE_POLYSCOPE)
+ target_compile_definitions(RXMesh_header_lib INTERFACE USE_POLYSCOPE)
+endif()
target_include_directories( RXMesh_header_lib
INTERFACE "include"
INTERFACE "${rapidjson_SOURCE_DIR}/include"
@@ -101,7 +121,8 @@ set(cuda_flags
-use_fast_math
$<$<CONFIG:Release>:-O3>
--expt-relaxed-constexpr
- #-Xptxas -warn-spills -res-usage
+ -Xptxas -warn-spills -res-usage
+ --ptxas-options=-v
#-G
)
@@ -114,6 +135,10 @@ target_compile_options(developer_flags INTERFACE
target_link_libraries(RXMesh_header_lib INTERFACE $<BUILD_INTERFACE:developer_flags>)
+if (USE_POLYSCOPE)
+ target_link_libraries(RXMesh_header_lib INTERFACE polyscope)
+endif()
+
#OpenMP
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
diff --git a/README.md b/README.md
index 4afaa1e9..9afe82f7 100644
--- a/README.md
+++ b/README.md
@@ -4,17 +4,28 @@
+## **Contents**
+- [**About**](#about)
+- [**Compilation**](#compilation)
+ * [**Dependencies**](#dependencies)
+- [**Organization**](#organization)
+- [**Programming Model**](#programming-model)
+ * [**Structures**](#structures)
+ * [**Computation**](#computation)
+ * [**Viewer**](#viewer)
+- [**Replicability**](#replicability)
+- [**Bibtex**](#bibtex)
+
## **About**
-RXMesh is a surface triangle mesh data structure and programming model for processing static meshes on the GPU. RXMesh aims at provides a high-performance, generic, and compact data structure that can handle meshes regardless of their quality (e.g., non-manifold). The programming model helps to hide the complexity of the data structure and provides an intuitive access model for different use cases. For more details, please check out our paper:
+RXMesh is a surface triangle mesh data structure and programming model for processing static meshes on the GPU. RXMesh aims to provide a high-performance, generic, and compact data structure that can handle meshes regardless of their quality (e.g., non-manifold). The programming model helps to hide the complexity of the data structure and provides an intuitive access model for different use cases. For more details, please check out our paper and GTC talk:
-*[RXMesh: A GPU Mesh Data Structure](https://escholarship.org/uc/item/8r5848vp)*
+- *[RXMesh: A GPU Mesh Data Structure](https://escholarship.org/uc/item/8r5848vp)*
*[Ahmed H. Mahmoud](https://www.ece.ucdavis.edu/~ahdhn/), [Serban D. Porumbescu](https://web.cs.ucdavis.edu/~porumbes/), and [John D. Owens](https://www.ece.ucdavis.edu/~jowens/)*
*[ACM Transaction on Graphics](https://dl.acm.org/doi/abs/10.1145/3450626.3459748) (Proceedings of SIGGRAPH 2021)*
-This repository provides 1) source code to reproduce the results presented in the paper (git tag [`v0.1.0`](https://github.com/owensgroup/RXMesh/tree/v0.1.0)) and 2) ongoing development of RXMesh. For 1), all input models used in the paper can be found [here](https://ucdavis365-my.sharepoint.com/:f:/g/personal/ahmahmoud_ucdavis_edu/En-vEpIdSGBHqvCIa-MVXRQBg5g7GfM3P3RwZBHL4Hby3w?e=2EVnJd). Models were collected from [Thingi10K](https://ten-thousand-models.appspot.com/) and [Smithsonian 3D](https://3d.si.edu/explore) repository.
+- *[RXMesh: A High-performance Mesh Data Structure and Programming Model on the GPU [S41051]](https://www.nvidia.com/gtc/session-catalog/?tab.scheduledorondemand=1583520458947001NJiE&search=rxmesh#/session/1633891051385001Q9SE)—NVIDIA GTC 2022*
-## **A Quick Glance**
-RXMesh is a CUDA/C++ header-only library. All unit tests are under `tests/` folder. This includes the unit test for some basic functionalities along with the unit test for the query operations. All applications are under `apps/` folder.
+This repository provides 1) source code to reproduce the results presented in the paper (git tag [`v0.1.0`](https://github.com/owensgroup/RXMesh/tree/v0.1.0)) and 2) ongoing development of RXMesh. For 1), all input models used in the paper can be found [here](https://ucdavis365-my.sharepoint.com/:f:/g/personal/ahmahmoud_ucdavis_edu/En-vEpIdSGBHqvCIa-MVXRQBg5g7GfM3P3RwZBHL4Hby3w?e=2EVnJd). Models were collected from [Thingi10K](https://ten-thousand-models.appspot.com/) and [Smithsonian 3D](https://3d.si.edu/explore) repository.
## **Compilation**
The code can be compiled on Ubuntu (GCC 9) and Windows (Visual Studio 2019) provided that CUDA (>=11.1.0) is installed. To run the executable(s), an NVIDIA GPU should be installed on the machine.
@@ -36,8 +47,234 @@ All the dependencies are installed automatically! To compile the code:
```
Depending on the system, this will generate either a `.sln` project on Windows or a `make` file for a Linux system.
+## **Organization**
+RXMesh is a CUDA/C++ header-only library. All unit tests are under the `tests/` folder. This includes the unit test for some basic functionalities along with the unit test for the query operations. All applications are under the `apps/` folder.
+
+## **Programming Model**
+The goal of defining a programming model is to make it easy to write applications using RXMesh without getting into the nuances of the data structure. Applications written using RXMesh are composed of one or more of the high-level building blocks defined under [**Computation**](#computation). To use these building blocks, the user interacts with data structures specific to RXMesh, discussed under [**Structures**](#structures). Finally, RXMesh integrates [Polyscope](https://polyscope.run) as a mesh [**Viewer**](#viewer), which the user can use to render their final results or for debugging purposes.
+
+### **Structures**
+- **Attributes** are the metadata (geometry information) attached to vertices, edges, or faces. Attributes are allocated on a per-patch basis and managed internally by RXMesh. The allocation could be done on the host, device, or both. Allocating attributes on the host is only beneficial for I/O operations or for initializing attributes before eventually moving them to the device.
+ - Example: allocation
+ ```c++
+ RXMeshStatic rx("input.obj");
+ auto vertex_color =
+ rx.add_vertex_attribute("vColor", //Unique name
+ 3, //Number of attribute per vertex
+ DEVICE, //Allocation place
+ SoA); //Memory layout (SoA vs. AoS)
+
+ ```
+ - Example: reading from `std::vector`
+ ```c++
+ RXMeshStatic rx("input.obj");
+ std::vector<std::vector<float>> face_color_vector;
+ //....
+
+ auto face_color =
+ rx.add_face_attribute(face_color_vector,//Input attribute where number of attributes per face is inferred
+ "fColor", //Unique name
+ SoA); //Memory layout (SoA vs. AoS)
+ ```
+ - Example: move, reset, and copy
+ ```c++
+ //By default, attributes are allocated on both host and device
+ auto edge_attr = rx.add_edge_attribute<float>("eAttr", 1);
+ //Initialize edge_attr on the host
+ // .....
+
+ //Move attributes from host to device
+ edge_attr.move(HOST, DEVICE);
+
+ //Reset all entries to zero
+ edge_attr.reset(0, DEVICE);
+
+ auto edge_attr_1 = rx.add_edge_attribute<float>("eAttr1", 1);
+
+ //Copy from another attribute.
+ //Here, what is on the host side of edge_attr will be copied into the device side of edge_attr_1
+ edge_attr_1.copy_from(edge_attr, HOST, DEVICE);
+ ```
+
+- **Handles** are the unique identifiers for vertices, edges, and faces. They are usually populated internally by RXMesh (by concatenating the patch ID and the mesh element index within the patch). Handles are used to access attributes and are passed to `for_each` and query operations.
+
+ - Example: Setting vertex attribute using vertex handle
+ ```c++
+ auto vertex_color = ...
+ VertexHandle vh;
+ //...
+
+ vertex_color(vh, 0) = 0.9;
+ vertex_color(vh, 1) = 0.5;
+ vertex_color(vh, 2) = 0.6;
+ ```
+
+- **Iterators** are used during query operations to iterate over the output of the query operation. The type of iterator defines the type of mesh element iterated on, e.g., `VertexIterator` iterates over vertices, which are the output of `VV`, `EV`, or `FV` query operations. Since query operations are only supported on the device, iterators can only be used inside the kernel. Iterators are usually populated internally.
+
+ - Example: Iterating over faces
+ ```c++
+ FaceIterator f_iter;
+ //...
+
+ for (uint32_t f = 0; f < f_iter.size(); ++f) {
+ FaceHandle fh = f_iter[f];
+ //do something with fh ....
+ }
+ ```
+
+
+### **Computation**
+- **`for_each`** runs a computation over all vertices, edges, or faces _without_ requiring information from neighbor mesh elements. The computation run on each mesh element is defined as a lambda function that takes a handle as an input. The lambda function can run on the host, the device, or both. On the host, we parallelize the computation using OpenMP. Care must be taken with lambda functions on the device since they need to be annotated with `__device__` and can only capture by value. More about lambda functions in CUDA can be found [here](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#extended-lambda)
+ - Example: using `for_each` to initialize attributes
+ ```cpp
+ RXMeshStatic rx("input.obj");
+ auto vertex_pos = rx.get_input_vertex_coordinates(); //vertex position
+ auto vertex_color = rx.add_vertex_attribute<float>("vColor", 3, DEVICE); //vertex color
+
+ //This function will be executed on the device
+ rx.for_each_vertex(
+ DEVICE,
+ [vertex_color, vertex_pos] __device__(const VertexHandle vh) {
+ vertex_color(vh, 0) = 0.9;
+ vertex_color(vh, 1) = vertex_pos(vh, 1);
+ vertex_color(vh, 2) = 0.9;
+ });
+ ```
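+ - Example: using `for_each` on the host (a minimal sketch; it assumes the attribute was allocated on the host, and the lambda may capture by reference since it runs on the CPU, parallelized with OpenMP)
+ ```cpp
+ RXMeshStatic rx("input.obj");
+ auto vertex_color = rx.add_vertex_attribute<float>("vColor", 3, HOST);
+
+ //This function will be executed on the host
+ rx.for_each_vertex(
+ HOST,
+ [&](const VertexHandle vh) {
+ vertex_color(vh, 0) = 0.9;
+ vertex_color(vh, 1) = 0.5;
+ vertex_color(vh, 2) = 0.6;
+ });
+ ```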
+
+- **Queries** supported by RXMesh are listed below along with a description of each
+
+ | Query | Description |
+ |-------|:-------------------------------------------|
+ | `VV` | For vertex V, return its adjacent vertices |
+ | `VE` | For vertex V, return its incident edges |
+ | `VF` | For vertex V, return its incident faces |
+ | `EV` | For edge E, return its incident vertices |
+ | `EF` | For edge E, return its incident faces |
+ | `FV` | For face F, return its incident vertices |
+ | `FE` | For face F, return its incident edges |
+ | `FF` | For face F, return its adjacent faces |
+
+ Queries are only supported on the device. RXMesh API for queries takes a lambda function along with the type of query. The lambda function defines the computation that will be run on the query output.
+
+ - Example: [vertex normal computation](./apps/VertexNormal/vertex_normal_kernel.cuh)
+ ```cpp
+ template <typename T, uint32_t blockThreads>
+ __global__ void vertex_normal (Context context){
+ auto compute_vn = [&](FaceHandle face_id, VertexIterator& fv) {
+ //This thread is assigned to face_id
+
+ // get the face's three vertices coordinates
+ Vector<3, T> c0(coords(fv[0], 0), coords(fv[0], 1), coords(fv[0], 2));
+ Vector<3, T> c1(coords(fv[1], 0), coords(fv[1], 1), coords(fv[1], 2));
+ Vector<3, T> c2(coords(fv[2], 0), coords(fv[2], 1), coords(fv[2], 2));
+
+ //compute face normal
+ Vector<3, T> n = cross(c1 - c0, c2 - c0);
+
+ // add the face's normal to its vertices
+ for (uint32_t v = 0; v < 3; ++v) // for every vertex in this face
+ for (uint32_t i = 0; i < 3; ++i) // for the vertex 3 coordinates
+ atomicAdd(&normals(fv[v], i), n[i]);
+ };
+
+ //Query dispatcher must be called by all threads in the block.
+ //Dispatcher will first perform the query, store the results in shared memory, then
+ //run the user-defined computation i.e., compute_vn
+ query_block_dispatcher<Op::FV, blockThreads>(context, compute_vn);
+ }
+ ```
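+ To launch a kernel that uses the query dispatcher, the user first asks RXMesh for a `LaunchBox`, which holds the grid size and the amount of shared memory the query needs. The sketch below follows the pattern used by the apps in this repo and assumes the `vertex_normal` kernel above (the block size is an arbitrary choice):
+ ```cpp
+ constexpr uint32_t blockThreads = 256;
+ LaunchBox<blockThreads> launch_box;
+ rx.prepare_launch_box(
+ {Op::FV}, launch_box, (void*)vertex_normal<float, blockThreads>);
+
+ vertex_normal<float, blockThreads>
+ <<<launch_box.blocks, blockThreads, launch_box.smem_bytes_dyn>>>(
+ rx.get_context());
+ ```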
+ To save computation, `query_block_dispatcher` could be run on a subset of the input mesh elements, i.e., an _active set_. The user can define the active set using a lambda function that returns true if the input mesh element is in the active set.
+
+ - Example: defining active set
+ ```cpp
+ template <uint32_t blockThreads>
+ __global__ void active_set_query (Context context){
+ auto active_set = [&](FaceHandle face_id) -> bool{
+ // ....
+ };
+
+ auto computation = [&](FaceHandle face_id, VertexIterator& fv) {
+ // ....
+ };
+
+ query_block_dispatcher<Op::FV, blockThreads>(context, computation, active_set);
+ }
+ ```
+
+- **Reduction** operations apply a binary associative operation on the input attributes. RXMesh provides the dot product between two attributes (of the same type), the L2 norm of an input attribute, and user-defined reduction operations on an input attribute. For a user-defined reduction operation, the user needs to pass a binary reduction functor with member `__device__ T operator()(const T &a, const T &b)` or use one of [CUB's thread operators](https://github.com/NVIDIA/cub/blob/main/cub/thread/thread_operators.cuh), e.g., `cub::Max()`. Reduction operations require allocation of temporary buffers, which we abstract away using `ReduceHandle`.
+
+ - Example: dot product, L2 norm, user-defined reduction
+ ```cpp
+ RXMeshStatic rx("input.obj");
+ auto vertex_attr1 = rx.add_vertex_attribute<float>("v_attr1", 3, DEVICE);
+ auto vertex_attr2 = rx.add_vertex_attribute<float>("v_attr2", 3, DEVICE);
+
+ // Populate vertex_attr1 and vertex_attr2
+ //....
+
+ //Reduction handle
+ ReduceHandle reduce(vertex_attr1);
+
+ //Dot product between two attributes. Results are returned on the host
+ float dot_product = reduce.dot(vertex_attr1, vertex_attr2);
+
+ cudaStream_t stream;
+ //init stream
+ //...
+
+ //Reduction operation could be performed on specific attribute and using specific stream
+ float l2_norm = reduce.norm2(vertex_attr1, //input attribute
+ 1, //attribute ID. If not specified, reduction is run on all attributes
+ stream); //stream used for reduction.
+
+
+ //User-defined reduction operation (here, the result is the maximum value)
+ float max_value = reduce.reduce(vertex_attr1, //input attribute
+ cub::Max(), //binary reduction functor
+ std::numeric_limits<float>::lowest()); //initial value
+ ```
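+ A `ReduceHandle` allocates its temporary buffers once at construction, so it is best created once and reused, e.g., the MCF app creates a single handle and reuses it across its conjugate gradient iterations.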
+
+### **Viewer**
+Starting with v0.2.1, RXMesh integrates [Polyscope](https://polyscope.run) as a mesh viewer. To use it, make sure to turn on the CMake parameter `USE_POLYSCOPE`, i.e.,
+
+```
+> cd build
+> cmake -DUSE_POLYSCOPE=True ../
+```
+By default, the parameter is set to True on Windows and False on Linux machines. RXMesh implements the necessary functionalities to pass attributes to Polyscope, thanks to its [data adaptors](https://polyscope.run/data_adaptors/). However, this requires attributes to be moved to the host before passing them to Polyscope. For more information about Polyscope's different visualization options, please check out Polyscope's [Surface Mesh documentation](https://polyscope.run/structures/surface_mesh/basics/).
+
+ - Example: [render vertex color](./tests/Polyscope_test/test_polyscope.cu)
+ ```cpp
+ //initialize polyscope
+ polyscope::init();
+
+ RXMeshStatic rx("dragon.obj");
+
+ //vertex color attribute
+ auto vertex_color = rx.add_vertex_attribute<float>("vColor", 3);
+
+ //Populate vertex color on the device
+ //....
+
+ //Move vertex color to the host
+ vertex_color.move(DEVICE, HOST);
+
+ //polyscope instance associated with rx
+ auto polyscope_mesh = rx.get_polyscope_mesh();
+
+ //pass vertex color to polyscope
+ polyscope_mesh->addVertexColorQuantity("vColor", vertex_color);
+
+ //render
+ polyscope::show();
+ ```
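+
+ - Example: render face color (a sketch following the same pattern as above; `addFaceColorQuantity` is Polyscope's per-face analogue of the vertex call, and the attribute is assumed to be populated on the device first)
+ ```cpp
+ //face color attribute
+ auto face_color = rx.add_face_attribute<float>("fColor", 3);
+
+ //Populate face color on the device
+ //....
+
+ //Move face color to the host
+ face_color.move(DEVICE, HOST);
+
+ //pass face color to polyscope
+ polyscope_mesh->addFaceColorQuantity("fColor", face_color);
+ ```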
+
+
+
+
+
## **Replicability**
-This repo was awarded the [replicability stamp](http://www.replicabilitystamp.org#https-github-com-owensgroup-rxmesh) by the Graphics Replicability Stamp Initiative (GRSI).
+This repo was awarded the [replicability stamp](http://www.replicabilitystamp.org#https-github-com-owensgroup-rxmesh) by the Graphics Replicability Stamp Initiative (GRSI) :tada:
The scripts used to generate the data shown in the paper can be found under
* [Figure 6](https://github.com/owensgroup/RXMesh/blob/main/tests/RXMesh_test/benchmark.sh)
@@ -46,9 +283,7 @@ The scripts used to generate the data shown in the paper can be found under
* [Figure 8 (c)](https://github.com/owensgroup/RXMesh/blob/main/apps/Filtering/benchmark.sh)
* [Figure 8 (d)](https://github.com/owensgroup/RXMesh/blob/main/apps/VertexNormal/benchmark.sh)
-Each script should be run from the script's containing directory after compiling the code in `build/` directory. The only input parameter needed is the path to the input OBJ files. The resulting JSON files will be written to `output/` directory.
-
-
+Each script should be run from the script's containing directory after compiling the code in the `build/` directory. The only input parameter needed is the path to the input OBJ files. The resulting JSON files will be written to the `output/` directory.
## **Bibtex**
```
diff --git a/apps/Filtering/filtering.cu b/apps/Filtering/filtering.cu
index 3a85ee2a..b045b573 100644
--- a/apps/Filtering/filtering.cu
+++ b/apps/Filtering/filtering.cu
@@ -33,27 +33,22 @@ TEST(App, Filtering)
cuda_query(Arg.device_id);
- // Load mesh
- std::vector<std::vector<uint32_t>> Faces;
- std::vector<std::vector<float>> Verts;
- ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces));
-
-
TriMesh input_mesh;
ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
- ASSERT_EQ(input_mesh.n_vertices(), Verts.size());
-
// OpenMesh Impl
- std::vector<std::vector<float>> ground_truth(Verts);
+ std::vector<std::vector<float>> ground_truth(input_mesh.n_vertices());
+ for (auto& g : ground_truth) {
+ g.resize(3);
+ }
size_t max_neighbour_size = 0;
- filtering_openmesh(
+ filtering_openmesh(
omp_get_max_threads(), input_mesh, ground_truth, max_neighbour_size);
// RXMesh Impl
- filtering_rxmesh(Faces, Verts, ground_truth, max_neighbour_size);
-
+ filtering_rxmesh(
+ Arg.obj_file_name, ground_truth, max_neighbour_size);
}
int main(int argc, char** argv)
diff --git a/apps/Filtering/filtering_rxmesh.cuh b/apps/Filtering/filtering_rxmesh.cuh
index ff528279..ac670c76 100644
--- a/apps/Filtering/filtering_rxmesh.cuh
+++ b/apps/Filtering/filtering_rxmesh.cuh
@@ -11,10 +11,9 @@
* filtering_rxmesh()
*/
template <typename T, uint32_t maxVVSize>
-void filtering_rxmesh(std::vector<std::vector<uint32_t>>& Faces,
- const std::vector<std::vector<T>>& Verts,
- const std::vector<std::vector<T>>& ground_truth,
- const size_t max_neighbour_size)
+void filtering_rxmesh(const std::string file_path,
+ const std::vector<std::vector<T>>& ground_truth,
+ const size_t max_neighbour_size)
{
using namespace rxmesh;
@@ -25,7 +24,7 @@ void filtering_rxmesh(std::vector>& Faces,
"greater than maxVVSize. Should increase maxVVSize to "
<< max_neighbour_size << " to avoid illegal memory access";
- RXMeshStatic rxmesh(Faces, false);
+ RXMeshStatic rxmesh(file_path, false);
// Report
Report report("Filtering_RXMesh");
@@ -38,7 +37,7 @@ void filtering_rxmesh(std::vector>& Faces,
// input coords
- auto coords = rxmesh.add_vertex_attribute(Verts, "coords");
+ auto coords = rxmesh.get_input_vertex_coordinates();
// Vertex normals (only on device)
auto vertex_normal = rxmesh.add_vertex_attribute("vn", 3, DEVICE);
@@ -53,15 +52,16 @@ void filtering_rxmesh(std::vector>& Faces,
// vertex normal launch box
constexpr uint32_t vn_block_threads = 256;
LaunchBox<vn_block_threads> vn_launch_box;
- rxmesh.prepare_launch_box(rxmesh::Op::FV,
- vn_launch_box,
- (void*)compute_vertex_normal<T, vn_block_threads>);
+ rxmesh.prepare_launch_box(
+ {rxmesh::Op::FV},
+ vn_launch_box,
+ (void*)compute_vertex_normal<T, vn_block_threads>);
// filter launch box
- constexpr uint32_t filter_block_threads = 512;
+ constexpr uint32_t filter_block_threads = 256;
LaunchBox<filter_block_threads> filter_launch_box;
rxmesh.prepare_launch_box(
- rxmesh::Op::VV,
+ {rxmesh::Op::VV},
filter_launch_box,
(void*)bilateral_filtering<T, filter_block_threads, maxVVSize>);
diff --git a/apps/Geodesic/geodesic.cu b/apps/Geodesic/geodesic.cu
index c11aa7af..8b18b68e 100644
--- a/apps/Geodesic/geodesic.cu
+++ b/apps/Geodesic/geodesic.cu
@@ -36,13 +36,7 @@ TEST(App, Geodesic)
// Select device
cuda_query(Arg.device_id);
-
- // Load mesh
- std::vector<std::vector<float>> Verts;
- std::vector<std::vector<uint32_t>> Faces;
- ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces));
-
- RXMeshStatic rxmesh(Faces, false);
+ RXMeshStatic rxmesh(Arg.obj_file_name, false);
ASSERT_TRUE(rxmesh.is_closed())
<< "Geodesic only works on watertight/closed manifold mesh without "
"boundaries";
@@ -55,8 +49,8 @@ TEST(App, Geodesic)
std::vector<uint32_t> h_seeds(Arg.num_seeds);
std::random_device dev;
std::mt19937 rng(dev());
- std::uniform_int_distribution<uint32_t> dist(0,
- Verts.size());
+ std::uniform_int_distribution<uint32_t> dist(
+ 0, rxmesh.get_num_vertices());
for (auto& s : h_seeds) {
s = dist(rng);
// s = 0;
@@ -68,15 +62,13 @@ TEST(App, Geodesic)
// sorted_index and limit. We keep it for RXMesh because it is
// used to quickly determine whether or not a vertex is within
// the "update band".
- std::vector<uint32_t> toplesets(Verts.size(), 1u);
+ std::vector<uint32_t> toplesets(rxmesh.get_num_vertices(), 1u);
std::vector<uint32_t> sorted_index;
std::vector<uint32_t> limits;
- geodesic_ptp_openmesh(
- Faces, Verts, h_seeds, sorted_index, limits, toplesets);
+ geodesic_ptp_openmesh(h_seeds, sorted_index, limits, toplesets);
// RXMesh Impl
- geodesic_rxmesh(
- rxmesh, Faces, Verts, h_seeds, sorted_index, limits, toplesets);
+ geodesic_rxmesh(rxmesh, h_seeds, sorted_index, limits, toplesets);
}
int main(int argc, char** argv)
diff --git a/apps/Geodesic/geodesic_ptp_openmesh.h b/apps/Geodesic/geodesic_ptp_openmesh.h
index e8ad2569..b8f45697 100644
--- a/apps/Geodesic/geodesic_ptp_openmesh.h
+++ b/apps/Geodesic/geodesic_ptp_openmesh.h
@@ -256,12 +256,10 @@ inline float toplesets_propagation(TriMesh& mesh,
}
template <typename T>
-void geodesic_ptp_openmesh(const std::vector<std::vector<uint32_t>>& Faces,
- const std::vector<std::vector<T>>& Verts,
- const std::vector<uint32_t>& h_seeds,
- std::vector<uint32_t>& sorted_index,
- std::vector<uint32_t>& limits,
- std::vector<uint32_t>& toplesets)
+void geodesic_ptp_openmesh(const std::vector<uint32_t>& h_seeds,
+ std::vector<uint32_t>& sorted_index,
+ std::vector<uint32_t>& limits,
+ std::vector<uint32_t>& toplesets)
{
TriMesh input_mesh;
ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
@@ -275,9 +273,6 @@ void geodesic_ptp_openmesh(const std::vector>& Faces,
std::string method = "OpenMeshSingleCore";
report.add_member("method", method);
- ASSERT_TRUE(Faces.size() == input_mesh.n_faces());
- ASSERT_TRUE(Verts.size() == input_mesh.n_vertices());
-
std::vector<T> geo_distance(input_mesh.n_vertices(),
std::numeric_limits<T>::infinity());
@@ -303,13 +298,6 @@ void geodesic_ptp_openmesh(const std::vector>& Faces,
input_mesh, h_seeds, limits, sorted_index, geo_distance, iter);
RXMESH_TRACE("geodesic_ptp_openmesh() took {} (ms)", processing_time);
- // export_attribute_VTK("geo_openmesh.vtk",
- // Faces,
- // Verts,
- // false,
- // geo_distance.data(),
- // geo_distance.data());
-
// Finalize report
report.add_member("num_iter_taken", iter);
rxmesh::TestData td;
diff --git a/apps/Geodesic/geodesic_ptp_rxmesh.h b/apps/Geodesic/geodesic_ptp_rxmesh.h
index 11a2c819..a05fc9a5 100644
--- a/apps/Geodesic/geodesic_ptp_rxmesh.h
+++ b/apps/Geodesic/geodesic_ptp_rxmesh.h
@@ -7,10 +7,8 @@
constexpr float EPS = 10e-6;
template <typename T>
-inline void geodesic_rxmesh(rxmesh::RXMeshStatic& rxmesh,
- const std::vector<std::vector<uint32_t>>& Faces,
- const std::vector<std::vector<T>>& Verts,
- const std::vector<uint32_t>& h_seeds,
+inline void geodesic_rxmesh(rxmesh::RXMeshStatic& rxmesh,
+ const std::vector<uint32_t>& h_seeds,
const std::vector<uint32_t>& h_sorted_index,
const std::vector<uint32_t>& h_limits,
const std::vector<uint32_t>& toplesets)
@@ -28,7 +26,7 @@ inline void geodesic_rxmesh(rxmesh::RXMeshStatic& rxmesh,
report.add_member("method", std::string("RXMesh"));
// input coords
- auto input_coord = rxmesh.add_vertex_attribute(Verts, "coord");
+ auto input_coord = rxmesh.get_input_vertex_coordinates();
// toplesets
auto d_toplesets = rxmesh.add_vertex_attribute(toplesets, "topleset");
@@ -36,7 +34,7 @@ inline void geodesic_rxmesh(rxmesh::RXMeshStatic& rxmesh,
// RXMesh launch box
LaunchBox<blockThreads> launch_box;
- rxmesh.prepare_launch_box(rxmesh::Op::VV,
+ rxmesh.prepare_launch_box({rxmesh::Op::VV},
launch_box,
(void*)relax_ptp_rxmesh<T, blockThreads>,
true);
@@ -134,8 +132,7 @@ inline void geodesic_rxmesh(rxmesh::RXMeshStatic& rxmesh,
// uint32_t v_id = rxmesh.map_to_global(vh);
// geo[v_id] = (*rxmesh_geo)(vh);
//});
- // export_attribute_VTK(
- // "geo_rxmesh.vtk", Faces, Verts, false, geo.data(), geo.data());
+
GPU_FREE(d_error);
diff --git a/apps/MCF/mcf.cu b/apps/MCF/mcf.cu
index abfec068..34f2d8b7 100644
--- a/apps/MCF/mcf.cu
+++ b/apps/MCF/mcf.cu
@@ -38,26 +38,21 @@ TEST(App, MCF)
// Select device
cuda_query(Arg.device_id);
-
- // Load mesh
- std::vector<std::vector<float>> Verts;
- std::vector<std::vector<uint32_t>> Faces;
-
- ASSERT_TRUE(import_obj(Arg.obj_file_name, Verts, Faces));
-
-
- RXMeshStatic rxmesh(Faces, false);
+ RXMeshStatic rxmesh(Arg.obj_file_name, false);
TriMesh input_mesh;
ASSERT_TRUE(OpenMesh::IO::read_mesh(input_mesh, Arg.obj_file_name));
// OpenMesh Impl
- std::vector<std::vector<float>> ground_truth(Verts);
+ std::vector<std::vector<float>> ground_truth(rxmesh.get_num_vertices());
+ for (auto& g : ground_truth) {
+ g.resize(3);
+ }
mcf_openmesh(omp_get_max_threads(), input_mesh, ground_truth);
// RXMesh Impl
- mcf_rxmesh(rxmesh, Verts, ground_truth);
+ mcf_rxmesh(rxmesh, ground_truth);
}
int main(int argc, char** argv)
diff --git a/apps/MCF/mcf_rxmesh.h b/apps/MCF/mcf_rxmesh.h
index e3681c8b..6dcaf831 100644
--- a/apps/MCF/mcf_rxmesh.h
+++ b/apps/MCF/mcf_rxmesh.h
@@ -48,7 +48,6 @@ void init_PR(rxmesh::RXMeshStatic& rxmesh,
template <typename T>
void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh,
- const std::vector<std::vector<T>>& Verts,
const std::vector<std::vector<T>>& ground_truth)
{
using namespace rxmesh;
@@ -71,8 +70,7 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh,
<< "mcf_rxmesh only takes watertight/closed mesh without boundaries";
// Different attributes used throughout the application
- auto input_coord =
- rxmesh.add_vertex_attribute(Verts, "coord", rxmesh::LOCATION_ALL);
+ auto input_coord = rxmesh.get_input_vertex_coordinates();
// S in CG
auto S =
@@ -95,18 +93,19 @@ void mcf_rxmesh(rxmesh::RXMeshStatic& rxmesh,
B->reset(0.0, rxmesh::DEVICE);
// X in CG (the output)
- auto X = rxmesh.add_vertex_attribute(Verts, "X", rxmesh::LOCATION_ALL);
+ auto X = rxmesh.add_vertex_attribute<T>("X", 3, rxmesh::LOCATION_ALL);
+ X->copy_from(*input_coord, rxmesh::DEVICE, rxmesh::DEVICE);
ReduceHandle reduce_handle(*X);
// RXMesh launch box
LaunchBox<blockThreads> launch_box_init_B;
LaunchBox<blockThreads> launch_box_matvec;
- rxmesh.prepare_launch_box(rxmesh::Op::VV,
+ rxmesh.prepare_launch_box({rxmesh::Op::VV},
launch_box_init_B,
(void*)init_B<T, blockThreads>,
true);
- rxmesh.prepare_launch_box(rxmesh::Op::VV,
+ rxmesh.prepare_launch_box({rxmesh::Op::VV},
launch_box_matvec,
(void*)rxmesh_matvec<T, blockThreads>,
true);
diff --git a/apps/VertexNormal/vertex_normal.cu b/apps/VertexNormal/vertex_normal.cu
index 3fe2da5b..d8b57391 100644
--- a/apps/VertexNormal/vertex_normal.cu
+++ b/apps/VertexNormal/vertex_normal.cu
@@ -50,8 +50,9 @@ void vertex_normal_rxmesh(rxmesh::RXMeshStatic& rxmesh,
// launch box
LaunchBox<blockThreads> launch_box;
- rxmesh.prepare_launch_box(
- rxmesh::Op::FV, launch_box, (void*)compute_vertex_normal<T, blockThreads>);
+ rxmesh.prepare_launch_box({rxmesh::Op::FV},
+ launch_box,
+ (void*)compute_vertex_normal<T, blockThreads>);
TestData td;
diff --git a/assets/polyscope_dragon.PNG b/assets/polyscope_dragon.PNG
new file mode 100644
index 00000000..a973ca06
Binary files /dev/null and b/assets/polyscope_dragon.PNG differ
diff --git a/include/rxmesh/attribute.h b/include/rxmesh/attribute.h
index 22d413a2..b7a08251 100644
--- a/include/rxmesh/attribute.h
+++ b/include/rxmesh/attribute.h
@@ -8,6 +8,7 @@
#include "rxmesh/kernels/collective.cuh"
#include "rxmesh/kernels/util.cuh"
#include "rxmesh/patch_info.h"
+#include "rxmesh/rxmesh.h"
#include "rxmesh/types.h"
#include "rxmesh/util/cuda_query.h"
#include "rxmesh/util/log.h"
@@ -611,12 +612,46 @@ class FaceAttribute : public Attribute
const std::vector<uint16_t>& face_per_patch,
const uint32_t num_attributes,
locationT location,
- const layoutT layout)
- : Attribute<T>(name)
+ const layoutT layout,
+ const RXMesh* rxmesh)
+ : Attribute<T>(name), m_rxmesh(rxmesh)
{
this->init(face_per_patch, num_attributes, location, layout);
}
+#ifdef USE_POLYSCOPE
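+ // Linear-index accessor used by Polyscope's data adaptors: maps a global
+ // face index to its (patch, local index) pair by binary-searching the
+ // patch's local-to-global table. Host-side convenience, not for kernels.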
+ T operator()(size_t i, size_t j = 0) const
+ {
+ uint32_t p = m_rxmesh->m_patcher->get_face_patch_id(i);
+ const auto end = m_rxmesh->m_h_patches_ltog_f[p].begin() +
+ m_rxmesh->m_h_num_owned_f[p];
+ const auto lid =
+ std::lower_bound(m_rxmesh->m_h_patches_ltog_f[p].begin(), end, i);
+ if (lid == end) {
+ RXMESH_ERROR(
+ "FaceAttribute operator(i,j) can not find the local id");
+ }
+ return Attribute<T>::operator()(
+ p, lid - m_rxmesh->m_h_patches_ltog_f[p].begin(), j);
+ }
+ size_t rows() const
+ {
+ return size();
+ }
+ size_t cols() const
+ {
+ return this->get_num_attributes();
+ }
+
+ /**
+ * @brief returns the size of the attributes i.e., number of faces
+ */
+ uint32_t size() const
+ {
+ return m_rxmesh->get_num_faces();
+ }
+#endif
+
/**
* @brief Accessing face attribute using FaceHandle
* @param f_handle input face handle
@@ -644,6 +679,9 @@ class FaceAttribute : public Attribute
auto pl = f_handle.unpack();
return Attribute<T>::operator()(pl.first, pl.second, attr);
}
+
+ private:
+ const RXMesh* m_rxmesh;
};
@@ -673,12 +711,46 @@ class EdgeAttribute : public Attribute
const std::vector<uint16_t>& edge_per_patch,
const uint32_t num_attributes,
locationT location,
- const layoutT layout)
- : Attribute<T>(name)
+ const layoutT layout,
+ const RXMesh* rxmesh)
+ : Attribute<T>(name), m_rxmesh(rxmesh)
{
this->init(edge_per_patch, num_attributes, location, layout);
}
+#ifdef USE_POLYSCOPE
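+ // Same linear-index access scheme as in FaceAttribute, here mapping a
+ // global edge index to its (patch, local index) pair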
+ T operator()(size_t i, size_t j = 0) const
+ {
+ uint32_t p = m_rxmesh->m_patcher->get_edge_patch_id(i);
+ const auto end = m_rxmesh->m_h_patches_ltog_e[p].begin() +
+ m_rxmesh->m_h_num_owned_e[p];
+ const auto lid =
+ std::lower_bound(m_rxmesh->m_h_patches_ltog_e[p].begin(), end, i);
+ if (lid == end) {
+ RXMESH_ERROR(
+ "EdgeAttribute operator(i,j) can not find the local id");
+ }
+ return Attribute<T>::operator()(
+ p, lid - m_rxmesh->m_h_patches_ltog_e[p].begin(), j);
+ }
+ size_t rows() const
+ {
+ return size();
+ }
+ size_t cols() const
+ {
+ return this->get_num_attributes();
+ }
+
+ /**
+ * @brief returns the size of the attributes i.e., number of edges
+ */
+ uint32_t size() const
+ {
+ return m_rxmesh->get_num_edges();
+ }
+#endif
+
/**
* @brief Accessing edge attribute using EdgeHandle
* @param e_handle input edge handle
@@ -705,6 +777,9 @@ class EdgeAttribute : public Attribute
auto pl = e_handle.unpack();
return Attribute<T>::operator()(pl.first, pl.second, attr);
}
+
+ private:
+ const RXMesh* m_rxmesh;
};
@@ -734,13 +809,47 @@ class VertexAttribute : public Attribute
const std::vector<uint16_t>& vertex_per_patch,
const uint32_t num_attributes,
locationT location,
- const layoutT layout)
- : Attribute<T>(name)
+ const layoutT layout,
+ const RXMesh* rxmesh)
+ : Attribute<T>(name), m_rxmesh(rxmesh)
{
this->init(vertex_per_patch, num_attributes, location, layout);
}
+#ifdef USE_POLYSCOPE
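+ // Same linear-index access scheme as in FaceAttribute, here mapping a
+ // global vertex index to its (patch, local index) pair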
+ T operator()(size_t i, size_t j = 0) const
+ {
+ uint32_t p = m_rxmesh->m_patcher->get_vertex_patch_id(i);
+ const auto end = m_rxmesh->m_h_patches_ltog_v[p].begin() +
+ m_rxmesh->m_h_num_owned_v[p];
+ const auto lid =
+ std::lower_bound(m_rxmesh->m_h_patches_ltog_v[p].begin(), end, i);
+ if (lid == end) {
+ RXMESH_ERROR(
+ "VertexAttribute operator(i,j) can not find the local id");
+ }
+ return Attribute<T>::operator()(
+ p, lid - m_rxmesh->m_h_patches_ltog_v[p].begin(), j);
+ }
+ size_t rows() const
+ {
+ return size();
+ }
+ size_t cols() const
+ {
+ return this->get_num_attributes();
+ }
+
+ /**
+ * @brief returns the size of the attributes i.e., number of vertices
+ */
+ uint32_t size() const
+ {
+ return m_rxmesh->get_num_vertices();
+ }
+#endif
+
/**
* @brief Accessing vertex attribute using VertexHandle
* @param v_handle input vertex handle
@@ -768,6 +877,9 @@ class VertexAttribute : public Attribute
auto pl = v_handle.unpack();
return Attribute<T>::operator()(pl.first, pl.second, attr);
}
+
+ private:
+ const RXMesh* m_rxmesh;
};
/**
@@ -829,7 +941,8 @@ class AttributeContainer
std::vector<uint16_t>& element_per_patch,
uint32_t num_attributes,
locationT location,
- layoutT layout)
+ layoutT layout,
+ const RXMesh* rxmesh)
{
if (does_exist(name)) {
RXMESH_WARN(
@@ -839,7 +952,7 @@ class AttributeContainer
}
auto new_attr = std::make_shared<AttributeT>(
- name, element_per_patch, num_attributes, location, layout);
+ name, element_per_patch, num_attributes, location, layout, rxmesh);
m_attr_container.push_back(
std::dynamic_pointer_cast<AttributeBase>(new_attr));
diff --git a/include/rxmesh/context.h b/include/rxmesh/context.h
index d93a2d72..62a2ff95 100644
--- a/include/rxmesh/context.h
+++ b/include/rxmesh/context.h
@@ -14,47 +14,29 @@ namespace rxmesh {
class Context
{
public:
+ friend class RXMesh;
+ friend class RXMeshDynamic;
+
/**
* @brief Default constructor
*/
Context()
- : m_num_edges(0),
- m_num_faces(0),
- m_num_vertices(0),
- m_num_patches(0),
+ : m_num_edges(nullptr),
+ m_num_faces(nullptr),
+ m_num_vertices(nullptr),
+ m_num_patches(nullptr),
m_patches_info(nullptr)
{
}
- /**
- * @brief initialize various members
- * @param num_edges total number of edges in the mesh
- * @param num_faces total number of faces in the mesh
- * @param num_vertices total number of vertices in the mesh
- * @param num_patches number of patches
- * @param patches pointer to PatchInfo that contains different info about
- * the patches
- */
- void init(const uint32_t num_edges,
- const uint32_t num_faces,
- const uint32_t num_vertices,
- const uint32_t num_patches,
- PatchInfo* patches)
- {
-
- m_num_edges = num_edges;
- m_num_faces = num_faces;
- m_num_vertices = num_vertices;
- m_num_patches = num_patches;
- m_patches_info = patches;
- }
+ Context(const Context&) = default;
/**
* @brief Total number of edges in mesh
*/
__device__ __forceinline__ uint32_t get_num_edges() const
{
- return m_num_edges;
+ return *m_num_edges;
}
/**
@@ -62,7 +44,7 @@ class Context
*/
__device__ __forceinline__ uint32_t get_num_faces() const
{
- return m_num_faces;
+ return *m_num_faces;
}
/**
@@ -70,7 +52,7 @@ class Context
*/
__device__ __forceinline__ uint32_t get_num_vertices() const
{
- return m_num_vertices;
+ return *m_num_vertices;
}
/**
@@ -78,7 +60,7 @@ class Context
*/
__device__ __forceinline__ uint32_t get_num_patches() const
{
- return m_num_patches;
+ return *m_num_patches;
}
/**
@@ -105,7 +87,51 @@ class Context
}
private:
- uint32_t m_num_edges, m_num_faces, m_num_vertices, m_num_patches;
+ /**
+ * @brief initialize various members
+ * @param num_edges total number of edges in the mesh
+ * @param num_faces total number of faces in the mesh
+ * @param num_vertices total number of vertices in the mesh
+ * @param num_patches number of patches
+ * @param patches pointer to PatchInfo that contains different info about
+ * the patches
+ */
+ void init(const uint32_t num_edges,
+ const uint32_t num_faces,
+ const uint32_t num_vertices,
+ const uint32_t num_patches,
+ PatchInfo* patches)
+ {
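+ // The counts live in device memory (rather than being stored by value)
+ // so that, presumably, they can later be updated in place (e.g., by
+ // dynamic mesh operations) without rebuilding the Context copies that
+ // kernels hold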
+ CUDA_ERROR(cudaMalloc((void**)&m_num_vertices, sizeof(uint32_t)));
+ CUDA_ERROR(cudaMalloc((void**)&m_num_edges, sizeof(uint32_t)));
+ CUDA_ERROR(cudaMalloc((void**)&m_num_faces, sizeof(uint32_t)));
+ CUDA_ERROR(cudaMalloc((void**)&m_num_patches, sizeof(uint32_t)));
+
+ CUDA_ERROR(cudaMemcpy(m_num_vertices,
+ &num_vertices,
+ sizeof(uint32_t),
+ cudaMemcpyHostToDevice));
+ CUDA_ERROR(cudaMemcpy(
+ m_num_edges, &num_edges, sizeof(uint32_t), cudaMemcpyHostToDevice));
+ CUDA_ERROR(cudaMemcpy(
+ m_num_faces, &num_faces, sizeof(uint32_t), cudaMemcpyHostToDevice));
+ CUDA_ERROR(cudaMemcpy(m_num_patches,
+ &num_patches,
+ sizeof(uint32_t),
+ cudaMemcpyHostToDevice));
+ m_patches_info = patches;
+ }
+
+ void release()
+ {
+ CUDA_ERROR(cudaFree(m_num_edges));
+ CUDA_ERROR(cudaFree(m_num_faces));
+ CUDA_ERROR(cudaFree(m_num_vertices));
+ CUDA_ERROR(cudaFree(m_num_patches));
+ }
+
+
+ uint32_t *m_num_edges, *m_num_faces, *m_num_vertices, *m_num_patches;
PatchInfo* m_patches_info;
};
} // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/iterator.cuh b/include/rxmesh/iterator.cuh
index 5eda0450..0ccc4346 100644
--- a/include/rxmesh/iterator.cuh
+++ b/include/rxmesh/iterator.cuh
@@ -19,14 +19,13 @@ struct Iterator
const uint16_t* not_owned_local_id,
int shift = 0)
: m_patch_output(patch_output),
- m_patch_offset(patch_offset),
m_patch_id(patch_id),
m_num_owned(num_owned),
m_not_owned_patch(not_owned_patch),
m_not_owned_local_id(not_owned_local_id),
m_shift(shift)
{
- set(local_id, offset_size);
+ set(local_id, offset_size, patch_offset);
}
Iterator(const Iterator& orig) = default;
@@ -45,7 +44,7 @@ struct Iterator
if (lid < m_num_owned) {
return {m_patch_id, lid};
} else {
- lid -= m_num_owned;
+ lid -= m_num_owned;
return {m_not_owned_patch[lid], m_not_owned_local_id[lid]};
}
}
@@ -109,7 +108,6 @@ struct Iterator
private:
const LocalT* m_patch_output;
- const uint16_t* m_patch_offset;
const uint32_t m_patch_id;
const uint32_t* m_not_owned_patch;
const uint16_t* m_not_owned_local_id;
@@ -120,13 +118,15 @@ struct Iterator
uint16_t m_current;
int m_shift;
- __device__ void set(const uint16_t local_id, const uint32_t offset_size)
+ __device__ void set(const uint16_t local_id,
+ const uint32_t offset_size,
+ const uint16_t* patch_offset)
{
m_current = 0;
m_local_id = local_id;
if (offset_size == 0) {
- m_begin = m_patch_offset[m_local_id];
- m_end = m_patch_offset[m_local_id + 1];
+ m_begin = patch_offset[m_local_id];
+ m_end = patch_offset[m_local_id + 1];
} else {
m_begin = m_local_id * offset_size;
m_end = (m_local_id + 1) * offset_size;
diff --git a/include/rxmesh/kernels/attribute.cuh b/include/rxmesh/kernels/attribute.cuh
index 6a4b3e90..cd3cbdec 100644
--- a/include/rxmesh/kernels/attribute.cuh
+++ b/include/rxmesh/kernels/attribute.cuh
@@ -28,16 +28,22 @@ __launch_bounds__(blockSize) __global__
const uint16_t* d_element_per_patch,
const uint32_t num_patches,
const uint32_t num_attributes,
- T* d_block_output)
+ T* d_block_output,
+ uint32_t attribute_id)
{
uint32_t p_id = blockIdx.x;
if (p_id < num_patches) {
const uint16_t element_per_patch = d_element_per_patch[p_id];
T thread_val = 0;
for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) {
- for (uint32_t j = 0; j < num_attributes; ++j) {
- const T val = X(p_id, i, j);
+ if (attribute_id != INVALID32) {
+ const T val = X(p_id, i, attribute_id);
thread_val += val * val;
+ } else {
+ for (uint32_t j = 0; j < num_attributes; ++j) {
+ const T val = X(p_id, i, j);
+ thread_val += val * val;
+ }
}
}
@@ -53,7 +59,8 @@ __launch_bounds__(blockSize) __global__
const uint16_t* d_element_per_patch,
const uint32_t num_patches,
const uint32_t num_attributes,
- T* d_block_output)
+ T* d_block_output,
+ uint32_t attribute_id)
{
assert(X.get_num_attributes() == Y.get_num_attributes());
@@ -62,8 +69,13 @@ __launch_bounds__(blockSize) __global__
const uint16_t element_per_patch = d_element_per_patch[p_id];
T thread_val = 0;
for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) {
- for (uint32_t j = 0; j < num_attributes; ++j) {
- thread_val += X(p_id, i, j) * Y(p_id, i, j);
+ if (attribute_id != INVALID32) {
+ thread_val +=
+ X(p_id, i, attribute_id) * Y(p_id, i, attribute_id);
+ } else {
+ for (uint32_t j = 0; j < num_attributes; ++j) {
+ thread_val += X(p_id, i, j) * Y(p_id, i, j);
+ }
}
}
@@ -71,6 +83,45 @@ __launch_bounds__(blockSize) __global__
}
}
+
+template <typename T, uint32_t blockSize, typename ReductionOp>
+__launch_bounds__(blockSize) __global__
+ void generic_reduce(const Attribute<T> X,
+ const uint16_t* d_element_per_patch,
+ const uint32_t num_patches,
+ const uint32_t num_attributes,
+ T* d_block_output,
+ ReductionOp reduction_op,
+ T init,
+ uint32_t attribute_id)
+{
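+ // One block reduces one patch: each thread folds a strided range of
+ // elements with reduction_op, then CUB's BlockReduce combines the
+ // per-thread values and thread 0 writes the per-patch partial result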
+ uint32_t p_id = blockIdx.x;
+ if (p_id < num_patches) {
+ const uint16_t element_per_patch = d_element_per_patch[p_id];
+ T thread_val = init;
+ for (uint16_t i = threadIdx.x; i < element_per_patch; i += blockSize) {
+ if (attribute_id != INVALID32) {
+ const T val = X(p_id, i, attribute_id);
+ thread_val = reduction_op(thread_val, val);
+ } else {
+ for (uint32_t j = 0; j < num_attributes; ++j) {
+ const T val = X(p_id, i, j);
+ thread_val = reduction_op(thread_val, val);
+ }
+ }
+ }
+ typedef cub::BlockReduce<T, blockSize> BlockReduce;
+ __shared__ typename BlockReduce::TempStorage temp_storage;
+
+ T block_aggregate =
+ BlockReduce(temp_storage).Reduce(thread_val, reduction_op);
+ if (threadIdx.x == 0) {
+ d_block_output[blockIdx.x] = block_aggregate;
+ }
+ }
+}
+
+
template <typename T>
__global__ void memset_attribute(const Attribute<T> attr,
const T value,
diff --git a/include/rxmesh/kernels/collective.cuh b/include/rxmesh/kernels/collective.cuh
index 43baec8c..2df386f7 100644
--- a/include/rxmesh/kernels/collective.cuh
+++ b/include/rxmesh/kernels/collective.cuh
@@ -5,6 +5,7 @@
#include "rxmesh/util/macros.h"
namespace rxmesh {
+namespace detail {
/**
* @brief Compute block-wide exclusive sum using CUB
*/
@@ -93,5 +94,5 @@ __device__ __forceinline__ void cub_block_exclusive_sum(T* data,
data[size] = s_prv_run_aggregate;
}*/
}
-
+} // namespace detail
} // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/edge_flip.cuh b/include/rxmesh/kernels/edge_flip.cuh
new file mode 100644
index 00000000..af0313c3
--- /dev/null
+++ b/include/rxmesh/kernels/edge_flip.cuh
@@ -0,0 +1,190 @@
+namespace rxmesh {
+
+namespace detail {
+/**
+ * @brief Flip edges in a patch for which the user-supplied predicate returns
+ * true. The FE and EV tables are updated in shared memory and then stored
+ * back to global memory. Only owned, interior (non-boundary) edges are
+ * flipped.
+ */
+template <uint32_t blockThreads, typename predicateT>
+__device__ __inline__ void edge_flip(PatchInfo& patch_info,
+ const predicateT predicate)
+{
+ // Extract the argument in the predicate lambda function
+ using PredicateTTraits = detail::FunctionTraits<predicateT>;
+ using HandleT = typename PredicateTTraits::template arg<0>::type;
+ static_assert(
+ std::is_same_v<HandleT, EdgeHandle>,
+ "First argument in predicate lambda function should be EdgeHandle");
+
+ // patch basic info
+ const uint16_t num_faces = patch_info.num_faces;
+ const uint16_t num_edges = patch_info.num_edges;
+ const uint16_t num_owned_edges = patch_info.num_owned_edges;
+
+ // shared memory used to store EF, EV, FE, and info about the flipped edge
+ __shared__ uint16_t s_num_flipped_edges;
+ extern __shared__ uint16_t shrd_mem[];
+ uint16_t* s_fe = shrd_mem;
+ uint16_t* s_ef = &shrd_mem[3 * num_faces + (3 * num_faces) % 2];
+ uint16_t* s_ev = s_ef;
+ uint16_t* s_flipped = &s_ef[2 * num_edges];
+
+ if (threadIdx.x == 0) {
+ s_num_flipped_edges = 0;
+ }
+ // Initialize EF to invalid values
+ // Cast EF into 32-bit to avoid bank conflicts
+ uint32_t* s_ef32 = reinterpret_cast<uint32_t*>(s_ef);
+ for (uint16_t i = threadIdx.x; i < num_edges; i += blockThreads) {
+ s_ef32[i] = INVALID32;
+ }
+
+ // load FE into shared memory
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.fe),
+ num_faces * 3,
+ reinterpret_cast<uint16_t*>(s_fe),
+ true);
+ __syncthreads();
+
+ // Transpose FE into EF so we obtain the two triangles incident to the
+ // to-be-flipped edges. We use the version that is optimized for
+ // manifolds (we don't flip non-manifold edges)
+ e_f_manifold<blockThreads>(num_edges, num_faces, s_fe, s_ef);
+ __syncthreads();
+
+ // loop over all edges---one thread per edge
+ uint16_t local_id = threadIdx.x;
+ while (local_id < num_owned_edges) {
+
+ // check if we should flip this edge based on the user-supplied
+ // predicate
+ if (predicate({patch_info.patch_id, local_id})) {
+
+ // read the two faces incident to this edge
+ const uint16_t f0 = s_ef[2 * local_id];
+ const uint16_t f1 = s_ef[2 * local_id + 1];
+
+ // if the edge is boundary (i.e., only incident to one face), then
+ // we don't flip
+ if (f0 != INVALID16 && f1 != INVALID16) {
+ const uint16_t flipped_id = atomicAdd(&s_num_flipped_edges, 1);
+
+ // for each flipped edge, we add it to shared memory buffer
+ // along with one of its incident faces
+ s_flipped[2 * flipped_id] = local_id;
+ s_flipped[2 * flipped_id + 1] = f0;
+
+ // the three edges incident to the first face
+ uint16_t f0_e[3];
+ f0_e[0] = s_fe[3 * f0];
+ f0_e[1] = s_fe[3 * f0 + 1];
+ f0_e[2] = s_fe[3 * f0 + 2];
+
+ // the flipped edge position in first face
+ const uint16_t l0 = ((f0_e[0] >> 1) == local_id) ?
+ 0 :
+ (((f0_e[1] >> 1) == local_id) ? 1 : 2);
+
+ // the three edges incident to the second face
+ uint16_t f1_e[3];
+ f1_e[0] = s_fe[3 * f1];
+ f1_e[1] = s_fe[3 * f1 + 1];
+ f1_e[2] = s_fe[3 * f1 + 2];
+
+ // the flipped edge position in second face
+ const uint16_t l1 = ((f1_e[0] >> 1) == local_id) ?
+ 0 :
+ (((f1_e[1] >> 1) == local_id) ? 1 : 2);
+
+ const uint16_t f0_shift = 3 * f0;
+ const uint16_t f1_shift = 3 * f1;
+
+ s_fe[f0_shift + ((l0 + 1) % 3)] = f0_e[l0];
+ s_fe[f1_shift + ((l1 + 1) % 3)] = f1_e[l1];
+
+ s_fe[f1_shift + l1] = f0_e[(l0 + 1) % 3];
+ s_fe[f0_shift + l0] = f1_e[(l1 + 1) % 3];
+ }
+ }
+ local_id += blockThreads;
+ }
+ __syncthreads();
+
+ // If flipped at least one edge
+ if (s_num_flipped_edges > 0) {
+ // we store the changes we made to FE in global memory
+ detail::load_uint16<blockThreads>(
+ s_fe, 3 * num_faces, reinterpret_cast<uint16_t*>(patch_info.fe));
+
+ // load EV in the same place that was used to store EF
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.ev),
+ num_edges * 2,
+ reinterpret_cast<uint16_t*>(s_ev),
+ true);
+ __syncthreads();
+
+ // Now, we go over all edges that have been flipped
+ for (uint32_t e = threadIdx.x; e < s_num_flipped_edges;
+ e += blockThreads) {
+
+ // grab the edge that was flipped and one of its incident faces
+ const uint16_t edge = s_flipped[2 * e];
+ const uint16_t face = s_flipped[2 * e + 1];
+
+ // grab the three edges incident to this face
+ uint16_t fe[3];
+ fe[0] = s_fe[3 * face];
+ fe[1] = s_fe[3 * face + 1];
+ fe[2] = s_fe[3 * face + 2];
+
+ // find the flipped edge's position in this face incident edges
+ const uint16_t l =
+ ((fe[0] >> 1) == edge) ? 0 : (((fe[1] >> 1) == edge) ? 1 : 2);
+
+ // find which edge is next and which is before the flipped edge
+ // in this face incident edge
+ const uint16_t next_edge = fe[(l + 1) % 3];
+ const uint16_t prev_edge = fe[(l + 2) % 3];
+
+ // grab the two end vertices of the next edge
+ uint16_t next_edge_v[2];
+ next_edge_v[0] = s_ev[2 * (next_edge >> 1)];
+ next_edge_v[1] = s_ev[2 * (next_edge >> 1) + 1];
+
+ // grab the two end vertices of the previous edge
+ uint16_t prev_edge_v[2];
+ prev_edge_v[0] = s_ev[2 * (prev_edge >> 1)];
+ prev_edge_v[1] = s_ev[2 * (prev_edge >> 1) + 1];
+
+ // Find which vertex from the next edge that is not common between
+ // the two incident vertices in the next and previous edges
+ const uint16_t n = (next_edge_v[0] == prev_edge_v[0] ||
+ next_edge_v[0] == prev_edge_v[1]) ?
+ next_edge_v[1] :
+ next_edge_v[0];
+
+ // Find which vertex from the previous edge that is not common
+ // between the two incident vertices in the next and previous edges
+ const uint16_t p = (prev_edge_v[0] == next_edge_v[0] ||
+ prev_edge_v[0] == next_edge_v[1]) ?
+ prev_edge_v[1] :
+ prev_edge_v[0];
+
+ // The flipped edge connects p and n vertices. The order depends on
+ // the edge direction
+ if ((fe[l] & 1)) {
+ s_ev[2 * edge] = n;
+ s_ev[2 * edge + 1] = p;
+ } else {
+ s_ev[2 * edge] = p;
+ s_ev[2 * edge + 1] = n;
+ }
+ }
+
+ // We store the changes in EV to global memory
+ __syncthreads();
+ detail::load_uint16<blockThreads>(
+ s_ev, num_edges * 2, reinterpret_cast<uint16_t*>(patch_info.ev));
+ }
+}
+} // namespace detail
+} // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/kernels/loader.cuh b/include/rxmesh/kernels/loader.cuh
index 9f4fb62c..ba6af8a8 100644
--- a/include/rxmesh/kernels/loader.cuh
+++ b/include/rxmesh/kernels/loader.cuh
@@ -2,11 +2,36 @@
#include <assert.h>
#include <stdint.h>
+
+#include <cooperative_groups.h>
+#include <cooperative_groups/memcpy_async.h>
#include "rxmesh/context.h"
#include "rxmesh/local.h"
#include "rxmesh/types.h"
+#include "rxmesh/util/util.h"
namespace rxmesh {
+namespace detail {
+
+template <typename T, typename SizeT>
+__device__ __inline__ void load_async(const T* in,
+ const SizeT size,
+ T* out,
+ bool with_wait)
+{
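+ // cooperative_groups::memcpy_async issues a block-wide asynchronous copy
+ // into shared memory (hardware-accelerated on sm_80+); pass with_wait =
+ // true when the data is needed immediately after this call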
+ namespace cg = cooperative_groups;
+ cg::thread_block block = cg::this_thread_block();
+
+ cg::memcpy_async(
+ block,
+ out,
+ in,
+ sizeof(T) * size);
+
+ if (with_wait) {
+ cg::wait(block);
+ }
+}
template <uint32_t blockThreads>
__device__ __forceinline__ void load_uint16(const uint16_t* in,
@@ -32,109 +57,124 @@ __device__ __forceinline__ void load_uint16(const uint16_t* in,
/**
- * @brief load the patch FE
- * @param patch_info input patch info
- * @param patch_faces output FE
- * @return
- */
-template <uint32_t blockThreads>
-__device__ __forceinline__ void load_patch_FE(const PatchInfo& patch_info,
- LocalEdgeT* fe)
-{
- load_uint16<blockThreads>(reinterpret_cast<const uint16_t*>(patch_info.fe),
- patch_info.num_faces * 3,
- reinterpret_cast<uint16_t*>(fe));
-}
-
-/**
- * @brief load the patch EV
- * @param patch_info input patch info
- * @param ev output EV
- * @return
- */
-template <uint32_t blockThreads>
-__device__ __forceinline__ void load_patch_EV(const PatchInfo& patch_info,
- LocalVertexT* ev)
-{
- const uint32_t num_edges = patch_info.num_edges;
- const uint32_t* input_ev32 =
- reinterpret_cast<const uint32_t*>(patch_info.ev);
- uint32_t* output_ev32 = reinterpret_cast<uint32_t*>(ev);
-#pragma unroll 2
- for (uint32_t i = threadIdx.x; i < num_edges; i += blockThreads) {
- uint32_t a = input_ev32[i];
- output_ev32[i] = a;
- }
-}
-
-/**
- * @brief load the patch topology i.e., EV and FE
- * @param patch_info input patch info
- * @param load_ev input indicates if we should load EV
- * @param load_fe input indicates if we should load FE
+ * @brief load the patch topology based on the requirements of a query operation
+ * @tparam op the query operation
+ * @param patch_info input patch info
* @param s_ev where EV will be loaded
* @param s_fe where FE will be loaded
+ * @param with_wait whether to add a sync at the end
* @return
*/
-template <uint32_t blockThreads>
-__device__ __forceinline__ void load_mesh(const PatchInfo& patch_info,
- const bool load_ev,
- const bool load_fe,
- LocalVertexT*& s_ev,
- LocalEdgeT*& s_fe)
+template <Op op>
+__device__ __forceinline__ void load_mesh_async(const PatchInfo& patch_info,
+ uint16_t*& s_ev,
+ uint16_t*& s_fe,
+ bool with_wait)
{
+ assert(s_ev == s_fe);
- if (load_ev) {
- load_patch_EV(patch_info, s_ev);
- }
- // load patch faces
- if (load_fe) {
- if (load_ev) {
- // if we loaded the edges, then we need to move where
- // s_fe is pointing at to avoid overwrite
- s_fe =
- reinterpret_cast<LocalEdgeT*>(&s_ev[patch_info.num_edges * 2]);
+ switch (op) {
+ case Op::VV: {
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.ev),
+ 2 * patch_info.num_edges,
+ s_ev,
+ with_wait);
+ break;
}
- load_patch_FE(patch_info, s_fe);
- }
-}
-
-template <uint32_t blockThreads>
-__device__ __forceinline__ void load_not_owned_local_id(
- const uint16_t num_not_owned,
- uint16_t* output_not_owned_local_id,
- const uint16_t* input_not_owned_local_id)
-{
- load_uint16<blockThreads>(
- input_not_owned_local_id, num_not_owned, output_not_owned_local_id);
-}
+ case Op::VE: {
+ assert(2 * patch_info.num_edges > patch_info.num_vertices);
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.ev),
+ 2 * patch_info.num_edges,
+ s_ev,
+ with_wait);
+ break;
+ }
+ case Op::VF: {
+ assert(3 * patch_info.num_faces > patch_info.num_vertices);
+ // TODO need to revisit this
+ s_ev = s_fe + 3 * patch_info.num_faces;
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.fe),
+ 3 * patch_info.num_faces,
+ s_fe,
+ false);
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.ev),
+ 2 * patch_info.num_edges,
+ s_ev,
+ with_wait);
+ break;
+ }
+ case Op::FV: {
+ // TODO need to revisit this
+ s_fe = s_ev + 2 * patch_info.num_edges;
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.ev),
+ 2 * patch_info.num_edges,
+ s_ev,
+ false);
-template <uint32_t blockThreads>
-__device__ __forceinline__ void load_not_owned_patch(
- const uint16_t num_not_owned,
- uint32_t* output_not_owned_patch,
- const uint32_t* input_not_owned_patch)
-{
- for (uint32_t i = threadIdx.x; i < num_not_owned; i += blockThreads) {
- output_not_owned_patch[i] = input_not_owned_patch[i];
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.fe),
+ 3 * patch_info.num_faces,
+ s_fe,
+ with_wait);
+ break;
+ }
+ case Op::FE: {
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.fe),
+ 3 * patch_info.num_faces,
+ s_fe,
+ with_wait);
+ break;
+ }
+ case Op::FF: {
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.fe),
+ 3 * patch_info.num_faces,
+ s_fe,
+ with_wait);
+ break;
+ }
+ case Op::EV: {
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.ev),
+ 2 * patch_info.num_edges,
+ s_ev,
+ with_wait);
+ break;
+ }
+ case Op::EF: {
+ assert(3 * patch_info.num_faces > patch_info.num_edges);
+ load_async(reinterpret_cast<const uint16_t*>(patch_info.fe),
+ 3 * patch_info.num_faces,
+ s_fe,
+ with_wait);
+ break;
+ }
+ default: {
+ assert(1 != 1);
+ break;
+ }
}
}
/**
- * @brief Load local id and patch of the not-owned verteices, edges, or faces
+ * @brief Load local id and patch of the not-owned vertices, edges, or faces
* based on query op.
+ * @tparam op the query operation
* @param patch_info input patch info
* @param not_owned_local_id output local id
* @param not_owned_patch output patch id
- * @param num_not_owned number of not-owned mesh elements
+ * @param num_owned number of owned mesh elements
+ * @param with_wait whether to set a block sync after loading the memory
*/
-template <Op op, uint32_t blockThreads>
-__device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
- uint16_t*& not_owned_local_id,
- uint32_t*& not_owned_patch,
- uint16_t& num_owned)
+template <Op op>
+__device__ __forceinline__ void load_not_owned_async(
+ const PatchInfo& patch_info,
+ uint16_t*& not_owned_local_id,
+ uint32_t*& not_owned_patch,
+ uint16_t& num_owned,
+ bool with_wait)
{
- uint32_t num_not_owned = 0;
+ uint16_t num_not_owned = 0;
+ uint32_t* g_not_owned_patch = nullptr;
+ uint16_t* g_not_owned_local_id = nullptr;
+
switch (op) {
case Op::VV: {
num_owned = patch_info.num_owned_vertices;
@@ -146,12 +186,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
not_owned_patch = not_owned_patch + 2 * patch_info.num_edges;
not_owned_local_id =
reinterpret_cast(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_v);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<const uint16_t*>(patch_info.not_owned_id_v));
+
+ g_not_owned_patch = patch_info.not_owned_patch_v;
+ g_not_owned_local_id =
+ reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v);
break;
}
case Op::VE: {
@@ -164,12 +202,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
not_owned_patch = not_owned_patch + 2 * patch_info.num_edges;
not_owned_local_id =
reinterpret_cast(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_e);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<const uint16_t*>(patch_info.not_owned_id_e));
+
+ g_not_owned_patch = patch_info.not_owned_patch_e;
+ g_not_owned_local_id =
+ reinterpret_cast(patch_info.not_owned_id_e);
break;
}
case Op::VF: {
@@ -183,12 +219,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
not_owned_patch = not_owned_patch + shift;
not_owned_local_id =
reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_f);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f));
+
+ g_not_owned_patch = patch_info.not_owned_patch_f;
+ g_not_owned_local_id =
+ reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f);
break;
}
case Op::FV: {
@@ -196,14 +230,13 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
num_not_owned = patch_info.num_vertices - num_owned;
assert(2 * patch_info.num_edges >= (1 + 2) * num_not_owned);
+
not_owned_local_id =
reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_v);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v));
+
+ g_not_owned_patch = patch_info.not_owned_patch_v;
+ g_not_owned_local_id =
+ reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v);
break;
}
case Op::FE: {
@@ -217,12 +250,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
not_owned_patch + DIVIDE_UP(3 * patch_info.num_faces, 2);
not_owned_local_id =
reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_e);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<uint16_t*>(patch_info.not_owned_id_e));
+
+ g_not_owned_patch = patch_info.not_owned_patch_e;
+ g_not_owned_local_id =
+ reinterpret_cast<uint16_t*>(patch_info.not_owned_id_e);
break;
}
case Op::FF: {
@@ -231,12 +262,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
not_owned_local_id =
reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_f);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f));
+
+ g_not_owned_patch = patch_info.not_owned_patch_f;
+ g_not_owned_local_id =
+ reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f);
break;
}
case Op::EV: {
@@ -249,12 +278,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
not_owned_patch = not_owned_patch + patch_info.num_edges;
not_owned_local_id =
reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_v);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v));
+
+ g_not_owned_patch = patch_info.not_owned_patch_v;
+ g_not_owned_local_id =
+ reinterpret_cast<uint16_t*>(patch_info.not_owned_id_v);
break;
}
case Op::EF: {
@@ -267,12 +294,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
not_owned_patch = not_owned_patch + 3 * patch_info.num_faces;
not_owned_local_id =
reinterpret_cast<uint16_t*>(not_owned_patch + num_not_owned);
- load_not_owned_patch<blockThreads>(
- num_not_owned, not_owned_patch, patch_info.not_owned_patch_f);
- load_not_owned_local_id<blockThreads>(
- num_not_owned,
- not_owned_local_id,
- reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f));
+
+ g_not_owned_patch = patch_info.not_owned_patch_f;
+ g_not_owned_local_id =
+ reinterpret_cast<uint16_t*>(patch_info.not_owned_id_f);
break;
}
default: {
@@ -280,6 +305,10 @@ __device__ __forceinline__ void load_not_owned(const PatchInfo& patch_info,
break;
}
}
-}
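+ // issue both copies asynchronously: the patch ids never wait (false), so
+ // their copy overlaps with the local-id load, which honors with_wait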
+ load_async(g_not_owned_patch, num_not_owned, not_owned_patch, false);
+ load_async(
+ g_not_owned_local_id, num_not_owned, not_owned_local_id, with_wait);
+}
+} // namespace detail
} // namespace rxmesh
diff --git a/include/rxmesh/kernels/query_dispatcher.cuh b/include/rxmesh/kernels/query_dispatcher.cuh
index ae3c1387..e7ff2d46 100644
--- a/include/rxmesh/kernels/query_dispatcher.cuh
+++ b/include/rxmesh/kernels/query_dispatcher.cuh
@@ -33,13 +33,6 @@ __device__ __inline__ void query_block_dispatcher(const PatchInfo& patch_info,
{
static_assert(op != Op::EE, "Op::EE is not supported!");
- constexpr bool load_fe = (op == Op::VF || op == Op::EE || op == Op::EF ||
- op == Op::FV || op == Op::FE || op == Op::FF);
- constexpr bool loead_ev = (op == Op::VV || op == Op::VE || op == Op::VF ||
- op == Op::EV || op == Op::FV);
- static_assert(loead_ev || load_fe,
- "At least faces or edges needs to be loaded");
-
// Check if any of the mesh elements are in the active set
// input mapping does not need to be stored in shared memory since it will
// be read coalesced, we can rely on L1 cache here
@@ -70,34 +63,38 @@ __device__ __inline__ void query_block_dispatcher(const PatchInfo& patch_info,
}
// 2) Load the patch info
+ // TODO: shift shrd_mem so that it is aligned to a 128-byte boundary
extern __shared__ uint16_t shrd_mem[];
- LocalVertexT* s_ev = reinterpret_cast<LocalVertexT*>(shrd_mem);
- LocalEdgeT* s_fe = reinterpret_cast<LocalEdgeT*>(shrd_mem);
- load_mesh<blockThreads>(patch_info, loead_ev, load_fe, s_ev, s_fe);
+ uint16_t* s_ev = shrd_mem;
+ uint16_t* s_fe = shrd_mem;
+ load_mesh_async<op>(patch_info, s_ev, s_fe, true);
not_owned_patch = reinterpret_cast<uint32_t*>(shrd_mem);
not_owned_local_id = shrd_mem;
num_owned = 0;
+
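+ // make sure the mesh staged in shared memory is visible to all threads
+ // before any of them starts the query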
+ __syncthreads();
+
// 3)Perform the query operation
if (oriented) {
assert(op == Op::VV);
- if constexpr (op == Op::VV) {
- __syncthreads();
- v_v_oreinted<blockThreads>(patch_info,
- s_output_offset,
- s_output_value,
- reinterpret_cast<uint16_t*>(s_ev));
+ if constexpr (op == Op::VV) {
+ v_v_oreinted<blockThreads>(
+ patch_info, s_output_offset, s_output_value, s_ev);
}
} else {
if constexpr (!(op == Op::VV || op == Op::FV || op == Op::FF)) {
- load_not_owned<op, blockThreads>(
- patch_info, not_owned_local_id, not_owned_patch, num_owned);
+ load_not_owned_async<op, blockThreads>(patch_info,
+ not_owned_local_id,
+ not_owned_patch,
+ num_owned,
+ true);
}
- __syncthreads();
+
query<op, blockThreads>(s_output_offset,
s_output_value,
- reinterpret_cast<uint16_t*>(s_ev),
- reinterpret_cast<uint16_t*>(s_fe),
+ s_ev,
+ s_fe,
patch_info.num_vertices,
patch_info.num_edges,
patch_info.num_faces);
@@ -108,8 +105,8 @@ __device__ __inline__ void query_block_dispatcher(const PatchInfo& patch_info,
// need to sync since we will overwrite things that are used in
// query
__syncthreads();
- load_not_owned<op, blockThreads>(
- patch_info, not_owned_local_id, not_owned_patch, num_owned);
+ load_not_owned_async<op, blockThreads>(
+ patch_info, not_owned_local_id, not_owned_patch, num_owned, true);
}
@@ -158,7 +155,7 @@ __device__ __inline__ void query_block_dispatcher(const Context& context,
uint32_t* not_owned_patch(nullptr);
uint16_t* not_owned_local_id(nullptr);
- detail::template query_block_dispatcher<op, blockThreads>(
+ query_block_dispatcher<op, blockThreads>(
context.get_patches_info()[patch_id],
compute_active_set,
oriented,
diff --git a/include/rxmesh/kernels/rxmesh_queries.cuh b/include/rxmesh/kernels/rxmesh_queries.cuh
index b77459c8..e88a49bb 100644
--- a/include/rxmesh/kernels/rxmesh_queries.cuh
+++ b/include/rxmesh/kernels/rxmesh_queries.cuh
@@ -10,6 +10,8 @@
#include "rxmesh/types.h"
namespace rxmesh {
+namespace detail {
+
template <uint32_t rowOffset, uint32_t blockThreads, int itemPerThread>
@@ -92,6 +94,26 @@ __device__ __forceinline__ void block_mat_transpose(const uint32_t num_rows,
}
}
+template <uint32_t blockThreads>
+__device__ __forceinline__ void e_f_manifold(const uint16_t num_edges,
+ const uint16_t num_faces,
+ const uint16_t* s_fe,
+ uint16_t* s_ef)
+{
+ // s_ef should be filled with INVALID16 before calling this function
+
+ for (uint16_t e = threadIdx.x; e < 3 * num_faces; e += blockThreads) {
+ uint16_t edge = s_fe[e] >> 1;
+ uint16_t face_id = e / 3;
+
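+ // a manifold edge touches at most two faces: try to claim the first
+ // slot and, if it is already taken, fall back to the second one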
+ auto ret = atomicCAS(s_ef + 2 * edge, INVALID16, face_id);
+ if (ret != INVALID16) {
+ ret = atomicCAS(s_ef + 2 * edge + 1, INVALID16, face_id);
+ assert(ret == INVALID16);
+ }
+ }
+}
+
template <uint32_t blockThreads>
__device__ __forceinline__ void v_v_oreinted(const PatchInfo& patch_info,
uint16_t*& s_output_offset,
@@ -112,8 +134,10 @@ __device__ __forceinline__ void v_v_oreinted(const PatchInfo& patch_info,
uint16_t* s_fe = &s_output_value[2 * num_edges];
uint16_t* s_ef = &s_fe[3 * num_faces + (3 * num_faces) % 2];
LocalEdgeT* temp_fe = reinterpret_cast<LocalEdgeT*>(s_fe);
- load_patch_FE<blockThreads>(patch_info, temp_fe);
-
+ load_async(reinterpret_cast<uint16_t*>(patch_info.fe),
+ num_faces * 3,
+ reinterpret_cast<uint16_t*>(temp_fe),
+ true);
for (uint32_t i = threadIdx.x; i < num_edges * 2; i += blockThreads) {
s_ef[i] = INVALID16;
@@ -133,17 +157,7 @@ __device__ __forceinline__ void v_v_oreinted(const PatchInfo& patch_info,
// We need to sync here to make sure that s_fe is loaded but there is
// a sync in block_mat_transpose that takes care of this
-
- for (uint16_t e = threadIdx.x; e < 3 * num_faces; e += blockThreads) {
- uint16_t edge = s_fe[e] >> 1;
- uint16_t face_id = e / 3;
-
- auto ret = atomicCAS(s_ef + 2 * edge, INVALID16, face_id);
- if (ret != INVALID16) {
- ret = atomicCAS(s_ef + 2 * edge + 1, INVALID16, face_id);
- assert(ret == INVALID16);
- }
- }
+ e_f_manifold<blockThreads>(num_edges, num_faces, s_fe, s_ef);
// To orient, we pin the first edge and check all the subsequent edges
// For each edge, we search for the two faces containing it (should be
@@ -209,7 +223,10 @@ __device__ __forceinline__ void v_v_oreinted(const PatchInfo& patch_info,
// Load EV into s_ef since both has the same size (2*#E)
s_ev = s_ef;
LocalVertexT* temp_ev = reinterpret_cast<LocalVertexT*>(s_ef);
- load_patch_EV<blockThreads>(patch_info, temp_ev);
+ load_async(reinterpret_cast<uint16_t*>(patch_info.ev),
+ num_edges * 2,
+ reinterpret_cast<uint16_t*>(temp_ev),
+ true);
__syncthreads();
@@ -293,6 +310,7 @@ __device__ __forceinline__ void v_v(const uint16_t num_vertices,
}
}
+template <uint32_t blockThreads>
__device__ __forceinline__ void f_v(const uint16_t num_edges,
const uint16_t* d_edges,
const uint16_t num_faces,
@@ -304,7 +322,7 @@ __device__ __forceinline__ void f_v(const uint16_t num_edges,
// face in d_faces (i.e., three items), then this thread
// can safely over-write what is in d_faces.
- for (uint32_t f = threadIdx.x; f < num_faces; f += blockDim.x) {
+ for (uint32_t f = threadIdx.x; f < num_faces; f += blockThreads) {
uint16_t f_v[3];
uint32_t f_id = 3 * f;
// TODO use vector load and store instead of looping
@@ -340,7 +358,7 @@ __device__ __forceinline__ void v_f(const uint16_t num_faces,
// Second, the transpose happens in place i.e., d_faces will hold the
// offset and d_edges will hold the value (row id)
- f_v(num_edges, d_edges, num_faces, d_faces);
+ f_v<blockThreads>(num_edges, d_edges, num_faces, d_faces);
__syncthreads();
block_mat_transpose<3u, blockThreads>(
@@ -493,7 +511,7 @@ __device__ __forceinline__ void query(uint16_t*& s_output_offset,
}
case Op::FV: {
s_output_value = s_fe;
- f_v(num_edges, s_ev, num_faces, s_fe);
+ f_v<blockThreads>(num_edges, s_ev, num_faces, s_fe);
break;
}
case Op::FE: {
@@ -515,5 +533,5 @@ __device__ __forceinline__ void query(uint16_t*& s_output_offset,
break;
}
}
-
+} // namespace detail
} // namespace rxmesh
diff --git a/include/rxmesh/kernels/update_dispatcher.cuh b/include/rxmesh/kernels/update_dispatcher.cuh
new file mode 100644
index 00000000..48410d7a
--- /dev/null
+++ b/include/rxmesh/kernels/update_dispatcher.cuh
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "rxmesh/context.h"
+#include "rxmesh/handle.h"
+#include "rxmesh/kernels/edge_flip.cuh"
+#include "rxmesh/kernels/loader.cuh"
+#include "rxmesh/patch_info.h"
+#include "rxmesh/util/meta.h"
+
+namespace rxmesh {
+
+/**
+ * @brief The main entry point for update operations. This function should be
+ * called by the whole block. Threads will be assigned to mesh elements
+ * depending on the update operation. The user should supply a predicate
+ * lambda function that checks whether the update operation should be applied
+ * to the input mesh element.
+ * @tparam DynOp the type of update operation
+ * @tparam blockThreads the number of CUDA threads in the block
+ * @tparam predicateT the type of predicate lambda function (inferred)
+ * @param context stores various parameters needed for the update operation.
+ * The context can be obtained from RXMeshDynamic
+ * @param predicate the predicate lambda function that will be executed by
+ * each thread in the block. It takes one input parameter, which is a handle
+ * whose type depends on the update operation (e.g., EdgeHandle for edge
+ * flip), and should return a boolean that is true only if the update
+ * operation should be applied to the given mesh element
+ */
+template <DynOp op, uint32_t blockThreads, typename predicateT>
+__device__ __inline__ void update_block_dispatcher(const Context& context,
+ const predicateT predicate)
+{
+
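+ // one block is assigned to each patch; extra blocks exit early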
+ const uint32_t patch_id = blockIdx.x;
+
+ if (patch_id >= context.get_num_patches()) {
+ return;
+ }
+
+ if constexpr (op == DynOp::EdgeFlip) {
+ detail::edge_flip<blockThreads>(context.get_patches_info()[patch_id],
+ predicate);
+ }
+}
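+
+// A minimal usage sketch (hypothetical kernel and predicate, not part of this
+// patch): flip the edges for which a user-supplied test returns true,
+// launching one block per patch with blockThreads threads each:
+//
+// template <uint32_t blockThreads>
+// __global__ void flip_kernel(Context context)
+// {
+//     update_block_dispatcher<DynOp::EdgeFlip, blockThreads>(
+//         context, [](const EdgeHandle& eh) { return true; });
+// }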
+
+} // namespace rxmesh
\ No newline at end of file
diff --git a/include/rxmesh/reduce_handle.h b/include/rxmesh/reduce_handle.h
index a44bb9ce..61dce5ad 100644
--- a/include/rxmesh/reduce_handle.h
+++ b/include/rxmesh/reduce_handle.h
@@ -56,12 +56,15 @@ class ReduceHandle
* output on the host
* @param attr1 first input attribute
* @param attr2 second input attribute
+ * @param attribute_id specific attribute ID to compute its dot product.
+ * Default is INVALID32, which computes the dot product over all attributes
* @param stream stream to run the computation on
* @return the output of dot product on the host
*/
T dot(const Attribute<T>& attr1,
const Attribute<T>& attr2,
- cudaStream_t stream = NULL)
+ uint32_t attribute_id = INVALID32,
+ cudaStream_t stream = NULL)
{
if ((attr1.get_allocated() & DEVICE) != DEVICE ||
(attr2.get_allocated() & DEVICE) != DEVICE) {
@@ -77,19 +80,24 @@ class ReduceHandle
attr1.m_d_element_per_patch,
m_num_patches,
attr1.get_num_attributes(),
- m_d_reduce_1st_stage);
+ m_d_reduce_1st_stage,
+ attribute_id);
- return reduce_2nd_stage(stream);
+ return reduce_2nd_stage(stream, cub::Sum(), 0);
}
/**
* @brief compute the L2 norm of an input attribute and return the output
* on the host
* @param attr input attribute
+ * @param attribute_id specific attribute ID to compute its norm2. Default
+ * is INVALID32, which computes norm2 over all attributes
* @param stream stream to run the computation on
* @return the output of L2 norm on the host
*/
- T norm2(const Attribute<T>& attr, cudaStream_t stream = NULL)
+ T norm2(const Attribute<T>& attr,
+ uint32_t attribute_id = INVALID32,
+ cudaStream_t stream = NULL)
{
if ((attr.get_allocated() & DEVICE) != DEVICE) {
RXMESH_ERROR(
@@ -103,23 +111,81 @@ class ReduceHandle
attr.m_d_element_per_patch,
m_num_patches,
attr.get_num_attributes(),
- m_d_reduce_1st_stage);
+ m_d_reduce_1st_stage,
+ attribute_id);
- return std::sqrt(reduce_2nd_stage(stream));
+ return std::sqrt(reduce_2nd_stage(stream, cub::Sum(), 0));
}
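+
+ // e.g., norm2(attr, 0) reduces only the first attribute, while norm2(attr)
+ // keeps the INVALID32 default and reduces across all attributes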
+ /**
+ * @brief perform a generic reduction operation on an input attribute
+ * @tparam ReductionOp type of the binary reduction functor having member T
+ * operator()(const T &a, const T &b)
+ * @param attr input attribute
+ * @param reduction_op the binary reduction functor. It is possible to use
+ * CUB built-in reduction functors like cub::Max() or cub::Sum(). An example
+ * of a user-defined functor:
+ *
+ * struct CustomMin
+ * {
+ * template <typename T>
+ * __device__ __forceinline__ T operator()(const T& a, const T& b) const
+ * {
+ * return (b < a) ? b : a;
+ * }
+ * };
+ * Read more about reductions in the CUB docs
+ * https://nvlabs.github.io/cub/structcub_1_1_device_reduce.html
+ * @param init initial value for reduction. This should be the "neutral"
+ * value for the reduction operation, e.g., 0 for sum, 1 for multiplication,
+ * 0 for max on uint32_t
+ * @param attribute_id specific attribute ID to compute its reduction.
+ * Default is INVALID32, which computes the reduction over all attributes
+ * @param stream stream to run the computation on
+ * @return the reduced output on the host
+ */
+
+ template <typename ReductionOp>
+ T reduce(const Attribute<T>& attr,
+ ReductionOp reduction_op,
+ T init,
+ uint32_t attribute_id = INVALID32,
+ cudaStream_t stream = NULL)
+ {
+ if ((attr.get_allocated() & DEVICE) != DEVICE) {
+ RXMESH_ERROR(
+ "ReduceHandle::reduce() input attribute to should be "
+ "allocated on the device");
+ }
+
+ detail::generic_reduce<T, m_block_size>
+ <<<m_num_patches, m_block_size, 0, stream>>>(
+ attr,
+ attr.m_d_element_per_patch,
+ m_num_patches,
+ attr.get_num_attributes(),
+ m_d_reduce_1st_stage,
+ reduction_op,
+ init,
+ attribute_id);
+
+ return reduce_2nd_stage(stream, reduction_op, init);
+ }
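+
+ // A usage sketch (assuming rh is a ReduceHandle<float> and attr is a
+ // device-allocated Attribute<float>), computing the maximum of attribute 0:
+ // float mx = rh.reduce(attr, cub::Max(),
+ // std::numeric_limits<float>::lowest(), 0);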
private:
- T reduce_2nd_stage(cudaStream_t stream)
+ template <typename ReductionOp>
+ T reduce_2nd_stage(cudaStream_t stream, ReductionOp reduction_op, T init)
{
T h_output = 0;
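+ // combine the per-patch partial results produced by the 1st-stage kernel
+ // using the same reduction operator and neutral initial value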
- cub::DeviceReduce::Sum(m_d_reduce_temp_storage,
- m_reduce_temp_storage_bytes,
- m_d_reduce_1st_stage,
- m_d_reduce_2nd_stage,
- m_num_patches,
- stream);
+ cub::DeviceReduce::Reduce(m_d_reduce_temp_storage,
+ m_reduce_temp_storage_bytes,
+ m_d_reduce_1st_stage,
+ m_d_reduce_2nd_stage,
+ m_num_patches,
+ reduction_op,
+ init,
+ stream);
CUDA_ERROR(cudaMemcpyAsync(&h_output,
m_d_reduce_2nd_stage,
diff --git a/include/rxmesh/rxmesh.cpp b/include/rxmesh/rxmesh.cpp
index a9c28b07..16145ea1 100644
--- a/include/rxmesh/rxmesh.cpp
+++ b/include/rxmesh/rxmesh.cpp
@@ -11,7 +11,7 @@
#include "rxmesh/util/util.h"
namespace rxmesh {
-RXMesh::RXMesh(const std::vector<std::vector<uint32_t>>& fv, const bool quite)
+RXMesh::RXMesh()
: m_num_edges(0),
m_num_faces(0),
m_num_vertices(0),
@@ -28,15 +28,21 @@ RXMesh::RXMesh(const std::vector>& fv, const bool quite)
m_patch_size(512),
m_is_input_edge_manifold(true),
m_is_input_closed(true),
- m_quite(quite),
+ m_quite(false),
m_d_patches_info(nullptr),
m_h_patches_info(nullptr)
{
+}
+
+void RXMesh::init(const std::vector<std::vector<uint32_t>>& fv,
+ const bool quite)
+{
+ m_quite = quite;
+
// Build everything from scratch including patches
if (fv.empty()) {
RXMESH_ERROR(
- "RXMesh::RXMesh input fv is empty. Can not be build RXMesh "
- "properly");
+ "RXMesh::init input fv is empty. Can not be build RXMesh properly");
}
build(fv);
build_device();
@@ -108,6 +114,7 @@ RXMesh::~RXMesh()
}
GPU_FREE(m_d_patches_info);
free(m_h_patches_info);
+ m_rxmesh_context.release();
}
void RXMesh::build(const std::vector>& fv)
diff --git a/include/rxmesh/rxmesh.h b/include/rxmesh/rxmesh.h
index cbf10331..0cdd70ae 100644
--- a/include/rxmesh/rxmesh.h
+++ b/include/rxmesh/rxmesh.h
@@ -193,8 +193,10 @@ class RXMesh
RXMesh(const RXMesh&) = delete;
- RXMesh(const std::vector<std::vector<uint32_t>>& fv,
- const bool quite = false);
+ RXMesh();
+
+ void init(const std::vector<std::vector<uint32_t>>& fv,
+ const bool quite = false);
/**
* @brief build different supporting data structure used to build RXMesh
@@ -246,9 +248,15 @@ class RXMesh
uint32_t get_edge_id(const std::pair