forked from cp2k/cp2k
-
Notifications
You must be signed in to change notification settings - Fork 0
/
JURECA-gpu.psmp
459 lines (426 loc) · 16.1 KB
/
JURECA-gpu.psmp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
#!/bin/bash
#
# CP2K arch file for JURECA at FZ Juelich (2 AMD EPYC 7742 CPUs and 4 A100 GPUs per node)
#
# Tested with: GNU 11.3.0, ParaStationMPI,
# ScaLAPACK 2.2.1, OpenBLAS 0.3.25,
# FFTW 3.3.10, COSMA 2.6.6,
# ELPA 2023.05.001, HDF5 1.14.2,
# LIBINT 2.6.0, LIBPEXSI 1.2.0,
# LIBXC 6.2.2, LIBVORI 220621,
# LIBXSMM 1.17, PLUMED 2.9.0,
# SIRIUS 7.5.2, SPGLIB 1.16.2
#
# Usage: Source this arch file and then run make as instructed.
# A full toolchain installation is performed by default.
# Replace or adapt the "module add" commands below if needed.
#
# Last update: 15.01.2024
#
# Bash bootstrap section, executed when this file is sourced.
# For make, the line consisting of "#" and a backslash opens a comment that the
# trailing backslashes extend down to the closing "return", so make skips it all.
# For bash, the same backslashes join the commands into one logical line.
# Keep a trailing backslash on every line except the closing "return".
# The toolchain "cd" is guarded: without the guard, a failed "cd tools/toolchain"
# followed by "cd ../.." would move the calling shell two directories up.
# \
   if [ "${0}" = "${BASH_SOURCE}" ]; then \
      echo "ERROR: Script ${0##*/} must be sourced"; \
      echo "Usage: source ${0##*/}"; \
      exit 1; \
   fi; \
   this_file=${BASH_SOURCE##*/}; \
   module purge; \
   module add GCC/11.3.0; \
   module add ParaStationMPI/5.8.0-1-mt; \
   module add CUDA/11.7; \
   module list; \
   module save cp2k_jureca_gpu_psmp; \
   echo "To load the required modules in your batch job script, use:"; \
   echo " module restore cp2k_jureca_gpu_psmp"; \
   cd tools/toolchain || { echo "ERROR: tools/toolchain not found, source ${this_file} from the CP2K root directory"; return 1; }; \
   ./install_cp2k_toolchain.sh --enable-cuda=yes --gpu-ver=A100 --install-all -j32 --no-arch-files --with-gcc=system --with-mpich=system; \
   cd ../..; \
   printf "Sourcing %s ... " "${PWD}/tools/toolchain/install/setup"; \
   source "${PWD}/tools/toolchain/install/setup"; \
   printf "done\n"; \
   echo "Check the output above for error messages and consistency!"; \
   echo; \
   echo "If everything is OK, you can build a CP2K production binary with"; \
   echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.}"; \
   echo; \
   echo "Alternatively, you can add further checks, e.g. for regression testing, with"; \
   echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.} DO_CHECKS=yes"; \
   echo "or build CP2K as a library with"; \
   echo " make -j ARCH=${this_file%.*} VERSION=${this_file##*.} libcp2k"; \
   echo; \
   return
# Build options.
# Each package section below is guarded by a non-empty test on its USE_<package>
# variable; the version string is also part of the package's install directory.
DO_CHECKS      := no
SHARED         := no
TARGET_CPU     := native
USE_ACC        := yes

USE_COSMA      := 2.6.6
USE_ELPA       := 2023.05.001
USE_FFTW       := 3.3.10
USE_HDF5       := 1.14.2
USE_LIBINT     := 2.6.0
USE_LIBPEXSI   := 1.2.0
USE_LIBTORCH   := 1.12.1
USE_LIBVORI    := 220621
USE_LIBXC      := 6.2.2
USE_LIBXSMM    := 1.17
USE_OPENBLAS   := 0.3.25
USE_PLUMED     := 2.9.0
USE_QUIP       := 0.9.10
USE_SCALAPACK  := 2.2.1
USE_SIRIUS     := 7.5.2
USE_SPGLIB     := 1.16.2

# Versions of the SIRIUS dependencies (read only in the SIRIUS section)
LIBVDWXC_VER   := 0.4.0
SPFFT_VER      := 1.0.6
SPLA_VER       := 1.5.5

# Versions of the LIBPEXSI dependencies (read only in the LIBPEXSI section)
SCOTCH_VER     := 6.0.0
SUPERLU_VER    := 6.1.0

# LMAX is part of the LIBINT install directory name; MAX_CONTR becomes -D__MAX_CONTR
LMAX           := 5
MAX_CONTR      := 4
# GPU offload target, set only when acceleration is enabled
ifeq ($(USE_ACC), yes)
   OFFLOAD_CC     := nvcc
   OFFLOAD_TARGET := cuda
   GPUVER         := A100
endif
# Compiler, linker and archiver commands; linking is done through the Fortran wrapper
FC  := mpifort
LD  := mpifort
CC  := mpicc
CXX := mpicxx
AR  := ar -r
# Base compiler flags, preprocessor defines and the toolchain install prefix.
# TARGET_CPU is used for -mtune only; this file sets no -march flag.
# NOTE(review): the earlier comment stated that cc, CC, and ftn already include
# the proper -march flag, but the compilers configured in this file are
# mpicc/mpicxx/mpifort - confirm whether omitting -march is intended.
CFLAGS       := -O2 -fPIC -fopenmp -fopenmp-simd
CFLAGS       += -ftree-vectorize -funroll-loops -g
CFLAGS       += -mtune=$(TARGET_CPU)

DFLAGS       := -D__parallel -D__SCALAPACK -D__MAX_CONTR=$(strip $(MAX_CONTR))

INSTALL_PATH := $(PWD)/tools/toolchain/install
# Shared-library build: define the shared-link command and add run-time search
# paths for the CP2K library directory and its DBCSR subdirectory
ifeq ($(SHARED), yes)
   CP2K_LIB  := $(PWD)/lib/$(ARCH)/$(ONEVERSION)
   LD_SHARED := $(FC) -shared
   CFLAGS    += -fPIC
   LDFLAGS   := -Wl,--enable-new-dtags -Wl,-rpath=$(CP2K_LIB) -Wl,-rpath=$(CP2K_LIB)/exts/dbcsr
endif
# Regression-testing settings, active only with DO_CHECKS=yes
ifeq ($(DO_CHECKS), yes)
   DFLAGS        += -D__CHECK_DIAG
   # C sanitizer selection; -fsanitize=address is the documented alternative
   CFLAGS_DEBUG  := -fsanitize=leak
   # Fortran debug options; a narrower alternative to -fcheck=all,no-array-temps
   # is -fcheck=bounds,do,recursion,pointer
   FCFLAGS_DEBUG += -fcheck=all,no-array-temps \
                    -ffpe-trap=invalid,overflow,zero \
                    -fimplicit-none \
                    -finit-derived \
                    -finit-real=snan \
                    -finit-integer=-42 \
                    -finline-matmul-limit=0
   # Warnings promoted to errors (-Wrealloc-lhs stays a plain warning)
   WFLAGS        := -Werror=aliasing \
                    -Werror=ampersand \
                    -Werror=c-binding-type \
                    -Werror=conversion \
                    -Werror=intrinsic-shadow \
                    -Werror=intrinsics-std \
                    -Werror=line-truncation \
                    -Wrealloc-lhs \
                    -Werror=tabs \
                    -Werror=target-lifetime \
                    -Werror=underflow \
                    -Werror=unused-but-set-variable \
                    -Werror=unused-dummy-argument \
                    -Werror=unused-variable
endif
# Preprocessor defines for the accelerated build.
# -D__NO_OFFLOAD_PW is set because there is possibly no performance gain with
# PW_CUDA currently.
ifeq ($(USE_ACC), yes)
   DFLAGS += -D__DBCSR_ACC -D__OFFLOAD_CUDA -D__NO_OFFLOAD_PW
endif
# PLUMED: static archive by default, shared libraries with SHARED=yes.
# USE_GSL is set here, i.e. only when PLUMED is enabled.
ifneq ($(USE_PLUMED),)
   USE_PLUMED := $(strip $(USE_PLUMED))
   USE_GSL    := 2.7
   PLUMED_LIB := $(INSTALL_PATH)/plumed-$(USE_PLUMED)/lib
   DFLAGS     += -D__PLUMED2
   ifneq ($(SHARED), yes)
      LIBS += $(PLUMED_LIB)/libplumed.a
   else
      LIBS += -Wl,-rpath=$(PLUMED_LIB) -L$(PLUMED_LIB) -lplumed -lplumedKernel
   endif
endif
# ELPA: the install tree has one subdirectory per target (nvidia or cpu)
ifneq ($(USE_ELPA),)
   USE_ELPA := $(strip $(USE_ELPA))
   ifneq ($(USE_ACC), yes)
      TARGET := cpu
   else
      TARGET := nvidia
      DFLAGS += -D__ELPA_NVIDIA_GPU
   endif
   ELPA_LIB := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/lib
   ELPA_INC := $(INSTALL_PATH)/elpa-$(USE_ELPA)/$(TARGET)/include/elpa_openmp-$(USE_ELPA)
   DFLAGS   += -D__ELPA
   CFLAGS   += -I$(ELPA_INC)/elpa -I$(ELPA_INC)/modules
   ifneq ($(SHARED), yes)
      LIBS += $(ELPA_LIB)/libelpa_openmp.a
   else
      LIBS += -Wl,-rpath=$(ELPA_LIB) -L$(ELPA_LIB) -lelpa_openmp
   endif
endif
# QUIP: always linked from static archives, regardless of SHARED
ifneq ($(USE_QUIP),)
   USE_QUIP := $(strip $(USE_QUIP))
   QUIP_LIB := $(INSTALL_PATH)/quip-$(USE_QUIP)/lib
   QUIP_INC := $(INSTALL_PATH)/quip-$(USE_QUIP)/include
   DFLAGS   += -D__QUIP
   CFLAGS   += -I$(QUIP_INC)
   LIBS     += $(QUIP_LIB)/libquip_core.a \
               $(QUIP_LIB)/libatoms.a \
               $(QUIP_LIB)/libFoX_sax.a \
               $(QUIP_LIB)/libFoX_common.a \
               $(QUIP_LIB)/libFoX_utils.a \
               $(QUIP_LIB)/libFoX_fsys.a
endif
# LIBPEXSI with its SuperLU_DIST and SCOTCH dependencies; static archives only
ifneq ($(USE_LIBPEXSI),)
   USE_LIBPEXSI := $(strip $(USE_LIBPEXSI))
   SCOTCH_VER   := $(strip $(SCOTCH_VER))
   SUPERLU_VER  := $(strip $(SUPERLU_VER))

   LIBPEXSI_LIB := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/lib
   LIBPEXSI_INC := $(INSTALL_PATH)/pexsi-$(USE_LIBPEXSI)/include
   SCOTCH_LIB   := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/lib
   SCOTCH_INC   := $(INSTALL_PATH)/scotch-$(SCOTCH_VER)/include
   SUPERLU_LIB  := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/lib
   SUPERLU_INC  := $(INSTALL_PATH)/superlu_dist-$(SUPERLU_VER)/include

   DFLAGS       += -D__LIBPEXSI
   CFLAGS       += -I$(LIBPEXSI_INC) -I$(SCOTCH_INC) -I$(SUPERLU_INC)
   LIBS         += $(LIBPEXSI_LIB)/libpexsi.a \
                   $(SUPERLU_LIB)/libsuperlu_dist.a \
                   $(SCOTCH_LIB)/libptscotchparmetis.a \
                   $(SCOTCH_LIB)/libptscotch.a \
                   $(SCOTCH_LIB)/libptscotcherr.a \
                   $(SCOTCH_LIB)/libscotchmetis.a \
                   $(SCOTCH_LIB)/libscotch.a
endif
# LIBVORI: library only, no include path is added
ifneq ($(USE_LIBVORI),)
   USE_LIBVORI := $(strip $(USE_LIBVORI))
   LIBVORI_LIB := $(INSTALL_PATH)/libvori-$(USE_LIBVORI)/lib
   DFLAGS      += -D__LIBVORI
   ifneq ($(SHARED), yes)
      LIBS += $(LIBVORI_LIB)/libvori.a
   else
      LIBS += -Wl,-rpath=$(LIBVORI_LIB) -L$(LIBVORI_LIB) -lvori
   endif
endif
# LIBXC: Fortran 2003 interface library first, then the C core library
ifneq ($(USE_LIBXC),)
   USE_LIBXC := $(strip $(USE_LIBXC))
   LIBXC_LIB := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/lib
   LIBXC_INC := $(INSTALL_PATH)/libxc-$(USE_LIBXC)/include
   DFLAGS    += -D__LIBXC
   CFLAGS    += -I$(LIBXC_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(LIBXC_LIB)/libxcf03.a $(LIBXC_LIB)/libxc.a
   else
      LIBS += -Wl,-rpath=$(LIBXC_LIB) -L$(LIBXC_LIB) -lxcf03 -lxc
   endif
endif
# LIBINT: the install directory name carries both the version and LMAX
ifneq ($(USE_LIBINT),)
   USE_LIBINT := $(strip $(USE_LIBINT))
   LMAX       := $(strip $(LMAX))
   LIBINT_LIB := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/lib
   LIBINT_INC := $(INSTALL_PATH)/libint-v$(USE_LIBINT)-cp2k-lmax-$(LMAX)/include
   DFLAGS     += -D__LIBINT
   CFLAGS     += -I$(LIBINT_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(LIBINT_LIB)/libint2.a
   else
      LIBS += -Wl,-rpath=$(LIBINT_LIB) -L$(LIBINT_LIB) -lint2
   endif
endif
# SPGLIB: include path, define, and the symspg library
ifneq ($(USE_SPGLIB),)
   USE_SPGLIB := $(strip $(USE_SPGLIB))
   SPGLIB_LIB := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/lib
   SPGLIB_INC := $(INSTALL_PATH)/spglib-$(USE_SPGLIB)/include
   DFLAGS     += -D__SPGLIB
   CFLAGS     += -I$(SPGLIB_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(SPGLIB_LIB)/libsymspg.a
   else
      LIBS += -Wl,-rpath=$(SPGLIB_LIB) -L$(SPGLIB_LIB) -lsymspg
   endif
endif
# LIBXSMM: Fortran interface library (xsmmf) first, then the core library
ifneq ($(USE_LIBXSMM),)
   USE_LIBXSMM := $(strip $(USE_LIBXSMM))
   LIBXSMM_LIB := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/lib
   LIBXSMM_INC := $(INSTALL_PATH)/libxsmm-$(USE_LIBXSMM)/include
   DFLAGS      += -D__LIBXSMM
   CFLAGS      += -I$(LIBXSMM_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(LIBXSMM_LIB)/libxsmmf.a $(LIBXSMM_LIB)/libxsmm.a
   else
      LIBS += -Wl,-rpath=$(LIBXSMM_LIB) -L$(LIBXSMM_LIB) -lxsmmf -lxsmm
   endif
endif
# SIRIUS together with libvdwxc, SpFFT and SpLA.
# With USE_ACC=yes the SpFFT/SpLA/SIRIUS paths point into their "cuda"
# subdirectories and -D__OFFLOAD_GEMM is defined.
ifneq ($(USE_SIRIUS),)
   USE_SIRIUS   := $(strip $(USE_SIRIUS))
   LIBVDWXC_VER := $(strip $(LIBVDWXC_VER))
   SPFFT_VER    := $(strip $(SPFFT_VER))
   SPLA_VER     := $(strip $(SPLA_VER))

   LIBVDWXC_INC := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/include
   LIBVDWXC_LIB := $(INSTALL_PATH)/libvdwxc-$(LIBVDWXC_VER)/lib
   SPFFT_INC    := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/include
   SPLA_INC     := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/include/spla

   ifneq ($(USE_ACC), yes)
      SPFFT_LIB  := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib
      SPLA_LIB   := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib
      SIRIUS_INC := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include
      SIRIUS_LIB := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib
   else
      DFLAGS     += -D__OFFLOAD_GEMM
      SPFFT_LIB  := $(INSTALL_PATH)/SpFFT-$(SPFFT_VER)/lib/cuda
      SPLA_LIB   := $(INSTALL_PATH)/SpLA-$(SPLA_VER)/lib/cuda
      SIRIUS_INC := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/include/cuda
      SIRIUS_LIB := $(INSTALL_PATH)/sirius-$(USE_SIRIUS)/lib/cuda
   endif

   CFLAGS += -I$(LIBVDWXC_INC) -I$(SPFFT_INC) -I$(SPLA_INC) -I$(SIRIUS_INC)
   DFLAGS += -D__LIBVDWXC -D__SPFFT -D__SPLA -D__SIRIUS

   ifneq ($(SHARED), yes)
      LIBS += $(SIRIUS_LIB)/libsirius.a \
              $(SPLA_LIB)/libspla.a \
              $(SPFFT_LIB)/libspfft.a \
              $(LIBVDWXC_LIB)/libvdwxc.a
   else
      LIBS += -Wl,-rpath=$(SIRIUS_LIB) -L$(SIRIUS_LIB) -lsirius
      LIBS += -Wl,-rpath=$(SPLA_LIB) -L$(SPLA_LIB) -lspla
      LIBS += -Wl,-rpath=$(SPFFT_LIB) -L$(SPFFT_LIB) -lspfft
      LIBS += -Wl,-rpath=$(LIBVDWXC_LIB) -L$(LIBVDWXC_LIB) -lvdwxc
   endif
endif
# HDF5: Fortran, high-level and core libraries, in that link order
ifneq ($(USE_HDF5),)
   USE_HDF5 := $(strip $(USE_HDF5))
   HDF5_LIB := $(INSTALL_PATH)/hdf5-$(USE_HDF5)/lib
   HDF5_INC := $(INSTALL_PATH)/hdf5-$(USE_HDF5)/include
   DFLAGS   += -D__HDF5
   CFLAGS   += -I$(HDF5_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(HDF5_LIB)/libhdf5_fortran.a \
              $(HDF5_LIB)/libhdf5_hl.a \
              $(HDF5_LIB)/libhdf5.a
   else
      LIBS += -Wl,-rpath,$(HDF5_LIB) -L$(HDF5_LIB) -lhdf5_fortran -lhdf5_hl -lhdf5
   endif
endif
# COSMA: with USE_ACC=yes the install directory has a "-cuda" suffix and the
# Tiled-MM library is linked in addition
ifneq ($(USE_COSMA),)
   USE_COSMA := $(strip $(USE_COSMA))
   ifeq ($(USE_ACC), yes)
      USE_COSMA := $(USE_COSMA)-cuda
   endif
   COSMA_LIB := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/lib
   COSMA_INC := $(INSTALL_PATH)/COSMA-$(USE_COSMA)/include
   DFLAGS    += -D__COSMA
   CFLAGS    += -I$(COSMA_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(COSMA_LIB)/libcosma_prefixed_pxgemm.a \
              $(COSMA_LIB)/libcosma.a \
              $(COSMA_LIB)/libcosta.a
      ifeq ($(USE_ACC), yes)
         LIBS += $(COSMA_LIB)/libTiled-MM.a
      endif
   else
      LIBS += -Wl,-rpath=$(COSMA_LIB) -L$(COSMA_LIB) -lcosma_prefixed_pxgemm -lcosma -lcosta
      ifeq ($(USE_ACC), yes)
         LIBS += -lTiled-MM
      endif
   endif
endif
# FFTW3: MPI and OpenMP libraries are linked ahead of the core library
ifneq ($(USE_FFTW),)
   USE_FFTW := $(strip $(USE_FFTW))
   FFTW_LIB := $(INSTALL_PATH)/fftw-$(USE_FFTW)/lib
   FFTW_INC := $(INSTALL_PATH)/fftw-$(USE_FFTW)/include
   DFLAGS   += -D__FFTW3
   CFLAGS   += -I$(FFTW_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(FFTW_LIB)/libfftw3_mpi.a \
              $(FFTW_LIB)/libfftw3_omp.a \
              $(FFTW_LIB)/libfftw3.a
   else
      LIBS += -Wl,-rpath=$(FFTW_LIB) -L$(FFTW_LIB) -lfftw3_mpi -lfftw3_omp -lfftw3
   endif
endif
# ScaLAPACK: library only (the -D__SCALAPACK define is part of the base DFLAGS).
# The version is stripped first, as in every other package section, so that
# stray whitespace in USE_SCALAPACK cannot end up inside the library path.
ifneq ($(USE_SCALAPACK),)
   USE_SCALAPACK := $(strip $(USE_SCALAPACK))
   SCALAPACK_LIB := $(INSTALL_PATH)/scalapack-$(USE_SCALAPACK)/lib
   ifeq ($(SHARED), yes)
      LIBS += -Wl,-rpath=$(SCALAPACK_LIB) -L$(SCALAPACK_LIB) -lscalapack
   else
      LIBS += $(SCALAPACK_LIB)/libscalapack.a
   endif
endif
# GSL: active only if USE_GSL is non-empty at this point in the file
ifneq ($(USE_GSL),)
   USE_GSL := $(strip $(USE_GSL))
   GSL_LIB := $(INSTALL_PATH)/gsl-$(USE_GSL)/lib
   GSL_INC := $(INSTALL_PATH)/gsl-$(USE_GSL)/include
   DFLAGS  += -D__GSL
   CFLAGS  += -I$(GSL_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(GSL_LIB)/libgsl.a
   else
      LIBS += -Wl,-rpath=$(GSL_LIB) -L$(GSL_LIB) -lgsl
   endif
endif
# OpenBLAS: include path and library; no preprocessor define is added
ifneq ($(USE_OPENBLAS),)
   USE_OPENBLAS := $(strip $(USE_OPENBLAS))
   OPENBLAS_LIB := $(INSTALL_PATH)/openblas-$(USE_OPENBLAS)/lib
   OPENBLAS_INC := $(INSTALL_PATH)/openblas-$(USE_OPENBLAS)/include
   CFLAGS       += -I$(OPENBLAS_INC)
   ifneq ($(SHARED), yes)
      LIBS += $(OPENBLAS_LIB)/libopenblas.a
   else
      LIBS += -Wl,-rpath=$(OPENBLAS_LIB) -L$(OPENBLAS_LIB) -lopenblas
   endif
endif
# LibTorch: enabled only when the last dot-separated field of the first line of
# "ldd --version" is at least 27. It is always linked dynamically.
ifeq ($(shell [ $(shell ldd --version | head -n 1 | tr -s '.' '\n' | tail -n 1) -ge 27 ] && echo yes), yes)
   ifneq ($(USE_LIBTORCH),)
      USE_LIBTORCH := $(strip $(USE_LIBTORCH))
      LIBTORCH_LIB := $(INSTALL_PATH)/libtorch-$(USE_LIBTORCH)/lib
      LIBTORCH_INC := $(INSTALL_PATH)/libtorch-$(USE_LIBTORCH)/include
      DFLAGS       += -D__LIBTORCH
      CFLAGS       += -I$(LIBTORCH_INC)
      # The same CPU library set is linked with and without USE_ACC.
      # Disabled CUDA variant for USE_ACC=yes:
      #    LIBS += -Wl,-rpath=$(LIBTORCH_LIB) -L$(LIBTORCH_LIB) -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch
      LIBS         += -Wl,-rpath=$(LIBTORCH_LIB) -L$(LIBTORCH_LIB) -lc10 -ltorch_cpu -ltorch
   endif
endif
# Assemble the C and Fortran flag sets.
# CFLAGS_DEBUG, FCFLAGS_DEBUG and WFLAGS are only defined by the DO_CHECKS=yes
# section and expand to nothing otherwise. They must be appended here,
# otherwise DO_CHECKS=yes has no effect beyond the -D__CHECK_DIAG define.
CFLAGS  += $(DFLAGS) $(CFLAGS_DEBUG)

# Fortran flags start from the C flags, then add the Fortran-only checks and warnings
FCFLAGS := $(CFLAGS) $(FCFLAGS_DEBUG) $(WFLAGS)
# -fallow-argument-mismatch is added only when the gcc major version is above 9
ifeq ($(shell [ $(shell gcc -dumpversion | cut -d. -f1) -gt 9 ] && echo yes), yes)
   FCFLAGS += -fallow-argument-mismatch
endif
FCFLAGS += -fbacktrace
FCFLAGS += -ffree-form
FCFLAGS += -ffree-line-length-none
FCFLAGS += -fno-omit-frame-pointer
FCFLAGS += -std=f2008
# Linker flags: always the Fortran flags; with CUDA_HOME set, also the CUDA
# include directory (C and Fortran) and the CUDA lib64 directory with an rpath
ifeq ($(CUDA_HOME),)
   LDFLAGS  += $(FCFLAGS)
else
   CUDA_LIB := $(CUDA_HOME)/lib64
   CFLAGS   += -I$(CUDA_HOME)/include
   FCFLAGS  += -I$(CUDA_HOME)/include
   LDFLAGS  += $(FCFLAGS) -L$(CUDA_LIB) -Wl,-rpath=$(CUDA_LIB)
endif
# Select the language standards.
# CXXFLAGS is derived from CFLAGS before -std=c11 is appended, so the C-only
# standard flag is not passed to the C++ compiler. CFLAGS is appended to
# directly: re-adding $(CFLAGS) to itself would duplicate every C flag.
CXXFLAGS := $(CFLAGS) -std=c++14
CFLAGS   += -std=c11
# CUDA libraries and offload compiler flags for the accelerated build,
# followed by the system libraries that are always linked last
ifeq ($(USE_ACC), yes)
   LIBS          += -lcusolver -lcudart -lnvrtc -lcuda -lcufft -lcublas -lrt
   OFFLOAD_FLAGS := $(DFLAGS) -O3 -Xcompiler='-fopenmp' -allow-unsupported-compiler -arch sm_80 -g --std=c++11
endif

LIBS += -lz -ldl -lpthread -lstdc++
# End