diff --git a/.gitignore b/.gitignore index 39591560..cb485a1a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,8 @@ +*.o +bin +lib +build logs r[0-9][0-9][0-9]* +vtune* + diff --git a/Makefile b/Makefile index e832455d..63a7eaf2 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -30,7 +30,7 @@ # The 'stencil' and 'arch' vars are most important and should always be specified. # # stencil: sets stencil problem to be solved. -# For a list of current stencils, see src/kernel/Makefile or run the following: +# For a list of current stencils, run the following: # % make compiler # % bin/yask_compiler.exe -h # You can also create your own stencil; see the documentation. @@ -44,26 +44,14 @@ # # real_bytes: FP precision: 4=float, 8=double. # -# fold: How to fold vectors (x*y*z). -# fold_4byte: How to fold vectors when real_bytes=4. -# fold_8byte: How to fold vectors when real_bytes=8. -# -# cluster: How many folded vectors to evaluate simultaneously. +# fold: In which dimension(s) to vectorize. +# cluster: How many vectors to evaluate simultaneously. # # pfd_l1: L1 prefetch distance (0 => disabled). # pfd_l2: L2 prefetch distance (0 => disabled). -# -# omp_region_schedule: OMP schedule policy for region loop. -# omp_block_schedule: OMP schedule policy for nested OpenMP block loop. -# omp_misc_schedule: OMP schedule policy for OpenMP misc loop. -# -# def_block_threads: Number of threads to use in nested OpenMP block loop by default. -# def_thread_divisor: Divide number of OpenMP threads by this factor by default. -# def_*_args: Default cmd-line args for specific settings. 
-# more_def_args: Additional default cmd-line args. -# -# allow_new_grid_types: Whether to allow grid types not defined in the stencil -# to be created via new_grid() and new_fixed_size_grid(). + +# Common defaults. +offload ?= 0 # Common settings. YASK_BASE := $(abspath .) @@ -80,17 +68,14 @@ include $(YASK_BASE)/src/common/common.mk YK_MAKE := $(MAKE) $(YASK_MFLAGS) -C src/kernel YASK_OUTPUT_DIR=$(YASK_OUT_BASE) YC_MAKE := $(MAKE) $(YASK_MFLAGS) -C src/compiler YASK_OUTPUT_DIR=$(YASK_OUT_BASE) -# Misc dirs & files. -TUPLE_TEST_EXEC := $(BIN_OUT_DIR)/yask_tuple_test.exe -COMBO_TEST_EXEC := $(BIN_OUT_DIR)/yask_combo_test.exe - -# Compiler and default flags--used only for targets in this Makefile. -# For compiler, use YC_CXX*. -# For kernel, use YK_CXX*. -CXX := g++ -CXXFLAGS := -g -std=c++11 -Wall -O2 -CXXFLAGS += $(addprefix -I,$(INC_DIR) $(COMM_DIR)) -CXXFLAGS += -fopenmp +# Default flags--used only for targets in this Makefile. +# For compiler, use YC_CXX* vars. +# For kernel, use YK_CXX* vars. +YASK_CXX := $(CXX) +YASK_CXXOPT := -O2 +YASK_CXXFLAGS := -g -std=c++17 -Wall $(YASK_CXXOPT) +YASK_CXXFLAGS += $(addprefix -I,$(INC_DIR) $(COMM_DIR)) +YASK_CXXFLAGS += -fopenmp ######## Primary targets & rules # NB: must set stencil and arch to generate the desired kernel. @@ -136,6 +121,51 @@ docs/api/html/index.html: include/*.hpp include/*/*.hpp docs/api/*.* find docs/api/html -type f | xargs -r rm cd docs/api; doxygen doxygen_config.txt +######## Misc targets + +code-stats: + $(YK_MAKE) $@ + +docs: api-docs + +tags: + rm -f TAGS ; find src include -name '*.[ch]pp' | xargs etags -C -a + +# Remove intermediate files. +# Should not trigger remake of stencil compiler, so does not invoke clean in compiler dir. +# Make this target before rebuilding YASK with any new parameters. +clean: + $(YK_MAKE) $@ + +# Remove executables, generated documentation, etc. (not logs). +# Use 'find *' instead of 'find .' to avoid searching in '.git'. 
+realclean: clean + rm -rf $(LIB_OUT_DIR) $(BIN_OUT_DIR) $(BUILD_OUT_DIR) + rm -fv TAGS '*~' + - find src include utils -name '*~' -print -delete + - find src -name '*.optrpt' -print -delete + - find src -name __pycache__ -print -delete + $(YC_MAKE) $@ + $(YK_MAKE) $@ + - find $(PY_OUT_DIR) -mindepth 1 '!' -name __init__.py -print -delete + - rmdir -v --ignore-fail-on-non-empty $(PY_OUT_DIR) + - rmdir -v --ignore-fail-on-non-empty $(YASK_OUT_BASE) + +help: + @ $(YC_MAKE) $@ + @ $(YK_MAKE) $@ + @ echo " " + @ echo "'setenv CXX_PREFIX ccache' or 'export CXX_PREFIX=ccache' to use ccache." + +################################# +########### Tests ############### +################################# +# TODO: convert all testing to a separate test framework. + +# Test dirs & files. +TUPLE_TEST_EXEC := $(BIN_OUT_DIR)/yask_tuple_test.exe +COMBO_TEST_EXEC := $(BIN_OUT_DIR)/yask_combo_test.exe + #### API tests. # The tests listed here are designed to test various combinations of the @@ -188,17 +218,25 @@ py-yc-api-and-cxx-yk-api-test: # When the built-in stencil examples aren't being used, # "stencil=api_test" in the commands below is simply used to # create file names. 
-combo-api-tests: +yc-combo-api-tests: $(MAKE) clean; $(MAKE) stencil=iso3dfd yc-and-cxx-yk-api-test $(MAKE) clean; $(MAKE) stencil=iso3dfd yc-and-py-yk-api-test + +cxx-yc-combo-api-tests: $(MAKE) clean; $(MAKE) stencil=api_test cxx-yc-api-and-yk-test - $(MAKE) clean; $(MAKE) stencil=api_test py-yc-api-and-yk-test $(MAKE) clean; $(MAKE) stencil=api_test cxx-yc-api-and-cxx-yk-api-test - $(MAKE) clean; $(MAKE) stencil=api_test py-yc-api-and-py-yk-api-test $(MAKE) clean; $(MAKE) stencil=api_test cxx-yc-api-and-py-yk-api-test + +py-yc-combo-api-tests: + $(MAKE) clean; $(MAKE) stencil=api_test py-yc-api-and-yk-test + $(MAKE) clean; $(MAKE) stencil=api_test py-yc-api-and-py-yk-api-test $(MAKE) clean; $(MAKE) stencil=api_test py-yc-api-and-cxx-yk-api-test -######## Misc targets +combo-api-tests: + $(MAKE) yc-combo-api-tests + if (( $(offload) == 0 )); then \ + $(MAKE) cxx-yc-combo-api-tests && \ + $(MAKE) py-yc-combo-api-tests; fi # NB: set arch var if applicable. # NB: save some test time by using YK_CXXOPT=-O2. @@ -206,16 +244,13 @@ combo-api-tests: yc-and-yk-test: $(YK_MAKE) $@ -code-stats: - $(YK_MAKE) $@ - $(TUPLE_TEST_EXEC): $(COMM_DIR)/tests/tuple_test.cpp $(COMM_DIR)/*.*pp $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(CXX) $(CXXFLAGS) $(LFLAGS) -o $@ $< $(COMM_DIR)/tuple.cpp $(COMM_DIR)/common_utils.cpp + $(CXX_PREFIX) $(YASK_CXX) $(YASK_CXXFLAGS) $(LFLAGS) -o $@ $< $(COMM_DIR)/tuple.cpp $(COMM_DIR)/common_utils.cpp $(COMBO_TEST_EXEC): $(COMM_DIR)/tests/combo_test.cpp $(COMM_DIR)/*.*pp $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(CXX) $(CXXFLAGS) $(LFLAGS) -o $@ $< $(COMM_DIR)/combo.cpp + $(CXX_PREFIX) $(YASK_CXX) $(YASK_CXXFLAGS) $(LFLAGS) -o $@ $< $(COMM_DIR)/combo.cpp tuple-test: $(TUPLE_TEST_EXEC) @echo '*** Running the C++ YASK tuple test...' 
@@ -229,48 +264,19 @@ api-tests: compiler-api $(MAKE) combo-api-tests $(YK_MAKE) $@ -all-tests: compiler-api +unit-tests: $(MAKE) tuple-test $(MAKE) combo-test + +all-tests: compiler-api unit-tests $(YK_MAKE) $@ $(MAKE) combo-api-tests + $(MAKE) clean + @echo "All YASK tests have been run" all: $(MAKE) realclean $(MAKE) tags $(MAKE) default - $(MAKE) all-tests - $(MAKE) clean - $(MAKE) default $(MAKE) api-all - -docs: api-docs - -tags: - rm -f TAGS ; find src include -name '*.[ch]pp' | xargs etags -C -a - -# Remove intermediate files. -# Should not trigger remake of stencil compiler, so does not invoke clean in compiler dir. -# Make this target before rebuilding YASK with any new parameters. -clean: - $(YK_MAKE) $@ - -# Remove executables, generated documentation, etc. (not logs). -# Use 'find *' instead of 'find .' to avoid searching in '.git'. -realclean: clean - rm -rf $(LIB_OUT_DIR) $(BIN_OUT_DIR) $(BUILD_OUT_DIR) - rm -fv TAGS '*~' - - find * -name '*~' -print -delete - - find * -name '*.optrpt' -print -delete - - find * -name __pycache__ -print -delete - $(YC_MAKE) $@ - $(YK_MAKE) $@ - - find $(PY_OUT_DIR) -mindepth 1 '!' -name __init__.py -print -delete - - rmdir -v --ignore-fail-on-non-empty $(PY_OUT_DIR) - - rmdir -v --ignore-fail-on-non-empty $(YASK_OUT_BASE) - -help: - @ $(YC_MAKE) $@ - @ $(YK_MAKE) $@ - @ echo " " - @ echo "'setenv CXX_PREFIX ccache' or 'export CXX_PREFIX=ccache' to use ccache." + $(MAKE) all-tests diff --git a/README.md b/README.md index ffb7e1b1..e2d3ef14 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,48 @@ # YASK--Yet Another Stencil Kit -* New YASK users may want to start with the [YASK tutorial](http://intel.github.io/yask/YASK-tutorial.pdf). -* Users with existing YASK-based code may want to jump to the [backward-compatibility notices](#backward-compatibility-notices). -* All YASK users will also be interested in the [API documentation](http://intel.github.io/yask/api/html/index.html). 
-* **Coming soon: GPU support via OpenMP device offload!** +* New YASK users may want to start with +the [YASK tutorial](http://intel.github.io/yask/YASK-tutorial.pdf). +* Users with existing YASK-based code may want to jump to +the [backward-compatibility notices](#backward-compatibility-notices). +* All YASK users will also be interested in +the [API documentation](http://intel.github.io/yask/api/html/index.html). ## Overview -YASK is a framework to rapidly create high-performance stencil code including optimizations and features such as +YASK is a framework to rapidly create high-performance stencil code +including optimizations and features such as * Support for boundary layers and staggered-grid stencils. * Vector-folding to increase data reuse via non-traditional data layout. -* Multi-level OpenMP parallelism to exploit multiple cores and threads. -* Scaling to multiple sockets and nodes via MPI with overlapped communication and compute. +* Multi-level OpenMP parallelism to exploit multiple CPU cores and threads. +* OpenMP offloading to GPUs. +* MPI scaling to multiple sockets and nodes with overlapped communication and compute. * Spatial tiling with automatically-tuned block sizes. * Temporal tiling in multiple dimensions to further increase cache locality. * APIs for C++ and Python. -YASK contains a domain-specific compiler to convert stencil-equation specifications to SIMD-optimized code for Intel(R) Xeon Phi(TM) and Intel(R) Xeon(R) processors. +YASK contains a domain-specific compiler to convert stencil-equation specifications to +optimized code for Intel(R) Xeon(R) processors, Intel(R) Xeon Phi(TM) processors, +and Intel(R) graphics processors. ### Supported Platforms and Processors: * 64-bit Linux. * Intel(R) Xeon(R) processors supporting the AVX, AVX2, or CORE_AVX512 instruction sets. -* Intel(R) Xeon Phi(TM) x200-family processors supporting the MIC_AVX512 instruction set. 
-* Intel(R) Xeon Phi(TM) x100-family coprocessors supporting the Knights-Corner instruction set (no longer tested). +* Intel(R) Xeon Phi(TM) x200-family processors supporting the MIC_AVX512 instruction set (KNL). +* Intel(R) graphics processors supporting UHD graphics, e.g., "Gen12" GPUs. ### Pre-requisites: -* Intel(R) Parallel Studio XE Cluster Edition for Linux - for multi-socket and multi-node operation or - Intel(R) Parallel Studio XE Composer Edition for C++ Linux - for single-socket only - (2020.1.217, a.k.a. 19.1.1.217, or later recommended). - * There was an issue in Intel(R) MPI versions 2019u1 and 2019u2 that - caused the application to crash when allocating very - large shared-memory (shm) regions, so you may have to - use the `-no-use_shm` option with these versions. - This issue was resolved in MPI version 2019u3. - * There was an issue in the Intel(R) C++ compiler 2019.1.0 that caused - an internal error when building YASK kernels. - This has been fixed in 19.1.1.x and later versions. - * If you are using the Intel(R) C++ compiler with g++ version 8.x or later, - Intel(R) C++ version 2019 or later is required. - * Building a YASK kernel with clang or the "nextgen" Intel(R) C++ - compiler is possible; however, - SIMD operations for functions such as sin() are not supported in the nextgen - compiler at this time. Also, the Python interface may not work with - the nextgen compiler. - * Building a YASK kernel with the Gnu C++ compiler is possible. - Limited testing with g++ 8.2.0 shows the "iso3dfd" kernel - runs about 30% slower compared to the same kernel built with - the Intel C++ compiler. - Older Gnu C++ compilers can produce kernels that run - many times slower. -* Gnu C++ compiler, g++ (4.9.0 or later; 9.1.0 or later recommended). - Even when using Intel compilers, they rely on functionality provided by a g++ installation. 
+* Intel(R) [oneAPI](https://software.intel.com/content/www/us/en/develop/tools/oneapi.html) + HPC Toolkit for Linux (toolkit 2022.3 or later recommended; this will install + the Intel(R) oneAPI DPC++/C++ Compiler 2022.2.0). + See notes below under version 4.00.00 changes. +* Gnu C++ compiler, g++ (8.2.0 or later recommended). + Even when using Intel compilers, a g++ installation is required. * Linux libraries `librt` and `libnuma`. +* Grep. * Perl (5.010 or later). * Awk. * Gnu make. * Bash shell. -* Numactl utility. +* Numactl utility if running on more than one CPU socket. * Optional utilities and their purposes: * The `indent` or `gindent` utility, used automatically during the build process to make the generated code easier for humans to read. @@ -66,19 +51,77 @@ YASK contains a domain-specific compiler to convert stencil-equation specificati Reading the generated code is only necessary for debug or curiosity. * SWIG (3.0.12 or later; 4.0.0 or later recommended), http://www.swig.org, for creating the Python interface. - * Python 2 (2.7.5 or later) or 3 (3.6.1 or later), + * Python 3 (3.6.1 or later, recommended): https://www.python.org/downloads, for creating and using the Python interface. - * Doxygen (1.8.11 or later), + * Doxygen (1.8.11 or later): http://doxygen.org, for creating updated API documentation. If you're not changing the API documentation, you can view the existing documentation at the link at the top of this page. - * Graphviz (2.30.1 or later), + * Graphviz (2.30.1 or later): http://www.graphviz.org, for rendering stencil diagrams. - * Intel(R) Software Development Emulator, + * Intel(R) Software Development Emulator: https://software.intel.com/en-us/articles/intel-software-development-emulator, - for functional testing if you don't have native support for any given instruction set. + for functional testing if you don't have native support for the targeted instruction set. 
## Backward-compatibility notices +### Version 4 +* Version 4.00.00 was a major release with a number of notices: + - Support has been added for GPU offloading via the OpenMP device model. + Build any YASK stencil kernel with `make offload=1 ...`. This will create + a kernel library and executable with an "arch" field containing + "offload" and the OpenMP device target name. + Use `make offload=1 offload_arch=TARGET` to change the OpenMP target; + the default is `spir64`, for GPUs with Intel(R) Architecture (e.g., Gen12). + Use `make offload_usm=1` to use the OpenMP Unified Shared Memory model. + - The default compiler is now the Intel(R) oneAPI C++ compiler, icpx. + If you want to use a different compiler, use `make YK_CXX=COMPILER ...` + for the kernel, and/or `make YC_CXX=COMPILER ...` for the YASK compiler, + or `make CXX=COMPILER` for both. A C++ compiler that supports C++17 + is now required. + - The loop hierarchy has been extended and renamed with (hopefully) + more memorable names: + version 3's regions, blocks, mini-blocks, and sub-blocks + are now mega-blocks, blocks, micro-blocks, and nano-blocks, + respectively. + Pico-blocks have been added inside nano-blocks. + When offloading, the nano-blocks and pico-blocks are executed on the device. + The looping behaviors, including any temporal tiling, of mega-blocks, + blocks, and micro-blocks are handled by the CPU. + The `get_region_size()` and `set_region_size()` APIs have been removed. + The `-r` and `-sb` options, e.g., `-rx` and `-sbx`, have also been removed. + - Regarding CPU threads, "region threads" are now referred to as "outer threads", + and "block threads" are now referred to as "inner threads". + The option `-block_threads` is deprecated. + The option `-thread_divisor` has been removed. + See the `-help` documentation for new options `-outer_threads` and `-inner_threads`. + The `-max_threads` option remains. + - Only one thread per core is now used by default on most CPU models. 
+ This is done in `yask.sh` by passing `-outer_threads N` to the executable, + where `N` is the number of cores on the node divided by the + number of MPI ranks. + Consequently, the default number of inner threads is now one (1) + to use one core per block. + This change was made based on observed + performance on newer Intel(R) Xeon(R) Processors. Previous versions + used two threads per block by default and used both hyper-threads if + they were enabled. To configure two hyper-threads to work cooperatively + on each block, use the option `-inner_threads 2`. + These changes do not + apply to Intel(R) Xeon Phi(TM) x200-family processors (KNL), which + continue to use all 4 hyper-threads per core and 8 inner threads + by default (because 2 cores share an L2 cache). + - Intel(R) Xeon Phi(TM) x100-family processors (KNC) are no longer supported. + (Intel(R) Xeon Phi(TM) x200-family processors (KNL) are still supported.) + - Python v2 is no longer supported. + - New vector APIs were added to `yk_solution` and `yk_var` to allow getting + or setting multiple dimensions in one API call. + - `new_relative_var_point()` API is deprecated. + - APIs that were previously deprecated in the `yk_var` class have been removed. + - Explicit support for persistent-memory devices has been removed. + (Persistent-memory accessible via separate NUMA nodes or other standard + Linux mechanisms is supported as with any other special memory types, + e.g., high-bandwidth memory.) + ### Version 3 * Version 3.05.00 changed the default setting of `-use_shm` to `true`. Use `-no-use_shm` to disable shared-memory inter-rank communication. @@ -90,7 +133,7 @@ YASK contains a domain-specific compiler to convert stencil-equation specificati `yk_solution::apply_command_line_options()`. APIs to set the corresponding options are now in `yk_env`. This allows configuring the debug output before a `yk_solution` is created. 
-* Version 3.00.00 was a major release with a number of backward-compatibility notices: +* Version 3.00.00 was a major release with a number of notices: - The old (v1 and v2) internal DSL that used undocumented types such as `SolutionBase` and `GridValue` and undocumented macros such as `MAKE_GRID` was replaced with an expanded version of the documented YASK @@ -124,30 +167,54 @@ YASK contains a domain-specific compiler to convert stencil-equation specificati sizes are specified. This did not affect the default folding sizes. * Version 2.21.02 simplified the example 3-D stencils (`3axis`, `3plane`, etc.) to calculate simple averages like those in the MiniGhost benchmark. -This reduced the number of floating-point operations but not the number of points read for each stencil. -* Version 2.20.00 added checking of the step-dimension index value in the `yk_grid::get_element()` and similar APIs. +This reduced the number of floating-point operations but not the number of points read +for each stencil. +* Version 2.20.00 added checking of the step-dimension index value in the +`yk_grid::get_element()` and similar APIs. Previously, invalid values silently "wrapped" around to valid values. -Now, by default, the step index must be valid when reading, and the valid step indices are updated when writing. +Now, by default, the step index must be valid when reading, and the valid step +indices are updated when writing. The old behavior of silent index wrapping may be restored via `set_step_wrap(true)`. -The default for all `strict_indices` API parameters is now `true` to catch more programming errors and +The default for all `strict_indices` API parameters is now `true` to catch more +programming errors and increase consistency of behavior between "set" and "get" APIs. Also, the advanced `share_storage()` APIs have been replaced with `fuse_grids()`. * Version 2.19.01 turned off multi-pass tuning by default. Enable with `-auto_tune_each_pass`. 
-* Version 2.18.03 allowed the default radius to be stencil-specific and changed the names of example stencil "9axis" to "3axis_with_diags". -* Version 2.18.00 added the ability to specify the global-domain size, and it will calculate the local-domain sizes from it. +* Version 2.18.03 allowed the default radius to be stencil-specific and changed the names +of example stencil "9axis" to "3axis_with_diags". +* Version 2.18.00 added the ability to specify the global-domain size, and it will calculate +the local-domain sizes from it. There is no longer a default local-domain size. Output changed terms "overall-problem" to "global-domain" and "rank-domain" to "local-domain". -* Version 2.17.00 determined the host architecture in `make` and `bin/yask.sh` and number of MPI ranks in `bin/yask.sh`. -This changed the old behavior of `make` defaulting to `snb` architecture and `bin/yask.sh` requiring `-arch` and `-ranks`. +* Version 2.17.00 determined the host architecture in `make` and `bin/yask.sh` and +number of MPI ranks in `bin/yask.sh`. +This changed the old behavior of `make` defaulting to `snb` architecture and +`bin/yask.sh` requiring `-arch` and `-ranks`. Those options are still available to override the host-based default. -* Version 2.16.03 moved the position of the log-file name to the last column in the CSV output of `utils/bin/yask_log_to_csv.pl`. +* Version 2.16.03 moved the position of the log-file name to the last column in the CSV +output of `utils/bin/yask_log_to_csv.pl`. * Version 2.15.04 required a call to `yc_grid::set_dynamic_step_alloc(true)` to allow changing the allocation in the step (time) dimension at run-time for grid variables created at YASK compile-time. * Version 2.15.02 required all "misc" indices to be yask-compiler-time constants. -* Version 2.14.05 changed the meaning of temporal sizes so that 0 means never do temporal blocking and 1 allows blocking within a single time-step for multi-pack solutions. 
The default setting is 0, which keeps the old behavior. -* Version 2.13.06 changed the default behavior of the performance-test utility (`yask.sh`) to run trials for a given amount of time instead of a given number of steps. As of version 2.13.08, use the `-trial_time` option to specify the number of seconds to run. To force a specific number of trials as in previous versions, use the `-trial_steps` option. -* Version 2.13.02 required some changes in perf statistics due to step (temporal) conditions. Both text output and `yk_stats` APIs affected. -* Version 2.12.00 removed the long-deprecated `==` operator for asserting equality between a grid point and an equation. Use `EQUALS` instead. -* Version 2.11.01 changed the plain-text format of some of the performance data in the test-utility output. Specifically, some leading spaces were added, SI multipliers for values < 1 were added, and the phrase "time in" no longer appears before each time breakdown. This may affect some user programs that parse the output to collect stats. -* Version 2.10.00 changed the location of temporary files created during the build process. This will not affect most users, although you may need to manually remove old `src/compiler/gen` and `src/kernel/gen` directories. -* Version 2.09.00 changed the location of stencils in the internal DSL from `.hpp` to `.cpp` files. See the notes in https://github.com/intel/yask/releases/tag/v2.09.00 if you have any new or modified code in `src/stencils`. +* Version 2.14.05 changed the meaning of temporal sizes so that 0 means never do temporal +blocking and 1 allows blocking within a single time-step for multi-pack solutions. +The default setting is 0, which keeps the old behavior. +* Version 2.13.06 changed the default behavior of the performance-test utility (`yask.sh`) +to run trials for a given amount of time instead of a given number of steps. +As of version 2.13.08, use the `-trial_time` option to specify the number of seconds to run. 
+To force a specific number of trials as in previous versions, use the `-trial_steps` option. +* Version 2.13.02 required some changes in perf statistics due to step (temporal) conditions. +Both text output and `yk_stats` APIs affected. +* Version 2.12.00 removed the long-deprecated `==` operator for asserting equality between +a grid point and an equation. Use `EQUALS` instead. +* Version 2.11.01 changed the plain-text format of some of the performance data in the +test-utility output. +Specifically, some leading spaces were added, SI multipliers for values < 1 were added, +and the phrase "time in" no longer appears before each time breakdown. +This may affect some user programs that parse the output to collect stats. +* Version 2.10.00 changed the location of temporary files created during the build process. +This will not affect most users, although you may need to manually remove old `src/compiler/gen` +and `src/kernel/gen` directories. +* Version 2.09.00 changed the location of stencils in the internal DSL from `.hpp` to `.cpp` files. +See the notes in https://github.com/intel/yask/releases/tag/v2.09.00 if you have any new +or modified code in `src/stencils`. diff --git a/docs/YASK-tutorial.pdf b/docs/YASK-tutorial.pdf index 30130e2a..954d2beb 100644 Binary files a/docs/YASK-tutorial.pdf and b/docs/YASK-tutorial.pdf differ diff --git a/docs/api/html/classyask_1_1yc__node__factory.html b/docs/api/html/classyask_1_1yc__node__factory.html index dac227ed..2d9abad0 100644 --- a/docs/api/html/classyask_1_1yc__node__factory.html +++ b/docs/api/html/classyask_1_1yc__node__factory.html @@ -99,7 +99,7 @@ virtual yc_number_node_ptr new_const_number_node (double val) const  Create a constant numerical-value node. More...
  -virtual yc_number_node_ptr new_const_number_node (idx_t val) const +virtual yc_number_node_ptr new_const_number_node (idx_t val) const  Create a constant numerical value node. More...
  virtual yc_number_node_ptr new_negate_node (yc_number_node_ptr rhs) const @@ -403,7 +403,7 @@

virtual yc_number_node_ptr yask::yc_node_factory::new_const_number_node ( - idx_t  + idx_t  val) const diff --git a/docs/api/html/classyask_1_1yc__number__any__arg.html b/docs/api/html/classyask_1_1yc__number__any__arg.html index d262e811..a06dd67b 100644 --- a/docs/api/html/classyask_1_1yc__number__any__arg.html +++ b/docs/api/html/classyask_1_1yc__number__any__arg.html @@ -103,7 +103,7 @@  Arg can be a var-point-node pointer.
  yc_number_any_arg (idx_t i) + yc_number_any_arg (idx_t i)  Arg can be an index type.
  diff --git a/docs/api/html/classyask_1_1yc__number__const__arg.html b/docs/api/html/classyask_1_1yc__number__const__arg.html index ecadf1fd..79846fcb 100644 --- a/docs/api/html/classyask_1_1yc__number__const__arg.html +++ b/docs/api/html/classyask_1_1yc__number__const__arg.html @@ -91,7 +91,7 @@

Public Member Functions

yc_number_const_arg (idx_t i) + yc_number_const_arg (idx_t i)  Arg can be an index type.
  diff --git a/docs/api/html/classyask_1_1yc__solution-members.html b/docs/api/html/classyask_1_1yc__solution-members.html index 98e132e8..1c251440 100644 --- a/docs/api/html/classyask_1_1yc__solution-members.html +++ b/docs/api/html/classyask_1_1yc__solution-members.html @@ -79,15 +79,15 @@ clear_clustering()=0yask::yc_solutionpure virtual clear_dependencies()=0yask::yc_solutionpure virtual clear_folding()=0yask::yc_solutionpure virtual - format(const std::string &format_type, yask_output_ptr output)yask::yc_solutioninline + format(const std::string &format_type, yask_output_ptr output)yask::yc_solutioninline get_description() const =0yask::yc_solutionpure virtual get_element_bytes() const =0yask::yc_solutionpure virtual get_equations()=0yask::yc_solutionpure virtual - get_grid(const std::string &name)yask::yc_solutioninline - get_grids()yask::yc_solutioninline + get_grid(const std::string &name)yask::yc_solutioninline + get_grids()yask::yc_solutioninline get_name() const =0yask::yc_solutionpure virtual get_num_equations() const =0yask::yc_solutionpure virtual - get_num_grids() constyask::yc_solutioninline + get_num_grids() constyask::yc_solutioninline get_num_vars() const =0yask::yc_solutionpure virtual get_prefetch_dist(int level)=0yask::yc_solutionpure virtual get_target()=0yask::yc_solutionpure virtual @@ -97,10 +97,10 @@ is_dependency_checker_enabled() const =0yask::yc_solutionpure virtual is_folding_set()=0yask::yc_solutionpure virtual is_target_set()=0yask::yc_solutionpure virtual - new_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)yask::yc_solutioninline - new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)yask::yc_solutioninline - new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)yask::yc_solutioninline - new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)yask::yc_solutioninline + new_grid(const std::string 
&name, const std::vector< yc_index_node_ptr > &dims)yask::yc_solutioninline + new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)yask::yc_solutioninline + new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)yask::yc_solutioninline + new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)yask::yc_solutioninline new_scratch_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0yask::yc_solutionpure virtual new_scratch_var(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)=0yask::yc_solutionpure virtual new_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0yask::yc_solutionpure virtual diff --git a/docs/api/html/classyask_1_1yc__solution.html b/docs/api/html/classyask_1_1yc__solution.html index 8bad8bd2..982ff834 100644 --- a/docs/api/html/classyask_1_1yc__solution.html +++ b/docs/api/html/classyask_1_1yc__solution.html @@ -107,7 +107,7 @@ virtual std::string get_target ()=0  Get the current output-file format. More...
  -virtual void set_target (const std::string &format)=0 +virtual void set_target (const std::string &format)=0  Set the output target. More...
  virtual bool is_target_set ()=0 @@ -202,38 +202,38 @@ virtual void clear_dependencies ()=0  [Advanced] Remove all existing dependencies. More...
  - -void format (const std::string &format_type, yask_output_ptr output) - [Deprecated] Use set_target() and output_solution().
-  - -yc_var_ptr new_grid (const std::string &name, const std::vector< yc_index_node_ptr > &dims) - [Deprecated] Use new_var().
-  - -yc_var_ptr new_grid (const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims) - [Deprecated] Use new_var().
-  - -yc_var_ptr new_scratch_grid (const std::string &name, const std::vector< yc_index_node_ptr > &dims) - [Deprecated] Use new_scratch_var().
-  - -yc_var_ptr new_scratch_grid (const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims) - [Deprecated] Use new_scratch_var().
-  - -int get_num_grids () const - [Deprecated] Use get_num_vars().
-  - -std::vector< yc_var_ptrget_grids () - [Deprecated] Use get_vars().
-  - -yc_var_ptr get_grid (const std::string &name) - [Deprecated] Use get_var().
-  + +YASK_DEPRECATED void format (const std::string &format_type, yask_output_ptr output) + [Deprecated] Use set_target() and output_solution().
+  + +YASK_DEPRECATED yc_var_ptr new_grid (const std::string &name, const std::vector< yc_index_node_ptr > &dims) + [Deprecated] Use new_var().
+  + +YASK_DEPRECATED yc_var_ptr new_grid (const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims) + [Deprecated] Use new_var().
+  + +YASK_DEPRECATED yc_var_ptr new_scratch_grid (const std::string &name, const std::vector< yc_index_node_ptr > &dims) + [Deprecated] Use new_scratch_var().
+  + +YASK_DEPRECATED yc_var_ptr new_scratch_grid (const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims) + [Deprecated] Use new_scratch_var().
+  + +YASK_DEPRECATED int get_num_grids () const + [Deprecated] Use get_num_vars().
+  + +YASK_DEPRECATED std::vector< yc_var_ptrget_grids () + [Deprecated] Use get_vars().
+  + +YASK_DEPRECATED yc_var_ptr get_grid (const std::string &name) + [Deprecated] Use get_var().

Detailed Description

Stencil solution.

@@ -465,7 +465,7 @@

avx512 YASK kernel for CORE AVX-512 ISA. -avx512lo YASK kernel for CORE AVX-512 ISA with 256-bit SIMD. +avx512-ymm YASK kernel for CORE AVX-512 ISA with 256-bit SIMD. knl YASK kernel for MIC AVX-512 ISA. @@ -1142,7 +1142,6 @@

Set the prefetch distance for the given cache.

-

If the prefetch distance is not set for a given cache, a default will be used based on the target format.

Parameters
@@ -1180,7 +1179,7 @@

set_target() and all other preceding YASK compiler API calls.

Progress text will be written to the output stream set via set_debug_output().

-
Warning
Side effect: Applies optimizations to the equation(s), so some pointers to nodes in the original equations may refer to modified nodes or nodes that have been optimized away after calling format(). In general, do not use pointers to nodes across calls to format().
+
Warning
Side effect: Applies optimizations to the equation(s), so some pointers to nodes in the original equations may refer to modified nodes or nodes that have been optimized away after calling format(). In general, do not use pointers to nodes across calls to format().
Parameters

[in]levelCache level: 1 or 2.
@@ -1476,7 +1475,7 @@

format() is called. +
  • The dependencies should create one or more directed acyclic graphs (DAGs). If a cycle is created, the YASK compiler will throw an exception containing an error message about a circular dependency. This exception may not be thrown until format() is called.
  • If using scratch vars, dependencies among scratch vars and between scratch equations and non-scratch equations should also be added. Each scratch equation should ultimately depend on non-scratch values.
  • This function can be used in cooperation with or instead of the built-in automatic dependency checker. When used in cooperation with the built-in checker, dependencies from both this function and the built-in checker will be considered. When the built-in checker is disabled via set_dependency_checker_enabled(false), only dependencies from this function will be considered. In this case, it is imperative that all immediate dependencies are added. If the dependency graph is incomplete, the resulting generated stencil code will contain illegal race conditions, and it will most likely produce incorrect results.
  • diff --git a/docs/api/html/classyask_1_1yc__var-members.html b/docs/api/html/classyask_1_1yc__var-members.html index 7a67b2d3..09e1fa13 100644 --- a/docs/api/html/classyask_1_1yc__var-members.html +++ b/docs/api/html/classyask_1_1yc__var-members.html @@ -73,17 +73,17 @@

    This is the complete list of members for yask::yc_var, including all inherited members.

    [out]outputPointer to object to receive formatted output. See yask_output_factory.
    - + - - - - - - + + + + + + diff --git a/docs/api/html/classyask_1_1yc__var.html b/docs/api/html/classyask_1_1yc__var.html index 1ec18a13..72dd320f 100644 --- a/docs/api/html/classyask_1_1yc__var.html +++ b/docs/api/html/classyask_1_1yc__var.html @@ -87,49 +87,51 @@ - - - + + + - - - - - - - + - + - - - - - - - - - - - - + + + + + + + + + + + + + + + + + +
    get_dim_names() const =0yask::yc_varpure virtual
    get_dim_names() const =0yask::yc_varpure virtual
    get_name() const =0yask::yc_varpure virtual
    get_num_dims() const =0yask::yc_varpure virtual
    get_step_alloc_size() const =0yask::yc_varpure virtual
    is_dynamic_step_alloc() const =0yask::yc_varpure virtual
    new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)yask::yc_varinline
    new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)yask::yc_varinline
    new_relative_grid_point(const std::vector< int > &dim_offsets)yask::yc_varinline
    new_relative_grid_point(const std::initializer_list< int > &dim_offsets)yask::yc_varinline
    new_relative_var_point(const std::vector< int > &dim_offsets)=0yask::yc_varpure virtual
    new_relative_var_point(const std::initializer_list< int > &dim_offsets)=0yask::yc_varpure virtual
    new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)yask::yc_varinline
    new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)yask::yc_varinline
    new_relative_grid_point(const std::vector< int > &dim_offsets)yask::yc_varinline
    new_relative_grid_point(const std::initializer_list< int > &dim_offsets)yask::yc_varinline
    new_relative_var_point(const std::vector< int > &dim_offsets)=0yask::yc_varpure virtual
    new_relative_var_point(const std::initializer_list< int > &dim_offsets)=0yask::yc_varpure virtual
    new_var_point(const std::vector< yc_number_node_ptr > &index_exprs)=0yask::yc_varpure virtual
    new_var_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)=0yask::yc_varpure virtual
    set_dynamic_step_alloc(bool is_dynamic)=0yask::yc_varpure virtual
    virtual int get_num_dims () const =0
     Get the number of dimensions. More...
     
    virtual std::vector< std::string > get_dim_names () const =0
     Get all the dimensions in this var. More...
     
    virtual string_vec get_dim_names () const =0
     Get all the dimensions in this var. More...
     
    virtual yc_var_point_node_ptr new_var_point (const std::vector< yc_number_node_ptr > &index_exprs)=0
     Create a reference to a point in this var. More...
     
    virtual yc_var_point_node_ptr new_var_point (const std::initializer_list< yc_number_node_ptr > &index_exprs)=0
     Create a reference to a point in this var. More...
     
    virtual yc_var_point_node_ptr new_relative_var_point (const std::vector< int > &dim_offsets)=0
     Create a reference to a point in this var using relative offsets. More...
     
    virtual yc_var_point_node_ptr new_relative_var_point (const std::initializer_list< int > &dim_offsets)=0
     Create a reference to a point in this var using relative offsets. More...
     
    virtual bool is_dynamic_step_alloc () const =0
     [Advanced] Get whether the allocation of the step dimension of this var can be modified at run-time. More...
     
    virtual void set_dynamic_step_alloc (bool is_dynamic)=0
     [Advanced] Set whether the allocation of the step dimension of this var can be modified at run-time. More...
     
    virtual idx_t get_step_alloc_size () const =0
    virtual idx_t get_step_alloc_size () const =0
     [Advanced] Get the current allocation in the step dimension of this var. More...
     
    virtual void set_step_alloc_size (idx_t size)=0
    virtual void set_step_alloc_size (idx_t size)=0
     [Advanced] Set the current allocation in the step dimension of this var. More...
     
    -yc_var_point_node_ptr new_grid_point (const std::vector< yc_number_node_ptr > &index_exprs)
     [Deprecated] Use new_var_point().
     
    -yc_var_point_node_ptr new_grid_point (const std::initializer_list< yc_number_node_ptr > &index_exprs)
     [Deprecated] Use new_var_point().
     
    -yc_var_point_node_ptr new_relative_grid_point (const std::vector< int > &dim_offsets)
     [Deprecated] Use new_relative_var_point().
     
    -yc_var_point_node_ptr new_relative_grid_point (const std::initializer_list< int > &dim_offsets)
     [Deprecated] Use new_relative_var_point().
     
    +virtual YASK_DEPRECATED yc_var_point_node_ptr new_relative_var_point (const std::vector< int > &dim_offsets)=0
     [Deprecated] Use new_var_point().
     
    +virtual YASK_DEPRECATED yc_var_point_node_ptr new_relative_var_point (const std::initializer_list< int > &dim_offsets)=0
     [Deprecated] Use new_var_point().
     
    +YASK_DEPRECATED yc_var_point_node_ptr new_grid_point (const std::vector< yc_number_node_ptr > &index_exprs)
     [Deprecated] Use new_var_point().
     
    +YASK_DEPRECATED yc_var_point_node_ptr new_grid_point (const std::initializer_list< yc_number_node_ptr > &index_exprs)
     [Deprecated] Use new_var_point().
     
    +YASK_DEPRECATED yc_var_point_node_ptr new_relative_grid_point (const std::vector< int > &dim_offsets)
     [Deprecated] Use new_relative_var_point().
     
    +YASK_DEPRECATED yc_var_point_node_ptr new_relative_grid_point (const std::initializer_list< int > &dim_offsets)
     [Deprecated] Use new_relative_var_point().
     

    Detailed Description

    A compile-time data variable.

    @@ -192,8 +194,8 @@

    -

    ◆ get_dim_names()

    + +

    ◆ get_dim_names()

    - -

    Create a reference to a point in this var using relative offsets.

    -

    A shorthand function for calling new_var_point() when all index expressions are constant offsets. Each offset refers to the dimensions defined when the var was created via yc_solution::new_var().

    -

    Example: if g = new_var("data", {t, x, y}) with step-dimension t and domain-dimensions x and y, g->new_relative_var_point({1, -1, 0}) refers to the same point as g->new_var_point({t + 1, x - 1, y}).

    -
    Warning
    This convenience function can only be used when every dimension of the var is either the step dimension or a domain dimension. If this is not the case, use new_var_point().
    -
    Returns
    Pointer to AST node used to read from or write to point in var.
    -
    Parameters
    - - -
    [in]dim_offsetsoffset from evaluation index in each dim.
    -
    -
    - -
    -
    - -

    ◆ new_relative_var_point() [2/2]

    - -
    -
    - - - - - -
    - - - - - - - - -
    virtual yc_var_point_node_ptr yask::yc_var::new_relative_var_point (const std::initializer_list< int > & dim_offsets)
    -
    -pure virtual
    -
    - -

    Create a reference to a point in this var using relative offsets.

    -

    C++ initializer-list version with same semantics as the vector version of new_relative_var_point().

    Note
    Not available in the Python API. Use the vector version.
    -
    Returns
    Pointer to AST node used to read or write from point in var.
    -
    @@ -429,7 +363,7 @@

    - + @@ -459,7 +393,7 @@

    virtual void yask::yc_var::set_step_alloc_size

    - + diff --git a/docs/api/html/classyask_1_1yc__var__point__node-members.html b/docs/api/html/classyask_1_1yc__var__point__node-members.html index 8190b726..f85a8ffc 100644 --- a/docs/api/html/classyask_1_1yc__var__point__node-members.html +++ b/docs/api/html/classyask_1_1yc__var__point__node-members.html @@ -75,7 +75,7 @@
    virtual idx_t yask::yc_var::get_step_alloc_size virtual idx_t yask::yc_var::get_step_alloc_size ( ) const(idx_t idx_t  size)
    - + diff --git a/docs/api/html/classyask_1_1yc__var__point__node.html b/docs/api/html/classyask_1_1yc__var__point__node.html index 0d683332..6edd999c 100644 --- a/docs/api/html/classyask_1_1yc__var__point__node.html +++ b/docs/api/html/classyask_1_1yc__var__point__node.html @@ -96,10 +96,10 @@ - - - + + + @@ -115,7 +115,7 @@
    clone_ast() const =0yask::yc_number_nodepure virtual
    format_simple() const =0yask::yc_expr_nodepure virtual
    get_grid()yask::yc_var_point_nodeinline
    get_grid()yask::yc_var_point_nodeinline
    get_num_nodes() const =0yask::yc_expr_nodepure virtual
    get_var()=0yask::yc_var_point_nodepure virtual
    ~yc_expr_node() (defined in yask::yc_expr_node)yask::yc_expr_nodeinlinevirtual
    virtual yc_var_ptr get_var ()=0
     Get the var this point is in. More...
     
    -yc_var_ptr get_grid ()
     [Deprecated] Use get_var().
     
    +YASK_DEPRECATED yc_var_ptr get_grid ()
     [Deprecated] Use get_var().
     
    - Public Member Functions inherited from yask::yc_number_node
    virtual yc_number_node_ptr clone_ast () const =0

    Detailed Description

    Member Function Documentation

    ◆ get_var()

    diff --git a/docs/api/html/classyask_1_1yk__env-members.html b/docs/api/html/classyask_1_1yk__env-members.html index cdb3940e..acc16f7c 100644 --- a/docs/api/html/classyask_1_1yk__env-members.html +++ b/docs/api/html/classyask_1_1yk__env-members.html @@ -73,12 +73,14 @@

    This is the complete list of members for yask::yk_env, including all inherited members.

    - - - - - - + + + + + + + +
    get_debug_output() const =0yask::yk_envpure virtual
    get_num_ranks() const =0yask::yk_envpure virtual
    get_rank_index() const =0yask::yk_envpure virtual
    global_barrier() const =0yask::yk_envpure virtual
    set_debug_output(yask_output_ptr debug)=0yask::yk_envpure virtual
    set_trace_enabled(bool enable)=0yask::yk_envpure virtual
    disable_debug_output()yask::yk_envstatic
    get_debug_output()yask::yk_envstatic
    get_num_ranks() const =0yask::yk_envpure virtual
    get_rank_index() const =0yask::yk_envpure virtual
    global_barrier() const =0yask::yk_envpure virtual
    is_trace_enabled()yask::yk_envstatic
    set_debug_output(yask_output_ptr debug)yask::yk_envstatic
    set_trace_enabled(bool enable)yask::yk_envstatic
    ~yk_env() (defined in yask::yk_env)yask::yk_envinlinevirtual

    diff --git a/docs/api/html/classyask_1_1yk__env.html b/docs/api/html/classyask_1_1yk__env.html index 61acd09d..7785359a 100644 --- a/docs/api/html/classyask_1_1yk__env.html +++ b/docs/api/html/classyask_1_1yk__env.html @@ -68,6 +68,7 @@
    yask::yk_env Class Referenceabstract
    @@ -81,15 +82,6 @@ - - - - - - - - - @@ -99,13 +91,31 @@ +

    Public Member Functions

    virtual void set_debug_output (yask_output_ptr debug)=0
     Set object to receive debug output. More...
     
    virtual yask_output_ptr get_debug_output () const =0
     Get object to receive debug output. More...
     
    virtual void set_trace_enabled (bool enable)=0
     Enable or disable additional debug tracing. More...
     
    virtual int get_num_ranks () const =0
     Get number of MPI ranks. More...
     
    virtual void global_barrier () const =0
     Wait until all ranks have reached this element. More...
     
    + + + + + + + + + + + + + + + +

    +Static Public Member Functions

    static void set_debug_output (yask_output_ptr debug)
     Set object to receive debug output. More...
     
    static void disable_debug_output ()
     Disable the debug output. More...
     
    static yask_output_ptr get_debug_output ()
     Get object to receive debug output. More...
     
    static void set_trace_enabled (bool enable)
     Enable or disable additional debug tracing. More...
     
    static bool is_trace_enabled ()
     Get whether tracing is enabled. More...
     

    Detailed Description

    Kernel environment.

    Created via yk_factory::new_env().

    Member Function Documentation

    - -

    ◆ set_debug_output()

    + +

    ◆ set_debug_output()

    + +

    ◆ get_debug_output()

    + +
    +
    + + + +
    + + + + + + + +
    static yask_output_ptr yask::yk_env::get_debug_output ()
    +
    +static

    Get object to receive debug output.

    -

    Returns pointer to yask_output set via set_debug_output or pointer to a yask_stdout_output if not set.

    +

    This is a static method, implying the following:

      +
    • This method may be called before creating a yk_env object.
    • +
    +
    Returns
    Pointer to yask_output set via set_debug_output or pointer to a yask_stdout_output if not set.
    - -

    ◆ set_trace_enabled()

    + +

    ◆ set_trace_enabled()

    + +

    ◆ is_trace_enabled()

    + +
    +
    + + + + + +
    + + + + + + + +
    static bool yask::yk_env::is_trace_enabled ()
    +
    +static
    +
    + +

    Get whether tracing is enabled.

    +

    This is a static method, implying the following:

      +
    • This method may be called before creating a yk_env object.
    • +
    +
    Returns
    Whether tracing is enabled.
    +
    diff --git a/docs/api/html/classyask_1_1yk__solution-members.html b/docs/api/html/classyask_1_1yk__solution-members.html index 0f214274..c419a7f1 100644 --- a/docs/api/html/classyask_1_1yk__solution-members.html +++ b/docs/api/html/classyask_1_1yk__solution-members.html @@ -75,33 +75,41 @@ - + + + - + - - - - - - + + + + + + + + - - - - - - + + + + + + + + - + + + - + @@ -111,28 +119,38 @@ - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    apply_command_line_options(const std::string &args)=0yask::yk_solutionpure virtual
    apply_command_line_options(int argc, char *argv[])=0yask::yk_solutionpure virtual
    apply_command_line_options(const std::vector< std::string > &args)=0yask::yk_solutionpure virtual
    apply_command_line_options(const string_vec &args)=0yask::yk_solutionpure virtual
    call_after_prepare_solution(hook_fn_t hook_fn)=0yask::yk_solutionpure virtual
    call_after_run_solution(hook_fn_2idx_t hook_fn)=0yask::yk_solutionpure virtual
    call_before_prepare_solution(hook_fn_t hook_fn)=0yask::yk_solutionpure virtual
    call_before_run_solution(hook_fn_2idx_t hook_fn)=0yask::yk_solutionpure virtual
    copy_vars_from_device() const =0yask::yk_solutionpure virtual
    copy_vars_to_device() const =0yask::yk_solutionpure virtual
    end_solution()=0yask::yk_solutionpure virtual
    fuse_grids(yk_solution_ptr source)yask::yk_solutioninline
    fuse_grids(yk_solution_ptr source)yask::yk_solutioninline
    fuse_vars(yk_solution_ptr source)=0yask::yk_solutionpure virtual
    get_block_size(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_default_numa_preferred() const =0yask::yk_solutionpure virtual
    get_domain_dim_names() const =0yask::yk_solutionpure virtual
    get_element_bytes() const =0yask::yk_solutionpure virtual
    get_first_rank_domain_index(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_grid(const std::string &name)yask::yk_solutioninline
    get_grids()yask::yk_solutioninline
    get_block_size_vec() const =0yask::yk_solutionpure virtual
    get_default_numa_preferred() const =0yask::yk_solutionpure virtual
    get_domain_dim_names() const =0yask::yk_solutionpure virtual
    get_element_bytes() const =0yask::yk_solutionpure virtual
    get_first_rank_domain_index(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_first_rank_domain_index_vec() const =0yask::yk_solutionpure virtual
    get_grid(const std::string &name)yask::yk_solutioninline
    get_grids()yask::yk_solutioninline
    get_last_rank_domain_index(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_min_pad_size(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_misc_dim_names() const =0yask::yk_solutionpure virtual
    get_name() const =0yask::yk_solutionpure virtual
    get_num_domain_dims() const =0yask::yk_solutionpure virtual
    get_num_grids() constyask::yk_solutioninline
    get_num_ranks(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_last_rank_domain_index_vec() const =0yask::yk_solutionpure virtual
    get_min_pad_size(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_misc_dim_names() const =0yask::yk_solutionpure virtual
    get_name() const =0yask::yk_solutionpure virtual
    get_num_domain_dims() const =0yask::yk_solutionpure virtual
    get_num_grids() constyask::yk_solutioninline
    get_num_ranks(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_num_ranks_vec() const =0yask::yk_solutionpure virtual
    get_num_vars() const =0yask::yk_solutionpure virtual
    get_overall_domain_size(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_rank_domain_size(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_overall_domain_size_vec() const =0yask::yk_solutionpure virtual
    get_rank_domain_size(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_rank_domain_size_vec() const =0yask::yk_solutionpure virtual
    get_rank_index(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_region_size(const std::string &dim) const =0yask::yk_solutionpure virtual
    get_rank_index_vec() const =0yask::yk_solutionpure virtual
    get_stats()=0yask::yk_solutionpure virtual
    get_step_dim_name() const =0yask::yk_solutionpure virtual
    get_step_wrap() const =0yask::yk_solutionpure virtual
    hook_fn_2idx_t typedefyask::yk_solution
    hook_fn_t typedefyask::yk_solution
    is_auto_tuner_enabled() const =0yask::yk_solutionpure virtual
    new_fixed_size_grid(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)yask::yk_solutioninline
    new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const std::vector< idx_t > &dim_sizes)yask::yk_solutioninline
    new_fixed_size_var(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)=0yask::yk_solutionpure virtual
    new_fixed_size_var(const std::string &name, const std::initializer_list< std::string > &dims, const std::initializer_list< idx_t > &dim_sizes)=0yask::yk_solutionpure virtual
    new_grid(const std::string &name, const std::vector< std::string > &dims)yask::yk_solutioninline
    new_grid(const std::string &name, const std::initializer_list< std::string > &dims)yask::yk_solutioninline
    new_var(const std::string &name, const std::vector< std::string > &dims)=0yask::yk_solutionpure virtual
    new_var(const std::string &name, const std::initializer_list< std::string > &dims)=0yask::yk_solutionpure virtual
    prepare_solution()=0yask::yk_solutionpure virtual
    reset_auto_tuner(bool enable, bool verbose=false)=0yask::yk_solutionpure virtual
    run_auto_tuner_now(bool verbose=true)=0yask::yk_solutionpure virtual
    run_solution(idx_t first_step_index, idx_t last_step_index)=0yask::yk_solutionpure virtual
    run_solution(idx_t step_index)=0yask::yk_solutionpure virtual
    set_block_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_debug_output(yask_output_ptr debug)=0yask::yk_solutionpure virtual
    set_default_numa_preferred(int numa_node)=0yask::yk_solutionpure virtual
    set_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_num_ranks(const std::string &dim, idx_t num)=0yask::yk_solutionpure virtual
    set_overall_domain_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_rank_domain_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_rank_index(const std::string &dim, idx_t num)=0yask::yk_solutionpure virtual
    set_region_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    is_offloaded() const =0yask::yk_solutionpure virtual
    new_fixed_size_grid(const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)yask::yk_solutioninline
    new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_vec &dim_sizes)yask::yk_solutioninline
    new_fixed_size_var(const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)=0yask::yk_solutionpure virtual
    new_fixed_size_var(const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_init_list &dim_sizes)=0yask::yk_solutionpure virtual
    new_grid(const std::string &name, const string_vec &dims)yask::yk_solutioninline
    new_grid(const std::string &name, const std::initializer_list< std::string > &dims)yask::yk_solutioninline
    new_var(const std::string &name, const string_vec &dims)=0yask::yk_solutionpure virtual
    new_var(const std::string &name, const std::initializer_list< std::string > &dims)=0yask::yk_solutionpure virtual
    prepare_solution()=0yask::yk_solutionpure virtual
    reset_auto_tuner(bool enable, bool verbose=false)=0yask::yk_solutionpure virtual
    run_auto_tuner_now(bool verbose=true)=0yask::yk_solutionpure virtual
    run_solution(idx_t first_step_index, idx_t last_step_index)=0yask::yk_solutionpure virtual
    run_solution(idx_t step_index)=0yask::yk_solutionpure virtual
    set_block_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_block_size_vec(const idx_t_vec &vals)=0yask::yk_solutionpure virtual
    set_block_size_vec(const idx_t_init_list &vals)=0yask::yk_solutionpure virtual
    set_debug_output(yask_output_ptr debug)=0yask::yk_solutionpure virtual
    set_default_numa_preferred(int numa_node)=0yask::yk_solutionpure virtual
    set_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_num_ranks(const std::string &dim, idx_t num)=0yask::yk_solutionpure virtual
    set_num_ranks_vec(const idx_t_vec &vals)=0yask::yk_solutionpure virtual
    set_num_ranks_vec(const idx_t_init_list &vals)=0yask::yk_solutionpure virtual
    set_overall_domain_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_overall_domain_size_vec(const idx_t_vec &vals)=0yask::yk_solutionpure virtual
    set_overall_domain_size_vec(const idx_t_init_list &vals)=0yask::yk_solutionpure virtual
    set_rank_domain_size(const std::string &dim, idx_t size)=0yask::yk_solutionpure virtual
    set_rank_domain_size_vec(const idx_t_vec &vals)=0yask::yk_solutionpure virtual
    set_rank_domain_size_vec(const idx_t_init_list &vals)=0yask::yk_solutionpure virtual
    set_rank_index(const std::string &dim, idx_t num)=0yask::yk_solutionpure virtual
    set_rank_index_vec(const idx_t_vec &vals)=0yask::yk_solutionpure virtual
    set_rank_index_vec(const idx_t_init_list &vals)=0yask::yk_solutionpure virtual
    set_step_wrap(bool do_wrap)=0yask::yk_solutionpure virtual
    ~yk_solution() (defined in yask::yk_solution)yask::yk_solutioninlinevirtual
    diff --git a/docs/api/html/classyask_1_1yk__solution.html b/docs/api/html/classyask_1_1yk__solution.html index 10bab1d5..7ee07f4b 100644 --- a/docs/api/html/classyask_1_1yk__solution.html +++ b/docs/api/html/classyask_1_1yk__solution.html @@ -87,21 +87,21 @@  [Advanced] Callback type with yk_solution parameter.
      -typedef std::function< void(yk_solution &soln, idx_t first_step_index, idx_t last_step_index)> hook_fn_2idx_t +typedef std::function< void(yk_solution &soln, idx_t first_step_index, idx_t last_step_index)> hook_fn_2idx_t  [Advanced] Callback type with yk_solution and step-index parameters.
      - - - + + + @@ -111,51 +111,96 @@ - - - - - - - + + + + + + + - + + + + + + + - + + + + - + + + + + + + - + + + + - + + + + + + + - + + + + - + + + + + + + - + + + + - + + + + + + + + + + - - - + + + @@ -168,57 +213,63 @@ - + - + + + + - + + + + - + + + + + + + + + + - + - - - - - - - + + + + - + - - - - - - - - - + + + - - - - - - + + + + + + @@ -246,46 +297,50 @@ - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + +

    Public Member Functions

    virtual void set_debug_output (yask_output_ptr debug)=0
     Set object to receive debug output. More...
     
    virtual const std::string & get_name () const =0
     Get the name of the solution. More...
     
    virtual std::string get_target () const =0
     Get the target ISA. More...
     
    virtual bool is_offloaded () const =0
     Get whether the stencil kernel will be offloaded to a device. More...
     
    virtual int get_element_bytes () const =0
     Get the floating-point precision size. More...
     
    virtual int get_num_domain_dims () const =0
     Get the number of domain dimensions used in this solution. More...
     
    virtual std::vector< std::string > get_domain_dim_names () const =0
     Get all the domain dimension names. More...
     
    virtual std::vector< std::string > get_misc_dim_names () const =0
     Get all the miscellaneous dimension names. More...
     
    virtual void set_rank_domain_size (const std::string &dim, idx_t size)=0
    virtual string_vec get_domain_dim_names () const =0
     Get all the domain dimension names. More...
     
    virtual string_vec get_misc_dim_names () const =0
     Get all the miscellaneous dimension names. More...
     
    virtual void set_rank_domain_size (const std::string &dim, idx_t size)=0
     Set the local-domain size in the specified dimension, i.e., the size of the part of the domain that is in this rank. More...
     
    virtual idx_t get_rank_domain_size (const std::string &dim) const =0
    virtual void set_rank_domain_size_vec (const idx_t_vec &vals)=0
     Set the local-domain size in all domain dimensions. More...
     
    virtual void set_rank_domain_size_vec (const idx_t_init_list &vals)=0
     Set the local-domain size in all domain dimensions. More...
     
    virtual idx_t get_rank_domain_size (const std::string &dim) const =0
     Get the local-domain size in the specified dimension, i.e., the size in this rank. More...
     
    virtual void set_overall_domain_size (const std::string &dim, idx_t size)=0
    virtual idx_t_vec get_rank_domain_size_vec () const =0
     Get the local-domain size in all domain dimensions. More...
     
    virtual void set_overall_domain_size (const std::string &dim, idx_t size)=0
     Set the global-domain size in the specified dimension, i.e., the total size across all MPI ranks. More...
     
    virtual idx_t get_overall_domain_size (const std::string &dim) const =0
    virtual void set_overall_domain_size_vec (const idx_t_vec &vals)=0
     Set the global-domain size in all domain dimensions. More...
     
    virtual void set_overall_domain_size_vec (const idx_t_init_list &vals)=0
     Set the global-domain size in all domain dimensions. More...
     
    virtual idx_t get_overall_domain_size (const std::string &dim) const =0
     Get the global-domain size in the specified dimension, i.e., the total size across all MPI ranks. More...
     
    virtual void set_block_size (const std::string &dim, idx_t size)=0
    virtual idx_t_vec get_overall_domain_size_vec () const =0
     Get the global-domain size in all domain dimensions. More...
     
    virtual void set_block_size (const std::string &dim, idx_t size)=0
     Set the block size in the given dimension. More...
     
    virtual idx_t get_block_size (const std::string &dim) const =0
    virtual void set_block_size_vec (const idx_t_vec &vals)=0
     Set the block size in all domain dimensions. More...
     
    virtual void set_block_size_vec (const idx_t_init_list &vals)=0
     Set the block size in all domain dimensions. More...
     
    virtual idx_t get_block_size (const std::string &dim) const =0
     Get the block size. More...
     
    virtual void set_num_ranks (const std::string &dim, idx_t num)=0
    virtual idx_t_vec get_block_size_vec () const =0
     Get the block size in all domain dimensions. More...
     
    virtual void set_num_ranks (const std::string &dim, idx_t num)=0
     Set the number of MPI ranks in the given dimension. More...
     
    virtual idx_t get_num_ranks (const std::string &dim) const =0
    virtual void set_num_ranks_vec (const idx_t_vec &vals)=0
     Set the number of MPI ranks in all domain dimensions. More...
     
    virtual void set_num_ranks_vec (const idx_t_init_list &vals)=0
     Set the number of MPI ranks in all domain dimensions. More...
     
    virtual idx_t get_num_ranks (const std::string &dim) const =0
     Get the number of MPI ranks in the given dimension. More...
     
    virtual void set_rank_index (const std::string &dim, idx_t num)=0
    virtual idx_t_vec get_num_ranks_vec () const =0
     Get the number of MPI ranks in all domain dimensions. More...
     
    virtual void set_rank_index (const std::string &dim, idx_t num)=0
     Set the rank index in the specified dimension. More...
     
    virtual idx_t get_rank_index (const std::string &dim) const =0
    virtual void set_rank_index_vec (const idx_t_vec &vals)=0
     Set the rank index in all domain dimensions. More...
     
    virtual void set_rank_index_vec (const idx_t_init_list &vals)=0
     Set the rank index in all domain dimensions. More...
     
    virtual idx_t get_rank_index (const std::string &dim) const =0
     Get the rank index in the specified dimension. More...
     
    virtual idx_t_vec get_rank_index_vec () const =0
     Get the rank index in all domain dimensions. More...
     
    virtual std::string apply_command_line_options (const std::string &args)=0
     Set kernel options from a string. More...
     
    virtual std::string apply_command_line_options (int argc, char *argv[])=0
     Set kernel options from standard C or C++ argc and argv parameters to main(). More...
     
    virtual std::string apply_command_line_options (const std::vector< std::string > &args)=0
     Set kernel options from a vector of strings. More...
     
    virtual std::string apply_command_line_options (const string_vec &args)=0
     Set kernel options from a vector of strings. More...
     
    virtual int get_num_vars () const =0
     Get the number of vars in the solution. More...
     
    virtual void prepare_solution ()=0
     Prepare the solution for stencil application. More...
     
    virtual idx_t get_first_rank_domain_index (const std::string &dim) const =0
    virtual idx_t get_first_rank_domain_index (const std::string &dim) const =0
     Get the first index of the sub-domain in this rank in the specified dimension. More...
     
    virtual idx_t get_last_rank_domain_index (const std::string &dim) const =0
    virtual idx_t_vec get_first_rank_domain_index_vec () const =0
     Get the first index of the sub-domain in this rank in all domain dimensions. More...
     
    virtual idx_t get_last_rank_domain_index (const std::string &dim) const =0
     Get the last index of the sub-domain in this rank in the specified dimension. More...
     
    virtual void run_solution (idx_t first_step_index, idx_t last_step_index)=0
    virtual idx_t_vec get_last_rank_domain_index_vec () const =0
     Get the last index of the sub-domain in this rank in all domain dimensions. More...
     
    virtual void run_solution (idx_t first_step_index, idx_t last_step_index)=0
     Run the stencil solution for the specified steps. More...
     
    virtual void run_solution (idx_t step_index)=0
    virtual void run_solution (idx_t step_index)=0
     Run the stencil solution for the specified step. More...
     
    virtual void copy_vars_to_device () const =0
     Update data on the device. More...
     
    virtual void copy_vars_from_device () const =0
     Update data on the host. More...
     
    virtual void end_solution ()=0
     Finish using a solution. More...
     
    virtual yk_stats_ptr get_stats ()=0
     Get performance statistics associated with preceding calls to run_solution(). More...
     
    virtual void reset_auto_tuner (bool enable, bool verbose=false)=0
     Start or stop the online auto-tuner on this rank. More...
     
    virtual bool is_auto_tuner_enabled () const =0
     Determine whether the auto-tuner is enabled on this rank. More...
     Determine whether the online auto-tuner is enabled on this rank. More...
     
    virtual void set_region_size (const std::string &dim, idx_t size)=0
     [Advanced] Set the region size in the given dimension. More...
     
    virtual idx_t get_region_size (const std::string &dim) const =0
     [Advanced] Get the region size. More...
     
    virtual void set_min_pad_size (const std::string &dim, idx_t size)=0
    virtual void run_auto_tuner_now (bool verbose=true)=0
     Run the offline auto-tuner immediately, not preserving variable data. More...
     
    virtual void set_min_pad_size (const std::string &dim, idx_t size)=0
     [Advanced] Set the minimum amount of padding for all vars. More...
     
    virtual idx_t get_min_pad_size (const std::string &dim) const =0
    virtual idx_t get_min_pad_size (const std::string &dim) const =0
     [Advanced] Get the minimum amount of padding for all vars. More...
     
    virtual void reset_auto_tuner (bool enable, bool verbose=false)=0
     [Advanced] Restart or disable the auto-tuner on this rank. More...
     
    virtual void run_auto_tuner_now (bool verbose=true)=0
     [Advanced] Automatically tune selected settings immediately. More...
     
    virtual yk_var_ptr new_var (const std::string &name, const std::vector< std::string > &dims)=0
     [Advanced] Add a new var to the solution. More...
     
    virtual yk_var_ptr new_var (const std::string &name, const string_vec &dims)=0
     [Advanced] Add a new var to the solution. More...
     
    virtual yk_var_ptr new_var (const std::string &name, const std::initializer_list< std::string > &dims)=0
     [Advanced] Add a new var to the solution. More...
     
    virtual yk_var_ptr new_fixed_size_var (const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)=0
     [Advanced] Add a new var to the solution with a specified size. More...
     
    virtual yk_var_ptr new_fixed_size_var (const std::string &name, const std::initializer_list< std::string > &dims, const std::initializer_list< idx_t > &dim_sizes)=0
     [Advanced] Add a new var to the solution with a specified size. More...
     
    virtual yk_var_ptr new_fixed_size_var (const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)=0
     [Advanced] Add a new var to the solution with a specified size. More...
     
    virtual yk_var_ptr new_fixed_size_var (const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_init_list &dim_sizes)=0
     [Advanced] Add a new var to the solution with a specified size. More...
     
    virtual bool set_default_numa_preferred (int numa_node)=0
     [Advanced] Set the default preferred NUMA node on which to allocate data. More...
     
    virtual bool get_step_wrap () const =0
     [Advanced] Get whether invalid step indices alias to valid ones. More...
     
    -int get_num_grids () const
     [Deprecated] Use get_num_vars().
     
    -yk_var_ptr get_grid (const std::string &name)
     [Deprecated] Use get_var().
     
    -std::vector< yk_var_ptr > get_grids ()
     [Deprecated] Use get_vars().
     
    -yk_var_ptr new_grid (const std::string &name, const std::vector< std::string > &dims)
     [Deprecated] Use new_var().
     
    -yk_var_ptr new_grid (const std::string &name, const std::initializer_list< std::string > &dims)
     [Deprecated] Use new_var().
     
    -yk_var_ptr new_fixed_size_grid (const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)
     [Deprecated] Use new_fixed_size_var().
     
    -yk_var_ptr new_fixed_size_grid (const std::string &name, const std::initializer_list< std::string > &dims, const std::vector< idx_t > &dim_sizes)
     [Deprecated] Use new_fixed_size_var().
     
    -void fuse_grids (yk_solution_ptr source)
     [Deprecated] Use fuse_vars().
     
    +virtual YASK_DEPRECATED void set_debug_output (yask_output_ptr debug)=0
     [Deprecated] Use yk_env::set_debug_output().
     
    +YASK_DEPRECATED int get_num_grids () const
     [Deprecated] Use get_num_vars().
     
    +YASK_DEPRECATED yk_var_ptr get_grid (const std::string &name)
     [Deprecated] Use get_var().
     
    +YASK_DEPRECATED std::vector< yk_var_ptr > get_grids ()
     [Deprecated] Use get_vars().
     
    +YASK_DEPRECATED yk_var_ptr new_grid (const std::string &name, const string_vec &dims)
     [Deprecated] Use new_var().
     
    +YASK_DEPRECATED yk_var_ptr new_grid (const std::string &name, const std::initializer_list< std::string > &dims)
     [Deprecated] Use new_var().
     
    +YASK_DEPRECATED yk_var_ptr new_fixed_size_grid (const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)
     [Deprecated] Use new_fixed_size_var().
     
    +YASK_DEPRECATED yk_var_ptr new_fixed_size_grid (const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_vec &dim_sizes)
     [Deprecated] Use new_fixed_size_var().
     
    +YASK_DEPRECATED void fuse_grids (yk_solution_ptr source)
     [Deprecated] Use fuse_vars().
     

    Detailed Description

    Stencil solution as defined by the generated code from the YASK stencil compiler.

    Objects of this type contain all the vars and equations that comprise a solution.

    Created via yk_factory::new_solution().

    Member Function Documentation

    - -

    ◆ set_debug_output()

    + +

    ◆ get_name()

    @@ -294,11 +349,10 @@

    - + - - - + +
    virtual void yask::yk_solution::set_debug_output virtual const std::string& yask::yk_solution::get_name (yask_output_ptr debug)) const
    @@ -308,19 +362,13 @@

    -

    Set object to receive debug output.

    -

    Just a shortcut for setting the debug output in the yk_env used to create the solution.

    -
    Parameters
    - - -
    [out]debugPointer to object to receive debug output. See yask_output_factory.
    -
    -
    +

    Get the name of the solution.

    +
    Returns
    String containing the solution name provided during stencil compilation.

    - -

    ◆ get_name()

    + +

    ◆ get_target()

    @@ -329,7 +377,7 @@

    - + @@ -342,13 +390,13 @@

    -

    Get the name of the solution.

    -
    Returns
    String containing the solution name provided during stencil compilation.
    +

    Get the target ISA.

    +
    Returns
    String describing the instruction-set architecture of the CPU targeted during kernel compilation. See the allowed YASK kernel targets in yc_solution::set_target().
    - -

    ◆ get_target()

    + +

    ◆ is_offloaded()

    @@ -357,7 +405,7 @@

    virtual const std::string& yask::yk_solution::get_name virtual std::string yask::yk_solution::get_target ( ) const
    - + @@ -370,8 +418,8 @@

    -

    Get the target ISA.

    -
    Returns
    String describing the instruction-set architecture targeted during kernel compilation. See the allowed YASK kernel targets in yc_solution::set_target().
    +

    Get whether the stencil kernel will be offloaded to a device.

    +
    Returns
    true if kernel will be offloaded or false if not.
    @@ -459,8 +507,8 @@

    -

    ◆ get_domain_dim_names()

    + +

    ◆ get_domain_dim_names()

    @@ -469,7 +517,7 @@

    virtual std::string yask::yk_solution::get_target virtual bool yask::yk_solution::is_offloaded ( ) const
    - + @@ -487,8 +535,8 @@

    -

    ◆ get_misc_dim_names()

    + +

    ◆ get_misc_dim_names()

    @@ -497,7 +545,7 @@

    virtual std::vector<std::string> yask::yk_solution::get_domain_dim_names virtual string_vec yask::yk_solution::get_domain_dim_names ( ) const
    - + @@ -511,7 +559,7 @@

    Get all the miscellaneous dimension names.

    -
    Returns
    List of all dimension names that were either Defined by yc_node_factory::new_misc_index() and used in one or more vars, or Created at run-time by adding a new dimension via yk_solution::new_var() or yk_solution::new_fixed_size_var().
    +
    Returns
    List of all dimension names that were either Defined by yc_node_factory::new_misc_index() and used in one or more vars, or Created at run-time by adding a new dimension via yk_solution::new_var() or yk_solution::new_fixed_size_var().
    @@ -533,7 +581,7 @@

    - + @@ -551,16 +599,86 @@

    prepare_solution() is called. Setting the local-domain size to a non-zero value will clear the global-domain size in that dimension until prepare_solution() is called.

    +

    You should set either the local-domain size or the global-domain size in each dimension; the other should be set to zero (unspecified). The unspecified (zero) sizes will be calculated based on the specified ones when prepare_solution() is called.

    See the "Detailed Description" for yk_var for more information on var sizes.

    Parameters

    virtual std::vector<std::string> yask::yk_solution::get_misc_dim_names virtual string_vec yask::yk_solution::get_misc_dim_names ( ) constidx_t idx_t  size 
    - +
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]sizeElements in the domain in this `dim`.

    +

    + + +

    ◆ set_rank_domain_size_vec() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_rank_domain_size_vec (const idx_t_vec & vals)
    +
    +pure virtual
    +
    + +

    Set the local-domain size in all domain dimensions.

    +

    See set_rank_domain_size().

    +
    Parameters
    + + +
    [in]valsElements in all domain dims.
    +
    +
    + +
    +
    + +

    ◆ set_rank_domain_size_vec() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_rank_domain_size_vec (const idx_t_init_list & vals)
    +
    +pure virtual
    +
    + +

    Set the local-domain size in all domain dimensions.

    +

    See set_rank_domain_size().

    +
    Parameters
    + + +
    [in]valsElements in all domain dims.
    +
    +
    +
    @@ -573,7 +691,7 @@

    - + @@ -589,15 +707,43 @@

    set_rank_domain_size().

    -

    If you have called set_overall_domain_size() in a given dimension, get_rank_domain_size() will return zero in that dimension until prepare_solution() is called. After prepare_solution() is called, the computed size will be returned.

    +
    Note
    get_rank_domain_size() may return zero in a dimension until prepare_solution() is called. After prepare_solution() is called, the computed size will be returned.
    Returns
    Current setting of rank domain size in specified dimension.
    Parameters

    virtual idx_t yask::yk_solution::get_rank_domain_size virtual idx_t yask::yk_solution::get_rank_domain_size ( const std::string &  dim)
    - +
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    + + + +

    ◆ get_rank_domain_size_vec()

    + +
    +
    + + + + + +
    + + + + + + + +
    virtual idx_t_vec yask::yk_solution::get_rank_domain_size_vec () const
    +
    +pure virtual
    +
    + +

    Get the local-domain size in all domain dimensions.

    +

    See get_rank_domain_size().

    Returns
    Vector of current setting of rank domain sizes.
    +
    @@ -618,7 +764,7 @@

    - idx_t  + idx_t  size  @@ -635,16 +781,86 @@

    Get the global-domain size in the specified dimension, i.e., the total size across all MPI ranks.

    -

    You should set either the local-domain size or the global-domain size in each dimension. The unspecified (zero) sizes will be calculated based on the specified ones when prepare_solution() is called. Setting the global-domain size to a non-zero value will clear the local-domain size in that dimension until prepare_solution() is called.

    +

    You should set either the local-domain size or the global-domain size in each dimension; the other should be set to zero (unspecified). The unspecified (zero) sizes will be calculated based on the specified ones when prepare_solution() is called.

    See documentation for set_rank_domain_size(). See the "Detailed Description" for yk_var for more information on var sizes.

    Parameters
    - +
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]sizeElements in the domain in this `dim`.
    + + + +

    ◆ set_overall_domain_size_vec() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_overall_domain_size_vec (const idx_t_vec & vals)
    +
    +pure virtual
    +
    + +

    Set the global-domain size in all domain dimensions.

    +

    See set_overall_domain_size().

    +
    Parameters
    + + +
    [in]valsElements in all domain dims.
    +
    +
    + +
    +
    + +

    ◆ set_overall_domain_size_vec() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_overall_domain_size_vec (const idx_t_init_list & vals)
    +
    +pure virtual
    +
    + +

    Set the global-domain size in all domain dimensions.

    +

    See set_overall_domain_size().

    +
    Parameters
    + + +
    [in]valsElements in all domain dims.
    +
    +
    +
    @@ -657,7 +873,7 @@

    - + @@ -673,15 +889,44 @@

    get_overall_domain_size() - 1, inclusive. Call get_first_rank_domain_index() and get_last_rank_domain_index() to find the subset of this domain in each rank.

    -

    If you have called set_rank_domain_size() in a given dimension, get_overall_domain_size() will return zero in that dimension until prepare_solution() is called. After prepare_solution() is called, the computed size will be returned.

    +
    Note
    get_overall_domain_size() may return zero in a dimension until prepare_solution() is called. After prepare_solution() is called, the computed size will be returned.
    Returns
    Sum of all ranks' domain sizes in the given dimension.
    Parameters

    virtual idx_t yask::yk_solution::get_overall_domain_size virtual idx_t yask::yk_solution::get_overall_domain_size ( const std::string &  dim)
    - +
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    + + + +

    ◆ get_overall_domain_size_vec()

    + +
    +
    + + + + + +
    + + + + + + + +
    virtual idx_t_vec yask::yk_solution::get_overall_domain_size_vec () const
    +
    +pure virtual
    +
    + +

    Get the global-domain size in all domain dimensions.

    +

    See get_overall_domain_size().

    +
    Returns
    Vector of current setting of global domain sizes.
    +
    @@ -702,7 +947,7 @@

    - idx_t  + idx_t  size  @@ -720,15 +965,88 @@

    -

    Unless auto-tuning is disabled, the block size will be used as a starting point for an automated search for a higher-performing block size.

    +

    Unless auto-tuning is disabled, the block size will be used as a starting point for an automated search for a higher-performing block size.

    +

    This and all other tile sizes (Mega-blocks, blocks, micro-blocks, etc.) can be set via apply_command_line_options().

    Parameters
    - +
    [in]dimName of dimension to set. Must be one of the names from get_step_dim_name() or get_domain_dim_names().
    [in]dimName of dimension to set. Must be one of the names from get_step_dim_name() or get_domain_dim_names().
    [in]sizeElements in a block in this `dim`.
    + + + +

    ◆ set_block_size_vec() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_block_size_vec (const idx_t_vec & vals)
    +
    +pure virtual
    +
    + +

    Set the block size in all domain dimensions.

    +

    See set_block_size().

    +
    Note
    Does not set the block size in the step dim. Call set_block_size() with the name of the step dim to set the temporal block size.
    +
    Parameters
    + + +
    [in]valsElements in all domain dims.
    +
    +
    + +
    +
    + +

    ◆ set_block_size_vec() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_block_size_vec (const idx_t_init_list & vals)
    +
    +pure virtual
    +
    + +

    Set the block size in all domain dimensions.

    +

    See set_block_size().

    +
    Note
    Does not set the block size in the step dim. Call set_block_size() with the name of the step dim to set the temporal block size.
    +
    Parameters
    + + +
    [in]valsElements in all domain dims.
    +
    +
    +
    @@ -741,7 +1059,7 @@

    - + @@ -759,11 +1077,41 @@

    set_block_size() due to rounding.

    Returns
    Current settings of block size.
    Parameters

    virtual idx_t yask::yk_solution::get_block_size virtual idx_t yask::yk_solution::get_block_size ( const std::string &  dim)
    - +
    [in]dimName of dimension to get. Must be one of the names from get_step_dim_name() or get_domain_dim_names().
    [in]dimName of dimension to get. Must be one of the names from get_step_dim_name() or get_domain_dim_names().
    + + + +

    ◆ get_block_size_vec()

    + +
    +
    + + + + + +
    + + + + + + + +
    virtual idx_t_vec yask::yk_solution::get_block_size_vec () const
    +
    +pure virtual
    +
    + +

    Get the block size in all domain dimensions.

    +

    See get_block_size().

    +
    Note
    Does not return the block size in the step dim. Call get_block_size() with the name of the step dim to get the temporal block size.
    +
    Returns
    Vector of current setting of block domain sizes.
    +
    @@ -784,7 +1132,7 @@

    - idx_t  + idx_t  num  @@ -807,16 +1155,207 @@

    Parameters
    - - + + +
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]numNumber of ranks in `dim`.
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]numNumber of ranks in `dim`.
    +
    + + + + +
    +

    ◆ set_num_ranks_vec() [1/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_num_ranks_vec (const idx_t_vec & vals)
    +
    +pure virtual
    +
    + +

    Set the number of MPI ranks in all domain dimensions.

    +

    See set_num_ranks().

    +
    Parameters
    + + +
    [in]valsNumber of ranks in all domain dims.
    +
    +
    + +
    +
    + +

    ◆ set_num_ranks_vec() [2/2]

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual void yask::yk_solution::set_num_ranks_vec (const idx_t_init_list & vals)
    +
    +pure virtual
    +
    + +

    Set the number of MPI ranks in all domain dimensions.

    +

    See set_num_ranks().

    +
    Parameters
    + + +
    [in]valsNumber of ranks in all domain dims.
    +
    +
    + +
    +
    + +

    ◆ get_num_ranks()

    + +
    +
    + + + + + +
    + + + + + + + + +
    virtual idx_t yask::yk_solution::get_num_ranks (const std::string & dim) const
    +
    +pure virtual
    +
    + +

    Get the number of MPI ranks in the given dimension.

    +
    Note
    get_num_ranks() may return zero in a dimension until prepare_solution() is called. After prepare_solution() is called, the computed number of ranks will be returned.
    +
    Returns
    Current number of ranks.
    +
    Parameters
    + + +
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    +
    +
    + +
    +
    + +

    ◆ get_num_ranks_vec()

    + +
    +
    + + + + + +
    + + + + + + + +
    virtual idx_t_vec yask::yk_solution::get_num_ranks_vec () const
    +
    +pure virtual
    +
    + +

    Get the number of MPI ranks in all domain dimensions.

    +

    See get_num_ranks().

    Returns
    Vector of current number of ranks in all domain dimensions.
    + +
    +
    + +

    ◆ set_rank_index()

    + +
    +
    + + + + + +
    + + + + + + + + + + + + + + + + + + +
    virtual void yask::yk_solution::set_rank_index (const std::string & dim,
    idx_t num 
    )
    +
    +pure virtual
    +
    + +

    Set the rank index in the specified dimension.

    +

    The overall rank index in the specified dimension must range from zero (0) to get_num_ranks() - 1, inclusive. If you do not call set_rank_index(), a rank index will be assigned when prepare_solution() is called. You should either call set_rank_index() on all ranks or allow YASK to assign one on all ranks, i.e., do not mix-and-match.

    +

    Example using 6 MPI ranks in a 2-by-3 x, y domain:

    + + + + + + + +
    MPI rank index = 0, x rank index = 0, y rank index = 0 MPI rank index = 1, x rank index = 1, y rank index = 0
    MPI rank index = 2, x rank index = 0, y rank index = 1 MPI rank index = 3, x rank index = 1, y rank index = 1
    MPI rank index = 4, x rank index = 0, y rank index = 2 MPI rank index = 5, x rank index = 1, y rank index = 2
    +

    See yk_env::get_num_ranks() and yk_env::get_rank_index() for MPI rank index.

    +
    Note
    get_rank_index() may return zero in a dimension until prepare_solution() is called. After prepare_solution() is called, the computed index will be returned.
    +
    Parameters
    + + +
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]numRank index in `dim`.
    - -

    ◆ get_num_ranks()

    + +

    ◆ set_rank_index_vec() [1/2]

    - -

    ◆ set_rank_index()

    + +

    ◆ set_rank_index_vec() [2/2]

    @@ -860,21 +1399,11 @@

    - + - - - - - - - - - - + + - -
    virtual void yask::yk_solution::set_rank_index virtual void yask::yk_solution::set_rank_index_vec (const std::string & dim,
    idx_t num 
    const idx_t_init_listvals) )
    @@ -884,22 +1413,11 @@

    -

    Set the rank index in the specified dimension.

    -

    The overall rank index in the specified dimension must range from zero (0) to get_num_ranks() - 1, inclusive. If you do not call set_rank_index(), a rank index will be assigned when prepare_solution() is called. You should either call set_rank_index() on all ranks or allow YASK to assign on on all ranks, i.e., do not mix-and-match.

    -

    Example using 6 MPI ranks in a 2-by-3 x, y domain:

    - - - - - - - -
    MPI rank index = 0, x rank index = 0, y rank index = 0 MPI rank index = 1, x rank index = 1, y rank index = 0
    MPI rank index = 2, x rank index = 0, y rank index = 1 MPI rank index = 3, x rank index = 1, y rank index = 1
    MPI rank index = 4, x rank index = 0, y rank index = 2 MPI rank index = 5, x rank index = 1, y rank index = 2
    -

    See yk_env::get_num_ranks() and yk_env::get_rank_index() for MPI rank index.

    +

    Set the rank index in all domain dimensions.

    +

    See set_rank_index().

    Parameters
    - - +
    [in]dimName of dimension to set. Must be one of the names from get_domain_dim_names().
    [in]numRank index in `dim`.
    [in]valsIndex of this rank in all domain dims.
    @@ -916,7 +1434,7 @@

    - + @@ -934,11 +1452,39 @@

    get_num_ranks() - 1, inclusive.

    Returns
    Zero-based index of this rank.
    Parameters

    virtual idx_t yask::yk_solution::get_rank_index virtual idx_t yask::yk_solution::get_rank_index ( const std::string &  dim)
    - +
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    +

    +
    + +

    ◆ get_rank_index_vec()

    + +
    +
    + + + + + +
    + + + + + + + +
    virtual idx_t_vec yask::yk_solution::get_rank_index_vec () const
    +
    +pure virtual
    +
    + +

    Get the rank index in all domain dimensions.

    +

    See get_rank_index().

    Returns
    Vector of zero-based indices of this rank in all domain dimensions.
    +
    @@ -966,7 +1512,7 @@

    Set kernel options from a string.

    -

    Parses the string for options as if from a command-line. Example: "-bx 64 -block_threads 4" sets the block-size in the x dimension to 64 and the number of threads used to process each block to 4. See the help message from the YASK kernel binary for documentation on the command-line options. Used to set less-common options not directly supported by the APIs above (set_block_size(), etc.).

    +

    Parses the string for options as if from a command-line. Example: "-bx 64 -inner_threads 4" sets the block-size in the x dimension to 64 and the number of nested OpenMP threads to 4. See the help message from the YASK kernel binary for documentation on the command-line options. Used to set less-common options not directly supported by the APIs above (set_block_size(), etc.).

    Returns
    Any parts of args that were not recognized by the parser as options. Thus, a non-empty returned string may be used to signal an error or interpreted by a custom application in another way.
    Parameters
    @@ -1017,8 +1563,8 @@

    -

    ◆ apply_command_line_options() [3/3]

    + +

    ◆ apply_command_line_options() [3/3]

    - + @@ -1071,7 +1617,7 @@

    Get the number of vars in the solution.

    -

    Vars may be pre-defined by the stencil compiler (e.g., via yc_solution::new_var()) or created explicitly via yk_solution::new_var() or yk_solution::new_fixed_size_var().

    Returns
    Number of YASK vars that have been created.
    +

    Vars may be pre-defined by the stencil compiler (e.g., via yc_solution::new_var()) or created explicitly via yk_solution::new_var() or yk_solution::new_fixed_size_var().

    Returns
    Number of YASK vars that have been created.
    @@ -1162,7 +1708,7 @@

    Prepare the solution for stencil application.

    -

    Allocates data in vars that do not already have storage allocated. Calculates the position of each rank in the overall problem domain. Sets many other data structures needed for proper stencil application. Since this function initiates MPI communication, it must be called on all MPI ranks, and it will block until all ranks have completed. Must be called before applying any stencils.

    +

    Calculates the position of each rank in the overall problem domain if not previously specified. Calculates the sizes of each rank if not previously specified. Allocates data in vars that do not already have storage allocated. Sets many other data structures needed for proper stencil application. Since this function initiates MPI communication, it must be called on all MPI ranks, and it will block until all ranks have completed. Must be called before applying any stencils.

    @@ -1176,7 +1722,7 @@

    (const std::vector< std::string > & const string_vec args)
    - + @@ -1196,11 +1742,39 @@

    Returns
    First domain index in this rank.
    Parameters

    virtual idx_t yask::yk_solution::get_first_rank_domain_index virtual idx_t yask::yk_solution::get_first_rank_domain_index ( const std::string &  dim)
    - +
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    + + + +

    ◆ get_first_rank_domain_index_vec()

    + +
    +
    + + + + + +
    + + + + + + + +
    virtual idx_t_vec yask::yk_solution::get_first_rank_domain_index_vec () const
    +
    +pure virtual
    +
    + +

    Get the first index of the sub-domain in this rank in all domain dimensions.

    +

    See get_first_rank_domain_index().

    Returns
    Vector of first domain indices of this rank in all domain dimensions.
    +
    @@ -1213,7 +1787,7 @@

    - + @@ -1233,11 +1807,39 @@

    Returns
    Last index in this rank.
    Parameters

    virtual idx_t yask::yk_solution::get_last_rank_domain_index virtual idx_t yask::yk_solution::get_last_rank_domain_index ( const std::string &  dim)
    - +
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    [in]dimName of dimension to get. Must be one of the names from get_domain_dim_names().
    + + + +

    ◆ get_last_rank_domain_index_vec()

    + +
    +
    + + + + + +
    + + + + + + + +
    virtual idx_t_vec yask::yk_solution::get_last_rank_domain_index_vec () const
    +
    +pure virtual
    +
    + +

    Get the last index of the sub-domain in this rank in all domain dimensions.

    +

    See get_last_rank_domain_index().

    Returns
    Vector of last domain indices of this rank in all domain dimensions.
    +
    @@ -1252,13 +1854,13 @@

    virtual void yask::yk_solution::run_solution ( - idx_t  + idx_t  first_step_index, - idx_t  + idx_t  last_step_index  @@ -1276,7 +1878,7 @@

    -
  • If temporal wave-front tiling is not used (the default):
      +
    • If temporal tiling is not used (the default):
      • The step index (e.g., t for "time") will be sequentially set to values from first_step_index to last_step_index, inclusive.
      • -
      • [Advanced] If temporal wave-front tiling is enabled via set_region_size():
          -
        • The step index (e.g., t for "time") will be sequentially set to values from first_step_index to last_step_index, inclusive, within each region. -

          If you want a var that is not automatically resized based on the solution settings, use new_fixed_size_var() instead.

          +

          If you want a var that is not automatically resized based on the solution settings, use new_fixed_size_var() instead.

          Note
          A new var contains only the meta-data for the var; data storage is not yet allocated. Storage may be allocated in any of the methods listed in the "Detailed Description" for yk_var.
          Returns
          Pointer to the new var.
          Parameters
          @@ -1782,7 +2366,7 @@

          [Advanced] Add a new var to the solution.

          -

          See documentation for the version of new_var() with a vector of dimension names as a parameter.

          Note
          This version is not available (or needed) in the Python API.
          +

          See documentation for the version of new_var() with a vector of dimension names as a parameter.

          Note
          This version is not available (or needed) in the Python API.
          Returns
          Pointer to the new var.
          Parameters
          @@ -1794,8 +2378,8 @@

          -

          ◆ new_fixed_size_var() [1/2]

          + +

          ◆ new_fixed_size_var() [1/2]

          @@ -1812,13 +2396,13 @@

          - + - + @@ -1836,20 +2420,20 @@

          new_var():

            +

            The following behaviors are different from both pre-defined vars and those created via new_var():

            • Calls to set_rank_domain_size() will not automatically resize the corresponding local-domain size in this var–this is where the term "fixed" applies.
            • In contrast, for each domain dimension of the var, the new var's local-domain size can be changed independently of the domain size of the solution.
            • This var's first domain index in this rank will be fixed at zero (0) in each domain dimension regardless of this rank's position. In other words, this var does not participate in "domain decomposition".
            • This var's padding size will be affected only by calls to yk_var::set_min_pad_size(), etc., i.e., not via yk_solution::set_min_pad_size().
            -

            The following behaviors are the same as those of a pre-defined var and those created via new_var():

              +

              The following behaviors are the same as those of a pre-defined var and those created via new_var():

              See yk_var::set_alloc_size().

              -

              The following behaviors are different than a pre-defined var but the same as those created via new_var():

                +

                The following behaviors are different from those of a pre-defined var but the same as those of vars created via new_var():

                Note
                A new var contains only the meta-data for the var; data storage is not yet allocated. Storage may be allocated in any of the methods listed in the "Detailed Description" for yk_var.
                Returns
                Pointer to the new var.
                @@ -1864,8 +2448,8 @@

                -

                ◆ new_fixed_size_var() [2/2]

                + +

                ◆ new_fixed_size_var() [2/2]

                @@ -1888,7 +2472,7 @@

          - + @@ -1905,7 +2489,7 @@

          [Advanced] Add a new var to the solution with a specified size.

          -

          See documentation for the version of new_fixed_size_var() with a vector of dimension names as a parameter.

          Note
          This version is not available (or needed) in the Python API.
          +

          See documentation for the version of new_fixed_size_var() with a vector of dimension names as a parameter.

          Note
          This version is not available (or needed) in the Python API.
          Returns
          Pointer to the new var.
          Parameters

          const std::vector< std::string > & const string_vec dims,
          const std::vector< idx_t > & const idx_t_vec dim_sizes 
          const std::initializer_list< idx_t > & const idx_t_init_list dim_sizes 
          diff --git a/docs/api/html/classyask_1_1yk__stats.html b/docs/api/html/classyask_1_1yk__stats.html index 073f7043..e39b35df 100644 --- a/docs/api/html/classyask_1_1yk__stats.html +++ b/docs/api/html/classyask_1_1yk__stats.html @@ -81,16 +81,16 @@
          - + - + - + - + @@ -111,7 +111,7 @@

          Public Member Functions

          virtual idx_t get_num_elements ()=0
          virtual idx_t get_num_elements ()=0
           Get the number of elements in the overall domain. More...
           
          virtual idx_t get_num_steps_done ()=0
          virtual idx_t get_num_steps_done ()=0
           Get the number of steps executed via run_solution(). More...
           
          virtual idx_t get_num_writes_done ()=0
          virtual idx_t get_num_writes_done ()=0
           Get the number of elements written across all steps. More...
           
          virtual idx_t get_est_fp_ops_done ()=0
          virtual idx_t get_est_fp_ops_done ()=0
           Get the estimated number of floating-point operations executed across all steps. More...
           
          virtual double get_elapsed_secs ()=0
          - + @@ -139,7 +139,7 @@

          virtual idx_t yask::yk_stats::get_num_elements virtual idx_t yask::yk_stats::get_num_elements ( )
          - + @@ -167,7 +167,7 @@

          virtual idx_t yask::yk_stats::get_num_steps_done virtual idx_t yask::yk_stats::get_num_steps_done ( )
          - + @@ -195,7 +195,7 @@

          virtual idx_t yask::yk_stats::get_num_writes_done virtual idx_t yask::yk_stats::get_num_writes_done ( )
          - + diff --git a/docs/api/html/classyask_1_1yk__var-members.html b/docs/api/html/classyask_1_1yk__var-members.html index 57af1dd1..eeb5bf83 100644 --- a/docs/api/html/classyask_1_1yk__var-members.html +++ b/docs/api/html/classyask_1_1yk__var-members.html @@ -73,73 +73,76 @@

          This is the complete list of members for yask::yk_var, including all inherited members.

          virtual idx_t yask::yk_stats::get_est_fp_ops_done virtual idx_t yask::yk_stats::get_est_fp_ops_done ( )
          - - + + - - - - - - - - - - - - - + + + + + + + + + + + + + - + - + + + - - + + - + - + + + - - - - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + +
          add_to_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          add_to_element(double val, const std::initializer_list< idx_t > &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          add_to_element(double val, const idx_t_vec &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          add_to_element(double val, const idx_t_init_list &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          alloc_storage()=0yask::yk_varpure virtual
          are_indices_local(const std::vector< idx_t > &indices) const =0yask::yk_varpure virtual
          are_indices_local(const std::initializer_list< idx_t > &indices) const =0yask::yk_varpure virtual
          format_indices(const std::vector< idx_t > &indices) const =0yask::yk_varpure virtual
          format_indices(const std::initializer_list< idx_t > &indices) const =0yask::yk_varpure virtual
          fuse_grids(yk_var_ptr source)yask::yk_varinline
          fuse_vars(yk_var_ptr source)=0yask::yk_varpure virtual
          get_alloc_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_dim_names() const =0yask::yk_varpure virtual
          get_element(const std::vector< idx_t > &indices) const =0yask::yk_varpure virtual
          get_element(const std::initializer_list< idx_t > &indices) const =0yask::yk_varpure virtual
          get_elements_in_slice(void *buffer_ptr, const std::vector< idx_t > &first_indices, const std::vector< idx_t > &last_indices) const =0yask::yk_varpure virtual
          get_extra_pad_size(const std::string &dim) constyask::yk_varinline
          get_first_local_index(const std::string &dim) const =0yask::yk_varpure virtual
          are_indices_local(const idx_t_vec &indices) const =0yask::yk_varpure virtual
          are_indices_local(const idx_t_init_list &indices) const =0yask::yk_varpure virtual
          format_indices(const idx_t_vec &indices) const =0yask::yk_varpure virtual
          format_indices(const idx_t_init_list &indices) const =0yask::yk_varpure virtual
          fuse_vars(yk_var_ptr source)=0yask::yk_varpure virtual
          get_alloc_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_alloc_size_vec() const =0yask::yk_varpure virtual
          get_dim_names() const =0yask::yk_varpure virtual
          get_element(const idx_t_vec &indices) const =0yask::yk_varpure virtual
          get_element(const idx_t_init_list &indices) const =0yask::yk_varpure virtual
          get_elements_in_slice(void *buffer_ptr, const idx_t_vec &first_indices, const idx_t_vec &last_indices) const =0yask::yk_varpure virtual
          get_first_local_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_first_local_index_vec() const =0yask::yk_varpure virtual
          get_first_misc_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_first_rank_alloc_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_first_rank_alloc_index(const std::string &dim) constyask::yk_varinlinevirtual
          get_first_rank_domain_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_first_rank_halo_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_first_rank_domain_index_vec() const =0yask::yk_varpure virtual
          get_first_rank_halo_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_first_rank_halo_index_vec() const =0yask::yk_varpure virtual
          get_first_valid_step_index() const =0yask::yk_varpure virtual
          get_halo_exchange_l1_norm() const =0yask::yk_varpure virtual
          get_halo_size(const std::string &dim) constyask::yk_varinline
          get_last_local_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_last_local_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_last_local_index_vec() const =0yask::yk_varpure virtual
          get_last_misc_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_last_rank_alloc_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_last_rank_alloc_index(const std::string &dim) constyask::yk_varinlinevirtual
          get_last_rank_domain_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_last_rank_halo_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_last_rank_domain_index_vec() const =0yask::yk_varpure virtual
          get_last_rank_halo_index(const std::string &dim) const =0yask::yk_varpure virtual
          get_last_rank_halo_index_vec() const =0yask::yk_varpure virtual
          get_last_valid_step_index() const =0yask::yk_varpure virtual
          get_left_extra_pad_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_left_halo_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_left_pad_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_name() const =0yask::yk_varpure virtual
          get_num_dims() const =0yask::yk_varpure virtual
          get_num_storage_bytes() const =0yask::yk_varpure virtual
          get_num_storage_elements() const =0yask::yk_varpure virtual
          get_numa_preferred() const =0yask::yk_varpure virtual
          get_pad_size(const std::string &dim) constyask::yk_varinline
          get_num_domain_dims() const =0yask::yk_varpure virtual
          get_num_storage_bytes() const =0yask::yk_varpure virtual
          get_num_storage_elements() const =0yask::yk_varpure virtual
          get_numa_preferred() const =0yask::yk_varpure virtual
          get_rank_domain_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_raw_storage_buffer()=0yask::yk_varpure virtual
          get_right_extra_pad_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_right_halo_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_right_pad_size(const std::string &dim) const =0yask::yk_varpure virtual
          is_dim_used(const std::string &dim) const =0yask::yk_varpure virtual
          is_dynamic_step_alloc() const =0yask::yk_varpure virtual
          is_element_allocated(const std::vector< idx_t > &indices) constyask::yk_varinline
          is_element_allocated(const std::initializer_list< idx_t > &indices) constyask::yk_varinline
          is_fixed_size() const =0yask::yk_varpure virtual
          is_storage_allocated() const =0yask::yk_varpure virtual
          is_storage_layout_identical(const yk_var_ptr other) const =0yask::yk_varpure virtual
          release_storage()=0yask::yk_varpure virtual
          set_all_elements_same(double val)=0yask::yk_varpure virtual
          set_alloc_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          set_element(double val, const std::initializer_list< idx_t > &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          set_elements_in_slice(const void *buffer_ptr, const std::vector< idx_t > &first_indices, const std::vector< idx_t > &last_indices)=0yask::yk_varpure virtual
          set_elements_in_slice_same(double val, const std::vector< idx_t > &first_indices, const std::vector< idx_t > &last_indices, bool strict_indices=true)=0yask::yk_varpure virtual
          set_first_misc_index(const std::string &dim, idx_t idx)=0yask::yk_varpure virtual
          set_halo_exchange_l1_norm(int norm)=0yask::yk_varpure virtual
          set_halo_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_left_halo_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_left_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_numa_preferred(int numa_node)=0yask::yk_varpure virtual
          set_right_halo_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_right_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          ~yk_var() (defined in yask::yk_var)yask::yk_varinlinevirtual
          get_rank_domain_size_vec() const =0yask::yk_varpure virtual
          get_raw_storage_buffer()=0yask::yk_varpure virtual
          get_right_extra_pad_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_right_halo_size(const std::string &dim) const =0yask::yk_varpure virtual
          get_right_pad_size(const std::string &dim) const =0yask::yk_varpure virtual
          is_dim_used(const std::string &dim) const =0yask::yk_varpure virtual
          is_dynamic_step_alloc() const =0yask::yk_varpure virtual
          is_fixed_size() const =0yask::yk_varpure virtual
          is_storage_allocated() const =0yask::yk_varpure virtual
          is_storage_layout_identical(const yk_var_ptr other) const =0yask::yk_varpure virtual
          release_storage()=0yask::yk_varpure virtual
          set_all_elements_same(double val)=0yask::yk_varpure virtual
          set_alloc_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_element(double val, const idx_t_vec &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          set_element(double val, const idx_t_init_list &indices, bool strict_indices=true)=0yask::yk_varpure virtual
          set_elements_in_slice(const void *buffer_ptr, const idx_t_vec &first_indices, const idx_t_vec &last_indices)=0yask::yk_varpure virtual
          set_elements_in_slice_same(double val, const idx_t_vec &first_indices, const idx_t_vec &last_indices, bool strict_indices=true)=0yask::yk_varpure virtual
          set_first_misc_index(const std::string &dim, idx_t idx)=0yask::yk_varpure virtual
          set_halo_exchange_l1_norm(int norm)=0yask::yk_varpure virtual
          set_halo_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_left_halo_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_left_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_numa_preferred(int numa_node)=0yask::yk_varpure virtual
          set_right_halo_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          set_right_min_pad_size(const std::string &dim, idx_t size)=0yask::yk_varpure virtual
          ~yk_var() (defined in yask::yk_var)yask::yk_varinlinevirtual
          + + + +

          ◆ get_first_local_index_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_first_local_index_vec () const
          +
          +pure virtual
          +
          + +

          Get the first valid index in this rank in all dimensions in this var.

          +

          See get_first_local_index().

          Returns
          vector of first valid indices.
          +
          @@ -533,7 +596,7 @@

          - + @@ -548,15 +611,43 @@

          Get the last index in this rank in the specified dimension.

          -

          This is a convenience function that provides the last possible index in any var dimension regardless of the dimension type. It is equivalent to get_last_rank_alloc_index(dim) when dim is a domain dimension, get_last_misc_index(dim) for a misc dimension, and get_last_valid_step_index() for the step dimension.

          Note
          This function should be called only after calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain.
          +

          This is a convenience function that provides the last possible index in any var dimension regardless of the dimension type. If dim is a domain dimension, returns the last accessible index in the right padding area. It is equivalent to get_last_misc_index(dim) for a misc dimension, and get_last_valid_step_index() for the step dimension.

          Note
          This function should be called only after calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain.
          Returns
          the last valid index.
          Parameters

          virtual idx_t yask::yk_var::get_last_local_index virtual idx_t yask::yk_var::get_last_local_index ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from get_dim_names().
          [in]dimName of dimension to get. Must be one of the names from get_dim_names().

          + + + +

          ◆ get_last_local_index_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_last_local_index_vec () const
          +
          +pure virtual
          +
          + +

          Get the last valid index in this rank in all dimensions in this var.

          +

          See get_last_local_index().

          Returns
          vector of last valid indices.
          +
          @@ -569,7 +660,7 @@

          - + @@ -584,14 +675,42 @@

          Get the number of elements allocated in the specified dimension.

          -

          For the domain dimensions, this includes the rank-domain and padding sizes. See the "Detailed Description" for yk_var for information on var sizes. For any dimension dim, get_alloc_size(dim) == get_last_local_index(dim) - get_first_local_index(dim) + 1;

          Returns
          allocation in number of elements (not bytes).
          +

          For the domain dimensions, this includes the rank-domain and padding sizes. See the "Detailed Description" for yk_var for information on var sizes. For any dimension dim, get_alloc_size(dim) == get_last_local_index(dim) - get_first_local_index(dim) + 1;

          Returns
          allocation size in number of elements (not bytes).
          Parameters

          virtual idx_t yask::yk_var::get_alloc_size virtual idx_t yask::yk_var::get_alloc_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from get_dim_names().
          [in]dimName of dimension to get. Must be one of the names from get_dim_names().
          + + + +

          ◆ get_alloc_size_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_alloc_size_vec () const
          +
          +pure virtual
          +
          + +

          Get the number of elements allocated in all dimensions in this var.

          +

          See get_alloc_size().

          Returns
          vector of allocation sizes in number of elements (not bytes).
          +
          @@ -604,7 +723,7 @@

          - + @@ -632,7 +751,7 @@

          virtual idx_t yask::yk_var::get_first_valid_step_index virtual idx_t yask::yk_var::get_first_valid_step_index ( ) const
          - + @@ -660,7 +779,7 @@

          virtual idx_t yask::yk_var::get_last_valid_step_index virtual idx_t yask::yk_var::get_last_valid_step_index ( ) const
          - + @@ -674,15 +793,44 @@

          -

          Get the domain size for this rank.

          -
          Returns
          The same value as yk_solution::get_rank_domain_size() if is_fixed_size() returns false or the fixed sized provided via yk_solution::new_fixed_size_var() otherwise.
          +

          Get the domain size for this rank in the specified dimension.

          +
          Note
          This function should be called only after calling prepare_solution() because prepare_solution() assigns this rank's size.
          +
          Returns
          The same value as yk_solution::get_rank_domain_size() if is_fixed_size() returns false or the fixed size provided via yk_solution::new_fixed_size_var() otherwise.
          Parameters

          virtual idx_t yask::yk_var::get_rank_domain_size virtual idx_t yask::yk_var::get_rank_domain_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          + + + +

          ◆ get_rank_domain_size_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_rank_domain_size_vec () const
          +
          +pure virtual
          +
          + +

          Get the domain size for this rank in all domain dimensions in this var.

          +

          See get_rank_domain_size().

          Returns
          vector of values, one for each domain dimension in this var.
          +
          @@ -695,7 +843,7 @@

          - + @@ -710,15 +858,43 @@

          Get the first index of the sub-domain in this rank in the specified dimension.

          -
          Note
          This function should be called only after calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain.
          +

          Does not include indices of padding area.

          Note
          This function should be called only after calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain.
          Returns
          The same value as yk_solution::get_first_rank_domain_index() if is_fixed_size() returns false or zero (0) otherwise.
          Parameters

          virtual idx_t yask::yk_var::get_first_rank_domain_index virtual idx_t yask::yk_var::get_first_rank_domain_index ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          + + + +

          ◆ get_first_rank_domain_index_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_first_rank_domain_index_vec () const
          +
          +pure virtual
          +
          + +

          Get the first index of the sub-domain in this rank in all domain dimensions in this var.

          +

          See get_first_rank_domain_index().

          Returns
          vector of values, one for each domain dimension in this var.
          +
          @@ -731,7 +907,7 @@

          - + @@ -746,15 +922,43 @@

          Get the last index of the sub-domain in this rank in the specified dimension.

          -
          Note
          This function should be called only after calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain.
          -
          Returns
          The same value as yk_solution::get_last_rank_domain_index() if is_fixed_size() returns false or one less than the fixed sized provided via yk_solution::new_fixed_size_var() otherwise.
          +

          Does not include indices of padding area.

          Note
          This function should be called only after calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain.
          +
          Returns
          The same value as yk_solution::get_last_rank_domain_index() if is_fixed_size() returns false or one less than the fixed size provided via yk_solution::new_fixed_size_var() otherwise.
          Parameters

          virtual idx_t yask::yk_var::get_last_rank_domain_index virtual idx_t yask::yk_var::get_last_rank_domain_index ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          + + + +

          ◆ get_last_rank_domain_index_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_last_rank_domain_index_vec () const
          +
          +pure virtual
          +
          + +

          Get the last index of the sub-domain in this rank in all domain dimensions in this var.

          +

          See get_last_rank_domain_index().

          Returns
          vector of values, one for each domain dimension in this var.
          +
          @@ -767,7 +971,7 @@

          - + @@ -785,7 +989,7 @@

          Returns
          Elements in halo in given dimension before the domain.
          Parameters

          virtual idx_t yask::yk_var::get_left_halo_size virtual idx_t yask::yk_var::get_left_halo_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          @@ -802,7 +1006,7 @@

          - + @@ -820,7 +1024,7 @@

          Returns
          Elements in halo in given dimension after the domain.
          Parameters

          virtual idx_t yask::yk_var::get_right_halo_size virtual idx_t yask::yk_var::get_right_halo_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          @@ -837,7 +1041,7 @@

          - + @@ -856,11 +1060,39 @@

          Returns
          The first index of left halo in this rank or the same value as yk_var::get_first_rank_domain_index() if the left halo has zero size.
          Parameters

          virtual idx_t yask::yk_var::get_first_rank_halo_index virtual idx_t yask::yk_var::get_first_rank_halo_index ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          + + + +

          ◆ get_first_rank_halo_index_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_first_rank_halo_index_vec () const
          +
          +pure virtual
          +
          + +

          Get the first index of the left halo in this rank in all domain dimensions in this var.

          +

          See get_first_rank_halo_index().

          Returns
          vector of values, one for each domain dimension in this var.
          +
          @@ -873,7 +1105,7 @@

          - + @@ -892,11 +1124,39 @@

          Returns
          The last index of right halo in this rank or the same value as yk_var::get_last_rank_domain_index() if the right halo has zero size.
          Parameters

          virtual idx_t yask::yk_var::get_last_rank_halo_index virtual idx_t yask::yk_var::get_last_rank_halo_index ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          + + + +

          ◆ get_last_rank_halo_index_vec()

          + +
          +
          + + + + + +
          + + + + + + + +
          virtual idx_t_vec yask::yk_var::get_last_rank_halo_index_vec () const
          +
          +pure virtual
          +
          + +

          Get the last index of the right halo in this rank in all domain dimensions in this var.

          +

          See get_last_rank_halo_index().

          Returns
          vector of values, one for each domain dimension in this var.
          +
          @@ -909,7 +1169,7 @@

          - + @@ -927,7 +1187,7 @@

          set_left_min_pad_size(), etc. due to rounding.

          Returns
          Elements in left padding in given dimension.
          Parameters

          virtual idx_t yask::yk_var::get_left_pad_size virtual idx_t yask::yk_var::get_left_pad_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          @@ -944,7 +1204,7 @@

          - + @@ -962,7 +1222,7 @@

          set_right_min_pad_size(), etc. due to rounding.

          Returns
          Elements in right padding in given dimension.
          Parameters

          virtual idx_t yask::yk_var::get_right_pad_size virtual idx_t yask::yk_var::get_right_pad_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          @@ -979,7 +1239,7 @@

          - + @@ -997,7 +1257,7 @@

          Returns
          Elements in padding in given dimension before the left halo area.
          Parameters

          virtual idx_t yask::yk_var::get_left_extra_pad_size virtual idx_t yask::yk_var::get_left_extra_pad_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          @@ -1014,7 +1274,7 @@

          - + @@ -1032,7 +1292,7 @@

          Returns
          Elements in padding in given dimension after the right halo area.
          Parameters

          virtual idx_t yask::yk_var::get_right_extra_pad_size virtual idx_t yask::yk_var::get_right_extra_pad_size ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          @@ -1049,7 +1309,7 @@

          - + @@ -1067,7 +1327,7 @@

          Returns
          the first valid index in a non-step and non-domain dimension.
          Parameters

          virtual idx_t yask::yk_var::get_first_misc_index virtual idx_t yask::yk_var::get_first_misc_index ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_misc_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_misc_dim_names().
          @@ -1084,7 +1344,7 @@

          - + @@ -1102,15 +1362,15 @@

          Returns
          the last valid index in a non-step and non-domain dimension.
          Parameters

          virtual idx_t yask::yk_var::get_last_misc_index virtual idx_t yask::yk_var::get_last_misc_index ( const std::string &  dim)
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_misc_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_misc_dim_names().
          - -

          ◆ are_indices_local() [1/2]

          + +

          ◆ are_indices_local() [1/2]

          @@ -1121,7 +1381,7 @@

          virtual bool yask::yk_var::are_indices_local ( - const std::vector< idx_t > &  + const idx_t_vecindices) const @@ -1134,7 +1394,7 @@

          Determine whether the given indices refer to an accessible element in this rank.

          -

          Provide indices in a list in the same order returned by get_dim_names() for this var. Domain index values are relative to the overall problem domain.

          Returns
          true if index values fall within the range returned by get_first_local_index(dim) and get_last_local_index(dim) for each dimension dim in the var; false otherwise.
          +

          Provide indices in a list in the same order returned by get_dim_names() for this var. Domain index values are relative to the overall problem domain.

          Returns
          true if index values fall within the range returned by get_first_local_index(dim) and get_last_local_index(dim) for each dimension dim in the var; false otherwise.
          Parameters
          @@ -1144,8 +1404,8 @@

          -

          ◆ are_indices_local() [2/2]

          + +

          ◆ are_indices_local() [2/2]

          - + @@ -1169,7 +1429,7 @@

          Determine whether the given indices refer to an accessible element in this rank.

          -

          See get_last_misc_index().

          +

          See are_indices_local().

          Parameters

          [in]indicesList of indices, one for each var dimension.
          (const std::initializer_list< idx_t > & const idx_t_init_list indices) const
          @@ -1179,8 +1439,8 @@

          -

          ◆ get_element() [1/2]

          + +

          ◆ get_element() [1/2]

          - + @@ -1204,7 +1464,7 @@

          Read the value of one element in this var.

          -

          Provide indices in a list in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive, for each dimension in the var.

          Returns
          value in var at given indices.
          +

          Provide indices in a list in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive, for each dimension in the var.

          Returns
          value in var at given indices.
          Parameters

          [in]indicesList of indices, one for each var dimension.
          (const std::vector< idx_t > & const idx_t_vec indices) const
          @@ -1214,8 +1474,8 @@

          -

          ◆ get_element() [2/2]

          + +

          ◆ get_element() [2/2]

          - + @@ -1239,7 +1499,7 @@

          Read the value of one element in this var.

          -

          See get_element().

          Returns
          value in var at given indices.
          +

          See get_element().

          Returns
          value in var at given indices.
          Parameters

          [in]indicesList of indices, one for each var dimension.
          (const std::initializer_list< idx_t > & const idx_t_init_list indices) const
          @@ -1249,8 +1509,8 @@

          -

          ◆ set_element() [1/2]

          + +

          ◆ set_element() [1/2]

          @@ -1259,7 +1519,7 @@

          [in]indicesList of indices, one for each var dimension.
          - + @@ -1267,7 +1527,7 @@

          - + @@ -1290,7 +1550,7 @@

          Set the value of one element in this var.

          -

          Provide indices in a list in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. If the var uses the step dimension, the value of the step index will be used to update the current valid step indices in the var. If strict_indices is false and any non-step index values are invalid as defined by are_indices_local(), the API will have no effect and return zero (0). If strict_indices is true and any non-step index values are invalid, the API will throw an exception. If storage has not been allocated for this var, this will have no effect and return zero (0) if strict_indices is false, or it will throw an exception if strict_indices is true.

          Note
          The parameter value is a double-precision floating-point value, but it will be converted to single-precision if yk_solution::get_element_bytes() returns 4.
          +

          Provide indices in a list in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. If the var uses the step dimension, the value of the step index will be used to update the current valid step indices in the var. If strict_indices is false and any non-step index values are invalid as defined by are_indices_local(), the API will have no effect and return zero (0). If strict_indices is true and any non-step index values are invalid, the API will throw an exception. If storage has not been allocated for this var, this will have no effect and return zero (0) if strict_indices is false, or it will throw an exception if strict_indices is true.

          Note
          The parameter value is a double-precision floating-point value, but it will be converted to single-precision if yk_solution::get_element_bytes() returns 4.
          Returns
          Number of elements set, which will be one (1) if the indices are valid and zero (0) if they are not.
          Parameters

          virtual idx_t yask::yk_var::set_element virtual idx_t yask::yk_var::set_element ( double  val, const std::vector< idx_t > & const idx_t_vec indices,
          @@ -1303,8 +1563,8 @@

          -

          ◆ set_element() [2/2]

          + +

          ◆ set_element() [2/2]

          @@ -1313,7 +1573,7 @@

          - + @@ -1321,7 +1581,7 @@

          - + @@ -1344,7 +1604,7 @@

          Set the value of one element in this var.

          -

          See set_element().

          Returns
          Number of elements set.
          +

          See set_element().

          Returns
          Number of elements set.
          Parameters

          virtual idx_t yask::yk_var::set_element virtual idx_t yask::yk_var::set_element ( double  val, const std::initializer_list< idx_t > & const idx_t_init_list indices,
          @@ -1356,8 +1616,8 @@

          -

          ◆ get_elements_in_slice()

          + +

          ◆ get_elements_in_slice()

          @@ -1366,7 +1626,7 @@

          [in]valElement in var will be set to this.
          - + @@ -1374,13 +1634,13 @@

          - + - + @@ -1398,7 +1658,7 @@

          yk_solution::get_element_bytes() multiplied by the number of elements in the specified slice. Since the reads proceed in row-major order, the last index is "unit-stride" in the buffer.

          -

          Provide indices in two lists in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive.

          Returns
          Number of elements read.
          +

          Provide indices in two lists in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive.

          Returns
          Number of elements read.
          Parameters

          virtual idx_t yask::yk_var::get_elements_in_slice virtual idx_t yask::yk_var::get_elements_in_slice ( void *  buffer_ptr, const std::vector< idx_t > & const idx_t_vec first_indices,
          const std::vector< idx_t > & const idx_t_vec last_indices 
          @@ -1410,8 +1670,8 @@

          -

          ◆ add_to_element() [1/2]

          + +

          ◆ add_to_element() [1/2]

          @@ -1420,7 +1680,7 @@

          [out]buffer_ptrPointer to buffer where values will be written.
          - + @@ -1428,7 +1688,7 @@

          - + @@ -1451,7 +1711,7 @@

          Atomically add to the value of one var element.

          -

          Provide indices in a list in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive. Updates are OpenMP atomic, meaning that this function can be called by several OpenMP threads without causing a race condition. If storage has not been allocated for this var, this will have no effect and return zero (0) if strict_indices is false, or it will throw an exception if strict_indices is true.

          Note
          The parameter value is a double-precision floating-point value, but it will be converted to single-precision if yk_solution::get_element_bytes() returns 4.
          +

          Provide indices in a list in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive. Updates are OpenMP atomic, meaning that this function can be called by several OpenMP threads without causing a race condition. If storage has not been allocated for this var, this will have no effect and return zero (0) if strict_indices is false, or it will throw an exception if strict_indices is true.

          Note
          The parameter value is a double-precision floating-point value, but it will be converted to single-precision if yk_solution::get_element_bytes() returns 4.
          Returns
          Number of elements updated.
          Parameters

          virtual idx_t yask::yk_var::add_to_element virtual idx_t yask::yk_var::add_to_element ( double  val, const std::vector< idx_t > & const idx_t_vec indices,
          @@ -1464,8 +1724,8 @@

          -

          ◆ add_to_element() [2/2]

          + +

          ◆ add_to_element() [2/2]

          @@ -1474,7 +1734,7 @@

          - + @@ -1482,7 +1742,7 @@

          - + @@ -1505,7 +1765,7 @@

          Atomically add to the value of one var element.

          -

          See add_to_element().

          Returns
          Number of elements set.
          +

          See add_to_element().

          Returns
          Number of elements set.
          Parameters

          virtual idx_t yask::yk_var::add_to_element virtual idx_t yask::yk_var::add_to_element ( double  val, const std::initializer_list< idx_t > & const idx_t_init_list indices,
          @@ -1552,8 +1812,8 @@

          -

          ◆ set_elements_in_slice_same()

          + +

          ◆ set_elements_in_slice_same()

          @@ -1562,7 +1822,7 @@

          [in]valThis value will be added to element in var.
          - + @@ -1570,13 +1830,13 @@

          - + - + @@ -1599,7 +1859,7 @@

          Initialize var elements within specified subset of the var to the same value.

          -

          Sets all elements from first_indices to last_indices in each dimension to the specified value. Provide indices in two lists in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive, if strict_indices is true. If storage has not been allocated for this var, this will have no effect and return zero (0) if strict_indices is false, or it will throw an exception if strict_indices is true.

          Returns
          Number of elements set.
          +

          Sets all elements from first_indices to last_indices in each dimension to the specified value. Provide indices in two lists in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive, if strict_indices is true. If storage has not been allocated for this var, this will have no effect and return zero (0) if strict_indices is false, or it will throw an exception if strict_indices is true.

          Returns
          Number of elements set.
          Parameters

          virtual idx_t yask::yk_var::set_elements_in_slice_same virtual idx_t yask::yk_var::set_elements_in_slice_same ( double  val, const std::vector< idx_t > & const idx_t_vec first_indices,
          const std::vector< idx_t > & const idx_t_vec last_indices,
          @@ -1612,8 +1872,8 @@

          -

          ◆ set_elements_in_slice()

          + +

          ◆ set_elements_in_slice()

          @@ -1622,7 +1882,7 @@

          [in]valAll elements in the slice will be set to this.
          - + @@ -1630,13 +1890,13 @@

          - + - + @@ -1653,7 +1913,7 @@

          Set var elements within specified subset of the var from values in a buffer.

          -

          Reads elements from consecutive memory locations, starting at buffer_ptr and writes them from first_indices to last_indices in each dimension. Indices in the buffer progress in row-major order. The buffer pointed to must contain either 4 or 8 byte FP values per element in the subset, depending on the FP precision of the solution. The buffer pointed to must contain the number of FP values in the specified slice, where each FP value is the size of yk_solution::get_element_bytes(). Since the writes proceed in row-major order, the last index is "unit-stride" in the buffer. Provide indices in two lists in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive. If storage has not been allocated for this var, this will throw an exception.

          Returns
          Number of elements written.
          +

          Reads elements from consecutive memory locations, starting at buffer_ptr and writes them from first_indices to last_indices in each dimension. Indices in the buffer progress in row-major order. The buffer pointed to must contain either 4 or 8 byte FP values per element in the subset, depending on the FP precision of the solution. The buffer pointed to must contain the number of FP values in the specified slice, where each FP value is the size of yk_solution::get_element_bytes(). Since the writes proceed in row-major order, the last index is "unit-stride" in the buffer. Provide indices in two lists in the same order returned by get_dim_names(). Indices are relative to the overall problem domain. Index values must fall between the values returned by get_first_local_index() and get_last_local_index(), inclusive. If storage has not been allocated for this var, this will throw an exception.

          Returns
          Number of elements written.
          Parameters

          virtual idx_t yask::yk_var::set_elements_in_slice virtual idx_t yask::yk_var::set_elements_in_slice ( const void *  buffer_ptr, const std::vector< idx_t > & const idx_t_vec first_indices,
          const std::vector< idx_t > & const idx_t_vec last_indices 
          @@ -1665,8 +1925,8 @@

          -

          ◆ format_indices() [1/2]

          + +

          ◆ format_indices() [1/2]

          - + @@ -1689,8 +1949,8 @@

          -

          Format the indices for pretty-printing.

          -

          Provide indices in a list in the same order returned by get_dim_names().

          Returns
          A string containing the var name and the index values.
          +

          Format the indices for human-readable display.

          +

          Provide indices in a list in the same order returned by get_dim_names().

          Returns
          A string containing the var name and the index values.
          Parameters

          [out]buffer_ptrPointer to buffer where values will be read.
          (const std::vector< idx_t > & const idx_t_vec indices) const
          @@ -1700,8 +1960,8 @@

          -

          ◆ format_indices() [2/2]

          + +

          ◆ format_indices() [2/2]

          - + @@ -1724,8 +1984,8 @@

          -

          Format the indices for pretty-printing.

          -

          See format_indices().

          Returns
          A string containing the var name and the index values.
          +

          Format the indices for human-readable display.

          +

          See format_indices().

          Returns
          A string containing the var name and the index values.
          Parameters

          [in]indicesList of indices, one for each var dimension.
          (const std::initializer_list< idx_t > & const idx_t_init_list indices) const
          @@ -1761,7 +2021,7 @@

          each domain dimension can be only zero or one, so the sum can range from zero to the number of domain dimensions.

          Examples for a domain size with 2 spatial dimensions (e.g., "x" and "y"): L1-norm = 0: no halos are exchanged for this var. L1-norm = 1: halos are exchanged between "up", "down", "left" and "right" neighbors. L1-norm = 2: halos are exchanged as above plus diagonal neighbors.

          -

          The actual exchanges are further controlled by the size of the halo in each direction per get_halo_size().

          +

          The actual exchanges are further controlled by the size of the halo in each direction per get_halo_size().

          Returns
          L1-norm, ranging from zero to number of domain dimensions.
          @@ -1791,7 +2051,7 @@

          [Advanced] Set the maximum L1-norm of a neighbor rank for halo exchange.

          -
          See also
          get_halo_exchange_l1_norm().
          +

          This should only be used to override the value calculated automatically by the YASK compiler.

          See also
          get_halo_exchange_l1_norm().
          Parameters

          [in]indicesList of indices, one for each var dimension.
          @@ -1910,7 +2170,7 @@

          - + @@ -1931,7 +2191,7 @@

          get_left_pad_size() to determine the actual padding size for the var. See additional behavior related to setting pad size under yk_solution::set_min_pad_size(). See the "Detailed Description" for yk_var for information on var sizes.

          Parameters

          [in]normMaximum L1-norm of neighbor rank with which to exchange halos.
          idx_t idx_t  size 
          - +
          [in]dimName of dimension to set. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to set. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]sizeMinimum number of elements to allocate before the domain size.
          @@ -1957,7 +2217,7 @@

          - idx_t  + idx_t  size  @@ -1978,7 +2238,7 @@

          get_right_pad_size() to determine the actual padding size for the var. See additional behavior related to setting pad size under yk_solution::set_min_pad_size(). See the "Detailed Description" for yk_var for information on var sizes.

          Parameters
          - +
          [in]dimName of dimension to set. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to set. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]sizeMinimum number of elements to allocate after the domain size.
          @@ -2004,7 +2264,7 @@

          - idx_t  + idx_t  size  @@ -2024,7 +2284,7 @@

          set_left_min_pad_size() and set_right_min_pad_size().

          Parameters
          - +
          [in]dimName of dimension to set. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to set. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]sizeMinimum number of elements to allocate before and after the domain size.
          @@ -2050,7 +2310,7 @@

          - idx_t  + idx_t  size  @@ -2070,7 +2330,7 @@

          Note
          After data storage has been allocated, the left halo size can only be set to a value less than or equal to the left padding size in the given dimension.

          Parameters
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]sizeNumber of elements in the left halo.
          @@ -2096,7 +2356,7 @@

          - idx_t  + idx_t  size  @@ -2116,7 +2376,7 @@

          Note
          After data storage has been allocated, the right halo size can only be set to a value less than or equal to the right padding size in the given dimension.

          Parameters
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]sizeNumber of elements in the right halo.
          @@ -2142,7 +2402,7 @@

          - idx_t  + idx_t  size  @@ -2162,7 +2422,7 @@

          Parameters
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          [in]sizeNumber of elements in the halo.
          @@ -2188,7 +2448,7 @@

          - idx_t  + idx_t  size  @@ -2214,15 +2474,15 @@

          Compile-time yc_solution::new_var() + yc_var::set_dynamic_step_alloc (true) [1] Yes No Yes [2] -Run-time yk_solution::new_var() Yes No Yes +Run-time yk_solution::new_var() Yes No Yes -Run-time yk_solution::new_fixed_size_var() [3] Yes Yes Yes +Run-time yk_solution::new_fixed_size_var() [3] Yes Yes Yes
          Note
          [1] By default, variables created via yc_solution::new_var() do not allow dynamic step allocation.
          [2] Misc dim allocations cannot be changed for compile-time vars if the YASK compiler was run with the "-interleave-misc" option.
          -[3] The term "fixed" in yk_solution::new_fixed_size_var() means that the domain size will not change automatically when its solution domain size changes. It does not mean that the sizes cannot be changed via the APIs–quite the opposite.
          +[3] The term "fixed" in yk_solution::new_fixed_size_var() means that the domain size will not change automatically when its solution domain size changes. It does not mean that the sizes cannot be changed via the APIs–quite the opposite.

          The allocation size cannot be changed after data storage has been allocated for this var.

          Parameters
          @@ -2252,7 +2512,7 @@

          - + @@ -2272,82 +2532,12 @@

          set_alloc_size() minus one.

          Parameters

          idx_t idx_t  idx 
          - +
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_misc_dim_names().
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_misc_dim_names().
          [in]idxNew value for first index. May be negative.
          -

          -
          - -

          ◆ get_first_rank_alloc_index()

          - -
          -
          - - - - - -
          - - - - - - - - -
          virtual idx_t yask::yk_var::get_first_rank_alloc_index (const std::string & dim) const
          -
          -pure virtual
          -
          - -

          [Advanced] Get the first accessible index in this var in this rank in the specified domain dimension.

          -

          Equivalent to get_first_local_index(dim), where dim is a domain dimension.

          Returns
          First valid index in this var.
          -
          Parameters
          - - -
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          -
          -
          - -
          -
          - -

          ◆ get_last_rank_alloc_index()

          - -
          -
          - - - - - -
          - - - - - - - - -
          virtual idx_t yask::yk_var::get_last_rank_alloc_index (const std::string & dim) const
          -
          -pure virtual
          -
          - -

          [Advanced] Get the last accessible index in this var in this rank in the specified domain dimension.

          -

          Equivalent to get_last_local_index(dim), where dim is a domain dimension.

          Returns
          Last valid index in this var.
          -
          Parameters
          - - -
          [in]dimName of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names().
          -
          -
          -
          @@ -2388,7 +2578,7 @@

          - + @@ -2416,7 +2606,7 @@

          virtual idx_t yask::yk_var::get_num_storage_bytes virtual idx_t yask::yk_var::get_num_storage_bytes ( ) const
          - + @@ -2604,85 +2794,18 @@

          Returns
          Pointer to raw data storage if is_storage_allocated() returns true or NULL otherwise.
          - - - -

          ◆ is_element_allocated() [1/2]

          - -
          -
          -
          virtual idx_t yask::yk_var::get_num_storage_elements virtual idx_t yask::yk_var::get_num_storage_elements ( ) const
          - - - - -
          - - - - - - - - -
          bool yask::yk_var::is_element_allocated (const std::vector< idx_t > & indices) const
          -
          -inline
          -
          - -

          [Deprecated] Use are_indices_local().

          -
          Parameters
          - - -
          [in]indicesList of indices, one for each var dimension.
          -
          -
          - -
          - - -

          ◆ is_element_allocated() [2/2]

          - -
          -
          - - - - - -
          - - - - - - - - -
          bool yask::yk_var::is_element_allocated (const std::initializer_list< idx_t > & indices) const
          -
          -inline
          -
          - -

          [Deprecated] Use are_indices_local().

          -
          Parameters
          - - -
          [in]indicesList of indices, one for each var dimension.
          -
          -
          -

          The documentation for this class was generated from the following file:
            diff --git a/docs/api/html/functions_a.html b/docs/api/html/functions_a.html index 923da1ac..c7276282 100644 --- a/docs/api/html/functions_a.html +++ b/docs/api/html/functions_a.html @@ -75,7 +75,7 @@

            - a -

              : yask::yc_commutative_number_node
            • add_to_element() -: yask::yk_var +: yask::yk_var
            • alloc_storage() : yask::yk_var @@ -84,7 +84,7 @@

              - a -

              diff --git a/docs/api/html/functions_c.html b/docs/api/html/functions_c.html index 20a8ba9c..f6499e51 100644 --- a/docs/api/html/functions_c.html +++ b/docs/api/html/functions_c.html @@ -100,6 +100,12 @@

              - c -

              diff --git a/docs/api/html/functions_d.html b/docs/api/html/functions_d.html index 95dba8b8..af6467f4 100644 --- a/docs/api/html/functions_d.html +++ b/docs/api/html/functions_d.html @@ -69,6 +69,9 @@

              - d -

                : yask::yc_solution_base , yask::yc_solution_with_radius_base +
              • disable_debug_output() +: yask::yk_env +
              • discard() : yask::yask_string_output
              • diff --git a/docs/api/html/functions_f.html b/docs/api/html/functions_f.html index 97ef8d00..ebd6ac62 100644 --- a/docs/api/html/functions_f.html +++ b/docs/api/html/functions_f.html @@ -69,17 +69,16 @@

                - f -

                  : yask::yc_solution_base
                • format() -: yask::yc_solution +: yask::yc_solution
                • format_indices() -: yask::yk_var +: yask::yk_var
                • format_simple() : yask::yc_expr_node
                • fuse_grids() -: yask::yk_solution -, yask::yk_var +: yask::yk_solution
                • fuse_vars() : yask::yk_solution diff --git a/docs/api/html/functions_func_a.html b/docs/api/html/functions_func_a.html index b607cdb6..dae5d110 100644 --- a/docs/api/html/functions_func_a.html +++ b/docs/api/html/functions_func_a.html @@ -75,7 +75,7 @@

                  - a -

                    : yask::yc_commutative_number_node
                  • add_to_element() -: yask::yk_var +: yask::yk_var
                  • alloc_storage() : yask::yk_var @@ -84,7 +84,7 @@

                    - a -

                    diff --git a/docs/api/html/functions_func_c.html b/docs/api/html/functions_func_c.html index ea915c74..03fe6df7 100644 --- a/docs/api/html/functions_func_c.html +++ b/docs/api/html/functions_func_c.html @@ -100,6 +100,12 @@

                    - c -

                    diff --git a/docs/api/html/functions_func_d.html b/docs/api/html/functions_func_d.html index 7f0f4d59..8a3ce948 100644 --- a/docs/api/html/functions_func_d.html +++ b/docs/api/html/functions_func_d.html @@ -69,6 +69,9 @@

                    - d -

                      : yask::yc_solution_base , yask::yc_solution_with_radius_base +
                    • disable_debug_output() +: yask::yk_env +
                    • discard() : yask::yask_string_output
                    • diff --git a/docs/api/html/functions_func_f.html b/docs/api/html/functions_func_f.html index 455549ad..b0d7aef0 100644 --- a/docs/api/html/functions_func_f.html +++ b/docs/api/html/functions_func_f.html @@ -69,17 +69,16 @@

                      - f -

                        : yask::yc_solution_base
                      • format() -: yask::yc_solution +: yask::yc_solution
                      • format_indices() -: yask::yk_var +: yask::yk_var
                      • format_simple() : yask::yc_expr_node
                      • fuse_grids() -: yask::yk_solution -, yask::yk_var +: yask::yk_solution
                      • fuse_vars() : yask::yk_solution diff --git a/docs/api/html/functions_func_g.html b/docs/api/html/functions_func_g.html index 28c8b082..807d5abe 100644 --- a/docs/api/html/functions_func_g.html +++ b/docs/api/html/functions_func_g.html @@ -68,14 +68,20 @@

                        - g -

                        • get_alloc_size() : yask::yk_var
                        • +
                        • get_alloc_size_vec() +: yask::yk_var +
                        • get_block_size() : yask::yk_solution
                        • +
                        • get_block_size_vec() +: yask::yk_solution +
                        • get_cond() : yask::yc_equation_node
                        • get_debug_output() -: yask::yk_env +: yask::yk_env
                        • get_default_numa_preferred() : yask::yk_solution @@ -84,24 +90,24 @@

                          - g -

                            : yask::yc_solution
                          • get_dim_names() -: yask::yc_var -, yask::yk_var +: yask::yc_var +, yask::yk_var
                          • get_domain_dim_names() -: yask::yk_solution +: yask::yk_solution
                          • get_elapsed_secs() : yask::yk_stats
                          • get_element() -: yask::yk_var +: yask::yk_var
                          • get_element_bytes() : yask::yc_solution , yask::yk_solution
                          • get_elements_in_slice() -: yask::yk_var +: yask::yk_var
                          • get_equations() : yask::yc_solution @@ -109,62 +115,76 @@

                            - g -

                            • get_est_fp_ops_done() : yask::yk_stats
                            • -
                            • get_extra_pad_size() -: yask::yk_var -
                            • get_filename() : yask::yask_file_output
                            • get_first_local_index() : yask::yk_var
                            • +
                            • get_first_local_index_vec() +: yask::yk_var +
                            • get_first_misc_index() : yask::yk_var
                            • get_first_rank_alloc_index() -: yask::yk_var +: yask::yk_var
                            • get_first_rank_domain_index() : yask::yk_solution , yask::yk_var
                            • +
                            • get_first_rank_domain_index_vec() +: yask::yk_solution +, yask::yk_var +
                            • get_first_rank_halo_index() : yask::yk_var
                            • +
                            • get_first_rank_halo_index_vec() +: yask::yk_var +
                            • get_first_valid_step_index() : yask::yk_var
                            • get_grid() -: yask::yc_solution -, yask::yc_var_point_node -, yask::yk_solution +: yask::yc_solution +, yask::yc_var_point_node +, yask::yk_solution
                            • get_grids() -: yask::yc_solution -, yask::yk_solution +: yask::yc_solution +, yask::yk_solution
                            • get_halo_exchange_l1_norm() : yask::yk_var
                            • -
                            • get_halo_size() -: yask::yk_var -
                            • get_last_local_index() : yask::yk_var
                            • +
                            • get_last_local_index_vec() +: yask::yk_var +
                            • get_last_misc_index() : yask::yk_var
                            • get_last_rank_alloc_index() -: yask::yk_var +: yask::yk_var
                            • get_last_rank_domain_index() : yask::yk_solution , yask::yk_var
                            • +
                            • get_last_rank_domain_index_vec() +: yask::yk_solution +, yask::yk_var +
                            • get_last_rank_halo_index() : yask::yk_var
                            • +
                            • get_last_rank_halo_index_vec() +: yask::yk_var +
                            • get_last_valid_step_index() : yask::yk_var
                            • @@ -190,7 +210,7 @@

                              - g -

                                : yask::yk_solution
                              • get_misc_dim_names() -: yask::yk_solution +: yask::yk_solution
                              • get_name() : yask::yc_index_node @@ -205,6 +225,7 @@

                                - g -

                                • get_num_domain_dims() : yask::yk_solution +, yask::yk_var
                                • get_num_elements() : yask::yk_stats @@ -213,8 +234,8 @@

                                  - g -

                                    : yask::yc_solution
                                  • get_num_grids() -: yask::yc_solution -, yask::yk_solution +: yask::yc_solution +, yask::yk_solution
                                  • get_num_nodes() : yask::yc_expr_node @@ -226,6 +247,9 @@

                                    - g -

                                      : yask::yk_env , yask::yk_solution +
                                    • get_num_ranks_vec() +: yask::yk_solution +
                                    • get_num_steps_done() : yask::yk_stats
                                    • @@ -254,8 +278,8 @@

                                      - g -

                                      • get_overall_domain_size() : yask::yk_solution
                                      • -
                                      • get_pad_size() -: yask::yk_var +
                                      • get_overall_domain_size_vec() +: yask::yk_solution
                                      • get_prefetch_dist() : yask::yc_solution @@ -267,16 +291,20 @@

                                        - g -

                                          : yask::yk_solution , yask::yk_var +
                                        • get_rank_domain_size_vec() +: yask::yk_solution +, yask::yk_var +
                                        • get_rank_index() : yask::yk_env , yask::yk_solution
                                        • +
                                        • get_rank_index_vec() +: yask::yk_solution +
                                        • get_raw_storage_buffer() : yask::yk_var
                                        • -
                                        • get_region_size() -: yask::yk_solution -
                                        • get_registry() : yask::yc_solution_base
                                        • @@ -325,7 +353,7 @@

                                          - g -

                                          • get_var() : yask::yc_solution , yask::yc_var_point_node -, yask::yc_var_proxy +, yask::yc_var_proxy , yask::yk_solution
                                          • get_vars() diff --git a/docs/api/html/functions_func_i.html b/docs/api/html/functions_func_i.html index 18ff227f..e9213bb8 100644 --- a/docs/api/html/functions_func_i.html +++ b/docs/api/html/functions_func_i.html @@ -81,15 +81,15 @@

                                            - i -

                                              : yask::yc_var , yask::yk_var -
                                            • is_element_allocated() -: yask::yk_var -
                                            • is_fixed_size() : yask::yk_var
                                            • is_folding_set() : yask::yc_solution
                                            • +
                                            • is_offloaded() +: yask::yk_solution +
                                            • is_storage_allocated() : yask::yk_var
                                            • @@ -99,6 +99,9 @@

                                              - i -

                                              diff --git a/docs/api/html/functions_func_n.html b/docs/api/html/functions_func_n.html index 085cd783..dd4f3a31 100644 --- a/docs/api/html/functions_func_n.html +++ b/docs/api/html/functions_func_n.html @@ -97,20 +97,20 @@

                                              - n -

                                                : yask::yc_node_factory
                                              • new_fixed_size_grid() -: yask::yk_solution +: yask::yk_solution
                                              • new_fixed_size_var() -: yask::yk_solution +: yask::yk_solution
                                              • new_greater_than_node() : yask::yc_node_factory
                                              • new_grid() -: yask::yc_solution -, yask::yk_solution +: yask::yc_solution +, yask::yk_solution
                                              • new_grid_point() -: yask::yc_var +: yask::yc_var
                                              • new_last_domain_index() : yask::yc_node_factory @@ -154,13 +154,13 @@

                                                - n -

                                                  : yask::yc_node_factory
                                                • new_relative_grid_point() -: yask::yc_var +: yask::yc_var
                                                • new_relative_var_point() -: yask::yc_var +: yask::yc_var
                                                • new_scratch_grid() -: yask::yc_solution +: yask::yc_solution
                                                • new_scratch_var() : yask::yc_solution @@ -184,7 +184,7 @@

                                                  - n -

                                                  • new_var() : yask::yc_solution -, yask::yk_solution +, yask::yk_solution
                                                  • new_var_point() : yask::yc_var diff --git a/docs/api/html/functions_func_s.html b/docs/api/html/functions_func_s.html index 1182bb88..721d54c3 100644 --- a/docs/api/html/functions_func_s.html +++ b/docs/api/html/functions_func_s.html @@ -74,6 +74,9 @@

                                                    - s -

                                                    • set_block_size() : yask::yk_solution
                                                    • +
                                                    • set_block_size_vec() +: yask::yk_solution +
                                                    • set_cluster_mult() : yask::yc_solution
                                                    • @@ -82,8 +85,8 @@

                                                      - s -

                                                      • set_debug_output() : yask::yc_solution -, yask::yk_env -, yask::yk_solution +, yask::yk_env +, yask::yk_solution
                                                      • set_default_numa_preferred() : yask::yk_solution @@ -101,16 +104,16 @@

                                                        - s -

                                                          : yask::yc_var
                                                        • set_element() -: yask::yk_var +: yask::yk_var
                                                        • set_element_bytes() : yask::yc_solution
                                                        • set_elements_in_slice() -: yask::yk_var +: yask::yk_var
                                                        • set_elements_in_slice_same() -: yask::yk_var +: yask::yk_var
                                                        • set_first_misc_index() : yask::yk_var @@ -140,12 +143,18 @@

                                                          - s -

                                                          • set_num_ranks() : yask::yk_solution
                                                          • +
                                                          • set_num_ranks_vec() +: yask::yk_solution +
                                                          • set_numa_preferred() : yask::yk_var
                                                          • set_overall_domain_size() : yask::yk_solution
                                                          • +
                                                          • set_overall_domain_size_vec() +: yask::yk_solution +
                                                          • set_prefetch_dist() : yask::yc_solution
                                                          • @@ -155,11 +164,14 @@

                                                            - s -

                                                            • set_rank_domain_size() : yask::yk_solution
                                                            • +
                                                            • set_rank_domain_size_vec() +: yask::yk_solution +
                                                            • set_rank_index() : yask::yk_solution
                                                            • -
                                                            • set_region_size() -: yask::yk_solution +
                                                            • set_rank_index_vec() +: yask::yk_solution
                                                            • set_right_halo_size() : yask::yk_var @@ -183,7 +195,7 @@

                                                              - s -

                                                                : yask::yc_solution
                                                              • set_trace_enabled() -: yask::yk_env +: yask::yk_env
                                                              • set_value() : yask::yc_const_number_node diff --git a/docs/api/html/functions_g.html b/docs/api/html/functions_g.html index 4d4d1c5c..3ea97f66 100644 --- a/docs/api/html/functions_g.html +++ b/docs/api/html/functions_g.html @@ -68,14 +68,20 @@

                                                                - g -

                                                                • get_alloc_size() : yask::yk_var
                                                                • +
                                                                • get_alloc_size_vec() +: yask::yk_var +
                                                                • get_block_size() : yask::yk_solution
                                                                • +
                                                                • get_block_size_vec() +: yask::yk_solution +
                                                                • get_cond() : yask::yc_equation_node
                                                                • get_debug_output() -: yask::yk_env +: yask::yk_env
                                                                • get_default_numa_preferred() : yask::yk_solution @@ -84,24 +90,24 @@

                                                                  - g -

                                                                    : yask::yc_solution
                                                                  • get_dim_names() -: yask::yc_var -, yask::yk_var +: yask::yc_var +, yask::yk_var
                                                                  • get_domain_dim_names() -: yask::yk_solution +: yask::yk_solution
                                                                  • get_elapsed_secs() : yask::yk_stats
                                                                  • get_element() -: yask::yk_var +: yask::yk_var
                                                                  • get_element_bytes() : yask::yc_solution , yask::yk_solution
                                                                  • get_elements_in_slice() -: yask::yk_var +: yask::yk_var
                                                                  • get_equations() : yask::yc_solution @@ -109,62 +115,76 @@

                                                                    - g -

                                                                    • get_est_fp_ops_done() : yask::yk_stats
                                                                    • -
                                                                    • get_extra_pad_size() -: yask::yk_var -
                                                                    • get_filename() : yask::yask_file_output
                                                                    • get_first_local_index() : yask::yk_var
                                                                    • +
                                                                    • get_first_local_index_vec() +: yask::yk_var +
                                                                    • get_first_misc_index() : yask::yk_var
                                                                    • get_first_rank_alloc_index() -: yask::yk_var +: yask::yk_var
                                                                    • get_first_rank_domain_index() : yask::yk_solution , yask::yk_var
                                                                    • +
                                                                    • get_first_rank_domain_index_vec() +: yask::yk_solution +, yask::yk_var +
                                                                    • get_first_rank_halo_index() : yask::yk_var
                                                                    • +
                                                                    • get_first_rank_halo_index_vec() +: yask::yk_var +
                                                                    • get_first_valid_step_index() : yask::yk_var
                                                                    • get_grid() -: yask::yc_solution -, yask::yc_var_point_node -, yask::yk_solution +: yask::yc_solution +, yask::yc_var_point_node +, yask::yk_solution
                                                                    • get_grids() -: yask::yc_solution -, yask::yk_solution +: yask::yc_solution +, yask::yk_solution
                                                                    • get_halo_exchange_l1_norm() : yask::yk_var
                                                                    • -
                                                                    • get_halo_size() -: yask::yk_var -
                                                                    • get_last_local_index() : yask::yk_var
                                                                    • +
                                                                    • get_last_local_index_vec() +: yask::yk_var +
                                                                    • get_last_misc_index() : yask::yk_var
                                                                    • get_last_rank_alloc_index() -: yask::yk_var +: yask::yk_var
                                                                    • get_last_rank_domain_index() : yask::yk_solution , yask::yk_var
                                                                    • +
                                                                    • get_last_rank_domain_index_vec() +: yask::yk_solution +, yask::yk_var +
                                                                    • get_last_rank_halo_index() : yask::yk_var
                                                                    • +
                                                                    • get_last_rank_halo_index_vec() +: yask::yk_var +
                                                                    • get_last_valid_step_index() : yask::yk_var
                                                                    • @@ -190,7 +210,7 @@

                                                                      - g -

                                                                        : yask::yk_solution
                                                                      • get_misc_dim_names() -: yask::yk_solution +: yask::yk_solution
                                                                      • get_name() : yask::yc_index_node @@ -205,6 +225,7 @@

                                                                        - g -

                                                                        • get_num_domain_dims() : yask::yk_solution +, yask::yk_var
                                                                        • get_num_elements() : yask::yk_stats @@ -213,8 +234,8 @@

                                                                          - g -

                                                                            : yask::yc_solution
                                                                          • get_num_grids() -: yask::yc_solution -, yask::yk_solution +: yask::yc_solution +, yask::yk_solution
                                                                          • get_num_nodes() : yask::yc_expr_node @@ -226,6 +247,9 @@

                                                                            - g -

                                                                              : yask::yk_env , yask::yk_solution +
                                                                            • get_num_ranks_vec() +: yask::yk_solution +
                                                                            • get_num_steps_done() : yask::yk_stats
                                                                            • @@ -254,8 +278,8 @@

                                                                              - g -

                                                                              • get_overall_domain_size() : yask::yk_solution
                                                                              • -
                                                                              • get_pad_size() -: yask::yk_var +
                                                                              • get_overall_domain_size_vec() +: yask::yk_solution
                                                                              • get_prefetch_dist() : yask::yc_solution @@ -267,16 +291,20 @@

                                                                                - g -

                                                                                  : yask::yk_solution , yask::yk_var +
                                                                                • get_rank_domain_size_vec() +: yask::yk_solution +, yask::yk_var +
                                                                                • get_rank_index() : yask::yk_env , yask::yk_solution
                                                                                • +
                                                                                • get_rank_index_vec() +: yask::yk_solution +
                                                                                • get_raw_storage_buffer() : yask::yk_var
                                                                                • -
                                                                                • get_region_size() -: yask::yk_solution -
                                                                                • get_registry() : yask::yc_solution_base
                                                                                • @@ -325,7 +353,7 @@

                                                                                  - g -

                                                                                  • get_var() : yask::yc_solution , yask::yc_var_point_node -, yask::yc_var_proxy +, yask::yc_var_proxy , yask::yk_solution
                                                                                  • get_vars() diff --git a/docs/api/html/functions_i.html b/docs/api/html/functions_i.html index 7b9a8b4b..bf0509c0 100644 --- a/docs/api/html/functions_i.html +++ b/docs/api/html/functions_i.html @@ -81,15 +81,15 @@

                                                                                    - i -

                                                                                      : yask::yc_var , yask::yk_var -
                                                                                    • is_element_allocated() -: yask::yk_var -
                                                                                    • is_fixed_size() : yask::yk_var
                                                                                    • is_folding_set() : yask::yc_solution
                                                                                    • +
                                                                                    • is_offloaded() +: yask::yk_solution +
                                                                                    • is_storage_allocated() : yask::yk_var
                                                                                    • @@ -99,6 +99,9 @@

                                                                                      - i -

                                                                                      diff --git a/docs/api/html/functions_n.html b/docs/api/html/functions_n.html index cb2e9b50..40f7bc04 100644 --- a/docs/api/html/functions_n.html +++ b/docs/api/html/functions_n.html @@ -97,20 +97,20 @@

                                                                                      - n -

                                                                                        : yask::yc_node_factory
                                                                                      • new_fixed_size_grid() -: yask::yk_solution +: yask::yk_solution
                                                                                      • new_fixed_size_var() -: yask::yk_solution +: yask::yk_solution
                                                                                      • new_greater_than_node() : yask::yc_node_factory
                                                                                      • new_grid() -: yask::yc_solution -, yask::yk_solution +: yask::yc_solution +, yask::yk_solution
                                                                                      • new_grid_point() -: yask::yc_var +: yask::yc_var
                                                                                      • new_last_domain_index() : yask::yc_node_factory @@ -154,13 +154,13 @@

                                                                                        - n -

                                                                                          : yask::yc_node_factory
                                                                                        • new_relative_grid_point() -: yask::yc_var +: yask::yc_var
                                                                                        • new_relative_var_point() -: yask::yc_var +: yask::yc_var
                                                                                        • new_scratch_grid() -: yask::yc_solution +: yask::yc_solution
                                                                                        • new_scratch_var() : yask::yc_solution @@ -184,7 +184,7 @@

                                                                                          - n -

                                                                                          • new_var() : yask::yc_solution -, yask::yk_solution +, yask::yk_solution
                                                                                          • new_var_point() : yask::yc_var diff --git a/docs/api/html/functions_s.html b/docs/api/html/functions_s.html index 6d6fe914..7084f798 100644 --- a/docs/api/html/functions_s.html +++ b/docs/api/html/functions_s.html @@ -74,6 +74,9 @@

                                                                                            - s -

                                                                                            • set_block_size() : yask::yk_solution
                                                                                            • +
                                                                                            • set_block_size_vec() +: yask::yk_solution +
                                                                                            • set_cluster_mult() : yask::yc_solution
                                                                                            • @@ -82,8 +85,8 @@

                                                                                              - s -

                                                                                              • set_debug_output() : yask::yc_solution -, yask::yk_env -, yask::yk_solution +, yask::yk_env +, yask::yk_solution
                                                                                              • set_default_numa_preferred() : yask::yk_solution @@ -101,16 +104,16 @@

                                                                                                - s -

                                                                                                  : yask::yc_var
                                                                                                • set_element() -: yask::yk_var +: yask::yk_var
                                                                                                • set_element_bytes() : yask::yc_solution
                                                                                                • set_elements_in_slice() -: yask::yk_var +: yask::yk_var
                                                                                                • set_elements_in_slice_same() -: yask::yk_var +: yask::yk_var
                                                                                                • set_first_misc_index() : yask::yk_var @@ -140,12 +143,18 @@

                                                                                                  - s -

                                                                                                  Example Tests

                                                                                                  diff --git a/docs/api/html/search/all_1.js b/docs/api/html/search/all_1.js index 0f028439..3c32950e 100644 --- a/docs/api/html/search/all_1.js +++ b/docs/api/html/search/all_1.js @@ -3,8 +3,8 @@ var searchData= ['add_5fflow_5fdependency',['add_flow_dependency',['../classyask_1_1yc__solution.html#a727a91bb87e42de9822ac6540e3fc93e',1,'yask::yc_solution']]], ['add_5fmessage',['add_message',['../classyask_1_1yask__exception.html#aff4d4707f040fe2876c8e5d2fbfd74a7',1,'yask::yask_exception']]], ['add_5foperand',['add_operand',['../classyask_1_1yc__commutative__number__node.html#a560e25d93eb1ee672e4fdbb40db31f21',1,'yask::yc_commutative_number_node']]], - ['add_5fto_5felement',['add_to_element',['../classyask_1_1yk__var.html#ad1e93677a7b8070501c9d569a8404714',1,'yask::yk_var::add_to_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0'],['../classyask_1_1yk__var.html#ac93056d8c105cf33eebce78e0f054b5f',1,'yask::yk_var::add_to_element(double val, const std::initializer_list< idx_t > &indices, bool strict_indices=true)=0']]], + ['add_5fto_5felement',['add_to_element',['../classyask_1_1yk__var.html#aac79a05181b3eee1031e27cfc0d2c145',1,'yask::yk_var::add_to_element(double val, const idx_t_vec &indices, bool strict_indices=true)=0'],['../classyask_1_1yk__var.html#adf4832584daca2f5139b8a8ba93bcf6a',1,'yask::yk_var::add_to_element(double val, const idx_t_init_list &indices, bool strict_indices=true)=0']]], ['alloc_5fstorage',['alloc_storage',['../classyask_1_1yk__var.html#aa3b479a98b425c3a8d504145972198e0',1,'yask::yk_var']]], - ['apply_5fcommand_5fline_5foptions',['apply_command_line_options',['../classyask_1_1yk__solution.html#ac111abbade055c4923cd0044360ec3b7',1,'yask::yk_solution::apply_command_line_options(const std::string 
&args)=0'],['../classyask_1_1yk__solution.html#ad0947c4ad4ed06d8a7d5058daecd5dc7',1,'yask::yk_solution::apply_command_line_options(int argc, char *argv[])=0'],['../classyask_1_1yk__solution.html#a6c128cbf16d4d4ab36c30654cd0e9818',1,'yask::yk_solution::apply_command_line_options(const std::vector< std::string > &args)=0']]], - ['are_5findices_5flocal',['are_indices_local',['../classyask_1_1yk__var.html#a81a6970b7812b2c5c4bdb71cd6b1384d',1,'yask::yk_var::are_indices_local(const std::vector< idx_t > &indices) const =0'],['../classyask_1_1yk__var.html#a7993629f98bbacee174f793e18eff1b5',1,'yask::yk_var::are_indices_local(const std::initializer_list< idx_t > &indices) const =0']]] + ['apply_5fcommand_5fline_5foptions',['apply_command_line_options',['../classyask_1_1yk__solution.html#ac111abbade055c4923cd0044360ec3b7',1,'yask::yk_solution::apply_command_line_options(const std::string &args)=0'],['../classyask_1_1yk__solution.html#ad0947c4ad4ed06d8a7d5058daecd5dc7',1,'yask::yk_solution::apply_command_line_options(int argc, char *argv[])=0'],['../classyask_1_1yk__solution.html#a550b24bc0f81de69619ba3029ca79e7f',1,'yask::yk_solution::apply_command_line_options(const string_vec &args)=0']]], + ['are_5findices_5flocal',['are_indices_local',['../classyask_1_1yk__var.html#a3cc808533b7c6e34614409d34bec1a86',1,'yask::yk_var::are_indices_local(const idx_t_vec &indices) const =0'],['../classyask_1_1yk__var.html#a923c12a1a7b80698c09f2447828416ed',1,'yask::yk_var::are_indices_local(const idx_t_init_list &indices) const =0']]] ]; diff --git a/docs/api/html/search/all_12.js b/docs/api/html/search/all_12.js index 787638f2..1c9d7997 100644 --- a/docs/api/html/search/all_12.js +++ b/docs/api/html/search/all_12.js @@ -3,16 +3,19 @@ var searchData= ['yask_20common',['YASK Common',['../group__yask.html',1,'']]], ['yask_5fcommon_5fapi_2ehpp',['yask_common_api.hpp',['../yask__common__api_8hpp.html',1,'']]], 
['yask_5fcompiler_5fapi_2ehpp',['yask_compiler_api.hpp',['../yask__compiler__api_8hpp.html',1,'']]], + ['yask_5fdeprecated',['YASK_DEPRECATED',['../yask__common__api_8hpp.html#af7d3d837169568cf38a2efc3e7b04123',1,'yask_common_api.hpp']]], ['yask_5fexception',['yask_exception',['../classyask_1_1yask__exception.html',1,'yask::yask_exception'],['../classyask_1_1yask__exception.html#a0d43c543951311f4184d175476b10b5c',1,'yask::yask_exception::yask_exception()'],['../classyask_1_1yask__exception.html#a3d93d64e68bb932f85e19124a99d8fad',1,'yask::yask_exception::yask_exception(const std::string &message)']]], ['yask_5ffile_5foutput',['yask_file_output',['../classyask_1_1yask__file__output.html',1,'yask']]], ['yask_5ffile_5foutput_5fptr',['yask_file_output_ptr',['../group__yask.html#ga44ad5ed6fe36f3ef3ebee7a077a12149',1,'yask']]], ['yask_5fget_5fversion_5fstring',['yask_get_version_string',['../group__yask.html#ga8705b764227c5d6f7cf029d90a6b7ab4',1,'yask']]], + ['yask_5fint64_5ft',['YASK_INT64_T',['../yask__common__api_8hpp.html#a39f516516145bef523f3309b72959cdb',1,'yask_common_api.hpp']]], ['yask_5fkernel_5fapi_2ehpp',['yask_kernel_api.hpp',['../yask__kernel__api_8hpp.html',1,'']]], ['yask_5fnull_5foutput',['yask_null_output',['../classyask_1_1yask__null__output.html',1,'yask']]], ['yask_5fnull_5foutput_5fptr',['yask_null_output_ptr',['../group__yask.html#ga8075797a3891a4ee762cf92c6a59bc25',1,'yask']]], ['yask_5fnuma_5finterleave',['yask_numa_interleave',['../group__yk.html#ga4e56e832945f97f2e741738e9194873c',1,'yask']]], ['yask_5fnuma_5flocal',['yask_numa_local',['../group__yk.html#ga82b8e0f360a0e18fe6c730e37b33e3f6',1,'yask']]], ['yask_5fnuma_5fnone',['yask_numa_none',['../group__yk.html#ga38a50108f67012a357b424545495158a',1,'yask']]], + ['yask_5fnuma_5foffload',['yask_numa_offload',['../group__yk.html#gaa3d0568a0cda08804b8d0a8c521a81fa',1,'yask']]], ['yask_5foutput',['yask_output',['../classyask_1_1yask__output.html',1,'yask']]], 
['yask_5foutput_5ffactory',['yask_output_factory',['../classyask_1_1yask__output__factory.html',1,'yask']]], ['yask_5foutput_5fptr',['yask_output_ptr',['../group__yask.html#ga605185252a3f3f917593c83b7dde4b66',1,'yask']]], @@ -48,10 +51,10 @@ var searchData= ['yc_5ffactory',['yc_factory',['../classyask_1_1yc__factory.html',1,'yask']]], ['yc_5fgreater_5fthan_5fnode',['yc_greater_than_node',['../classyask_1_1yc__greater__than__node.html',1,'yask']]], ['yc_5fgreater_5fthan_5fnode_5fptr',['yc_greater_than_node_ptr',['../group__yc.html#ga1428bb8994856ecd456549b2dea7fcd9',1,'yask']]], - ['yc_5fgrid',['yc_grid',['../yask__compiler__api_8hpp.html#ae14796cde0479d525b112001bb6a444c',1,'yask']]], - ['yc_5fgrid_5fpoint_5fnode',['yc_grid_point_node',['../yask__compiler__api_8hpp.html#a7d35999dfda1de8d92cceec57da8d261',1,'yask']]], - ['yc_5fgrid_5fpoint_5fnode_5fptr',['yc_grid_point_node_ptr',['../yask__compiler__api_8hpp.html#acedc107d594b81726586528668145b25',1,'yask']]], - ['yc_5fgrid_5fptr',['yc_grid_ptr',['../yask__compiler__api_8hpp.html#a8b612f791d66a68204d83f264bfb075c',1,'yask']]], + ['yc_5fgrid',['yc_grid',['../yask__compiler__api_8hpp.html#a5af53f9d12f8a64e263f9faf12705833',1,'yask']]], + ['yc_5fgrid_5fpoint_5fnode',['yc_grid_point_node',['../yask__compiler__api_8hpp.html#a24044552be06e5020b82381da8331ab7',1,'yask']]], + ['yc_5fgrid_5fpoint_5fnode_5fptr',['yc_grid_point_node_ptr',['../yask__compiler__api_8hpp.html#a3fdfb1592adfd3b7fad43d3dc0954e7b',1,'yask']]], + ['yc_5fgrid_5fptr',['yc_grid_ptr',['../yask__compiler__api_8hpp.html#ac5d9ddae8098817aebdbb5ead715da01',1,'yask']]], ['yc_5findex_5fnode',['yc_index_node',['../classyask_1_1yc__index__node.html',1,'yask']]], ['yc_5findex_5fnode_5fptr',['yc_index_node_ptr',['../group__yc.html#gac5a8be4a272d764b1145f1e0c6f493e0',1,'yask']]], ['yc_5fless_5fthan_5fnode',['yc_less_than_node',['../classyask_1_1yc__less__than__node.html',1,'yask']]], @@ -95,8 +98,8 @@ var searchData= 
['yk_5fenv',['yk_env',['../classyask_1_1yk__env.html',1,'yask']]], ['yk_5fenv_5fptr',['yk_env_ptr',['../group__yk.html#ga8dc62f5599d5c5eb9f7583d7d6a63df1',1,'yask']]], ['yk_5ffactory',['yk_factory',['../classyask_1_1yk__factory.html',1,'yask']]], - ['yk_5fgrid',['yk_grid',['../group__yk.html#ga18cfd02f68b4f754c1d174dafdd59d65',1,'yask']]], - ['yk_5fgrid_5fptr',['yk_grid_ptr',['../group__yk.html#gab8f7fc989bed12dcfd79b6bef885fe1e',1,'yask']]], + ['yk_5fgrid',['yk_grid',['../group__yk.html#gab1c5abbc86c9fdde32def4217482cc63',1,'yask']]], + ['yk_5fgrid_5fptr',['yk_grid_ptr',['../group__yk.html#gaf6e19ac605b32b47d4edc5a8985b3c5d',1,'yask']]], ['yk_5fsolution',['yk_solution',['../classyask_1_1yk__solution.html',1,'yask']]], ['yk_5fsolution_5fapi_2ehpp',['yk_solution_api.hpp',['../yk__solution__api_8hpp.html',1,'']]], ['yk_5fsolution_5fptr',['yk_solution_ptr',['../group__yk.html#ga2debaa7135bb46dfc295ca623bee2876',1,'yask']]], diff --git a/docs/api/html/search/all_3.js b/docs/api/html/search/all_3.js index 84a8d88b..9593c8cd 100644 --- a/docs/api/html/search/all_3.js +++ b/docs/api/html/search/all_3.js @@ -10,5 +10,7 @@ var searchData= ['clear_5fdependencies',['clear_dependencies',['../classyask_1_1yc__solution.html#a42cd08d7a26c93d5073134f3b76dcc38',1,'yask::yc_solution']]], ['clear_5ffolding',['clear_folding',['../classyask_1_1yc__solution.html#afaf489e67ed8cc753e999b1495dd4dde',1,'yask::yc_solution']]], ['clone_5fast',['clone_ast',['../classyask_1_1yc__equation__node.html#a02121980dc7dcae2a18b38340579e8ca',1,'yask::yc_equation_node::clone_ast()'],['../classyask_1_1yc__number__node.html#a85093ab8031538e55c2213aacb843faf',1,'yask::yc_number_node::clone_ast()'],['../classyask_1_1yc__bool__node.html#a724e70472b59661feb96ea3a53cab8c7',1,'yask::yc_bool_node::clone_ast()']]], - ['close',['close',['../classyask_1_1yask__file__output.html#ad05306df06c5965659eda39ddfeb0d38',1,'yask::yask_file_output']]] + 
['close',['close',['../classyask_1_1yask__file__output.html#ad05306df06c5965659eda39ddfeb0d38',1,'yask::yask_file_output']]], + ['copy_5fvars_5ffrom_5fdevice',['copy_vars_from_device',['../classyask_1_1yk__solution.html#a105c993241498d9c2a98cbec353fc61a',1,'yask::yk_solution']]], + ['copy_5fvars_5fto_5fdevice',['copy_vars_to_device',['../classyask_1_1yk__solution.html#a2cdea230253b47bd16c1a0c326a78df8',1,'yask::yk_solution']]] ]; diff --git a/docs/api/html/search/all_4.js b/docs/api/html/search/all_4.js index f4437f8d..fe99757c 100644 --- a/docs/api/html/search/all_4.js +++ b/docs/api/html/search/all_4.js @@ -1,5 +1,6 @@ var searchData= [ ['define',['define',['../classyask_1_1yc__solution__base.html#abd34ca7ae7a89fc4a051376a612f494a',1,'yask::yc_solution_base::define()'],['../classyask_1_1yc__solution__with__radius__base.html#a883c31f71b3b2876d4c115ca4f3d926d',1,'yask::yc_solution_with_radius_base::define()']]], + ['disable_5fdebug_5foutput',['disable_debug_output',['../classyask_1_1yk__env.html#ab14fa168dc78346ac61b54c9a509099b',1,'yask::yk_env']]], ['discard',['discard',['../classyask_1_1yask__string__output.html#a86fdebb6dbf89c75d306a2c88166943b',1,'yask::yask_string_output']]] ]; diff --git a/docs/api/html/search/all_6.js b/docs/api/html/search/all_6.js index 75e34423..8a6cb6d5 100644 --- a/docs/api/html/search/all_6.js +++ b/docs/api/html/search/all_6.js @@ -1,9 +1,9 @@ var searchData= [ ['first_5fdomain_5findex',['first_domain_index',['../classyask_1_1yc__solution__base.html#a56f644e7e6b4e96619245f217b3763b5',1,'yask::yc_solution_base']]], - ['format',['format',['../classyask_1_1yc__solution.html#a8bedbd4e6834f58c8e5d0452638ee64c',1,'yask::yc_solution']]], - ['format_5findices',['format_indices',['../classyask_1_1yk__var.html#a16aca4721fceade611c11d2d11472162',1,'yask::yk_var::format_indices(const std::vector< idx_t > &indices) const =0'],['../classyask_1_1yk__var.html#a0b705bdba753e24e12de0297bb30e133',1,'yask::yk_var::format_indices(const 
std::initializer_list< idx_t > &indices) const =0']]], + ['format',['format',['../classyask_1_1yc__solution.html#a5f3624ded964c465724fd0221d5a2aaa',1,'yask::yc_solution']]], + ['format_5findices',['format_indices',['../classyask_1_1yk__var.html#a55d6c585b8384881065c01bb067c96dd',1,'yask::yk_var::format_indices(const idx_t_vec &indices) const =0'],['../classyask_1_1yk__var.html#ae929dea5359d07e7541abab78ca7b139',1,'yask::yk_var::format_indices(const idx_t_init_list &indices) const =0']]], ['format_5fsimple',['format_simple',['../classyask_1_1yc__expr__node.html#a1af7948d0c2a977ed20c0b2d1d561052',1,'yask::yc_expr_node']]], - ['fuse_5fgrids',['fuse_grids',['../classyask_1_1yk__solution.html#a2c600b5537983349a50462a90dc73291',1,'yask::yk_solution::fuse_grids()'],['../classyask_1_1yk__var.html#a312eac09604272384823123d7219432a',1,'yask::yk_var::fuse_grids()']]], + ['fuse_5fgrids',['fuse_grids',['../classyask_1_1yk__solution.html#ac20126eb21acf5e61ac2c94d823a34e1',1,'yask::yk_solution']]], ['fuse_5fvars',['fuse_vars',['../classyask_1_1yk__solution.html#a563794842445fcd96d77b463f674a60b',1,'yask::yk_solution::fuse_vars()'],['../classyask_1_1yk__var.html#aa095607d5493fcba38cb332053155b7f',1,'yask::yk_var::fuse_vars()']]] ]; diff --git a/docs/api/html/search/all_7.js b/docs/api/html/search/all_7.js index 42bbb6b0..d7ac0f0a 100644 --- a/docs/api/html/search/all_7.js +++ b/docs/api/html/search/all_7.js @@ -1,40 +1,46 @@ var searchData= [ ['get_5falloc_5fsize',['get_alloc_size',['../classyask_1_1yk__var.html#a1934db25d379b5ae2366e01a88a2c867',1,'yask::yk_var']]], + ['get_5falloc_5fsize_5fvec',['get_alloc_size_vec',['../classyask_1_1yk__var.html#af47cbeb07ca7728013e71f31bc00281d',1,'yask::yk_var']]], ['get_5farbitrary_5ffd_5fcoefficients',['get_arbitrary_fd_coefficients',['../group__yask.html#ga67e901ad7dd62a3eac164ceed0c46787',1,'yask']]], 
['get_5fbackward_5ffd_5fcoefficients',['get_backward_fd_coefficients',['../group__yask.html#ga31c7a4d960e620b75944a40ffe0ff53b',1,'yask']]], ['get_5fblock_5fsize',['get_block_size',['../classyask_1_1yk__solution.html#a601aeebc023d430a311788c3ce73c190',1,'yask::yk_solution']]], + ['get_5fblock_5fsize_5fvec',['get_block_size_vec',['../classyask_1_1yk__solution.html#a75d10347e75c1e01e9592a3cb0fc42a0',1,'yask::yk_solution']]], ['get_5fcenter_5ffd_5fcoefficients',['get_center_fd_coefficients',['../group__yask.html#ga11d0759a323784806d1a30284a83621c',1,'yask']]], ['get_5fcond',['get_cond',['../classyask_1_1yc__equation__node.html#a09eabc1f5854bb4a5b50a715368d6d0f',1,'yask::yc_equation_node']]], - ['get_5fdebug_5foutput',['get_debug_output',['../classyask_1_1yk__env.html#abc475dee318fbe26cad93cc205c90de4',1,'yask::yk_env']]], + ['get_5fdebug_5foutput',['get_debug_output',['../classyask_1_1yk__env.html#a69eac2bb8a841f6259697a7dcc8cf386',1,'yask::yk_env']]], ['get_5fdefault_5fnuma_5fpreferred',['get_default_numa_preferred',['../classyask_1_1yk__solution.html#a05c98a1d8b03d1009ef67b84b2f0bea0',1,'yask::yk_solution']]], ['get_5fdescription',['get_description',['../classyask_1_1yc__solution.html#a8113e505343c5f2598811669f767930c',1,'yask::yc_solution']]], - ['get_5fdim_5fnames',['get_dim_names',['../classyask_1_1yc__var.html#a163ecf734878dd2495cd51a1f479a94d',1,'yask::yc_var::get_dim_names()'],['../classyask_1_1yk__var.html#a9793866823ba93e4d2f284ffeeab92c8',1,'yask::yk_var::get_dim_names()']]], - ['get_5fdomain_5fdim_5fnames',['get_domain_dim_names',['../classyask_1_1yk__solution.html#a665968c18b2f70467dbc7decaeeb1a1f',1,'yask::yk_solution']]], + ['get_5fdim_5fnames',['get_dim_names',['../classyask_1_1yc__var.html#a1cdca3a698cd9029f659fd7b11d89305',1,'yask::yc_var::get_dim_names()'],['../classyask_1_1yk__var.html#adf2e93317a2e86d80713ad56a46fdc68',1,'yask::yk_var::get_dim_names()']]], + 
['get_5fdomain_5fdim_5fnames',['get_domain_dim_names',['../classyask_1_1yk__solution.html#a24c2619b5f0471bcfe8eeb5aed769fec',1,'yask::yk_solution']]], ['get_5felapsed_5fsecs',['get_elapsed_secs',['../classyask_1_1yk__stats.html#a8bcf045fdfde4d9120084902f4d31725',1,'yask::yk_stats']]], - ['get_5felement',['get_element',['../classyask_1_1yk__var.html#ae6e5f47af6672d9e9e02e4bdff59d8ce',1,'yask::yk_var::get_element(const std::vector< idx_t > &indices) const =0'],['../classyask_1_1yk__var.html#a01a0e195ee63bd58932adead0cd1c809',1,'yask::yk_var::get_element(const std::initializer_list< idx_t > &indices) const =0']]], + ['get_5felement',['get_element',['../classyask_1_1yk__var.html#aed2d676221d5b99be7b8b8bc2cd37af2',1,'yask::yk_var::get_element(const idx_t_vec &indices) const =0'],['../classyask_1_1yk__var.html#af49bd859b3200e8cd8a55fe55ecbde93',1,'yask::yk_var::get_element(const idx_t_init_list &indices) const =0']]], ['get_5felement_5fbytes',['get_element_bytes',['../classyask_1_1yc__solution.html#a78551a2f7ca0a9644fa802d0806b7642',1,'yask::yc_solution::get_element_bytes()'],['../classyask_1_1yk__solution.html#a50e565487b7175447cc9f6489221eef4',1,'yask::yk_solution::get_element_bytes()']]], - ['get_5felements_5fin_5fslice',['get_elements_in_slice',['../classyask_1_1yk__var.html#a3c825ad9dfa3c9138348399bfbf548c8',1,'yask::yk_var']]], + ['get_5felements_5fin_5fslice',['get_elements_in_slice',['../classyask_1_1yk__var.html#ad33ae6d7f03ec5fb8fa31e4ad9ff7881',1,'yask::yk_var']]], ['get_5fequations',['get_equations',['../classyask_1_1yc__solution.html#a8257de64334bd95fcaca64719653fd1d',1,'yask::yc_solution']]], ['get_5fest_5ffp_5fops_5fdone',['get_est_fp_ops_done',['../classyask_1_1yk__stats.html#a2ce2e8bf959e0af0caae77bd5ae1626f',1,'yask::yk_stats']]], - ['get_5fextra_5fpad_5fsize',['get_extra_pad_size',['../classyask_1_1yk__var.html#a3a7e4eeb67a187fede214956902b1486',1,'yask::yk_var']]], 
['get_5ffilename',['get_filename',['../classyask_1_1yask__file__output.html#a370fcde900fe4cebf04741bde16f59d4',1,'yask::yask_file_output']]], ['get_5ffirst_5flocal_5findex',['get_first_local_index',['../classyask_1_1yk__var.html#aaf5b3f06b832bbf77b8f722fda8a2998',1,'yask::yk_var']]], + ['get_5ffirst_5flocal_5findex_5fvec',['get_first_local_index_vec',['../classyask_1_1yk__var.html#a3b7117c1479a41c6d5e3b37fbc2309f3',1,'yask::yk_var']]], ['get_5ffirst_5fmisc_5findex',['get_first_misc_index',['../classyask_1_1yk__var.html#a9faab903ed2467e46ed0b5ea43a9e1e4',1,'yask::yk_var']]], - ['get_5ffirst_5frank_5falloc_5findex',['get_first_rank_alloc_index',['../classyask_1_1yk__var.html#adb7ed5c0f513ae7a5480a480f08165b7',1,'yask::yk_var']]], + ['get_5ffirst_5frank_5falloc_5findex',['get_first_rank_alloc_index',['../classyask_1_1yk__var.html#a010df6d40b808aa0d7fca274bbf5d2d4',1,'yask::yk_var']]], ['get_5ffirst_5frank_5fdomain_5findex',['get_first_rank_domain_index',['../classyask_1_1yk__solution.html#a03bdef5ba9b0b0e37f9b7be2e2e457a4',1,'yask::yk_solution::get_first_rank_domain_index()'],['../classyask_1_1yk__var.html#a60d8b63ef869693d7eeb556e5254d167',1,'yask::yk_var::get_first_rank_domain_index()']]], + ['get_5ffirst_5frank_5fdomain_5findex_5fvec',['get_first_rank_domain_index_vec',['../classyask_1_1yk__solution.html#a79492e63f435ac8a93d8815a3ca4729c',1,'yask::yk_solution::get_first_rank_domain_index_vec()'],['../classyask_1_1yk__var.html#a45c2449324430b59811c1c6257cf9a5a',1,'yask::yk_var::get_first_rank_domain_index_vec()']]], ['get_5ffirst_5frank_5fhalo_5findex',['get_first_rank_halo_index',['../classyask_1_1yk__var.html#a363c78d8f7f5b40581fb0cb179058410',1,'yask::yk_var']]], + ['get_5ffirst_5frank_5fhalo_5findex_5fvec',['get_first_rank_halo_index_vec',['../classyask_1_1yk__var.html#ab0ed2e82d463cc3fae2daa395edb78f4',1,'yask::yk_var']]], 
['get_5ffirst_5fvalid_5fstep_5findex',['get_first_valid_step_index',['../classyask_1_1yk__var.html#a6a02011041f38a4d2fe7899d5369511e',1,'yask::yk_var']]], ['get_5fforward_5ffd_5fcoefficients',['get_forward_fd_coefficients',['../group__yask.html#ga2cccdb7135258b002cdac134fbfa1912',1,'yask']]], - ['get_5fgrid',['get_grid',['../classyask_1_1yc__solution.html#a491edfd90571df7a705d9549d1604578',1,'yask::yc_solution::get_grid()'],['../classyask_1_1yc__var__point__node.html#aed1f22e175b0cd60a061fcd6dff807a2',1,'yask::yc_var_point_node::get_grid()'],['../classyask_1_1yk__solution.html#aadd360ab1d09bfcf603f0345793ba586',1,'yask::yk_solution::get_grid()']]], - ['get_5fgrids',['get_grids',['../classyask_1_1yc__solution.html#a150c8dec60f7c43b64d734f91756a3d6',1,'yask::yc_solution::get_grids()'],['../classyask_1_1yk__solution.html#a418199548c44f5c9d7d90305350cf34b',1,'yask::yk_solution::get_grids()']]], + ['get_5fgrid',['get_grid',['../classyask_1_1yc__solution.html#a7f63562fa2519ad4817c2698832c0dcb',1,'yask::yc_solution::get_grid()'],['../classyask_1_1yc__var__point__node.html#a550d4f6efb8cca03b1a5cea1f05ad936',1,'yask::yc_var_point_node::get_grid()'],['../classyask_1_1yk__solution.html#a561517d815dd1004e1c39557554eee16',1,'yask::yk_solution::get_grid()']]], + ['get_5fgrids',['get_grids',['../classyask_1_1yc__solution.html#aa1d2c083951bb0b3f0d1fce606ba411d',1,'yask::yc_solution::get_grids()'],['../classyask_1_1yk__solution.html#a9a0269a914a3e4e5465a7e3643785544',1,'yask::yk_solution::get_grids()']]], ['get_5fhalo_5fexchange_5fl1_5fnorm',['get_halo_exchange_l1_norm',['../classyask_1_1yk__var.html#a2de3a290dae76a40b3c208074ebbdb89',1,'yask::yk_var']]], - ['get_5fhalo_5fsize',['get_halo_size',['../classyask_1_1yk__var.html#af8cc3a22aaeb21b82d4d77d1617ec117',1,'yask::yk_var']]], ['get_5flast_5flocal_5findex',['get_last_local_index',['../classyask_1_1yk__var.html#a7266bb36c93cb6ae538a0f081f22fad7',1,'yask::yk_var']]], + 
['get_5flast_5flocal_5findex_5fvec',['get_last_local_index_vec',['../classyask_1_1yk__var.html#a00d8f5b744b8da99190764f0b0fc1cf5',1,'yask::yk_var']]], ['get_5flast_5fmisc_5findex',['get_last_misc_index',['../classyask_1_1yk__var.html#ab551b0a8749c38e8307082631cf597fa',1,'yask::yk_var']]], - ['get_5flast_5frank_5falloc_5findex',['get_last_rank_alloc_index',['../classyask_1_1yk__var.html#a2919bd783fb240a9d0f43a866bf7f551',1,'yask::yk_var']]], + ['get_5flast_5frank_5falloc_5findex',['get_last_rank_alloc_index',['../classyask_1_1yk__var.html#acb67cbd0ecea35f3f50b435e4b977c53',1,'yask::yk_var']]], ['get_5flast_5frank_5fdomain_5findex',['get_last_rank_domain_index',['../classyask_1_1yk__solution.html#a58f2ce95e150787bc8235e85298dcd01',1,'yask::yk_solution::get_last_rank_domain_index()'],['../classyask_1_1yk__var.html#aaf3e835c074a2bdd627f76483727b1dc',1,'yask::yk_var::get_last_rank_domain_index()']]], + ['get_5flast_5frank_5fdomain_5findex_5fvec',['get_last_rank_domain_index_vec',['../classyask_1_1yk__solution.html#a9d571ca2f9e1309d3af73c9d8fe7a084',1,'yask::yk_solution::get_last_rank_domain_index_vec()'],['../classyask_1_1yk__var.html#abdfdb9d4d382fb1c4fe4966e43afb5b2',1,'yask::yk_var::get_last_rank_domain_index_vec()']]], ['get_5flast_5frank_5fhalo_5findex',['get_last_rank_halo_index',['../classyask_1_1yk__var.html#ad3d97b360b0b1e4c5785adf87384c41e',1,'yask::yk_var']]], + ['get_5flast_5frank_5fhalo_5findex_5fvec',['get_last_rank_halo_index_vec',['../classyask_1_1yk__var.html#aa735497e92513f5796af52e8ddb24353',1,'yask::yk_var']]], ['get_5flast_5fvalid_5fstep_5findex',['get_last_valid_step_index',['../classyask_1_1yk__var.html#a7b346e48cb36b0f304d3b030ac9a1512',1,'yask::yk_var']]], ['get_5fleft_5fextra_5fpad_5fsize',['get_left_extra_pad_size',['../classyask_1_1yk__var.html#ae62308be69ca643714879b0a1362de0e',1,'yask::yk_var']]], ['get_5fleft_5fhalo_5fsize',['get_left_halo_size',['../classyask_1_1yk__var.html#acd8e51e7ee79fcc55a622b3012da63bc',1,'yask::yk_var']]], @@ 
-42,16 +48,17 @@ var searchData= ['get_5flhs',['get_lhs',['../classyask_1_1yc__equation__node.html#a649b44dfcf32970e94d6fb95d2caafa4',1,'yask::yc_equation_node::get_lhs()'],['../classyask_1_1yc__binary__number__node.html#a99ec0c25562c4c8394b2bdaa1bc6a391',1,'yask::yc_binary_number_node::get_lhs()'],['../classyask_1_1yc__binary__bool__node.html#a46f81b0aabf774f50659f4fc0afa0356',1,'yask::yc_binary_bool_node::get_lhs()'],['../classyask_1_1yc__binary__comparison__node.html#a344a9a30a06825bd1759c5553f6276df',1,'yask::yc_binary_comparison_node::get_lhs()']]], ['get_5fmessage',['get_message',['../classyask_1_1yask__exception.html#acb717171229ec55d8fa110a7f16d9913',1,'yask::yask_exception']]], ['get_5fmin_5fpad_5fsize',['get_min_pad_size',['../classyask_1_1yk__solution.html#a193d7c6e708c1ece4d78b39800a7d5fa',1,'yask::yk_solution']]], - ['get_5fmisc_5fdim_5fnames',['get_misc_dim_names',['../classyask_1_1yk__solution.html#af895e7366f7f224c4e6fc780b0dd7a2f',1,'yask::yk_solution']]], + ['get_5fmisc_5fdim_5fnames',['get_misc_dim_names',['../classyask_1_1yk__solution.html#a4184ac69b5e5af07e8cfc0317d6bd759',1,'yask::yk_solution']]], ['get_5fname',['get_name',['../classyask_1_1yc__solution.html#a630ebb8dc2bff24f15b5a56e46efc9f8',1,'yask::yc_solution::get_name()'],['../classyask_1_1yc__var.html#aef8c255b753899f77280fe65008cc5ba',1,'yask::yc_var::get_name()'],['../classyask_1_1yc__index__node.html#a37c072f91771b3b7c95708da9f152c33',1,'yask::yc_index_node::get_name()'],['../classyask_1_1yk__solution.html#aaafc0aa636a3d306e29771d5440c4aa4',1,'yask::yk_solution::get_name()'],['../classyask_1_1yk__var.html#a4dd64a662a1711fe5613e58773ea0fba',1,'yask::yk_var::get_name()']]], ['get_5fnum_5fdims',['get_num_dims',['../classyask_1_1yc__var.html#a7fd5309c762a7ee5450ca4f540b894d1',1,'yask::yc_var::get_num_dims()'],['../classyask_1_1yk__var.html#aa6c3bbc2bea32b76a9dda612fa91f0f4',1,'yask::yk_var::get_num_dims()']]], - 
['get_5fnum_5fdomain_5fdims',['get_num_domain_dims',['../classyask_1_1yk__solution.html#a40a13017d8c3a599d00b99561405dd3c',1,'yask::yk_solution']]], + ['get_5fnum_5fdomain_5fdims',['get_num_domain_dims',['../classyask_1_1yk__solution.html#a40a13017d8c3a599d00b99561405dd3c',1,'yask::yk_solution::get_num_domain_dims()'],['../classyask_1_1yk__var.html#afad8c52ff4cd03e67a097525baeaa56d',1,'yask::yk_var::get_num_domain_dims()']]], ['get_5fnum_5felements',['get_num_elements',['../classyask_1_1yk__stats.html#a371b5222182cdae51184da17db92055e',1,'yask::yk_stats']]], ['get_5fnum_5fequations',['get_num_equations',['../classyask_1_1yc__solution.html#a2c0d2dad6b123d0a1444e2f2a74d1ad4',1,'yask::yc_solution']]], - ['get_5fnum_5fgrids',['get_num_grids',['../classyask_1_1yc__solution.html#ace7bc36f2c4e20e8fef6a99ef0964f13',1,'yask::yc_solution::get_num_grids()'],['../classyask_1_1yk__solution.html#ab36a4f206704861958ac280e774f7e03',1,'yask::yk_solution::get_num_grids()']]], + ['get_5fnum_5fgrids',['get_num_grids',['../classyask_1_1yc__solution.html#a246a87a4165aa119cc3a0761afa4ffbf',1,'yask::yc_solution::get_num_grids()'],['../classyask_1_1yk__solution.html#a4a510dff4e18c716baeec707affd0124',1,'yask::yk_solution::get_num_grids()']]], ['get_5fnum_5fnodes',['get_num_nodes',['../classyask_1_1yc__expr__node.html#a7f746b02ea0de618dcef9ce0e124e321',1,'yask::yc_expr_node']]], ['get_5fnum_5foperands',['get_num_operands',['../classyask_1_1yc__commutative__number__node.html#a8c51f0a10eb6039d35f2cc69514c2bbf',1,'yask::yc_commutative_number_node']]], ['get_5fnum_5franks',['get_num_ranks',['../classyask_1_1yk__env.html#add8b12cd6d10f964665a41acedbb9b14',1,'yask::yk_env::get_num_ranks()'],['../classyask_1_1yk__solution.html#a4449fe8902881c9a61ad12fd20a5a866',1,'yask::yk_solution::get_num_ranks()']]], + ['get_5fnum_5franks_5fvec',['get_num_ranks_vec',['../classyask_1_1yk__solution.html#a743acbc4b86f9a65f3fcef1a35fee2d7',1,'yask::yk_solution']]], 
['get_5fnum_5fsteps_5fdone',['get_num_steps_done',['../classyask_1_1yk__stats.html#a5492de5b4904008ea60bec09df1dc630',1,'yask::yk_stats']]], ['get_5fnum_5fstorage_5fbytes',['get_num_storage_bytes',['../classyask_1_1yk__var.html#a4741fd9f7413d5ad634c755a52ffc6b3',1,'yask::yk_var']]], ['get_5fnum_5fstorage_5felements',['get_num_storage_elements',['../classyask_1_1yk__var.html#a73864906531e75762a4d0829b65cd997',1,'yask::yk_var']]], @@ -61,13 +68,14 @@ var searchData= ['get_5foperands',['get_operands',['../classyask_1_1yc__commutative__number__node.html#a026b2ecfa4483af95f572f58f9580e34',1,'yask::yc_commutative_number_node']]], ['get_5fostream',['get_ostream',['../classyask_1_1yask__output.html#a8017a86fd806f0f23dde7a70b77b5f43',1,'yask::yask_output']]], ['get_5foverall_5fdomain_5fsize',['get_overall_domain_size',['../classyask_1_1yk__solution.html#aaba39cb06c9f61d408695009667fe8cd',1,'yask::yk_solution']]], - ['get_5fpad_5fsize',['get_pad_size',['../classyask_1_1yk__var.html#a77d5f163315b4eb78dee290b350829be',1,'yask::yk_var']]], + ['get_5foverall_5fdomain_5fsize_5fvec',['get_overall_domain_size_vec',['../classyask_1_1yk__solution.html#aa141292fbfb8fcd075ea7192b03b6c43',1,'yask::yk_solution']]], ['get_5fprefetch_5fdist',['get_prefetch_dist',['../classyask_1_1yc__solution.html#abe595d131a70b3ef881e4e2ab35b47d6',1,'yask::yc_solution']]], ['get_5fradius',['get_radius',['../classyask_1_1yc__solution__with__radius__base.html#a59c519837c701c0043d41c0fdec1cf78',1,'yask::yc_solution_with_radius_base']]], ['get_5frank_5fdomain_5fsize',['get_rank_domain_size',['../classyask_1_1yk__solution.html#a56ba31268cb0098b64e4503c8996300f',1,'yask::yk_solution::get_rank_domain_size()'],['../classyask_1_1yk__var.html#a3eb132fe3e1f813a0c22b1366e5d55ad',1,'yask::yk_var::get_rank_domain_size()']]], + 
['get_5frank_5fdomain_5fsize_5fvec',['get_rank_domain_size_vec',['../classyask_1_1yk__solution.html#a50034edeb397bb0285ea4363178de803',1,'yask::yk_solution::get_rank_domain_size_vec()'],['../classyask_1_1yk__var.html#a2f58d9368265fd6f9382ba12f876f0d3',1,'yask::yk_var::get_rank_domain_size_vec()']]], ['get_5frank_5findex',['get_rank_index',['../classyask_1_1yk__env.html#a93deb7b62612742f5a85fd8e319df38c',1,'yask::yk_env::get_rank_index()'],['../classyask_1_1yk__solution.html#a3268e5e7e2f0e45f951f1c38063bc59f',1,'yask::yk_solution::get_rank_index()']]], + ['get_5frank_5findex_5fvec',['get_rank_index_vec',['../classyask_1_1yk__solution.html#a64ce6c3bb5a4a467e6a23e4aa619881d',1,'yask::yk_solution']]], ['get_5fraw_5fstorage_5fbuffer',['get_raw_storage_buffer',['../classyask_1_1yk__var.html#a4a4e42c4cfc5b3b6a5d998611d0d8602',1,'yask::yk_var']]], - ['get_5fregion_5fsize',['get_region_size',['../classyask_1_1yk__solution.html#ac048a52035e60e2cc4e070da0e68181e',1,'yask::yk_solution']]], ['get_5fregistry',['get_registry',['../classyask_1_1yc__solution__base.html#a2b30d03733943c69f5c25b04d43efa84',1,'yask::yc_solution_base']]], ['get_5frhs',['get_rhs',['../classyask_1_1yc__equation__node.html#a0a3f60078eeeb5228b7b52457a717045',1,'yask::yc_equation_node::get_rhs()'],['../classyask_1_1yc__negate__node.html#abf3f9acad28e9a009ea0fa891371982f',1,'yask::yc_negate_node::get_rhs()'],['../classyask_1_1yc__binary__number__node.html#abb59a235acbd54494c566941dd462b7d',1,'yask::yc_binary_number_node::get_rhs()'],['../classyask_1_1yc__not__node.html#ab514f62621a73abca0bb407174db9f08',1,'yask::yc_not_node::get_rhs()'],['../classyask_1_1yc__binary__bool__node.html#a5a2954e82488da70de6781b67acdf8ce',1,'yask::yc_binary_bool_node::get_rhs()'],['../classyask_1_1yc__binary__comparison__node.html#af7a304da70447a51df07e72387eeeb3f',1,'yask::yc_binary_comparison_node::get_rhs()']]], 
['get_5fright_5fextra_5fpad_5fsize',['get_right_extra_pad_size',['../classyask_1_1yk__var.html#afe6845890c3f22bf4614f1f57f414501',1,'yask::yk_var']]], diff --git a/docs/api/html/search/all_9.js b/docs/api/html/search/all_9.js index 1142aa02..c55cea4b 100644 --- a/docs/api/html/search/all_9.js +++ b/docs/api/html/search/all_9.js @@ -1,6 +1,8 @@ var searchData= [ - ['idx_5ft',['idx_t',['../group__yask.html#ga3820f8c6b5f6a92c0df31746b7d2891b',1,'yask']]], + ['idx_5ft',['idx_t',['../group__yask.html#ga1dd7066686ff93559a0f28979be12d81',1,'yask']]], + ['idx_5ft_5finit_5flist',['idx_t_init_list',['../group__yask.html#ga9a86862ece2cddc8fb77fac850c44161',1,'yask']]], + ['idx_5ft_5fvec',['idx_t_vec',['../group__yask.html#gab23959584aacc15a84d1eca058036d09',1,'yask']]], ['if_5fdomain',['IF_DOMAIN',['../group__yc.html#gad70f52c39248d0415c28675bfb166689',1,'yc_node_api.hpp']]], ['if_5fstep',['IF_STEP',['../group__yc.html#ga5fabd22dd6b26cd147074d603cb6f367',1,'yc_node_api.hpp']]], ['is_5fauto_5ftuner_5fenabled',['is_auto_tuner_enabled',['../classyask_1_1yk__solution.html#a0f7eea6bfc844adcc584c2ca5c1f2ba4',1,'yask::yk_solution']]], @@ -8,10 +10,11 @@ var searchData= ['is_5fdependency_5fchecker_5fenabled',['is_dependency_checker_enabled',['../classyask_1_1yc__solution.html#a8b4a7ac8cc9e6be09b115a106bac769b',1,'yask::yc_solution']]], ['is_5fdim_5fused',['is_dim_used',['../classyask_1_1yk__var.html#ace776bc0e51b07e940b23ebbce7a2232',1,'yask::yk_var']]], ['is_5fdynamic_5fstep_5falloc',['is_dynamic_step_alloc',['../classyask_1_1yc__var.html#a59099b1ba72b889e386a2f48912eef0a',1,'yask::yc_var::is_dynamic_step_alloc()'],['../classyask_1_1yk__var.html#a9c4783735b2f5b72c659f1572c44598f',1,'yask::yk_var::is_dynamic_step_alloc()']]], - ['is_5felement_5fallocated',['is_element_allocated',['../classyask_1_1yk__var.html#ad84cc59ae2c7f11c505c99e5955a60a9',1,'yask::yk_var::is_element_allocated(const std::vector< idx_t > &indices) 
const'],['../classyask_1_1yk__var.html#a49fcc4e673af91318403f804d5bd9b25',1,'yask::yk_var::is_element_allocated(const std::initializer_list< idx_t > &indices) const']]], ['is_5ffixed_5fsize',['is_fixed_size',['../classyask_1_1yk__var.html#a420e0dde2114ba663069b16d579072eb',1,'yask::yk_var']]], ['is_5ffolding_5fset',['is_folding_set',['../classyask_1_1yc__solution.html#abaefa9675e9551ec10b9eee0016a0822',1,'yask::yc_solution']]], + ['is_5foffloaded',['is_offloaded',['../classyask_1_1yk__solution.html#a2978fb8fd701fe35ae8a8b19e23e5544',1,'yask::yk_solution']]], ['is_5fstorage_5fallocated',['is_storage_allocated',['../classyask_1_1yk__var.html#a3b09855a3cfa7e43c32fc43a24503340',1,'yask::yk_var']]], ['is_5fstorage_5flayout_5fidentical',['is_storage_layout_identical',['../classyask_1_1yk__var.html#a3611ad6d130cb5ee7e030e6ad0c24f5b',1,'yask::yk_var']]], - ['is_5ftarget_5fset',['is_target_set',['../classyask_1_1yc__solution.html#abc13ff80f75eac42bb7300c24246f78c',1,'yask::yc_solution']]] + ['is_5ftarget_5fset',['is_target_set',['../classyask_1_1yc__solution.html#abc13ff80f75eac42bb7300c24246f78c',1,'yask::yc_solution']]], + ['is_5ftrace_5fenabled',['is_trace_enabled',['../classyask_1_1yk__env.html#abe162669b192b8c650ee7dbfbd62bb7d',1,'yask::yk_env']]] ]; diff --git a/docs/api/html/search/all_b.js b/docs/api/html/search/all_b.js index ee775c43..256b6558 100644 --- a/docs/api/html/search/all_b.js +++ b/docs/api/html/search/all_b.js @@ -10,11 +10,11 @@ var searchData= ['new_5fequation_5fnode',['new_equation_node',['../classyask_1_1yc__node__factory.html#a65838f8b97438cf4841644cff88dfb14',1,'yask::yc_node_factory']]], ['new_5ffile_5foutput',['new_file_output',['../classyask_1_1yask__output__factory.html#a25d64e5f5834fe353f58a7d8b533bcf2',1,'yask::yask_output_factory']]], ['new_5ffirst_5fdomain_5findex',['new_first_domain_index',['../classyask_1_1yc__node__factory.html#a20988bc2d3185873e890bec353687d45',1,'yask::yc_node_factory']]], - 
['new_5ffixed_5fsize_5fgrid',['new_fixed_size_grid',['../classyask_1_1yk__solution.html#a5c86f62c9b5ed883da0642e83095bb6d',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)'],['../classyask_1_1yk__solution.html#ab746b975b9929508188aee45b35ba176',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const std::vector< idx_t > &dim_sizes)']]], - ['new_5ffixed_5fsize_5fvar',['new_fixed_size_var',['../classyask_1_1yk__solution.html#a70c44ba3941aa5682e381c7c36dcf453',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)=0'],['../classyask_1_1yk__solution.html#a0b682d609950a4d9b9d8fb497aad9971',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, const std::initializer_list< std::string > &dims, const std::initializer_list< idx_t > &dim_sizes)=0']]], + ['new_5ffixed_5fsize_5fgrid',['new_fixed_size_grid',['../classyask_1_1yk__solution.html#a86f50f2b0851eb969322956258d459d0',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)'],['../classyask_1_1yk__solution.html#a106ecadbbd0a7fa2ddb8ce6b14e14451',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_vec &dim_sizes)']]], + ['new_5ffixed_5fsize_5fvar',['new_fixed_size_var',['../classyask_1_1yk__solution.html#a2899b81d090c4a2f468ccc31adfa9d85',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)=0'],['../classyask_1_1yk__solution.html#a8cff251e3f20d961e0a11752857dd28a',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_init_list &dim_sizes)=0']]], 
['new_5fgreater_5fthan_5fnode',['new_greater_than_node',['../classyask_1_1yc__node__factory.html#ab917cf34e4a230e090d8fdae04796037',1,'yask::yc_node_factory']]], - ['new_5fgrid',['new_grid',['../classyask_1_1yc__solution.html#a9cbb303e06a3db1835a1bed41820474e',1,'yask::yc_solution::new_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#a134cbf4957a7f5c34b082ca471cfb2ff',1,'yask::yc_solution::new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)'],['../classyask_1_1yk__solution.html#a1f74cc62244dc8e3e893160b1a5a76d4',1,'yask::yk_solution::new_grid(const std::string &name, const std::vector< std::string > &dims)'],['../classyask_1_1yk__solution.html#a4c229f530c340ca9d7ad37418fe3b6fd',1,'yask::yk_solution::new_grid(const std::string &name, const std::initializer_list< std::string > &dims)']]], - ['new_5fgrid_5fpoint',['new_grid_point',['../classyask_1_1yc__var.html#a5034bae85ce45db6aed4f72a72a12cdd',1,'yask::yc_var::new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)'],['../classyask_1_1yc__var.html#a315854cf3483982b29c51937956a2188',1,'yask::yc_var::new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)']]], + ['new_5fgrid',['new_grid',['../classyask_1_1yc__solution.html#a99a140813b38f5128b730c2831db9fb4',1,'yask::yc_solution::new_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#a233eccb68505d3900b1199493fc74796',1,'yask::yc_solution::new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)'],['../classyask_1_1yk__solution.html#ac5b391babd6897c314f5ba5e3c0b2605',1,'yask::yk_solution::new_grid(const std::string &name, const string_vec &dims)'],['../classyask_1_1yk__solution.html#a30606c931e4b30a1d4d1b515dc4c5926',1,'yask::yk_solution::new_grid(const std::string &name, const std::initializer_list< std::string > &dims)']]], + 
['new_5fgrid_5fpoint',['new_grid_point',['../classyask_1_1yc__var.html#aa9dcbc42cd74571ef124f9801a177e18',1,'yask::yc_var::new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)'],['../classyask_1_1yc__var.html#a33df3a69c0a880009a764dd6b9ae04b0',1,'yask::yc_var::new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)']]], ['new_5flast_5fdomain_5findex',['new_last_domain_index',['../classyask_1_1yc__node__factory.html#a8ec2bb0a9c5db26467185f876c73febf',1,'yask::yc_node_factory']]], ['new_5fless_5fthan_5fnode',['new_less_than_node',['../classyask_1_1yc__node__factory.html#af5fcf62243eee64f4d8e06224b2e6de7',1,'yask::yc_node_factory']]], ['new_5fmisc_5findex',['new_misc_index',['../classyask_1_1yc__node__factory.html#aef5fed8db0e1798b421c4a8cb8da77ff',1,'yask::yc_node_factory::new_misc_index()'],['../classyask_1_1yc__solution__base.html#a254355f82c0bbaf2f78f6d38a196dcf3',1,'yask::yc_solution_base::new_misc_index()']]], @@ -28,15 +28,15 @@ var searchData= ['new_5fnull_5foutput',['new_null_output',['../classyask_1_1yask__output__factory.html#ab0bfefeb356653f097800f17fa659399',1,'yask::yask_output_factory']]], ['new_5fnumber_5fnode',['new_number_node',['../classyask_1_1yc__node__factory.html#ad7ad1075e359ddf1100ec25432b869b3',1,'yask::yc_node_factory::new_number_node()'],['../classyask_1_1yc__solution__base.html#aa807cfa83dd78deda5d32249acecbe78',1,'yask::yc_solution_base::new_number_node()']]], ['new_5for_5fnode',['new_or_node',['../classyask_1_1yc__node__factory.html#a73b4735896225d361d2a7c450226162d',1,'yask::yc_node_factory']]], - ['new_5frelative_5fgrid_5fpoint',['new_relative_grid_point',['../classyask_1_1yc__var.html#adb5d005f9d7e7bfb453b83b528f4fa8a',1,'yask::yc_var::new_relative_grid_point(const std::vector< int > &dim_offsets)'],['../classyask_1_1yc__var.html#affdab8bb4a9772a98da29aeffab05170',1,'yask::yc_var::new_relative_grid_point(const std::initializer_list< int > &dim_offsets)']]], - 
['new_5frelative_5fvar_5fpoint',['new_relative_var_point',['../classyask_1_1yc__var.html#a2d35bc9166438e77a4f9ff7dc2778f6b',1,'yask::yc_var::new_relative_var_point(const std::vector< int > &dim_offsets)=0'],['../classyask_1_1yc__var.html#a18912b133d220d8ea20a847abb893592',1,'yask::yc_var::new_relative_var_point(const std::initializer_list< int > &dim_offsets)=0']]], - ['new_5fscratch_5fgrid',['new_scratch_grid',['../classyask_1_1yc__solution.html#a06883d38ee1010bf6c950fce684d4a87',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#ae46884786032dab64986774ff36b6eba',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)']]], + ['new_5frelative_5fgrid_5fpoint',['new_relative_grid_point',['../classyask_1_1yc__var.html#a01803ca6d935b1d67093ee39192ecd39',1,'yask::yc_var::new_relative_grid_point(const std::vector< int > &dim_offsets)'],['../classyask_1_1yc__var.html#a69b1d05f4337b58afd9e5715663456ce',1,'yask::yc_var::new_relative_grid_point(const std::initializer_list< int > &dim_offsets)']]], + ['new_5frelative_5fvar_5fpoint',['new_relative_var_point',['../classyask_1_1yc__var.html#a08bd94bd9934eb4cec308638cfffe53d',1,'yask::yc_var::new_relative_var_point(const std::vector< int > &dim_offsets)=0'],['../classyask_1_1yc__var.html#a3d1dc10ae85f73f74203ce405618ae5e',1,'yask::yc_var::new_relative_var_point(const std::initializer_list< int > &dim_offsets)=0']]], + ['new_5fscratch_5fgrid',['new_scratch_grid',['../classyask_1_1yc__solution.html#a9e30883d0a97aa0ef5af6832f67bd863',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#ae24afb1d88e355707c2113f749445329',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)']]], 
['new_5fscratch_5fvar',['new_scratch_var',['../classyask_1_1yc__solution.html#ac025854d8d7a0e4c62753dda67ff9e39',1,'yask::yc_solution::new_scratch_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#aa3f1bd432ae6b977d8a150e319856228',1,'yask::yc_solution::new_scratch_var(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)=0']]], ['new_5fsolution',['new_solution',['../classyask_1_1yc__factory.html#adce155773c9b0d469263303919681d69',1,'yask::yc_factory::new_solution()'],['../classyask_1_1yk__factory.html#a755b1bfc0dd9bfddfe80d924a188b350',1,'yask::yk_factory::new_solution(yk_env_ptr env) const'],['../classyask_1_1yk__factory.html#a43d6b5b6a88c7e4f14e41997b22501f0',1,'yask::yk_factory::new_solution(yk_env_ptr env, const yk_solution_ptr source) const']]], ['new_5fstdout_5foutput',['new_stdout_output',['../classyask_1_1yask__output__factory.html#acf0cc704a266abe9243eaa7b8672ca94',1,'yask::yask_output_factory']]], ['new_5fstep_5findex',['new_step_index',['../classyask_1_1yc__node__factory.html#a77c772e8539b116a9f0adbdf432628a1',1,'yask::yc_node_factory::new_step_index()'],['../classyask_1_1yc__solution__base.html#acd7a84f525c48d932e662597ea6ae32e',1,'yask::yc_solution_base::new_step_index()']]], ['new_5fstring_5foutput',['new_string_output',['../classyask_1_1yask__output__factory.html#ab1ec3a602da73b8ef716c8e07b43da04',1,'yask::yask_output_factory']]], ['new_5fsubtract_5fnode',['new_subtract_node',['../classyask_1_1yc__node__factory.html#af6ec670eeb91d4f4a7b4a9221a808346',1,'yask::yc_node_factory']]], - ['new_5fvar',['new_var',['../classyask_1_1yc__solution.html#a192b0f12d3943483514e16c82c15a42b',1,'yask::yc_solution::new_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#a4daa8ae2e61c612cdb79241e43b34fcc',1,'yask::yc_solution::new_var(const std::string &name, const std::initializer_list< yc_index_node_ptr > 
&dims)=0'],['../classyask_1_1yk__solution.html#a076e1cc78d96e7cc77ec62d58e289131',1,'yask::yk_solution::new_var(const std::string &name, const std::vector< std::string > &dims)=0'],['../classyask_1_1yk__solution.html#ae2774f810be2d57a878884111cbd36e9',1,'yask::yk_solution::new_var(const std::string &name, const std::initializer_list< std::string > &dims)=0']]], + ['new_5fvar',['new_var',['../classyask_1_1yc__solution.html#a192b0f12d3943483514e16c82c15a42b',1,'yask::yc_solution::new_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#a4daa8ae2e61c612cdb79241e43b34fcc',1,'yask::yc_solution::new_var(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yk__solution.html#a75ba824977414468dd23f0a1d5f9eaf3',1,'yask::yk_solution::new_var(const std::string &name, const string_vec &dims)=0'],['../classyask_1_1yk__solution.html#ae2774f810be2d57a878884111cbd36e9',1,'yask::yk_solution::new_var(const std::string &name, const std::initializer_list< std::string > &dims)=0']]], ['new_5fvar_5fpoint',['new_var_point',['../classyask_1_1yc__var.html#aad91c7587f75392db28d7a19bb53b423',1,'yask::yc_var::new_var_point(const std::vector< yc_number_node_ptr > &index_exprs)=0'],['../classyask_1_1yc__var.html#acbb35addfd24ab805d68e7ec0e76b8b9',1,'yask::yc_var::new_var_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)=0']]] ]; diff --git a/docs/api/html/search/all_f.js b/docs/api/html/search/all_f.js index 25abe67d..14beb771 100644 --- a/docs/api/html/search/all_f.js +++ b/docs/api/html/search/all_f.js @@ -3,18 +3,19 @@ var searchData= ['set_5fall_5felements_5fsame',['set_all_elements_same',['../classyask_1_1yk__var.html#a0cbfa0153ac69dfadf0e655246ddeac2',1,'yask::yk_var']]], ['set_5falloc_5fsize',['set_alloc_size',['../classyask_1_1yk__var.html#a7bc339345cc04bb349e2f6bf586a29f1',1,'yask::yk_var']]], 
['set_5fblock_5fsize',['set_block_size',['../classyask_1_1yk__solution.html#abd3c7317bf1b397f332962d658f38839',1,'yask::yk_solution']]], + ['set_5fblock_5fsize_5fvec',['set_block_size_vec',['../classyask_1_1yk__solution.html#a191580e8eab142bbf5eeb7573546c9c9',1,'yask::yk_solution::set_block_size_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a98ae9c07fbc60e4323fa16ee4a2400b5',1,'yask::yk_solution::set_block_size_vec(const idx_t_init_list &vals)=0']]], ['set_5fcluster_5fmult',['set_cluster_mult',['../classyask_1_1yc__solution.html#a45cb1df4af6886e82f98904473873272',1,'yask::yc_solution']]], ['set_5fcond',['set_cond',['../classyask_1_1yc__equation__node.html#ac264942915dfb99fcfc9578873109bdb',1,'yask::yc_equation_node']]], - ['set_5fdebug_5foutput',['set_debug_output',['../classyask_1_1yc__solution.html#aff540803d358a5dcd304f09c522ec867',1,'yask::yc_solution::set_debug_output()'],['../classyask_1_1yk__env.html#a313519caed64266832618f0b00e8f45f',1,'yask::yk_env::set_debug_output()'],['../classyask_1_1yk__solution.html#aa315469b5836e531425836f4fc5eff1c',1,'yask::yk_solution::set_debug_output()']]], + ['set_5fdebug_5foutput',['set_debug_output',['../classyask_1_1yc__solution.html#aff540803d358a5dcd304f09c522ec867',1,'yask::yc_solution::set_debug_output()'],['../classyask_1_1yk__env.html#aa5cf0e8d885f4d9ebde6539d5246cda1',1,'yask::yk_env::set_debug_output()'],['../classyask_1_1yk__solution.html#a7e8ce77d85b54ebaf15ddf11009066c4',1,'yask::yk_solution::set_debug_output()']]], ['set_5fdefault_5fnuma_5fpreferred',['set_default_numa_preferred',['../classyask_1_1yk__solution.html#ac8bde8dfc73219cec84ad3033faabb90',1,'yask::yk_solution']]], ['set_5fdependency_5fchecker_5fenabled',['set_dependency_checker_enabled',['../classyask_1_1yc__solution.html#ac181c24ab7af945318a055cef3b52ee6',1,'yask::yc_solution']]], ['set_5fdescription',['set_description',['../classyask_1_1yc__solution.html#a3cc13f5daf402805f9b1f66996d9d6d3',1,'yask::yc_solution']]], 
['set_5fdomain_5fdims',['set_domain_dims',['../classyask_1_1yc__solution.html#a3dc810afcb4ef91c10aa5e0e7092476f',1,'yask::yc_solution::set_domain_dims(const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#ab798850bc3a6bf88322a2f39765ee831',1,'yask::yc_solution::set_domain_dims(const std::initializer_list< yc_index_node_ptr > &dims)=0']]], ['set_5fdynamic_5fstep_5falloc',['set_dynamic_step_alloc',['../classyask_1_1yc__var.html#a528a6e79000ffc8addefd7519cc58ad6',1,'yask::yc_var']]], - ['set_5felement',['set_element',['../classyask_1_1yk__var.html#afb2994485a7cb77c12df0904910053f1',1,'yask::yk_var::set_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0'],['../classyask_1_1yk__var.html#a2feb3a4e7b07f0693f4fbfd4e3e90b2e',1,'yask::yk_var::set_element(double val, const std::initializer_list< idx_t > &indices, bool strict_indices=true)=0']]], + ['set_5felement',['set_element',['../classyask_1_1yk__var.html#ac509cdab014c58033c15b5c92f4bb7d6',1,'yask::yk_var::set_element(double val, const idx_t_vec &indices, bool strict_indices=true)=0'],['../classyask_1_1yk__var.html#a42a5e0c4f28ef714f1de855ddb81877d',1,'yask::yk_var::set_element(double val, const idx_t_init_list &indices, bool strict_indices=true)=0']]], ['set_5felement_5fbytes',['set_element_bytes',['../classyask_1_1yc__solution.html#a6ce565febd97f50efae59c37d7d5ef4f',1,'yask::yc_solution']]], - ['set_5felements_5fin_5fslice',['set_elements_in_slice',['../classyask_1_1yk__var.html#a4b19f91ffed51602eb08f60f2ddb8e70',1,'yask::yk_var']]], - ['set_5felements_5fin_5fslice_5fsame',['set_elements_in_slice_same',['../classyask_1_1yk__var.html#a4744c4ec52086d33f812909660dda4db',1,'yask::yk_var']]], + ['set_5felements_5fin_5fslice',['set_elements_in_slice',['../classyask_1_1yk__var.html#a56798ab60559bd84fdc204d7255ebe46',1,'yask::yk_var']]], + 
['set_5felements_5fin_5fslice_5fsame',['set_elements_in_slice_same',['../classyask_1_1yk__var.html#ad919afb54bbde78938a3939e76df0cd8',1,'yask::yk_var']]], ['set_5ffirst_5fmisc_5findex',['set_first_misc_index',['../classyask_1_1yk__var.html#a5beae21df987bf4a93bec2ebf8a423f6',1,'yask::yk_var']]], ['set_5ffold_5flen',['set_fold_len',['../classyask_1_1yc__solution.html#a1168b5b8044e39c047d81a5fe5efc06e',1,'yask::yc_solution']]], ['set_5fhalo_5fexchange_5fl1_5fnorm',['set_halo_exchange_l1_norm',['../classyask_1_1yk__var.html#a5f65d5983b3e8f16bb20c466d6b7f027',1,'yask::yk_var']]], @@ -24,13 +25,16 @@ var searchData= ['set_5fmin_5fpad_5fsize',['set_min_pad_size',['../classyask_1_1yk__solution.html#ab3bd7e95ea13631954d92a638badfb2d',1,'yask::yk_solution::set_min_pad_size()'],['../classyask_1_1yk__var.html#a16aad88dc481991cbe83da7a55cb3799',1,'yask::yk_var::set_min_pad_size()']]], ['set_5fname',['set_name',['../classyask_1_1yc__solution.html#a1dfefccda72a3560e6664471a9ab451a',1,'yask::yc_solution']]], ['set_5fnum_5franks',['set_num_ranks',['../classyask_1_1yk__solution.html#ac4cd27d412b6fe013db58b167999a362',1,'yask::yk_solution']]], + ['set_5fnum_5franks_5fvec',['set_num_ranks_vec',['../classyask_1_1yk__solution.html#a85aebdf4bf311ed1b9d293fa4404f76e',1,'yask::yk_solution::set_num_ranks_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a7c4b880c34659d731ae133f7ae1bd273',1,'yask::yk_solution::set_num_ranks_vec(const idx_t_init_list &vals)=0']]], ['set_5fnuma_5fpreferred',['set_numa_preferred',['../classyask_1_1yk__var.html#ac68f7d6f9bbe826eed31e6dc2be01de4',1,'yask::yk_var']]], ['set_5foverall_5fdomain_5fsize',['set_overall_domain_size',['../classyask_1_1yk__solution.html#a7f9a22d8d2b760a05307e90147d18d8c',1,'yask::yk_solution']]], + ['set_5foverall_5fdomain_5fsize_5fvec',['set_overall_domain_size_vec',['../classyask_1_1yk__solution.html#a156fa79121b033516028c391db968a17',1,'yask::yk_solution::set_overall_domain_size_vec(const idx_t_vec 
&vals)=0'],['../classyask_1_1yk__solution.html#a4540f5a7990503a4f2c1a336188197eb',1,'yask::yk_solution::set_overall_domain_size_vec(const idx_t_init_list &vals)=0']]], ['set_5fprefetch_5fdist',['set_prefetch_dist',['../classyask_1_1yc__solution.html#aa849ba0ae1af7890d8f6f5c0b095ff25',1,'yask::yc_solution']]], ['set_5fradius',['set_radius',['../classyask_1_1yc__solution__with__radius__base.html#a49016a165c8f3e8d2c2b003bebddf80b',1,'yask::yc_solution_with_radius_base']]], ['set_5frank_5fdomain_5fsize',['set_rank_domain_size',['../classyask_1_1yk__solution.html#a155d8f4a38da9da11488a18cca50bae8',1,'yask::yk_solution']]], + ['set_5frank_5fdomain_5fsize_5fvec',['set_rank_domain_size_vec',['../classyask_1_1yk__solution.html#a31a653082530e1049c3030b408bc3fa1',1,'yask::yk_solution::set_rank_domain_size_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a9d75b7f78aaba21fcbe0583c79106fb7',1,'yask::yk_solution::set_rank_domain_size_vec(const idx_t_init_list &vals)=0']]], ['set_5frank_5findex',['set_rank_index',['../classyask_1_1yk__solution.html#ac0eccaa1d228d7f3408e3c2b0881f279',1,'yask::yk_solution']]], - ['set_5fregion_5fsize',['set_region_size',['../classyask_1_1yk__solution.html#a923afb4530ef5f0370f8bf55856fa040',1,'yask::yk_solution']]], + ['set_5frank_5findex_5fvec',['set_rank_index_vec',['../classyask_1_1yk__solution.html#abcdcb3024837aa125c5218a618e0db94',1,'yask::yk_solution::set_rank_index_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a921f09751c36b56509d2bf2e3d3f05a0',1,'yask::yk_solution::set_rank_index_vec(const idx_t_init_list &vals)=0']]], ['set_5fright_5fhalo_5fsize',['set_right_halo_size',['../classyask_1_1yk__var.html#a6d43cc66c47eb773c0e8aa043ec76099',1,'yask::yk_var']]], ['set_5fright_5fmin_5fpad_5fsize',['set_right_min_pad_size',['../classyask_1_1yk__var.html#a02ee27a0c922e55def701d6efffc725c',1,'yask::yk_var']]], 
['set_5fstep_5falloc_5fsize',['set_step_alloc_size',['../classyask_1_1yc__var.html#ae287dc53eb486e8dbf1a3f6ecec9cafb',1,'yask::yc_var']]], @@ -38,7 +42,8 @@ var searchData= ['set_5fstep_5fdim',['set_step_dim',['../classyask_1_1yc__solution.html#ad234ecd4964bcb57b1876be87baf57bd',1,'yask::yc_solution']]], ['set_5fstep_5fwrap',['set_step_wrap',['../classyask_1_1yk__solution.html#af9b7c6f23e3190f3958fa1843fd0cba0',1,'yask::yk_solution']]], ['set_5ftarget',['set_target',['../classyask_1_1yc__solution.html#a1c03fc69e306a700de8c5ae3973229c2',1,'yask::yc_solution']]], - ['set_5ftrace_5fenabled',['set_trace_enabled',['../classyask_1_1yk__env.html#a1705f55210c095cec5d8347bacd24c57',1,'yask::yk_env']]], + ['set_5ftrace_5fenabled',['set_trace_enabled',['../classyask_1_1yk__env.html#ac6b39c25ae59e27dcdb405a9fe24e763',1,'yask::yk_env']]], ['set_5fvalue',['set_value',['../classyask_1_1yc__const__number__node.html#a8e1cf3c96519e6f9f8729e5736d791e3',1,'yask::yc_const_number_node']]], - ['soln_5fmap',['soln_map',['../classyask_1_1yc__solution__base.html#ab5607f329a35a073145389a4f8cb06b5',1,'yask::yc_solution_base']]] + ['soln_5fmap',['soln_map',['../classyask_1_1yc__solution__base.html#ab5607f329a35a073145389a4f8cb06b5',1,'yask::yc_solution_base']]], + ['string_5fvec',['string_vec',['../group__yask.html#ga53bde373152f3af12ba9daa06007eb5f',1,'yask']]] ]; diff --git a/docs/api/html/search/defines_1.html b/docs/api/html/search/defines_1.html new file mode 100644 index 00000000..2858dbe3 --- /dev/null +++ b/docs/api/html/search/defines_1.html @@ -0,0 +1,30 @@ + + + + + + + + + +
                                                                                                  +
                                                                                                  Loading...
                                                                                                  +
                                                                                                  + +
                                                                                                  Searching...
                                                                                                  +
                                                                                                  No Matches
                                                                                                  + +
                                                                                                  + + diff --git a/docs/api/html/search/defines_1.js b/docs/api/html/search/defines_1.js new file mode 100644 index 00000000..25768a6b --- /dev/null +++ b/docs/api/html/search/defines_1.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['yask_5fdeprecated',['YASK_DEPRECATED',['../yask__common__api_8hpp.html#af7d3d837169568cf38a2efc3e7b04123',1,'yask_common_api.hpp']]], + ['yask_5fint64_5ft',['YASK_INT64_T',['../yask__common__api_8hpp.html#a39f516516145bef523f3309b72959cdb',1,'yask_common_api.hpp']]] +]; diff --git a/docs/api/html/search/functions_1.js b/docs/api/html/search/functions_1.js index 0f028439..3c32950e 100644 --- a/docs/api/html/search/functions_1.js +++ b/docs/api/html/search/functions_1.js @@ -3,8 +3,8 @@ var searchData= ['add_5fflow_5fdependency',['add_flow_dependency',['../classyask_1_1yc__solution.html#a727a91bb87e42de9822ac6540e3fc93e',1,'yask::yc_solution']]], ['add_5fmessage',['add_message',['../classyask_1_1yask__exception.html#aff4d4707f040fe2876c8e5d2fbfd74a7',1,'yask::yask_exception']]], ['add_5foperand',['add_operand',['../classyask_1_1yc__commutative__number__node.html#a560e25d93eb1ee672e4fdbb40db31f21',1,'yask::yc_commutative_number_node']]], - ['add_5fto_5felement',['add_to_element',['../classyask_1_1yk__var.html#ad1e93677a7b8070501c9d569a8404714',1,'yask::yk_var::add_to_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0'],['../classyask_1_1yk__var.html#ac93056d8c105cf33eebce78e0f054b5f',1,'yask::yk_var::add_to_element(double val, const std::initializer_list< idx_t > &indices, bool strict_indices=true)=0']]], + ['add_5fto_5felement',['add_to_element',['../classyask_1_1yk__var.html#aac79a05181b3eee1031e27cfc0d2c145',1,'yask::yk_var::add_to_element(double val, const idx_t_vec &indices, bool strict_indices=true)=0'],['../classyask_1_1yk__var.html#adf4832584daca2f5139b8a8ba93bcf6a',1,'yask::yk_var::add_to_element(double 
val, const idx_t_init_list &indices, bool strict_indices=true)=0']]], ['alloc_5fstorage',['alloc_storage',['../classyask_1_1yk__var.html#aa3b479a98b425c3a8d504145972198e0',1,'yask::yk_var']]], - ['apply_5fcommand_5fline_5foptions',['apply_command_line_options',['../classyask_1_1yk__solution.html#ac111abbade055c4923cd0044360ec3b7',1,'yask::yk_solution::apply_command_line_options(const std::string &args)=0'],['../classyask_1_1yk__solution.html#ad0947c4ad4ed06d8a7d5058daecd5dc7',1,'yask::yk_solution::apply_command_line_options(int argc, char *argv[])=0'],['../classyask_1_1yk__solution.html#a6c128cbf16d4d4ab36c30654cd0e9818',1,'yask::yk_solution::apply_command_line_options(const std::vector< std::string > &args)=0']]], - ['are_5findices_5flocal',['are_indices_local',['../classyask_1_1yk__var.html#a81a6970b7812b2c5c4bdb71cd6b1384d',1,'yask::yk_var::are_indices_local(const std::vector< idx_t > &indices) const =0'],['../classyask_1_1yk__var.html#a7993629f98bbacee174f793e18eff1b5',1,'yask::yk_var::are_indices_local(const std::initializer_list< idx_t > &indices) const =0']]] + ['apply_5fcommand_5fline_5foptions',['apply_command_line_options',['../classyask_1_1yk__solution.html#ac111abbade055c4923cd0044360ec3b7',1,'yask::yk_solution::apply_command_line_options(const std::string &args)=0'],['../classyask_1_1yk__solution.html#ad0947c4ad4ed06d8a7d5058daecd5dc7',1,'yask::yk_solution::apply_command_line_options(int argc, char *argv[])=0'],['../classyask_1_1yk__solution.html#a550b24bc0f81de69619ba3029ca79e7f',1,'yask::yk_solution::apply_command_line_options(const string_vec &args)=0']]], + ['are_5findices_5flocal',['are_indices_local',['../classyask_1_1yk__var.html#a3cc808533b7c6e34614409d34bec1a86',1,'yask::yk_var::are_indices_local(const idx_t_vec &indices) const =0'],['../classyask_1_1yk__var.html#a923c12a1a7b80698c09f2447828416ed',1,'yask::yk_var::are_indices_local(const idx_t_init_list &indices) const =0']]] ]; diff --git a/docs/api/html/search/functions_3.js 
b/docs/api/html/search/functions_3.js index 6ca0a3b7..123e31c7 100644 --- a/docs/api/html/search/functions_3.js +++ b/docs/api/html/search/functions_3.js @@ -10,5 +10,7 @@ var searchData= ['clear_5fdependencies',['clear_dependencies',['../classyask_1_1yc__solution.html#a42cd08d7a26c93d5073134f3b76dcc38',1,'yask::yc_solution']]], ['clear_5ffolding',['clear_folding',['../classyask_1_1yc__solution.html#afaf489e67ed8cc753e999b1495dd4dde',1,'yask::yc_solution']]], ['clone_5fast',['clone_ast',['../classyask_1_1yc__equation__node.html#a02121980dc7dcae2a18b38340579e8ca',1,'yask::yc_equation_node::clone_ast()'],['../classyask_1_1yc__number__node.html#a85093ab8031538e55c2213aacb843faf',1,'yask::yc_number_node::clone_ast()'],['../classyask_1_1yc__bool__node.html#a724e70472b59661feb96ea3a53cab8c7',1,'yask::yc_bool_node::clone_ast()']]], - ['close',['close',['../classyask_1_1yask__file__output.html#ad05306df06c5965659eda39ddfeb0d38',1,'yask::yask_file_output']]] + ['close',['close',['../classyask_1_1yask__file__output.html#ad05306df06c5965659eda39ddfeb0d38',1,'yask::yask_file_output']]], + ['copy_5fvars_5ffrom_5fdevice',['copy_vars_from_device',['../classyask_1_1yk__solution.html#a105c993241498d9c2a98cbec353fc61a',1,'yask::yk_solution']]], + ['copy_5fvars_5fto_5fdevice',['copy_vars_to_device',['../classyask_1_1yk__solution.html#a2cdea230253b47bd16c1a0c326a78df8',1,'yask::yk_solution']]] ]; diff --git a/docs/api/html/search/functions_4.js b/docs/api/html/search/functions_4.js index f4437f8d..fe99757c 100644 --- a/docs/api/html/search/functions_4.js +++ b/docs/api/html/search/functions_4.js @@ -1,5 +1,6 @@ var searchData= [ ['define',['define',['../classyask_1_1yc__solution__base.html#abd34ca7ae7a89fc4a051376a612f494a',1,'yask::yc_solution_base::define()'],['../classyask_1_1yc__solution__with__radius__base.html#a883c31f71b3b2876d4c115ca4f3d926d',1,'yask::yc_solution_with_radius_base::define()']]], + 
['disable_5fdebug_5foutput',['disable_debug_output',['../classyask_1_1yk__env.html#ab14fa168dc78346ac61b54c9a509099b',1,'yask::yk_env']]], ['discard',['discard',['../classyask_1_1yask__string__output.html#a86fdebb6dbf89c75d306a2c88166943b',1,'yask::yask_string_output']]] ]; diff --git a/docs/api/html/search/functions_6.js b/docs/api/html/search/functions_6.js index 75e34423..8a6cb6d5 100644 --- a/docs/api/html/search/functions_6.js +++ b/docs/api/html/search/functions_6.js @@ -1,9 +1,9 @@ var searchData= [ ['first_5fdomain_5findex',['first_domain_index',['../classyask_1_1yc__solution__base.html#a56f644e7e6b4e96619245f217b3763b5',1,'yask::yc_solution_base']]], - ['format',['format',['../classyask_1_1yc__solution.html#a8bedbd4e6834f58c8e5d0452638ee64c',1,'yask::yc_solution']]], - ['format_5findices',['format_indices',['../classyask_1_1yk__var.html#a16aca4721fceade611c11d2d11472162',1,'yask::yk_var::format_indices(const std::vector< idx_t > &indices) const =0'],['../classyask_1_1yk__var.html#a0b705bdba753e24e12de0297bb30e133',1,'yask::yk_var::format_indices(const std::initializer_list< idx_t > &indices) const =0']]], + ['format',['format',['../classyask_1_1yc__solution.html#a5f3624ded964c465724fd0221d5a2aaa',1,'yask::yc_solution']]], + ['format_5findices',['format_indices',['../classyask_1_1yk__var.html#a55d6c585b8384881065c01bb067c96dd',1,'yask::yk_var::format_indices(const idx_t_vec &indices) const =0'],['../classyask_1_1yk__var.html#ae929dea5359d07e7541abab78ca7b139',1,'yask::yk_var::format_indices(const idx_t_init_list &indices) const =0']]], ['format_5fsimple',['format_simple',['../classyask_1_1yc__expr__node.html#a1af7948d0c2a977ed20c0b2d1d561052',1,'yask::yc_expr_node']]], - ['fuse_5fgrids',['fuse_grids',['../classyask_1_1yk__solution.html#a2c600b5537983349a50462a90dc73291',1,'yask::yk_solution::fuse_grids()'],['../classyask_1_1yk__var.html#a312eac09604272384823123d7219432a',1,'yask::yk_var::fuse_grids()']]], + 
['fuse_5fgrids',['fuse_grids',['../classyask_1_1yk__solution.html#ac20126eb21acf5e61ac2c94d823a34e1',1,'yask::yk_solution']]], ['fuse_5fvars',['fuse_vars',['../classyask_1_1yk__solution.html#a563794842445fcd96d77b463f674a60b',1,'yask::yk_solution::fuse_vars()'],['../classyask_1_1yk__var.html#aa095607d5493fcba38cb332053155b7f',1,'yask::yk_var::fuse_vars()']]] ]; diff --git a/docs/api/html/search/functions_7.js b/docs/api/html/search/functions_7.js index 42bbb6b0..d7ac0f0a 100644 --- a/docs/api/html/search/functions_7.js +++ b/docs/api/html/search/functions_7.js @@ -1,40 +1,46 @@ var searchData= [ ['get_5falloc_5fsize',['get_alloc_size',['../classyask_1_1yk__var.html#a1934db25d379b5ae2366e01a88a2c867',1,'yask::yk_var']]], + ['get_5falloc_5fsize_5fvec',['get_alloc_size_vec',['../classyask_1_1yk__var.html#af47cbeb07ca7728013e71f31bc00281d',1,'yask::yk_var']]], ['get_5farbitrary_5ffd_5fcoefficients',['get_arbitrary_fd_coefficients',['../group__yask.html#ga67e901ad7dd62a3eac164ceed0c46787',1,'yask']]], ['get_5fbackward_5ffd_5fcoefficients',['get_backward_fd_coefficients',['../group__yask.html#ga31c7a4d960e620b75944a40ffe0ff53b',1,'yask']]], ['get_5fblock_5fsize',['get_block_size',['../classyask_1_1yk__solution.html#a601aeebc023d430a311788c3ce73c190',1,'yask::yk_solution']]], + ['get_5fblock_5fsize_5fvec',['get_block_size_vec',['../classyask_1_1yk__solution.html#a75d10347e75c1e01e9592a3cb0fc42a0',1,'yask::yk_solution']]], ['get_5fcenter_5ffd_5fcoefficients',['get_center_fd_coefficients',['../group__yask.html#ga11d0759a323784806d1a30284a83621c',1,'yask']]], ['get_5fcond',['get_cond',['../classyask_1_1yc__equation__node.html#a09eabc1f5854bb4a5b50a715368d6d0f',1,'yask::yc_equation_node']]], - ['get_5fdebug_5foutput',['get_debug_output',['../classyask_1_1yk__env.html#abc475dee318fbe26cad93cc205c90de4',1,'yask::yk_env']]], + ['get_5fdebug_5foutput',['get_debug_output',['../classyask_1_1yk__env.html#a69eac2bb8a841f6259697a7dcc8cf386',1,'yask::yk_env']]], 
['get_5fdefault_5fnuma_5fpreferred',['get_default_numa_preferred',['../classyask_1_1yk__solution.html#a05c98a1d8b03d1009ef67b84b2f0bea0',1,'yask::yk_solution']]], ['get_5fdescription',['get_description',['../classyask_1_1yc__solution.html#a8113e505343c5f2598811669f767930c',1,'yask::yc_solution']]], - ['get_5fdim_5fnames',['get_dim_names',['../classyask_1_1yc__var.html#a163ecf734878dd2495cd51a1f479a94d',1,'yask::yc_var::get_dim_names()'],['../classyask_1_1yk__var.html#a9793866823ba93e4d2f284ffeeab92c8',1,'yask::yk_var::get_dim_names()']]], - ['get_5fdomain_5fdim_5fnames',['get_domain_dim_names',['../classyask_1_1yk__solution.html#a665968c18b2f70467dbc7decaeeb1a1f',1,'yask::yk_solution']]], + ['get_5fdim_5fnames',['get_dim_names',['../classyask_1_1yc__var.html#a1cdca3a698cd9029f659fd7b11d89305',1,'yask::yc_var::get_dim_names()'],['../classyask_1_1yk__var.html#adf2e93317a2e86d80713ad56a46fdc68',1,'yask::yk_var::get_dim_names()']]], + ['get_5fdomain_5fdim_5fnames',['get_domain_dim_names',['../classyask_1_1yk__solution.html#a24c2619b5f0471bcfe8eeb5aed769fec',1,'yask::yk_solution']]], ['get_5felapsed_5fsecs',['get_elapsed_secs',['../classyask_1_1yk__stats.html#a8bcf045fdfde4d9120084902f4d31725',1,'yask::yk_stats']]], - ['get_5felement',['get_element',['../classyask_1_1yk__var.html#ae6e5f47af6672d9e9e02e4bdff59d8ce',1,'yask::yk_var::get_element(const std::vector< idx_t > &indices) const =0'],['../classyask_1_1yk__var.html#a01a0e195ee63bd58932adead0cd1c809',1,'yask::yk_var::get_element(const std::initializer_list< idx_t > &indices) const =0']]], + ['get_5felement',['get_element',['../classyask_1_1yk__var.html#aed2d676221d5b99be7b8b8bc2cd37af2',1,'yask::yk_var::get_element(const idx_t_vec &indices) const =0'],['../classyask_1_1yk__var.html#af49bd859b3200e8cd8a55fe55ecbde93',1,'yask::yk_var::get_element(const idx_t_init_list &indices) const =0']]], 
['get_5felement_5fbytes',['get_element_bytes',['../classyask_1_1yc__solution.html#a78551a2f7ca0a9644fa802d0806b7642',1,'yask::yc_solution::get_element_bytes()'],['../classyask_1_1yk__solution.html#a50e565487b7175447cc9f6489221eef4',1,'yask::yk_solution::get_element_bytes()']]], - ['get_5felements_5fin_5fslice',['get_elements_in_slice',['../classyask_1_1yk__var.html#a3c825ad9dfa3c9138348399bfbf548c8',1,'yask::yk_var']]], + ['get_5felements_5fin_5fslice',['get_elements_in_slice',['../classyask_1_1yk__var.html#ad33ae6d7f03ec5fb8fa31e4ad9ff7881',1,'yask::yk_var']]], ['get_5fequations',['get_equations',['../classyask_1_1yc__solution.html#a8257de64334bd95fcaca64719653fd1d',1,'yask::yc_solution']]], ['get_5fest_5ffp_5fops_5fdone',['get_est_fp_ops_done',['../classyask_1_1yk__stats.html#a2ce2e8bf959e0af0caae77bd5ae1626f',1,'yask::yk_stats']]], - ['get_5fextra_5fpad_5fsize',['get_extra_pad_size',['../classyask_1_1yk__var.html#a3a7e4eeb67a187fede214956902b1486',1,'yask::yk_var']]], ['get_5ffilename',['get_filename',['../classyask_1_1yask__file__output.html#a370fcde900fe4cebf04741bde16f59d4',1,'yask::yask_file_output']]], ['get_5ffirst_5flocal_5findex',['get_first_local_index',['../classyask_1_1yk__var.html#aaf5b3f06b832bbf77b8f722fda8a2998',1,'yask::yk_var']]], + ['get_5ffirst_5flocal_5findex_5fvec',['get_first_local_index_vec',['../classyask_1_1yk__var.html#a3b7117c1479a41c6d5e3b37fbc2309f3',1,'yask::yk_var']]], ['get_5ffirst_5fmisc_5findex',['get_first_misc_index',['../classyask_1_1yk__var.html#a9faab903ed2467e46ed0b5ea43a9e1e4',1,'yask::yk_var']]], - ['get_5ffirst_5frank_5falloc_5findex',['get_first_rank_alloc_index',['../classyask_1_1yk__var.html#adb7ed5c0f513ae7a5480a480f08165b7',1,'yask::yk_var']]], + ['get_5ffirst_5frank_5falloc_5findex',['get_first_rank_alloc_index',['../classyask_1_1yk__var.html#a010df6d40b808aa0d7fca274bbf5d2d4',1,'yask::yk_var']]], 
['get_5ffirst_5frank_5fdomain_5findex',['get_first_rank_domain_index',['../classyask_1_1yk__solution.html#a03bdef5ba9b0b0e37f9b7be2e2e457a4',1,'yask::yk_solution::get_first_rank_domain_index()'],['../classyask_1_1yk__var.html#a60d8b63ef869693d7eeb556e5254d167',1,'yask::yk_var::get_first_rank_domain_index()']]], + ['get_5ffirst_5frank_5fdomain_5findex_5fvec',['get_first_rank_domain_index_vec',['../classyask_1_1yk__solution.html#a79492e63f435ac8a93d8815a3ca4729c',1,'yask::yk_solution::get_first_rank_domain_index_vec()'],['../classyask_1_1yk__var.html#a45c2449324430b59811c1c6257cf9a5a',1,'yask::yk_var::get_first_rank_domain_index_vec()']]], ['get_5ffirst_5frank_5fhalo_5findex',['get_first_rank_halo_index',['../classyask_1_1yk__var.html#a363c78d8f7f5b40581fb0cb179058410',1,'yask::yk_var']]], + ['get_5ffirst_5frank_5fhalo_5findex_5fvec',['get_first_rank_halo_index_vec',['../classyask_1_1yk__var.html#ab0ed2e82d463cc3fae2daa395edb78f4',1,'yask::yk_var']]], ['get_5ffirst_5fvalid_5fstep_5findex',['get_first_valid_step_index',['../classyask_1_1yk__var.html#a6a02011041f38a4d2fe7899d5369511e',1,'yask::yk_var']]], ['get_5fforward_5ffd_5fcoefficients',['get_forward_fd_coefficients',['../group__yask.html#ga2cccdb7135258b002cdac134fbfa1912',1,'yask']]], - ['get_5fgrid',['get_grid',['../classyask_1_1yc__solution.html#a491edfd90571df7a705d9549d1604578',1,'yask::yc_solution::get_grid()'],['../classyask_1_1yc__var__point__node.html#aed1f22e175b0cd60a061fcd6dff807a2',1,'yask::yc_var_point_node::get_grid()'],['../classyask_1_1yk__solution.html#aadd360ab1d09bfcf603f0345793ba586',1,'yask::yk_solution::get_grid()']]], - ['get_5fgrids',['get_grids',['../classyask_1_1yc__solution.html#a150c8dec60f7c43b64d734f91756a3d6',1,'yask::yc_solution::get_grids()'],['../classyask_1_1yk__solution.html#a418199548c44f5c9d7d90305350cf34b',1,'yask::yk_solution::get_grids()']]], + 
['get_5fgrid',['get_grid',['../classyask_1_1yc__solution.html#a7f63562fa2519ad4817c2698832c0dcb',1,'yask::yc_solution::get_grid()'],['../classyask_1_1yc__var__point__node.html#a550d4f6efb8cca03b1a5cea1f05ad936',1,'yask::yc_var_point_node::get_grid()'],['../classyask_1_1yk__solution.html#a561517d815dd1004e1c39557554eee16',1,'yask::yk_solution::get_grid()']]], + ['get_5fgrids',['get_grids',['../classyask_1_1yc__solution.html#aa1d2c083951bb0b3f0d1fce606ba411d',1,'yask::yc_solution::get_grids()'],['../classyask_1_1yk__solution.html#a9a0269a914a3e4e5465a7e3643785544',1,'yask::yk_solution::get_grids()']]], ['get_5fhalo_5fexchange_5fl1_5fnorm',['get_halo_exchange_l1_norm',['../classyask_1_1yk__var.html#a2de3a290dae76a40b3c208074ebbdb89',1,'yask::yk_var']]], - ['get_5fhalo_5fsize',['get_halo_size',['../classyask_1_1yk__var.html#af8cc3a22aaeb21b82d4d77d1617ec117',1,'yask::yk_var']]], ['get_5flast_5flocal_5findex',['get_last_local_index',['../classyask_1_1yk__var.html#a7266bb36c93cb6ae538a0f081f22fad7',1,'yask::yk_var']]], + ['get_5flast_5flocal_5findex_5fvec',['get_last_local_index_vec',['../classyask_1_1yk__var.html#a00d8f5b744b8da99190764f0b0fc1cf5',1,'yask::yk_var']]], ['get_5flast_5fmisc_5findex',['get_last_misc_index',['../classyask_1_1yk__var.html#ab551b0a8749c38e8307082631cf597fa',1,'yask::yk_var']]], - ['get_5flast_5frank_5falloc_5findex',['get_last_rank_alloc_index',['../classyask_1_1yk__var.html#a2919bd783fb240a9d0f43a866bf7f551',1,'yask::yk_var']]], + ['get_5flast_5frank_5falloc_5findex',['get_last_rank_alloc_index',['../classyask_1_1yk__var.html#acb67cbd0ecea35f3f50b435e4b977c53',1,'yask::yk_var']]], ['get_5flast_5frank_5fdomain_5findex',['get_last_rank_domain_index',['../classyask_1_1yk__solution.html#a58f2ce95e150787bc8235e85298dcd01',1,'yask::yk_solution::get_last_rank_domain_index()'],['../classyask_1_1yk__var.html#aaf3e835c074a2bdd627f76483727b1dc',1,'yask::yk_var::get_last_rank_domain_index()']]], + 
['get_5flast_5frank_5fdomain_5findex_5fvec',['get_last_rank_domain_index_vec',['../classyask_1_1yk__solution.html#a9d571ca2f9e1309d3af73c9d8fe7a084',1,'yask::yk_solution::get_last_rank_domain_index_vec()'],['../classyask_1_1yk__var.html#abdfdb9d4d382fb1c4fe4966e43afb5b2',1,'yask::yk_var::get_last_rank_domain_index_vec()']]], ['get_5flast_5frank_5fhalo_5findex',['get_last_rank_halo_index',['../classyask_1_1yk__var.html#ad3d97b360b0b1e4c5785adf87384c41e',1,'yask::yk_var']]], + ['get_5flast_5frank_5fhalo_5findex_5fvec',['get_last_rank_halo_index_vec',['../classyask_1_1yk__var.html#aa735497e92513f5796af52e8ddb24353',1,'yask::yk_var']]], ['get_5flast_5fvalid_5fstep_5findex',['get_last_valid_step_index',['../classyask_1_1yk__var.html#a7b346e48cb36b0f304d3b030ac9a1512',1,'yask::yk_var']]], ['get_5fleft_5fextra_5fpad_5fsize',['get_left_extra_pad_size',['../classyask_1_1yk__var.html#ae62308be69ca643714879b0a1362de0e',1,'yask::yk_var']]], ['get_5fleft_5fhalo_5fsize',['get_left_halo_size',['../classyask_1_1yk__var.html#acd8e51e7ee79fcc55a622b3012da63bc',1,'yask::yk_var']]], @@ -42,16 +48,17 @@ var searchData= ['get_5flhs',['get_lhs',['../classyask_1_1yc__equation__node.html#a649b44dfcf32970e94d6fb95d2caafa4',1,'yask::yc_equation_node::get_lhs()'],['../classyask_1_1yc__binary__number__node.html#a99ec0c25562c4c8394b2bdaa1bc6a391',1,'yask::yc_binary_number_node::get_lhs()'],['../classyask_1_1yc__binary__bool__node.html#a46f81b0aabf774f50659f4fc0afa0356',1,'yask::yc_binary_bool_node::get_lhs()'],['../classyask_1_1yc__binary__comparison__node.html#a344a9a30a06825bd1759c5553f6276df',1,'yask::yc_binary_comparison_node::get_lhs()']]], ['get_5fmessage',['get_message',['../classyask_1_1yask__exception.html#acb717171229ec55d8fa110a7f16d9913',1,'yask::yask_exception']]], ['get_5fmin_5fpad_5fsize',['get_min_pad_size',['../classyask_1_1yk__solution.html#a193d7c6e708c1ece4d78b39800a7d5fa',1,'yask::yk_solution']]], - 
['get_5fmisc_5fdim_5fnames',['get_misc_dim_names',['../classyask_1_1yk__solution.html#af895e7366f7f224c4e6fc780b0dd7a2f',1,'yask::yk_solution']]], + ['get_5fmisc_5fdim_5fnames',['get_misc_dim_names',['../classyask_1_1yk__solution.html#a4184ac69b5e5af07e8cfc0317d6bd759',1,'yask::yk_solution']]], ['get_5fname',['get_name',['../classyask_1_1yc__solution.html#a630ebb8dc2bff24f15b5a56e46efc9f8',1,'yask::yc_solution::get_name()'],['../classyask_1_1yc__var.html#aef8c255b753899f77280fe65008cc5ba',1,'yask::yc_var::get_name()'],['../classyask_1_1yc__index__node.html#a37c072f91771b3b7c95708da9f152c33',1,'yask::yc_index_node::get_name()'],['../classyask_1_1yk__solution.html#aaafc0aa636a3d306e29771d5440c4aa4',1,'yask::yk_solution::get_name()'],['../classyask_1_1yk__var.html#a4dd64a662a1711fe5613e58773ea0fba',1,'yask::yk_var::get_name()']]], ['get_5fnum_5fdims',['get_num_dims',['../classyask_1_1yc__var.html#a7fd5309c762a7ee5450ca4f540b894d1',1,'yask::yc_var::get_num_dims()'],['../classyask_1_1yk__var.html#aa6c3bbc2bea32b76a9dda612fa91f0f4',1,'yask::yk_var::get_num_dims()']]], - ['get_5fnum_5fdomain_5fdims',['get_num_domain_dims',['../classyask_1_1yk__solution.html#a40a13017d8c3a599d00b99561405dd3c',1,'yask::yk_solution']]], + ['get_5fnum_5fdomain_5fdims',['get_num_domain_dims',['../classyask_1_1yk__solution.html#a40a13017d8c3a599d00b99561405dd3c',1,'yask::yk_solution::get_num_domain_dims()'],['../classyask_1_1yk__var.html#afad8c52ff4cd03e67a097525baeaa56d',1,'yask::yk_var::get_num_domain_dims()']]], ['get_5fnum_5felements',['get_num_elements',['../classyask_1_1yk__stats.html#a371b5222182cdae51184da17db92055e',1,'yask::yk_stats']]], ['get_5fnum_5fequations',['get_num_equations',['../classyask_1_1yc__solution.html#a2c0d2dad6b123d0a1444e2f2a74d1ad4',1,'yask::yc_solution']]], - 
['get_5fnum_5fgrids',['get_num_grids',['../classyask_1_1yc__solution.html#ace7bc36f2c4e20e8fef6a99ef0964f13',1,'yask::yc_solution::get_num_grids()'],['../classyask_1_1yk__solution.html#ab36a4f206704861958ac280e774f7e03',1,'yask::yk_solution::get_num_grids()']]], + ['get_5fnum_5fgrids',['get_num_grids',['../classyask_1_1yc__solution.html#a246a87a4165aa119cc3a0761afa4ffbf',1,'yask::yc_solution::get_num_grids()'],['../classyask_1_1yk__solution.html#a4a510dff4e18c716baeec707affd0124',1,'yask::yk_solution::get_num_grids()']]], ['get_5fnum_5fnodes',['get_num_nodes',['../classyask_1_1yc__expr__node.html#a7f746b02ea0de618dcef9ce0e124e321',1,'yask::yc_expr_node']]], ['get_5fnum_5foperands',['get_num_operands',['../classyask_1_1yc__commutative__number__node.html#a8c51f0a10eb6039d35f2cc69514c2bbf',1,'yask::yc_commutative_number_node']]], ['get_5fnum_5franks',['get_num_ranks',['../classyask_1_1yk__env.html#add8b12cd6d10f964665a41acedbb9b14',1,'yask::yk_env::get_num_ranks()'],['../classyask_1_1yk__solution.html#a4449fe8902881c9a61ad12fd20a5a866',1,'yask::yk_solution::get_num_ranks()']]], + ['get_5fnum_5franks_5fvec',['get_num_ranks_vec',['../classyask_1_1yk__solution.html#a743acbc4b86f9a65f3fcef1a35fee2d7',1,'yask::yk_solution']]], ['get_5fnum_5fsteps_5fdone',['get_num_steps_done',['../classyask_1_1yk__stats.html#a5492de5b4904008ea60bec09df1dc630',1,'yask::yk_stats']]], ['get_5fnum_5fstorage_5fbytes',['get_num_storage_bytes',['../classyask_1_1yk__var.html#a4741fd9f7413d5ad634c755a52ffc6b3',1,'yask::yk_var']]], ['get_5fnum_5fstorage_5felements',['get_num_storage_elements',['../classyask_1_1yk__var.html#a73864906531e75762a4d0829b65cd997',1,'yask::yk_var']]], @@ -61,13 +68,14 @@ var searchData= ['get_5foperands',['get_operands',['../classyask_1_1yc__commutative__number__node.html#a026b2ecfa4483af95f572f58f9580e34',1,'yask::yc_commutative_number_node']]], ['get_5fostream',['get_ostream',['../classyask_1_1yask__output.html#a8017a86fd806f0f23dde7a70b77b5f43',1,'yask::yask_output']]], 
['get_5foverall_5fdomain_5fsize',['get_overall_domain_size',['../classyask_1_1yk__solution.html#aaba39cb06c9f61d408695009667fe8cd',1,'yask::yk_solution']]], - ['get_5fpad_5fsize',['get_pad_size',['../classyask_1_1yk__var.html#a77d5f163315b4eb78dee290b350829be',1,'yask::yk_var']]], + ['get_5foverall_5fdomain_5fsize_5fvec',['get_overall_domain_size_vec',['../classyask_1_1yk__solution.html#aa141292fbfb8fcd075ea7192b03b6c43',1,'yask::yk_solution']]], ['get_5fprefetch_5fdist',['get_prefetch_dist',['../classyask_1_1yc__solution.html#abe595d131a70b3ef881e4e2ab35b47d6',1,'yask::yc_solution']]], ['get_5fradius',['get_radius',['../classyask_1_1yc__solution__with__radius__base.html#a59c519837c701c0043d41c0fdec1cf78',1,'yask::yc_solution_with_radius_base']]], ['get_5frank_5fdomain_5fsize',['get_rank_domain_size',['../classyask_1_1yk__solution.html#a56ba31268cb0098b64e4503c8996300f',1,'yask::yk_solution::get_rank_domain_size()'],['../classyask_1_1yk__var.html#a3eb132fe3e1f813a0c22b1366e5d55ad',1,'yask::yk_var::get_rank_domain_size()']]], + ['get_5frank_5fdomain_5fsize_5fvec',['get_rank_domain_size_vec',['../classyask_1_1yk__solution.html#a50034edeb397bb0285ea4363178de803',1,'yask::yk_solution::get_rank_domain_size_vec()'],['../classyask_1_1yk__var.html#a2f58d9368265fd6f9382ba12f876f0d3',1,'yask::yk_var::get_rank_domain_size_vec()']]], ['get_5frank_5findex',['get_rank_index',['../classyask_1_1yk__env.html#a93deb7b62612742f5a85fd8e319df38c',1,'yask::yk_env::get_rank_index()'],['../classyask_1_1yk__solution.html#a3268e5e7e2f0e45f951f1c38063bc59f',1,'yask::yk_solution::get_rank_index()']]], + ['get_5frank_5findex_5fvec',['get_rank_index_vec',['../classyask_1_1yk__solution.html#a64ce6c3bb5a4a467e6a23e4aa619881d',1,'yask::yk_solution']]], ['get_5fraw_5fstorage_5fbuffer',['get_raw_storage_buffer',['../classyask_1_1yk__var.html#a4a4e42c4cfc5b3b6a5d998611d0d8602',1,'yask::yk_var']]], - 
['get_5fregion_5fsize',['get_region_size',['../classyask_1_1yk__solution.html#ac048a52035e60e2cc4e070da0e68181e',1,'yask::yk_solution']]], ['get_5fregistry',['get_registry',['../classyask_1_1yc__solution__base.html#a2b30d03733943c69f5c25b04d43efa84',1,'yask::yc_solution_base']]], ['get_5frhs',['get_rhs',['../classyask_1_1yc__equation__node.html#a0a3f60078eeeb5228b7b52457a717045',1,'yask::yc_equation_node::get_rhs()'],['../classyask_1_1yc__negate__node.html#abf3f9acad28e9a009ea0fa891371982f',1,'yask::yc_negate_node::get_rhs()'],['../classyask_1_1yc__binary__number__node.html#abb59a235acbd54494c566941dd462b7d',1,'yask::yc_binary_number_node::get_rhs()'],['../classyask_1_1yc__not__node.html#ab514f62621a73abca0bb407174db9f08',1,'yask::yc_not_node::get_rhs()'],['../classyask_1_1yc__binary__bool__node.html#a5a2954e82488da70de6781b67acdf8ce',1,'yask::yc_binary_bool_node::get_rhs()'],['../classyask_1_1yc__binary__comparison__node.html#af7a304da70447a51df07e72387eeeb3f',1,'yask::yc_binary_comparison_node::get_rhs()']]], ['get_5fright_5fextra_5fpad_5fsize',['get_right_extra_pad_size',['../classyask_1_1yk__var.html#afe6845890c3f22bf4614f1f57f414501',1,'yask::yk_var']]], diff --git a/docs/api/html/search/functions_8.js b/docs/api/html/search/functions_8.js index 8f793839..e42b830e 100644 --- a/docs/api/html/search/functions_8.js +++ b/docs/api/html/search/functions_8.js @@ -5,10 +5,11 @@ var searchData= ['is_5fdependency_5fchecker_5fenabled',['is_dependency_checker_enabled',['../classyask_1_1yc__solution.html#a8b4a7ac8cc9e6be09b115a106bac769b',1,'yask::yc_solution']]], ['is_5fdim_5fused',['is_dim_used',['../classyask_1_1yk__var.html#ace776bc0e51b07e940b23ebbce7a2232',1,'yask::yk_var']]], ['is_5fdynamic_5fstep_5falloc',['is_dynamic_step_alloc',['../classyask_1_1yc__var.html#a59099b1ba72b889e386a2f48912eef0a',1,'yask::yc_var::is_dynamic_step_alloc()'],['../classyask_1_1yk__var.html#a9c4783735b2f5b72c659f1572c44598f',1,'yask::yk_var::is_dynamic_step_alloc()']]], - 
['is_5felement_5fallocated',['is_element_allocated',['../classyask_1_1yk__var.html#ad84cc59ae2c7f11c505c99e5955a60a9',1,'yask::yk_var::is_element_allocated(const std::vector< idx_t > &indices) const'],['../classyask_1_1yk__var.html#a49fcc4e673af91318403f804d5bd9b25',1,'yask::yk_var::is_element_allocated(const std::initializer_list< idx_t > &indices) const']]], ['is_5ffixed_5fsize',['is_fixed_size',['../classyask_1_1yk__var.html#a420e0dde2114ba663069b16d579072eb',1,'yask::yk_var']]], ['is_5ffolding_5fset',['is_folding_set',['../classyask_1_1yc__solution.html#abaefa9675e9551ec10b9eee0016a0822',1,'yask::yc_solution']]], + ['is_5foffloaded',['is_offloaded',['../classyask_1_1yk__solution.html#a2978fb8fd701fe35ae8a8b19e23e5544',1,'yask::yk_solution']]], ['is_5fstorage_5fallocated',['is_storage_allocated',['../classyask_1_1yk__var.html#a3b09855a3cfa7e43c32fc43a24503340',1,'yask::yk_var']]], ['is_5fstorage_5flayout_5fidentical',['is_storage_layout_identical',['../classyask_1_1yk__var.html#a3611ad6d130cb5ee7e030e6ad0c24f5b',1,'yask::yk_var']]], - ['is_5ftarget_5fset',['is_target_set',['../classyask_1_1yc__solution.html#abc13ff80f75eac42bb7300c24246f78c',1,'yask::yc_solution']]] + ['is_5ftarget_5fset',['is_target_set',['../classyask_1_1yc__solution.html#abc13ff80f75eac42bb7300c24246f78c',1,'yask::yc_solution']]], + ['is_5ftrace_5fenabled',['is_trace_enabled',['../classyask_1_1yk__env.html#abe162669b192b8c650ee7dbfbd62bb7d',1,'yask::yk_env']]] ]; diff --git a/docs/api/html/search/functions_a.js b/docs/api/html/search/functions_a.js index ee775c43..256b6558 100644 --- a/docs/api/html/search/functions_a.js +++ b/docs/api/html/search/functions_a.js @@ -10,11 +10,11 @@ var searchData= ['new_5fequation_5fnode',['new_equation_node',['../classyask_1_1yc__node__factory.html#a65838f8b97438cf4841644cff88dfb14',1,'yask::yc_node_factory']]], 
['new_5ffile_5foutput',['new_file_output',['../classyask_1_1yask__output__factory.html#a25d64e5f5834fe353f58a7d8b533bcf2',1,'yask::yask_output_factory']]], ['new_5ffirst_5fdomain_5findex',['new_first_domain_index',['../classyask_1_1yc__node__factory.html#a20988bc2d3185873e890bec353687d45',1,'yask::yc_node_factory']]], - ['new_5ffixed_5fsize_5fgrid',['new_fixed_size_grid',['../classyask_1_1yk__solution.html#a5c86f62c9b5ed883da0642e83095bb6d',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)'],['../classyask_1_1yk__solution.html#ab746b975b9929508188aee45b35ba176',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const std::vector< idx_t > &dim_sizes)']]], - ['new_5ffixed_5fsize_5fvar',['new_fixed_size_var',['../classyask_1_1yk__solution.html#a70c44ba3941aa5682e381c7c36dcf453',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)=0'],['../classyask_1_1yk__solution.html#a0b682d609950a4d9b9d8fb497aad9971',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, const std::initializer_list< std::string > &dims, const std::initializer_list< idx_t > &dim_sizes)=0']]], + ['new_5ffixed_5fsize_5fgrid',['new_fixed_size_grid',['../classyask_1_1yk__solution.html#a86f50f2b0851eb969322956258d459d0',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)'],['../classyask_1_1yk__solution.html#a106ecadbbd0a7fa2ddb8ce6b14e14451',1,'yask::yk_solution::new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_vec &dim_sizes)']]], + ['new_5ffixed_5fsize_5fvar',['new_fixed_size_var',['../classyask_1_1yk__solution.html#a2899b81d090c4a2f468ccc31adfa9d85',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, 
const string_vec &dims, const idx_t_vec &dim_sizes)=0'],['../classyask_1_1yk__solution.html#a8cff251e3f20d961e0a11752857dd28a',1,'yask::yk_solution::new_fixed_size_var(const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_init_list &dim_sizes)=0']]], ['new_5fgreater_5fthan_5fnode',['new_greater_than_node',['../classyask_1_1yc__node__factory.html#ab917cf34e4a230e090d8fdae04796037',1,'yask::yc_node_factory']]], - ['new_5fgrid',['new_grid',['../classyask_1_1yc__solution.html#a9cbb303e06a3db1835a1bed41820474e',1,'yask::yc_solution::new_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#a134cbf4957a7f5c34b082ca471cfb2ff',1,'yask::yc_solution::new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)'],['../classyask_1_1yk__solution.html#a1f74cc62244dc8e3e893160b1a5a76d4',1,'yask::yk_solution::new_grid(const std::string &name, const std::vector< std::string > &dims)'],['../classyask_1_1yk__solution.html#a4c229f530c340ca9d7ad37418fe3b6fd',1,'yask::yk_solution::new_grid(const std::string &name, const std::initializer_list< std::string > &dims)']]], - ['new_5fgrid_5fpoint',['new_grid_point',['../classyask_1_1yc__var.html#a5034bae85ce45db6aed4f72a72a12cdd',1,'yask::yc_var::new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)'],['../classyask_1_1yc__var.html#a315854cf3483982b29c51937956a2188',1,'yask::yc_var::new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)']]], + ['new_5fgrid',['new_grid',['../classyask_1_1yc__solution.html#a99a140813b38f5128b730c2831db9fb4',1,'yask::yc_solution::new_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#a233eccb68505d3900b1199493fc74796',1,'yask::yc_solution::new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > 
&dims)'],['../classyask_1_1yk__solution.html#ac5b391babd6897c314f5ba5e3c0b2605',1,'yask::yk_solution::new_grid(const std::string &name, const string_vec &dims)'],['../classyask_1_1yk__solution.html#a30606c931e4b30a1d4d1b515dc4c5926',1,'yask::yk_solution::new_grid(const std::string &name, const std::initializer_list< std::string > &dims)']]], + ['new_5fgrid_5fpoint',['new_grid_point',['../classyask_1_1yc__var.html#aa9dcbc42cd74571ef124f9801a177e18',1,'yask::yc_var::new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)'],['../classyask_1_1yc__var.html#a33df3a69c0a880009a764dd6b9ae04b0',1,'yask::yc_var::new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)']]], ['new_5flast_5fdomain_5findex',['new_last_domain_index',['../classyask_1_1yc__node__factory.html#a8ec2bb0a9c5db26467185f876c73febf',1,'yask::yc_node_factory']]], ['new_5fless_5fthan_5fnode',['new_less_than_node',['../classyask_1_1yc__node__factory.html#af5fcf62243eee64f4d8e06224b2e6de7',1,'yask::yc_node_factory']]], ['new_5fmisc_5findex',['new_misc_index',['../classyask_1_1yc__node__factory.html#aef5fed8db0e1798b421c4a8cb8da77ff',1,'yask::yc_node_factory::new_misc_index()'],['../classyask_1_1yc__solution__base.html#a254355f82c0bbaf2f78f6d38a196dcf3',1,'yask::yc_solution_base::new_misc_index()']]], @@ -28,15 +28,15 @@ var searchData= ['new_5fnull_5foutput',['new_null_output',['../classyask_1_1yask__output__factory.html#ab0bfefeb356653f097800f17fa659399',1,'yask::yask_output_factory']]], ['new_5fnumber_5fnode',['new_number_node',['../classyask_1_1yc__node__factory.html#ad7ad1075e359ddf1100ec25432b869b3',1,'yask::yc_node_factory::new_number_node()'],['../classyask_1_1yc__solution__base.html#aa807cfa83dd78deda5d32249acecbe78',1,'yask::yc_solution_base::new_number_node()']]], ['new_5for_5fnode',['new_or_node',['../classyask_1_1yc__node__factory.html#a73b4735896225d361d2a7c450226162d',1,'yask::yc_node_factory']]], - 
['new_5frelative_5fgrid_5fpoint',['new_relative_grid_point',['../classyask_1_1yc__var.html#adb5d005f9d7e7bfb453b83b528f4fa8a',1,'yask::yc_var::new_relative_grid_point(const std::vector< int > &dim_offsets)'],['../classyask_1_1yc__var.html#affdab8bb4a9772a98da29aeffab05170',1,'yask::yc_var::new_relative_grid_point(const std::initializer_list< int > &dim_offsets)']]], - ['new_5frelative_5fvar_5fpoint',['new_relative_var_point',['../classyask_1_1yc__var.html#a2d35bc9166438e77a4f9ff7dc2778f6b',1,'yask::yc_var::new_relative_var_point(const std::vector< int > &dim_offsets)=0'],['../classyask_1_1yc__var.html#a18912b133d220d8ea20a847abb893592',1,'yask::yc_var::new_relative_var_point(const std::initializer_list< int > &dim_offsets)=0']]], - ['new_5fscratch_5fgrid',['new_scratch_grid',['../classyask_1_1yc__solution.html#a06883d38ee1010bf6c950fce684d4a87',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#ae46884786032dab64986774ff36b6eba',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)']]], + ['new_5frelative_5fgrid_5fpoint',['new_relative_grid_point',['../classyask_1_1yc__var.html#a01803ca6d935b1d67093ee39192ecd39',1,'yask::yc_var::new_relative_grid_point(const std::vector< int > &dim_offsets)'],['../classyask_1_1yc__var.html#a69b1d05f4337b58afd9e5715663456ce',1,'yask::yc_var::new_relative_grid_point(const std::initializer_list< int > &dim_offsets)']]], + ['new_5frelative_5fvar_5fpoint',['new_relative_var_point',['../classyask_1_1yc__var.html#a08bd94bd9934eb4cec308638cfffe53d',1,'yask::yc_var::new_relative_var_point(const std::vector< int > &dim_offsets)=0'],['../classyask_1_1yc__var.html#a3d1dc10ae85f73f74203ce405618ae5e',1,'yask::yc_var::new_relative_var_point(const std::initializer_list< int > &dim_offsets)=0']]], + 
['new_5fscratch_5fgrid',['new_scratch_grid',['../classyask_1_1yc__solution.html#a9e30883d0a97aa0ef5af6832f67bd863',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)'],['../classyask_1_1yc__solution.html#ae24afb1d88e355707c2113f749445329',1,'yask::yc_solution::new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)']]], ['new_5fscratch_5fvar',['new_scratch_var',['../classyask_1_1yc__solution.html#ac025854d8d7a0e4c62753dda67ff9e39',1,'yask::yc_solution::new_scratch_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#aa3f1bd432ae6b977d8a150e319856228',1,'yask::yc_solution::new_scratch_var(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)=0']]], ['new_5fsolution',['new_solution',['../classyask_1_1yc__factory.html#adce155773c9b0d469263303919681d69',1,'yask::yc_factory::new_solution()'],['../classyask_1_1yk__factory.html#a755b1bfc0dd9bfddfe80d924a188b350',1,'yask::yk_factory::new_solution(yk_env_ptr env) const'],['../classyask_1_1yk__factory.html#a43d6b5b6a88c7e4f14e41997b22501f0',1,'yask::yk_factory::new_solution(yk_env_ptr env, const yk_solution_ptr source) const']]], ['new_5fstdout_5foutput',['new_stdout_output',['../classyask_1_1yask__output__factory.html#acf0cc704a266abe9243eaa7b8672ca94',1,'yask::yask_output_factory']]], ['new_5fstep_5findex',['new_step_index',['../classyask_1_1yc__node__factory.html#a77c772e8539b116a9f0adbdf432628a1',1,'yask::yc_node_factory::new_step_index()'],['../classyask_1_1yc__solution__base.html#acd7a84f525c48d932e662597ea6ae32e',1,'yask::yc_solution_base::new_step_index()']]], ['new_5fstring_5foutput',['new_string_output',['../classyask_1_1yask__output__factory.html#ab1ec3a602da73b8ef716c8e07b43da04',1,'yask::yask_output_factory']]], 
['new_5fsubtract_5fnode',['new_subtract_node',['../classyask_1_1yc__node__factory.html#af6ec670eeb91d4f4a7b4a9221a808346',1,'yask::yc_node_factory']]], - ['new_5fvar',['new_var',['../classyask_1_1yc__solution.html#a192b0f12d3943483514e16c82c15a42b',1,'yask::yc_solution::new_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#a4daa8ae2e61c612cdb79241e43b34fcc',1,'yask::yc_solution::new_var(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yk__solution.html#a076e1cc78d96e7cc77ec62d58e289131',1,'yask::yk_solution::new_var(const std::string &name, const std::vector< std::string > &dims)=0'],['../classyask_1_1yk__solution.html#ae2774f810be2d57a878884111cbd36e9',1,'yask::yk_solution::new_var(const std::string &name, const std::initializer_list< std::string > &dims)=0']]], + ['new_5fvar',['new_var',['../classyask_1_1yc__solution.html#a192b0f12d3943483514e16c82c15a42b',1,'yask::yc_solution::new_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#a4daa8ae2e61c612cdb79241e43b34fcc',1,'yask::yc_solution::new_var(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yk__solution.html#a75ba824977414468dd23f0a1d5f9eaf3',1,'yask::yk_solution::new_var(const std::string &name, const string_vec &dims)=0'],['../classyask_1_1yk__solution.html#ae2774f810be2d57a878884111cbd36e9',1,'yask::yk_solution::new_var(const std::string &name, const std::initializer_list< std::string > &dims)=0']]], ['new_5fvar_5fpoint',['new_var_point',['../classyask_1_1yc__var.html#aad91c7587f75392db28d7a19bb53b423',1,'yask::yc_var::new_var_point(const std::vector< yc_number_node_ptr > &index_exprs)=0'],['../classyask_1_1yc__var.html#acbb35addfd24ab805d68e7ec0e76b8b9',1,'yask::yc_var::new_var_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)=0']]] ]; diff --git 
a/docs/api/html/search/functions_e.js b/docs/api/html/search/functions_e.js index f962b2b5..987a099f 100644 --- a/docs/api/html/search/functions_e.js +++ b/docs/api/html/search/functions_e.js @@ -3,18 +3,19 @@ var searchData= ['set_5fall_5felements_5fsame',['set_all_elements_same',['../classyask_1_1yk__var.html#a0cbfa0153ac69dfadf0e655246ddeac2',1,'yask::yk_var']]], ['set_5falloc_5fsize',['set_alloc_size',['../classyask_1_1yk__var.html#a7bc339345cc04bb349e2f6bf586a29f1',1,'yask::yk_var']]], ['set_5fblock_5fsize',['set_block_size',['../classyask_1_1yk__solution.html#abd3c7317bf1b397f332962d658f38839',1,'yask::yk_solution']]], + ['set_5fblock_5fsize_5fvec',['set_block_size_vec',['../classyask_1_1yk__solution.html#a191580e8eab142bbf5eeb7573546c9c9',1,'yask::yk_solution::set_block_size_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a98ae9c07fbc60e4323fa16ee4a2400b5',1,'yask::yk_solution::set_block_size_vec(const idx_t_init_list &vals)=0']]], ['set_5fcluster_5fmult',['set_cluster_mult',['../classyask_1_1yc__solution.html#a45cb1df4af6886e82f98904473873272',1,'yask::yc_solution']]], ['set_5fcond',['set_cond',['../classyask_1_1yc__equation__node.html#ac264942915dfb99fcfc9578873109bdb',1,'yask::yc_equation_node']]], - ['set_5fdebug_5foutput',['set_debug_output',['../classyask_1_1yc__solution.html#aff540803d358a5dcd304f09c522ec867',1,'yask::yc_solution::set_debug_output()'],['../classyask_1_1yk__env.html#a313519caed64266832618f0b00e8f45f',1,'yask::yk_env::set_debug_output()'],['../classyask_1_1yk__solution.html#aa315469b5836e531425836f4fc5eff1c',1,'yask::yk_solution::set_debug_output()']]], + 
['set_5fdebug_5foutput',['set_debug_output',['../classyask_1_1yc__solution.html#aff540803d358a5dcd304f09c522ec867',1,'yask::yc_solution::set_debug_output()'],['../classyask_1_1yk__env.html#aa5cf0e8d885f4d9ebde6539d5246cda1',1,'yask::yk_env::set_debug_output()'],['../classyask_1_1yk__solution.html#a7e8ce77d85b54ebaf15ddf11009066c4',1,'yask::yk_solution::set_debug_output()']]], ['set_5fdefault_5fnuma_5fpreferred',['set_default_numa_preferred',['../classyask_1_1yk__solution.html#ac8bde8dfc73219cec84ad3033faabb90',1,'yask::yk_solution']]], ['set_5fdependency_5fchecker_5fenabled',['set_dependency_checker_enabled',['../classyask_1_1yc__solution.html#ac181c24ab7af945318a055cef3b52ee6',1,'yask::yc_solution']]], ['set_5fdescription',['set_description',['../classyask_1_1yc__solution.html#a3cc13f5daf402805f9b1f66996d9d6d3',1,'yask::yc_solution']]], ['set_5fdomain_5fdims',['set_domain_dims',['../classyask_1_1yc__solution.html#a3dc810afcb4ef91c10aa5e0e7092476f',1,'yask::yc_solution::set_domain_dims(const std::vector< yc_index_node_ptr > &dims)=0'],['../classyask_1_1yc__solution.html#ab798850bc3a6bf88322a2f39765ee831',1,'yask::yc_solution::set_domain_dims(const std::initializer_list< yc_index_node_ptr > &dims)=0']]], ['set_5fdynamic_5fstep_5falloc',['set_dynamic_step_alloc',['../classyask_1_1yc__var.html#a528a6e79000ffc8addefd7519cc58ad6',1,'yask::yc_var']]], - ['set_5felement',['set_element',['../classyask_1_1yk__var.html#afb2994485a7cb77c12df0904910053f1',1,'yask::yk_var::set_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0'],['../classyask_1_1yk__var.html#a2feb3a4e7b07f0693f4fbfd4e3e90b2e',1,'yask::yk_var::set_element(double val, const std::initializer_list< idx_t > &indices, bool strict_indices=true)=0']]], + ['set_5felement',['set_element',['../classyask_1_1yk__var.html#ac509cdab014c58033c15b5c92f4bb7d6',1,'yask::yk_var::set_element(double val, const idx_t_vec &indices, bool 
strict_indices=true)=0'],['../classyask_1_1yk__var.html#a42a5e0c4f28ef714f1de855ddb81877d',1,'yask::yk_var::set_element(double val, const idx_t_init_list &indices, bool strict_indices=true)=0']]], ['set_5felement_5fbytes',['set_element_bytes',['../classyask_1_1yc__solution.html#a6ce565febd97f50efae59c37d7d5ef4f',1,'yask::yc_solution']]], - ['set_5felements_5fin_5fslice',['set_elements_in_slice',['../classyask_1_1yk__var.html#a4b19f91ffed51602eb08f60f2ddb8e70',1,'yask::yk_var']]], - ['set_5felements_5fin_5fslice_5fsame',['set_elements_in_slice_same',['../classyask_1_1yk__var.html#a4744c4ec52086d33f812909660dda4db',1,'yask::yk_var']]], + ['set_5felements_5fin_5fslice',['set_elements_in_slice',['../classyask_1_1yk__var.html#a56798ab60559bd84fdc204d7255ebe46',1,'yask::yk_var']]], + ['set_5felements_5fin_5fslice_5fsame',['set_elements_in_slice_same',['../classyask_1_1yk__var.html#ad919afb54bbde78938a3939e76df0cd8',1,'yask::yk_var']]], ['set_5ffirst_5fmisc_5findex',['set_first_misc_index',['../classyask_1_1yk__var.html#a5beae21df987bf4a93bec2ebf8a423f6',1,'yask::yk_var']]], ['set_5ffold_5flen',['set_fold_len',['../classyask_1_1yc__solution.html#a1168b5b8044e39c047d81a5fe5efc06e',1,'yask::yc_solution']]], ['set_5fhalo_5fexchange_5fl1_5fnorm',['set_halo_exchange_l1_norm',['../classyask_1_1yk__var.html#a5f65d5983b3e8f16bb20c466d6b7f027',1,'yask::yk_var']]], @@ -24,13 +25,16 @@ var searchData= ['set_5fmin_5fpad_5fsize',['set_min_pad_size',['../classyask_1_1yk__solution.html#ab3bd7e95ea13631954d92a638badfb2d',1,'yask::yk_solution::set_min_pad_size()'],['../classyask_1_1yk__var.html#a16aad88dc481991cbe83da7a55cb3799',1,'yask::yk_var::set_min_pad_size()']]], ['set_5fname',['set_name',['../classyask_1_1yc__solution.html#a1dfefccda72a3560e6664471a9ab451a',1,'yask::yc_solution']]], ['set_5fnum_5franks',['set_num_ranks',['../classyask_1_1yk__solution.html#ac4cd27d412b6fe013db58b167999a362',1,'yask::yk_solution']]], + 
['set_5fnum_5franks_5fvec',['set_num_ranks_vec',['../classyask_1_1yk__solution.html#a85aebdf4bf311ed1b9d293fa4404f76e',1,'yask::yk_solution::set_num_ranks_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a7c4b880c34659d731ae133f7ae1bd273',1,'yask::yk_solution::set_num_ranks_vec(const idx_t_init_list &vals)=0']]], ['set_5fnuma_5fpreferred',['set_numa_preferred',['../classyask_1_1yk__var.html#ac68f7d6f9bbe826eed31e6dc2be01de4',1,'yask::yk_var']]], ['set_5foverall_5fdomain_5fsize',['set_overall_domain_size',['../classyask_1_1yk__solution.html#a7f9a22d8d2b760a05307e90147d18d8c',1,'yask::yk_solution']]], + ['set_5foverall_5fdomain_5fsize_5fvec',['set_overall_domain_size_vec',['../classyask_1_1yk__solution.html#a156fa79121b033516028c391db968a17',1,'yask::yk_solution::set_overall_domain_size_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a4540f5a7990503a4f2c1a336188197eb',1,'yask::yk_solution::set_overall_domain_size_vec(const idx_t_init_list &vals)=0']]], ['set_5fprefetch_5fdist',['set_prefetch_dist',['../classyask_1_1yc__solution.html#aa849ba0ae1af7890d8f6f5c0b095ff25',1,'yask::yc_solution']]], ['set_5fradius',['set_radius',['../classyask_1_1yc__solution__with__radius__base.html#a49016a165c8f3e8d2c2b003bebddf80b',1,'yask::yc_solution_with_radius_base']]], ['set_5frank_5fdomain_5fsize',['set_rank_domain_size',['../classyask_1_1yk__solution.html#a155d8f4a38da9da11488a18cca50bae8',1,'yask::yk_solution']]], + ['set_5frank_5fdomain_5fsize_5fvec',['set_rank_domain_size_vec',['../classyask_1_1yk__solution.html#a31a653082530e1049c3030b408bc3fa1',1,'yask::yk_solution::set_rank_domain_size_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a9d75b7f78aaba21fcbe0583c79106fb7',1,'yask::yk_solution::set_rank_domain_size_vec(const idx_t_init_list &vals)=0']]], ['set_5frank_5findex',['set_rank_index',['../classyask_1_1yk__solution.html#ac0eccaa1d228d7f3408e3c2b0881f279',1,'yask::yk_solution']]], - 
['set_5fregion_5fsize',['set_region_size',['../classyask_1_1yk__solution.html#a923afb4530ef5f0370f8bf55856fa040',1,'yask::yk_solution']]], + ['set_5frank_5findex_5fvec',['set_rank_index_vec',['../classyask_1_1yk__solution.html#abcdcb3024837aa125c5218a618e0db94',1,'yask::yk_solution::set_rank_index_vec(const idx_t_vec &vals)=0'],['../classyask_1_1yk__solution.html#a921f09751c36b56509d2bf2e3d3f05a0',1,'yask::yk_solution::set_rank_index_vec(const idx_t_init_list &vals)=0']]], ['set_5fright_5fhalo_5fsize',['set_right_halo_size',['../classyask_1_1yk__var.html#a6d43cc66c47eb773c0e8aa043ec76099',1,'yask::yk_var']]], ['set_5fright_5fmin_5fpad_5fsize',['set_right_min_pad_size',['../classyask_1_1yk__var.html#a02ee27a0c922e55def701d6efffc725c',1,'yask::yk_var']]], ['set_5fstep_5falloc_5fsize',['set_step_alloc_size',['../classyask_1_1yc__var.html#ae287dc53eb486e8dbf1a3f6ecec9cafb',1,'yask::yc_var']]], @@ -38,6 +42,6 @@ var searchData= ['set_5fstep_5fdim',['set_step_dim',['../classyask_1_1yc__solution.html#ad234ecd4964bcb57b1876be87baf57bd',1,'yask::yc_solution']]], ['set_5fstep_5fwrap',['set_step_wrap',['../classyask_1_1yk__solution.html#af9b7c6f23e3190f3958fa1843fd0cba0',1,'yask::yk_solution']]], ['set_5ftarget',['set_target',['../classyask_1_1yc__solution.html#a1c03fc69e306a700de8c5ae3973229c2',1,'yask::yc_solution']]], - ['set_5ftrace_5fenabled',['set_trace_enabled',['../classyask_1_1yk__env.html#a1705f55210c095cec5d8347bacd24c57',1,'yask::yk_env']]], + ['set_5ftrace_5fenabled',['set_trace_enabled',['../classyask_1_1yk__env.html#ac6b39c25ae59e27dcdb405a9fe24e763',1,'yask::yk_env']]], ['set_5fvalue',['set_value',['../classyask_1_1yc__const__number__node.html#a8e1cf3c96519e6f9f8729e5736d791e3',1,'yask::yc_const_number_node']]] ]; diff --git a/docs/api/html/search/searchdata.js b/docs/api/html/search/searchdata.js index cac9f9df..1624d640 100644 --- a/docs/api/html/search/searchdata.js +++ b/docs/api/html/search/searchdata.js @@ -6,7 +6,7 @@ var indexSectionsWithContent = 3: 
"_abcdefgilnoprsuwy~", 4: "y", 5: "hiosy", - 6: "c", + 6: "cy", 7: "y" }; diff --git a/docs/api/html/search/typedefs_1.js b/docs/api/html/search/typedefs_1.js index 9c619742..03c78040 100644 --- a/docs/api/html/search/typedefs_1.js +++ b/docs/api/html/search/typedefs_1.js @@ -1,4 +1,6 @@ var searchData= [ - ['idx_5ft',['idx_t',['../group__yask.html#ga3820f8c6b5f6a92c0df31746b7d2891b',1,'yask']]] + ['idx_5ft',['idx_t',['../group__yask.html#ga1dd7066686ff93559a0f28979be12d81',1,'yask']]], + ['idx_5ft_5finit_5flist',['idx_t_init_list',['../group__yask.html#ga9a86862ece2cddc8fb77fac850c44161',1,'yask']]], + ['idx_5ft_5fvec',['idx_t_vec',['../group__yask.html#gab23959584aacc15a84d1eca058036d09',1,'yask']]] ]; diff --git a/docs/api/html/search/typedefs_3.js b/docs/api/html/search/typedefs_3.js index dfb2cc26..f7efef8d 100644 --- a/docs/api/html/search/typedefs_3.js +++ b/docs/api/html/search/typedefs_3.js @@ -1,4 +1,5 @@ var searchData= [ - ['soln_5fmap',['soln_map',['../classyask_1_1yc__solution__base.html#ab5607f329a35a073145389a4f8cb06b5',1,'yask::yc_solution_base']]] + ['soln_5fmap',['soln_map',['../classyask_1_1yc__solution__base.html#ab5607f329a35a073145389a4f8cb06b5',1,'yask::yc_solution_base']]], + ['string_5fvec',['string_vec',['../group__yask.html#ga53bde373152f3af12ba9daa06007eb5f',1,'yask']]] ]; diff --git a/docs/api/html/search/typedefs_4.js b/docs/api/html/search/typedefs_4.js index cb50db5b..040a69f3 100644 --- a/docs/api/html/search/typedefs_4.js +++ b/docs/api/html/search/typedefs_4.js @@ -18,10 +18,6 @@ var searchData= ['yc_5fequation_5fnode_5fptr',['yc_equation_node_ptr',['../group__yc.html#ga42131f5a8cfbe95d98a131e25b1b86f3',1,'yask']]], ['yc_5fexpr_5fnode_5fptr',['yc_expr_node_ptr',['../group__yc.html#ga83de2a63f5049fe4bcb94c1dd366848a',1,'yask']]], ['yc_5fgreater_5fthan_5fnode_5fptr',['yc_greater_than_node_ptr',['../group__yc.html#ga1428bb8994856ecd456549b2dea7fcd9',1,'yask']]], - 
['yc_5fgrid',['yc_grid',['../yask__compiler__api_8hpp.html#ae14796cde0479d525b112001bb6a444c',1,'yask']]], - ['yc_5fgrid_5fpoint_5fnode',['yc_grid_point_node',['../yask__compiler__api_8hpp.html#a7d35999dfda1de8d92cceec57da8d261',1,'yask']]], - ['yc_5fgrid_5fpoint_5fnode_5fptr',['yc_grid_point_node_ptr',['../yask__compiler__api_8hpp.html#acedc107d594b81726586528668145b25',1,'yask']]], - ['yc_5fgrid_5fptr',['yc_grid_ptr',['../yask__compiler__api_8hpp.html#a8b612f791d66a68204d83f264bfb075c',1,'yask']]], ['yc_5findex_5fnode_5fptr',['yc_index_node_ptr',['../group__yc.html#gac5a8be4a272d764b1145f1e0c6f493e0',1,'yask']]], ['yc_5fless_5fthan_5fnode_5fptr',['yc_less_than_node_ptr',['../group__yc.html#ga6f59747cbe25a7bce63c0409a2cd55f3',1,'yask']]], ['yc_5fmod_5fnode_5fptr',['yc_mod_node_ptr',['../group__yc.html#gafeed67539b530a5de7bc954dd80ecb10',1,'yask']]], @@ -38,8 +34,6 @@ var searchData= ['yc_5fvar_5fpoint_5fnode_5fptr',['yc_var_point_node_ptr',['../group__yc.html#ga9a62b44ca70077bbedfd93527c87c56e',1,'yask']]], ['yc_5fvar_5fptr',['yc_var_ptr',['../group__yc.html#ga90f589d7fb0dc32e895384b65140f4a3',1,'yask']]], ['yk_5fenv_5fptr',['yk_env_ptr',['../group__yk.html#ga8dc62f5599d5c5eb9f7583d7d6a63df1',1,'yask']]], - ['yk_5fgrid',['yk_grid',['../group__yk.html#ga18cfd02f68b4f754c1d174dafdd59d65',1,'yask']]], - ['yk_5fgrid_5fptr',['yk_grid_ptr',['../group__yk.html#gab8f7fc989bed12dcfd79b6bef885fe1e',1,'yask']]], ['yk_5fsolution_5fptr',['yk_solution_ptr',['../group__yk.html#ga2debaa7135bb46dfc295ca623bee2876',1,'yask']]], ['yk_5fstats_5fptr',['yk_stats_ptr',['../group__yk.html#ga12d1d46aeb01bd7509a8dc3251657f75',1,'yask']]], ['yk_5fvar_5fptr',['yk_var_ptr',['../group__yk.html#ga95f75e2b515e5455b570ae705115696a',1,'yask']]] diff --git a/docs/api/html/search/variables_0.js b/docs/api/html/search/variables_0.js index 9c6086bb..4843b684 100644 --- a/docs/api/html/search/variables_0.js +++ b/docs/api/html/search/variables_0.js @@ -2,5 +2,12 @@ var searchData= [ 
['yask_5fnuma_5finterleave',['yask_numa_interleave',['../group__yk.html#ga4e56e832945f97f2e741738e9194873c',1,'yask']]], ['yask_5fnuma_5flocal',['yask_numa_local',['../group__yk.html#ga82b8e0f360a0e18fe6c730e37b33e3f6',1,'yask']]], - ['yask_5fnuma_5fnone',['yask_numa_none',['../group__yk.html#ga38a50108f67012a357b424545495158a',1,'yask']]] + ['yask_5fnuma_5fnone',['yask_numa_none',['../group__yk.html#ga38a50108f67012a357b424545495158a',1,'yask']]], + ['yask_5fnuma_5foffload',['yask_numa_offload',['../group__yk.html#gaa3d0568a0cda08804b8d0a8c521a81fa',1,'yask']]], + ['yc_5fgrid',['yc_grid',['../yask__compiler__api_8hpp.html#a5af53f9d12f8a64e263f9faf12705833',1,'yask']]], + ['yc_5fgrid_5fpoint_5fnode',['yc_grid_point_node',['../yask__compiler__api_8hpp.html#a24044552be06e5020b82381da8331ab7',1,'yask']]], + ['yc_5fgrid_5fpoint_5fnode_5fptr',['yc_grid_point_node_ptr',['../yask__compiler__api_8hpp.html#a3fdfb1592adfd3b7fad43d3dc0954e7b',1,'yask']]], + ['yc_5fgrid_5fptr',['yc_grid_ptr',['../yask__compiler__api_8hpp.html#ac5d9ddae8098817aebdbb5ead715da01',1,'yask']]], + ['yk_5fgrid',['yk_grid',['../group__yk.html#gab1c5abbc86c9fdde32def4217482cc63',1,'yask']]], + ['yk_5fgrid_5fptr',['yk_grid_ptr',['../group__yk.html#gaf6e19ac605b32b47d4edc5a8985b3c5d',1,'yask']]] ]; diff --git a/docs/api/html/yask__common__api_8hpp.html b/docs/api/html/yask__common__api_8hpp.html index cecc84fe..7214183a 100644 --- a/docs/api/html/yask__common__api_8hpp.html +++ b/docs/api/html/yask__common__api_8hpp.html @@ -68,13 +68,18 @@
                                                                                                  yask_common_api.hpp File Reference
                                                                                                  -
                                                                                                  #include <string>
                                                                                                  +
                                                                                                  #include <cstdint>
                                                                                                  +#include <cinttypes>
                                                                                                  +#include <climits>
                                                                                                  +#include <type_traits>
                                                                                                  +#include <string>
                                                                                                  #include <vector>
                                                                                                  #include <map>
                                                                                                  #include <iostream>
                                                                                                  @@ -108,11 +113,33 @@  Null output. More...
                                                                                                    + + + + + + + +

                                                                                                  +Macros

                                                                                                  +#define YASK_DEPRECATED   [[deprecated]]
                                                                                                   Deprecated attribute.
                                                                                                   
                                                                                                  +#define YASK_INT64_T   std::int64_t
                                                                                                   Signed 64-bit int.
                                                                                                   
                                                                                                  - - - + + + + + + + + + + + + diff --git a/docs/api/html/yask__common__api_8hpp_source.html b/docs/api/html/yask__common__api_8hpp_source.html index b90aa4b5..966550c9 100644 --- a/docs/api/html/yask__common__api_8hpp_source.html +++ b/docs/api/html/yask__common__api_8hpp_source.html @@ -70,27 +70,31 @@
                                                                                                  yask_common_api.hpp
                                                                                                  -Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2021, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include <string>
                                                                                                  35 #include <vector>
                                                                                                  36 #include <map>
                                                                                                  37 #include <iostream>
                                                                                                  38 #include <ostream>
                                                                                                  39 #include <memory>
                                                                                                  40 #include <functional>
                                                                                                  41 
                                                                                                  42 namespace yask {
                                                                                                  43 
                                                                                                  50 
                                                                                                  54  std::string yask_get_version_string();
                                                                                                  55 
                                                                                                  57 
                                                                                                  58 #ifdef SWIG
                                                                                                  59  typedef long int idx_t; // SWIG doesn't seem to understand int64_t.
                                                                                                  60 #else
                                                                                                  61  typedef std::int64_t idx_t;
                                                                                                  62 #endif
                                                                                                  63 
                                                                                                  64  // Forward declarations of class-pointers.
                                                                                                  65 
                                                                                                  66  class yask_output;
                                                                                                  68  typedef std::shared_ptr<yask_output> yask_output_ptr;
                                                                                                  69 
                                                                                                  72  typedef std::shared_ptr<yask_file_output> yask_file_output_ptr;
                                                                                                  73 
                                                                                                  76  typedef std::shared_ptr<yask_string_output> yask_string_output_ptr;
                                                                                                  77 
                                                                                                  80  typedef std::shared_ptr<yask_stdout_output> yask_stdout_output_ptr;
                                                                                                  81 
                                                                                                  84  typedef std::shared_ptr<yask_null_output> yask_null_output_ptr;
                                                                                                  85 
                                                                                                  87 
                                                                                                  88  class yask_exception: public std::exception {
                                                                                                  89  private:
                                                                                                  91  std::string _msg;
                                                                                                  92 
                                                                                                  93  public:
                                                                                                  94 
                                                                                                  97  _msg("YASK exception") { };
                                                                                                  98 
                                                                                                  100  yask_exception(const std::string& message) :
                                                                                                  101  _msg(message) { };
                                                                                                  102 
                                                                                                  103  virtual ~yask_exception() { };
                                                                                                  104 
                                                                                                  106 
                                                                                                  108  virtual const char* what() const noexcept;
                                                                                                  109 
                                                                                                  111  virtual void add_message(const std::string& message );
                                                                                                  113 
                                                                                                  115 
                                                                                                  117  virtual const char* get_message() const;
                                                                                                  118  };
                                                                                                  119 
                                                                                                  122  public:
                                                                                                  123  virtual ~yask_output_factory() {}
                                                                                                  124 
                                                                                                  126 
                                                                                                  131  virtual yask_file_output_ptr
                                                                                                  132  new_file_output(const std::string& file_name ) const;
                                                                                                  135 
                                                                                                  137 
                                                                                                  141  virtual yask_string_output_ptr
                                                                                                  142  new_string_output() const;
                                                                                                  143 
                                                                                                  145 
                                                                                                  149  virtual yask_stdout_output_ptr
                                                                                                  150  new_stdout_output() const;
                                                                                                  151 
                                                                                                  153 
                                                                                                  157  virtual yask_null_output_ptr
                                                                                                  158  new_null_output() const;
                                                                                                  159  };
                                                                                                  160 
                                                                                                  162  class yask_output {
                                                                                                  163  public:
                                                                                                  164  virtual ~yask_output() {}
                                                                                                  165 
                                                                                                  167 
                                                                                                  168  virtual std::ostream& get_ostream() =0;
                                                                                                  169  };
                                                                                                  170 
                                                                                                  172  class yask_file_output : public virtual yask_output {
                                                                                                  173  public:
                                                                                                  174  virtual ~yask_file_output() {}
                                                                                                  175 
                                                                                                  177 
                                                                                                  178  virtual std::string get_filename() const =0;
                                                                                                  179 
                                                                                                  181  virtual void close() =0;
                                                                                                  182  };
                                                                                                  183 
                                                                                                  185  class yask_string_output : public virtual yask_output {
                                                                                                  186  public:
                                                                                                  187  virtual ~yask_string_output() {}
                                                                                                  188 
                                                                                                  190 
                                                                                                  192  virtual std::string get_string() const =0;
                                                                                                  193 
                                                                                                  195  virtual void discard() =0;
                                                                                                  196  };
                                                                                                  197 
                                                                                                  199  class yask_stdout_output : public virtual yask_output {
                                                                                                  200  public:
                                                                                                  201  virtual ~yask_stdout_output() {}
                                                                                                  202  };
                                                                                                  203 
                                                                                                  205 
                                                                                                  206  class yask_null_output : public virtual yask_output {
                                                                                                  207  public:
                                                                                                  208  virtual ~yask_null_output() {}
                                                                                                  209  };
                                                                                                  210 
                                                                                                  212 
                                                                                                  218  std::vector<double>
                                                                                                  219  get_center_fd_coefficients(int derivative_order,
                                                                                                  221  int radius );
                                                                                                  223 
                                                                                                  225 
                                                                                                  230  std::vector<double>
                                                                                                  231  get_forward_fd_coefficients(int derivative_order,
                                                                                                  233  int accuracy_order );
                                                                                                  235 
                                                                                                  237 
                                                                                                  242  std::vector<double>
                                                                                                  243  get_backward_fd_coefficients(int derivative_order,
                                                                                                  245  int accuracy_order );
                                                                                                  247 
                                                                                                  249 
                                                                                                  252  std::vector<double>
                                                                                                  253  get_arbitrary_fd_coefficients(int derivative_order,
                                                                                                  255  double eval_point,
                                                                                                  257  const std::vector<double> sample_points );
                                                                                                  259 
                                                                                                  262 } // namespace yask.
                                                                                                  263 
                                                                                                  std::shared_ptr< yask_null_output > yask_null_output_ptr
                                                                                                  Shared pointer to yask_null_output.
                                                                                                  Definition: yask_common_api.hpp:82
                                                                                                  -
                                                                                                  std::shared_ptr< yask_file_output > yask_file_output_ptr
                                                                                                  Shared pointer to yask_file_output.
                                                                                                  Definition: yask_common_api.hpp:70
                                                                                                  -
                                                                                                  Exception from YASK framework.
                                                                                                  Definition: yask_common_api.hpp:88
                                                                                                  +Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2022, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include <cstdint>
                                                                                                  35 #include <cinttypes>
                                                                                                  36 #include <climits>
                                                                                                  37 #include <type_traits>
                                                                                                  38 #include <string>
                                                                                                  39 #include <vector>
                                                                                                  40 #include <map>
                                                                                                  41 #include <iostream>
                                                                                                  42 #include <ostream>
                                                                                                  43 #include <memory>
                                                                                                  44 #include <functional>
                                                                                                  45 
                                                                                                  46 // Things SWIG can't handle.
                                                                                                  47 #ifdef SWIG
                                                                                                  48 #ifndef YASK_DEPRECATED
                                                                                                  49 #define YASK_DEPRECATED
                                                                                                  50 #endif
                                                                                                  51 #define YASK_INT64_T long int
                                                                                                  52 #else
                                                                                                  53 #ifndef YASK_DEPRECATED
                                                                                                  55 #define YASK_DEPRECATED [[deprecated]]
                                                                                                  56 #endif
                                                                                                  57 #define YASK_INT64_T std::int64_t
                                                                                                  59 #endif
                                                                                                  60 
                                                                                                  61 namespace yask {
                                                                                                  62 
                                                                                                  69 
                                                                                                  73  std::string yask_get_version_string();
                                                                                                  74 
                                                                                                  76 
                                                                                                  78 
                                                                                                  80  typedef std::vector<idx_t> idx_t_vec;
                                                                                                  81 
                                                                                                  83 
                                                                                                  87  typedef std::initializer_list<idx_t> idx_t_init_list;
                                                                                                  88 
                                                                                                  90  typedef std::vector<std::string> string_vec;
                                                                                                  91 
                                                                                                  92  // Forward declarations of class-pointers.
                                                                                                  93 
                                                                                                  94  class yask_output;
                                                                                                  96  typedef std::shared_ptr<yask_output> yask_output_ptr;
                                                                                                  97 
                                                                                                  100  typedef std::shared_ptr<yask_file_output> yask_file_output_ptr;
                                                                                                  101 
                                                                                                  104  typedef std::shared_ptr<yask_string_output> yask_string_output_ptr;
                                                                                                  105 
                                                                                                  108  typedef std::shared_ptr<yask_stdout_output> yask_stdout_output_ptr;
                                                                                                  109 
                                                                                                  112  typedef std::shared_ptr<yask_null_output> yask_null_output_ptr;
                                                                                                  113 
                                                                                                  115 
                                                                                                  116  class yask_exception: public std::exception {
                                                                                                  117  private:
                                                                                                  119  std::string _msg;
                                                                                                  120 
                                                                                                  121  public:
                                                                                                  122 
                                                                                                  125  _msg("YASK exception") { };
                                                                                                  126 
                                                                                                  128  yask_exception(const std::string& message) :
                                                                                                  129  _msg(message) { };
                                                                                                  130 
                                                                                                  131  virtual ~yask_exception() { };
                                                                                                  132 
                                                                                                  134 
                                                                                                  136  virtual const char* what() const noexcept;
                                                                                                  137 
                                                                                                  139  virtual void add_message(const std::string& message );
                                                                                                  141 
                                                                                                  143 
                                                                                                  145  virtual const char* get_message() const;
                                                                                                  146  };
                                                                                                  147 
                                                                                                  150  public:
                                                                                                  151  virtual ~yask_output_factory() {}
                                                                                                  152 
                                                                                                  154 
                                                                                                  159  virtual yask_file_output_ptr
                                                                                                  160  new_file_output(const std::string& file_name ) const;
                                                                                                  163 
                                                                                                  165 
                                                                                                  169  virtual yask_string_output_ptr
                                                                                                  170  new_string_output() const;
                                                                                                  171 
                                                                                                  173 
                                                                                                  177  virtual yask_stdout_output_ptr
                                                                                                  178  new_stdout_output() const;
                                                                                                  179 
                                                                                                  181 
                                                                                                  185  virtual yask_null_output_ptr
                                                                                                  186  new_null_output() const;
                                                                                                  187  };
                                                                                                  188 
                                                                                                  190  class yask_output {
                                                                                                  191  public:
                                                                                                  192  virtual ~yask_output() {}
                                                                                                  193 
                                                                                                  195 
                                                                                                  196  virtual std::ostream& get_ostream() =0;
                                                                                                  197  };
                                                                                                  198 
                                                                                                  200  class yask_file_output : public virtual yask_output {
                                                                                                  201  public:
                                                                                                  202  virtual ~yask_file_output() {}
                                                                                                  203 
                                                                                                  205 
                                                                                                  206  virtual std::string get_filename() const =0;
                                                                                                  207 
                                                                                                  209  virtual void close() =0;
                                                                                                  210  };
                                                                                                  211 
                                                                                                  213  class yask_string_output : public virtual yask_output {
                                                                                                  214  public:
                                                                                                  215  virtual ~yask_string_output() {}
                                                                                                  216 
                                                                                                  218 
                                                                                                  220  virtual std::string get_string() const =0;
                                                                                                  221 
                                                                                                  223  virtual void discard() =0;
                                                                                                  224  };
                                                                                                  225 
                                                                                                  227  class yask_stdout_output : public virtual yask_output {
                                                                                                  228  public:
                                                                                                  229  virtual ~yask_stdout_output() {}
                                                                                                  230  };
                                                                                                  231 
                                                                                                  233 
                                                                                                  234  class yask_null_output : public virtual yask_output {
                                                                                                  235  public:
                                                                                                  236  virtual ~yask_null_output() {}
                                                                                                  237  };
                                                                                                  238 
                                                                                                  240 
                                                                                                  246  std::vector<double>
                                                                                                  247  get_center_fd_coefficients(int derivative_order,
                                                                                                  249  int radius );
                                                                                                  251 
                                                                                                  253 
                                                                                                  258  std::vector<double>
                                                                                                  259  get_forward_fd_coefficients(int derivative_order,
                                                                                                  261  int accuracy_order );
                                                                                                  263 
                                                                                                  265 
                                                                                                  270  std::vector<double>
                                                                                                  271  get_backward_fd_coefficients(int derivative_order,
                                                                                                  273  int accuracy_order );
                                                                                                  275 
                                                                                                  277 
                                                                                                  280  std::vector<double>
                                                                                                  281  get_arbitrary_fd_coefficients(int derivative_order,
                                                                                                  283  double eval_point,
                                                                                                  285  const std::vector<double> sample_points );
                                                                                                  287 
                                                                                                  290 } // namespace yask.
                                                                                                  291 
                                                                                                  std::initializer_list< idx_t > idx_t_init_list
                                                                                                  Initializer list of indices.
                                                                                                  Definition: yask_common_api.hpp:87
                                                                                                  +
                                                                                                  std::shared_ptr< yask_null_output > yask_null_output_ptr
                                                                                                  Shared pointer to yask_null_output.
                                                                                                  Definition: yask_common_api.hpp:110
                                                                                                  +
                                                                                                  std::shared_ptr< yask_file_output > yask_file_output_ptr
                                                                                                  Shared pointer to yask_file_output.
                                                                                                  Definition: yask_common_api.hpp:98
                                                                                                  +
                                                                                                  Exception from YASK framework.
                                                                                                  Definition: yask_common_api.hpp:116
                                                                                                  std::vector< double > get_backward_fd_coefficients(int derivative_order, int accuracy_order)
                                                                                                  Create finite-difference (FD) coefficients for the standard backward form.
                                                                                                  virtual void add_message(const std::string &message)
                                                                                                  Append message to description of this exception.
                                                                                                  -
                                                                                                  yask_exception(const std::string &message)
                                                                                                  Construct a YASK exception with message.
                                                                                                  Definition: yask_common_api.hpp:100
                                                                                                  -
                                                                                                  Factory to create output objects.
                                                                                                  Definition: yask_common_api.hpp:121
                                                                                                  -
                                                                                                  Base interface for output.
                                                                                                  Definition: yask_common_api.hpp:162
                                                                                                  +
                                                                                                  std::vector< idx_t > idx_t_vec
                                                                                                  Vector of indices.
                                                                                                  Definition: yask_common_api.hpp:80
                                                                                                  +
                                                                                                  yask_exception(const std::string &message)
                                                                                                  Construct a YASK exception with message.
                                                                                                  Definition: yask_common_api.hpp:128
                                                                                                  +
                                                                                                  Factory to create output objects.
                                                                                                  Definition: yask_common_api.hpp:149
                                                                                                  +
                                                                                                  Base interface for output.
                                                                                                  Definition: yask_common_api.hpp:190
                                                                                                  +
                                                                                                  #define YASK_INT64_T
                                                                                                  Signed 64-bit int.
                                                                                                  Definition: yask_common_api.hpp:58
                                                                                                  +
                                                                                                  YASK_INT64_T idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:77
                                                                                                  virtual const char * get_message() const
                                                                                                  Get description.
                                                                                                  -
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:66
                                                                                                  +
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:94
                                                                                                  std::vector< double > get_center_fd_coefficients(int derivative_order, int radius)
                                                                                                  Create finite-difference (FD) coefficients for the standard center form.
                                                                                                  -
                                                                                                  Null output.
                                                                                                  Definition: yask_common_api.hpp:206
                                                                                                  -
                                                                                                  yask_exception()
                                                                                                  Construct a YASK exception with no message.
                                                                                                  Definition: yask_common_api.hpp:96
                                                                                                  -
                                                                                                  String output.
                                                                                                  Definition: yask_common_api.hpp:185
                                                                                                  -
                                                                                                  std::shared_ptr< yask_stdout_output > yask_stdout_output_ptr
                                                                                                  Shared pointer to yask_stdout_output.
                                                                                                  Definition: yask_common_api.hpp:78
                                                                                                  +
                                                                                                  Null output.
                                                                                                  Definition: yask_common_api.hpp:234
                                                                                                  +
                                                                                                  yask_exception()
                                                                                                  Construct a YASK exception with no message.
                                                                                                  Definition: yask_common_api.hpp:124
                                                                                                  +
                                                                                                  String output.
                                                                                                  Definition: yask_common_api.hpp:213
                                                                                                  +
                                                                                                  std::shared_ptr< yask_stdout_output > yask_stdout_output_ptr
                                                                                                  Shared pointer to yask_stdout_output.
                                                                                                  Definition: yask_common_api.hpp:106
                                                                                                  std::vector< double > get_arbitrary_fd_coefficients(int derivative_order, double eval_point, const std::vector< double > sample_points)
                                                                                                  Create finite-difference (FD) coefficients at arbitrary evaluation and sample points.
                                                                                                  std::vector< double > get_forward_fd_coefficients(int derivative_order, int accuracy_order)
                                                                                                  Create finite-difference (FD) coefficients for the standard forward form.
                                                                                                  -
                                                                                                  Stdout output.
                                                                                                  Definition: yask_common_api.hpp:199
                                                                                                  -
                                                                                                  File output.
                                                                                                  Definition: yask_common_api.hpp:172
                                                                                                  -
                                                                                                  std::int64_t idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:61
                                                                                                  -
                                                                                                  std::shared_ptr< yask_string_output > yask_string_output_ptr
                                                                                                  Shared pointer to yask_string_output.
                                                                                                  Definition: yask_common_api.hpp:74
                                                                                                  +
                                                                                                  std::vector< std::string > string_vec
                                                                                                  Vector of strings.
                                                                                                  Definition: yask_common_api.hpp:90
                                                                                                  +
                                                                                                  Stdout output.
                                                                                                  Definition: yask_common_api.hpp:227
                                                                                                  +
                                                                                                  File output.
                                                                                                  Definition: yask_common_api.hpp:200
                                                                                                  +
                                                                                                  std::shared_ptr< yask_string_output > yask_string_output_ptr
                                                                                                  Shared pointer to yask_string_output.
                                                                                                  Definition: yask_common_api.hpp:102
                                                                                                  virtual const char * what() const noexcept
                                                                                                  Get description.
                                                                                                  std::string yask_get_version_string()
                                                                                                  Version information.
                                                                                                  diff --git a/docs/api/html/yask__compiler__api_8hpp.html b/docs/api/html/yask__compiler__api_8hpp.html index 494f6674..a4f3c52d 100644 --- a/docs/api/html/yask__compiler__api_8hpp.html +++ b/docs/api/html/yask__compiler__api_8hpp.html @@ -69,7 +69,8 @@ +Typedefs | +Variables
                                                                                                  yask_compiler_api.hpp File Reference
                                                                                                  @@ -137,22 +138,25 @@ typedef std::shared_ptr< yc_var_point_node >  - - - - - - - - - - - - +

                                                                                                  Typedefs

                                                                                                  typedef std::int64_t yask::idx_t
                                                                                                   Type to use for indexing grids. More...
                                                                                                   
                                                                                                  typedef YASK_INT64_T yask::idx_t
                                                                                                   Type to use for indexing grids. More...
                                                                                                   
                                                                                                  +typedef std::vector< idx_t > yask::idx_t_vec
                                                                                                   Vector of indices.
                                                                                                   
                                                                                                  typedef std::initializer_list< idx_t > yask::idx_t_init_list
                                                                                                   Initializer list of indices. More...
                                                                                                   
                                                                                                  +typedef std::vector< std::string > yask::string_vec
                                                                                                   Vector of strings.
                                                                                                   
                                                                                                  typedef std::shared_ptr< yask_output > yask::yask_output_ptr
                                                                                                   Shared pointer to yask_output.
                                                                                                  yask::yc_var_point_node_ptr
                                                                                                   Shared pointer to yc_var_point_node.
                                                                                                   
                                                                                                  -typedef yc_var yask::yc_grid
                                                                                                   [Deprecated] Use yc_var.
                                                                                                   
                                                                                                  -typedef yc_var_ptr yask::yc_grid_ptr
                                                                                                   [Deprecated] Use yc_var_ptr.
                                                                                                   
                                                                                                  -typedef yc_var_point_node yask::yc_grid_point_node
                                                                                                   [Deprecated] Use yc_var_point_node.
                                                                                                   
                                                                                                  -typedef yc_var_point_node_ptr yask::yc_grid_point_node_ptr
                                                                                                   [Deprecated] Use yc_var_point_node_ptr.
                                                                                                   
                                                                                                  + + + + + + + + + + + + +

                                                                                                  +Variables

                                                                                                  +YASK_DEPRECATED typedef yc_var yask::yc_grid
                                                                                                   [Deprecated] Use yc_var.
                                                                                                   
                                                                                                  +YASK_DEPRECATED typedef yc_var_ptr yask::yc_grid_ptr
                                                                                                   [Deprecated] Use yc_var_ptr.
                                                                                                   
                                                                                                  +YASK_DEPRECATED typedef yc_var_point_node yask::yc_grid_point_node
                                                                                                   [Deprecated] Use yc_var_point_node.
                                                                                                   
                                                                                                  +YASK_DEPRECATED typedef yc_var_point_node_ptr yask::yc_grid_point_node_ptr
                                                                                                   [Deprecated] Use yc_var_point_node_ptr.
                                                                                                   

                                                                                                  Macro Definition Documentation

                                                                                                  diff --git a/docs/api/html/yask__compiler__api_8hpp_source.html b/docs/api/html/yask__compiler__api_8hpp_source.html index 8e005bc1..4c5d5f6d 100644 --- a/docs/api/html/yask__compiler__api_8hpp_source.html +++ b/docs/api/html/yask__compiler__api_8hpp_source.html @@ -70,60 +70,60 @@
                                                                                                  yask_compiler_api.hpp
                                                                                                  -Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2021, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_common_api.hpp"
                                                                                                  35 #include <functional>
                                                                                                  36 #include <vector>
                                                                                                  37 
                                                                                                  38 namespace yask {
                                                                                                  39 
                                                                                                  46  // Forward declarations of classes and their pointers.
                                                                                                  47  // See yask_compiler_api.hpp for more.
                                                                                                  48 
                                                                                                  49  class yc_solution;
                                                                                                  51  typedef std::shared_ptr<yc_solution> yc_solution_ptr;
                                                                                                  52 
                                                                                                  53  class yc_var;
                                                                                                  55  typedef yc_var* yc_var_ptr;
                                                                                                  56 
                                                                                                  57  // Forward declarations of expression nodes and their pointers.
                                                                                                  58 
                                                                                                  59  class yc_expr_node;
                                                                                                  61  typedef std::shared_ptr<yc_expr_node> yc_expr_node_ptr;
                                                                                                  62 
                                                                                                  63  class yc_bool_node;
                                                                                                  65  typedef std::shared_ptr<yc_bool_node> yc_bool_node_ptr;
                                                                                                  66 
                                                                                                  69  typedef std::shared_ptr<yc_number_node> yc_number_node_ptr;
                                                                                                  70 
                                                                                                  73  typedef std::shared_ptr<yc_index_node> yc_index_node_ptr;
                                                                                                  74 
                                                                                                  77  typedef std::shared_ptr<yc_equation_node> yc_equation_node_ptr;
                                                                                                  78 
                                                                                                  81  typedef std::shared_ptr<yc_var_point_node> yc_var_point_node_ptr;
                                                                                                  82 
                                                                                                  84 }
                                                                                                  85 
                                                                                                  86 #include "aux/yc_node_api.hpp"
                                                                                                  87 
                                                                                                  88 namespace yask {
                                                                                                  89 
                                                                                                  95  class yc_factory {
                                                                                                  97  public:
                                                                                                  98  virtual ~yc_factory() {}
                                                                                                  99 
                                                                                                  101 
                                                                                                  104  virtual std::string
                                                                                                  106 
                                                                                                  108 
                                                                                                  112  virtual yc_solution_ptr
                                                                                                  113  new_solution(const std::string& name ) const;
                                                                                                  115  }; // yc_factory.
                                                                                                  116 
                                                                                                  118 
                                                                                                  123  class yc_solution {
                                                                                                  124  public:
                                                                                                  125  virtual ~yc_solution() {}
                                                                                                  126 
                                                                                                  128  virtual void
                                                                                                  132 
                                                                                                  134 
                                                                                                  138  virtual std::string
                                                                                                  139  get_name() const =0;
                                                                                                  140 
                                                                                                  142 
                                                                                                  145  virtual void
                                                                                                  146  set_name(std::string name ) =0;
                                                                                                  148 
                                                                                                  150 
                                                                                                  154  virtual std::string
                                                                                                  155  get_description() const =0;
                                                                                                  156 
                                                                                                  158 
                                                                                                  163  virtual void
                                                                                                  164  set_description(std::string description ) =0;
                                                                                                  166 
                                                                                                  168 
                                                                                                  173  virtual std::string
                                                                                                  174  get_target() =0;
                                                                                                  175 
                                                                                                  177 
                                                                                                  193  virtual void
                                                                                                  194  set_target(
                                                                                                  195  const std::string& format) =0;
                                                                                                  196 
                                                                                                  198 
                                                                                                  202  virtual bool
                                                                                                  203  is_target_set() =0;
                                                                                                  204 
                                                                                                  206 
                                                                                                  207  virtual int
                                                                                                  208  get_element_bytes() const =0;
                                                                                                  209 
                                                                                                  211  virtual void
                                                                                                  212  set_element_bytes(int nbytes ) =0;
                                                                                                  214 
                                                                                                  216 
                                                                                                  246  virtual yc_var_ptr
                                                                                                  247  new_var(const std::string& name,
                                                                                                  250  const std::vector<yc_index_node_ptr>& dims ) =0;
                                                                                                  253 
                                                                                                  254 #ifndef SWIG
                                                                                                  255 
                                                                                                  262  virtual yc_var_ptr
                                                                                                  263  new_var(const std::string& name ,
                                                                                                  266  const std::initializer_list<yc_index_node_ptr>& dims ) =0;
                                                                                                  269 #endif
                                                                                                  270 
                                                                                                  272 
                                                                                                  287  virtual yc_var_ptr
                                                                                                  288  new_scratch_var(const std::string& name,
                                                                                                  291  const std::vector<yc_index_node_ptr>& dims ) =0;
                                                                                                  294 
                                                                                                  295 #ifndef SWIG
                                                                                                  296 
                                                                                                  303  virtual yc_var_ptr
                                                                                                  304  new_scratch_var(const std::string& name,
                                                                                                  308  const std::initializer_list<yc_index_node_ptr>& dims ) =0;
                                                                                                  311 #endif
                                                                                                  312 
                                                                                                  314 
                                                                                                  317  virtual int
                                                                                                  318  get_num_vars() const =0;
                                                                                                  319 
                                                                                                  321 
                                                                                                  322  virtual std::vector<yc_var_ptr>
                                                                                                  323  get_vars() =0;
                                                                                                  324 
                                                                                                  326 
                                                                                                  327  virtual yc_var_ptr
                                                                                                  328  get_var(const std::string& name ) =0;
                                                                                                  329 
                                                                                                  331 
                                                                                                  346  virtual void
                                                                                                  350  int len ) =0;
                                                                                                  351 
                                                                                                  353 
                                                                                                  357  virtual bool
                                                                                                  358  is_folding_set() =0;
                                                                                                  359 
                                                                                                  361  virtual void
                                                                                                  362  clear_folding() =0;
                                                                                                  363 
                                                                                                  365 
                                                                                                  372  virtual void
                                                                                                  376  int mult ) =0;
                                                                                                  377 
                                                                                                  379 
                                                                                                  383  virtual bool
                                                                                                  384  is_clustering_set() =0;
                                                                                                  385 
                                                                                                  387  virtual void
                                                                                                  388  clear_clustering() =0;
                                                                                                  389 
                                                                                                  391 
                                                                                                  393  virtual int
                                                                                                  394  get_num_equations() const =0;
                                                                                                  395 
                                                                                                  397 
                                                                                                  399  virtual std::vector<yc_equation_node_ptr>
                                                                                                  400  get_equations() =0;
                                                                                                  401 
                                                                                                  403 
                                                                                                  407  virtual int
                                                                                                  409  int level) =0;
                                                                                                  410 
                                                                                                  412 
                                                                                                  416  virtual void
                                                                                                  418  int level,
                                                                                                  421  int distance) =0;
                                                                                                  422 
                                                                                                  424 
                                                                                                  435  virtual void
                                                                                                  436  output_solution(yask_output_ptr output) =0;
                                                                                                  439 
                                                                                                  440 #ifndef SWIG
                                                                                                  441  typedef std::function<void(yc_solution& soln,
                                                                                                  444 
                                                                                                  446 
                                                                                                  458  virtual void
                                                                                                  460  output_hook_t hook_fn) =0;
                                                                                                  461 #endif
                                                                                                  462 
                                                                                                  464 
                                                                                                  487  virtual void
                                                                                                  488  call_after_new_solution(const std::string& code) =0;
                                                                                                  492 
                                                                                                  494 
                                                                                                  498 #define CALL_AFTER_NEW_SOLUTION(...) call_after_new_solution(#__VA_ARGS__)
                                                                                                  499 
                                                                                                  501 
                                                                                                  510  virtual void
                                                                                                  511  set_domain_dims(const std::vector<yc_index_node_ptr>& dims ) =0;
                                                                                                  513 
                                                                                                  514 #ifndef SWIG
                                                                                                  515 
                                                                                                  521  virtual void
                                                                                                  522  set_domain_dims(const std::initializer_list<yc_index_node_ptr>& dims ) =0;
                                                                                                  524 #endif
                                                                                                  525 
                                                                                                  527 
                                                                                                  534  virtual void
                                                                                                  535  set_step_dim(const yc_index_node_ptr dim) =0;
                                                                                                  537 
                                                                                                  539 
                                                                                                  548  virtual void
                                                                                                  549  set_dependency_checker_enabled(bool enable) =0;
                                                                                                  551 
                                                                                                  553 
                                                                                                  556  virtual bool
                                                                                                  558 
                                                                                                  560 
                                                                                                  629  virtual void
                                                                                                  632  yc_equation_node_ptr to) =0;
                                                                                                  634 
                                                                                                  636 
                                                                                                  639  virtual void
                                                                                                  640  clear_dependencies() =0;
                                                                                                  641 
                                                                                                  643  inline void
                                                                                                  644  format(const std::string& format_type,
                                                                                                  645  yask_output_ptr output) {
                                                                                                  646  set_target(format_type);
                                                                                                  647  output_solution(output);
                                                                                                  648  }
                                                                                                  649 
                                                                                                  651  inline yc_var_ptr
                                                                                                  652  new_grid(const std::string& name,
                                                                                                  653  const std::vector<yc_index_node_ptr>& dims) {
                                                                                                  654  return new_var(name, dims);
                                                                                                  655  }
                                                                                                  656 
                                                                                                  657 #ifndef SWIG
                                                                                                  658  inline yc_var_ptr
                                                                                                  660  new_grid(const std::string& name,
                                                                                                  661  const std::initializer_list<yc_index_node_ptr>& dims) {
                                                                                                  662  return new_var(name, dims);
                                                                                                  663  }
                                                                                                  664 #endif
                                                                                                  665 
                                                                                                  667  inline yc_var_ptr
                                                                                                  668  new_scratch_grid(const std::string& name,
                                                                                                  669  const std::vector<yc_index_node_ptr>& dims) {
                                                                                                  670  return new_scratch_var(name, dims);
                                                                                                  671  }
                                                                                                  672 
                                                                                                  673 #ifndef SWIG
                                                                                                  674  inline yc_var_ptr
                                                                                                  676  new_scratch_grid(const std::string& name,
                                                                                                  677  const std::initializer_list<yc_index_node_ptr>& dims) {
                                                                                                  678  return new_scratch_var(name, dims);
                                                                                                  679  }
                                                                                                  680 #endif
                                                                                                  681 
                                                                                                  683  inline int
                                                                                                  684  get_num_grids() const {
                                                                                                  685  return get_num_vars();
                                                                                                  686  }
                                                                                                  687 
                                                                                                  689  inline std::vector<yc_var_ptr>
                                                                                                  691  return get_vars();
                                                                                                  692  }
                                                                                                  693 
                                                                                                  695  inline yc_var_ptr
                                                                                                  696  get_grid(const std::string& name) {
                                                                                                  697  return get_var(name);
                                                                                                  698  }
                                                                                                  699  }; // yc_solution.
                                                                                                  700 
                                                                                                  702 
                                                                                                  711  class yc_var {
                                                                                                  712  public:
                                                                                                  713  virtual ~yc_var() {}
                                                                                                  714 
                                                                                                  716 
                                                                                                  718  virtual const std::string& get_name() const =0;
                                                                                                  719 
                                                                                                  721 
                                                                                                  723  virtual int get_num_dims() const =0;
                                                                                                  724 
                                                                                                  726 
                                                                                                  731  virtual std::vector<std::string>
                                                                                                  732  get_dim_names() const =0;
                                                                                                  733 
                                                                                                  735 
                                                                                                  740  virtual yc_var_point_node_ptr
                                                                                                  741  new_var_point(const std::vector<yc_number_node_ptr>& index_exprs ) =0;
                                                                                                  745 
                                                                                                  746 #ifndef SWIG
                                                                                                  747 
                                                                                                  757  virtual yc_var_point_node_ptr
                                                                                                  758  new_var_point(const std::initializer_list<yc_number_node_ptr>& index_exprs) = 0;
                                                                                                  759 #endif
                                                                                                  760 
                                                                                                  762 
                                                                                                  777  virtual yc_var_point_node_ptr
                                                                                                  778  new_relative_var_point(const std::vector<int>& dim_offsets ) =0;
                                                                                                  780 
                                                                                                  781 #ifndef SWIG
                                                                                                  782 
                                                                                                  788  virtual yc_var_point_node_ptr
                                                                                                  789  new_relative_var_point(const std::initializer_list<int>& dim_offsets) = 0;
                                                                                                  790 #endif
                                                                                                  791 
                                                                                                  793 
                                                                                                  796  virtual bool
                                                                                                  797  is_dynamic_step_alloc() const =0;
                                                                                                  798 
                                                                                                  800 
                                                                                                  803  virtual void
                                                                                                  804  set_dynamic_step_alloc(bool is_dynamic) =0;
                                                                                                  806 
                                                                                                  808 
                                                                                                  814  virtual idx_t
                                                                                                  815  get_step_alloc_size() const =0;
                                                                                                  816 
                                                                                                  818 
                                                                                                  822  virtual void
                                                                                                  823  set_step_alloc_size(idx_t size) =0;
                                                                                                  825 
                                                                                                  827  inline yc_var_point_node_ptr
                                                                                                  828  new_grid_point(const std::vector<yc_number_node_ptr>& index_exprs) {
                                                                                                  829  return new_var_point(index_exprs);
                                                                                                  830  }
                                                                                                  832  inline yc_var_point_node_ptr
                                                                                                  833  new_grid_point(const std::initializer_list<yc_number_node_ptr>& index_exprs) {
                                                                                                  834  return new_var_point(index_exprs);
                                                                                                  835  }
                                                                                                  837  inline yc_var_point_node_ptr
                                                                                                  838  new_relative_grid_point(const std::vector<int>& dim_offsets) {
                                                                                                  839  return new_relative_var_point(dim_offsets);
                                                                                                  840  }
                                                                                                  842  inline yc_var_point_node_ptr
                                                                                                  843  new_relative_grid_point(const std::initializer_list<int>& dim_offsets) {
                                                                                                  844  return new_relative_var_point(dim_offsets);
                                                                                                  845  }
                                                                                                  846 
                                                                                                  847  }; // yc_var.
                                                                                                  848 
                                                                                                  850 
                                                                                                  882  class yc_var_proxy {
                                                                                                  883  private:
                                                                                                  884  yc_var_ptr _var;
                                                                                                  885 
                                                                                                  886  public:
                                                                                                  887 
                                                                                                  889 
                                                                                                  893  yc_var_proxy(const std::string& name,
                                                                                                  896  yc_solution_ptr soln,
                                                                                                  898  const std::vector< yc_index_node_ptr > &dims,
                                                                                                  901  bool is_scratch = false) {
                                                                                                  903  if (is_scratch)
                                                                                                  904  _var = soln->new_scratch_var(name, dims);
                                                                                                  905  else
                                                                                                  906  _var = soln->new_var(name, dims);
                                                                                                  907  }
                                                                                                  908 
                                                                                                  909 #ifndef SWIG
                                                                                                  910 
                                                                                                  916  yc_var_proxy(const std::string& name,
                                                                                                  919  yc_solution_ptr soln,
                                                                                                  921  const std::initializer_list< yc_index_node_ptr > &dims,
                                                                                                  924  bool is_scratch = false) {
                                                                                                  926  if (is_scratch)
                                                                                                  927  _var = soln->new_scratch_var(name, dims);
                                                                                                  928  else
                                                                                                  929  _var = soln->new_var(name, dims);
                                                                                                  930  }
                                                                                                  931 #endif
                                                                                                  932 
                                                                                                  934 
                                                                                                  937  yc_var_proxy(const std::string& name,
                                                                                                  940  yc_solution_ptr soln) {
                                                                                                  942  _var = soln->new_var(name, { });
                                                                                                  943  }
                                                                                                  944 
                                                                                                  946 
                                                                                                  950  yc_var_proxy(yc_var_ptr& var) : _var(var) { }
                                                                                                  951 
                                                                                                  953  virtual ~yc_var_proxy() { }
                                                                                                  954 
                                                                                                  956  virtual yc_var_ptr get_var() {
                                                                                                  957  return _var;
                                                                                                  958  }
                                                                                                  959 
                                                                                                  961  virtual yc_var_ptr get_var() const {
                                                                                                  962  return _var;
                                                                                                  963  }
                                                                                                  964 
                                                                                                  966 
                                                                                                  972  virtual yc_var_point_node_ptr
                                                                                                  973  operator()(const std::vector<yc_number_node_ptr>& index_exprs) {
                                                                                                  974  return _var->new_var_point(index_exprs);
                                                                                                  975  }
                                                                                                  976 
                                                                                                  977 #ifndef SWIG
                                                                                                  978 
                                                                                                  986  virtual yc_var_point_node_ptr
                                                                                                  987  operator()(const std::initializer_list<yc_number_node_ptr>& index_exprs) {
                                                                                                  988  return _var->new_var_point(index_exprs);
                                                                                                  989  }
                                                                                                  990 
                                                                                                  992 
                                                                                                  1001  const yc_number_any_arg i2 = nullptr,
                                                                                                  1002  const yc_number_any_arg i3 = nullptr,
                                                                                                  1003  const yc_number_any_arg i4 = nullptr,
                                                                                                  1004  const yc_number_any_arg i5 = nullptr,
                                                                                                  1005  const yc_number_any_arg i6 = nullptr) {
                                                                                                  1006  std::vector<yc_number_node_ptr> args;
                                                                                                  1007  if (i1)
                                                                                                  1008  args.push_back(i1);
                                                                                                  1009  if (i2)
                                                                                                  1010  args.push_back(i2);
                                                                                                  1011  if (i3)
                                                                                                  1012  args.push_back(i3);
                                                                                                  1013  if (i4)
                                                                                                  1014  args.push_back(i4);
                                                                                                  1015  if (i5)
                                                                                                  1016  args.push_back(i5);
                                                                                                  1017  if (i6)
                                                                                                  1018  args.push_back(i6);
                                                                                                  1019  return _var->new_var_point(args);
                                                                                                  1020  }
                                                                                                  1021 
                                                                                                  1023 
                                                                                                  1030  virtual operator yc_number_ptr_arg() {
                                                                                                  1031  return _var->new_var_point({});
                                                                                                  1032  }
                                                                                                  1033 
                                                                                                  1035 
                                                                                                  1043  return _var->new_var_point({i1});
                                                                                                  1044  }
                                                                                                  1045 
                                                                                                  1046 #endif
                                                                                                  1047 
                                                                                                  1048  }; // yc_var_proxy.
                                                                                                  1051  typedef yc_var yc_grid;
                                                                                                  1059 
                                                                                                  1060 } // namespace yask.
                                                                                                  1061 
                                                                                                  1062 // More solution-based objects.
                                                                                                  1063 #include "aux/yc_solution_api.hpp"
                                                                                                  virtual void output_solution(yask_output_ptr output)=0
                                                                                                  Optimize the current equation(s) and write to given output object.
                                                                                                  -
                                                                                                  Arguments that may be YASK or non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:560
                                                                                                  -
                                                                                                  virtual yc_var_ptr get_var()
                                                                                                  Get the underlying yc_var pointer.
                                                                                                  Definition: yask_compiler_api.hpp:956
                                                                                                  +Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2022, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_common_api.hpp"
                                                                                                  35 #include <functional>
                                                                                                  36 #include <vector>
                                                                                                  37 
                                                                                                  38 namespace yask {
                                                                                                  39 
                                                                                                  46  // Forward declarations of classes and their pointers.
                                                                                                  47  // See yask_compiler_api.hpp for more.
                                                                                                  48 
                                                                                                  49  class yc_solution;
                                                                                                  51  typedef std::shared_ptr<yc_solution> yc_solution_ptr;
                                                                                                  52 
                                                                                                  53  class yc_var;
                                                                                                  55  typedef yc_var* yc_var_ptr;
                                                                                                  56 
                                                                                                  57  // Forward declarations of expression nodes and their pointers.
                                                                                                  58 
                                                                                                  59  class yc_expr_node;
                                                                                                  61  typedef std::shared_ptr<yc_expr_node> yc_expr_node_ptr;
                                                                                                  62 
                                                                                                  63  class yc_bool_node;
                                                                                                  65  typedef std::shared_ptr<yc_bool_node> yc_bool_node_ptr;
                                                                                                  66 
                                                                                                  69  typedef std::shared_ptr<yc_number_node> yc_number_node_ptr;
                                                                                                  70 
                                                                                                  73  typedef std::shared_ptr<yc_index_node> yc_index_node_ptr;
                                                                                                  74 
                                                                                                  77  typedef std::shared_ptr<yc_equation_node> yc_equation_node_ptr;
                                                                                                  78 
                                                                                                  81  typedef std::shared_ptr<yc_var_point_node> yc_var_point_node_ptr;
                                                                                                  82 
                                                                                                  84 }
                                                                                                  85 
                                                                                                  86 #include "aux/yc_node_api.hpp"
                                                                                                  87 
                                                                                                  88 namespace yask {
                                                                                                  89 
                                                                                                  95  class yc_factory {
                                                                                                  97  public:
                                                                                                  98  virtual ~yc_factory() {}
                                                                                                  99 
                                                                                                  101 
                                                                                                  104  virtual std::string
                                                                                                  106 
                                                                                                  108 
                                                                                                  112  virtual yc_solution_ptr
                                                                                                  113  new_solution(const std::string& name ) const;
                                                                                                  115  }; // yc_factory.
                                                                                                  116 
                                                                                                  118 
                                                                                                  123  class yc_solution {
                                                                                                  124  public:
                                                                                                  125  virtual ~yc_solution() {}
                                                                                                  126 
                                                                                                  128  virtual void
                                                                                                  132 
                                                                                                  134 
                                                                                                  138  virtual std::string
                                                                                                  139  get_name() const =0;
                                                                                                  140 
                                                                                                  142 
                                                                                                  145  virtual void
                                                                                                  146  set_name(std::string name ) =0;
                                                                                                  148 
                                                                                                  150 
                                                                                                  154  virtual std::string
                                                                                                  155  get_description() const =0;
                                                                                                  156 
                                                                                                  158 
                                                                                                  163  virtual void
                                                                                                  164  set_description(std::string description ) =0;
                                                                                                  166 
                                                                                                  168 
                                                                                                  173  virtual std::string
                                                                                                  174  get_target() =0;
                                                                                                  175 
                                                                                                  177 
                                                                                                  193  virtual void
                                                                                                  194  set_target(
                                                                                                  195  const std::string& format) =0;
                                                                                                  196 
                                                                                                  198 
                                                                                                  202  virtual bool
                                                                                                  203  is_target_set() =0;
                                                                                                  204 
                                                                                                  206 
                                                                                                  207  virtual int
                                                                                                  208  get_element_bytes() const =0;
                                                                                                  209 
                                                                                                  211  virtual void
                                                                                                  212  set_element_bytes(int nbytes ) =0;
                                                                                                  214 
                                                                                                  216 
                                                                                                  246  virtual yc_var_ptr
                                                                                                  247  new_var(const std::string& name,
                                                                                                  250  const std::vector<yc_index_node_ptr>& dims ) =0;
                                                                                                  253 
                                                                                                  254  #ifndef SWIG
                                                                                                  255 
                                                                                                  262  virtual yc_var_ptr
                                                                                                  263  new_var(const std::string& name ,
                                                                                                  266  const std::initializer_list<yc_index_node_ptr>& dims ) =0;
                                                                                                  269  #endif
                                                                                                  270 
                                                                                                  272 
                                                                                                  287  virtual yc_var_ptr
                                                                                                  288  new_scratch_var(const std::string& name,
                                                                                                  291  const std::vector<yc_index_node_ptr>& dims ) =0;
                                                                                                  294 
                                                                                                  295  #ifndef SWIG
                                                                                                  296 
                                                                                                  303  virtual yc_var_ptr
                                                                                                  304  new_scratch_var(const std::string& name,
                                                                                                  308  const std::initializer_list<yc_index_node_ptr>& dims ) =0;
                                                                                                  311  #endif
                                                                                                  312 
                                                                                                  314 
                                                                                                  317  virtual int
                                                                                                  318  get_num_vars() const =0;
                                                                                                  319 
                                                                                                  321 
                                                                                                  322  virtual std::vector<yc_var_ptr>
                                                                                                  323  get_vars() =0;
                                                                                                  324 
                                                                                                  326 
                                                                                                  327  virtual yc_var_ptr
                                                                                                  328  get_var(const std::string& name ) =0;
                                                                                                  329 
                                                                                                  331 
                                                                                                  346  virtual void
                                                                                                  350  int len ) =0;
                                                                                                  351 
                                                                                                  353 
                                                                                                  357  virtual bool
                                                                                                  358  is_folding_set() =0;
                                                                                                  359 
                                                                                                  361  virtual void
                                                                                                  362  clear_folding() =0;
                                                                                                  363 
                                                                                                  365 
                                                                                                  372  virtual void
                                                                                                  376  int mult ) =0;
                                                                                                  377 
                                                                                                  379 
                                                                                                  383  virtual bool
                                                                                                  384  is_clustering_set() =0;
                                                                                                  385 
                                                                                                  387  virtual void
                                                                                                  388  clear_clustering() =0;
                                                                                                  389 
                                                                                                  391 
                                                                                                  393  virtual int
                                                                                                  394  get_num_equations() const =0;
                                                                                                  395 
                                                                                                  397 
                                                                                                  399  virtual std::vector<yc_equation_node_ptr>
                                                                                                  400  get_equations() =0;
                                                                                                  401 
                                                                                                  403 
                                                                                                  407  virtual int
                                                                                                  409  int level) =0;
                                                                                                  410 
                                                                                                  412  virtual void
                                                                                                  414  int level,
                                                                                                  417  int distance) =0;
                                                                                                  418 
                                                                                                  420 
                                                                                                  431  virtual void
                                                                                                  432  output_solution(yask_output_ptr output) =0;
                                                                                                  435 
                                                                                                  436  #ifndef SWIG
                                                                                                  437  typedef std::function<void(yc_solution& soln,
                                                                                                  440 
                                                                                                  442 
                                                                                                  454  virtual void
                                                                                                  456  output_hook_t hook_fn) =0;
                                                                                                  457  #endif
                                                                                                  458 
                                                                                                  460 
                                                                                                  483  virtual void
                                                                                                  484  call_after_new_solution(const std::string& code) =0;
                                                                                                  488 
                                                                                                  490 
                                                                                                  494  #define CALL_AFTER_NEW_SOLUTION(...) call_after_new_solution(#__VA_ARGS__)
                                                                                                  495 
                                                                                                  497 
                                                                                                  506  virtual void
                                                                                                  507  set_domain_dims(const std::vector<yc_index_node_ptr>& dims ) =0;
                                                                                                  509 
                                                                                                  510  #ifndef SWIG
                                                                                                  511 
                                                                                                  517  virtual void
                                                                                                  518  set_domain_dims(const std::initializer_list<yc_index_node_ptr>& dims ) =0;
                                                                                                  520  #endif
                                                                                                  521 
                                                                                                  523 
                                                                                                  530  virtual void
                                                                                                  531  set_step_dim(const yc_index_node_ptr dim) =0;
                                                                                                  533 
                                                                                                  535 
                                                                                                  544  virtual void
                                                                                                  545  set_dependency_checker_enabled(bool enable) =0;
                                                                                                  547 
                                                                                                  549 
                                                                                                  552  virtual bool
                                                                                                  554 
                                                                                                  556 
                                                                                                  625  virtual void
                                                                                                  628  yc_equation_node_ptr to) =0;
                                                                                                  630 
                                                                                                  632 
                                                                                                  635  virtual void
                                                                                                  636  clear_dependencies() =0;
                                                                                                  637 
                                                                                                  640  inline void
                                                                                                  641  format(const std::string& format_type,
                                                                                                  642  yask_output_ptr output) {
                                                                                                  643  set_target(format_type);
                                                                                                  644  output_solution(output);
                                                                                                  645  }
                                                                                                  646 
                                                                                                  649  inline yc_var_ptr
                                                                                                  650  new_grid(const std::string& name,
                                                                                                  651  const std::vector<yc_index_node_ptr>& dims) {
                                                                                                  652  return new_var(name, dims);
                                                                                                  653  }
                                                                                                  654 
                                                                                                  655  #ifndef SWIG
                                                                                                  658  inline yc_var_ptr
                                                                                                  659  new_grid(const std::string& name,
                                                                                                  660  const std::initializer_list<yc_index_node_ptr>& dims) {
                                                                                                  661  return new_var(name, dims);
                                                                                                  662  }
                                                                                                  663  #endif
                                                                                                  664 
                                                                                                  667  inline yc_var_ptr
                                                                                                  668  new_scratch_grid(const std::string& name,
                                                                                                  669  const std::vector<yc_index_node_ptr>& dims) {
                                                                                                  670  return new_scratch_var(name, dims);
                                                                                                  671  }
                                                                                                  672 
                                                                                                  673  #ifndef SWIG
                                                                                                  676  inline yc_var_ptr
                                                                                                  677  new_scratch_grid(const std::string& name,
                                                                                                  678  const std::initializer_list<yc_index_node_ptr>& dims) {
                                                                                                  679  return new_scratch_var(name, dims);
                                                                                                  680  }
                                                                                                  681  #endif
                                                                                                  682 
                                                                                                  685  inline int
                                                                                                  686  get_num_grids() const {
                                                                                                  687  return get_num_vars();
                                                                                                  688  }
                                                                                                  689 
                                                                                                  692  inline std::vector<yc_var_ptr>
                                                                                                  694  return get_vars();
                                                                                                  695  }
                                                                                                  696 
                                                                                                  699  inline yc_var_ptr
                                                                                                  700  get_grid(const std::string& name) {
                                                                                                  701  return get_var(name);
                                                                                                  702  }
                                                                                                  703  }; // yc_solution.
                                                                                                  704 
                                                                                                  706 
                                                                                                  // Abstract interface to a var that belongs to a stencil solution.
                                                                                                  // Every member except the destructor and the legacy *_grid_* forwarders
                                                                                                  // is pure virtual and must be supplied by the implementing class.
                                                                                                  715  class yc_var {
                                                                                                  716  public:
                                                                                                  // Virtual destructor so an implementation can be destroyed through a yc_var pointer.
                                                                                                  717  virtual ~yc_var() {}
                                                                                                  718 
                                                                                                  720 
                                                                                                  // Get the name of this var (returned by const reference).
                                                                                                  722  virtual const std::string& get_name() const =0;
                                                                                                  723 
                                                                                                  725 
                                                                                                  // Get the number of dimensions of this var.
                                                                                                  727  virtual int get_num_dims() const =0;
                                                                                                  728 
                                                                                                  730 
                                                                                                  // Get all the dimensions in this var, as a vector of names.
                                                                                                  735  virtual string_vec
                                                                                                  736  get_dim_names() const =0;
                                                                                                  737 
                                                                                                  739 
                                                                                                  // Create a reference to a point in this var.
                                                                                                  // index_exprs: the index expressions for the point; presumably one per
                                                                                                  // dimension, in the order given by get_dim_names() -- TODO confirm.
                                                                                                  744  virtual yc_var_point_node_ptr
                                                                                                  745  new_var_point(const std::vector<yc_number_node_ptr>& index_exprs ) =0;
                                                                                                  749 
                                                                                                  // The initializer-list overload below is compiled out when SWIG is defined.
                                                                                                  750  #ifndef SWIG
                                                                                                  751 
                                                                                                  // Same operation as above, taking the index expressions as a braced list.
                                                                                                  761  virtual yc_var_point_node_ptr
                                                                                                  762  new_var_point(const std::initializer_list<yc_number_node_ptr>& index_exprs) = 0;
                                                                                                  763  #endif
                                                                                                  764 
                                                                                                  766 
                                                                                                  // Query the dynamic step-allocation flag of this var.
                                                                                                  // NOTE(review): the exact meaning of "dynamic" is not visible in this listing.
                                                                                                  769  virtual bool
                                                                                                  770  is_dynamic_step_alloc() const =0;
                                                                                                  771 
                                                                                                  773 
                                                                                                  // Set the dynamic step-allocation flag of this var.
                                                                                                  776  virtual void
                                                                                                  777  set_dynamic_step_alloc(bool is_dynamic) =0;
                                                                                                  779 
                                                                                                  781 
                                                                                                  // [Advanced] Get the current allocation in the step dimension of this var.
                                                                                                  787  virtual idx_t
                                                                                                  788  get_step_alloc_size() const =0;
                                                                                                  789 
                                                                                                  791 
                                                                                                  // Set the allocation size in the step dimension of this var
                                                                                                  // (presumably the [Advanced] counterpart of get_step_alloc_size() -- confirm).
                                                                                                  795  virtual void
                                                                                                  796  set_step_alloc_size(idx_t size) =0;
                                                                                                  798 
                                                                                                  // Create a reference to a point in this var from integer offsets.
                                                                                                  // NOTE(review): presumably each entry of dim_offsets is an offset from the
                                                                                                  // corresponding dimension's index -- verify against the implementation.
                                                                                                  801  virtual yc_var_point_node_ptr
                                                                                                  802  new_relative_var_point(const std::vector<int>& dim_offsets) =0;
                                                                                                  803  #ifndef SWIG
                                                                                                  // Braced-list overload of new_relative_var_point(); not visible to SWIG.
                                                                                                  806  virtual yc_var_point_node_ptr
                                                                                                  807  new_relative_var_point(const std::initializer_list<int>& dim_offsets) = 0;
                                                                                                  808  #endif
                                                                                                  809 
                                                                                                  // [Deprecated] Use new_var_point(). Forwards its argument unchanged.
                                                                                                  812  inline yc_var_point_node_ptr
                                                                                                  813  new_grid_point(const std::vector<yc_number_node_ptr>& index_exprs) {
                                                                                                  814  return new_var_point(index_exprs);
                                                                                                  815  }
                                                                                                  816  #ifndef SWIG
                                                                                                  // [Deprecated] Use new_var_point(). Braced-list overload; not visible to SWIG.
                                                                                                  819  inline yc_var_point_node_ptr
                                                                                                  820  new_grid_point(const std::initializer_list<yc_number_node_ptr>& index_exprs) {
                                                                                                  821  return new_var_point(index_exprs);
                                                                                                  822  }
                                                                                                  823  #endif
                                                                                                  824 
                                                                                                  // Legacy "grid" spelling; forwards to new_relative_var_point()
                                                                                                  // (presumably deprecated like new_grid_point() -- confirm).
                                                                                                  827  inline yc_var_point_node_ptr
                                                                                                  828  new_relative_grid_point(const std::vector<int>& dim_offsets) {
                                                                                                  829  return new_relative_var_point(dim_offsets);
                                                                                                  830  }
                                                                                                  831  #ifndef SWIG
                                                                                                  // Braced-list overload of the legacy forwarder above; not visible to SWIG.
                                                                                                  834  inline yc_var_point_node_ptr
                                                                                                  835  new_relative_grid_point(const std::initializer_list<int>& dim_offsets) {
                                                                                                  836  return new_relative_var_point(dim_offsets);
                                                                                                  837  }
                                                                                                  838  #endif
                                                                                                  839 
                                                                                                  840  }; // yc_var.
                                                                                                  841 
                                                                                                  843 
                                                                                                  // A wrapper or "proxy" class around a yc_var pointer.
                                                                                                  // Constructors either create a new var in a solution or wrap an existing
                                                                                                  // one; the call operators forward to yc_var::new_var_point().
                                                                                                  875  class yc_var_proxy {
                                                                                                  876  private:
                                                                                                  // The wrapped var; set by every constructor.
                                                                                                  877  yc_var_ptr _var;
                                                                                                  878 
                                                                                                  879  public:
                                                                                                  880 
                                                                                                  882 
                                                                                                  // Create a new var named 'name' in solution 'soln' with dimensions 'dims'.
                                                                                                  // When 'is_scratch' is true the var is created via new_scratch_var(),
                                                                                                  // otherwise via new_var().
                                                                                                  886  yc_var_proxy(const std::string& name,
                                                                                                  889  yc_solution_ptr soln,
                                                                                                  891  const std::vector< yc_index_node_ptr > &dims,
                                                                                                  894  bool is_scratch = false) {
                                                                                                  896  if (is_scratch)
                                                                                                  897  _var = soln->new_scratch_var(name, dims);
                                                                                                  898  else
                                                                                                  899  _var = soln->new_var(name, dims);
                                                                                                  900  }
                                                                                                  901 
                                                                                                  902  #ifndef SWIG
                                                                                                  903 
                                                                                                  // Same as the constructor above, taking 'dims' as a braced list.
                                                                                                  // Compiled out when SWIG is defined.
                                                                                                  909  yc_var_proxy(const std::string& name,
                                                                                                  912  yc_solution_ptr soln,
                                                                                                  914  const std::initializer_list< yc_index_node_ptr > &dims,
                                                                                                  917  bool is_scratch = false) {
                                                                                                  919  if (is_scratch)
                                                                                                  920  _var = soln->new_scratch_var(name, dims);
                                                                                                  921  else
                                                                                                  922  _var = soln->new_var(name, dims);
                                                                                                  923  }
                                                                                                  924  #endif
                                                                                                  925 
                                                                                                  927 
                                                                                                  // Constructor for a simple scalar value: creates a non-scratch var
                                                                                                  // with an empty dimension list.
                                                                                                  930  yc_var_proxy(const std::string& name,
                                                                                                  933  yc_solution_ptr soln) {
                                                                                                  935  _var = soln->new_var(name, { });
                                                                                                  936  }
                                                                                                  937 
                                                                                                  939 
                                                                                                  // Wrap an existing var; no new var is created.
                                                                                                  943  yc_var_proxy(yc_var_ptr& var) : _var(var) { }
                                                                                                  944 
                                                                                                  // Provide a virtual destructor.
                                                                                                  946  virtual ~yc_var_proxy() { }
                                                                                                  947 
                                                                                                  // Get the underlying yc_var pointer.
                                                                                                  949  virtual yc_var_ptr get_var() {
                                                                                                  950  return _var;
                                                                                                  951  }
                                                                                                  952 
                                                                                                  // Const overload: get the underlying yc_var pointer.
                                                                                                  954  virtual yc_var_ptr get_var() const {
                                                                                                  955  return _var;
                                                                                                  956  }
                                                                                                  957 
                                                                                                  959 
                                                                                                  // Create a reference to a point in the wrapped var; forwards
                                                                                                  // 'index_exprs' unchanged to yc_var::new_var_point().
                                                                                                  965  virtual yc_var_point_node_ptr
                                                                                                  966  operator()(const std::vector<yc_number_node_ptr>& index_exprs) {
                                                                                                  967  return _var->new_var_point(index_exprs);
                                                                                                  968  }
                                                                                                  969 
                                                                                                  970  #ifndef SWIG
                                                                                                  971 
                                                                                                  // Braced-list overload of the call operator; not visible to SWIG.
                                                                                                  979  virtual yc_var_point_node_ptr
                                                                                                  980  operator()(const std::initializer_list<yc_number_node_ptr>& index_exprs) {
                                                                                                  981  return _var->new_var_point(index_exprs);
                                                                                                  982  }
                                                                                                  983 
                                                                                                  985 
                                                                                                  // NOTE(review): the opening line(s) of this declaration (return type,
                                                                                                  // name and parameter i1) are not present in this listing.
                                                                                                  // Visible behavior: each of i1..i6 that tests true is appended, in
                                                                                                  // order, to a vector that is then passed to new_var_point().
                                                                                                  // yc_number_any_arg accepts YASK or non-YASK numeric types.
                                                                                                  994  const yc_number_any_arg i2 = nullptr,
                                                                                                  995  const yc_number_any_arg i3 = nullptr,
                                                                                                  996  const yc_number_any_arg i4 = nullptr,
                                                                                                  997  const yc_number_any_arg i5 = nullptr,
                                                                                                  998  const yc_number_any_arg i6 = nullptr) {
                                                                                                  999  std::vector<yc_number_node_ptr> args;
                                                                                                  1000  if (i1)
                                                                                                  1001  args.push_back(i1);
                                                                                                  1002  if (i2)
                                                                                                  1003  args.push_back(i2);
                                                                                                  1004  if (i3)
                                                                                                  1005  args.push_back(i3);
                                                                                                  1006  if (i4)
                                                                                                  1007  args.push_back(i4);
                                                                                                  1008  if (i5)
                                                                                                  1009  args.push_back(i5);
                                                                                                  1010  if (i6)
                                                                                                  1011  args.push_back(i6);
                                                                                                  1012  return _var->new_var_point(args);
                                                                                                  1013  }
                                                                                                  1014 
                                                                                                  1016 
                                                                                                  // Conversion operator: yields a point created with an empty index list
                                                                                                  // (presumably intended for scalar vars -- confirm).
                                                                                                  1023  virtual operator yc_number_ptr_arg() {
                                                                                                  1024  return _var->new_var_point({});
                                                                                                  1025  }
                                                                                                  1026 
                                                                                                  1028 
                                                                                                  // NOTE(review): the declaration line for this member is not present in
                                                                                                  // this listing. Visible behavior: creates a point using the single index i1.
                                                                                                  1036  return _var->new_var_point({i1});
                                                                                                  1037  }
                                                                                                  1038 
                                                                                                  1039  #endif
                                                                                                  1040 
                                                                                                  1041  }; // yc_var_proxy.
                                                                                                  // Legacy alias so code written against the older "grid" name still compiles.
                                                                                                  // NOTE(review): presumably deprecated like the other *grid* names -- confirm.
                                                                                                  1046  typedef yc_var yc_grid;
                                                                                                  1056 
                                                                                                  1057 } // namespace yask.
                                                                                                  1058 
                                                                                                  1059 // More solution-based objects.
                                                                                                  1060 #include "aux/yc_solution_api.hpp"
                                                                                                  virtual void output_solution(yask_output_ptr output)=0
                                                                                                  Optimize the current equation(s) and write them to the given output object.
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_ptr get_grid(const std::string &name)
                                                                                                  [Deprecated] Use get_var().
                                                                                                  Definition: yask_compiler_api.hpp:700
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_ptr new_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yask_compiler_api.hpp:650
                                                                                                  +
                                                                                                  Arguments that may be YASK or non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:561
                                                                                                  +
                                                                                                  virtual yc_var_ptr get_var()
                                                                                                  Get the underlying yc_var pointer.
                                                                                                  Definition: yask_compiler_api.hpp:949
                                                                                                  virtual void clear_dependencies()=0
                                                                                                  [Advanced] Remove all existing dependencies.
                                                                                                  -
                                                                                                  std::vector< yc_var_ptr > get_grids()
                                                                                                  [Deprecated] Use get_vars().
                                                                                                  Definition: yask_compiler_api.hpp:690
                                                                                                  -
                                                                                                  virtual std::vector< std::string > get_dim_names() const =0
                                                                                                  Get all the dimensions in this var.
                                                                                                  +
                                                                                                  YASK_DEPRECATED int get_num_grids() const
                                                                                                  [Deprecated] Use get_num_vars().
                                                                                                  Definition: yask_compiler_api.hpp:686
                                                                                                  virtual std::string get_version_string()
                                                                                                  Version information.
                                                                                                  virtual void set_step_dim(const yc_index_node_ptr dim)=0
                                                                                                  [Advanced] Explicitly identify the step dimension in the solution.
                                                                                                  -
                                                                                                  yc_var_ptr new_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yask_compiler_api.hpp:652
                                                                                                  -
                                                                                                  yc_var_point_node_ptr new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)
                                                                                                  [Deprecated] Use new_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:833
                                                                                                  virtual yc_var_ptr get_var(const std::string &name)=0
                                                                                                  Get the specified var.
                                                                                                  virtual void set_domain_dims(const std::vector< yc_index_node_ptr > &dims)=0
                                                                                                  [Advanced] Explicitly define and order the domain dimensions used in the solution.
                                                                                                  virtual yc_var_point_node_ptr new_var_point(const std::vector< yc_number_node_ptr > &index_exprs)=0
                                                                                                  Create a reference to a point in this var.
                                                                                                  -
                                                                                                  yc_var_ptr new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_scratch_var().
                                                                                                  Definition: yask_compiler_api.hpp:668
                                                                                                  virtual void set_target(const std::string &format)=0
                                                                                                  Set the output target.
                                                                                                  virtual bool is_clustering_set()=0
                                                                                                  Determine whether any clustering has been set.
                                                                                                  -
                                                                                                  yc_var_proxy(const std::string &name, yc_solution_ptr soln)
                                                                                                  Constructor for a simple scalar value.
                                                                                                  Definition: yask_compiler_api.hpp:937
                                                                                                  +
                                                                                                  yc_var_proxy(const std::string &name, yc_solution_ptr soln)
                                                                                                  Constructor for a simple scalar value.
                                                                                                  Definition: yask_compiler_api.hpp:930
                                                                                                  virtual std::vector< yc_var_ptr > get_vars()=0
                                                                                                  Get all the vars in the solution.
                                                                                                  virtual idx_t get_step_alloc_size() const =0
                                                                                                  [Advanced] Get the current allocation in the step dimension of this var.
                                                                                                  Stencil solution.
                                                                                                  Definition: yask_compiler_api.hpp:123
                                                                                                  virtual bool is_dependency_checker_enabled() const =0
                                                                                                  [Advanced] Determine whether automatic dependency checker is enabled.
                                                                                                  std::shared_ptr< yc_var_point_node > yc_var_point_node_ptr
                                                                                                  Shared pointer to yc_var_point_node.
                                                                                                  Definition: yask_compiler_api.hpp:79
                                                                                                  yc_var * yc_var_ptr
                                                                                                  Pointer to yc_var.
                                                                                                  Definition: yask_compiler_api.hpp:53
                                                                                                  -
                                                                                                  A wrapper or "proxy" class around a yc_var pointer.
                                                                                                  Definition: yask_compiler_api.hpp:882
                                                                                                  +
                                                                                                  A wrapper or "proxy" class around a yc_var pointer.
                                                                                                  Definition: yask_compiler_api.hpp:875
                                                                                                  +
                                                                                                  YASK_DEPRECATED void format(const std::string &format_type, yask_output_ptr output)
                                                                                                  [Deprecated] Use set_target() and output_solution().
                                                                                                  Definition: yask_compiler_api.hpp:641
                                                                                                  virtual std::string get_target()=0
                                                                                                  Get the current output-file format.
                                                                                                  virtual void call_before_output(output_hook_t hook_fn)=0
                                                                                                  [Advanced] Register a function to be called before a solution is output.
                                                                                                  virtual void set_description(std::string description)=0
                                                                                                  Set the description of the solution.
                                                                                                  Equation node.
                                                                                                  Definition: yc_node_api.hpp:149
                                                                                                  virtual std::vector< yc_equation_node_ptr > get_equations()=0
                                                                                                  Get a list of all the defined equations.
                                                                                                  -
                                                                                                  virtual ~yc_var_proxy()
                                                                                                  Provide a virtual destructor.
                                                                                                  Definition: yask_compiler_api.hpp:953
                                                                                                  -
                                                                                                  yc_var_point_node_ptr yc_grid_point_node_ptr
                                                                                                  [Deprecated] Use yc_var_point_node_ptr.
                                                                                                  Definition: yask_compiler_api.hpp:1058
                                                                                                  -
                                                                                                  yc_var_proxy(const std::string &name, yc_solution_ptr soln, const std::initializer_list< yc_index_node_ptr > &dims, bool is_scratch=false)
                                                                                                  Contructor taking an initializer_list of index vars.
                                                                                                  Definition: yask_compiler_api.hpp:916
                                                                                                  -
                                                                                                  yc_var_point_node_ptr new_relative_grid_point(const std::initializer_list< int > &dim_offsets)
                                                                                                  [Deprecated] Use new_relative_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:843
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_ptr new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yask_compiler_api.hpp:659
                                                                                                  +
                                                                                                  virtual ~yc_var_proxy()
                                                                                                  Provide a virtual destructor.
                                                                                                  Definition: yask_compiler_api.hpp:946
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_point_node_ptr new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)
                                                                                                  [Deprecated] Use new_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:813
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_ptr new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_scratch_var().
                                                                                                  Definition: yask_compiler_api.hpp:677
                                                                                                  +
                                                                                                  yc_var_proxy(const std::string &name, yc_solution_ptr soln, const std::initializer_list< yc_index_node_ptr > &dims, bool is_scratch=false)
                                                                                                  Contructor taking an initializer_list of index vars.
                                                                                                  Definition: yask_compiler_api.hpp:909
                                                                                                  virtual void set_step_alloc_size(idx_t size)=0
                                                                                                  [Advanced] Set the current allocation in the step dimension of this var.
                                                                                                  std::shared_ptr< yc_index_node > yc_index_node_ptr
                                                                                                  Shared pointer to yc_index_node.
                                                                                                  Definition: yask_compiler_api.hpp:71
                                                                                                  +
                                                                                                  YASK_INT64_T idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:77
                                                                                                  virtual void set_dynamic_step_alloc(bool is_dynamic)=0
                                                                                                  [Advanced] Set whether the allocation of the step dimension of this var can be modified at run-time.
                                                                                                  -
                                                                                                  int get_num_grids() const
                                                                                                  [Deprecated] Use get_num_vars().
                                                                                                  Definition: yask_compiler_api.hpp:684
                                                                                                  -
                                                                                                  virtual yc_var_ptr get_var() const
                                                                                                  Get the underlying yc_var pointer.
                                                                                                  Definition: yask_compiler_api.hpp:961
                                                                                                  +
                                                                                                  virtual yc_var_ptr get_var() const
                                                                                                  Get the underlying yc_var pointer.
                                                                                                  Definition: yask_compiler_api.hpp:954
                                                                                                  Base class for all numerical AST nodes.
                                                                                                  Definition: yc_node_api.hpp:247
                                                                                                  virtual int get_num_vars() const =0
                                                                                                  Get the number of vars in the solution.
                                                                                                  -
                                                                                                  yc_var_ptr get_grid(const std::string &name)
                                                                                                  [Deprecated] Use get_var().
                                                                                                  Definition: yask_compiler_api.hpp:696
                                                                                                  virtual void clear_clustering()=0
                                                                                                  Remove all vector-clustering settings.
                                                                                                  -
                                                                                                  virtual yc_var_point_node_ptr operator()(const std::initializer_list< yc_number_node_ptr > &index_exprs)
                                                                                                  Create an expression for a point in a var.
                                                                                                  Definition: yask_compiler_api.hpp:987
                                                                                                  +
                                                                                                  virtual yc_var_point_node_ptr operator()(const std::initializer_list< yc_number_node_ptr > &index_exprs)
                                                                                                  Create an expression for a point in a var.
                                                                                                  Definition: yask_compiler_api.hpp:980
                                                                                                  std::shared_ptr< yc_expr_node > yc_expr_node_ptr
                                                                                                  Shared pointer to yc_expr_node.
                                                                                                  Definition: yask_compiler_api.hpp:59
                                                                                                  virtual yc_solution_ptr new_solution(const std::string &name) const
                                                                                                  Create a stencil solution.
                                                                                                  +
                                                                                                  YASK_DEPRECATED typedef yc_var_point_node yc_grid_point_node
                                                                                                  [Deprecated] Use yc_var_point_node.
                                                                                                  Definition: yask_compiler_api.hpp:1052
                                                                                                  virtual void set_name(std::string name)=0
                                                                                                  Set the name of the solution.
                                                                                                  -
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:66
                                                                                                  +
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:94
                                                                                                  virtual bool is_target_set()=0
                                                                                                  Determine whether target has been set.
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_point_node_ptr new_grid_point(const std::initializer_list< yc_number_node_ptr > &index_exprs)
                                                                                                  [Deprecated] Use new_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:820
                                                                                                  virtual void clear_folding()=0
                                                                                                  Remove all vector-folding settings.
                                                                                                  -
                                                                                                  yc_var_point_node_ptr new_grid_point(const std::vector< yc_number_node_ptr > &index_exprs)
                                                                                                  [Deprecated] Use new_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:828
                                                                                                  virtual yc_var_ptr new_scratch_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0
                                                                                                  Create an n-dimensional scratch variable in the solution.
                                                                                                  std::shared_ptr< yc_bool_node > yc_bool_node_ptr
                                                                                                  Shared pointer to yc_bool_node.
                                                                                                  Definition: yask_compiler_api.hpp:63
                                                                                                  -
                                                                                                  std::function< void(yc_solution &soln, yask_output_ptr output)> output_hook_t
                                                                                                  [Advanced] Callback type for call_before_output().
                                                                                                  Definition: yask_compiler_api.hpp:443
                                                                                                  +
                                                                                                  std::function< void(yc_solution &soln, yask_output_ptr output)> output_hook_t
                                                                                                  [Advanced] Callback type for call_before_output().
                                                                                                  Definition: yask_compiler_api.hpp:439
                                                                                                  std::shared_ptr< yc_equation_node > yc_equation_node_ptr
                                                                                                  Shared pointer to yc_equation_node.
                                                                                                  Definition: yask_compiler_api.hpp:75
                                                                                                  A reference to a point in a var.
                                                                                                  Definition: yc_node_api.hpp:283
                                                                                                  virtual int get_num_equations() const =0
                                                                                                  Get the number of equations in the solution.
                                                                                                  @@ -131,39 +131,44 @@
                                                                                                  virtual std::string get_description() const =0
                                                                                                  Get the description of the solution.
                                                                                                  Base class for all AST nodes.
                                                                                                  Definition: yc_node_api.hpp:125
                                                                                                  virtual void set_prefetch_dist(int level, int distance)=0
                                                                                                  Set the prefetch distance for the given cache.
                                                                                                  -
                                                                                                  virtual yc_var_point_node_ptr operator()(const std::vector< yc_number_node_ptr > &index_exprs)
                                                                                                  Create an expression for a point in a var.
                                                                                                  Definition: yask_compiler_api.hpp:973
                                                                                                  +
                                                                                                  virtual yc_var_point_node_ptr operator()(const std::vector< yc_number_node_ptr > &index_exprs)
                                                                                                  Create an expression for a point in a var.
                                                                                                  Definition: yask_compiler_api.hpp:966
                                                                                                  virtual std::string get_name() const =0
                                                                                                  Get the name of the solution.
                                                                                                  virtual int get_element_bytes() const =0
                                                                                                  Get current floating-point precision setting.
                                                                                                  -
                                                                                                  virtual yc_var_point_node_ptr operator[](const yc_number_any_arg i1)
                                                                                                  Create an expression for a point in a one-dim (array) var.
                                                                                                  Definition: yask_compiler_api.hpp:1042
                                                                                                  +
                                                                                                  virtual yc_var_point_node_ptr operator[](const yc_number_any_arg i1)
                                                                                                  Create an expression for a point in a one-dim (array) var.
                                                                                                  Definition: yask_compiler_api.hpp:1035
                                                                                                  virtual void set_cluster_mult(const yc_index_node_ptr dim, int mult)=0
                                                                                                  Set the cluster multiplier (unroll factor) in given dimension.
                                                                                                  +
                                                                                                  virtual YASK_DEPRECATED yc_var_point_node_ptr new_relative_var_point(const std::vector< int > &dim_offsets)=0
                                                                                                  [Deprecated] Use new_var_point().
                                                                                                  Base class for all boolean AST nodes.
                                                                                                  Definition: yc_node_api.hpp:256
                                                                                                  std::shared_ptr< yc_number_node > yc_number_node_ptr
                                                                                                  Shared pointer to yc_number_node.
                                                                                                  Definition: yask_compiler_api.hpp:67
                                                                                                  virtual bool is_folding_set()=0
                                                                                                  Determine whether any folding has been set.
                                                                                                  +
                                                                                                  #define YASK_DEPRECATED
                                                                                                  Deprecated attribute.
                                                                                                  Definition: yask_common_api.hpp:55
                                                                                                  +
                                                                                                  YASK_DEPRECATED typedef yc_var yc_grid
                                                                                                  [Deprecated] Use yc_var.
                                                                                                  Definition: yask_compiler_api.hpp:1046
                                                                                                  virtual void call_after_new_solution(const std::string &code)=0
                                                                                                  [Advanced] Add block of custom C++ code to the kernel solution.
                                                                                                  virtual int get_num_dims() const =0
                                                                                                  Get the number of dimensions.
                                                                                                  -
                                                                                                  yc_var_point_node_ptr new_relative_grid_point(const std::vector< int > &dim_offsets)
                                                                                                  [Deprecated] Use new_relative_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:838
                                                                                                  virtual void set_fold_len(const yc_index_node_ptr dim, int len)=0
                                                                                                  Set the vectorization length in given dimension.
                                                                                                  -
                                                                                                  yc_var_ptr new_scratch_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_scratch_var().
                                                                                                  Definition: yask_compiler_api.hpp:676
                                                                                                  +
                                                                                                  YASK_DEPRECATED typedef yc_var_ptr yc_grid_ptr
                                                                                                  [Deprecated] Use yc_var_ptr.
                                                                                                  Definition: yask_compiler_api.hpp:1049
                                                                                                  +
                                                                                                  std::vector< std::string > string_vec
                                                                                                  Vector of strings.
                                                                                                  Definition: yask_common_api.hpp:90
                                                                                                  std::shared_ptr< yc_solution > yc_solution_ptr
                                                                                                  Shared pointer to yc_solution.
                                                                                                  Definition: yask_compiler_api.hpp:49
                                                                                                  virtual bool is_dynamic_step_alloc() const =0
                                                                                                  [Advanced] Get whether the allocation of the step dimension of this var can be modified at run-time.
                                                                                                  -
                                                                                                  virtual yc_var_point_node_ptr new_relative_var_point(const std::vector< int > &dim_offsets)=0
                                                                                                  Create a reference to a point in this var using relative offsets.
                                                                                                  virtual int get_prefetch_dist(int level)=0
                                                                                                  Get the current prefetch distance for the given cache.
                                                                                                  virtual yc_var_ptr new_var(const std::string &name, const std::vector< yc_index_node_ptr > &dims)=0
                                                                                                  Create an n-dimensional variable in the solution.
                                                                                                  -
                                                                                                  Arguments that may be YASK numeric pointer types.
                                                                                                  Definition: yc_node_api.hpp:494
                                                                                                  +
                                                                                                  Arguments that may be YASK numeric pointer types.
                                                                                                  Definition: yc_node_api.hpp:495
                                                                                                  +
                                                                                                  virtual string_vec get_dim_names() const =0
                                                                                                  Get all the dimensions in this var.
                                                                                                  A dimension or an index in that dimension.
                                                                                                  Definition: yc_node_api.hpp:270
                                                                                                  virtual void set_element_bytes(int nbytes)=0
                                                                                                  Set floating-point precision.
                                                                                                  virtual const std::string & get_name() const =0
                                                                                                  Get the name of the var.
                                                                                                  -
                                                                                                  A compile-time data variable.
                                                                                                  Definition: yask_compiler_api.hpp:711
                                                                                                  -
                                                                                                  yc_var_proxy(yc_var_ptr &var)
                                                                                                  Contructor taking an existing var.
                                                                                                  Definition: yask_compiler_api.hpp:950
                                                                                                  +
                                                                                                  A compile-time data variable.
                                                                                                  Definition: yask_compiler_api.hpp:715
                                                                                                  +
                                                                                                  yc_var_proxy(yc_var_ptr &var)
                                                                                                  Contructor taking an existing var.
                                                                                                  Definition: yask_compiler_api.hpp:943
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_ptr new_scratch_grid(const std::string &name, const std::vector< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_scratch_var().
                                                                                                  Definition: yask_compiler_api.hpp:668
                                                                                                  virtual void set_debug_output(yask_output_ptr debug)=0
                                                                                                  Set object to receive debug output.
                                                                                                  -
                                                                                                  yc_var_ptr new_grid(const std::string &name, const std::initializer_list< yc_index_node_ptr > &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yask_compiler_api.hpp:660
                                                                                                  -
                                                                                                  void format(const std::string &format_type, yask_output_ptr output)
                                                                                                  [Deprecated] Use set_target() and output_solution().
                                                                                                  Definition: yask_compiler_api.hpp:644
                                                                                                  -
                                                                                                  yc_var_proxy(const std::string &name, yc_solution_ptr soln, const std::vector< yc_index_node_ptr > &dims, bool is_scratch=false)
                                                                                                  Contructor taking a vector of index vars.
                                                                                                  Definition: yask_compiler_api.hpp:893
                                                                                                  -
                                                                                                  std::int64_t idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:61
                                                                                                  +
                                                                                                  yc_var_proxy(const std::string &name, yc_solution_ptr soln, const std::vector< yc_index_node_ptr > &dims, bool is_scratch=false)
                                                                                                  Contructor taking a vector of index vars.
                                                                                                  Definition: yask_compiler_api.hpp:886
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_point_node_ptr new_relative_grid_point(const std::vector< int > &dim_offsets)
                                                                                                  [Deprecated] Use new_relative_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:828
                                                                                                  virtual void add_flow_dependency(yc_equation_node_ptr from, yc_equation_node_ptr to)=0
                                                                                                  [Advanced] Add a dependency between two equations.
                                                                                                  -
                                                                                                  virtual yc_var_point_node_ptr operator()(const yc_number_any_arg i1=nullptr, const yc_number_any_arg i2=nullptr, const yc_number_any_arg i3=nullptr, const yc_number_any_arg i4=nullptr, const yc_number_any_arg i5=nullptr, const yc_number_any_arg i6=nullptr)
                                                                                                  Create an expression for a point in a 1-6 dim var.
                                                                                                  Definition: yask_compiler_api.hpp:1000
                                                                                                  +
                                                                                                  virtual yc_var_point_node_ptr operator()(const yc_number_any_arg i1=nullptr, const yc_number_any_arg i2=nullptr, const yc_number_any_arg i3=nullptr, const yc_number_any_arg i4=nullptr, const yc_number_any_arg i5=nullptr, const yc_number_any_arg i6=nullptr)
                                                                                                  Create an expression for a point in a 1-6 dim var.
                                                                                                  Definition: yask_compiler_api.hpp:993
                                                                                                  +
                                                                                                  YASK_DEPRECATED std::vector< yc_var_ptr > get_grids()
                                                                                                  [Deprecated] Use get_vars().
                                                                                                  Definition: yask_compiler_api.hpp:693
                                                                                                  +
                                                                                                  YASK_DEPRECATED typedef yc_var_point_node_ptr yc_grid_point_node_ptr
                                                                                                  [Deprecated] Use yc_var_point_node_ptr.
                                                                                                  Definition: yask_compiler_api.hpp:1055
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_point_node_ptr new_relative_grid_point(const std::initializer_list< int > &dim_offsets)
                                                                                                  [Deprecated] Use new_relative_var_point().
                                                                                                  Definition: yask_compiler_api.hpp:835
                                                                                                  @@ -108,14 +109,17 @@ typedef std::shared_ptr< yk_stats > yask::yk_stats_ptr  Shared pointer to yk_stats.
                                                                                                    - -typedef yk_var yask::yk_grid[Deprecated] Use yk_var.
                                                                                                  -  - -typedef yk_var_ptr yask::yk_grid_ptr[Deprecated] Use yk_var_ptr.
                                                                                                  -  + + + + + + + +

                                                                                                  +Variables

                                                                                                  +YASK_DEPRECATED typedef yk_var yask::yk_grid
                                                                                                   [Deprecated] Use yk_var.
                                                                                                   
                                                                                                  +YASK_DEPRECATED typedef yk_var_ptr yask::yk_grid_ptr
                                                                                                   [Deprecated] Use yk_var_ptr.
                                                                                                   
                                                                                                  diff --git a/docs/api/html/yask__kernel__api_8hpp_source.html b/docs/api/html/yask__kernel__api_8hpp_source.html index 01a5afd1..5a6e4d06 100644 --- a/docs/api/html/yask__kernel__api_8hpp_source.html +++ b/docs/api/html/yask__kernel__api_8hpp_source.html @@ -70,27 +70,30 @@
                                                                                                  yask_kernel_api.hpp
                                                                                                  -Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2021, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_common_api.hpp"
                                                                                                  35 #include <vector>
                                                                                                  36 #include <cinttypes>
                                                                                                  37 
                                                                                                  38 #ifndef MPI_VERSION
                                                                                                  39 typedef int MPI_Comm;
                                                                                                  40 #endif
                                                                                                  41 
                                                                                                  42 namespace yask {
                                                                                                  43 
                                                                                                  50  // Forward declarations of classes and pointers.
                                                                                                  51 
                                                                                                  52  class yk_env;
                                                                                                  54  typedef std::shared_ptr<yk_env> yk_env_ptr;
                                                                                                  55 
                                                                                                  56  class yk_solution;
                                                                                                  58  typedef std::shared_ptr<yk_solution> yk_solution_ptr;
                                                                                                  59 
                                                                                                  60  class yk_var;
                                                                                                  62  typedef std::shared_ptr<yk_var> yk_var_ptr;
                                                                                                  63 
                                                                                                  64  class yk_stats;
                                                                                                  66  typedef std::shared_ptr<yk_stats> yk_stats_ptr;
                                                                                                  67 
                                                                                                  69 } // namespace yask.
                                                                                                  70 
                                                                                                  71 #include "aux/yk_solution_api.hpp"
                                                                                                  72 #include "aux/yk_var_api.hpp"
                                                                                                  73 
                                                                                                  74 namespace yask {
                                                                                                  75 
                                                                                                  81  class yk_factory {
                                                                                                  83  public:
                                                                                                  84  yk_factory();
                                                                                                  85  virtual ~yk_factory() {}
                                                                                                  86 
                                                                                                  88 
                                                                                                  91  virtual std::string
                                                                                                  93 
                                                                                                  95 
                                                                                                  123  virtual yk_env_ptr
                                                                                                  124  new_env() const;
                                                                                                  125 
                                                                                                  127 
                                                                                                  136  virtual yk_env_ptr
                                                                                                  137  new_env(MPI_Comm comm) const;
                                                                                                  138 
                                                                                                  140 
                                                                                                  145  virtual yk_solution_ptr
                                                                                                  146  new_solution(yk_env_ptr env ) const;
                                                                                                  147 
                                                                                                  149 
                                                                                                  156  virtual yk_solution_ptr
                                                                                                  158  const yk_solution_ptr source ) const;
                                                                                                  161  }; // yk_factory.
                                                                                                  162 
                                                                                                  164 
                                                                                                  167  class yk_env {
                                                                                                  168  public:
                                                                                                  169  virtual ~yk_env() {}
                                                                                                  170 
                                                                                                  172  virtual void
                                                                                                  176 
                                                                                                  178 
                                                                                                  182  virtual yask_output_ptr
                                                                                                  183  get_debug_output() const =0;
                                                                                                  184 
                                                                                                  186 
                                                                                                  189  virtual void
                                                                                                  190  set_trace_enabled(bool enable) =0;
                                                                                                  191 
                                                                                                  193 
                                                                                                  196  virtual int get_num_ranks() const =0;
                                                                                                  197 
                                                                                                  199 
                                                                                                  202  virtual int get_rank_index() const =0;
                                                                                                  203 
                                                                                                  205 
                                                                                                  209  virtual void
                                                                                                  210  global_barrier() const =0;
                                                                                                  211 
                                                                                                  212  }; // yk_env.
                                                                                                  213 
                                                                                                  215  typedef yk_var yk_grid;
                                                                                                  218 
                                                                                                  221 } // namespace yask.
                                                                                                  Statistics from calls to run_solution().
                                                                                                  Definition: yk_solution_api.hpp:1040
                                                                                                  +Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2022, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_common_api.hpp"
                                                                                                  35 #include <vector>
                                                                                                  36 #include <cinttypes>
                                                                                                  37 
                                                                                                  38 #ifndef MPI_VERSION
                                                                                                  39 typedef int MPI_Comm;
                                                                                                  40 #endif
                                                                                                  41 
                                                                                                  42 namespace yask {
                                                                                                  43 
                                                                                                  50  // Forward declarations of classes and pointers.
                                                                                                  51 
                                                                                                  52  class yk_env;
                                                                                                  54  typedef std::shared_ptr<yk_env> yk_env_ptr;
                                                                                                  55 
                                                                                                  56  class yk_solution;
                                                                                                  58  typedef std::shared_ptr<yk_solution> yk_solution_ptr;
                                                                                                  59 
                                                                                                  60  class yk_var;
                                                                                                  62  typedef std::shared_ptr<yk_var> yk_var_ptr;
                                                                                                  63 
                                                                                                  64  class yk_stats;
                                                                                                  66  typedef std::shared_ptr<yk_stats> yk_stats_ptr;
                                                                                                  67 
                                                                                                  69 } // namespace yask.
                                                                                                  70 
                                                                                                  71 #include "aux/yk_solution_api.hpp"
                                                                                                  72 #include "aux/yk_var_api.hpp"
                                                                                                  73 
                                                                                                  74 namespace yask {
                                                                                                  75 
                                                                                                  81  class yk_factory {
                                                                                                  83  public:
                                                                                                  84  yk_factory();
                                                                                                  85  virtual ~yk_factory() {}
                                                                                                  86 
                                                                                                  88 
                                                                                                  91  virtual std::string
                                                                                                  93 
                                                                                                  95 
                                                                                                  123  virtual yk_env_ptr
                                                                                                  124  new_env() const;
                                                                                                  125 
                                                                                                  127 
                                                                                                  136  virtual yk_env_ptr
                                                                                                  137  new_env(MPI_Comm comm) const;
                                                                                                  138 
                                                                                                  140 
                                                                                                  145  virtual yk_solution_ptr
                                                                                                  146  new_solution(yk_env_ptr env ) const;
                                                                                                  147 
                                                                                                  149 
                                                                                                  156  virtual yk_solution_ptr
                                                                                                  158  const yk_solution_ptr source ) const;
                                                                                                  161  }; // yk_factory.
                                                                                                  162 
                                                                                                  164 
                                                                                                  167  class yk_env {
                                                                                                  168  public:
                                                                                                  169  virtual ~yk_env() {}
                                                                                                  170 
                                                                                                  172 
                                                                                                  177  static void
                                                                                                  181 
                                                                                                  183 
                                                                                                  186  static void
                                                                                                  188 
                                                                                                  190 
                                                                                                  197  static yask_output_ptr
                                                                                                  199 
                                                                                                  201 
                                                                                                  208  static void
                                                                                                  209  set_trace_enabled(bool enable);
                                                                                                  210 
                                                                                                  212 
                                                                                                  218  static bool
                                                                                                  220 
                                                                                                  222 
                                                                                                  225  virtual int get_num_ranks() const =0;
                                                                                                  226 
                                                                                                  228 
                                                                                                  231  virtual int get_rank_index() const =0;
                                                                                                  232 
                                                                                                  234 
                                                                                                  238  virtual void
                                                                                                  239  global_barrier() const =0;
                                                                                                  240 
                                                                                                  241  }; // yk_env.
                                                                                                  242 
                                                                                                  245  typedef yk_var yk_grid;
                                                                                                  249 
                                                                                                  252 } // namespace yask.
                                                                                                  static void disable_debug_output()
                                                                                                  Disable the debug output.
                                                                                                  +
                                                                                                  Statistics from calls to run_solution().
                                                                                                  Definition: yk_solution_api.hpp:1219
                                                                                                  std::shared_ptr< yk_stats > yk_stats_ptr
                                                                                                  Shared pointer to yk_stats.
                                                                                                  Definition: yask_kernel_api.hpp:64
                                                                                                  Kernel environment.
                                                                                                  Definition: yask_kernel_api.hpp:167
                                                                                                  virtual yk_env_ptr new_env() const
                                                                                                  Create an object to hold environment information.
                                                                                                  +
                                                                                                  static bool is_trace_enabled()
                                                                                                  Get whether tracing is enabled.
                                                                                                  virtual int get_num_ranks() const =0
                                                                                                  Get number of MPI ranks.
                                                                                                  -
                                                                                                  yk_var yk_grid
                                                                                                  [Deprecated] Use yk_var.
                                                                                                  Definition: yask_kernel_api.hpp:215
                                                                                                  +
                                                                                                  static yask_output_ptr get_debug_output()
                                                                                                  Get object to receive debug output.
                                                                                                  std::shared_ptr< yk_env > yk_env_ptr
                                                                                                  Shared pointer to yk_env.
                                                                                                  Definition: yask_kernel_api.hpp:52
                                                                                                  -
                                                                                                  yk_var_ptr yk_grid_ptr
                                                                                                  [Deprecated] Use yk_var_ptr.
                                                                                                  Definition: yask_kernel_api.hpp:217
                                                                                                  +
                                                                                                  YASK_DEPRECATED typedef yk_var_ptr yk_grid_ptr
                                                                                                  [Deprecated] Use yk_var_ptr.
                                                                                                  Definition: yask_kernel_api.hpp:248
                                                                                                  virtual std::string get_version_string()
                                                                                                  Version information.
                                                                                                  -
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:66
                                                                                                  +
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:94
                                                                                                  A run-time YASK data container.
                                                                                                  Definition: yk_var_api.hpp:185
                                                                                                  virtual yk_solution_ptr new_solution(yk_env_ptr env) const
                                                                                                  Create a stencil solution.
                                                                                                  -
                                                                                                  virtual void set_debug_output(yask_output_ptr debug)=0
                                                                                                  Set object to receive debug output.
                                                                                                  -
                                                                                                  virtual void set_trace_enabled(bool enable)=0
                                                                                                  Enable or disable additional debug tracing.
                                                                                                  -
                                                                                                  virtual yask_output_ptr get_debug_output() const =0
                                                                                                  Get object to receive debug output.
                                                                                                  +
                                                                                                  #define YASK_DEPRECATED
                                                                                                  Deprecated attribute.
                                                                                                  Definition: yask_common_api.hpp:55
                                                                                                  virtual void global_barrier() const =0
                                                                                                  Wait until all ranks have reached this element.
                                                                                                  +
                                                                                                  static void set_trace_enabled(bool enable)
                                                                                                  Enable or disable additional debug tracing.
                                                                                                  virtual int get_rank_index() const =0
                                                                                                  Get MPI rank index.
                                                                                                  std::shared_ptr< yk_solution > yk_solution_ptr
                                                                                                  Shared pointer to yk_solution.
                                                                                                  Definition: yask_kernel_api.hpp:56
                                                                                                  -
                                                                                                  Stencil solution as defined by the generated code from the YASK stencil compiler.
                                                                                                  Definition: yk_solution_api.hpp:74
                                                                                                  +
                                                                                                  static void set_debug_output(yask_output_ptr debug)
                                                                                                  Set object to receive debug output.
                                                                                                  +
                                                                                                  Stencil solution as defined by the generated code from the YASK stencil compiler.
                                                                                                  Definition: yk_solution_api.hpp:82
                                                                                                  +
                                                                                                  YASK_DEPRECATED typedef yk_var yk_grid
                                                                                                  [Deprecated] Use yk_var.
                                                                                                  Definition: yask_kernel_api.hpp:245
                                                                                                  std::shared_ptr< yk_var > yk_var_ptr
                                                                                                  Shared pointer to yk_var.
                                                                                                  Definition: yask_kernel_api.hpp:60
                                                                                                  diff --git a/docs/api/html/yc__node__api_8hpp_source.html b/docs/api/html/yc__node__api_8hpp_source.html index 988c9908..c6620f61 100644 --- a/docs/api/html/yc__node__api_8hpp_source.html +++ b/docs/api/html/yc__node__api_8hpp_source.html @@ -70,32 +70,31 @@
                                                                                                  yc_node_api.hpp
                                                                                                  -Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2021, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 namespace yask {
                                                                                                  35 
                                                                                                  41  // More node types not exposed except via RTTI.
                                                                                                  42 
                                                                                                  45  typedef std::shared_ptr<yc_const_number_node> yc_const_number_node_ptr;
                                                                                                  46 
                                                                                                  49  typedef std::shared_ptr<yc_negate_node> yc_negate_node_ptr;
                                                                                                  50 
                                                                                                  53  typedef std::shared_ptr<yc_commutative_number_node> yc_commutative_number_node_ptr;
                                                                                                  54 
                                                                                                  57  typedef std::shared_ptr<yc_binary_number_node> yc_binary_number_node_ptr;
                                                                                                  58 
                                                                                                  61  typedef std::shared_ptr<yc_binary_bool_node> yc_binary_bool_node_ptr;
                                                                                                  62 
                                                                                                  65  typedef std::shared_ptr<yc_binary_comparison_node> yc_binary_comparison_node_ptr;
                                                                                                  66 
                                                                                                  67  class yc_add_node;
                                                                                                  69  typedef std::shared_ptr<yc_add_node> yc_add_node_ptr;
                                                                                                  70 
                                                                                                  73  typedef std::shared_ptr<yc_multiply_node> yc_multiply_node_ptr;
                                                                                                  74 
                                                                                                  77  typedef std::shared_ptr<yc_subtract_node> yc_subtract_node_ptr;
                                                                                                  78 
                                                                                                  81  typedef std::shared_ptr<yc_divide_node> yc_divide_node_ptr;
                                                                                                  82 
                                                                                                  83  class yc_mod_node;
                                                                                                  85  typedef std::shared_ptr<yc_mod_node> yc_mod_node_ptr;
                                                                                                  86 
                                                                                                  87  class yc_not_node;
                                                                                                  89  typedef std::shared_ptr<yc_not_node> yc_not_node_ptr;
                                                                                                  90 
                                                                                                  93  typedef std::shared_ptr<yc_equals_node> yc_equals_node_ptr;
                                                                                                  94 
                                                                                                  97  typedef std::shared_ptr<yc_not_equals_node> yc_not_equals_node_ptr;
                                                                                                  98 
                                                                                                  101  typedef std::shared_ptr<yc_less_than_node> yc_less_than_node_ptr;
                                                                                                  102 
                                                                                                  105  typedef std::shared_ptr<yc_greater_than_node> yc_greater_than_node_ptr;
                                                                                                  106 
                                                                                                  109  typedef std::shared_ptr<yc_not_less_than_node> yc_not_less_than_node_ptr;
                                                                                                  110 
                                                                                                  113  typedef std::shared_ptr<yc_not_greater_than_node> yc_not_greater_than_node_ptr;
                                                                                                  114 
                                                                                                  115  class yc_and_node;
                                                                                                  117  typedef std::shared_ptr<yc_and_node> yc_and_node_ptr;
                                                                                                  118 
                                                                                                  119  class yc_or_node;
                                                                                                  121  typedef std::shared_ptr<yc_or_node> yc_or_node_ptr;
                                                                                                  122 
                                                                                                  124 
                                                                                                  125  class yc_expr_node { // Abstract base interface: apart from the destructor, every member declared here is pure virtual.
                                                                                                  126  public:
                                                                                                  127  virtual ~yc_expr_node() {} // Virtual destructor so derived nodes can be destroyed through a base pointer.
                                                                                                  128 
                                                                                                  130  // Pure virtual: returns a std::string for this node. NOTE(review): the name suggests a simple text rendering of the expression -- confirm against the implementation.
                                                                                                  134  virtual std::string format_simple() const =0;
                                                                                                  135 
                                                                                                  137  // Pure virtual: returns an int. NOTE(review): presumably the number of nodes in this expression tree -- confirm.
                                                                                                  141  virtual int get_num_nodes() const =0;
                                                                                                  142  };
                                                                                                  143 
                                                                                                  145 
                                                                                                  149  class yc_equation_node : public virtual yc_expr_node { // Equation-node interface; virtually derives from yc_expr_node. All members are pure virtual.
                                                                                                  150  public:
                                                                                                  151 
                                                                                                  153  // Pure virtual: returns the left-hand side as a yc_var_point_node_ptr.
                                                                                                  154  virtual yc_var_point_node_ptr get_lhs() =0;
                                                                                                  155 
                                                                                                  157  // Pure virtual: returns the right-hand side as a yc_number_node_ptr.
                                                                                                  158  virtual yc_number_node_ptr get_rhs() =0;
                                                                                                  159 
                                                                                                  161  // Pure virtual: returns a yc_bool_node_ptr. NOTE(review): presumably the condition installed via set_cond(), possibly null when none is set -- confirm.
                                                                                                  163  virtual yc_bool_node_ptr get_cond() =0;
                                                                                                  164 
                                                                                                  166  // Pure virtual: accepts a yc_bool_node_ptr named sub_domain_cond. NOTE(review): presumably limits where in the domain the equation applies -- confirm.
                                                                                                  207  virtual void set_cond(yc_bool_node_ptr sub_domain_cond ) =0;
                                                                                                  211 
                                                                                                  213  // Pure virtual: accepts a yc_bool_node_ptr named step_cond. NOTE(review): presumably limits the steps at which the equation applies -- confirm.
                                                                                                  236  virtual void set_step_cond(yc_bool_node_ptr step_cond ) =0;
                                                                                                  240  // Pure virtual: returns a yc_equation_node_ptr. NOTE(review): presumably a copy of this node's AST; copy depth is not visible here -- confirm.
                                                                                                  242  virtual yc_equation_node_ptr clone_ast() const =0;
                                                                                                  243  };
                                                                                                  244 
                                                                                                  246 
                                                                                                  247  class yc_number_node : public virtual yc_expr_node { // Interface for numeric-valued nodes; virtually derives from yc_expr_node.
                                                                                                  248  public:
                                                                                                  249  // Pure virtual: returns a yc_number_node_ptr. NOTE(review): presumably a copy of this node's AST -- confirm.
                                                                                                  251  virtual yc_number_node_ptr clone_ast() const =0;
                                                                                                  252  };
                                                                                                  253 
                                                                                                  255 
                                                                                                  256  class yc_bool_node : public virtual yc_expr_node { // Interface for boolean-valued nodes; virtually derives from yc_expr_node.
                                                                                                  257  public:
                                                                                                  258  // Pure virtual: returns a yc_bool_node_ptr. NOTE(review): presumably a copy of this node's AST -- confirm.
                                                                                                  260  virtual yc_bool_node_ptr clone_ast() const =0;
                                                                                                  261  };
                                                                                                  262 
                                                                                                  264 
                                                                                                  270  class yc_index_node : public virtual yc_number_node { // Index-node interface; virtually derives from yc_number_node.
                                                                                                  271  public:
                                                                                                  272 
                                                                                                  274  // Pure virtual: returns a const reference to a std::string holding the name, so the caller does not receive a copy.
                                                                                                  275  virtual const std::string&
                                                                                                  276  get_name() const =0;
                                                                                                  277  };
                                                                                                  278 
                                                                                                  280 
                                                                                                  283  class yc_var_point_node : public virtual yc_number_node { // Var-point node interface; virtually derives from yc_number_node.
                                                                                                  284  public:
                                                                                                  285 
                                                                                                  287  // Pure virtual: returns a yc_var_ptr. NOTE(review): presumably the var this point refers to -- confirm.
                                                                                                  288  virtual yc_var_ptr
                                                                                                  289  get_var() =0;
                                                                                                  290 
                                                                                                  292  inline yc_var_ptr // NOTE(review): listing line 293 (this function's name and parameter list) is absent from this extract; the visible body only forwards to get_var().
                                                                                                  294  return get_var();
                                                                                                  295  }
                                                                                                  296  };
                                                                                                  297 
                                                                                                  299 
                                                                                                  303  class yc_const_number_node : public virtual yc_number_node { // Constant-number node interface; virtually derives from yc_number_node.
                                                                                                  304  public:
                                                                                                  305 
                                                                                                  307  // Pure virtual setter: accepts the new value as a double and returns nothing.
                                                                                                  309  virtual void
                                                                                                  310  set_value(double val ) =0;
                                                                                                  311 
                                                                                                  313  // Pure virtual getter: returns the value as a double without modifying the node (const member).
                                                                                                  314  virtual double
                                                                                                  315  get_value() const =0;
                                                                                                  316  };
                                                                                                  317 
                                                                                                  319 
                                                                                                  322  class yc_negate_node : public virtual yc_number_node { // Negation node interface; virtually derives from yc_number_node.
                                                                                                  323  public:
                                                                                                  324 
                                                                                                  326  // Pure virtual: returns a yc_number_node_ptr. NOTE(review): presumably the operand being negated -- confirm.
                                                                                                  329  virtual yc_number_node_ptr
                                                                                                  330  get_rhs() =0;
                                                                                                  331  };
                                                                                                  332 
                                                                                                  334 
                                                                                                  337  public:
                                                                                                  338 
                                                                                                  340 
                                                                                                  345  virtual int
                                                                                                  346  get_num_operands() =0;
                                                                                                  347 
                                                                                                  349 
                                                                                                  350  virtual std::vector<yc_number_node_ptr>
                                                                                                  351  get_operands() =0;
                                                                                                  352 
                                                                                                  354  virtual void
                                                                                                  356  };
                                                                                                  357 
                                                                                                  359 
                                                                                                  360  class yc_add_node : public virtual yc_commutative_number_node { };
                                                                                                  361 
                                                                                                  363 
                                                                                                  364  class yc_multiply_node : public virtual yc_commutative_number_node { }; // Empty body: adds no members beyond those of yc_commutative_number_node.
                                                                                                  365 
                                                                                                  367 
                                                                                                  369  class yc_binary_number_node : public virtual yc_number_node { // Interface for two-operand numeric nodes; virtually derives from yc_number_node.
                                                                                                  370  public:
                                                                                                  371  // Pure virtual: returns the left-hand operand as a yc_number_node_ptr.
                                                                                                  373  virtual yc_number_node_ptr
                                                                                                  374  get_lhs() =0;
                                                                                                  375  // Pure virtual: returns the right-hand operand as a yc_number_node_ptr.
                                                                                                  377  virtual yc_number_node_ptr
                                                                                                  378  get_rhs() =0;
                                                                                                  379  };
                                                                                                  380 
                                                                                                  382 
                                                                                                  383  class yc_subtract_node : public virtual yc_binary_number_node { };
                                                                                                  384 
                                                                                                  386 
                                                                                                  387  class yc_divide_node : public virtual yc_binary_number_node { };
                                                                                                  388 
                                                                                                  390 
                                                                                                  391  class yc_mod_node : public virtual yc_binary_number_node { };
                                                                                                  392 
                                                                                                  394 
                                                                                                  397  class yc_not_node : public virtual yc_bool_node { // Boolean-negation node interface; virtually derives from yc_bool_node.
                                                                                                  398  public:
                                                                                                  399 
                                                                                                  401  // Pure virtual: returns a yc_bool_node_ptr. NOTE(review): presumably the operand being inverted -- confirm.
                                                                                                  402  virtual yc_bool_node_ptr
                                                                                                  403  get_rhs() =0;
                                                                                                  404  };
                                                                                                  405 
                                                                                                  407  class yc_binary_bool_node : public virtual yc_bool_node { // Interface for boolean nodes with two boolean operands; virtually derives from yc_bool_node.
                                                                                                  408  public:
                                                                                                  409  // Pure virtual: returns the left-hand operand as a yc_bool_node_ptr.
                                                                                                  411  virtual yc_bool_node_ptr
                                                                                                  412  get_lhs() =0;
                                                                                                  413  // Pure virtual: returns the right-hand operand as a yc_bool_node_ptr.
                                                                                                  415  virtual yc_bool_node_ptr
                                                                                                  416  get_rhs() =0;
                                                                                                  417  };
                                                                                                  418 
                                                                                                  420 
                                                                                                  423  class yc_and_node : public virtual yc_binary_bool_node { };
                                                                                                  424 
                                                                                                  426 
                                                                                                  429  class yc_or_node : public virtual yc_binary_bool_node { };
                                                                                                  430 
                                                                                                  432  class yc_binary_comparison_node : public virtual yc_bool_node { // Interface for boolean nodes whose two operands are numeric; virtually derives from yc_bool_node.
                                                                                                  433  public:
                                                                                                  434 
                                                                                                  436  // Pure virtual: returns the left-hand operand as a yc_number_node_ptr.
                                                                                                  437  virtual yc_number_node_ptr
                                                                                                  438  get_lhs() =0;
                                                                                                  439 
                                                                                                  441  // Pure virtual: returns the right-hand operand as a yc_number_node_ptr.
                                                                                                  442  virtual yc_number_node_ptr
                                                                                                  443  get_rhs() =0;
                                                                                                  444  };
                                                                                                  445 
                                                                                                  447 
                                                                                                  450  class yc_equals_node : public virtual yc_binary_comparison_node { };
                                                                                                  451 
                                                                                                  453 
                                                                                                  456  class yc_not_equals_node : public virtual yc_binary_comparison_node { }; // Empty body: adds no members beyond those of yc_binary_comparison_node.
                                                                                                  457 
                                                                                                  459 
                                                                                                  462  class yc_less_than_node : public virtual yc_binary_comparison_node { }; // Empty body: adds no members beyond those of yc_binary_comparison_node.
                                                                                                  463 
                                                                                                  465 
                                                                                                  469 
                                                                                                  471 
                                                                                                  475 
                                                                                                  477 
                                                                                                  481 
                                                                                                  482 #ifndef SWIG
                                                                                                  483 
                                                                                                  495 
                                                                                                  496  public:
                                                                                                  497 
                                                                                                  500  yc_number_node_ptr(p) { }
                                                                                                  501 
                                                                                                  504  yc_number_node_ptr(p) { }
                                                                                                  505 
                                                                                                  508  yc_number_node_ptr(p) { }
                                                                                                  509  };
                                                                                                  510 
                                                                                                  512 
                                                                                                  523 
                                                                                                  524  protected:
                                                                                                  525 
                                                                                                  527  yc_number_node_ptr _convert_const(double val) const;
                                                                                                  528 
                                                                                                  529  public:
                                                                                                  530 
                                                                                                  534 
                                                                                                  538 
                                                                                                  542 
                                                                                                  546  };
                                                                                                  547 
                                                                                                  549 
                                                                                                  561 
                                                                                                  562  protected:
                                                                                                  563 
                                                                                                  565  yc_number_node_ptr _convert_const(double val) const;
                                                                                                  566 
                                                                                                  567  public:
                                                                                                  568 
                                                                                                  571  yc_number_node_ptr(p) { }
                                                                                                  572 
                                                                                                  575  yc_number_node_ptr(p) { }
                                                                                                  576 
                                                                                                  579  yc_number_node_ptr(p) { }
                                                                                                  580 
                                                                                                  584 
                                                                                                  588 
                                                                                                  590  yc_number_any_arg(double f) :
                                                                                                  592 
                                                                                                  594  yc_number_any_arg(float f) :
                                                                                                  596 
                                                                                                  598  yc_number_any_arg(std::nullptr_t p) :
                                                                                                  599  yc_number_node_ptr(p) { }
                                                                                                  600  };
                                                                                                  601 #endif
                                                                                                  602 
                                                                                                  604 
                                                                                                  607  public:
                                                                                                  608  virtual ~yc_node_factory() {}
                                                                                                  609 
                                                                                                  611 
                                                                                                  617  virtual yc_index_node_ptr
                                                                                                  618  new_step_index(const std::string& name ) const;
                                                                                                  620 
                                                                                                  622 
                                                                                                  634  virtual yc_index_node_ptr
                                                                                                  635  new_domain_index(const std::string& name ) const;
                                                                                                  637 
                                                                                                  639 
                                                                                                  647  virtual yc_index_node_ptr
                                                                                                  648  new_misc_index(const std::string& name ) const;
                                                                                                  650 
                                                                                                  652 
                                                                                                  678  virtual yc_equation_node_ptr
                                                                                                  681  yc_number_node_ptr rhs,
                                                                                                  683  yc_bool_node_ptr sub_domain_cond = nullptr ) const;
                                                                                                  686 
                                                                                                  687 #ifndef SWIG
                                                                                                  688 
                                                                                                  694  virtual yc_number_node_ptr
                                                                                                  697  return std::move(arg);
                                                                                                  698  }
                                                                                                  699 #endif
                                                                                                  700 
                                                                                                  702 
                                                                                                  708  virtual yc_number_node_ptr
                                                                                                  709  new_const_number_node(double val ) const;
                                                                                                  711 
                                                                                                  713 
                                                                                                  719  virtual yc_number_node_ptr
                                                                                                  720  new_const_number_node(idx_t val ) const;
                                                                                                  722 
                                                                                                  724 
                                                                                                  729  virtual yc_number_node_ptr
                                                                                                  732 
                                                                                                  734 
                                                                                                  740  virtual yc_number_node_ptr
                                                                                                  742  yc_number_node_ptr rhs ) const;
                                                                                                  743 
                                                                                                  745 
                                                                                                  751  virtual yc_number_node_ptr
                                                                                                  753  yc_number_node_ptr rhs ) const;
                                                                                                  754 
                                                                                                  756 
                                                                                                  766  virtual yc_number_node_ptr
                                                                                                  768  yc_number_node_ptr rhs ) const;
                                                                                                  769 
                                                                                                  771 
                                                                                                  778  virtual yc_number_node_ptr
                                                                                                  780  yc_number_node_ptr rhs ) const;
                                                                                                  781 
                                                                                                  783 
                                                                                                  790  virtual yc_number_node_ptr
                                                                                                  792  yc_number_node_ptr rhs ) const;
                                                                                                  793 
                                                                                                  795 
                                                                                                  804  virtual yc_number_node_ptr
                                                                                                  807 
                                                                                                  809 
                                                                                                  818  virtual yc_number_node_ptr
                                                                                                  821 
                                                                                                  823 
                                                                                                  829  virtual yc_bool_node_ptr
                                                                                                  830  new_not_node(yc_bool_node_ptr rhs ) const;
                                                                                                  831 
                                                                                                  833 
                                                                                                  839  virtual yc_bool_node_ptr
                                                                                                  841  yc_bool_node_ptr rhs ) const;
                                                                                                  842 
                                                                                                  844 
                                                                                                  850  virtual yc_bool_node_ptr
                                                                                                  852  yc_bool_node_ptr rhs ) const;
                                                                                                  853 
                                                                                                  855 
                                                                                                  860  virtual yc_bool_node_ptr
                                                                                                  862  yc_number_node_ptr rhs ) const;
                                                                                                  863 
                                                                                                  865 
                                                                                                  870  virtual yc_bool_node_ptr
                                                                                                  872  yc_number_node_ptr rhs ) const;
                                                                                                  873 
                                                                                                  875 
                                                                                                  880  virtual yc_bool_node_ptr
                                                                                                  882  yc_number_node_ptr rhs ) const;
                                                                                                  883 
                                                                                                  885 
                                                                                                  890  virtual yc_bool_node_ptr
                                                                                                  892  yc_number_node_ptr rhs ) const;
                                                                                                  893 
                                                                                                  895 
                                                                                                  900  virtual yc_bool_node_ptr
                                                                                                  902  yc_number_node_ptr rhs ) const;
                                                                                                  903 
                                                                                                  905 
                                                                                                  910  virtual yc_bool_node_ptr
                                                                                                  912  yc_number_node_ptr rhs ) const;
                                                                                                  913 
                                                                                                  914  };
                                                                                                  915 
                                                                                                  917 #define UNARY_MATH_EXPR(fn_name) \
                                                                                                  918  yc_number_node_ptr fn_name(const yc_number_node_ptr rhs)
                                                                                                  919 
                                                                                                  921  UNARY_MATH_EXPR(sqrt);
                                                                                                  923  UNARY_MATH_EXPR(cbrt);
                                                                                                  925  UNARY_MATH_EXPR(fabs);
                                                                                                  927  UNARY_MATH_EXPR(erf);
                                                                                                  929  UNARY_MATH_EXPR(exp);
                                                                                                  931  UNARY_MATH_EXPR(log);
                                                                                                  933  UNARY_MATH_EXPR(sin);
                                                                                                  935  UNARY_MATH_EXPR(cos);
                                                                                                  937  UNARY_MATH_EXPR(atan);
                                                                                                  938 #undef UNARY_MATH_EXPR
                                                                                                  939 
                                                                                                  941 #define BINARY_MATH_EXPR(fn_name) \
                                                                                                  942  yc_number_node_ptr fn_name(const yc_number_node_ptr arg1, const yc_number_node_ptr arg2); \
                                                                                                  943  yc_number_node_ptr fn_name(double arg1, const yc_number_node_ptr arg2); \
                                                                                                  944  yc_number_node_ptr fn_name(const yc_number_node_ptr arg1, double arg2)
                                                                                                  945 
                                                                                                  947 
                                                                                                  951  BINARY_MATH_EXPR(pow);
                                                                                                  952 #undef BINARY_MATH_EXPR
                                                                                                  953 
                                                                                                  954 #if !defined SWIG
                                                                                                  955 
                                                                                                  956  // Non-class operators.
                                                                                                  957  // These are not defined for SWIG because
                                                                                                  958  // the Python operators are defined in the ".i" file.
                                                                                                  959  // For the binary operators, we define 3 combinations to implicitly
                                                                                                  960  // avoid the const-const combinations, which conflict with built-in
                                                                                                  961  // operators on fundamental C++ types, e.g., '5+8'.
                                                                                                  962 
                                                                                                  964  yc_number_node_ptr operator-(yc_number_ptr_arg rhs);
                                                                                                  965 
                                                                                                  967  yc_number_node_ptr operator+(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  969  yc_number_node_ptr operator+(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  971  yc_number_node_ptr operator+(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  972 
                                                                                                  974  yc_number_node_ptr operator/(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  976  yc_number_node_ptr operator/(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  978  yc_number_node_ptr operator/(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  979 
                                                                                                  981  yc_number_node_ptr operator%(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  983  yc_number_node_ptr operator%(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  985  yc_number_node_ptr operator%(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  986 
                                                                                                  988  yc_number_node_ptr operator*(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  990  yc_number_node_ptr operator*(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  992  yc_number_node_ptr operator*(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  993 
                                                                                                  995  yc_number_node_ptr operator-(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  997  yc_number_node_ptr operator-(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  999  yc_number_node_ptr operator-(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  1000 
                                                                                                  1004  void operator+=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1005 
                                                                                                  1009  void operator-=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1010 
                                                                                                  1014  void operator*=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1015 
                                                                                                  1019  void operator/=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1020 
                                                                                                  1022 
                                                                                                  1024 
                                                                                                  1026 
                                                                                                  1028 
                                                                                                  1030 
                                                                                                  1032 
                                                                                                  1034 
                                                                                                  1040 #define BOOL_OPER(oper, fn) \
                                                                                                  1041  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, const yc_number_node_ptr rhs) { \
                                                                                                  1042  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1043  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, const yc_index_node_ptr rhs) { \
                                                                                                  1044  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1045  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, const yc_var_point_node_ptr rhs) { \
                                                                                                  1046  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1047  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, const yc_number_node_ptr rhs) { \
                                                                                                  1048  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1049  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, const yc_index_node_ptr rhs) { \
                                                                                                  1050  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1051  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, const yc_var_point_node_ptr rhs) { \
                                                                                                  1052  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1053  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, const yc_number_node_ptr rhs) { \
                                                                                                  1054  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1055  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, const yc_index_node_ptr rhs) { \
                                                                                                  1056  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1057  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, const yc_var_point_node_ptr rhs) { \
                                                                                                  1058  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1059  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, double rhs) { \
                                                                                                  1060  yc_node_factory nfac; return nfac.fn(lhs, nfac.new_number_node(rhs)); } \
                                                                                                  1061  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, double rhs) { \
                                                                                                  1062  yc_node_factory nfac; return nfac.fn(lhs, nfac.new_number_node(rhs)); } \
                                                                                                  1063  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, double rhs) { \
                                                                                                  1064  yc_node_factory nfac; return nfac.fn(lhs, nfac.new_number_node(rhs)); }
                                                                                                  1065 
                                                                                                  1066  BOOL_OPER(==, new_equals_node)
                                                                                                  1067  BOOL_OPER(!=, new_not_equals_node)
                                                                                                  1068  BOOL_OPER(<, new_less_than_node)
                                                                                                  1069  BOOL_OPER(>, new_greater_than_node)
                                                                                                  1070  BOOL_OPER(<=, new_not_greater_than_node)
                                                                                                  1071  BOOL_OPER(>=, new_not_less_than_node)
                                                                                                  1072 #undef BOOL_OPER
                                                                                                  1073 
                                                                                                  1075 
                                                                                                  1083 #define EQUALS <<
                                                                                                  1084 
                                                                                                  1086  yc_equation_node_ptr operator EQUALS(yc_var_point_node_ptr gpp, const yc_number_any_arg rhs);
                                                                                                  1087 
                                                                                                  1089 
                                                                                                  1093 #define IF_DOMAIN ^=
                                                                                                  1094 
                                                                                                  1097  const yc_bool_node_ptr cond);
                                                                                                  1098 
                                                                                                  1100 
                                                                                                  1104 #define IF_STEP |=
                                                                                                  1105 
                                                                                                  1108  const yc_bool_node_ptr cond);
                                                                                                  1109 
                                                                                                  1110 #endif // !SWIG.
                                                                                                  1111 
                                                                                                  1114 } // namespace yask.
                                                                                                  virtual yc_bool_node_ptr get_rhs()=0
                                                                                                  Get the [only] operand.
                                                                                                  +Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2022, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 namespace yask {
                                                                                                  35 
                                                                                                  41  // More node types not exposed except via RTTI.
                                                                                                  42 
                                                                                                  45  typedef std::shared_ptr<yc_const_number_node> yc_const_number_node_ptr;
                                                                                                  46 
                                                                                                  // Shared-pointer aliases for the yc_*_node expression classes: each alias is
                                                                                                  // std::shared_ptr<T> and is named T_ptr. A few forward declarations are interleaved.
                                                                                                  // NOTE(review): the embedded listing numbers skip values, so the forward declarations
                                                                                                  // for most of the aliased classes are not visible in this span.
                                                                                                  49  typedef std::shared_ptr<yc_negate_node> yc_negate_node_ptr;
                                                                                                  50 
                                                                                                  53  typedef std::shared_ptr<yc_commutative_number_node> yc_commutative_number_node_ptr;
                                                                                                  54 
                                                                                                  57  typedef std::shared_ptr<yc_binary_number_node> yc_binary_number_node_ptr;
                                                                                                  58 
                                                                                                  61  typedef std::shared_ptr<yc_binary_bool_node> yc_binary_bool_node_ptr;
                                                                                                  62 
                                                                                                  65  typedef std::shared_ptr<yc_binary_comparison_node> yc_binary_comparison_node_ptr;
                                                                                                  66 
                                                                                                  67  class yc_add_node;
                                                                                                  69  typedef std::shared_ptr<yc_add_node> yc_add_node_ptr;
                                                                                                  70 
                                                                                                  73  typedef std::shared_ptr<yc_multiply_node> yc_multiply_node_ptr;
                                                                                                  74 
                                                                                                  77  typedef std::shared_ptr<yc_subtract_node> yc_subtract_node_ptr;
                                                                                                  78 
                                                                                                  81  typedef std::shared_ptr<yc_divide_node> yc_divide_node_ptr;
                                                                                                  82 
                                                                                                  83  class yc_mod_node;
                                                                                                  85  typedef std::shared_ptr<yc_mod_node> yc_mod_node_ptr;
                                                                                                  86 
                                                                                                  87  class yc_not_node;
                                                                                                  89  typedef std::shared_ptr<yc_not_node> yc_not_node_ptr;
                                                                                                  90 
                                                                                                  93  typedef std::shared_ptr<yc_equals_node> yc_equals_node_ptr;
                                                                                                  94 
                                                                                                  97  typedef std::shared_ptr<yc_not_equals_node> yc_not_equals_node_ptr;
                                                                                                  98 
                                                                                                  101  typedef std::shared_ptr<yc_less_than_node> yc_less_than_node_ptr;
                                                                                                  102 
                                                                                                  105  typedef std::shared_ptr<yc_greater_than_node> yc_greater_than_node_ptr;
                                                                                                  106 
                                                                                                  109  typedef std::shared_ptr<yc_not_less_than_node> yc_not_less_than_node_ptr;
                                                                                                  110 
                                                                                                  113  typedef std::shared_ptr<yc_not_greater_than_node> yc_not_greater_than_node_ptr;
                                                                                                  114 
                                                                                                  115  class yc_and_node;
                                                                                                  117  typedef std::shared_ptr<yc_and_node> yc_and_node_ptr;
                                                                                                  118 
                                                                                                  119  class yc_or_node;
                                                                                                  121  typedef std::shared_ptr<yc_or_node> yc_or_node_ptr;
                                                                                                  122 
                                                                                                  122 
                                                                                                  124 
                                                                                                  // Abstract root of the node hierarchy in this listing: every other node class shown
                                                                                                  // here derives from it (directly or indirectly) via virtual inheritance.
                                                                                                  125  class yc_expr_node {
                                                                                                  126  public:
                                                                                                  // Virtual destructor so derived nodes can be destroyed through a base pointer.
                                                                                                  127  virtual ~yc_expr_node() {}
                                                                                                  128 
                                                                                                  130 
                                                                                                  // Pure virtual; returns a std::string by value and does not modify the node (const).
                                                                                                  // NOTE(review): presumably a human-readable rendering of the expression -- confirm
                                                                                                  // against the original header's documentation.
                                                                                                  134  virtual std::string format_simple() const =0;
                                                                                                  135 
                                                                                                  137 
                                                                                                  // Pure virtual; returns an int and does not modify the node (const).
                                                                                                  // NOTE(review): presumably the node count of the sub-tree rooted here -- confirm.
                                                                                                  141  virtual int get_num_nodes() const =0;
                                                                                                  142  };
                                                                                                  143 
                                                                                                  145 
                                                                                                  // Abstract interface for an equation node; derives virtually from yc_expr_node.
                                                                                                  // All members are pure virtual, so concrete behaviour lives in an implementation
                                                                                                  // class that is not part of this listing.
                                                                                                  149  class yc_equation_node : public virtual yc_expr_node {
                                                                                                  150  public:
                                                                                                  151 
                                                                                                  153 
                                                                                                  // Returns the left-hand side as a yc_var_point_node_ptr.
                                                                                                  154  virtual yc_var_point_node_ptr get_lhs() =0;
                                                                                                  155 
                                                                                                  157 
                                                                                                  // Returns the right-hand side as a yc_number_node_ptr.
                                                                                                  158  virtual yc_number_node_ptr get_rhs() =0;
                                                                                                  159 
                                                                                                  161 
                                                                                                  // Returns a yc_bool_node_ptr condition.
                                                                                                  // NOTE(review): presumably the counterpart of set_cond(); whether it can be null
                                                                                                  // when no condition was set is not visible here -- confirm.
                                                                                                  163  virtual yc_bool_node_ptr get_cond() =0;
                                                                                                  164 
                                                                                                  166 
                                                                                                  // Takes a boolean expression named sub_domain_cond; returns nothing.
                                                                                                  207  virtual void set_cond(yc_bool_node_ptr sub_domain_cond ) =0;
                                                                                                  211 
                                                                                                  213 
                                                                                                  // Takes a boolean expression named step_cond; returns nothing.
                                                                                                  236  virtual void set_step_cond(yc_bool_node_ptr step_cond ) =0;
                                                                                                  240 
                                                                                                  // Const member returning a yc_equation_node_ptr.
                                                                                                  // NOTE(review): presumably a copy of this equation's AST -- depth of copy to confirm.
                                                                                                  242  virtual yc_equation_node_ptr clone_ast() const =0;
                                                                                                  243  };
                                                                                                  244 
                                                                                                  246 
                                                                                                  // Abstract base for numeric-valued expression nodes; derives virtually from yc_expr_node.
                                                                                                  247  class yc_number_node : public virtual yc_expr_node {
                                                                                                  248  public:
                                                                                                  249 
                                                                                                  // Pure virtual, const; returns a yc_number_node_ptr.
                                                                                                  // NOTE(review): presumably a copy of this node's AST -- confirm.
                                                                                                  251  virtual yc_number_node_ptr clone_ast() const =0;
                                                                                                  252  };
                                                                                                  253 
                                                                                                  255 
                                                                                                  // Abstract base for boolean-valued expression nodes; derives virtually from yc_expr_node.
                                                                                                  256  class yc_bool_node : public virtual yc_expr_node {
                                                                                                  257  public:
                                                                                                  258 
                                                                                                  // Pure virtual, const; returns a yc_bool_node_ptr.
                                                                                                  // NOTE(review): presumably a copy of this node's AST -- confirm.
                                                                                                  260  virtual yc_bool_node_ptr clone_ast() const =0;
                                                                                                  261  };
                                                                                                  262 
                                                                                                  264 
                                                                                                  // Abstract interface for an index node; a numeric node (derives virtually from
                                                                                                  // yc_number_node) that additionally exposes a name.
                                                                                                  270  class yc_index_node : public virtual yc_number_node {
                                                                                                  271  public:
                                                                                                  272 
                                                                                                  274 
                                                                                                  // Pure virtual, const; returns the name as a const reference to std::string,
                                                                                                  // so the caller does not receive a copy.
                                                                                                  275  virtual const std::string&
                                                                                                  276  get_name() const =0;
                                                                                                  277  };
                                                                                                  278 
                                                                                                  280 
                                                                                                  // Abstract interface for a var-point node; a numeric node (derives virtually from
                                                                                                  // yc_number_node) that exposes an associated yc_var_ptr.
                                                                                                  283  class yc_var_point_node : public virtual yc_number_node {
                                                                                                  284  public:
                                                                                                  285 
                                                                                                  287 
                                                                                                  // Pure virtual; returns a yc_var_ptr.
                                                                                                  288  virtual yc_var_ptr
                                                                                                  289  get_var() =0;
                                                                                                  290 
                                                                                                  // Inline non-virtual wrapper whose body simply forwards to get_var().
                                                                                                  // NOTE(review): the declarator line (listing line 294) is absent from this listing,
                                                                                                  // so the wrapper's name is not visible here -- consult the original header.
                                                                                                  293  inline yc_var_ptr
                                                                                                  295  return get_var();
                                                                                                  296  }
                                                                                                  297  };
                                                                                                  298 
                                                                                                  300 
                                                                                                  // Abstract interface for a constant numeric node; derives virtually from
                                                                                                  // yc_number_node and exposes its value as a double through a setter/getter pair.
                                                                                                  304  class yc_const_number_node : public virtual yc_number_node {
                                                                                                  305  public:
                                                                                                  306 
                                                                                                  308 
                                                                                                  // Pure virtual; takes the new value as a double and returns nothing.
                                                                                                  310  virtual void
                                                                                                  311  set_value(double val ) =0;
                                                                                                  312 
                                                                                                  314 
                                                                                                  // Pure virtual, const; returns the value as a double.
                                                                                                  315  virtual double
                                                                                                  316  get_value() const =0;
                                                                                                  317  };
                                                                                                  318 
                                                                                                  320 
                                                                                                  // Abstract interface for a negation node; a numeric node (derives virtually from
                                                                                                  // yc_number_node) with a single numeric operand.
                                                                                                  323  class yc_negate_node : public virtual yc_number_node {
                                                                                                  324  public:
                                                                                                  325 
                                                                                                  327 
                                                                                                  // Pure virtual; returns the operand as a yc_number_node_ptr.
                                                                                                  330  virtual yc_number_node_ptr
                                                                                                  331  get_rhs() =0;
                                                                                                  332  };
                                                                                                  333 
                                                                                                  335 
                                                                                                  338  public:
                                                                                                  339 
                                                                                                  341 
                                                                                                  346  virtual int
                                                                                                  347  get_num_operands() =0;
                                                                                                  348 
                                                                                                  350 
                                                                                                  351  virtual std::vector<yc_number_node_ptr>
                                                                                                  352  get_operands() =0;
                                                                                                  353 
                                                                                                  355  virtual void
                                                                                                  357  };
                                                                                                  358 
                                                                                                  360 
                                                                                                  // Addition node type: an empty class deriving virtually from
                                                                                                  // yc_commutative_number_node; it declares no members of its own.
                                                                                                  361  class yc_add_node : public virtual yc_commutative_number_node { };
                                                                                                  362 
                                                                                                  364 
                                                                                                  // Multiplication node type: an empty class deriving virtually from
                                                                                                  // yc_commutative_number_node; it declares no members of its own.
                                                                                                  365  class yc_multiply_node : public virtual yc_commutative_number_node { };
                                                                                                  366 
                                                                                                  368 
                                                                                                  // Abstract base for numeric nodes with two numeric operands; derives virtually
                                                                                                  // from yc_number_node. Base of the subtract, divide and mod node types in this listing.
                                                                                                  370  class yc_binary_number_node : public virtual yc_number_node {
                                                                                                  371  public:
                                                                                                  372 
                                                                                                  // Pure virtual; returns the left operand as a yc_number_node_ptr.
                                                                                                  374  virtual yc_number_node_ptr
                                                                                                  375  get_lhs() =0;
                                                                                                  376 
                                                                                                  // Pure virtual; returns the right operand as a yc_number_node_ptr.
                                                                                                  378  virtual yc_number_node_ptr
                                                                                                  379  get_rhs() =0;
                                                                                                  380  };
                                                                                                  381 
                                                                                                  383 
                                                                                                  // Subtraction node type: an empty class deriving virtually from
                                                                                                  // yc_binary_number_node; it declares no members of its own.
                                                                                                  384  class yc_subtract_node : public virtual yc_binary_number_node { };
                                                                                                  385 
                                                                                                  387 
                                                                                                  // Division node type: an empty class deriving virtually from
                                                                                                  // yc_binary_number_node; it declares no members of its own.
                                                                                                  388  class yc_divide_node : public virtual yc_binary_number_node { };
                                                                                                  389 
                                                                                                  391 
                                                                                                  // Modulo node type: an empty class deriving virtually from
                                                                                                  // yc_binary_number_node; it declares no members of its own.
                                                                                                  392  class yc_mod_node : public virtual yc_binary_number_node { };
                                                                                                  393 
                                                                                                  395 
                                                                                                  // Abstract interface for a boolean-negation node; a boolean node (derives
                                                                                                  // virtually from yc_bool_node) with a single boolean operand.
                                                                                                  398  class yc_not_node : public virtual yc_bool_node {
                                                                                                  399  public:
                                                                                                  400 
                                                                                                  402 
                                                                                                  // Pure virtual; returns the operand as a yc_bool_node_ptr.
                                                                                                  403  virtual yc_bool_node_ptr
                                                                                                  404  get_rhs() =0;
                                                                                                  405  };
                                                                                                  406 
                                                                                                  // Abstract base for boolean nodes with two boolean operands; derives virtually
                                                                                                  // from yc_bool_node. Base of the and/or node types in this listing.
                                                                                                  408  class yc_binary_bool_node : public virtual yc_bool_node {
                                                                                                  409  public:
                                                                                                  410 
                                                                                                  // Pure virtual; returns the left operand as a yc_bool_node_ptr.
                                                                                                  412  virtual yc_bool_node_ptr
                                                                                                  413  get_lhs() =0;
                                                                                                  414 
                                                                                                  // Pure virtual; returns the right operand as a yc_bool_node_ptr.
                                                                                                  416  virtual yc_bool_node_ptr
                                                                                                  417  get_rhs() =0;
                                                                                                  418  };
                                                                                                  419 
                                                                                                  421 
                                                                                                  // Logical-and node type: an empty class deriving virtually from
                                                                                                  // yc_binary_bool_node; it declares no members of its own.
                                                                                                  424  class yc_and_node : public virtual yc_binary_bool_node { };
                                                                                                  425 
                                                                                                  427 
                                                                                                  // Logical-or node type: an empty class deriving virtually from
                                                                                                  // yc_binary_bool_node; it declares no members of its own.
                                                                                                  430  class yc_or_node : public virtual yc_binary_bool_node { };
                                                                                                  431 
                                                                                                  // Abstract base for comparison nodes: a boolean node (derives virtually from
                                                                                                  // yc_bool_node) whose two operands are numeric rather than boolean.
                                                                                                  433  class yc_binary_comparison_node : public virtual yc_bool_node {
                                                                                                  434  public:
                                                                                                  435 
                                                                                                  437 
                                                                                                  // Pure virtual; returns the left operand as a yc_number_node_ptr.
                                                                                                  438  virtual yc_number_node_ptr
                                                                                                  439  get_lhs() =0;
                                                                                                  440 
                                                                                                  442 
                                                                                                  // Pure virtual; returns the right operand as a yc_number_node_ptr.
                                                                                                  443  virtual yc_number_node_ptr
                                                                                                  444  get_rhs() =0;
                                                                                                  445  };
                                                                                                  446 
                                                                                                  448 
                                                                                                  // Equality-comparison node type: an empty class deriving virtually from
                                                                                                  // yc_binary_comparison_node; it declares no members of its own.
                                                                                                  451  class yc_equals_node : public virtual yc_binary_comparison_node { };
                                                                                                  452 
                                                                                                  454 
                                                                                                  // Inequality-comparison node type: an empty class deriving virtually from
                                                                                                  // yc_binary_comparison_node; it declares no members of its own.
                                                                                                  457  class yc_not_equals_node : public virtual yc_binary_comparison_node { };
                                                                                                  458 
                                                                                                  460 
                                                                                                  // Less-than-comparison node type: an empty class deriving virtually from
                                                                                                  // yc_binary_comparison_node; it declares no members of its own.
                                                                                                  463  class yc_less_than_node : public virtual yc_binary_comparison_node { };
                                                                                                  464 
                                                                                                  466 
                                                                                                  470 
                                                                                                  472 
                                                                                                  476 
                                                                                                  478 
                                                                                                  482 
                                                                                                  483 #ifndef SWIG
                                                                                                  484 
                                                                                                  496 
                                                                                                  497  public:
                                                                                                  498 
                                                                                                  501  yc_number_node_ptr(p) { }
                                                                                                  502 
                                                                                                  505  yc_number_node_ptr(p) { }
                                                                                                  506 
                                                                                                  509  yc_number_node_ptr(p) { }
                                                                                                  510  };
                                                                                                  511 
                                                                                                  513 
                                                                                                  524 
                                                                                                  525  protected:
                                                                                                  526 
                                                                                                  528  yc_number_node_ptr _convert_const(double val) const;
                                                                                                  529 
                                                                                                  530  public:
                                                                                                  531 
                                                                                                  535 
                                                                                                  539 
                                                                                                  543 
                                                                                                  547  };
                                                                                                  548 
                                                                                                  550 
                                                                                                  562 
                                                                                                  563  protected:
                                                                                                  564 
                                                                                                  566  yc_number_node_ptr _convert_const(double val) const;
                                                                                                  567 
                                                                                                  568  public:
                                                                                                  569 
                                                                                                  572  yc_number_node_ptr(p) { }
                                                                                                  573 
                                                                                                  576  yc_number_node_ptr(p) { }
                                                                                                  577 
                                                                                                  580  yc_number_node_ptr(p) { }
                                                                                                  581 
                                                                                                  585 
                                                                                                  589 
                                                                                                  591  yc_number_any_arg(double f) :
                                                                                                  593 
                                                                                                  595  yc_number_any_arg(float f) :
                                                                                                  597 
                                                                                                  599  yc_number_any_arg(std::nullptr_t p) :
                                                                                                  600  yc_number_node_ptr(p) { }
                                                                                                  601  };
                                                                                                  602 #endif
                                                                                                  603 
                                                                                                  605 
                                                                                                  608  public:
                                                                                                  609  virtual ~yc_node_factory() {}
                                                                                                  610 
                                                                                                  612 
                                                                                                  618  virtual yc_index_node_ptr
                                                                                                  619  new_step_index(const std::string& name ) const;
                                                                                                  621 
                                                                                                  623 
                                                                                                  635  virtual yc_index_node_ptr
                                                                                                  636  new_domain_index(const std::string& name ) const;
                                                                                                  638 
                                                                                                  640 
                                                                                                  648  virtual yc_index_node_ptr
                                                                                                  649  new_misc_index(const std::string& name ) const;
                                                                                                  651 
                                                                                                  653 
                                                                                                  679  virtual yc_equation_node_ptr
                                                                                                  682  yc_number_node_ptr rhs,
                                                                                                  684  yc_bool_node_ptr sub_domain_cond = nullptr ) const;
                                                                                                  687 
                                                                                                  688 #ifndef SWIG
                                                                                                  689 
                                                                                                  695  virtual yc_number_node_ptr
                                                                                                  698  return std::move(arg);
                                                                                                  699  }
                                                                                                  700 #endif
                                                                                                  701 
                                                                                                  703 
                                                                                                  709  virtual yc_number_node_ptr
                                                                                                  710  new_const_number_node(double val ) const;
                                                                                                  712 
                                                                                                  714 
                                                                                                  720  virtual yc_number_node_ptr
                                                                                                  721  new_const_number_node(idx_t val ) const;
                                                                                                  723 
                                                                                                  725 
                                                                                                  730  virtual yc_number_node_ptr
                                                                                                  733 
                                                                                                  735 
                                                                                                  741  virtual yc_number_node_ptr
                                                                                                  743  yc_number_node_ptr rhs ) const;
                                                                                                  744 
                                                                                                  746 
                                                                                                  752  virtual yc_number_node_ptr
                                                                                                  754  yc_number_node_ptr rhs ) const;
                                                                                                  755 
                                                                                                  757 
                                                                                                  767  virtual yc_number_node_ptr
                                                                                                  769  yc_number_node_ptr rhs ) const;
                                                                                                  770 
                                                                                                  772 
                                                                                                  779  virtual yc_number_node_ptr
                                                                                                  781  yc_number_node_ptr rhs ) const;
                                                                                                  782 
                                                                                                  784 
                                                                                                  791  virtual yc_number_node_ptr
                                                                                                  793  yc_number_node_ptr rhs ) const;
                                                                                                  794 
                                                                                                  796 
                                                                                                  805  virtual yc_number_node_ptr
                                                                                                  808 
                                                                                                  810 
                                                                                                  819  virtual yc_number_node_ptr
                                                                                                  822 
                                                                                                  824 
                                                                                                  830  virtual yc_bool_node_ptr
                                                                                                  831  new_not_node(yc_bool_node_ptr rhs ) const;
                                                                                                  832 
                                                                                                  834 
                                                                                                  840  virtual yc_bool_node_ptr
                                                                                                  842  yc_bool_node_ptr rhs ) const;
                                                                                                  843 
                                                                                                  845 
                                                                                                  851  virtual yc_bool_node_ptr
                                                                                                  853  yc_bool_node_ptr rhs ) const;
                                                                                                  854 
                                                                                                  856 
                                                                                                  861  virtual yc_bool_node_ptr
                                                                                                  863  yc_number_node_ptr rhs ) const;
                                                                                                  864 
                                                                                                  866 
                                                                                                  871  virtual yc_bool_node_ptr
                                                                                                  873  yc_number_node_ptr rhs ) const;
                                                                                                  874 
                                                                                                  876 
                                                                                                  881  virtual yc_bool_node_ptr
                                                                                                  883  yc_number_node_ptr rhs ) const;
                                                                                                  884 
                                                                                                  886 
                                                                                                  891  virtual yc_bool_node_ptr
                                                                                                  893  yc_number_node_ptr rhs ) const;
                                                                                                  894 
                                                                                                  896 
                                                                                                  901  virtual yc_bool_node_ptr
                                                                                                  903  yc_number_node_ptr rhs ) const;
                                                                                                  904 
                                                                                                  906 
                                                                                                  911  virtual yc_bool_node_ptr
                                                                                                  913  yc_number_node_ptr rhs ) const;
                                                                                                  914 
                                                                                                  915  };
                                                                                                  916 
                                                                                                  918 #define UNARY_MATH_EXPR(fn_name) \
                                                                                                  919  yc_number_node_ptr fn_name(const yc_number_node_ptr rhs)
                                                                                                  920 
                                                                                                  922  UNARY_MATH_EXPR(sqrt);
                                                                                                  924  UNARY_MATH_EXPR(cbrt);
                                                                                                  926  UNARY_MATH_EXPR(fabs);
                                                                                                  928  UNARY_MATH_EXPR(erf);
                                                                                                  930  UNARY_MATH_EXPR(exp);
                                                                                                  932  UNARY_MATH_EXPR(log);
                                                                                                  934  UNARY_MATH_EXPR(sin);
                                                                                                  936  UNARY_MATH_EXPR(cos);
                                                                                                  938  UNARY_MATH_EXPR(atan);
                                                                                                  939 #undef UNARY_MATH_EXPR
                                                                                                  940 
                                                                                                  942 #define BINARY_MATH_EXPR(fn_name) \
                                                                                                  943  yc_number_node_ptr fn_name(const yc_number_node_ptr arg1, const yc_number_node_ptr arg2); \
                                                                                                  944  yc_number_node_ptr fn_name(double arg1, const yc_number_node_ptr arg2); \
                                                                                                  945  yc_number_node_ptr fn_name(const yc_number_node_ptr arg1, double arg2)
                                                                                                  946 
                                                                                                  948 
                                                                                                  952  BINARY_MATH_EXPR(pow);
                                                                                                  953 #undef BINARY_MATH_EXPR
                                                                                                  954 
                                                                                                  955 #if !defined SWIG
                                                                                                  956 
                                                                                                  957  // Non-class operators.
                                                                                                  958  // These are not defined for SWIG because
                                                                                                  959  // the Python operators are defined in the ".i" file.
                                                                                                  960  // For the binary operators, we define 3 combinations to implicitly
                                                                                                  961  // avoid the const-const combinations, which conflict with built-in
                                                                                                  962  // operators on fundamental C++ types, e.g., '5+8'.
                                                                                                  963 
                                                                                                  965  yc_number_node_ptr operator-(yc_number_ptr_arg rhs);
                                                                                                  966 
                                                                                                  968  yc_number_node_ptr operator+(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  970  yc_number_node_ptr operator+(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  972  yc_number_node_ptr operator+(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  973 
                                                                                                  975  yc_number_node_ptr operator/(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  977  yc_number_node_ptr operator/(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  979  yc_number_node_ptr operator/(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  980 
                                                                                                  982  yc_number_node_ptr operator%(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  984  yc_number_node_ptr operator%(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  986  yc_number_node_ptr operator%(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  987 
                                                                                                  989  yc_number_node_ptr operator*(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  991  yc_number_node_ptr operator*(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  993  yc_number_node_ptr operator*(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  994 
                                                                                                  996  yc_number_node_ptr operator-(yc_number_ptr_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  998  yc_number_node_ptr operator-(yc_number_const_arg lhs, yc_number_ptr_arg rhs);
                                                                                                  1000  yc_number_node_ptr operator-(yc_number_ptr_arg lhs, yc_number_const_arg rhs);
                                                                                                  1001 
                                                                                                  1005  void operator+=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1006 
                                                                                                  1010  void operator-=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1011 
                                                                                                  1015  void operator*=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1016 
                                                                                                  1020  void operator/=(yc_number_node_ptr& lhs, yc_number_const_arg rhs);
                                                                                                  1021 
                                                                                                  1023 
                                                                                                  1025 
                                                                                                  1027 
                                                                                                  1029 
                                                                                                  1031 
                                                                                                  1033 
                                                                                                  1035 
                                                                                                  1041 #define BOOL_OPER(oper, fn) \
                                                                                                  1042  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, const yc_number_node_ptr rhs) { \
                                                                                                  1043  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1044  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, const yc_index_node_ptr rhs) { \
                                                                                                  1045  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1046  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, const yc_var_point_node_ptr rhs) { \
                                                                                                  1047  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1048  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, const yc_number_node_ptr rhs) { \
                                                                                                  1049  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1050  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, const yc_index_node_ptr rhs) { \
                                                                                                  1051  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1052  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, const yc_var_point_node_ptr rhs) { \
                                                                                                  1053  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1054  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, const yc_number_node_ptr rhs) { \
                                                                                                  1055  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1056  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, const yc_index_node_ptr rhs) { \
                                                                                                  1057  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1058  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, const yc_var_point_node_ptr rhs) { \
                                                                                                  1059  yc_node_factory nfac; return nfac.fn(lhs, rhs); } \
                                                                                                  1060  inline yc_bool_node_ptr operator oper(const yc_number_node_ptr lhs, double rhs) { \
                                                                                                  1061  yc_node_factory nfac; return nfac.fn(lhs, nfac.new_number_node(rhs)); } \
                                                                                                  1062  inline yc_bool_node_ptr operator oper(const yc_index_node_ptr lhs, double rhs) { \
                                                                                                  1063  yc_node_factory nfac; return nfac.fn(lhs, nfac.new_number_node(rhs)); } \
                                                                                                  1064  inline yc_bool_node_ptr operator oper(const yc_var_point_node_ptr lhs, double rhs) { \
                                                                                                  1065  yc_node_factory nfac; return nfac.fn(lhs, nfac.new_number_node(rhs)); }
                                                                                                  1066 
                                                                                                  1067  BOOL_OPER(==, new_equals_node)
                                                                                                  1068  BOOL_OPER(!=, new_not_equals_node)
                                                                                                  1069  BOOL_OPER(<, new_less_than_node)
                                                                                                  1070  BOOL_OPER(>, new_greater_than_node)
                                                                                                  1071  BOOL_OPER(<=, new_not_greater_than_node)
                                                                                                  1072  BOOL_OPER(>=, new_not_less_than_node)
                                                                                                  1073 #undef BOOL_OPER
                                                                                                  1074 
                                                                                                  1076 
                                                                                                  1084 #define EQUALS <<
                                                                                                  1085 
                                                                                                  1087  yc_equation_node_ptr operator EQUALS(yc_var_point_node_ptr gpp, const yc_number_any_arg rhs);
                                                                                                  1088 
                                                                                                  1090 
                                                                                                  1094 #define IF_DOMAIN ^=
                                                                                                  1095 
                                                                                                  1098  const yc_bool_node_ptr cond);
                                                                                                  1099 
                                                                                                  1101 
                                                                                                  1105 #define IF_STEP |=
                                                                                                  1106 
                                                                                                  1109  const yc_bool_node_ptr cond);
                                                                                                  1110 
                                                                                                  1111 #endif // !SWIG.
                                                                                                  1112 
                                                                                                  1115 } // namespace yask.
                                                                                                  virtual yc_bool_node_ptr get_rhs()=0
                                                                                                  Get the [only] operand.
                                                                                                  virtual yc_bool_node_ptr new_not_equals_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a numerical-comparison 'not-equals' node.
                                                                                                  -
                                                                                                  Base class for boolean binary operators that take boolean inputs.
                                                                                                  Definition: yc_node_api.hpp:407
                                                                                                  +
                                                                                                  Base class for boolean binary operators that take boolean inputs.
                                                                                                  Definition: yc_node_api.hpp:408
                                                                                                  std::shared_ptr< yc_commutative_number_node > yc_commutative_number_node_ptr
                                                                                                  Shared pointer to yc_commutative_number_node.
                                                                                                  Definition: yc_node_api.hpp:51
                                                                                                  virtual yc_number_node_ptr new_add_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create an addition node.
                                                                                                  -
                                                                                                  Arguments that may be YASK or non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:560
                                                                                                  +
                                                                                                  Arguments that may be YASK or non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:561
                                                                                                  virtual int get_num_operands()=0
                                                                                                  Get the number of operands.
                                                                                                  virtual yc_bool_node_ptr new_or_node(yc_bool_node_ptr lhs, yc_bool_node_ptr rhs) const
                                                                                                  Create a boolean 'or' node.
                                                                                                  -
                                                                                                  yc_number_const_arg(idx_t i)
                                                                                                  Arg can be an index type.
                                                                                                  Definition: yc_node_api.hpp:532
                                                                                                  +
                                                                                                  yc_number_const_arg(idx_t i)
                                                                                                  Arg can be an index type.
                                                                                                  Definition: yc_node_api.hpp:533
                                                                                                  std::shared_ptr< yc_not_equals_node > yc_not_equals_node_ptr
                                                                                                  Shared pointer to yc_not_equals_node.
                                                                                                  Definition: yc_node_api.hpp:95
                                                                                                  -
                                                                                                  A modulo node.
                                                                                                  Definition: yc_node_api.hpp:391
                                                                                                  -
                                                                                                  A constant numerical value.
                                                                                                  Definition: yc_node_api.hpp:303
                                                                                                  -
                                                                                                  Factory to create AST nodes.
                                                                                                  Definition: yc_node_api.hpp:606
                                                                                                  +
                                                                                                  A modulo node.
                                                                                                  Definition: yc_node_api.hpp:392
                                                                                                  +
                                                                                                  A constant numerical value.
                                                                                                  Definition: yc_node_api.hpp:304
                                                                                                  +
                                                                                                  Factory to create AST nodes.
                                                                                                  Definition: yc_node_api.hpp:607
                                                                                                  virtual yc_number_node_ptr get_lhs()=0
                                                                                                  Get the left-hand-side operand.
                                                                                                  -
                                                                                                  yc_number_any_arg(double f)
                                                                                                  Arg can be a double.
                                                                                                  Definition: yc_node_api.hpp:590
                                                                                                  -
                                                                                                  yc_var_ptr get_grid()
                                                                                                  [Deprecated] Use get_var().
                                                                                                  Definition: yc_node_api.hpp:293
                                                                                                  -
                                                                                                  yc_number_any_arg(yc_number_node_ptr p)
                                                                                                  Arg can be a number-node pointer.
                                                                                                  Definition: yc_node_api.hpp:570
                                                                                                  +
                                                                                                  yc_number_any_arg(double f)
                                                                                                  Arg can be a double.
                                                                                                  Definition: yc_node_api.hpp:591
                                                                                                  +
                                                                                                  yc_number_any_arg(yc_number_node_ptr p)
                                                                                                  Arg can be a number-node pointer.
                                                                                                  Definition: yc_node_api.hpp:571
                                                                                                  virtual yc_bool_node_ptr new_not_greater_than_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a numerical-comparison 'less-than or equals' node.
                                                                                                  -
                                                                                                  A division node.
                                                                                                  Definition: yc_node_api.hpp:387
                                                                                                  -
                                                                                                  #define IF_DOMAIN
                                                                                                  Recommended macro to make the domain-condition operator readable and self-explanatory.
                                                                                                  Definition: yc_node_api.hpp:1093
                                                                                                  +
                                                                                                  A division node.
                                                                                                  Definition: yc_node_api.hpp:388
                                                                                                  +
                                                                                                  #define IF_DOMAIN
                                                                                                  Recommended macro to make the domain-condition operator readable and self-explanatory.
                                                                                                  Definition: yc_node_api.hpp:1094
                                                                                                  std::shared_ptr< yc_greater_than_node > yc_greater_than_node_ptr
                                                                                                  Shared pointer to yc_greater_than_node.
                                                                                                  Definition: yc_node_api.hpp:103
                                                                                                  virtual yc_var_ptr get_var()=0
                                                                                                  Get the var this point is in.
                                                                                                  -
                                                                                                  A numerical-comparison 'greater_than' operator.
                                                                                                  Definition: yc_node_api.hpp:468
                                                                                                  +
                                                                                                  A numerical-comparison 'greater_than' operator.
                                                                                                  Definition: yc_node_api.hpp:469
                                                                                                  virtual yc_index_node_ptr new_misc_index(const std::string &name) const
                                                                                                  Create a new miscellaneous index.
                                                                                                  virtual yc_number_node_ptr new_mod_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a modulo node.
                                                                                                  -
                                                                                                  A numerical negation operator.
                                                                                                  Definition: yc_node_api.hpp:322
                                                                                                  +
                                                                                                  A numerical negation operator.
                                                                                                  Definition: yc_node_api.hpp:323
                                                                                                  std::shared_ptr< yc_not_node > yc_not_node_ptr
                                                                                                  Shared pointer to yc_not_node.
                                                                                                  Definition: yc_node_api.hpp:87
                                                                                                  yc_bool_node_ptr operator &&(yc_bool_node_ptr lhs, yc_bool_node_ptr rhs)
                                                                                                  Operator version of yc_node_factory::new_and_node().
                                                                                                  virtual const std::string & get_name() const =0
                                                                                                  Get the dimension's name.
                                                                                                  @@ -107,79 +106,81 @@
                                                                                                  std::shared_ptr< yc_var_point_node > yc_var_point_node_ptr
                                                                                                  Shared pointer to yc_var_point_node.
                                                                                                  Definition: yask_compiler_api.hpp:79
                                                                                                  void operator+=(yc_number_node_ptr &lhs, yc_number_const_arg rhs)
                                                                                                  Shortcut for creating expression A = A + B.
                                                                                                  virtual void add_operand(yc_number_node_ptr node)=0
                                                                                                  Add an operand.
                                                                                                  -
                                                                                                  A boolean 'or' operator.
                                                                                                  Definition: yc_node_api.hpp:429
                                                                                                  -
                                                                                                  yc_number_any_arg(yc_var_point_node_ptr p)
                                                                                                  Arg can be a var-point-node pointer.
                                                                                                  Definition: yc_node_api.hpp:578
                                                                                                  +
                                                                                                  A boolean 'or' operator.
                                                                                                  Definition: yc_node_api.hpp:430
                                                                                                  +
                                                                                                  yc_number_any_arg(yc_var_point_node_ptr p)
                                                                                                  Arg can be a var-point-node pointer.
                                                                                                  Definition: yc_node_api.hpp:579
                                                                                                  std::shared_ptr< yc_binary_number_node > yc_binary_number_node_ptr
                                                                                                  Shared pointer to yc_binary_number_node.
                                                                                                  Definition: yc_node_api.hpp:55
                                                                                                  void operator *=(yc_number_node_ptr &lhs, yc_number_const_arg rhs)
                                                                                                  Shortcut for creating expression A = A * B.
                                                                                                  std::shared_ptr< yc_not_greater_than_node > yc_not_greater_than_node_ptr
                                                                                                  Shared pointer to yc_not_greater_than_node.
                                                                                                  Definition: yc_node_api.hpp:111
                                                                                                  virtual yc_number_node_ptr new_multiply_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a multiplication node.
                                                                                                  std::shared_ptr< yc_equals_node > yc_equals_node_ptr
                                                                                                  Shared pointer to yc_equals_node.
                                                                                                  Definition: yc_node_api.hpp:91
                                                                                                  Equation node.
                                                                                                  Definition: yc_node_api.hpp:149
                                                                                                  -
                                                                                                  yc_number_any_arg(yc_index_node_ptr p)
                                                                                                  Arg can be an index-node pointer.
                                                                                                  Definition: yc_node_api.hpp:574
                                                                                                  +
                                                                                                  yc_number_any_arg(yc_index_node_ptr p)
                                                                                                  Arg can be an index-node pointer.
                                                                                                  Definition: yc_node_api.hpp:575
                                                                                                  virtual yc_number_node_ptr new_negate_node(yc_number_node_ptr rhs) const
                                                                                                  Create a numerical negation operator node.
                                                                                                  virtual yc_equation_node_ptr new_equation_node(yc_var_point_node_ptr lhs, yc_number_node_ptr rhs, yc_bool_node_ptr sub_domain_cond=nullptr) const
                                                                                                  Create an equation node.
                                                                                                  virtual void set_cond(yc_bool_node_ptr sub_domain_cond)=0
                                                                                                  Set the condition describing the sub-domain for this equation.
                                                                                                  virtual yc_index_node_ptr new_step_index(const std::string &name) const
                                                                                                  Create a step-index node.
                                                                                                  -
                                                                                                  #define BINARY_MATH_EXPR(fn_name)
                                                                                                  Binary math functions. Used internally to define pow().
                                                                                                  Definition: yc_node_api.hpp:941
                                                                                                  +
                                                                                                  #define BINARY_MATH_EXPR(fn_name)
                                                                                                  Binary math functions. Used internally to define pow().
                                                                                                  Definition: yc_node_api.hpp:942
                                                                                                  std::shared_ptr< yc_index_node > yc_index_node_ptr
                                                                                                  Shared pointer to yc_index_node.
                                                                                                  Definition: yask_compiler_api.hpp:71
                                                                                                  -
                                                                                                  Base class for commutative numerical operators.
                                                                                                  Definition: yc_node_api.hpp:336
                                                                                                  +
                                                                                                  Base class for commutative numerical operators.
                                                                                                  Definition: yc_node_api.hpp:337
                                                                                                  virtual int get_num_nodes() const =0
                                                                                                  Count the size of the AST.
                                                                                                  -
                                                                                                  #define BOOL_OPER(oper, fn)
                                                                                                  Binary numerical-to-boolean operators. Used internally to define ==, <, etc.
                                                                                                  Definition: yc_node_api.hpp:1040
                                                                                                  +
                                                                                                  YASK_INT64_T idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:77
                                                                                                  +
                                                                                                  #define BOOL_OPER(oper, fn)
                                                                                                  Binary numerical-to-boolean operators. Used internally to define ==, <, etc.
                                                                                                  Definition: yc_node_api.hpp:1041
                                                                                                  virtual yc_number_node_ptr get_rhs()=0
                                                                                                  Get the right-hand-side operand.
                                                                                                  std::shared_ptr< yc_multiply_node > yc_multiply_node_ptr
                                                                                                  Shared pointer to yc_multiply_node.
                                                                                                  Definition: yc_node_api.hpp:71
                                                                                                  -
                                                                                                  A boolean 'and' operator.
                                                                                                  Definition: yc_node_api.hpp:423
                                                                                                  +
                                                                                                  A boolean 'and' operator.
                                                                                                  Definition: yc_node_api.hpp:424
                                                                                                  Base class for all numerical AST nodes.
                                                                                                  Definition: yc_node_api.hpp:247
                                                                                                  -
                                                                                                  yc_number_any_arg(std::nullptr_t p)
                                                                                                  Arg can be a null pointer.
                                                                                                  Definition: yc_node_api.hpp:598
                                                                                                  -
                                                                                                  Arguments that may be non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:522
                                                                                                  -
                                                                                                  yc_number_ptr_arg(yc_var_point_node_ptr p)
                                                                                                  Arg can be a var-point-node pointer.
                                                                                                  Definition: yc_node_api.hpp:507
                                                                                                  +
                                                                                                  yc_number_any_arg(std::nullptr_t p)
                                                                                                  Arg can be a null pointer.
                                                                                                  Definition: yc_node_api.hpp:599
                                                                                                  +
                                                                                                  Arguments that may be non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:523
                                                                                                  +
                                                                                                  yc_number_ptr_arg(yc_var_point_node_ptr p)
                                                                                                  Arg can be a var-point-node pointer.
                                                                                                  Definition: yc_node_api.hpp:508
                                                                                                  virtual yc_bool_node_ptr new_less_than_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a numerical-comparison 'less-than' node.
                                                                                                  virtual yc_bool_node_ptr new_equals_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a numerical-comparison 'equals' node.
                                                                                                  std::shared_ptr< yc_or_node > yc_or_node_ptr
                                                                                                  Shared pointer to yc_or_node.
                                                                                                  Definition: yc_node_api.hpp:119
                                                                                                  -
                                                                                                  #define UNARY_MATH_EXPR(fn_name)
                                                                                                  Unary math functions. Used internally to define sqrt(), sin(), etc.
                                                                                                  Definition: yc_node_api.hpp:917
                                                                                                  -
                                                                                                  yc_number_ptr_arg(yc_index_node_ptr p)
                                                                                                  Arg can be an index-node pointer.
                                                                                                  Definition: yc_node_api.hpp:503
                                                                                                  +
                                                                                                  #define UNARY_MATH_EXPR(fn_name)
                                                                                                  Unary math functions. Used internally to define sqrt(), sin(), etc.
                                                                                                  Definition: yc_node_api.hpp:918
                                                                                                  +
                                                                                                  yc_number_ptr_arg(yc_index_node_ptr p)
                                                                                                  Arg can be an index-node pointer.
                                                                                                  Definition: yc_node_api.hpp:504
                                                                                                  std::shared_ptr< yc_divide_node > yc_divide_node_ptr
                                                                                                  Shared pointer to yc_divide_node.
                                                                                                  Definition: yc_node_api.hpp:79
                                                                                                  std::shared_ptr< yc_bool_node > yc_bool_node_ptr
                                                                                                  Shared pointer to yc_bool_node.
                                                                                                  Definition: yask_compiler_api.hpp:63
                                                                                                  virtual void set_step_cond(yc_bool_node_ptr step_cond)=0
                                                                                                  Set the condition describing when the equation is valid.
                                                                                                  std::shared_ptr< yc_equation_node > yc_equation_node_ptr
                                                                                                  Shared pointer to yc_equation_node.
                                                                                                  Definition: yask_compiler_api.hpp:75
                                                                                                  -
                                                                                                  yc_number_any_arg(int i)
                                                                                                  Arg can be an int.
                                                                                                  Definition: yc_node_api.hpp:586
                                                                                                  +
                                                                                                  yc_number_any_arg(int i)
                                                                                                  Arg can be an int.
                                                                                                  Definition: yc_node_api.hpp:587
                                                                                                  virtual yc_number_node_ptr get_rhs()=0
                                                                                                  Get the right-hand-side operand.
                                                                                                  -
                                                                                                  A numerical-comparison 'not_equals' operator.
                                                                                                  Definition: yc_node_api.hpp:456
                                                                                                  -
                                                                                                  A numerical-comparison 'equals' operator.
                                                                                                  Definition: yc_node_api.hpp:450
                                                                                                  +
                                                                                                  A numerical-comparison 'not_equals' operator.
                                                                                                  Definition: yc_node_api.hpp:457
                                                                                                  +
                                                                                                  A numerical-comparison 'equals' operator.
                                                                                                  Definition: yc_node_api.hpp:451
                                                                                                  A reference to a point in a var.
                                                                                                  Definition: yc_node_api.hpp:283
                                                                                                  virtual yc_number_node_ptr new_divide_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a division node.
                                                                                                  virtual std::vector< yc_number_node_ptr > get_operands()=0
                                                                                                  Get a list of the operands.
                                                                                                  virtual yc_var_point_node_ptr get_lhs()=0
                                                                                                  Get the left-hand-side operand.
                                                                                                  std::shared_ptr< yc_less_than_node > yc_less_than_node_ptr
                                                                                                  Shared pointer to yc_less_than_node.
                                                                                                  Definition: yc_node_api.hpp:99
                                                                                                  virtual yc_bool_node_ptr get_rhs()=0
                                                                                                  Get the right-hand-side operand.
                                                                                                  -
                                                                                                  A numerical-comparison 'less_than' operator.
                                                                                                  Definition: yc_node_api.hpp:462
                                                                                                  -
                                                                                                  yc_number_const_arg(double f)
                                                                                                  Arg can be a double.
                                                                                                  Definition: yc_node_api.hpp:540
                                                                                                  +
                                                                                                  A numerical-comparison 'less_than' operator.
                                                                                                  Definition: yc_node_api.hpp:463
                                                                                                  +
                                                                                                  yc_number_const_arg(double f)
                                                                                                  Arg can be a double.
                                                                                                  Definition: yc_node_api.hpp:541
                                                                                                  yc_number_node_ptr _convert_const(double val) const
                                                                                                  Create an argument from a constant value.
                                                                                                  yc_bool_node_ptr operator!(yc_bool_node_ptr rhs)
                                                                                                  Operator version of yc_node_factory::new_not_node().
                                                                                                  Base class for all AST nodes.
                                                                                                  Definition: yc_node_api.hpp:125
                                                                                                  std::shared_ptr< yc_binary_comparison_node > yc_binary_comparison_node_ptr
                                                                                                  Shared pointer to yc_binary_comparison_node.
                                                                                                  Definition: yc_node_api.hpp:63
                                                                                                  -
                                                                                                  yc_number_const_arg(float f)
                                                                                                  Arg can be a float.
                                                                                                  Definition: yc_node_api.hpp:544
                                                                                                  -
                                                                                                  A boolean inversion operator.
                                                                                                  Definition: yc_node_api.hpp:397
                                                                                                  +
                                                                                                  yc_number_const_arg(float f)
                                                                                                  Arg can be a float.
                                                                                                  Definition: yc_node_api.hpp:545
                                                                                                  +
                                                                                                  A boolean inversion operator.
                                                                                                  Definition: yc_node_api.hpp:398
                                                                                                  yc_number_node_ptr operator/(yc_number_ptr_arg lhs, yc_number_const_arg rhs)
                                                                                                  Operator version of yc_node_factory::new_divide_node().
                                                                                                  void operator/=(yc_number_node_ptr &lhs, yc_number_const_arg rhs)
                                                                                                  Shortcut for creating expression A = A / B.
                                                                                                  -
                                                                                                  A numerical-comparison 'not_greater_than' operator.
                                                                                                  Definition: yc_node_api.hpp:480
                                                                                                  +
                                                                                                  A numerical-comparison 'not_greater_than' operator.
                                                                                                  Definition: yc_node_api.hpp:481
                                                                                                  Base class for all boolean AST nodes.
                                                                                                  Definition: yc_node_api.hpp:256
                                                                                                  virtual yc_number_node_ptr new_first_domain_index(yc_index_node_ptr idx) const
                                                                                                  Create a symbol for the first index value in a given dimension.
                                                                                                  -
                                                                                                  An addition node.
                                                                                                  Definition: yc_node_api.hpp:360
                                                                                                  +
                                                                                                  An addition node.
                                                                                                  Definition: yc_node_api.hpp:361
                                                                                                  std::shared_ptr< yc_number_node > yc_number_node_ptr
                                                                                                  Shared pointer to yc_number_node.
                                                                                                  Definition: yask_compiler_api.hpp:67
                                                                                                  -
                                                                                                  Base class for numerical binary operators.
                                                                                                  Definition: yc_node_api.hpp:369
                                                                                                  +
                                                                                                  #define YASK_DEPRECATED
                                                                                                  Deprecated attribute.
                                                                                                  Definition: yask_common_api.hpp:55
                                                                                                  +
                                                                                                  Base class for numerical binary operators.
                                                                                                  Definition: yc_node_api.hpp:370
                                                                                                  yc_number_node_ptr operator *(yc_number_ptr_arg lhs, yc_number_const_arg rhs)
                                                                                                  Operator version of yc_node_factory::new_multiply_node().
                                                                                                  -
                                                                                                  A subtraction node.
                                                                                                  Definition: yc_node_api.hpp:383
                                                                                                  +
                                                                                                  A subtraction node.
                                                                                                  Definition: yc_node_api.hpp:384
                                                                                                  virtual yc_bool_node_ptr get_cond()=0
                                                                                                  Get the condition describing the sub-domain.
                                                                                                  -
                                                                                                  virtual yc_number_node_ptr new_number_node(yc_number_any_arg arg) const
                                                                                                  Create a numerical-value expression node.
                                                                                                  Definition: yc_node_api.hpp:695
                                                                                                  +
                                                                                                  virtual yc_number_node_ptr new_number_node(yc_number_any_arg arg) const
                                                                                                  Create a numerical-value expression node.
                                                                                                  Definition: yc_node_api.hpp:696
                                                                                                  virtual yc_number_node_ptr new_const_number_node(double val) const
                                                                                                  Create a constant numerical-value node.
                                                                                                  -
                                                                                                  yc_number_any_arg(idx_t i)
                                                                                                  Arg can be an index type.
                                                                                                  Definition: yc_node_api.hpp:582
                                                                                                  +
                                                                                                  yc_number_any_arg(idx_t i)
                                                                                                  Arg can be an index type.
                                                                                                  Definition: yc_node_api.hpp:583
                                                                                                  void operator-=(yc_number_node_ptr &lhs, yc_number_const_arg rhs)
                                                                                                  Shortcut for creating expression A = A - B.
                                                                                                  virtual std::string format_simple() const =0
                                                                                                  Create a simple human-readable string.
                                                                                                  yc_bool_node_ptr operator||(yc_bool_node_ptr lhs, yc_bool_node_ptr rhs)
                                                                                                  Operator version of yc_node_factory::new_or_node().
                                                                                                  virtual yc_number_node_ptr get_lhs()=0
                                                                                                  Get the left-hand-side operand.
                                                                                                  -
                                                                                                  #define EQUALS
                                                                                                  Recommended macro to make the "equality" operator readable and self-explanatory.
                                                                                                  Definition: yc_node_api.hpp:1083
                                                                                                  +
                                                                                                  #define EQUALS
                                                                                                  Recommended macro to make the "equality" operator readable and self-explanatory.
                                                                                                  Definition: yc_node_api.hpp:1084
                                                                                                  virtual yc_bool_node_ptr new_not_less_than_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a numerical-comparison 'greater-than or equals' node.
                                                                                                  -
                                                                                                  Arguments that may be YASK numeric pointer types.
                                                                                                  Definition: yc_node_api.hpp:494
                                                                                                  +
                                                                                                  Arguments that may be YASK numeric pointer types.
                                                                                                  Definition: yc_node_api.hpp:495
                                                                                                  virtual yc_bool_node_ptr new_not_node(yc_bool_node_ptr rhs) const
                                                                                                  Create a binary inverse operator node.
                                                                                                  std::shared_ptr< yc_negate_node > yc_negate_node_ptr
                                                                                                  Shared pointer to yc_negate_node.
                                                                                                  Definition: yc_node_api.hpp:47
                                                                                                  A dimension or an index in that dimension.
                                                                                                  Definition: yc_node_api.hpp:270
                                                                                                  @@ -187,10 +188,10 @@
                                                                                                  virtual yc_index_node_ptr new_domain_index(const std::string &name) const
                                                                                                  Create a domain-index node.
                                                                                                  std::shared_ptr< yc_not_less_than_node > yc_not_less_than_node_ptr
                                                                                                  Shared pointer to yc_not_less_than_node.
                                                                                                  Definition: yc_node_api.hpp:107
                                                                                                  virtual yc_bool_node_ptr clone_ast() const =0
                                                                                                  Create a deep copy of AST starting with this node.
                                                                                                  -
                                                                                                  A compile-time data variable.
                                                                                                  Definition: yask_compiler_api.hpp:711
                                                                                                  +
                                                                                                  A compile-time data variable.
                                                                                                  Definition: yask_compiler_api.hpp:715
                                                                                                  virtual yc_number_node_ptr clone_ast() const =0
                                                                                                  Create a deep copy of AST starting with this node.
                                                                                                  virtual yc_number_node_ptr new_last_domain_index(yc_index_node_ptr idx) const
                                                                                                  Create a symbol for the last index value in a given dimension.
                                                                                                  -
                                                                                                  yc_number_const_arg(int i)
                                                                                                  Arg can be an int.
                                                                                                  Definition: yc_node_api.hpp:536
                                                                                                  +
                                                                                                  yc_number_const_arg(int i)
                                                                                                  Arg can be an int.
                                                                                                  Definition: yc_node_api.hpp:537
                                                                                                  std::shared_ptr< yc_add_node > yc_add_node_ptr
                                                                                                  Shared pointer to yc_add_node.
                                                                                                  Definition: yc_node_api.hpp:67
                                                                                                  yc_number_node_ptr operator-(yc_number_ptr_arg lhs, yc_number_const_arg rhs)
                                                                                                  Operator version of yc_node_factory::new_subtract_node().
                                                                                                  virtual double get_value() const =0
                                                                                                  Get the stored value.
                                                                                                  @@ -199,17 +200,17 @@
                                                                                                  std::shared_ptr< yc_const_number_node > yc_const_number_node_ptr
                                                                                                  Shared pointer to yc_const_number_node.
                                                                                                  Definition: yc_node_api.hpp:43
                                                                                                  yc_number_node_ptr operator%(yc_number_ptr_arg lhs, yc_number_const_arg rhs)
                                                                                                  Operator version of yc_node_factory::new_mod_node().
                                                                                                  virtual yc_number_node_ptr get_rhs()=0
                                                                                                  Get the right-hand-side operand.
                                                                                                  -
                                                                                                  Base class for boolean binary operators that take numerical inputs.
                                                                                                  Definition: yc_node_api.hpp:432
                                                                                                  +
                                                                                                  Base class for boolean binary operators that take numerical inputs.
                                                                                                  Definition: yc_node_api.hpp:433
                                                                                                  yc_number_node_ptr _convert_const(double val) const
                                                                                                  Create an argument from a constant value.
                                                                                                  -
                                                                                                  std::int64_t idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:61
                                                                                                  virtual yc_number_node_ptr new_subtract_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a subtraction node.
                                                                                                  -
                                                                                                  #define IF_STEP
                                                                                                  Recommended macro to make the step-condition operator readable and self-explanatory.
                                                                                                  Definition: yc_node_api.hpp:1104
                                                                                                  +
                                                                                                  #define IF_STEP
                                                                                                  Recommended macro to make the step-condition operator readable and self-explanatory.
                                                                                                  Definition: yc_node_api.hpp:1105
                                                                                                  virtual void set_value(double val)=0
                                                                                                  Set the value.
                                                                                                  -
                                                                                                  A numerical-comparison 'not_less_than' operator.
                                                                                                  Definition: yc_node_api.hpp:474
                                                                                                  -
                                                                                                  A multiplication node.
                                                                                                  Definition: yc_node_api.hpp:364
                                                                                                  +
                                                                                                  A numerical-comparison 'not_less_than' operator.
                                                                                                  Definition: yc_node_api.hpp:475
                                                                                                  +
                                                                                                  A multiplication node.
                                                                                                  Definition: yc_node_api.hpp:365
                                                                                                  +
                                                                                                  YASK_DEPRECATED yc_var_ptr get_grid()
                                                                                                  [Deprecated] Use get_var().
                                                                                                  Definition: yc_node_api.hpp:294
                                                                                                  virtual yc_bool_node_ptr new_greater_than_node(yc_number_node_ptr lhs, yc_number_node_ptr rhs) const
                                                                                                  Create a numerical-comparison 'greater-than' node.
                                                                                                  -
                                                                                                  yc_number_any_arg(float f)
                                                                                                  Arg can be a float.
                                                                                                  Definition: yc_node_api.hpp:594
                                                                                                  -
                                                                                                  yc_number_ptr_arg(yc_number_node_ptr p)
                                                                                                  Arg can be a number-node pointer.
                                                                                                  Definition: yc_node_api.hpp:499
                                                                                                  +
                                                                                                  yc_number_any_arg(float f)
                                                                                                  Arg can be a float.
                                                                                                  Definition: yc_node_api.hpp:595
                                                                                                  +
                                                                                                  yc_number_ptr_arg(yc_number_node_ptr p)
                                                                                                  Arg can be a number-node pointer.
                                                                                                  Definition: yc_node_api.hpp:500
                                                                                                  virtual yc_bool_node_ptr new_and_node(yc_bool_node_ptr lhs, yc_bool_node_ptr rhs) const
                                                                                                  Create a boolean 'and' node.
                                                                                                  diff --git a/docs/api/html/yc__solution__api_8hpp_source.html b/docs/api/html/yc__solution__api_8hpp_source.html index 039c7b0f..1b19d585 100644 --- a/docs/api/html/yc__solution__api_8hpp_source.html +++ b/docs/api/html/yc__solution__api_8hpp_source.html @@ -70,9 +70,9 @@
                                                                                                  yc_solution_api.hpp
                                                                                                  -Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2021, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  26 // This file contains a base class and macros to create
                                                                                                  27 // stencils to be included in the YASK compiler binary utility.
                                                                                                  28 
                                                                                                  29 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  30 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  33 #pragma once
                                                                                                  34 
                                                                                                  35 // Standard headers.
                                                                                                  36 #include <cassert>
                                                                                                  37 #include <map>
                                                                                                  38 
                                                                                                  39 namespace yask {
                                                                                                  40 
                                                                                                  46 
                                                                                                  58 
                                                                                                  59  public:
                                                                                                  61  typedef std::map<std::string, yc_solution_base*> soln_map;
                                                                                                  62 
                                                                                                  63  private:
                                                                                                  64 
                                                                                                  66  yc_solution_ptr _soln;
                                                                                                  67 
                                                                                                  69  yc_factory _yc_factory;
                                                                                                  70 
                                                                                                  72  yc_node_factory _node_factory;
                                                                                                  73 
                                                                                                  74  public:
                                                                                                  75 
                                                                                                  77 
                                                                                                  82  yc_solution_base(const std::string& name);
                                                                                                  83 
                                                                                                  85 
                                                                                                  91 
                                                                                                  93  virtual ~yc_solution_base() { }
                                                                                                  94 
                                                                                                  96 
                                                                                                  99  static soln_map& get_registry();
                                                                                                  100 
                                                                                                  102 
                                                                                                  117  virtual void
                                                                                                  118  define();
                                                                                                  119 
                                                                                                  121  inline yc_solution_ptr
                                                                                                  123  return _soln;
                                                                                                  124  }
                                                                                                  125 
                                                                                                  127  inline yc_index_node_ptr
                                                                                                  128  new_step_index(const std::string& name) {
                                                                                                  129  return _node_factory.new_step_index(name);
                                                                                                  130  }
                                                                                                  131 
                                                                                                  133  inline yc_index_node_ptr
                                                                                                  134  new_domain_index(const std::string& name) {
                                                                                                  135  return _node_factory.new_domain_index(name);
                                                                                                  136  }
                                                                                                  137 
                                                                                                  139  inline yc_index_node_ptr
                                                                                                  140  new_misc_index(const std::string& name) {
                                                                                                  141  return _node_factory.new_misc_index(name);
                                                                                                  142  }
                                                                                                  143 
                                                                                                  145  inline yc_number_node_ptr
                                                                                                  147  return _node_factory.new_number_node(arg);
                                                                                                  148  }
                                                                                                  149 
                                                                                                  151  inline yc_number_node_ptr
                                                                                                  153  return _node_factory.new_first_domain_index(dim);
                                                                                                  154  }
                                                                                                  155 
                                                                                                  157  inline yc_number_node_ptr
                                                                                                  159  return _node_factory.new_last_domain_index(dim);
                                                                                                  160  }
                                                                                                  161  };
                                                                                                  162 
                                                                                                  164 
                                                                                                  172  private:
                                                                                                  173 
                                                                                                  175  int _radius;
                                                                                                  176 
                                                                                                  177  public:
                                                                                                  179  yc_solution_with_radius_base(const std::string& name, int radius) :
                                                                                                  180  yc_solution_base(name) {
                                                                                                  181  set_radius(radius);
                                                                                                  182  }
                                                                                                  183 
                                                                                                  185 
                                                                                                  188  virtual void
                                                                                                  189  define() override;
                                                                                                  190 
                                                                                                  192 
                                                                                                  197  virtual bool
                                                                                                  198  set_radius(int radius) {
                                                                                                  199  _radius = radius;
                                                                                                  200  auto soln = get_soln();
                                                                                                  201  soln->set_description(soln->get_name() + " radius " + std::to_string(radius));
                                                                                                  202  return radius >= 0; // support only non-neg. radius.
                                                                                                  203  }
                                                                                                  204 
                                                                                                  206 
                                                                                                  209  virtual int
                                                                                                  210  get_radius() const {
                                                                                                  211  return _radius;
                                                                                                  212  }
                                                                                                  213  };
                                                                                                  214 
                                                                                                  217 } // namespace yask.
                                                                                                  virtual ~yc_solution_base()
                                                                                                  Destructor.
                                                                                                  Definition: yc_solution_api.hpp:93
                                                                                                  -
                                                                                                  Arguments that may be YASK or non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:560
                                                                                                  -
                                                                                                  Factory to create AST nodes.
                                                                                                  Definition: yc_node_api.hpp:606
                                                                                                  +Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2022, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  26 // This file contains a base class and macros to create
                                                                                                  27 // stencils to be included in the YASK compiler binary utility.
                                                                                                  28 
                                                                                                  29 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  30 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  33 #pragma once
                                                                                                  34 
                                                                                                  35 // Standard headers.
                                                                                                  36 #include <cassert>
                                                                                                  37 #include <map>
                                                                                                  38 
                                                                                                  39 namespace yask {
                                                                                                  40 
                                                                                                  46 
                                                                                                  58 
                                                                                                  59  public:
                                                                                                  61  typedef std::map<std::string, yc_solution_base*> soln_map;
                                                                                                  62 
                                                                                                  63  private:
                                                                                                  64 
                                                                                                  66  yc_solution_ptr _soln;
                                                                                                  67 
                                                                                                  69  yc_factory _yc_factory;
                                                                                                  70 
                                                                                                  72  yc_node_factory _node_factory;
                                                                                                  73 
                                                                                                  74  public:
                                                                                                  75 
                                                                                                  77 
                                                                                                  82  yc_solution_base(const std::string& name);
                                                                                                  83 
                                                                                                  85 
                                                                                                  91 
                                                                                                  93  virtual ~yc_solution_base() { }
                                                                                                  94 
                                                                                                  96 
                                                                                                  99  static soln_map& get_registry();
                                                                                                  100 
                                                                                                  102 
                                                                                                  117  virtual void
                                                                                                  118  define();
                                                                                                  119 
                                                                                                  121  inline yc_solution_ptr
                                                                                                  123  return _soln;
                                                                                                  124  }
                                                                                                  125 
                                                                                                  127  inline yc_index_node_ptr
                                                                                                  128  new_step_index(const std::string& name) {
                                                                                                  129  return _node_factory.new_step_index(name);
                                                                                                  130  }
                                                                                                  131 
                                                                                                  133  inline yc_index_node_ptr
                                                                                                  134  new_domain_index(const std::string& name) {
                                                                                                  135  return _node_factory.new_domain_index(name);
                                                                                                  136  }
                                                                                                  137 
                                                                                                  139  inline yc_index_node_ptr
                                                                                                  140  new_misc_index(const std::string& name) {
                                                                                                  141  return _node_factory.new_misc_index(name);
                                                                                                  142  }
                                                                                                  143 
                                                                                                  145  inline yc_number_node_ptr
                                                                                                  147  return _node_factory.new_number_node(arg);
                                                                                                  148  }
                                                                                                  149 
                                                                                                  151  inline yc_number_node_ptr
                                                                                                  153  return _node_factory.new_first_domain_index(dim);
                                                                                                  154  }
                                                                                                  155 
                                                                                                  157  inline yc_number_node_ptr
                                                                                                  159  return _node_factory.new_last_domain_index(dim);
                                                                                                  160  }
                                                                                                  161  };
                                                                                                  162 
                                                                                                  164 
                                                                                                  172  private:
                                                                                                  173 
                                                                                                  175  int _radius;
                                                                                                  176 
                                                                                                  177  public:
                                                                                                  179  yc_solution_with_radius_base(const std::string& name, int radius) :
                                                                                                  180  yc_solution_base(name) {
                                                                                                  181  set_radius(radius);
                                                                                                  182  }
                                                                                                  183 
                                                                                                  185 
                                                                                                  188  virtual void
                                                                                                  189  define() override;
                                                                                                  190 
                                                                                                  192 
                                                                                                  197  virtual bool
                                                                                                  198  set_radius(int radius) {
                                                                                                  199  _radius = radius;
                                                                                                  200  auto soln = get_soln();
                                                                                                  201  soln->set_description(soln->get_name() + " radius " + std::to_string(radius));
                                                                                                  202  return radius >= 0; // support only non-neg. radius.
                                                                                                  203  }
                                                                                                  204 
                                                                                                  206 
                                                                                                  209  virtual int
                                                                                                  210  get_radius() const {
                                                                                                  211  return _radius;
                                                                                                  212  }
                                                                                                  213  };
                                                                                                  214 
                                                                                                  217 } // namespace yask.
                                                                                                  virtual ~yc_solution_base()
                                                                                                  Destructor.
                                                                                                  Definition: yc_solution_api.hpp:93
                                                                                                  +
                                                                                                  Arguments that may be YASK or non-YASK numeric types.
                                                                                                  Definition: yc_node_api.hpp:561
                                                                                                  +
                                                                                                  Factory to create AST nodes.
                                                                                                  Definition: yc_node_api.hpp:607
                                                                                                  yc_index_node_ptr new_step_index(const std::string &name)
                                                                                                  A simple wrapper for yc_node_factory::new_step_index().
                                                                                                  Definition: yc_solution_api.hpp:128
                                                                                                  virtual yc_index_node_ptr new_misc_index(const std::string &name) const
                                                                                                  Create a new miscellaneous index.
                                                                                                  yc_number_node_ptr first_domain_index(yc_index_node_ptr dim)
                                                                                                  A simple wrapper for yc_node_factory::new_first_domain_index().
                                                                                                  Definition: yc_solution_api.hpp:152
                                                                                                  @@ -88,7 +88,7 @@
                                                                                                  yc_solution_base(const std::string &name)
                                                                                                  Constructor.
                                                                                                  virtual yc_number_node_ptr new_first_domain_index(yc_index_node_ptr idx) const
                                                                                                  Create a symbol for the first index value in a given dimension.
                                                                                                  std::shared_ptr< yc_number_node > yc_number_node_ptr
                                                                                                  Shared pointer to yc_number_node.
                                                                                                  Definition: yask_compiler_api.hpp:67
                                                                                                  -
                                                                                                  virtual yc_number_node_ptr new_number_node(yc_number_any_arg arg) const
                                                                                                  Create a numerical-value expression node.
                                                                                                  Definition: yc_node_api.hpp:695
                                                                                                  +
                                                                                                  virtual yc_number_node_ptr new_number_node(yc_number_any_arg arg) const
                                                                                                  Create a numerical-value expression node.
                                                                                                  Definition: yc_node_api.hpp:696
                                                                                                  A base class for stencils that have a "radius" size parameter.
                                                                                                  Definition: yc_solution_api.hpp:171
                                                                                                  yc_number_node_ptr last_domain_index(yc_index_node_ptr dim)
                                                                                                  A simple wrapper for yc_node_factory::new_last_domain_index().
                                                                                                  Definition: yc_solution_api.hpp:158
                                                                                                  static soln_map & get_registry()
                                                                                                  Access to the registry.
                                                                                                  diff --git a/docs/api/html/yk__solution__api_8hpp.html b/docs/api/html/yk__solution__api_8hpp.html index 0f192516..a3e16a97 100644 --- a/docs/api/html/yk__solution__api_8hpp.html +++ b/docs/api/html/yk__solution__api_8hpp.html @@ -97,6 +97,9 @@ const int yask::yask_numa_none = -9  Do not specify any NUMA binding. More...
                                                                                                    +const int yask::yask_numa_offload = -11 + Do not specify any NUMA binding and use allocations optimized for offloading. More...
                                                                                                  diff --git a/docs/api/html/yk__solution__api_8hpp_source.html b/docs/api/html/yk__solution__api_8hpp_source.html index 46b9b7ec..12cffbd6 100644 --- a/docs/api/html/yk__solution__api_8hpp_source.html +++ b/docs/api/html/yk__solution__api_8hpp_source.html @@ -70,14 +70,15 @@
                                                                                                  yk_solution_api.hpp
                                                                                                  -Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2021, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_kernel_api.hpp"
                                                                                                  35 
                                                                                                  36 namespace yask {
                                                                                                  37 
                                                                                                  43 
                                                                                                  49  const int yask_numa_local = -1;
                                                                                                  50 
                                                                                                  52 
                                                                                                  57  const int yask_numa_interleave = -2;
                                                                                                  58 
                                                                                                  60 
                                                                                                  65  const int yask_numa_none = -9;
                                                                                                  66 
                                                                                                  68 
                                                                                                  74  class yk_solution {
                                                                                                  75  public:
                                                                                                  76  virtual ~yk_solution() {}
                                                                                                  77 
                                                                                                  79 
                                                                                                  83  virtual void
                                                                                                  87 
                                                                                                  89 
                                                                                                  92  virtual const std::string&
                                                                                                  93  get_name() const =0;
                                                                                                  94 
                                                                                                  96 
                                                                                                  101  virtual std::string
                                                                                                  102  get_target() const =0;
                                                                                                  103 
                                                                                                  105 
                                                                                                  108  virtual int
                                                                                                  109  get_element_bytes() const =0;
                                                                                                  110 
                                                                                                  112 
                                                                                                  117  virtual std::string
                                                                                                  118  get_step_dim_name() const =0;
                                                                                                  119 
                                                                                                  121 
                                                                                                  127  virtual int
                                                                                                  128  get_num_domain_dims() const =0;
                                                                                                  129 
                                                                                                  131 
                                                                                                  136  virtual std::vector<std::string>
                                                                                                  137  get_domain_dim_names() const =0;
                                                                                                  138 
                                                                                                  140 
                                                                                                  148  virtual std::vector<std::string>
                                                                                                  149  get_misc_dim_names() const =0;
                                                                                                  150 
                                                                                                  152 
                                                                                                  175  virtual void
                                                                                                  176  set_rank_domain_size(const std::string& dim,
                                                                                                  179  idx_t size ) =0;
                                                                                                  180 
                                                                                                  182 
                                                                                                  192  virtual idx_t
                                                                                                  193  get_rank_domain_size(const std::string& dim) const =0;
                                                                                                  196 
                                                                                                  198 
                                                                                                  208  virtual void
                                                                                                  209  set_overall_domain_size(const std::string& dim,
                                                                                                  212  idx_t size ) =0;
                                                                                                  213 
                                                                                                  215 
                                                                                                  228  virtual idx_t
                                                                                                  229  get_overall_domain_size(const std::string& dim ) const =0;
                                                                                                  232 
                                                                                                  234 
                                                                                                  251  virtual void
                                                                                                  252  set_block_size(const std::string& dim,
                                                                                                  256  idx_t size ) =0;
                                                                                                  258 
                                                                                                  260 
                                                                                                  265  virtual idx_t
                                                                                                  266  get_block_size(const std::string& dim) const =0;
                                                                                                  270 
                                                                                                  272 
                                                                                                  299  virtual void
                                                                                                  300  set_num_ranks(const std::string& dim,
                                                                                                  303  idx_t num ) =0;
                                                                                                  304 
                                                                                                  306 
                                                                                                  309  virtual idx_t
                                                                                                  310  get_num_ranks(const std::string& dim) const =0;
                                                                                                  313 
                                                                                                  315 
                                                                                                  336  virtual void
                                                                                                  337  set_rank_index(const std::string& dim,
                                                                                                  340  idx_t num ) =0;
                                                                                                  341 
                                                                                                  343 
                                                                                                  348  virtual idx_t
                                                                                                  349  get_rank_index(const std::string& dim ) const =0;
                                                                                                  352 
                                                                                                  354 
                                                                                                  368  virtual std::string
                                                                                                  369  apply_command_line_options(const std::string& args ) =0;
                                                                                                  371 
                                                                                                  373 
                                                                                                  380  virtual std::string
                                                                                                  381  apply_command_line_options(int argc, char* argv[]) =0;
                                                                                                  382 
                                                                                                  384 
                                                                                                  390  virtual std::string
                                                                                                  391  apply_command_line_options(const std::vector<std::string>& args) =0;
                                                                                                  392 
                                                                                                  394 
                                                                                                  401  virtual int
                                                                                                  402  get_num_vars() const =0;
                                                                                                  403 
                                                                                                  405 
                                                                                                  409  virtual yk_var_ptr
                                                                                                  410  get_var(const std::string& name ) =0;
                                                                                                  412 
                                                                                                  414 
                                                                                                  417  virtual std::vector<yk_var_ptr>
                                                                                                  418  get_vars() =0;
                                                                                                  419 
                                                                                                  421 
                                                                                                  429  virtual void
                                                                                                  430  prepare_solution() =0;
                                                                                                  431 
                                                                                                  433 
                                                                                                  445  virtual idx_t
                                                                                                  446  get_first_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  449 
                                                                                                  451 
                                                                                                  464  virtual idx_t
                                                                                                  465  get_last_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  468 
                                                                                                  470 
                                                                                                  501  virtual void
                                                                                                  502  run_solution(idx_t first_step_index ,
                                                                                                  503  idx_t last_step_index ) =0;
                                                                                                  504 
                                                                                                  506 
                                                                                                  531  virtual void
                                                                                                  532  run_solution(idx_t step_index ) =0;
                                                                                                  533 
                                                                                                  535 
                                                                                                  541  virtual void
                                                                                                  542  end_solution() =0;
                                                                                                  543 
                                                                                                  545 
                                                                                                  550  virtual yk_stats_ptr
                                                                                                  551  get_stats() =0;
                                                                                                  552 
                                                                                                  554 
                                                                                                  559  virtual bool
                                                                                                  560  is_auto_tuner_enabled() const =0;
                                                                                                  561 
                                                                                                  562  /* Advanced APIs for yk_solution found below are not needed for most applications. */
                                                                                                  563 
                                                                                                  565 
                                                                                                  592  virtual void
                                                                                                  593  set_region_size(const std::string& dim,
                                                                                                  597  idx_t size ) =0;
                                                                                                  599 
                                                                                                  601 
                                                                                                  606  virtual idx_t
                                                                                                  607  get_region_size(const std::string& dim) const =0;
                                                                                                  611 
                                                                                                  613 
                                                                                                  637  virtual void
                                                                                                  638  set_min_pad_size(const std::string& dim,
                                                                                                  641  idx_t size ) =0;
                                                                                                  644 
                                                                                                  646 
                                                                                                  649  virtual idx_t
                                                                                                  650  get_min_pad_size(const std::string& dim) const =0;
                                                                                                  653 
                                                                                                  655 
                                                                                                  666  virtual void
                                                                                                  667  reset_auto_tuner(bool enable,
                                                                                                  670  bool verbose = false ) =0;
                                                                                                  673 
                                                                                                  675 
                                                                                                  693  virtual void
                                                                                                  694  run_auto_tuner_now(bool verbose = true ) =0;
                                                                                                  697 
                                                                                                  699 
                                                                                                  747  virtual yk_var_ptr
                                                                                                  748  new_var(const std::string& name,
                                                                                                  751  const std::vector<std::string>& dims ) =0;
                                                                                                  755 
                                                                                                  756 #ifndef SWIG
                                                                                                  757 
                                                                                                  764  virtual yk_var_ptr
                                                                                                  765  new_var(const std::string& name,
                                                                                                  768  const std::initializer_list<std::string>& dims ) =0;
                                                                                                  772 #endif
                                                                                                  773 
                                                                                                  775 
                                                                                                  822  virtual yk_var_ptr
                                                                                                  823  new_fixed_size_var(const std::string& name,
                                                                                                  826  const std::vector<std::string>& dims,
                                                                                                  830  const std::vector<idx_t>& dim_sizes ) =0;
                                                                                                  833 
                                                                                                  834 #ifndef SWIG
                                                                                                  835 
                                                                                                  842  virtual yk_var_ptr
                                                                                                  843  new_fixed_size_var(const std::string& name,
                                                                                                  846  const std::initializer_list<std::string>& dims,
                                                                                                  850  const std::initializer_list<idx_t>& dim_sizes ) =0;
                                                                                                  853 #endif
                                                                                                  854 
                                                                                                  856 
                                                                                                  867  virtual bool
                                                                                                  868  set_default_numa_preferred(int numa_node) =0;
                                                                                                  879 
                                                                                                  881 
                                                                                                  884  virtual int
                                                                                                  885  get_default_numa_preferred() const =0;
                                                                                                  886 
                                                                                                  887 #ifndef SWIG
                                                                                                  888  typedef std::function<void(yk_solution&)> hook_fn_t;
                                                                                                  890 
                                                                                                  892  typedef std::function<void(yk_solution& soln,
                                                                                                  893  idx_t first_step_index,
                                                                                                  894  idx_t last_step_index)> hook_fn_2idx_t;
                                                                                                  895 
                                                                                                  897 
                                                                                                  905  virtual void
                                                                                                  908 
                                                                                                  910 
                                                                                                  918  virtual void
                                                                                                  921 
                                                                                                  923 
                                                                                                  933  virtual void
                                                                                                  936 
                                                                                                  938 
                                                                                                  948  virtual void
                                                                                                  951 #endif
                                                                                                  952 
                                                                                                  954 
                                                                                                  959  virtual void
                                                                                                  960  fuse_vars(yk_solution_ptr source) =0;
                                                                                                  962 
                                                                                                  964  virtual void
                                                                                                  965  set_step_wrap(bool do_wrap) =0;
                                                                                                  967 
                                                                                                  969 
                                                                                                  972  virtual bool
                                                                                                  973  get_step_wrap() const =0;
                                                                                                  974 
                                                                                                  976  inline int
                                                                                                  977  get_num_grids() const {
                                                                                                  978  return get_num_vars();
                                                                                                  979  }
                                                                                                  980 
                                                                                                  982  inline yk_var_ptr
                                                                                                  983  get_grid(const std::string& name) {
                                                                                                  984  return get_var(name);
                                                                                                  985  }
                                                                                                  986 
                                                                                                  988  inline std::vector<yk_var_ptr>
                                                                                                  990  return get_vars();
                                                                                                  991  }
                                                                                                  992 
                                                                                                  994  inline yk_var_ptr
                                                                                                  995  new_grid(const std::string& name,
                                                                                                  996  const std::vector<std::string>& dims) {
                                                                                                  997  return new_var(name, dims);
                                                                                                  998  }
                                                                                                  999 
                                                                                                  1000 #ifndef SWIG
                                                                                                  1001  inline yk_var_ptr
                                                                                                  1003  new_grid(const std::string& name,
                                                                                                  1004  const std::initializer_list<std::string>& dims) {
                                                                                                  1005  return new_var(name, dims);
                                                                                                  1006  }
                                                                                                  1007 #endif
                                                                                                  1008 
                                                                                                  1010  inline yk_var_ptr
                                                                                                  1011  new_fixed_size_grid(const std::string& name,
                                                                                                  1012  const std::vector<std::string>& dims,
                                                                                                  1013  const std::vector<idx_t>& dim_sizes) {
                                                                                                  1014  return new_fixed_size_var(name, dims, dim_sizes);
                                                                                                  1015  }
                                                                                                  1016 
                                                                                                  1017 #ifndef SWIG
                                                                                                  1018  inline yk_var_ptr
                                                                                                  1020  new_fixed_size_grid(const std::string& name,
                                                                                                  1021  const std::initializer_list<std::string>& dims,
                                                                                                  1022  const std::vector<idx_t>& dim_sizes) {
                                                                                                  1023  return new_fixed_size_var(name, dims, dim_sizes);
                                                                                                  1024  }
                                                                                                  1025 #endif
                                                                                                  1026 
                                                                                                  1028  inline void
                                                                                                  1030  fuse_vars(source);
                                                                                                  1031  }
                                                                                                  1032  }; // yk_solution.
                                                                                                  1033 
                                                                                                  1035 
                                                                                                  1040  class yk_stats {
                                                                                                  1041  public:
                                                                                                  1042  virtual ~yk_stats() {}
                                                                                                  1043 
                                                                                                  1045 
                                                                                                  1051  virtual idx_t
                                                                                                  1052  get_num_elements() =0;
                                                                                                  1053 
                                                                                                  1055 
                                                                                                  1059  virtual idx_t
                                                                                                  1060  get_num_steps_done() =0;
                                                                                                  1061 
                                                                                                  1063 
                                                                                                  1067  virtual idx_t
                                                                                                  1068  get_num_writes_done() =0;
                                                                                                  1069 
                                                                                                  1071 
                                                                                                  1077  virtual idx_t
                                                                                                  1078  get_est_fp_ops_done() =0;
                                                                                                  1079 
                                                                                                  1081 
                                                                                                  1085  virtual double
                                                                                                  1086  get_elapsed_secs() =0;
                                                                                                  1087  }; // yk_stats.
                                                                                                  1088 
                                                                                                  1090 } // namespace yask.
                                                                                                  virtual void reset_auto_tuner(bool enable, bool verbose=false)=0
                                                                                                  [Advanced] Restart or disable the auto-tuner on this rank.
                                                                                                  +Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2022, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_kernel_api.hpp"
                                                                                                  35 
                                                                                                  36 namespace yask {
                                                                                                  37 
                                                                                                  43 
                                                                                                  49  const int yask_numa_local = -1;
                                                                                                  50 
                                                                                                  52 
                                                                                                  57  const int yask_numa_interleave = -2;
                                                                                                  58 
                                                                                                  60 
                                                                                                  65  const int yask_numa_none = -9;
                                                                                                  66 
                                                                                                  68 
                                                                                                  73  const int yask_numa_offload = -11;
                                                                                                  74 
                                                                                                  76 
                                                                                                  82  class yk_solution {
                                                                                                  83  public:
                                                                                                  84  virtual ~yk_solution() {}
                                                                                                  85 
                                                                                                  87 
                                                                                                  90  virtual const std::string&
                                                                                                  91  get_name() const =0;
                                                                                                  92 
                                                                                                  94 
                                                                                                  99  virtual std::string
                                                                                                  100  get_target() const =0;
                                                                                                  101 
                                                                                                  103 
                                                                                                  106  virtual bool
                                                                                                  107  is_offloaded() const =0;
                                                                                                  108 
                                                                                                  110 
                                                                                                  113  virtual int
                                                                                                  114  get_element_bytes() const =0;
                                                                                                  115 
                                                                                                  117 
                                                                                                  122  virtual std::string
                                                                                                  123  get_step_dim_name() const =0;
                                                                                                  124 
                                                                                                  126 
                                                                                                  132  virtual int
                                                                                                  133  get_num_domain_dims() const =0;
                                                                                                  134 
                                                                                                  136 
                                                                                                  141  virtual string_vec
                                                                                                  142  get_domain_dim_names() const =0;
                                                                                                  143 
                                                                                                  145 
                                                                                                  153  virtual string_vec
                                                                                                  154  get_misc_dim_names() const =0;
                                                                                                  155 
                                                                                                  157 
                                                                                                  179  virtual void
                                                                                                  180  set_rank_domain_size(const std::string& dim,
                                                                                                  183  idx_t size ) =0;
                                                                                                  184 
                                                                                                  186 
                                                                                                  189  virtual void
                                                                                                  190  set_rank_domain_size_vec(const idx_t_vec& vals) = 0;
                                                                                                  192 
                                                                                                  193  #ifndef SWIG
                                                                                                  194 
                                                                                                  198  virtual void
                                                                                                  199  set_rank_domain_size_vec(const idx_t_init_list& vals) = 0;
                                                                                                  201  #endif
                                                                                                  202 
                                                                                                  204 
                                                                                                  213  virtual idx_t
                                                                                                  214  get_rank_domain_size(const std::string& dim) const =0;
                                                                                                  217 
                                                                                                  219 
                                                                                                  223  virtual idx_t_vec
                                                                                                  224  get_rank_domain_size_vec() const =0;
                                                                                                  225 
                                                                                                  227 
                                                                                                  236  virtual void
                                                                                                  237  set_overall_domain_size(const std::string& dim,
                                                                                                  240  idx_t size ) =0;
                                                                                                  241 
                                                                                                  243 
                                                                                                  246  virtual void
                                                                                                  247  set_overall_domain_size_vec(const idx_t_vec& vals) = 0;
                                                                                                  249 
                                                                                                  250  #ifndef SWIG
                                                                                                  251 
                                                                                                  255  virtual void
                                                                                                  258  #endif
                                                                                                  259 
                                                                                                  261 
                                                                                                  273  virtual idx_t
                                                                                                  274  get_overall_domain_size(const std::string& dim ) const =0;
                                                                                                  277 
                                                                                                  279 
                                                                                                  284  virtual idx_t_vec
                                                                                                  285  get_overall_domain_size_vec() const =0;
                                                                                                  286 
                                                                                                  288 
                                                                                                  308  virtual void
                                                                                                  309  set_block_size(const std::string& dim,
                                                                                                  313  idx_t size ) =0;
                                                                                                  315 
                                                                                                  317 
                                                                                                  324  virtual void
                                                                                                  325  set_block_size_vec(const idx_t_vec& vals) = 0;
                                                                                                  327 
                                                                                                  328  #ifndef SWIG
                                                                                                  329 
                                                                                                  337  virtual void
                                                                                                  338  set_block_size_vec(const idx_t_init_list& vals) = 0;
                                                                                                  340  #endif
                                                                                                  341 
                                                                                                  343 
                                                                                                  348  virtual idx_t
                                                                                                  349  get_block_size(const std::string& dim) const =0;
                                                                                                  353 
                                                                                                  355 
                                                                                                  364  virtual idx_t_vec
                                                                                                  365  get_block_size_vec() const =0;
                                                                                                  366 
                                                                                                  368 
                                                                                                  395  virtual void
                                                                                                  396  set_num_ranks(const std::string& dim,
                                                                                                  399  idx_t num ) =0;
                                                                                                  400 
                                                                                                  402 
                                                                                                  405  virtual void
                                                                                                  406  set_num_ranks_vec(const idx_t_vec& vals) = 0;
                                                                                                  408 
                                                                                                  409  #ifndef SWIG
                                                                                                  410 
                                                                                                  414  virtual void
                                                                                                  415  set_num_ranks_vec(const idx_t_init_list& vals) = 0;
                                                                                                  417  #endif
                                                                                                  418 
                                                                                                  420 
                                                                                                  427  virtual idx_t
                                                                                                  428  get_num_ranks(const std::string& dim) const =0;
                                                                                                  431 
                                                                                                  433 
                                                                                                  437  virtual idx_t_vec
                                                                                                  438  get_num_ranks_vec() const =0;
                                                                                                  439 
                                                                                                  441 
                                                                                                  466  virtual void
                                                                                                  467  set_rank_index(const std::string& dim,
                                                                                                  470  idx_t num ) =0;
                                                                                                  471 
                                                                                                  473 
                                                                                                  476  virtual void
                                                                                                  477  set_rank_index_vec(const idx_t_vec& vals) = 0;
                                                                                                  479 
                                                                                                  480  #ifndef SWIG
                                                                                                  481 
                                                                                                  485  virtual void
                                                                                                  486  set_rank_index_vec(const idx_t_init_list& vals) = 0;
                                                                                                  488  #endif
                                                                                                  489 
                                                                                                  491 
                                                                                                  496  virtual idx_t
                                                                                                  497  get_rank_index(const std::string& dim ) const =0;
                                                                                                  500 
                                                                                                  502 
                                                                                                  506  virtual idx_t_vec
                                                                                                  507  get_rank_index_vec() const =0;
                                                                                                  508 
                                                                                                  510 
                                                                                                  523  virtual std::string
                                                                                                  524  apply_command_line_options(const std::string& args ) =0;
                                                                                                  526 
                                                                                                  528 
                                                                                                  535  virtual std::string
                                                                                                  536  apply_command_line_options(int argc, char* argv[]) =0;
                                                                                                  537 
                                                                                                  539 
                                                                                                  545  virtual std::string
                                                                                                  546  apply_command_line_options(const string_vec& args) =0;
                                                                                                  547 
                                                                                                  549 
                                                                                                  556  virtual int
                                                                                                  557  get_num_vars() const =0;
                                                                                                  558 
                                                                                                  560 
                                                                                                  564  virtual yk_var_ptr
                                                                                                  565  get_var(const std::string& name ) =0;
                                                                                                  567 
                                                                                                  569 
                                                                                                  572  virtual std::vector<yk_var_ptr>
                                                                                                  573  get_vars() =0;
                                                                                                  574 
                                                                                                  576 
                                                                                                  586  virtual void
                                                                                                  587  prepare_solution() =0;
                                                                                                  588 
                                                                                                  590 
                                                                                                  602  virtual idx_t
                                                                                                  603  get_first_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  606 
                                                                                                  608 
                                                                                                  612  virtual idx_t_vec
                                                                                                  614 
                                                                                                  616 
                                                                                                  629  virtual idx_t
                                                                                                  630  get_last_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  633 
                                                                                                  635 
                                                                                                  639  virtual idx_t_vec
                                                                                                  641 
                                                                                                  643 
                                                                                                  677  virtual void
                                                                                                  678  run_solution(idx_t first_step_index ,
                                                                                                  679  idx_t last_step_index ) =0;
                                                                                                  680 
                                                                                                  682 
                                                                                                  707  virtual void
                                                                                                  708  run_solution(idx_t step_index ) =0;
                                                                                                  709 
                                                                                                  711 
                                                                                                  723  virtual void
                                                                                                  724  copy_vars_to_device() const =0;
                                                                                                  725 
                                                                                                  727 
                                                                                                  739  virtual void
                                                                                                  740  copy_vars_from_device() const =0;
                                                                                                  741 
                                                                                                  743 
                                                                                                  749  virtual void
                                                                                                  750  end_solution() =0;
                                                                                                  751 
                                                                                                  753 
                                                                                                  760  virtual yk_stats_ptr
                                                                                                  761  get_stats() =0;
                                                                                                  762 
                                                                                                  764 
                                                                                                  775  virtual void
                                                                                                  776  reset_auto_tuner(bool enable,
                                                                                                  779  bool verbose = false ) =0;
                                                                                                  782 
                                                                                                  784 
                                                                                                  792  virtual bool
                                                                                                  793  is_auto_tuner_enabled() const =0;
                                                                                                  794 
                                                                                                  796 
                                                                                                  815  virtual void
                                                                                                  816  run_auto_tuner_now(bool verbose = true ) =0;
                                                                                                  819 
                                                                                                  820  /* Advanced APIs for yk_solution found below are not needed for most applications. */
                                                                                                  821 
                                                                                                  823 
                                                                                                  847  virtual void
                                                                                                  848  set_min_pad_size(const std::string& dim,
                                                                                                  851  idx_t size ) =0;
                                                                                                  854 
                                                                                                  856 
                                                                                                  859  virtual idx_t
                                                                                                  860  get_min_pad_size(const std::string& dim) const =0;
                                                                                                  863 
                                                                                                  865 
                                                                                                  913  virtual yk_var_ptr
                                                                                                  914  new_var(const std::string& name,
                                                                                                  917  const string_vec& dims ) =0;
                                                                                                  921 
                                                                                                  922 #ifndef SWIG
                                                                                                  923 
                                                                                                  930  virtual yk_var_ptr
                                                                                                  931  new_var(const std::string& name,
                                                                                                  934  const std::initializer_list<std::string>& dims ) =0;
                                                                                                  938 #endif
                                                                                                  939 
                                                                                                  941 
                                                                                                  988  virtual yk_var_ptr
                                                                                                  989  new_fixed_size_var(const std::string& name,
                                                                                                  992  const string_vec& dims,
                                                                                                  996  const idx_t_vec& dim_sizes ) =0;
                                                                                                  999 
                                                                                                  1000 #ifndef SWIG
                                                                                                  1001 
                                                                                                  1008  virtual yk_var_ptr
                                                                                                  1009  new_fixed_size_var(const std::string& name,
                                                                                                  1012  const std::initializer_list<std::string>& dims,
                                                                                                  1016  const idx_t_init_list& dim_sizes ) =0;
                                                                                                  1019 #endif
                                                                                                  1020 
                                                                                                  1022 
                                                                                                  1033  virtual bool
                                                                                                  1034  set_default_numa_preferred(int numa_node) =0;
                                                                                                  1045 
                                                                                                  1047 
                                                                                                  1050  virtual int
                                                                                                  1051  get_default_numa_preferred() const =0;
                                                                                                  1052 
                                                                                                  1053 #ifndef SWIG
                                                                                                  1054  typedef std::function<void(yk_solution&)> hook_fn_t;
                                                                                                  1056 
                                                                                                  1058  typedef std::function<void(yk_solution& soln,
                                                                                                  1059  idx_t first_step_index,
                                                                                                  1060  idx_t last_step_index)> hook_fn_2idx_t;
                                                                                                  1061 
                                                                                                  1063 
                                                                                                  1071  virtual void
                                                                                                  1074 
                                                                                                  1076 
                                                                                                  1084  virtual void
                                                                                                  1087 
                                                                                                  1089 
                                                                                                  1099  virtual void
                                                                                                  1102 
                                                                                                  1104 
                                                                                                  1114  virtual void
                                                                                                  1117 #endif
                                                                                                  1118 
                                                                                                  1120 
                                                                                                  1125  virtual void
                                                                                                  1126  fuse_vars(yk_solution_ptr source) =0;
                                                                                                  1128 
                                                                                                  1130  virtual void
                                                                                                  1131  set_step_wrap(bool do_wrap) =0;
                                                                                                  1133 
                                                                                                  1135 
                                                                                                  1138  virtual bool
                                                                                                  1139  get_step_wrap() const =0;
                                                                                                  1140 
                                                                                                  1143  virtual void
                                                                                                  1145 
                                                                                                  1148  inline int
                                                                                                  1149  get_num_grids() const {
                                                                                                  1150  return get_num_vars();
                                                                                                  1151  }
                                                                                                  1152 
                                                                                                  1155  inline yk_var_ptr
                                                                                                  1156  get_grid(const std::string& name) {
                                                                                                  1157  return get_var(name);
                                                                                                  1158  }
                                                                                                  1159 
                                                                                                  1162  inline std::vector<yk_var_ptr>
                                                                                                  1164  return get_vars();
                                                                                                  1165  }
                                                                                                  1166 
                                                                                                  1169  inline yk_var_ptr
                                                                                                  1170  new_grid(const std::string& name,
                                                                                                  1171  const string_vec& dims) {
                                                                                                  1172  return new_var(name, dims);
                                                                                                  1173  }
                                                                                                  1174 
                                                                                                  1175 #ifndef SWIG
                                                                                                  1178  inline yk_var_ptr
                                                                                                  1179  new_grid(const std::string& name,
                                                                                                  1180  const std::initializer_list<std::string>& dims) {
                                                                                                  1181  return new_var(name, dims);
                                                                                                  1182  }
                                                                                                  1183 #endif
                                                                                                  1184 
                                                                                                  1187  inline yk_var_ptr
                                                                                                  1188  new_fixed_size_grid(const std::string& name,
                                                                                                  1189  const string_vec& dims,
                                                                                                  1190  const idx_t_vec& dim_sizes) {
                                                                                                  1191  return new_fixed_size_var(name, dims, dim_sizes);
                                                                                                  1192  }
                                                                                                  1193 
                                                                                                  1194 #ifndef SWIG
                                                                                                  1197  inline yk_var_ptr
                                                                                                  1198  new_fixed_size_grid(const std::string& name,
                                                                                                  1199  const std::initializer_list<std::string>& dims,
                                                                                                  1200  const idx_t_vec& dim_sizes) {
                                                                                                  1201  return new_fixed_size_var(name, dims, dim_sizes);
                                                                                                  1202  }
                                                                                                  1203 #endif
                                                                                                  1204 
                                                                                                  1207  inline void
                                                                                                  1209  fuse_vars(source);
                                                                                                  1210  }
                                                                                                  1211  }; // yk_solution.
                                                                                                  1212 
                                                                                                  1214 
                                                                                                  1219  class yk_stats {
                                                                                                  1220  public:
                                                                                                  1221  virtual ~yk_stats() {}
                                                                                                  1222 
                                                                                                  1224 
                                                                                                  1230  virtual idx_t
                                                                                                  1231  get_num_elements() =0;
                                                                                                  1232 
                                                                                                  1234 
                                                                                                  1238  virtual idx_t
                                                                                                  1239  get_num_steps_done() =0;
                                                                                                  1240 
                                                                                                  1242 
                                                                                                  1246  virtual idx_t
                                                                                                  1247  get_num_writes_done() =0;
                                                                                                  1248 
                                                                                                  1250 
                                                                                                  1256  virtual idx_t
                                                                                                  1257  get_est_fp_ops_done() =0;
                                                                                                  1258 
                                                                                                  1260 
                                                                                                  1264  virtual double
                                                                                                  1265  get_elapsed_secs() =0;
                                                                                                  1266  }; // yk_stats.
                                                                                                  1267 
                                                                                                  1269 } // namespace yask.
                                                                                                  virtual yk_var_ptr new_fixed_size_var(const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)=0
                                                                                                  [Advanced] Add a new var to the solution with a specified size.
                                                                                                  +
                                                                                                  virtual string_vec get_misc_dim_names() const =0
                                                                                                  Get all the miscellaneous dimension names.
                                                                                                  +
                                                                                                  std::initializer_list< idx_t > idx_t_init_list
                                                                                                  Initializer list of indices.
                                                                                                  Definition: yask_common_api.hpp:87
                                                                                                  +
                                                                                                  virtual void reset_auto_tuner(bool enable, bool verbose=false)=0
                                                                                                  Start or stop the online auto-tuner on this rank.
                                                                                                  virtual idx_t get_num_writes_done()=0
                                                                                                  Get the number of elements written across all steps.
                                                                                                  -
                                                                                                  void fuse_grids(yk_solution_ptr source)
                                                                                                  [Deprecated] Use fuse_vars().
                                                                                                  Definition: yk_solution_api.hpp:1029
                                                                                                  -
                                                                                                  virtual yk_var_ptr new_fixed_size_var(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)=0
                                                                                                  [Advanced] Add a new var to the solution with a specified size.
                                                                                                  +
                                                                                                  virtual bool is_offloaded() const =0
                                                                                                  Get whether the stencil kernel will be offloaded to a device.
                                                                                                  virtual void prepare_solution()=0
                                                                                                  Prepare the solution for stencil application.
                                                                                                  -
                                                                                                  virtual std::vector< std::string > get_domain_dim_names() const =0
                                                                                                  Get all the domain dimension names.
                                                                                                  -
                                                                                                  Statistics from calls to run_solution().
                                                                                                  Definition: yk_solution_api.hpp:1040
                                                                                                  -
                                                                                                  virtual std::vector< std::string > get_misc_dim_names() const =0
                                                                                                  Get all the miscellaneous dimension names.
                                                                                                  +
                                                                                                  Statistics from calls to run_solution().
                                                                                                  Definition: yk_solution_api.hpp:1219
                                                                                                  +
                                                                                                  virtual yk_var_ptr new_var(const std::string &name, const string_vec &dims)=0
                                                                                                  [Advanced] Add a new var to the solution.
                                                                                                  const int yask_numa_local
                                                                                                  Allocate vars on local NUMA node.
                                                                                                  Definition: yk_solution_api.hpp:49
                                                                                                  virtual std::vector< yk_var_ptr > get_vars()=0
                                                                                                  Get all the vars.
                                                                                                  virtual bool set_default_numa_preferred(int numa_node)=0
                                                                                                  [Advanced] Set the default preferred NUMA node on which to allocate data.
                                                                                                  @@ -85,62 +86,79 @@
                                                                                                  virtual void call_before_run_solution(hook_fn_2idx_t hook_fn)=0
                                                                                                  [Advanced] Register a hook function to be called at the beginning of yk_solution::run_solution().
                                                                                                  const int yask_numa_none
                                                                                                  Do not specify any NUMA binding.
                                                                                                  Definition: yk_solution_api.hpp:65
                                                                                                  virtual void set_block_size(const std::string &dim, idx_t size)=0
                                                                                                  Set the block size in the given dimension.
                                                                                                  -
                                                                                                  yk_var_ptr new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const std::vector< idx_t > &dim_sizes)
                                                                                                  [Deprecated] Use new_fixed_size_var().
                                                                                                  Definition: yk_solution_api.hpp:1020
                                                                                                  -
                                                                                                  std::function< void(yk_solution &)> hook_fn_t
                                                                                                  [Advanced] Callback type with yk_solution parameter.
                                                                                                  Definition: yk_solution_api.hpp:889
                                                                                                  -
                                                                                                  std::vector< yk_var_ptr > get_grids()
                                                                                                  [Deprecated] Use get_vars().
                                                                                                  Definition: yk_solution_api.hpp:989
                                                                                                  -
                                                                                                  yk_var_ptr new_grid(const std::string &name, const std::initializer_list< std::string > &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yk_solution_api.hpp:1003
                                                                                                  +
                                                                                                  std::function< void(yk_solution &)> hook_fn_t
                                                                                                  [Advanced] Callback type with yk_solution parameter.
                                                                                                  Definition: yk_solution_api.hpp:1055
                                                                                                  +
                                                                                                  virtual idx_t_vec get_num_ranks_vec() const =0
                                                                                                  Get the number of MPI ranks in all domain dimensions.
                                                                                                  +
                                                                                                  std::vector< idx_t > idx_t_vec
                                                                                                  Vector of indices.
                                                                                                  Definition: yask_common_api.hpp:80
                                                                                                  +
                                                                                                  virtual string_vec get_domain_dim_names() const =0
                                                                                                  Get all the domain dimension names.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_rank_index_vec() const =0
                                                                                                  Get the rank index in all domain dimensions.
                                                                                                  +
                                                                                                  virtual void set_num_ranks_vec(const idx_t_vec &vals)=0
                                                                                                  Set the number of MPI ranks in all domain dimensions.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_rank_domain_size_vec() const =0
                                                                                                  Get the local-domain size in all domain dimensions.
                                                                                                  const int yask_numa_interleave
                                                                                                  Allocate vars across all available NUMA nodes.
                                                                                                  Definition: yk_solution_api.hpp:57
                                                                                                  virtual int get_default_numa_preferred() const =0
                                                                                                  [Advanced] Get the default preferred NUMA node on which to allocate data.
                                                                                                  +
                                                                                                  YASK_DEPRECATED int get_num_grids() const
                                                                                                  [Deprecated] Use get_num_vars().
                                                                                                  Definition: yk_solution_api.hpp:1149
                                                                                                  +
                                                                                                  virtual void set_rank_domain_size_vec(const idx_t_vec &vals)=0
                                                                                                  Set the local-domain size in all domain dimensions.
                                                                                                  virtual double get_elapsed_secs()=0
                                                                                                  Get the number of seconds elapsed during calls to run_solution().
                                                                                                  virtual idx_t get_first_rank_domain_index(const std::string &dim) const =0
                                                                                                  Get the first index of the sub-domain in this rank in the specified dimension.
                                                                                                  -
                                                                                                  yk_var_ptr new_fixed_size_grid(const std::string &name, const std::vector< std::string > &dims, const std::vector< idx_t > &dim_sizes)
                                                                                                  [Deprecated] Use new_fixed_size_var().
                                                                                                  Definition: yk_solution_api.hpp:1011
                                                                                                  virtual int get_num_domain_dims() const =0
                                                                                                  Get the number of domain dimensions used in this solution.
                                                                                                  virtual idx_t get_min_pad_size(const std::string &dim) const =0
                                                                                                  [Advanced] Get the minimum amount of padding for all vars.
                                                                                                  +
                                                                                                  YASK_INT64_T idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:77
                                                                                                  virtual int get_element_bytes() const =0
                                                                                                  Get the floating-point precision size.
                                                                                                  virtual idx_t get_rank_index(const std::string &dim) const =0
                                                                                                  Get the rank index in the specified dimension.
                                                                                                  -
                                                                                                  std::function< void(yk_solution &soln, idx_t first_step_index, idx_t last_step_index)> hook_fn_2idx_t
                                                                                                  [Advanced] Callback type with yk_solution and step-index parameters.
                                                                                                  Definition: yk_solution_api.hpp:894
                                                                                                  +
                                                                                                  std::function< void(yk_solution &soln, idx_t first_step_index, idx_t last_step_index)> hook_fn_2idx_t
                                                                                                  [Advanced] Callback type with yk_solution and step-index parameters.
                                                                                                  Definition: yk_solution_api.hpp:1060
                                                                                                  virtual bool get_step_wrap() const =0
                                                                                                  [Advanced] Get whether invalid step indices alias to valid ones.
                                                                                                  virtual idx_t get_overall_domain_size(const std::string &dim) const =0
                                                                                                  Get the global-domain size in the specified dimension, i.e., the total size across all MPI ranks.
                                                                                                  virtual idx_t get_rank_domain_size(const std::string &dim) const =0
                                                                                                  Get the local-domain size in the specified dimension, i.e., the size in this rank.
                                                                                                  -
                                                                                                  virtual yk_var_ptr new_var(const std::string &name, const std::vector< std::string > &dims)=0
                                                                                                  [Advanced] Add a new var to the solution.
                                                                                                  -
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:66
                                                                                                  +
                                                                                                  std::shared_ptr< yask_output > yask_output_ptr
                                                                                                  Shared pointer to yask_output.
                                                                                                  Definition: yask_common_api.hpp:94
                                                                                                  +
                                                                                                  virtual idx_t_vec get_block_size_vec() const =0
                                                                                                  Get the block size in all domain dimensions.
                                                                                                  virtual idx_t get_last_rank_domain_index(const std::string &dim) const =0
                                                                                                  Get the last index of the sub-domain in this rank the specified dimension.
                                                                                                  virtual void call_after_prepare_solution(hook_fn_t hook_fn)=0
                                                                                                  [Advanced] Register a hook function to be called at the end of yk_solution::prepare_solution().
                                                                                                  -
                                                                                                  virtual void set_region_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the region size in the given dimension.
                                                                                                  -
                                                                                                  virtual bool is_auto_tuner_enabled() const =0
                                                                                                  Determine whether the auto-tuner is enabled on this rank.
                                                                                                  -
                                                                                                  virtual void run_auto_tuner_now(bool verbose=true)=0
                                                                                                  [Advanced] Automatically tune selected settings immediately.
                                                                                                  +
                                                                                                  YASK_DEPRECATED yk_var_ptr new_fixed_size_grid(const std::string &name, const string_vec &dims, const idx_t_vec &dim_sizes)
                                                                                                  [Deprecated] Use new_fixed_size_var().
                                                                                                  Definition: yk_solution_api.hpp:1188
                                                                                                  +
                                                                                                  YASK_DEPRECATED yk_var_ptr new_grid(const std::string &name, const std::initializer_list< std::string > &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yk_solution_api.hpp:1179
                                                                                                  +
                                                                                                  virtual idx_t_vec get_first_rank_domain_index_vec() const =0
                                                                                                  Get the first index of the sub-domain in this rank in all domain dimensions.
                                                                                                  +
                                                                                                  virtual bool is_auto_tuner_enabled() const =0
                                                                                                  Determine whether the online auto-tuner is enabled on this rank.
                                                                                                  +
                                                                                                  virtual void set_overall_domain_size_vec(const idx_t_vec &vals)=0
                                                                                                  Set the global-domain size in all domain dimensions.
                                                                                                  +
                                                                                                  virtual void set_block_size_vec(const idx_t_vec &vals)=0
                                                                                                  Set the block size in all domain dimensions.
                                                                                                  +
                                                                                                  virtual void run_auto_tuner_now(bool verbose=true)=0
                                                                                                  Run the offline auto-tuner immediately, not preserving variable data.
                                                                                                  +
                                                                                                  virtual void copy_vars_from_device() const =0
                                                                                                  Update data on the host.
                                                                                                  +
                                                                                                  YASK_DEPRECATED yk_var_ptr new_fixed_size_grid(const std::string &name, const std::initializer_list< std::string > &dims, const idx_t_vec &dim_sizes)
                                                                                                  [Deprecated] Use new_fixed_size_var().
                                                                                                  Definition: yk_solution_api.hpp:1198
                                                                                                  virtual idx_t get_num_steps_done()=0
                                                                                                  Get the number of steps executed via run_solution().
                                                                                                  virtual void call_after_run_solution(hook_fn_2idx_t hook_fn)=0
                                                                                                  [Advanced] Register a hook function to be called at the end of yk_solution::run_solution().
                                                                                                  virtual idx_t get_num_elements()=0
                                                                                                  Get the number of elements in the overall domain.
                                                                                                  -
                                                                                                  yk_var_ptr get_grid(const std::string &name)
                                                                                                  [Deprecated] Use get_var().
                                                                                                  Definition: yk_solution_api.hpp:983
                                                                                                  -
                                                                                                  virtual void set_debug_output(yask_output_ptr debug)=0
                                                                                                  Set object to receive debug output.
                                                                                                  +
                                                                                                  YASK_DEPRECATED void fuse_grids(yk_solution_ptr source)
                                                                                                  [Deprecated] Use fuse_vars().
                                                                                                  Definition: yk_solution_api.hpp:1208
                                                                                                  +
                                                                                                  YASK_DEPRECATED yk_var_ptr get_grid(const std::string &name)
                                                                                                  [Deprecated] Use get_var().
                                                                                                  Definition: yk_solution_api.hpp:1156
                                                                                                  virtual std::string get_target() const =0
                                                                                                  Get the target ISA.
                                                                                                  -
                                                                                                  int get_num_grids() const
                                                                                                  [Deprecated] Use get_num_vars().
                                                                                                  Definition: yk_solution_api.hpp:977
                                                                                                  virtual void set_rank_index(const std::string &dim, idx_t num)=0
                                                                                                  Set the rank index in the specified dimension.
                                                                                                  virtual int get_num_vars() const =0
                                                                                                  Get the number of vars in the solution.
                                                                                                  +
                                                                                                  const int yask_numa_offload
                                                                                                  Do not specify any NUMA binding and use allocations optimized for offloading.
                                                                                                  Definition: yk_solution_api.hpp:73
                                                                                                  +
                                                                                                  virtual YASK_DEPRECATED void set_debug_output(yask_output_ptr debug)=0
                                                                                                  [Deprecated] Use yk_env::set_debug_output().
                                                                                                  +
                                                                                                  #define YASK_DEPRECATED
                                                                                                  Deprecated attribute.
                                                                                                  Definition: yask_common_api.hpp:55
                                                                                                  virtual void call_before_prepare_solution(hook_fn_t hook_fn)=0
                                                                                                  [Advanced] Register a function to be called at the beginning of yk_solution::prepare_solution().
                                                                                                  +
                                                                                                  virtual idx_t_vec get_last_rank_domain_index_vec() const =0
                                                                                                  Get the last index of the sub-domain in this rank in all domain dimensions.
                                                                                                  virtual void set_num_ranks(const std::string &dim, idx_t num)=0
                                                                                                  Set the number of MPI ranks in the given dimension.
                                                                                                  +
                                                                                                  YASK_DEPRECATED std::vector< yk_var_ptr > get_grids()
                                                                                                  [Deprecated] Use get_vars().
                                                                                                  Definition: yk_solution_api.hpp:1163
                                                                                                  virtual void set_overall_domain_size(const std::string &dim, idx_t size)=0
                                                                                                  Set the global-domain size in the specified dimension, i.e., the total size across all MPI ranks.
                                                                                                  +
                                                                                                  std::vector< std::string > string_vec
                                                                                                  Vector of strings.
                                                                                                  Definition: yask_common_api.hpp:90
                                                                                                  virtual idx_t get_num_ranks(const std::string &dim) const =0
                                                                                                  Get the number of MPI ranks in the given dimension.
                                                                                                  virtual yk_var_ptr get_var(const std::string &name)=0
                                                                                                  Get the specified var.
                                                                                                  std::shared_ptr< yk_solution > yk_solution_ptr
                                                                                                  Shared pointer to yk_solution.
                                                                                                  Definition: yask_kernel_api.hpp:56
                                                                                                  -
                                                                                                  yk_var_ptr new_grid(const std::string &name, const std::vector< std::string > &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yk_solution_api.hpp:995
                                                                                                  virtual std::string get_step_dim_name() const =0
                                                                                                  Get the solution step dimension.
                                                                                                  -
                                                                                                  Stencil solution as defined by the generated code from the YASK stencil compiler.
                                                                                                  Definition: yk_solution_api.hpp:74
                                                                                                  -
                                                                                                  virtual idx_t get_region_size(const std::string &dim) const =0
                                                                                                  [Advanced] Get the region size.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_overall_domain_size_vec() const =0
                                                                                                  Get the global-domain size in all domain dimensions.
                                                                                                  +
                                                                                                  Stencil solution as defined by the generated code from the YASK stencil compiler.
                                                                                                  Definition: yk_solution_api.hpp:82
                                                                                                  virtual void set_rank_domain_size(const std::string &dim, idx_t size)=0
                                                                                                  Set the local-domain size in the specified dimension, i.e., the size of the part of the domain that i...
                                                                                                  virtual void set_min_pad_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the minimum amount of padding for all vars.
                                                                                                  +
                                                                                                  YASK_DEPRECATED yk_var_ptr new_grid(const std::string &name, const string_vec &dims)
                                                                                                  [Deprecated] Use new_var().
                                                                                                  Definition: yk_solution_api.hpp:1170
                                                                                                  virtual void end_solution()=0
                                                                                                  Finish using a solution.
                                                                                                  virtual std::string apply_command_line_options(const std::string &args)=0
                                                                                                  Set kernel options from a string.
                                                                                                  +
                                                                                                  virtual void copy_vars_to_device() const =0
                                                                                                  Update data on the device.
                                                                                                  std::shared_ptr< yk_var > yk_var_ptr
                                                                                                  Shared pointer to yk_var.
                                                                                                  Definition: yask_kernel_api.hpp:60
                                                                                                  virtual yk_stats_ptr get_stats()=0
                                                                                                  Get performance statistics associated with preceding calls to run_solution().
                                                                                                  +
                                                                                                  virtual void set_rank_index_vec(const idx_t_vec &vals)=0
                                                                                                  Set the rank index in all domain dimensions.
                                                                                                  virtual void run_solution(idx_t first_step_index, idx_t last_step_index)=0
                                                                                                  Run the stencil solution for the specified steps.
                                                                                                  virtual idx_t get_block_size(const std::string &dim) const =0
                                                                                                  Get the block size.
                                                                                                  virtual void fuse_vars(yk_solution_ptr source)=0
                                                                                                  [Advanced] Merge YASK variables with another solution.
                                                                                                  virtual const std::string & get_name() const =0
                                                                                                  Get the name of the solution.
                                                                                                  virtual idx_t get_est_fp_ops_done()=0
                                                                                                  Get the estimated number of floating-point operations executed across all steps.
                                                                                                  -
                                                                                                  std::int64_t idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:61
                                                                                                  virtual void set_step_wrap(bool do_wrap)=0
                                                                                                  [Advanced] Set whether invalid step indices alias to valid ones.
                                                                                                  diff --git a/docs/api/html/yk__var__api_8hpp_source.html b/docs/api/html/yk__var__api_8hpp_source.html index 49b43f3f..0d091f95 100644 --- a/docs/api/html/yk__var__api_8hpp_source.html +++ b/docs/api/html/yk__var__api_8hpp_source.html @@ -70,68 +70,75 @@
                                                                                                  yk_var_api.hpp
                                                                                                  -Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2021, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_kernel_api.hpp"
                                                                                                  35 
                                                                                                  36 namespace yask {
                                                                                                  37 
                                                                                                  43 
                                                                                                  185  class yk_var {
                                                                                                  186  public:
                                                                                                  187  virtual ~yk_var() {}
                                                                                                  188 
                                                                                                  190 
                                                                                                  196  virtual const std::string& get_name() const =0;
                                                                                                  197 
                                                                                                  199 
                                                                                                  204  virtual int get_num_dims() const =0;
                                                                                                  205 
                                                                                                  207 
                                                                                                  211  virtual std::vector<std::string>
                                                                                                  212  get_dim_names() const =0;
                                                                                                  213 
                                                                                                  215 
                                                                                                  219  virtual bool
                                                                                                  220  is_dim_used(const std::string& dim) const =0;
                                                                                                  221 
                                                                                                  223 
                                                                                                  227  virtual bool is_fixed_size() const =0;
                                                                                                  228 
                                                                                                  230 
                                                                                                  242  virtual idx_t
                                                                                                  243  get_first_local_index(const std::string& dim ) const =0;
                                                                                                  246 
                                                                                                  248 
                                                                                                  260  virtual idx_t
                                                                                                  261  get_last_local_index(const std::string& dim ) const =0;
                                                                                                  264 
                                                                                                  266 
                                                                                                  273  virtual idx_t
                                                                                                  274  get_alloc_size(const std::string& dim ) const =0;
                                                                                                  277 
                                                                                                  279 
                                                                                                  // Pure-virtual const accessor with no arguments; yields an idx_t.
                                                                                                  // NOTE(review): presumably the earliest step index whose data is still stored -- confirm.
                                                                                                  287  virtual idx_t
                                                                                                  288  get_first_valid_step_index() const =0;
                                                                                                  289 
                                                                                                  291 
                                                                                                  // Pure-virtual const accessor with no arguments; yields an idx_t.
                                                                                                  // NOTE(review): presumably the latest step index whose data is still stored -- confirm.
                                                                                                  299  virtual idx_t
                                                                                                  300  get_last_valid_step_index() const =0;
                                                                                                  301 
                                                                                                  303 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the per-rank domain extent in `dim` -- confirm.
                                                                                                  308  virtual idx_t
                                                                                                  309  get_rank_domain_size(const std::string& dim) const =0;
                                                                                                  312 
                                                                                                  314 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the first index of this rank's domain in `dim` -- confirm.
                                                                                                  320  virtual idx_t
                                                                                                  321  get_first_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  324 
                                                                                                  326 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the last index of this rank's domain in `dim`;
                                                                                                  // inclusiveness is not visible here -- confirm.
                                                                                                  333  virtual idx_t
                                                                                                  334  get_last_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  337 
                                                                                                  339 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // The deprecated inline get_halo_size() in this class forwards to this method.
                                                                                                  // NOTE(review): presumably the halo width on the low-index side of `dim` -- confirm.
                                                                                                  343  virtual idx_t
                                                                                                  344  get_left_halo_size(const std::string& dim ) const =0;
                                                                                                  348 
                                                                                                  350 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the halo width on the high-index side of `dim` -- confirm.
                                                                                                  354  virtual idx_t
                                                                                                  355  get_right_halo_size(const std::string& dim ) const =0;
                                                                                                  359 
                                                                                                  361 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the first index of the left halo in `dim` for this rank -- confirm.
                                                                                                  368  virtual idx_t
                                                                                                  369  get_first_rank_halo_index(const std::string& dim ) const =0;
                                                                                                  372 
                                                                                                  374 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the last index of the right halo in `dim` for this rank -- confirm.
                                                                                                  381  virtual idx_t
                                                                                                  382  get_last_rank_halo_index(const std::string& dim ) const =0;
                                                                                                  385 
                                                                                                  387 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // The deprecated inline get_pad_size() in this class forwards to this method.
                                                                                                  // NOTE(review): presumably the padding on the low-index side of `dim`;
                                                                                                  // whether the halo is included is not visible here -- confirm.
                                                                                                  395  virtual idx_t
                                                                                                  396  get_left_pad_size(const std::string& dim ) const =0;
                                                                                                  400 
                                                                                                  402 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the padding on the high-index side of `dim`;
                                                                                                  // whether the halo is included is not visible here -- confirm.
                                                                                                  410  virtual idx_t
                                                                                                  411  get_right_pad_size(const std::string& dim ) const =0;
                                                                                                  415 
                                                                                                  417 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // The deprecated inline get_extra_pad_size() in this class forwards to this method.
                                                                                                  // NOTE(review): presumably left padding beyond the left halo -- confirm.
                                                                                                  422  virtual idx_t
                                                                                                  423  get_left_extra_pad_size(const std::string& dim ) const =0;
                                                                                                  427 
                                                                                                  429 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably right padding beyond the right halo -- confirm.
                                                                                                  434  virtual idx_t
                                                                                                  435  get_right_extra_pad_size(const std::string& dim ) const =0;
                                                                                                  439 
                                                                                                  441 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably only meaningful when `dim` is a "misc" dimension -- confirm.
                                                                                                  445  virtual idx_t
                                                                                                  446  get_first_misc_index(const std::string& dim ) const =0;
                                                                                                  449 
                                                                                                  451 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably only meaningful when `dim` is a "misc" dimension -- confirm.
                                                                                                  455  virtual idx_t
                                                                                                  456  get_last_misc_index(const std::string& dim ) const =0;
                                                                                                  459 
                                                                                                  461 
                                                                                                  // Pure-virtual const predicate over one index tuple, passed as a std::vector<idx_t>.
                                                                                                  // The deprecated inline is_element_allocated(vector) in this class forwards to this overload.
                                                                                                  // NOTE(review): presumably true when the indexed element is stored on this rank;
                                                                                                  // the expected order/count of `indices` is not visible here -- confirm.
                                                                                                  468  virtual bool
                                                                                                  469  are_indices_local(const std::vector<idx_t>& indices ) const =0;
                                                                                                  471 
                                                                                                  472 #ifndef SWIG
                                                                                                  473 
                                                                                                  // std::initializer_list overload of are_indices_local; it sits inside the enclosing
                                                                                                  // #ifndef SWIG guard, so it is not declared when SWIG is defined.
                                                                                                  // NOTE(review): presumably behaves identically to the vector overload -- confirm.
                                                                                                  477  virtual bool
                                                                                                  478  are_indices_local(const std::initializer_list<idx_t>& indices ) const =0;
                                                                                                  480 #endif
                                                                                                  481 
                                                                                                  483 
                                                                                                  // Pure-virtual const read of a single element addressed by an index tuple.
                                                                                                  // The value is returned as double.
                                                                                                  // NOTE(review): behaviour for non-local or out-of-range indices is not visible here -- confirm.
                                                                                                  491  virtual double
                                                                                                  492  get_element(const std::vector<idx_t>& indices ) const =0;
                                                                                                  494 
                                                                                                  495 #ifndef SWIG
                                                                                                  496 
                                                                                                  // std::initializer_list overload of get_element; it sits inside the enclosing
                                                                                                  // #ifndef SWIG guard, so it is not declared when SWIG is defined.
                                                                                                  // NOTE(review): presumably behaves identically to the vector overload -- confirm.
                                                                                                  501  virtual double
                                                                                                  502  get_element(const std::initializer_list<idx_t>& indices ) const =0;
                                                                                                  504 #endif
                                                                                                  505 
                                                                                                  507 
                                                                                                  // Pure-virtual, non-const write of `val` at the element addressed by `indices`.
                                                                                                  // `strict_indices` defaults to true; the method returns an idx_t.
                                                                                                  // NOTE(review): the return presumably counts elements written, and `strict_indices`
                                                                                                  // presumably selects between failing and silently skipping bad indices -- confirm.
                                                                                                  526  virtual idx_t
                                                                                                  527  set_element(double val ,
                                                                                                  528  const std::vector<idx_t>& indices,
                                                                                                  530  bool strict_indices = true ) =0;
                                                                                                  534 
                                                                                                  535 #ifndef SWIG
                                                                                                  536 
                                                                                                  // std::initializer_list overload of set_element; it sits inside the enclosing
                                                                                                  // #ifndef SWIG guard, so it is not declared when SWIG is defined.
                                                                                                  // `strict_indices` defaults to true, matching the vector overload's signature.
                                                                                                  541  virtual idx_t
                                                                                                  542  set_element(double val ,
                                                                                                  543  const std::initializer_list<idx_t>& indices,
                                                                                                  545  bool strict_indices = true ) =0;
                                                                                                  549 #endif
                                                                                                  550 
                                                                                                  552 
                                                                                                  // Pure-virtual const bulk read: takes a writable void* buffer and two index tuples.
                                                                                                  // Returns an idx_t.
                                                                                                  // NOTE(review): presumably copies the slice first_indices..last_indices into a
                                                                                                  // caller-allocated buffer and returns the element count; the buffer's element type,
                                                                                                  // layout and required size are not visible here -- confirm before use.
                                                                                                  569  virtual idx_t
                                                                                                  570  get_elements_in_slice(void* buffer_ptr,
                                                                                                  572  const std::vector<idx_t>& first_indices,
                                                                                                  574  const std::vector<idx_t>& last_indices ) const =0;
                                                                                                  576 
                                                                                                  578 
                                                                                                  // Pure-virtual, non-const update taking a double and an index tuple; returns an idx_t.
                                                                                                  // `strict_indices` defaults to true.
                                                                                                  // NOTE(review): presumably adds `val` to the existing element and returns the number
                                                                                                  // of elements updated; atomicity is not visible here -- confirm.
                                                                                                  593  virtual idx_t
                                                                                                  594  add_to_element(double val ,
                                                                                                  595  const std::vector<idx_t>& indices,
                                                                                                  597  bool strict_indices = true ) =0;
                                                                                                  601 
                                                                                                  602 #ifndef SWIG
                                                                                                  603 
                                                                                                  // std::initializer_list overload of add_to_element; it sits inside the enclosing
                                                                                                  // #ifndef SWIG guard, so it is not declared when SWIG is defined.
                                                                                                  // `strict_indices` defaults to true, matching the vector overload's signature.
                                                                                                  608  virtual idx_t
                                                                                                  609  add_to_element(double val ,
                                                                                                  610  const std::initializer_list<idx_t>& indices,
                                                                                                  612  bool strict_indices = true ) =0;
                                                                                                  616 #endif
                                                                                                  617 
                                                                                                  619 
                                                                                                  // Pure-virtual, non-const setter taking a single double; returns nothing.
                                                                                                  // NOTE(review): presumably writes `val` to every allocated element, and may
                                                                                                  // include padding -- confirm.
                                                                                                  627  virtual void
                                                                                                  628  set_all_elements_same(double val ) =0;
                                                                                                  629 
                                                                                                  631 
                                                                                                  // Pure-virtual, non-const bulk write taking a double and two index tuples.
                                                                                                  // Returns an idx_t; `strict_indices` defaults to true.
                                                                                                  // NOTE(review): presumably fills the slice first_indices..last_indices with `val`
                                                                                                  // and returns the number of elements written -- confirm.
                                                                                                  644  virtual idx_t
                                                                                                  645  set_elements_in_slice_same(double val ,
                                                                                                  646  const std::vector<idx_t>& first_indices,
                                                                                                  648  const std::vector<idx_t>& last_indices,
                                                                                                  650  bool strict_indices = true ) =0;
                                                                                                  654 
                                                                                                  656 
                                                                                                  // Pure-virtual, non-const bulk write from a read-only void* buffer; returns an idx_t.
                                                                                                  // Unlike set_elements_in_slice_same, this overload has no `strict_indices` parameter.
                                                                                                  // NOTE(review): presumably the mirror of get_elements_in_slice; the buffer's element
                                                                                                  // type and layout are not visible here -- confirm before use.
                                                                                                  675  virtual idx_t
                                                                                                  676  set_elements_in_slice(const void* buffer_ptr,
                                                                                                  678  const std::vector<idx_t>& first_indices,
                                                                                                  680  const std::vector<idx_t>& last_indices ) =0;
                                                                                                  682 
                                                                                                  683 #ifdef COPY_SLICE_IMPLEMENTED
                                                                                                  684 
                                                                                                  // Var-to-var overload of set_elements_in_slice. It is declared only when
                                                                                                  // COPY_SLICE_IMPLEMENTED is defined (see the enclosing #ifdef).
                                                                                                  // Takes a source var, its starting indices, and first/last target indices.
                                                                                                  // NOTE(review): presumably copies a same-shaped slice from `source` into this var -- confirm.
                                                                                                  698  virtual idx_t
                                                                                                  699  set_elements_in_slice(const yk_var_ptr source,
                                                                                                  701  const std::vector<idx_t>& first_source_indices,
                                                                                                  704  const std::vector<idx_t>& first_target_indices,
                                                                                                  707  const std::vector<idx_t>& last_target_indices ) =0;
                                                                                                  710 #endif
                                                                                                  711 
                                                                                                  713 
                                                                                                  // Pure-virtual const helper that turns an index tuple into a std::string.
                                                                                                  // NOTE(review): the exact text format is not visible here -- presumably it pairs
                                                                                                  // dimension names with values for diagnostics; confirm.
                                                                                                  717  virtual std::string
                                                                                                  718  format_indices(const std::vector<idx_t>& indices ) const =0;
                                                                                                  720 
                                                                                                  721 #ifndef SWIG
                                                                                                  722 
                                                                                                  // std::initializer_list overload of format_indices; it sits inside the enclosing
                                                                                                  // #ifndef SWIG guard, so it is not declared when SWIG is defined.
                                                                                                  // NOTE(review): presumably behaves identically to the vector overload -- confirm.
                                                                                                  727  virtual std::string
                                                                                                  728  format_indices(const std::initializer_list<idx_t>& indices ) const =0;
                                                                                                  730 #endif
                                                                                                  731 
                                                                                                  732  /* Advanced APIs for yk_var found below are not needed for most applications. */
                                                                                                  733 
                                                                                                  735 
                                                                                                  // Pure-virtual const getter with no arguments; returns an int.
                                                                                                  // It is the read counterpart of set_halo_exchange_l1_norm(int).
                                                                                                  // NOTE(review): presumably the L1-norm limit on neighbor offsets used for halo
                                                                                                  // exchange -- confirm.
                                                                                                  752  virtual int
                                                                                                  753  get_halo_exchange_l1_norm() const =0;
                                                                                                  754 
                                                                                                  756 
                                                                                                  // Pure-virtual, non-const setter taking an int `norm`; returns nothing.
                                                                                                  // NOTE(review): the valid range of `norm` and when the change takes effect are
                                                                                                  // not visible here -- confirm.
                                                                                                  759  virtual void
                                                                                                  760  set_halo_exchange_l1_norm(int norm) =0;
                                                                                                  763 
                                                                                                  765 
                                                                                                  // Pure-virtual const query with no arguments; yields a bool.
                                                                                                  // NOTE(review): presumably true when the step-dimension allocation may be changed
                                                                                                  // at run time -- confirm.
                                                                                                  768  virtual bool
                                                                                                  769  is_dynamic_step_alloc() const =0;
                                                                                                  770 
                                                                                                  772 
                                                                                                  // Pure-virtual, non-const setter taking an int NUMA node id; returns a bool.
                                                                                                  // NOTE(review): the bool presumably signals whether the preference was accepted;
                                                                                                  // the meaning of negative/special node values is not visible here -- confirm.
                                                                                                  779  virtual bool
                                                                                                  780  set_numa_preferred(int numa_node) =0;
                                                                                                  783 
                                                                                                  785 
                                                                                                  // Pure-virtual const getter with no arguments; returns an int.
                                                                                                  // It is the read counterpart of set_numa_preferred(int).
                                                                                                  788  virtual int
                                                                                                  789  get_numa_preferred() const =0;
                                                                                                  790 
                                                                                                  792 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t size.
                                                                                                  // NOTE(review): presumably requests a minimum left padding for `dim`; any
                                                                                                  // restriction on calling it after storage is allocated is not visible here -- confirm.
                                                                                                  802  virtual void
                                                                                                  803  set_left_min_pad_size(const std::string& dim,
                                                                                                  807  idx_t size ) =0;
                                                                                                  810 
                                                                                                  812 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t size.
                                                                                                  // NOTE(review): presumably requests a minimum right padding for `dim`; any
                                                                                                  // restriction on calling it after storage is allocated is not visible here -- confirm.
                                                                                                  822  virtual void
                                                                                                  823  set_right_min_pad_size(const std::string& dim,
                                                                                                  827  idx_t size ) =0;
                                                                                                  830 
                                                                                                  832 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t size.
                                                                                                  // NOTE(review): presumably applies `size` as the minimum padding on both the left
                                                                                                  // and right sides of `dim` -- confirm.
                                                                                                  835  virtual void
                                                                                                  836  set_min_pad_size(const std::string& dim,
                                                                                                  840  idx_t size ) =0;
                                                                                                  843 
                                                                                                  845 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t size.
                                                                                                  // It is the write counterpart of get_left_halo_size().
                                                                                                  // NOTE(review): interaction with existing padding/allocation is not visible here -- confirm.
                                                                                                  854  virtual void
                                                                                                  855  set_left_halo_size(const std::string& dim,
                                                                                                  859  idx_t size ) =0;
                                                                                                  861 
                                                                                                  863 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t size.
                                                                                                  // It is the write counterpart of get_right_halo_size().
                                                                                                  // NOTE(review): interaction with existing padding/allocation is not visible here -- confirm.
                                                                                                  872  virtual void
                                                                                                  873  set_right_halo_size(const std::string& dim,
                                                                                                  877  idx_t size ) =0;
                                                                                                  879 
                                                                                                  881 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t size.
                                                                                                  // NOTE(review): presumably sets both left and right halo sizes of `dim` to `size` -- confirm.
                                                                                                  884  virtual void
                                                                                                  885  set_halo_size(const std::string& dim,
                                                                                                  889  idx_t size ) =0;
                                                                                                  891 
                                                                                                  892 
                                                                                                  894 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t size.
                                                                                                  // It is the write counterpart of get_alloc_size().
                                                                                                  // NOTE(review): which dimension kinds may be resized, and whether storage must be
                                                                                                  // unallocated first, are not visible here -- confirm.
                                                                                                  919  virtual void
                                                                                                  920  set_alloc_size(const std::string& dim,
                                                                                                  924  idx_t size ) =0;
                                                                                                  925 
                                                                                                  927 
                                                                                                  // Pure-virtual, non-const setter taking a dimension name and an idx_t index.
                                                                                                  // It is the write counterpart of get_first_misc_index().
                                                                                                  // NOTE(review): presumably valid only for "misc" dimensions -- confirm.
                                                                                                  933  virtual void
                                                                                                  934  set_first_misc_index(const std::string& dim,
                                                                                                  937  idx_t idx ) =0;
                                                                                                  939 
                                                                                                  941 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the first allocated index in `dim` on this rank,
                                                                                                  // padding included -- confirm.
                                                                                                  945  virtual idx_t
                                                                                                  946  get_first_rank_alloc_index(const std::string& dim ) const =0;
                                                                                                  950 
                                                                                                  952 
                                                                                                  // Pure-virtual const accessor: maps a dimension name to an idx_t.
                                                                                                  // NOTE(review): presumably the last allocated index in `dim` on this rank,
                                                                                                  // padding included -- confirm.
                                                                                                  956  virtual idx_t
                                                                                                  957  get_last_rank_alloc_index(const std::string& dim ) const =0;
                                                                                                  961 
                                                                                                  963 
                                                                                                  // Pure-virtual const query with no arguments; yields a bool.
                                                                                                  // NOTE(review): presumably true once data storage exists for this var, e.g. after
                                                                                                  // alloc_storage() -- confirm.
                                                                                                  967  virtual bool
                                                                                                  968  is_storage_allocated() const =0;
                                                                                                  969 
                                                                                                  971 
                                                                                                  // Pure-virtual const accessor with no arguments; returns an idx_t.
                                                                                                  // NOTE(review): presumably the byte size of this var's storage; whether it is
                                                                                                  // meaningful before allocation is not visible here -- confirm.
                                                                                                  975  virtual idx_t
                                                                                                  976  get_num_storage_bytes() const =0;
                                                                                                  977 
                                                                                                  979 
                                                                                                  // Pure-virtual const accessor with no arguments; returns an idx_t.
                                                                                                  // NOTE(review): presumably the element count of this var's storage, padding
                                                                                                  // included -- confirm.
                                                                                                  982  virtual idx_t
                                                                                                  983  get_num_storage_elements() const =0;
                                                                                                  984 
                                                                                                  986 
                                                                                                  // Pure-virtual, non-const operation with no arguments and no return value.
                                                                                                  // NOTE(review): presumably allocates this var's data storage; behaviour when storage
                                                                                                  // already exists is not visible here -- confirm.
                                                                                                  992  virtual void
                                                                                                  993  alloc_storage() =0;
                                                                                                  994 
                                                                                                  996 
                                                                                                  // Pure-virtual, non-const operation with no arguments and no return value.
                                                                                                  // NOTE(review): presumably releases this var's data storage; the effect on vars that
                                                                                                  // share storage (see fuse_vars) is not visible here -- confirm.
                                                                                                  1000  virtual void
                                                                                                  1001  release_storage() =0;
                                                                                                  1002 
                                                                                                  1004 
                                                                                                  // Pure-virtual const comparison against another var handle; yields a bool.
                                                                                                  // NOTE(review): which properties (dims, sizes, pads, ...) must match for "identical"
                                                                                                  // is not visible here -- confirm.
                                                                                                  1020  virtual bool
                                                                                                  1021  is_storage_layout_identical(const yk_var_ptr other) const =0;
                                                                                                  1022 
                                                                                                  1024 
                                                                                                  // Pure-virtual, non-const operation taking a source var handle by value.
                                                                                                  // Returns nothing.
                                                                                                  // NOTE(review): presumably makes this var share or adopt `source`'s storage and
                                                                                                  // metadata; the exact aliasing rules are not visible here -- confirm.
                                                                                                  1050  virtual void
                                                                                                  1051  fuse_vars(yk_var_ptr source) =0;
                                                                                                  1053 
                                                                                                  1055 
                                                                                                  // Pure-virtual, non-const accessor returning an untyped (void*) pointer.
                                                                                                  // NOTE(review): presumably points at the var's underlying storage; element type,
                                                                                                  // layout, and the value returned when nothing is allocated are not visible here --
                                                                                                  // confirm before dereferencing.
                                                                                                  1091  virtual void* get_raw_storage_buffer() =0;
                                                                                                  1092 
                                                                                                  1093  /* Deprecated APIs for yk_var found below should be avoided.
                                                                                                  1094  Use the more explicit form found in the documentation. */
                                                                                                  1095 
                                                                                                  // Deprecated convenience accessor (it sits under the "Deprecated APIs" section comment).
                                                                                                  // Returns get_left_halo_size(dim) only; the right-side halo is never consulted, so
                                                                                                  // callers that need the right halo must call get_right_halo_size() themselves.
                                                                                                  1097  inline idx_t
                                                                                                  1098  get_halo_size(const std::string& dim) const {
                                                                                                  1099  return get_left_halo_size(dim);
                                                                                                  1100  }
                                                                                                  // Deprecated convenience accessor (it sits under the "Deprecated APIs" section comment).
                                                                                                  // Returns get_left_pad_size(dim) only; the right-side padding is never consulted.
                                                                                                  1102  inline idx_t
                                                                                                  1103  get_pad_size(const std::string& dim) const {
                                                                                                  1104  return get_left_pad_size(dim);
                                                                                                  1105  }
                                                                                                  // Deprecated convenience accessor (it sits under the "Deprecated APIs" section comment).
                                                                                                  // Returns get_left_extra_pad_size(dim) only; the right-side extra padding is never consulted.
                                                                                                  1107  inline idx_t
                                                                                                  1108  get_extra_pad_size(const std::string& dim) const {
                                                                                                  1109  return get_left_extra_pad_size(dim);
                                                                                                  1110  }
                                                                                                  1111 
                                                                                                  1113  inline bool
                                                                                                  1114  is_element_allocated(const std::vector<idx_t>& indices ) const {
                                                                                                  1116  return are_indices_local(indices);
                                                                                                  1117  }
                                                                                                  1118 
                                                                                                  1119 #ifndef SWIG
                                                                                                  1120  inline bool
                                                                                                  1122  is_element_allocated(const std::initializer_list<idx_t>& indices ) const {
                                                                                                  1124  return are_indices_local(indices);
                                                                                                  1125  }
                                                                                                  1126 #endif
                                                                                                  1127 
                                                                                                  1129  inline void
                                                                                                  1131  fuse_vars(source);
                                                                                                  1132  }
                                                                                                  1133 
                                                                                                  1134  }; // yk_var.
                                                                                                  1135 
                                                                                                  1137  typedef yk_var yk_grid;
                                                                                                  1138 
                                                                                                  1140 } // namespace yask.
                                                                                                  virtual idx_t get_alloc_size(const std::string &dim) const =0
                                                                                                  Get the number of elements allocated in the specified dimension.
                                                                                                  +Go to the documentation of this file.
                                                                                                  1 /*****************************************************************************
                                                                                                  2 
                                                                                                  3 YASK: Yet Another Stencil Kit
                                                                                                  4 Copyright (c) 2014-2022, Intel Corporation
                                                                                                  5 
                                                                                                  6 Permission is hereby granted, free of charge, to any person obtaining a copy
                                                                                                  7 of this software and associated documentation files (the "Software"), to
                                                                                                  8 deal in the Software without restriction, including without limitation the
                                                                                                  9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
                                                                                                  10 sell copies of the Software, and to permit persons to whom the Software is
                                                                                                  11 furnished to do so, subject to the following conditions:
                                                                                                  12 
                                                                                                  13 * The above copyright notice and this permission notice shall be included in
                                                                                                  14  all copies or substantial portions of the Software.
                                                                                                  15 
                                                                                                  16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
                                                                                                  17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
                                                                                                  18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
                                                                                                  19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
                                                                                                  20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
                                                                                                  21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
                                                                                                  22 IN THE SOFTWARE.
                                                                                                  23 
                                                                                                  24 *****************************************************************************/
                                                                                                  25 
                                                                                                  27 
                                                                                                  28 // This file uses Doxygen 1.8 markup for API documentation-generation.
                                                                                                  29 // See http://www.stack.nl/~dimitri/doxygen.
                                                                                                  32 #pragma once
                                                                                                  33 
                                                                                                  34 #include "yask_kernel_api.hpp"
                                                                                                  35 
                                                                                                  36 namespace yask {
                                                                                                  37 
                                                                                                  43 
                                                                                                  185  class yk_var {
                                                                                                  186  public:
                                                                                                  187  virtual ~yk_var() {}
                                                                                                  188 
                                                                                                  190 
                                                                                                  196  virtual const std::string& get_name() const =0;
                                                                                                  197 
                                                                                                  199 
                                                                                                  205  virtual int get_num_dims() const =0;
                                                                                                  206 
                                                                                                  208 
                                                                                                  212  virtual string_vec
                                                                                                  213  get_dim_names() const =0;
                                                                                                  214 
                                                                                                  216 
                                                                                                  221  virtual int get_num_domain_dims() const =0;
                                                                                                  222 
                                                                                                  224 
                                                                                                  228  virtual bool
                                                                                                  229  is_dim_used(const std::string& dim) const =0;
                                                                                                  230 
                                                                                                  232 
                                                                                                  236  virtual bool is_fixed_size() const =0;
                                                                                                  237 
                                                                                                  239 
                                                                                                  251  virtual idx_t
                                                                                                  252  get_first_local_index(const std::string& dim ) const =0;
                                                                                                  255 
                                                                                                  257 
                                                                                                  261  virtual idx_t_vec
                                                                                                  262  get_first_local_index_vec() const =0;
                                                                                                  263 
                                                                                                  265 
                                                                                                  277  virtual idx_t
                                                                                                  278  get_last_local_index(const std::string& dim ) const =0;
                                                                                                  281 
                                                                                                  283 
                                                                                                  287  virtual idx_t_vec
                                                                                                  288  get_last_local_index_vec() const =0;
                                                                                                  289 
                                                                                                  291 
                                                                                                  298  virtual idx_t
                                                                                                  299  get_alloc_size(const std::string& dim ) const =0;
                                                                                                  302 
                                                                                                  304 
                                                                                                  308  virtual idx_t_vec
                                                                                                  309  get_alloc_size_vec() const =0;
                                                                                                  310 
                                                                                                  312 
                                                                                                  320  virtual idx_t
                                                                                                  321  get_first_valid_step_index() const =0;
                                                                                                  322 
                                                                                                  324 
                                                                                                  332  virtual idx_t
                                                                                                  333  get_last_valid_step_index() const =0;
                                                                                                  334 
                                                                                                  336 
                                                                                                  343  virtual idx_t
                                                                                                  344  get_rank_domain_size(const std::string& dim) const =0;
                                                                                                  347 
                                                                                                  349 
                                                                                                  353  virtual idx_t_vec
                                                                                                  354  get_rank_domain_size_vec() const =0;
                                                                                                  355 
                                                                                                  357 
                                                                                                  364  virtual idx_t
                                                                                                  365  get_first_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  368 
                                                                                                  370 
                                                                                                  374  virtual idx_t_vec
                                                                                                  376 
                                                                                                  378 
                                                                                                  386  virtual idx_t
                                                                                                  387  get_last_rank_domain_index(const std::string& dim ) const =0;
                                                                                                  390 
                                                                                                  392 
                                                                                                  396  virtual idx_t_vec
                                                                                                  398 
                                                                                                  400 
                                                                                                  404  virtual idx_t
                                                                                                  405  get_left_halo_size(const std::string& dim ) const =0;
                                                                                                  409 
                                                                                                  411 
                                                                                                  415  virtual idx_t
                                                                                                  416  get_right_halo_size(const std::string& dim ) const =0;
                                                                                                  420 
                                                                                                  422 
                                                                                                  429  virtual idx_t
                                                                                                  430  get_first_rank_halo_index(const std::string& dim ) const =0;
                                                                                                  433 
                                                                                                  435 
                                                                                                  439  virtual idx_t_vec
                                                                                                  441 
                                                                                                  443 
                                                                                                  450  virtual idx_t
                                                                                                  451  get_last_rank_halo_index(const std::string& dim ) const =0;
                                                                                                  454 
                                                                                                  456 
                                                                                                  460  virtual idx_t_vec
                                                                                                  461  get_last_rank_halo_index_vec() const =0;
                                                                                                  462 
                                                                                                  464 
                                                                                                  472  virtual idx_t
                                                                                                  473  get_left_pad_size(const std::string& dim ) const =0;
                                                                                                  477 
                                                                                                  479 
                                                                                                  487  virtual idx_t
                                                                                                  488  get_right_pad_size(const std::string& dim ) const =0;
                                                                                                  492 
                                                                                                  494 
                                                                                                  499  virtual idx_t
                                                                                                  500  get_left_extra_pad_size(const std::string& dim ) const =0;
                                                                                                  504 
                                                                                                  506 
                                                                                                  511  virtual idx_t
                                                                                                  512  get_right_extra_pad_size(const std::string& dim ) const =0;
                                                                                                  516 
                                                                                                  518 
                                                                                                  522  virtual idx_t
                                                                                                  523  get_first_misc_index(const std::string& dim ) const =0;
                                                                                                  526 
                                                                                                  528 
                                                                                                  532  virtual idx_t
                                                                                                  533  get_last_misc_index(const std::string& dim ) const =0;
                                                                                                  536 
                                                                                                  538 
                                                                                                  545  virtual bool
                                                                                                  546  are_indices_local(const idx_t_vec& indices ) const =0;
                                                                                                  548 
                                                                                                  549 #ifndef SWIG
                                                                                                  550 
                                                                                                  554  virtual bool
                                                                                                  555  are_indices_local(const idx_t_init_list& indices ) const =0;
                                                                                                  557 #endif
                                                                                                  558 
                                                                                                  560 
                                                                                                  568  virtual double
                                                                                                  569  get_element(const idx_t_vec& indices ) const =0;
                                                                                                  571 
                                                                                                  572 #ifndef SWIG
                                                                                                  573 
                                                                                                  578  virtual double
                                                                                                  579  get_element(const idx_t_init_list& indices ) const =0;
                                                                                                  581 #endif
                                                                                                  582 
                                                                                                  584 
                                                                                                  603  virtual idx_t
                                                                                                  604  set_element(double val ,
                                                                                                  605  const idx_t_vec& indices,
                                                                                                  607  bool strict_indices = true ) =0;
                                                                                                  611 
                                                                                                  612 #ifndef SWIG
                                                                                                  613 
                                                                                                  618  virtual idx_t
                                                                                                  619  set_element(double val ,
                                                                                                  620  const idx_t_init_list& indices,
                                                                                                  622  bool strict_indices = true ) =0;
                                                                                                  626 #endif
                                                                                                  627 
                                                                                                  629 
                                                                                                  646  virtual idx_t
                                                                                                  647  get_elements_in_slice(void* buffer_ptr,
                                                                                                  649  const idx_t_vec& first_indices,
                                                                                                  651  const idx_t_vec& last_indices ) const =0;
                                                                                                  653 
                                                                                                  655 
                                                                                                  670  virtual idx_t
                                                                                                  671  add_to_element(double val ,
                                                                                                  672  const idx_t_vec& indices,
                                                                                                  674  bool strict_indices = true ) =0;
                                                                                                  678 
                                                                                                  679 #ifndef SWIG
                                                                                                  680 
                                                                                                  685  virtual idx_t
                                                                                                  686  add_to_element(double val ,
                                                                                                  687  const idx_t_init_list& indices,
                                                                                                  689  bool strict_indices = true ) =0;
                                                                                                  693 #endif
                                                                                                  694 
                                                                                                  696 
                                                                                                  704  virtual void
                                                                                                  705  set_all_elements_same(double val ) =0;
                                                                                                  706 
                                                                                                  708 
                                                                                                  721  virtual idx_t
                                                                                                  722  set_elements_in_slice_same(double val ,
                                                                                                  723  const idx_t_vec& first_indices,
                                                                                                  725  const idx_t_vec& last_indices,
                                                                                                  727  bool strict_indices = true ) =0;
                                                                                                  731 
                                                                                                  733 
                                                                                                  752  virtual idx_t
                                                                                                  753  set_elements_in_slice(const void* buffer_ptr,
                                                                                                  755  const idx_t_vec& first_indices,
                                                                                                  757  const idx_t_vec& last_indices ) =0;
                                                                                                  759 
                                                                                                  760 #ifdef COPY_SLICE_IMPLEMENTED
                                                                                                  761 
                                                                                                  775  virtual idx_t
                                                                                                  776  set_elements_in_slice(const yk_var_ptr source,
                                                                                                  778  const idx_t_vec& first_source_indices,
                                                                                                  781  const idx_t_vec& first_target_indices,
                                                                                                  784  const idx_t_vec& last_target_indices ) =0;
                                                                                                  787 #endif
                                                                                                  788 
                                                                                                  790 
                                                                                                  794  virtual std::string
                                                                                                  795  format_indices(const idx_t_vec& indices ) const =0;
                                                                                                  797 
                                                                                                  798 #ifndef SWIG
                                                                                                  799 
                                                                                                  804  virtual std::string
                                                                                                  805  format_indices(const idx_t_init_list& indices ) const =0;
                                                                                                  807 #endif
                                                                                                  808 
                                                                                                  809  /* Advanced APIs for yk_var found below are not needed for most applications. */
                                                                                                  810 
                                                                                                  812 
                                                                                                  829  virtual int
                                                                                                  830  get_halo_exchange_l1_norm() const =0;
                                                                                                  831 
                                                                                                  833 
                                                                                                  838  virtual void
                                                                                                  839  set_halo_exchange_l1_norm(int norm) =0;
                                                                                                  842 
                                                                                                  844 
                                                                                                  847  virtual bool
                                                                                                  848  is_dynamic_step_alloc() const =0;
                                                                                                  849 
                                                                                                  851 
                                                                                                  858  virtual bool
                                                                                                  859  set_numa_preferred(int numa_node) =0;
                                                                                                  862 
                                                                                                  864 
                                                                                                  867  virtual int
                                                                                                  868  get_numa_preferred() const =0;
                                                                                                  869 
                                                                                                  871 
                                                                                                  881  virtual void
                                                                                                  882  set_left_min_pad_size(const std::string& dim,
                                                                                                  886  idx_t size ) =0;
                                                                                                  889 
                                                                                                  891 
                                                                                                  901  virtual void
                                                                                                  902  set_right_min_pad_size(const std::string& dim,
                                                                                                  906  idx_t size ) =0;
                                                                                                  909 
                                                                                                  911 
                                                                                                  914  virtual void
                                                                                                  915  set_min_pad_size(const std::string& dim,
                                                                                                  919  idx_t size ) =0;
                                                                                                  922 
                                                                                                  924 
                                                                                                  933  virtual void
                                                                                                  934  set_left_halo_size(const std::string& dim,
                                                                                                  938  idx_t size ) =0;
                                                                                                  940 
                                                                                                  942 
                                                                                                  951  virtual void
                                                                                                  952  set_right_halo_size(const std::string& dim,
                                                                                                  956  idx_t size ) =0;
                                                                                                  958 
                                                                                                  960 
                                                                                                  963  virtual void
                                                                                                  964  set_halo_size(const std::string& dim,
                                                                                                  968  idx_t size ) =0;
                                                                                                  970 
                                                                                                  971 
                                                                                                  973 
                                                                                                  998  virtual void
                                                                                                  999  set_alloc_size(const std::string& dim,
                                                                                                  1003  idx_t size ) =0;
                                                                                                  1004 
                                                                                                  1006 
                                                                                                  1012  virtual void
                                                                                                  1013  set_first_misc_index(const std::string& dim,
                                                                                                  1016  idx_t idx ) =0;
                                                                                                  1018 
                                                                                                  1020 
                                                                                                  1024  virtual bool
                                                                                                  1025  is_storage_allocated() const =0;
                                                                                                  1026 
                                                                                                  1028 
                                                                                                  1032  virtual idx_t
                                                                                                  1033  get_num_storage_bytes() const =0;
                                                                                                  1034 
                                                                                                  1036 
                                                                                                  1039  virtual idx_t
                                                                                                  1040  get_num_storage_elements() const =0;
                                                                                                  1041 
                                                                                                  1043 
                                                                                                  1049  virtual void
                                                                                                  1050  alloc_storage() =0;
                                                                                                  1051 
                                                                                                  1053 
                                                                                                  1057  virtual void
                                                                                                  1058  release_storage() =0;
                                                                                                  1059 
                                                                                                  1061 
                                                                                                  1077  virtual bool
                                                                                                  1078  is_storage_layout_identical(const yk_var_ptr other) const =0;
                                                                                                  1079 
                                                                                                  1081 
                                                                                                  1107  virtual void
                                                                                                  1108  fuse_vars(yk_var_ptr source) =0;
                                                                                                  1110 
                                                                                                  1112 
                                                                                                  1149  virtual void* get_raw_storage_buffer() =0;
                                                                                                  1150 
                                                                                                  1151 
                                                                                                   // [Deprecated] Use get_first_local_index().
                                                                                                   // Compatibility wrapper: forwards 'dim' to get_first_local_index() and
                                                                                                   // returns that result unchanged. Unlike most members of this class it
                                                                                                   // has an inline body rather than being pure virtual.
                                                                                                   1154  virtual idx_t
                                                                                                   1155  get_first_rank_alloc_index(const std::string& dim) const {
                                                                                                   1156  return get_first_local_index(dim);
                                                                                                   1157  }
                                                                                                  1158 
                                                                                                   // [Deprecated] Use get_last_local_index().
                                                                                                   // Compatibility wrapper: forwards 'dim' to get_last_local_index() and
                                                                                                   // returns that result unchanged. Unlike most members of this class it
                                                                                                   // has an inline body rather than being pure virtual.
                                                                                                   1161  virtual idx_t
                                                                                                   1162  get_last_rank_alloc_index(const std::string& dim) const {
                                                                                                   1163  return get_last_local_index(dim);
                                                                                                   1164  }
                                                                                                  1165 
                                                                                                  1166  }; // yk_var.
                                                                                                  1167 
                                                                                                   // Type alias: yk_grid names the same type as yk_var.
                                                                                                   // NOTE(review): the generated reference text labels this alias
                                                                                                   // "[Deprecated] Use yk_var." -- confirm it is kept only for older callers.
                                                                                                   1170  typedef yk_var yk_grid;
                                                                                                  1171 
                                                                                                  1173 } // namespace yask.
                                                                                                  std::initializer_list< idx_t > idx_t_init_list
                                                                                                  Initializer list of indices.
                                                                                                  Definition: yask_common_api.hpp:87
                                                                                                  +
                                                                                                  virtual idx_t get_alloc_size(const std::string &dim) const =0
                                                                                                  Get the number of elements allocated in the specified dimension.
                                                                                                  virtual idx_t get_first_valid_step_index() const =0
                                                                                                  Get the first valid index in the step dimension.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_first_rank_halo_index_vec() const =0
                                                                                                  Get the first index of the left halo in this rank in all domain dimensions in this var.
                                                                                                  virtual idx_t get_left_pad_size(const std::string &dim) const =0
                                                                                                  Get the actual left padding in the specified dimension.
                                                                                                  virtual bool is_dynamic_step_alloc() const =0
                                                                                                  [Advanced] Get whether the allocation of the step dimension of this var can be modified at run-time.
                                                                                                  -
                                                                                                  void fuse_grids(yk_var_ptr source)
                                                                                                  [Deprecated] Use fuse_vars().
                                                                                                  Definition: yk_var_api.hpp:1130
                                                                                                  virtual int get_numa_preferred() const =0
                                                                                                  [Advanced] Get the default preferred NUMA node on which to allocate data.
                                                                                                  virtual idx_t get_left_halo_size(const std::string &dim) const =0
                                                                                                  Get the left halo size in the specified dimension.
                                                                                                  -
                                                                                                  idx_t get_halo_size(const std::string &dim) const
                                                                                                  [Deprecated] Use get_left_halo_size() and get_right_halo_size().
                                                                                                  Definition: yk_var_api.hpp:1098
                                                                                                  +
                                                                                                  virtual bool are_indices_local(const idx_t_vec &indices) const =0
                                                                                                  Determine whether the given indices refer to an accessible element in this rank.
                                                                                                  +
                                                                                                  std::vector< idx_t > idx_t_vec
                                                                                                  Vector of indices.
                                                                                                  Definition: yask_common_api.hpp:80
                                                                                                  +
                                                                                                  virtual idx_t_vec get_last_rank_halo_index_vec() const =0
                                                                                                  Get the last index of the right halo in this rank in all domain dimensions in this var.
                                                                                                  virtual void set_first_misc_index(const std::string &dim, idx_t idx)=0
                                                                                                  [Advanced] Set the first index of a specified miscellaneous dimension.
                                                                                                  -
                                                                                                  virtual idx_t set_elements_in_slice_same(double val, const std::vector< idx_t > &first_indices, const std::vector< idx_t > &last_indices, bool strict_indices=true)=0
                                                                                                  Initialize var elements within specified subset of the var to the same value.
                                                                                                  virtual void set_halo_exchange_l1_norm(int norm)=0
                                                                                                  [Advanced] Set the maximum L1-norm of a neighbor rank for halo exchange.
                                                                                                  virtual bool is_storage_allocated() const =0
                                                                                                  [Advanced] Determine whether storage has been allocated.
                                                                                                  -
                                                                                                  yk_var yk_grid
                                                                                                  [Deprecated] Use yk_var.
                                                                                                  Definition: yask_kernel_api.hpp:215
                                                                                                  -
                                                                                                  virtual idx_t get_rank_domain_size(const std::string &dim) const =0
                                                                                                  Get the domain size for this rank.
                                                                                                  -
                                                                                                  virtual idx_t add_to_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0
                                                                                                  Atomically add to the value of one var element.
                                                                                                  +
                                                                                                  virtual idx_t set_element(double val, const idx_t_vec &indices, bool strict_indices=true)=0
                                                                                                  Set the value of one element in this var.
                                                                                                  +
                                                                                                  virtual idx_t get_rank_domain_size(const std::string &dim) const =0
                                                                                                  Get the domain size for this rank in the specified dimension.
                                                                                                  virtual void fuse_vars(yk_var_ptr source)=0
                                                                                                  [Advanced] Merge this var with another var.
                                                                                                  -
                                                                                                  virtual std::vector< std::string > get_dim_names() const =0
                                                                                                  Get all the dimensions in this var.
                                                                                                  -
                                                                                                  bool is_element_allocated(const std::initializer_list< idx_t > &indices) const
                                                                                                  [Deprecated] Use are_indices_local().
                                                                                                  Definition: yk_var_api.hpp:1122
                                                                                                  +
                                                                                                  virtual std::string format_indices(const idx_t_vec &indices) const =0
                                                                                                  Format the indices for human-readable display.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_alloc_size_vec() const =0
                                                                                                  Get the number of elements allocated in all dimensions in this var.
                                                                                                  virtual void set_right_min_pad_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the minimum right padding in the specified dimension.
                                                                                                  -
                                                                                                  virtual idx_t set_element(double val, const std::vector< idx_t > &indices, bool strict_indices=true)=0
                                                                                                  Set the value of one element in this var.
                                                                                                  +
                                                                                                  YASK_INT64_T idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:77
                                                                                                  virtual idx_t get_first_misc_index(const std::string &dim) const =0
                                                                                                  Get the first index of a specified miscellaneous dimension.
                                                                                                  virtual void release_storage()=0
                                                                                                  [Advanced] Explicitly release any allocated data-storage for this var.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_last_rank_domain_index_vec() const =0
                                                                                                  Get the last index of the sub-domain in this rank in all domain dimensions in this var.
                                                                                                  +
                                                                                                  virtual double get_element(const idx_t_vec &indices) const =0
                                                                                                  Read the value of one element in this var.
                                                                                                  virtual bool set_numa_preferred(int numa_node)=0
                                                                                                  [Advanced] Set the default preferred NUMA node on which to allocate data.
                                                                                                  +
                                                                                                  virtual YASK_DEPRECATED idx_t get_last_rank_alloc_index(const std::string &dim) const
                                                                                                  [Deprecated] Use get_last_local_index().
                                                                                                  Definition: yk_var_api.hpp:1162
                                                                                                  +
                                                                                                  virtual idx_t_vec get_first_rank_domain_index_vec() const =0
                                                                                                  Get the first index of the sub-domain in this rank in all domain dimensions in this var.
                                                                                                  +
                                                                                                  virtual int get_num_domain_dims() const =0
                                                                                                  Get the number of domain dimensions used in this var.
                                                                                                  virtual idx_t get_right_pad_size(const std::string &dim) const =0
                                                                                                  Get the actual right padding in the specified dimension.
                                                                                                  -
                                                                                                  bool is_element_allocated(const std::vector< idx_t > &indices) const
                                                                                                  [Deprecated] Use are_indices_local().
                                                                                                  Definition: yk_var_api.hpp:1114
                                                                                                  +
                                                                                                  virtual idx_t set_elements_in_slice(const void *buffer_ptr, const idx_t_vec &first_indices, const idx_t_vec &last_indices)=0
                                                                                                  Set var elements within specified subset of the var from values in a buffer.
                                                                                                  virtual void set_halo_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the left and right halo sizes in the specified dimension.
                                                                                                  A run-time YASK data container.
                                                                                                  Definition: yk_var_api.hpp:185
                                                                                                  virtual void set_left_halo_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the left halo size in the specified dimension.
                                                                                                  +
                                                                                                  virtual YASK_DEPRECATED idx_t get_first_rank_alloc_index(const std::string &dim) const
                                                                                                  [Deprecated] Use get_first_local_index().
                                                                                                  Definition: yk_var_api.hpp:1155
                                                                                                  +
                                                                                                  virtual idx_t_vec get_first_local_index_vec() const =0
                                                                                                  Get the first valid index in this rank in all dimensions in this var.
                                                                                                  virtual idx_t get_right_halo_size(const std::string &dim) const =0
                                                                                                  Get the right halo size in the specified dimension.
                                                                                                  virtual idx_t get_last_local_index(const std::string &dim) const =0
                                                                                                  Get the last index in this rank in the specified dimension.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_last_local_index_vec() const =0
                                                                                                  Get the last valid index in this rank in all dimensions in this var.
                                                                                                  virtual idx_t get_last_valid_step_index() const =0
                                                                                                  Get the last valid index in the step dimension.
                                                                                                  virtual idx_t get_last_misc_index(const std::string &dim) const =0
                                                                                                  Get the last index of a specified miscellaneous dimension.
                                                                                                  -
                                                                                                  virtual idx_t get_first_rank_alloc_index(const std::string &dim) const =0
                                                                                                  [Advanced] Get the first accessible index in this var in this rank in the specified domain dimension.
                                                                                                  -
                                                                                                  virtual bool are_indices_local(const std::vector< idx_t > &indices) const =0
                                                                                                  Determine whether the given indices refer to an accessible element in this rank.
                                                                                                  virtual idx_t get_last_rank_halo_index(const std::string &dim) const =0
                                                                                                  Get the last index of the right halo in this rank in the specified dimension.
                                                                                                  virtual void set_left_min_pad_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the minimum left padding in the specified dimension.
                                                                                                  virtual idx_t get_first_rank_domain_index(const std::string &dim) const =0
                                                                                                  Get the first index of the sub-domain in this rank in the specified dimension.
                                                                                                  -
                                                                                                  virtual idx_t get_last_rank_alloc_index(const std::string &dim) const =0
                                                                                                  [Advanced] Get the last accessible index in this var in this rank in the specified domain dimension.
                                                                                                  +
                                                                                                  virtual idx_t get_elements_in_slice(void *buffer_ptr, const idx_t_vec &first_indices, const idx_t_vec &last_indices) const =0
                                                                                                  Copy elements within specified subset of this var into a buffer.
                                                                                                  virtual int get_halo_exchange_l1_norm() const =0
                                                                                                  [Advanced] Get the maximum L1-norm of a neighbor rank for halo exchange.
                                                                                                  virtual void alloc_storage()=0
                                                                                                  [Advanced] Explicitly allocate data-storage memory for this var.
                                                                                                  virtual bool is_dim_used(const std::string &dim) const =0
                                                                                                  Determine whether specified dimension exists in this var.
                                                                                                  +
                                                                                                  #define YASK_DEPRECATED
                                                                                                  Deprecated attribute.
                                                                                                  Definition: yask_common_api.hpp:55
                                                                                                  virtual int get_num_dims() const =0
                                                                                                  Get the number of dimensions used in this var.
                                                                                                  -
                                                                                                  virtual double get_element(const std::vector< idx_t > &indices) const =0
                                                                                                  Read the value of one element in this var.
                                                                                                  virtual idx_t get_right_extra_pad_size(const std::string &dim) const =0
                                                                                                  Get the actual extra right padding in the specified dimension.
                                                                                                  +
                                                                                                  virtual idx_t_vec get_rank_domain_size_vec() const =0
                                                                                                  Get the domain size for this rank in all domain dimensions in this var.
                                                                                                  virtual void set_min_pad_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the minimum padding in the specified dimension.
                                                                                                  virtual idx_t get_num_storage_elements() const =0
                                                                                                  [Advanced] Determine size of raw storage in elements.
                                                                                                  +
                                                                                                  std::vector< std::string > string_vec
                                                                                                  Vector of strings.
                                                                                                  Definition: yask_common_api.hpp:90
                                                                                                  +
                                                                                                  virtual idx_t add_to_element(double val, const idx_t_vec &indices, bool strict_indices=true)=0
                                                                                                  Atomically add to the value of one var element.
                                                                                                  virtual const std::string & get_name() const =0
                                                                                                  Get the name of the var.
                                                                                                  -
                                                                                                  virtual idx_t get_elements_in_slice(void *buffer_ptr, const std::vector< idx_t > &first_indices, const std::vector< idx_t > &last_indices) const =0
                                                                                                  Copy elements within specified subset of this var into a buffer.
                                                                                                  virtual void set_alloc_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the number of elements to allocate in the specified dimension.
                                                                                                  +
                                                                                                  virtual string_vec get_dim_names() const =0
                                                                                                  Get all the dimensions in this var.
                                                                                                  -
                                                                                                  idx_t get_pad_size(const std::string &dim) const
                                                                                                  [Deprecated] Use get_left_pad_size() and get_right_pad_size().
                                                                                                  Definition: yk_var_api.hpp:1103
                                                                                                  -
                                                                                                  idx_t get_extra_pad_size(const std::string &dim) const
                                                                                                  [Deprecated] Use get_left_extra_pad_size() and get_right_extra_pad_size().
                                                                                                  Definition: yk_var_api.hpp:1108
                                                                                                  virtual bool is_fixed_size() const =0
                                                                                                  Determine whether this var is not automatically resized based on the solution.
                                                                                                  virtual void * get_raw_storage_buffer()=0
                                                                                                  [Advanced] Get pointer to raw data storage buffer.
                                                                                                  -
                                                                                                  virtual std::string format_indices(const std::vector< idx_t > &indices) const =0
                                                                                                  Format the indices for pretty-printing.
                                                                                                  +
                                                                                                  YASK_DEPRECATED typedef yk_var yk_grid
                                                                                                  [Deprecated] Use yk_var.
                                                                                                  Definition: yask_kernel_api.hpp:245
                                                                                                  virtual idx_t get_left_extra_pad_size(const std::string &dim) const =0
                                                                                                  Get the actual extra left padding in the specified dimension.
                                                                                                  virtual bool is_storage_layout_identical(const yk_var_ptr other) const =0
                                                                                                  [Advanced] Determines whether storage layout is the same as another var.
                                                                                                  virtual idx_t get_first_local_index(const std::string &dim) const =0
                                                                                                  Get the first valid index in this rank in the specified dimension.
                                                                                                  std::shared_ptr< yk_var > yk_var_ptr
                                                                                                  Shared pointer to yk_var.
                                                                                                  Definition: yask_kernel_api.hpp:60
                                                                                                  +
                                                                                                  virtual idx_t set_elements_in_slice_same(double val, const idx_t_vec &first_indices, const idx_t_vec &last_indices, bool strict_indices=true)=0
                                                                                                  Initialize var elements within specified subset of the var to the same value.
                                                                                                  virtual void set_right_halo_size(const std::string &dim, idx_t size)=0
                                                                                                  [Advanced] Set the right halo size in the specified dimension.
                                                                                                  -
                                                                                                  virtual idx_t set_elements_in_slice(const void *buffer_ptr, const std::vector< idx_t > &first_indices, const std::vector< idx_t > &last_indices)=0
                                                                                                  Set var elements within specified subset of the var from values in a buffer.
                                                                                                  -
                                                                                                  std::int64_t idx_t
                                                                                                  Type to use for indexing grids.
                                                                                                  Definition: yask_common_api.hpp:61
                                                                                                  virtual void set_all_elements_same(double val)=0
                                                                                                  Initialize all var elements to the same value.
                                                                                                  virtual idx_t get_first_rank_halo_index(const std::string &dim) const =0
                                                                                                  Get the first index of the left halo in this rank in the specified dimension.
                                                                                                  virtual idx_t get_last_rank_domain_index(const std::string &dim) const =0
                                                                                                  Get the last index of the sub-domain in this rank in the specified dimension.
                                                                                                  diff --git a/include/aux/Soln.hpp b/include/aux/Soln.hpp index 26318ac1..590e5dcf 100644 --- a/include/aux/Soln.hpp +++ b/include/aux/Soln.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/include/aux/yc_node_api.hpp b/include/aux/yc_node_api.hpp index d82b050b..40d08284 100644 --- a/include/aux/yc_node_api.hpp +++ b/include/aux/yc_node_api.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -289,6 +289,7 @@ namespace yask { get_var() =0; /// **[Deprecated]** Use get_var(). 
+ YASK_DEPRECATED inline yc_var_ptr get_grid() { return get_var(); diff --git a/include/aux/yc_solution_api.hpp b/include/aux/yc_solution_api.hpp index d4f6a78f..13fc9a17 100644 --- a/include/aux/yc_solution_api.hpp +++ b/include/aux/yc_solution_api.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/include/aux/yk_solution_api.hpp b/include/aux/yk_solution_api.hpp index 7de32e95..2ca1cff1 100644 --- a/include/aux/yk_solution_api.hpp +++ b/include/aux/yk_solution_api.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -64,6 +64,14 @@ namespace yask { */ const int yask_numa_none = -9; + /// Do not specify any NUMA binding and use allocations optimized for offloading. + /** + This is used in yk_solution::set_default_numa_preferred + and yk_var::set_numa_preferred. + In Python, specify as `yask_kernel.cvar.yask_numa_offload`. + */ + const int yask_numa_offload = -11; + /// Stencil solution as defined by the generated code from the YASK stencil compiler. /** Objects of this type contain all the vars and equations @@ -75,16 +83,6 @@ namespace yask { public: virtual ~yk_solution() {} - /// Set object to receive debug output. - /** - Just a shortcut for setting the debug output in the \ref yk_env - used to create the solution. - */ - virtual void - set_debug_output(yask_output_ptr debug - /**< [out] Pointer to object to receive debug output. 
- See \ref yask_output_factory. */ ) =0; - /// Get the name of the solution. /** @returns String containing the solution name provided during stencil compilation. @@ -94,13 +92,20 @@ namespace yask { /// Get the target ISA. /** - @returns String describing the instruction-set architecture targeted + @returns String describing the instruction-set architecture of the CPU targeted during kernel compilation. See the allowed YASK kernel targets in yc_solution::set_target(). */ virtual std::string get_target() const =0; + /// Get whether the stencil kernel will be offloaded to a device. + /** + @returns _true_ if kernel will be offloaded or _false_ if not. + */ + virtual bool + is_offloaded() const =0; + /// Get the floating-point precision size. /** @returns Number of bytes in each FP element: 4 or 8. @@ -133,7 +138,7 @@ namespace yask { that were defined by yc_node_factory::new_domain_index() and used in one or more vars. */ - virtual std::vector<std::string> + virtual string_vec get_domain_dim_names() const =0; /// Get all the miscellaneous dimension names. @@ -145,7 +150,7 @@ namespace yask { * Created at run-time by adding a new dimension via yk_solution::new_var() or yk_solution::new_fixed_size_var(). */ - virtual std::vector<std::string> + virtual string_vec get_misc_dim_names() const =0; /// Set the local-domain size in the specified dimension, i.e., the size of the part of the domain that is in this rank. @@ -165,10 +170,9 @@ namespace yask { each dimension. You should set either the local-domain size or the global-domain size - in each dimension. The unspecified (zero) sizes will be calculated based on the + in each dimension; the other should be set to zero (unspecified). + The unspecified (zero) sizes will be calculated based on the specified ones when prepare_solution() is called. - Setting the local-domain size to a non-zero value will clear the - global-domain size in that dimension until prepare_solution() is called. 
See the "Detailed Description" for \ref yk_var for more information on var sizes. */ @@ -178,12 +182,29 @@ namespace yask { the names from get_domain_dim_names(). */, idx_t size /**< [in] Elements in the domain in this `dim`. */ ) =0; + /// Set the local-domain size in all domain dimensions. + /** + See set_rank_domain_size(). + */ + virtual void + set_rank_domain_size_vec(const idx_t_vec& vals + /**< [in] Elements in all domain dims. */) = 0; + + #ifndef SWIG + /// Set the local-domain size in all domain dimensions. + /** + See set_rank_domain_size(). + */ + virtual void + set_rank_domain_size_vec(const idx_t_init_list& vals + /**< [in] Elements in all domain dims. */) = 0; + #endif + /// Get the local-domain size in the specified dimension, i.e., the size in this rank. /** See documentation for set_rank_domain_size(). - If you have called set_overall_domain_size() in a given dimension, - get_rank_domain_size() will return zero in that dimension until + @note get_rank_domain_size() may return zero in a dimension until prepare_solution() is called. After prepare_solution() is called, the computed size will be returned. @@ -194,13 +215,20 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */) const =0; + /// Get the local-domain size in all domain dimensions. + /** + See get_rank_domain_size(). + @returns Vector of current setting of rank domain sizes. + */ + virtual idx_t_vec + get_rank_domain_size_vec() const =0; + /// Get the global-domain size in the specified dimension, i.e., the total size across all MPI ranks. /** You should set either the local-domain size or the global-domain size - in each dimension. The unspecified (zero) sizes will be calculated based on the + in each dimension; the other should be set to zero (unspecified). + The unspecified (zero) sizes will be calculated based on the specified ones when prepare_solution() is called. 
- Setting the global-domain size to a non-zero value will clear the - local-domain size in that dimension until prepare_solution() is called. See documentation for set_rank_domain_size(). See the "Detailed Description" for \ref yk_var for more information on var sizes. @@ -211,6 +239,24 @@ namespace yask { the names from get_domain_dim_names(). */, idx_t size /**< [in] Elements in the domain in this `dim`. */ ) =0; + /// Set the global-domain size in all domain dimensions. + /** + See set_overall_domain_size(). + */ + virtual void + set_overall_domain_size_vec(const idx_t_vec& vals + /**< [in] Elements in all domain dims. */) = 0; + + #ifndef SWIG + /// Set the global-domain size in all domain dimensions. + /** + See set_overall_domain_size(). + */ + virtual void + set_overall_domain_size_vec(const idx_t_init_list& vals + /**< [in] Elements in all domain dims. */) = 0; + #endif + /// Get the global-domain size in the specified dimension, i.e., the total size across all MPI ranks. /** The global-domain indices in the specified dimension will range from @@ -218,8 +264,7 @@ namespace yask { Call get_first_rank_domain_index() and get_last_rank_domain_index() to find the subset of this domain in each rank. - If you have called set_rank_domain_size() in a given dimension, - get_overall_domain_size() will return zero in that dimension until + @note get_overall_domain_size() may return zero in a dimension until prepare_solution() is called. After prepare_solution() is called, the computed size will be returned. @@ -230,6 +275,15 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */ ) const =0; + /// Get the global-domain size in all domain dimensions. + /** + See get_overall_domain_size(). + + @returns Vector of current setting of global domain sizes. + */ + virtual idx_t_vec + get_overall_domain_size_vec() const =0; + /// Set the block size in the given dimension. 
/** This sets the approximate number of elements that are evaluated in @@ -247,6 +301,9 @@ namespace yask { Unless auto-tuning is disabled, the block size will be used as a starting point for an automated search for a higher-performing block size. + + This and all other tile sizes (Mega-blocks, blocks, micro-blocks, etc.) + can be set via apply_command_line_options(). */ virtual void set_block_size(const std::string& dim @@ -256,6 +313,32 @@ namespace yask { idx_t size /**< [in] Elements in a block in this `dim`. */ ) =0; + /// Set the block size in all domain dimensions. + /** + See set_block_size(). + + @note Does _not_ set the block size in the step dim. + Call set_block_size() with the name of the step dim to set the + temporal block size. + */ + virtual void + set_block_size_vec(const idx_t_vec& vals + /**< [in] Elements in all domain dims. */) = 0; + + #ifndef SWIG + /// Set the block size in all domain dimensions. + /** + See set_block_size(). + + @note Does _not_ set the block size in the step dim. + Call set_block_size() with the name of the step dim to set the + temporal block size. + */ + virtual void + set_block_size_vec(const idx_t_init_list& vals + /**< [in] Elements in all domain dims. */) = 0; + #endif + /// Get the block size. /** Returned value may be slightly larger than the value provided @@ -268,6 +351,19 @@ namespace yask { the names from get_step_dim_name() or get_domain_dim_names(). */) const =0; + /// Get the block size in all domain dimensions. + /** + See get_block_size(). + + @note Does _not_ return the block size in the step domain. + Call get_block_size() with the name of the step-domain dimension + to get the temporal block size. + + @returns Vector of current setting of block domain sizes. + */ + virtual idx_t_vec + get_block_size_vec() const =0; + /// Set the number of MPI ranks in the given dimension. 
/** If set_num_ranks() is set to a non-zero value in all @@ -302,15 +398,45 @@ namespace yask { the names from get_domain_dim_names(). */, idx_t num /**< [in] Number of ranks in `dim`. */ ) =0; + /// Set the number of MPI ranks in all domain dimensions. + /** + See set_num_ranks(). + */ + virtual void + set_num_ranks_vec(const idx_t_vec& vals + /**< [in] Number of ranks in all domain dims. */) = 0; + + #ifndef SWIG + /// Set the number of all MPI ranks in all domain dimensions. + /** + See set_num_ranks(). + */ + virtual void + set_num_ranks_vec(const idx_t_init_list& vals + /**< [in] Number of ranks in all domain dims. */) = 0; + #endif + /// Get the number of MPI ranks in the given dimension. /** - @returns Current setting of rank size. + @note get_num_ranks() may return zero in a dimension until + prepare_solution() is called. After prepare_solution() is called, + the computed number of ranks will be returned. + + @returns Current number of ranks. */ virtual idx_t get_num_ranks(const std::string& dim /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */) const =0; + /// Get the number of MPI ranks in all domain dimensions. + /** + See get_num_ranks(); + @returns Vector of current number of ranks in all domain dimensions. + */ + virtual idx_t_vec + get_num_ranks_vec() const =0; + /// Set the rank index in the specified dimension. /** The overall rank index in the specified dimension must range from @@ -332,6 +458,10 @@ namespace yask { See yk_env::get_num_ranks() and yk_env::get_rank_index() for MPI rank index. + + @note get_rank_index() may return zero in a dimension until + prepare_solution() is called. After prepare_solution() is called, + the computed index will be returned. */ virtual void set_rank_index(const std::string& dim @@ -339,6 +469,24 @@ namespace yask { the names from get_domain_dim_names(). */, idx_t num /**< [in] Rank index in `dim`. */ ) =0; + /// Set the rank index in all domain dimensions. 
+ /** + See set_rank_index(). + */ + virtual void + set_rank_index_vec(const idx_t_vec& vals + /**< [in] Index of this rank in all domain dims. */) = 0; + + #ifndef SWIG + /// Set the rank index in all domain dimensions. + /** + See set_rank_index(). + */ + virtual void + set_rank_index_vec(const idx_t_init_list& vals + /**< [in] Index of this rank in all domain dims. */) = 0; + #endif + /// Get the rank index in the specified dimension. /** The overall rank index in the specified dimension will range from @@ -350,12 +498,19 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */ ) const =0; + /// Get the rank index in all domain dimensions. + /** + See get_rank_index(). + @returns Vector of zero-based indices of this rank in all domain dimensions. + */ + virtual idx_t_vec + get_rank_index_vec() const =0; + /// Set kernel options from a string. /** Parses the string for options as if from a command-line. - Example: "-bx 64 -block_threads 4" sets the block-size in the *x* - dimension to 64 and the number of threads used to process each - block to 4. + Example: "-bx 64 -inner_threads 4" sets the block-size in the *x* + dimension to 64 and the number of nested OpenMP threads to 4. See the help message from the YASK kernel binary for documentation on the command-line options. Used to set less-common options not directly supported by the @@ -388,7 +543,7 @@ namespace yask { @returns Any parts of `args` that were not recognized by the parser as options. */ virtual std::string - apply_command_line_options(const std::vector<std::string>& args) =0; + apply_command_line_options(const string_vec& args) =0; /// Get the number of vars in the solution. /** @@ -419,8 +574,10 @@ namespace yask { /// Prepare the solution for stencil application. /** + Calculates the position of each rank in the overall problem domain + if not previously specified. + Calculates the sizes of each rank if not previously specified. 
Allocates data in vars that do not already have storage allocated. - Calculates the position of each rank in the overall problem domain. Sets many other data structures needed for proper stencil application. Since this function initiates MPI communication, it must be called on all MPI ranks, and it will block until all ranks have completed. @@ -447,6 +604,14 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */ ) const =0; + /// Get the first index of the sub-domain in this rank in all domain dimensions. + /** + See get_first_rank_domain_index(). + @returns Vector of first domain indices of this rank in all domain dimensions. + */ + virtual idx_t_vec + get_first_rank_domain_index_vec() const =0; + /// Get the last index of the sub-domain in this rank the specified dimension. /** This returns the last *overall* index within the domain in this rank @@ -466,11 +631,19 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */ ) const =0; + /// Get the last index of the sub-domain in this rank in all domain dimensions. + /** + See get_last_rank_domain_index(). + @returns Vector of last domain indices of this rank in all domain dimensions. + */ + virtual idx_t_vec + get_last_rank_domain_index_vec() const =0; + /// Run the stencil solution for the specified steps. /** The stencil(s) in the solution are applied to the var data, setting the index variables as follows: - 1. If temporal wave-front tiling is *not* used (the default): + 1. If temporal tiling is *not* used (the default): - The step index (e.g., `t` for "time") will be sequentially set to values from `first_step_index` to `last_step_index`, inclusive. + If the stencil equations were defined with dependencies on lower-valued steps, @@ -483,20 +656,23 @@ namespace yask { to values across the entire domain as returned by yk_solution::get_overall_domain_size() (not necessarily sequentially). 
- MPI halo exchanges will occur as necessary before or during each step. - - Since this function initiates MPI communication, it must be called - on all MPI ranks, and it will block until all ranks have completed. - 2. **[Advanced]** If temporal wave-front tiling *is* enabled via set_region_size(): + 2. **[Advanced]** If temporal wave-front tiling *is* enabled: - The step index (e.g., `t` for "time") will be sequentially set to values - from `first_step_index` to `last_step_index`, inclusive, within each region. - + The number of steps in a region may also be restricted by the size - of the region in the step dimension. In that case, tiles will be done in slices of that size. - - For each step index within each region, the domain indices will be set - to values across the entire region (not necessarily sequentially). + from `first_step_index` to `last_step_index`, inclusive, within each area configured + for temporal tiling. + + The number of steps in an area may also be restricted by the size + of the area in the step dimension. + In that case, tiles will be done in temporal slices of that size. + - For each step index within each area, the domain indices will be set + to values across the entire area (not necessarily sequentially). - Ultimately, the stencil(s) will be applied to same the elements in both the step and domain dimensions as when wave-front tiling is not used. - - MPI halo exchanges will occur before each number of steps in a region. + - MPI halo exchanges may occur at less frequent intervals. This function should be called only *after* calling prepare_solution(). + + Since this function initiates MPI communication, it must be called + on all MPI ranks, and it will block until all ranks have completed. */ virtual void run_solution(idx_t first_step_index /**< [in] First index in the step dimension */, @@ -526,11 +702,43 @@ namespace yask { @note The parameter is *not* the number of steps to run. 
@warning Since only one step is taken per call, using this function effectively disables - wave-front tiling. + wave-front tiling (except in the special case of tiling only across stages within a step). */ virtual void run_solution(idx_t step_index /**< [in] Index in the step dimension */ ) =0; + /// Update data on the device. + /** + Copies any YASK var data that has been modified on the host but + not on the device from the host to the device. + + This is done automatically as needed, so calling this function is only + needed when you want to control when the copy is done. + + If the kernel has been compiled for offloading using unified shared memory, + calling this function will have no effect. + Similarly, if the kernel has not been compiled for offloading, + calling this function will have no effect. + */ + virtual void + copy_vars_to_device() const =0; + + /// Update data on the host. + /** + Copies any YASK var data that has been modified on the device but + not on the host from the device to the host. + + This is done automatically as needed, so calling this function is only + needed when you want to control when the copy is done. + + If the kernel has been compiled for offloading using unified shared memory, + calling this function will have no effect. + Similarly, if the kernel has not been compiled for offloading, + calling this function will have no effect. + */ + virtual void + copy_vars_from_device() const =0; + /// Finish using a solution. /** Performs a final MPI halo exchange. @@ -545,69 +753,71 @@ namespace yask { /** @note Side effect: resets all statistics, so each call returns only the elapsed time and counts since the previous call. + @note Side effect: outputs stats in human-readable format + to current debug output object. @returns Pointer to statistics object. */ virtual yk_stats_ptr get_stats() =0; - /// Determine whether the auto-tuner is enabled on this rank. + /// Start or stop the online auto-tuner on this rank. 
/** - The auto-tuner is enabled by default. - It will become disabled after it has converged or after reset_auto_tuner(false) has been called. + This function is used to apply the current best-known settings if the tuner is + currently running, reset the state of the auto-tuner, and either + restart its search (if `enable==true`) or stop it (if `enable==false`). + This call must be made on each rank where the change is desired. + + This mode of running the auto-tuner is called "online" or "in-situ" because + changes are made to the tile sizes between calls to run_solution(). + It will stop automatically when it converges. + Call is_auto_tuner_enabled() to determine if it has converged. + */ + virtual void + reset_auto_tuner(bool enable + /**< [in] If _true_, start or restart the auto-tuner search on this rank. + If _false_, stop the auto-tuner. */, + bool verbose = false + /**< [in] If _true_, print progress information to the debug object + set via set_debug_output(). */ ) =0; + + /// Determine whether the online auto-tuner is enabled on this rank. + /** + The "online" or "in-situ" auto-tuner is disabled by default. + It can be enabled by calling reset_auto_tuner(true). + It will also become disabled after it has converged or after reset_auto_tuner(false) has been called. + Auto-tuners run independently on each rank, so they will not generally finish at the same step + across all ranks. @returns Whether the auto-tuner is still searching. */ virtual bool is_auto_tuner_enabled() const =0; - /* Advanced APIs for yk_solution found below are not needed for most applications. */ - - /// **[Advanced]** Set the region size in the given dimension. + /// Run the offline auto-tuner immediately, not preserving variable data. /** - This sets the approximate number of elements that are evaluated in - each "region". - This is a performance setting and should not affect the functional - correctness or total number of elements evaluated. 
- A region is typically the unit of work done by each - top-level OpenMP parallel region. The actual number of elements evaluated - in a region may be greater than the specified size due to rounding. - The number of elements in a region may - also be smaller than the specified size when the region is at the - edge of the domain. - - A region is most often used to specify the size of a temporal - wave-front tile. Thus, you will normally specify the size of the - region in the step dimension as well as all the domain dimensions. - For example, `set_region_size("t", 4)` specifies that four - time-steps will be executed in each region. - The sizes of regions in the domain dimensions are typically - set to fit within a large cache structure such as MCDRAM cache - in an Intel(R) Xeon Phi(TM) processor. - - In order to get the benefit of regions with multiple steps, - you must also call run_solution() where the number of steps - between its `first_step_index` and `last_step_index` - arguments is greater than or equal to the step-size of the - regions. + This runs the auto-tuner in "offline" mode. + (Under "online" operation, an auto-tuner is invoked during calls to + run_solution(); see reset_auto_tuner() and is_auto_tuner_enabled() + for more information on running in online mode.) + + This function causes the stencil solution to be run immediately + until the auto-tuner converges on all ranks. + It is useful for benchmarking, where performance is to be timed + for a given number of steps after the best settings are found. + This function should be called only *after* calling prepare_solution(). + This call must be made on each rank. + + @warning Modifies the contents of the YASK vars by automatically calling run_solution() + an arbitrary number of times, but without halo exchanges. + (See run_solution() for other restrictions and warnings.) 
+ Thus, var data should be set or reset *after* calling this function when + used in a production or test setting where correct results are expected. */ virtual void - set_region_size(const std::string& dim - /**< [in] Name of dimension to set. Must be one of - the names from get_step_dim_name() or - get_domain_dim_names(). */, - idx_t size - /**< [in] Elements in a region in this `dim`. */ ) =0; + run_auto_tuner_now(bool verbose = true + /**< [in] If _true_, print progress information to the debug object + set via set_debug_output(). */ ) =0; - /// **[Advanced]** Get the region size. - /** - Returned value may be slightly larger than the value provided - via set_region_size() due to rounding. - @returns Current settings of region size. - */ - virtual idx_t - get_region_size(const std::string& dim - /**< [in] Name of dimension to get. Must be one of - the names from get_step_dim_name() or - get_domain_dim_names(). */) const =0; + /* Advanced APIs for yk_solution found below are not needed for most applications. */ /// **[Advanced]** Set the minimum amount of padding for all vars. /** @@ -651,50 +861,6 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_domain_dim_names(). */) const =0; - /// **[Advanced]** Restart or disable the auto-tuner on this rank. - /** - Under normal operation, an auto-tuner is invoked automatically during calls to - run_solution(). - Currently, only the block size is set by the auto-tuner, and the search begins from the - sizes set via set_block_size() or the default size if set_block_size() has - not been called. - This function is used to apply the current best-known settings if the tuner has - been running, reset the state of the auto-tuner, and either - restart its search or disable it from running. - This call must be made on each rank where the change is desired. - */ - virtual void - reset_auto_tuner(bool enable - /**< [in] If _true_, start or restart the auto-tuner search. 
- If _false_, disable the auto-tuner from running. */, - bool verbose = false - /**< [in] If _true_, print progress information to the debug object - set via set_debug_output(). */ ) =0; - - /// **[Advanced]** Automatically tune selected settings immediately. - /** - Executes a search algorithm to find [locally] optimum values for some of the - settings. - Under normal operation, an auto-tuner is invoked during calls to - run_solution(). - See reset_auto_tuner() for more information. - This function causes the stencil solution to be run immediately - until the auto-tuner converges on all ranks. - It is useful for benchmarking, where performance is to be timed - for a given number of steps after the best settings are found. - This function should be called only *after* calling prepare_solution(). - This call must be made on each rank. - @warning Modifies the contents of the vars by calling run_solution() - an arbitrary number of times, but without halo exchange. - (See run_solution() for other restrictions and warnings.) - Thus, var data should be set *after* calling this function when - used in a production or test setting where correct results are expected. - */ - virtual void - run_auto_tuner_now(bool verbose = true - /**< [in] If _true_, print progress information to the debug object - set via set_debug_output(). */ ) =0; - /// **[Advanced]** Add a new var to the solution. /** This is typically not needed because vars used by the stencils are pre-defined @@ -748,7 +914,7 @@ namespace yask { new_var(const std::string& name /**< [in] Name of the var; must be unique within the solution. */, - const std::vector& dims + const string_vec& dims /**< [in] List of names of all dimensions. Names must be valid C++ identifiers and not repeated within this var. */ ) =0; @@ -823,11 +989,11 @@ namespace yask { new_fixed_size_var(const std::string& name /**< [in] Name of the var; must be unique within the solution. 
*/, - const std::vector& dims + const string_vec& dims /**< [in] List of names of all dimensions. Names must be valid C++ identifiers and not repeated within this var. */, - const std::vector& dim_sizes + const idx_t_vec& dim_sizes /**< [in] Initial allocation in each dimension. Must be exatly one size for each dimension. */ ) =0; @@ -847,7 +1013,7 @@ namespace yask { /**< [in] List of names of all dimensions. Names must be valid C++ identifiers and not repeated within this var. */, - const std::initializer_list& dim_sizes + const idx_t_init_list& dim_sizes /**< [in] Initial allocation in each dimension. Must be exatly one size for each dimension. */ ) =0; #endif @@ -972,33 +1138,43 @@ namespace yask { virtual bool get_step_wrap() const =0; + /// ***[Deprecated]*** Use yk_env::set_debug_output(). + YASK_DEPRECATED + virtual void + set_debug_output(yask_output_ptr debug) =0; + /// **[Deprecated]** Use get_num_vars(). + YASK_DEPRECATED inline int get_num_grids() const { return get_num_vars(); } /// **[Deprecated]** Use get_var(). + YASK_DEPRECATED inline yk_var_ptr get_grid(const std::string& name) { return get_var(name); } /// **[Deprecated]** Use get_vars(). + YASK_DEPRECATED inline std::vector get_grids() { return get_vars(); } /// **[Deprecated]** Use new_var(). + YASK_DEPRECATED inline yk_var_ptr new_grid(const std::string& name, - const std::vector& dims) { + const string_vec& dims) { return new_var(name, dims); } #ifndef SWIG /// **[Deprecated]** Use new_var(). + YASK_DEPRECATED inline yk_var_ptr new_grid(const std::string& name, const std::initializer_list& dims) { @@ -1007,24 +1183,27 @@ namespace yask { #endif /// **[Deprecated]** Use new_fixed_size_var(). 
+ YASK_DEPRECATED inline yk_var_ptr new_fixed_size_grid(const std::string& name, - const std::vector& dims, - const std::vector& dim_sizes) { + const string_vec& dims, + const idx_t_vec& dim_sizes) { return new_fixed_size_var(name, dims, dim_sizes); } #ifndef SWIG /// **[Deprecated]** Use new_fixed_size_var(). + YASK_DEPRECATED inline yk_var_ptr new_fixed_size_grid(const std::string& name, const std::initializer_list& dims, - const std::vector& dim_sizes) { + const idx_t_vec& dim_sizes) { return new_fixed_size_var(name, dims, dim_sizes); } #endif /// **[Deprecated]** Use fuse_vars(). + YASK_DEPRECATED inline void fuse_grids(yk_solution_ptr source) { fuse_vars(source); diff --git a/include/aux/yk_var_api.hpp b/include/aux/yk_var_api.hpp index 109de934..9b2acc03 100644 --- a/include/aux/yk_var_api.hpp +++ b/include/aux/yk_var_api.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -144,7 +144,7 @@ namespace yask {
                                                                                                  overall problem domain
                                                                                                  right padding of rank Z
                                                                                                  - The intermediate halos and paddings also exist, but are not shown in the above diagram. + Halos and paddings between ranks also exist, but are not shown in the above diagram. The halos overlap the domains of adjacent ranks. For example, the left halo of rank B in the diagram would overlap the domain of rank A. Data in these overlapped areas are exchanged as needed during stencil application @@ -198,7 +198,8 @@ namespace yask { /// Get the number of dimensions used in this var. /** This may include domain, step, and/or miscellaneous dimensions. - @returns Number of dimensions created via yc_solution::new_var(), + @returns Number of dimensions declared in the stencil code + or created via yc_solution::new_var(), yk_solution::new_var(), or yk_solution::new_fixed_size_var(). */ virtual int get_num_dims() const =0; @@ -208,9 +209,17 @@ namespace yask { This may include domain, step, and/or miscellaneous dimensions. @returns List of names of all the dimensions. */ - virtual std::vector + virtual string_vec get_dim_names() const =0; + /// Get the number of _domain_ dimensions used in this var. + /** + @returns Number of domain dimensions declared in the stencil code + or created via yc_solution::new_var(), + yk_solution::new_var(), or yk_solution::new_fixed_size_var(). + */ + virtual int get_num_domain_dims() const =0; + /// Determine whether specified dimension exists in this var. /** @returns `true` if dimension exists (including step-dimension), @@ -230,9 +239,9 @@ namespace yask { /** This is a convenience function that provides the first possible index in any var dimension regardless of the dimension type. - It is equivalent to - get_first_rank_alloc_index(dim) when `dim` is - a domain dimension, get_first_misc_index(dim) + If `dim` is a domain dimension, returns the first accessible index + in the left padding area. 
+ It is equivalent to get_first_misc_index(dim) for a misc dimension, and get_first_valid_step_index() for the step dimension. @note This function should be called only *after* calling prepare_solution() @@ -244,13 +253,21 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_dim_names(). */ ) const =0; + /// Get the first valid index in this rank in all dimensions in this var. + /** + See get_first_local_index(). + @returns vector of first valid indices. + */ + virtual idx_t_vec + get_first_local_index_vec() const =0; + /// Get the last index in this rank in the specified dimension. /** This is a convenience function that provides the last possible index in any var dimension regardless of the dimension type. - It is equivalent to - get_last_rank_alloc_index(dim) when `dim` is - a domain dimension, get_last_misc_index(dim) + If `dim` is a domain dimension, returns the last accessible index + in the right padding area. + It is equivalent to get_last_misc_index(dim) for a misc dimension, and get_last_valid_step_index() for the step dimension. @note This function should be called only *after* calling prepare_solution() @@ -262,19 +279,35 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from get_dim_names(). */ ) const =0; + /// Get the last valid index in this rank in all dimensions in this var. + /** + See get_last_local_index(). + @returns vector of last valid indices. + */ + virtual idx_t_vec + get_last_local_index_vec() const =0; + /// Get the number of elements allocated in the specified dimension. /** For the domain dimensions, this includes the rank-domain and padding sizes. See the "Detailed Description" for \ref yk_var for information on var sizes. For any dimension `dim`, `get_alloc_size(dim) == get_last_local_index(dim) - get_first_local_index(dim) + 1`; - @returns allocation in number of elements (not bytes). + @returns allocation size in number of elements (not bytes). 
*/ virtual idx_t get_alloc_size(const std::string& dim /**< [in] Name of dimension to get. Must be one of the names from get_dim_names(). */ ) const =0; + /// Get the number of elements allocated in all dimensions in this var. + /** + See get_alloc_size(). + @returns vector of allocation sizes in number of elements (not bytes). + */ + virtual idx_t_vec + get_alloc_size_vec() const =0; + /// Get the first valid index in the step dimension. /** The valid step indices in a var are updated by calling yk_solution::run_solution() @@ -299,8 +332,10 @@ namespace yask { virtual idx_t get_last_valid_step_index() const =0; - /// Get the domain size for this rank. + /// Get the domain size for this rank in the specified dimension. /** + @note This function should be called only *after* calling prepare_solution() + because prepare_solution() assigns this rank's size. @returns The same value as yk_solution::get_rank_domain_size() if is_fixed_size() returns `false` or the fixed sized provided via yk_solution::new_fixed_size_var() otherwise. @@ -310,8 +345,17 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names(). */) const =0; + /// Get the domain size for this rank in all domain dimensions in this var. + /** + See get_rank_domain_size(). + @returns vector of values, one for each domain dimension in this var. + */ + virtual idx_t_vec + get_rank_domain_size_vec() const =0; + /// Get the first index of the sub-domain in this rank in the specified dimension. /** + Does _not_ include indices of padding area. @note This function should be called only *after* calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain. @returns The same value as yk_solution::get_first_rank_domain_index() if @@ -322,8 +366,17 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names(). 
*/ ) const =0; + /// Get the first index of the sub-domain in this rank in all domain dimensions in this var. + /** + See get_first_rank_domain_index(). + @returns vector of values, one for each domain dimension in this var. + */ + virtual idx_t_vec + get_first_rank_domain_index_vec() const =0; + /// Get the last index of the sub-domain in this rank in the specified dimension. /** + Does _not_ include indices of padding area. @note This function should be called only *after* calling prepare_solution() because prepare_solution() assigns this rank's position in the problem domain. @returns The same value as yk_solution::get_last_rank_domain_index() if @@ -335,6 +388,14 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names(). */ ) const =0; + /// Get the last index of the sub-domain in this rank in all domain dimensions in this var. + /** + See get_last_rank_domain_index(). + @returns vector of values, one for each domain dimension in this var. + */ + virtual idx_t_vec + get_last_rank_domain_index_vec() const =0; + /// Get the left halo size in the specified dimension. /** This value is typically set by the stencil compiler. @@ -370,6 +431,14 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names(). */ ) const =0; + /// Get the first index of the left halo in this rank in all domain dimensions in this var. + /** + See get_first_rank_halo_index(). + @returns vector of values, one for each domain dimension in this var. + */ + virtual idx_t_vec + get_first_rank_halo_index_vec() const =0; + /// Get the last index of the right halo in this rank in the specified dimension. /** @note This function should be called only *after* calling prepare_solution() @@ -383,6 +452,14 @@ namespace yask { /**< [in] Name of dimension to get. Must be one of the names from yk_solution::get_domain_dim_names(). 
*/ ) const =0; + /// Get the last index of the right halo in this rank in all domain dimensions in this var. + /** + See get_last_rank_halo_index(). + @returns vector of values, one for each domain dimension in this var. + */ + virtual idx_t_vec + get_last_rank_halo_index_vec() const =0; + /// Get the actual left padding in the specified dimension. /** The left padding is the memory allocated before @@ -466,16 +543,16 @@ namespace yask { `dim` in the var; `false` otherwise. */ virtual bool - are_indices_local(const std::vector& indices + are_indices_local(const idx_t_vec& indices /**< [in] List of indices, one for each var dimension. */ ) const =0; #ifndef SWIG /// Determine whether the given indices refer to an accessible element in this rank. /** - See get_last_misc_index(). + See are_indices_local(). */ virtual bool - are_indices_local(const std::initializer_list& indices + are_indices_local(const idx_t_init_list& indices /**< [in] List of indices, one for each var dimension. */ ) const =0; #endif @@ -489,7 +566,7 @@ namespace yask { @returns value in var at given indices. */ virtual double - get_element(const std::vector& indices + get_element(const idx_t_vec& indices /**< [in] List of indices, one for each var dimension. */ ) const =0; #ifndef SWIG @@ -499,7 +576,7 @@ namespace yask { @returns value in var at given indices. */ virtual double - get_element(const std::initializer_list& indices + get_element(const idx_t_init_list& indices /**< [in] List of indices, one for each var dimension. */ ) const =0; #endif @@ -525,7 +602,7 @@ namespace yask { */ virtual idx_t set_element(double val /**< [in] Element in var will be set to this. */, - const std::vector& indices + const idx_t_vec& indices /**< [in] List of indices, one for each var dimension. */, bool strict_indices = true /**< [in] If true, indices must be within domain or padding. @@ -540,7 +617,7 @@ namespace yask { */ virtual idx_t set_element(double val /**< [in] Element in var will be set to this. 
*/, - const std::initializer_list& indices + const idx_t_init_list& indices /**< [in] List of indices, one for each var dimension. */, bool strict_indices = true /**< [in] If true, indices must be within domain or padding. @@ -569,9 +646,9 @@ namespace yask { virtual idx_t get_elements_in_slice(void* buffer_ptr /**< [out] Pointer to buffer where values will be written. */, - const std::vector& first_indices + const idx_t_vec& first_indices /**< [in] List of initial indices, one for each var dimension. */, - const std::vector& last_indices + const idx_t_vec& last_indices /**< [in] List of final indices, one for each var dimension. */ ) const =0; /// Atomically add to the value of one var element. @@ -592,7 +669,7 @@ namespace yask { */ virtual idx_t add_to_element(double val /**< [in] This value will be added to element in var. */, - const std::vector& indices + const idx_t_vec& indices /**< [in] List of indices, one for each var dimension. */, bool strict_indices = true /**< [in] If true, indices must be within domain or padding. @@ -607,7 +684,7 @@ namespace yask { */ virtual idx_t add_to_element(double val /**< [in] This value will be added to element in var. */, - const std::initializer_list& indices + const idx_t_init_list& indices /**< [in] List of indices, one for each var dimension. */, bool strict_indices = true /**< [in] If true, indices must be within domain or padding. @@ -643,9 +720,9 @@ namespace yask { */ virtual idx_t set_elements_in_slice_same(double val /**< [in] All elements in the slice will be set to this. */, - const std::vector& first_indices + const idx_t_vec& first_indices /**< [in] List of initial indices, one for each var dimension. */, - const std::vector& last_indices + const idx_t_vec& last_indices /**< [in] List of final indices, one for each var dimension. */, bool strict_indices = true /**< [in] If true, indices must be within domain or padding. 
@@ -675,9 +752,9 @@ namespace yask { virtual idx_t set_elements_in_slice(const void* buffer_ptr /**< [out] Pointer to buffer where values will be read. */, - const std::vector& first_indices + const idx_t_vec& first_indices /**< [in] List of initial indices, one for each var dimension. */, - const std::vector& last_indices + const idx_t_vec& last_indices /**< [in] List of final indices, one for each var dimension. */ ) =0; #ifdef COPY_SLICE_IMPLEMENTED @@ -698,34 +775,34 @@ namespace yask { virtual idx_t set_elements_in_slice(const yk_var_ptr source /**< [in] Var from which elements will be read. */, - const std::vector& first_source_indices + const idx_t_vec& first_source_indices /**< [in] List of starting indices in the source var, one for each var dimension. */, - const std::vector& first_target_indices + const idx_t_vec& first_target_indices /**< [in] List of starting indices in this (target) var, one for each var dimension. */, - const std::vector& last_target_indices + const idx_t_vec& last_target_indices /**< [in] List of final indices in this (target) var, one for each var dimension. */ ) =0; #endif - /// Format the indices for pretty-printing. + /// Format the indices for human-readable display. /** Provide indices in a list in the same order returned by get_dim_names(). @returns A string containing the var name and the index values. */ virtual std::string - format_indices(const std::vector& indices + format_indices(const idx_t_vec& indices /**< [in] List of indices, one for each var dimension. */ ) const =0; #ifndef SWIG - /// Format the indices for pretty-printing. + /// Format the indices for human-readable display. /** See format_indices(). @returns A string containing the var name and the index values. */ virtual std::string - format_indices(const std::initializer_list& indices + format_indices(const idx_t_init_list& indices /**< [in] List of indices, one for each var dimension. 
*/ ) const =0; #endif @@ -754,6 +831,8 @@ namespace yask { /// **[Advanced]** Set the maximum L1-norm of a neighbor rank for halo exchange. /** + This should only be used to override the value calculated automatically by + the YASK compiler. @see get_halo_exchange_l1_norm(). */ virtual void @@ -937,28 +1016,6 @@ namespace yask { idx_t idx /**< [in] New value for first index. May be negative. */ ) =0; - /// **[Advanced]** Get the first accessible index in this var in this rank in the specified domain dimension. - /** - Equivalent to get_first_local_index(dim), where `dim` is a domain dimension. - @returns First valid index in this var. - */ - virtual idx_t - get_first_rank_alloc_index(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - - /// **[Advanced]** Get the last accessible index in this var in this rank in the specified domain dimension. - /** - Equivalent to get_last_local_index(dim), where `dim` is a domain dimension. - @returns Last valid index in this var. - */ - virtual idx_t - get_last_rank_alloc_index(const std::string& dim - /**< [in] Name of dimension to get. - Must be one of - the names from yk_solution::get_domain_dim_names(). */ ) const =0; - /// **[Advanced]** Determine whether storage has been allocated. /** @returns `true` if storage has been allocated, @@ -1070,7 +1127,7 @@ namespace yask { all elements of a var via its raw buffer, e.g., add some constant value to all elements. - If the layouts of two vars are identical, you can use their - raw buffers to copy or compare the var contents for equality or + raw buffers to copy all data from one to the other or perform element-wise binary mathematical operations on them, e.g., add all elements from one var to another. @@ -1079,6 +1136,7 @@ namespace yask { index and that element's offset from the beginning of the buffer such as row-major or column-major layout. 
- All elements in the buffer are part of the rank domain or halo. + - All elements in the buffer contain valid floating-point values. Thus, - You should not perform any operations dependent on @@ -1090,50 +1148,25 @@ namespace yask { */ virtual void* get_raw_storage_buffer() =0; - /* Deprecated APIs for yk_var found below should be avoided. - Use the more explicit form found in the documentation. */ - - /// **[Deprecated]** Use get_left_halo_size() and get_right_halo_size(). - inline idx_t - get_halo_size(const std::string& dim) const { - return get_left_halo_size(dim); - } - /// **[Deprecated]** Use get_left_pad_size() and get_right_pad_size(). - inline idx_t - get_pad_size(const std::string& dim) const { - return get_left_pad_size(dim); - } - /// **[Deprecated]** Use get_left_extra_pad_size() and get_right_extra_pad_size(). - inline idx_t - get_extra_pad_size(const std::string& dim) const { - return get_left_extra_pad_size(dim); - } - /// **[Deprecated]** Use are_indices_local(). - inline bool - is_element_allocated(const std::vector& indices - /**< [in] List of indices, one for each var dimension. */ ) const { - return are_indices_local(indices); - } - -#ifndef SWIG - /// **[Deprecated]** Use are_indices_local(). - inline bool - is_element_allocated(const std::initializer_list& indices - /**< [in] List of indices, one for each var dimension. */ ) const { - return are_indices_local(indices); + /// **[Deprecated]** Use get_first_local_index(). + YASK_DEPRECATED + virtual idx_t + get_first_rank_alloc_index(const std::string& dim) const { + return get_first_local_index(dim); } -#endif - /// **[Deprecated]** Use fuse_vars(). - inline void - fuse_grids(yk_var_ptr source) { - fuse_vars(source); + /// **[Deprecated]** Use get_last_local_index(). + YASK_DEPRECATED + virtual idx_t + get_last_rank_alloc_index(const std::string& dim) const { + return get_last_local_index(dim); } }; // yk_var. /// **[Deprecated]** Use yk_var. 
+ YASK_DEPRECATED typedef yk_var yk_grid; /** @}*/ diff --git a/include/yask_common_api.hpp b/include/yask_common_api.hpp index 8dbbae8a..0ac93758 100644 --- a/include/yask_common_api.hpp +++ b/include/yask_common_api.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -31,6 +31,10 @@ IN THE SOFTWARE. #pragma once +#include +#include +#include +#include #include #include #include @@ -39,6 +43,21 @@ IN THE SOFTWARE. #include #include +// Things SWIG can't handle. +#ifdef SWIG +#ifndef YASK_DEPRECATED +#define YASK_DEPRECATED +#endif +#define YASK_INT64_T long int +#else +/// Deprecated attribute. +#ifndef YASK_DEPRECATED +#define YASK_DEPRECATED [[deprecated]] +#endif +/// Signed 64-bit int. +#define YASK_INT64_T std::int64_t +#endif + namespace yask { /** @@ -55,11 +74,20 @@ namespace yask { /// Type to use for indexing grids. /** Index types are signed to allow negative indices in padding/halos. */ -#ifdef SWIG - typedef long int idx_t; // SWIG doesn't seem to understand int64_t. -#else - typedef std::int64_t idx_t; -#endif + typedef YASK_INT64_T idx_t; + + /// Vector of indices. + typedef std::vector idx_t_vec; + + /// Initializer list of indices. + /** + @note This type is not available in the Python API. + Use `idx_t_vec` instead. + */ + typedef std::initializer_list idx_t_init_list; + + /// Vector of strings. + typedef std::vector string_vec; // Forward declarations of class-pointers. 
diff --git a/include/yask_compiler_api.hpp b/include/yask_compiler_api.hpp index 2e0ad70b..fcd3cdc0 100644 --- a/include/yask_compiler_api.hpp +++ b/include/yask_compiler_api.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -182,7 +182,7 @@ namespace yask { avx | YASK kernel for CORE AVX ISA. avx2 | YASK kernel for CORE AVX2 ISA. avx512 | YASK kernel for CORE AVX-512 ISA. - avx512lo| YASK kernel for CORE AVX-512 ISA with 256-bit SIMD. + avx512-ymm | YASK kernel for CORE AVX-512 ISA with 256-bit SIMD. knl | YASK kernel for MIC AVX-512 ISA. knc | YASK kernel for Knights Corner ISA. dot | DOT-language description. @@ -251,7 +251,7 @@ namespace yask { /**< [in] Dimensions of the var. Each dimension is identified by an associated index. */ ) =0; -#ifndef SWIG + #ifndef SWIG /// Create an n-dimensional variable in the solution. /** C++ initializer-list version with same semantics as @@ -266,7 +266,7 @@ namespace yask { const std::initializer_list& dims /**< [in] Dimensions of the var. Each dimension is identified by an associated index. */ ) =0; -#endif + #endif /// Create an n-dimensional scratch variable in the solution. /** @@ -292,7 +292,7 @@ namespace yask { /**< [in] Dimensions of the var. Each dimension is identified by an associated index. */ ) =0; -#ifndef SWIG + #ifndef SWIG /// Create an n-dimensional scratch variable in the solution. /** C++ initializer-list version with same semantics as @@ -308,7 +308,7 @@ namespace yask { const std::initializer_list& dims /**< [in] Dimensions of the var. Each dimension is identified by an associated index. */ ) =0; -#endif + #endif /// Get the number of vars in the solution. 
/** @@ -409,10 +409,6 @@ namespace yask { int level) =0; /// Set the prefetch distance for the given cache. - /** - If the prefetch distance is not set for a given cache, - a default will be used based on the target format. - */ virtual void set_prefetch_dist(/** [in] Cache level: 1 or 2. */ int level, @@ -437,7 +433,7 @@ namespace yask { /**< [out] Pointer to object to receive formatted output. See \ref yask_output_factory. */) =0; -#ifndef SWIG + #ifndef SWIG /// **[Advanced]** Callback type for call_before_output(). typedef std::function output_hook_t; @@ -458,7 +454,7 @@ namespace yask { virtual void call_before_output(/** [in] callback function */ output_hook_t hook_fn) =0; -#endif + #endif /// **[Advanced]** Add block of custom C++ code to the kernel solution. /** @@ -495,7 +491,7 @@ namespace yask { Allows writing the code without the surrounding quotes, making it easier to format in many editors and IDEs (and perhaps look somewhat like a lambda function). */ -#define CALL_AFTER_NEW_SOLUTION(...) call_after_new_solution(#__VA_ARGS__) + #define CALL_AFTER_NEW_SOLUTION(...) call_after_new_solution(#__VA_ARGS__) /// **[Advanced]** Explicitly define and order the domain dimensions used in the solution. /** @@ -511,7 +507,7 @@ namespace yask { set_domain_dims(const std::vector& dims /**< [in] Domain dimensions of the solution. */ ) =0; -#ifndef SWIG + #ifndef SWIG /// **[Advanced]** Explicitly define and order the domain dimensions used in the solution. /** C++ initializer-list version with same semantics as @@ -521,7 +517,7 @@ namespace yask { virtual void set_domain_dims(const std::initializer_list& dims /**< [in] Domain dimensions of the solution. */ ) =0; -#endif + #endif /// **[Advanced]** Explicitly identify the step dimension in the solution. /** @@ -640,6 +636,7 @@ namespace yask { clear_dependencies() =0; /// **[Deprecated]** Use set_target() and output_solution(). 
+ YASK_DEPRECATED inline void format(const std::string& format_type, yask_output_ptr output) { @@ -648,50 +645,57 @@ namespace yask { } /// **[Deprecated]** Use new_var(). + YASK_DEPRECATED inline yc_var_ptr new_grid(const std::string& name, const std::vector& dims) { return new_var(name, dims); } -#ifndef SWIG + #ifndef SWIG /// **[Deprecated]** Use new_var(). - inline yc_var_ptr + YASK_DEPRECATED + inline yc_var_ptr new_grid(const std::string& name, const std::initializer_list& dims) { return new_var(name, dims); } -#endif + #endif /// **[Deprecated]** Use new_scratch_var(). + YASK_DEPRECATED inline yc_var_ptr new_scratch_grid(const std::string& name, const std::vector& dims) { return new_scratch_var(name, dims); } -#ifndef SWIG + #ifndef SWIG /// **[Deprecated]** Use new_scratch_var(). + YASK_DEPRECATED inline yc_var_ptr new_scratch_grid(const std::string& name, const std::initializer_list& dims) { return new_scratch_var(name, dims); } -#endif + #endif /// **[Deprecated]** Use get_num_vars(). + YASK_DEPRECATED inline int get_num_grids() const { return get_num_vars(); } /// **[Deprecated]** Use get_vars(). + YASK_DEPRECATED inline std::vector get_grids() { return get_vars(); } /// **[Deprecated]** Use get_var(). + YASK_DEPRECATED inline yc_var_ptr get_grid(const std::string& name) { return get_var(name); @@ -728,7 +732,7 @@ namespace yask { in the \ref yc_solution. @returns List of names of all the dimensions used in this var. */ - virtual std::vector + virtual string_vec get_dim_names() const =0; /// Create a reference to a point in this var. @@ -743,7 +747,7 @@ namespace yask { These must appear in the same order as when the var was created. */ ) =0; -#ifndef SWIG + #ifndef SWIG /// Create a reference to a point in this var. 
/** C++ initializer-list version with same semantics as @@ -756,38 +760,7 @@ namespace yask { */ virtual yc_var_point_node_ptr new_var_point(const std::initializer_list& index_exprs) = 0; -#endif - - /// Create a reference to a point in this var using relative offsets. - /** - A shorthand function for calling new_var_point() when - all index expressions are constant offsets. - Each offset refers to the dimensions defined when the - var was created via yc_solution::new_var(). - - Example: if `g = new_var("data", {t, x, y})` with step-dimension `t` - and domain-dimensions `x` and `y`, - `g->new_relative_var_point({1, -1, 0})` refers to the same point as - `g->new_var_point({t + 1, x - 1, y})`. - - @warning This convenience function can only be used when every - dimension of the var is either the step dimension or a domain dimension. - If this is not the case, use new_var_point(). - @returns Pointer to AST node used to read from or write to point in var. */ - virtual yc_var_point_node_ptr - new_relative_var_point(const std::vector& dim_offsets - /**< [in] offset from evaluation index in each dim. */ ) =0; - -#ifndef SWIG - /// Create a reference to a point in this var using relative offsets. - /** - C++ initializer-list version with same semantics as - the vector version of new_relative_var_point(). - @note Not available in the Python API. Use the vector version. - @returns Pointer to AST node used to read or write from point in var. */ - virtual yc_var_point_node_ptr - new_relative_var_point(const std::initializer_list& dim_offsets) = 0; -#endif + #endif /// **[Advanced]** Get whether the allocation of the step dimension of this var can be modified at run-time. /** @@ -824,25 +797,45 @@ namespace yask { /**< [in] Number of elements to allocate in the step dimension. */) =0; /// **[Deprecated]** Use new_var_point(). 
+ YASK_DEPRECATED + virtual yc_var_point_node_ptr + new_relative_var_point(const std::vector& dim_offsets) =0; + #ifndef SWIG + /// **[Deprecated]** Use new_var_point(). + YASK_DEPRECATED + virtual yc_var_point_node_ptr + new_relative_var_point(const std::initializer_list& dim_offsets) = 0; + #endif + + /// **[Deprecated]** Use new_var_point(). + YASK_DEPRECATED inline yc_var_point_node_ptr new_grid_point(const std::vector& index_exprs) { return new_var_point(index_exprs); } + #ifndef SWIG /// **[Deprecated]** Use new_var_point(). + YASK_DEPRECATED inline yc_var_point_node_ptr new_grid_point(const std::initializer_list& index_exprs) { return new_var_point(index_exprs); } + #endif + /// **[Deprecated]** Use new_relative_var_point(). + YASK_DEPRECATED inline yc_var_point_node_ptr new_relative_grid_point(const std::vector& dim_offsets) { return new_relative_var_point(dim_offsets); } + #ifndef SWIG /// **[Deprecated]** Use new_relative_var_point(). + YASK_DEPRECATED inline yc_var_point_node_ptr new_relative_grid_point(const std::initializer_list& dim_offsets) { return new_relative_var_point(dim_offsets); } + #endif }; // yc_var. @@ -906,7 +899,7 @@ namespace yask { _var = soln->new_var(name, dims); } -#ifndef SWIG + #ifndef SWIG /// Contructor taking an initializer_list of index vars. /** A wrapper around yc_solution::new_var() and @@ -928,7 +921,7 @@ namespace yask { else _var = soln->new_var(name, dims); } -#endif + #endif /// Contructor for a simple scalar value. /** @@ -974,7 +967,7 @@ namespace yask { return _var->new_var_point(index_exprs); } -#ifndef SWIG + #ifndef SWIG /// Create an expression for a point in a var. /** A wrapper around yc_var::new_var_point(). @@ -1043,18 +1036,22 @@ namespace yask { return _var->new_var_point({i1}); } -#endif + #endif }; // yc_var_proxy. /** @}*/ /// **[Deprecated]** Use yc_var. + YASK_DEPRECATED typedef yc_var yc_grid; /// **[Deprecated]** Use yc_var_ptr. 
+ YASK_DEPRECATED typedef yc_var_ptr yc_grid_ptr; /// **[Deprecated]** Use yc_var_point_node. + YASK_DEPRECATED typedef yc_var_point_node yc_grid_point_node; /// **[Deprecated]** Use yc_var_point_node_ptr. + YASK_DEPRECATED typedef yc_var_point_node_ptr yc_grid_point_node_ptr; } // namespace yask. diff --git a/include/yask_kernel_api.hpp b/include/yask_kernel_api.hpp index 731464b1..42e78ba4 100644 --- a/include/yask_kernel_api.hpp +++ b/include/yask_kernel_api.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -169,25 +169,54 @@ namespace yask { virtual ~yk_env() {} /// Set object to receive debug output. - virtual void + /** + This is a static method, implying the following: + - This setting may be changed before creating a `yk_env` object. + - Calling this method applies settings globally. + */ + static void set_debug_output(yask_output_ptr debug /**< [out] Pointer to object to receive debug output. - See \ref yask_output_factory. */ ) =0; + See \ref yask_output_factory. */ ); + + /// Disable the debug output. + /** + Shortcut for calling `set_debug_output()` with a `yask_null_output_ptr`; + */ + static void + disable_debug_output(); /// Get object to receive debug output. /** - Returns pointer to \ref yask_output set via set_debug_output + This is a static method, implying the following: + - This method may be called before creating a `yk_env` object. + + @returns Pointer to \ref yask_output set via set_debug_output or pointer to a \ref yask_stdout_output if not set. */ - virtual yask_output_ptr - get_debug_output() const =0; + static yask_output_ptr + get_debug_output(); /// Enable or disable additional debug tracing. 
/** + This is a static method, implying the following: + - This setting may be changed before creating a `yk_env` object. + - Calling this method applies settings globally. + Must also compile with general tracing and/or memory-access tracing enabled. */ - virtual void - set_trace_enabled(bool enable) =0; + static void + set_trace_enabled(bool enable); + + /// Get whether tracing is enabled. + /** + This is a static method, implying the following: + - This method may be called before creating a `yk_env` object. + + @returns Whether tracing is enabled. + */ + static bool + is_trace_enabled(); /// Get number of MPI ranks. /** @@ -212,8 +241,10 @@ namespace yask { }; // yk_env. /// **[Deprecated]** Use yk_var. + YASK_DEPRECATED typedef yk_var yk_grid; /// **[Deprecated]** Use yk_var_ptr. + YASK_DEPRECATED typedef yk_var_ptr yk_grid_ptr; /** @}*/ diff --git a/src/common/combo.cpp b/src/common/combo.cpp index f43c1827..4640056a 100644 --- a/src/common/combo.cpp +++ b/src/common/combo.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -64,15 +64,16 @@ namespace yask { } // Get the 'r'th set of 'k' elements from set of integers between '0' and 'n-1'. - // Returns vector of 'k' values. + // Returns bitmask with 'k' bits set, where bit 'b' = 1 indicates integer 'b' is in set. // 'r' must be between '0' and 'n_choose_k(n, k)-1'. - vector n_choose_k_set(int n, int k, int r) { + size_t n_choose_k_set(int n, int k, int r) { assert(n >= 0); assert(k >= 0); assert(r >= 0); assert(r < n_choose_k(n, k)); + assert(size_t(n) <= sizeof(size_t) * 8); - vector c; + size_t c = 0; // Empty set. if (n <= 0 || k <= 0) @@ -80,23 +81,31 @@ namespace yask { // Pick one item. 
if (k == 1) { - c.push_back(r); + set_bit(c, r); return c; } // Pick k items. + int ca[sizeof(size_t)]; int j = 0; for (int i = 0; i < k-1; i++) { - c.push_back((i == 0) ? -1 : c[i-1]); + ca[i] = (i == 0) ? -1 : ca[i-1]; while (true) { - c[i]++; - int nc = n_choose_k(n - (c[i]+1), k - (i+1)); + ca[i]++; + int nc = n_choose_k(n - (ca[i]+1), k - (i+1)); if (j + nc >= r + 1) break; j += nc; } } - c.push_back(c[k-2] + r - j + 1); + ca[k-1] = ca[k-2] + r - j + 1; + + for (int i = 0; i < k; i++) { + assert(ca[i] >= 0); + assert(ca[i] < n); + set_bit(c, ca[i]); + } + return c; } diff --git a/src/common/combo.hpp b/src/common/combo.hpp index 11bad3cd..96f3224d 100644 --- a/src/common/combo.hpp +++ b/src/common/combo.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -28,7 +28,7 @@ IN THE SOFTWARE. // Include this first to assure NDEBUG is set properly. #include "yask_assert.hpp" -#include +#include namespace yask { @@ -36,7 +36,27 @@ namespace yask { int n_choose_k(int n, int k); // Get the 'r'th set of 'k' elements from set of integers between '0' and 'n-1'. - // Returns vector of 'k' values. + // Returns bitmask with 'k' bits set, where bit 'b' = 1 indicates integer 'b' is in set. // 'r' must be between '0' and 'n_choose_k(n, k)-1'. - std::vector n_choose_k_set(int n, int k, int r); + size_t n_choose_k_set(int n, int k, int r); + + // Handle bits in a bitset. 
+ template + inline bool is_bit_set(const T set, int i) { + assert(i >= 0); + assert(size_t(i) <= sizeof(T) * 8); + return (set & (T(1) << i)) != 0; + } + template + inline void set_bit(T& set, int i) { + assert(i >= 0); + assert(size_t(i) <= sizeof(T) * 8); + set |= (T(1) << i); + } + template + inline void clear_bit(T& set, int i) { + assert(i >= 0); + assert(size_t(i) <= sizeof(T) * 8); + set &= ~(T(1) << i); + } } diff --git a/src/common/common.mk b/src/common/common.mk index ec244469..fe81569a 100644 --- a/src/common/common.mk +++ b/src/common/common.mk @@ -1,6 +1,6 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -23,6 +23,17 @@ # Common Makefile settings. # YASK_BASE should be set before including this. +# YASK_BASE and all the dirs based on it should be set with full (not relative) paths. + +# Name strings. +TIMESTAMP := $(shell date '+%Y-%m-%d') +HOSTNAME := $(shell hostname) + +# Vars for special strings. +empty := +space := $(empty) $(empty) +comma := , +colon := : # Set YASK_OUTPUT_DIR to change where all output files go. 
YASK_OUTPUT_DIR ?= $(YASK_BASE) @@ -41,17 +52,19 @@ LIB_OUT_DIR := $(YASK_OUT_BASE)/lib BIN_OUT_DIR := $(YASK_OUT_BASE)/bin BUILD_OUT_DIR := $(YASK_OUT_BASE)/build PY_OUT_DIR := $(YASK_OUT_BASE)/yask +TEST_LOG_OUT_DIR := $(YASK_OUT_BASE)/logs/tests.$(HOSTNAME).$(TIMESTAMP) # OS-specific ifeq ($(shell uname -o),Cygwin) SO_SUFFIX := .dll - RUN_PREFIX := env PATH="${PATH}:$(LIB_DIR):$(LIB_OUT_DIR):$(YASK_DIR):$(PY_OUT_DIR)" + RUN_PREFIX := env PATH="${PATH}:$(LIB_DIR):$(LIB_OUT_DIR):$(PY_OUT_DIR):$(YASK_DIR)" PYTHON := python3 else SO_SUFFIX := .so RUN_PREFIX := env I_MPI_DEBUG=+5 I_MPI_PRINT_VERSION=1 OMP_DISPLAY_ENV=VERBOSE KMP_VERSION=1 - PYTHON := python + PYTHON := python3 endif +SHELL := /bin/bash # Common source. COMM_DIR := $(SRC_DIR)/common @@ -71,24 +84,43 @@ YC_EXEC := $(BIN_OUT_DIR)/$(YC_BASE).exe YC_SRC_DIR := $(SRC_DIR)/compiler # Tools. +CXX := icpx SWIG := swig PERL := perl MKDIR := mkdir -p -v BASH := bash +INDENT := $(UTILS_BIN_DIR)/yask_indent.sh # Find include path needed for python interface. # NB: constructing string inside print() to work for python 2 or 3. PYINC := $(addprefix -I,$(shell $(PYTHON) -c 'import distutils.sysconfig; print(distutils.sysconfig.get_python_inc() + " " + distutils.sysconfig.get_python_inc(plat_specific=1))')) RUN_PYTHON := $(RUN_PREFIX) \ - env PYTHONPATH=$(LIB_DIR):$(LIB_OUT_DIR):$(YASK_DIR):$(PY_OUT_DIR):$(PYTHONPATH) $(PYTHON) + env PYTHONPATH=$(LIB_DIR):$(LIB_OUT_DIR):$(PY_OUT_DIR):$(YASK_DIR):$(PYTHONPATH) $(PYTHON) # Function to check for pre-defined compiler macro. +# Invokes compiler using 1st arg. +# Returns '1' if 2nd arg is defined, '0' if not. # Ex: "ifeq ($(call MACRO_DEF,$(CXX),__clang__),1)"... MACRO_DEF = $(shell $(1) -x c++ /dev/null -dM -E | grep -m 1 -c $(2)) -# Options to avoid warnings when compiling SWIG-generated code w/gcc. -SWIG_GCCFLAGS := -Wno-class-memaccess -Wno-stringop-overflow -Wno-stringop-truncation +# Function to run a command serially, even with parallel build. 
+SERIALIZE = exec {fd}>/tmp/$$USER.YASK.build-lock; \ + flock -x $$fd; \ + $(1) + +# Function to create a directory. +# Tries to avoid the possible race condition when calling mkdir in parallel. +# 1st arg is dir name. +# Ex: "$(call MK_DIR,path)" +MK_DIR = @ if [ \! -d $(1) ]; then \ + $(call SERIALIZE,$(MKDIR) $(1)); fi + +# Script to remove unsupported function in python 3.8+. +SWIG_PATCH := perl -i -n -e 'print unless /_PyObject_GC_UNTRACK/' + +# Options for compiling SWIG-generated code w/gcc. +SWIG_GCCFLAGS := -DYASK_DEPRECATED='' # Define deprecated macro used by SWIG. DBL_EPSILON_CXXFLAG := -DDBL_EPSILON=2.2204460492503131e-16 diff --git a/src/common/common_utils.cpp b/src/common/common_utils.cpp index 90217c15..6986b0bc 100644 --- a/src/common/common_utils.cpp +++ b/src/common/common_utils.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -42,8 +42,8 @@ namespace yask { // fields to allow proper alphanumeric sorting // for numbers above 9 (at least up to 99). - // Format: "major.minor.patch". - const string version = "3.05.07"; + // Format: "major.minor.patch[-alpha|-beta]". 
+ const string version = "4.00.00"; string yask_get_version_string() { return version; diff --git a/src/common/common_utils.hpp b/src/common/common_utils.hpp index 4272e48c..162d3c90 100644 --- a/src/common/common_utils.hpp +++ b/src/common/common_utils.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -55,7 +55,7 @@ inline void omp_set_num_threads(int n) { } inline void omp_set_nested(int n) { } inline int omp_get_max_active_levels() { return 1; } inline void omp_set_max_active_levels(int n) { } -inline int omp_get_level() { return 1; } +inline int omp_get_level() { return 0; } inline void omp_init_lock(omp_lock_t* p) { } inline bool omp_set_lock(omp_lock_t* p) { return true; } inline void omp_unset_lock(omp_lock_t* p) { } @@ -98,19 +98,80 @@ namespace yask { extern std::string make_num_str(idx_t num); extern std::string make_num_str(double num); + // Divide 'num' equally into 'nparts'. + // Returns the size of the 'n'th part, + // where 0 <= 'n' < 'nparts'. + // Example: + // div_equally_size_n(6, 4, 0) returns 2. + // div_equally_size_n(6, 4, 1) returns 2. + // div_equally_size_n(6, 4, 2) returns 1. + // div_equally_size_n(6, 4, 3) returns 1. + template + inline T div_equally_size_n(T num, T nparts, T n) { + host_assert(n >= 0); + host_assert(n < nparts); + T p = num / nparts; + T rem = num % nparts; + p += (n < rem) ? 1 : 0; + return p; + } + + // Divide 'num' equally into 'nparts'. + // Returns the *cumulative* sizes of the 0-'n'th parts, + // if 0 <= 'n' < 'nparts' and 0 if n < 0. + // The <0 case is handy for calculating the initial + // starting point when passing 'n'-1 and 'n'==0. 
+ // Example: + // div_equally_cumu_size_n(6, 4, -1) returns 0. + // div_equally_cumu_size_n(6, 4, 0) returns 2. + // div_equally_cumu_size_n(6, 4, 1) returns 4. + // div_equally_cumu_size_n(6, 4, 2) returns 5. + // div_equally_cumu_size_n(6, 4, 3) returns 6. + template + inline T div_equally_cumu_size_n(T num, T nparts, T n) { + if (n < 0) + return 0; + host_assert(n >= 0); + host_assert(n < nparts); + T p = (num / nparts) * (n + 1); + T rem = num % nparts; + p += (n < rem) ? (n + 1) : rem; + return p; + } + + // Divide 'num' equally into 'nparts'. + // Returns size of all parts. + // Example: div_equally_all_sizes(6, 4) returns <2, 2, 1, 1>. + template + inline std::vector div_equally_all_sizes(T num, T nparts) { + std::vector p(nparts, num / nparts); + for (T i = 0; i < num % nparts; i++) + p[i]++; + return p; + } + // A var that behaves like OMP_NUM_THREADS to specify the // default number of threads in each level. + // TODO: try to remove the need for these vars by using + // OMP APIs to discover the nesting levels and num threads. constexpr int yask_max_levels = 2; extern int yask_num_threads[]; // Get number of threads that will execute a yask_parallel_for() loop // based on the current OpenMP nesting level. - inline idx_t yask_get_num_threads() { + inline int yask_get_num_threads() { + + // Nested parallel regions. if (omp_get_max_active_levels() > 1 && - yask_num_threads[0] && yask_num_threads[1]) + yask_num_threads[0] > 0 && + yask_num_threads[1] > 1) return yask_num_threads[0] * yask_num_threads[1]; - else if (yask_num_threads[0]) + + // Single parallel region. + else if (yask_num_threads[0] > 0) return yask_num_threads[0]; + + // YASK thread vars not set; use OMP val. else return omp_get_num_threads(); } @@ -119,70 +180,141 @@ namespace yask { // 'start' will be 'begin', 'begin'+'stride', 'begin'+2*'stride', etc. // 'stop' will be 'begin'+'stride', etc. // 'thread_num' will be a unique number across the nested threads. 
+ //#define DEBUG_PAR_FOR inline void yask_parallel_for(idx_t begin, idx_t end, idx_t stride, std::function visitor) { if (end <= begin) return; + + // Number of iterations in canonical loop. + idx_t niter = CEIL_DIV(end - begin, stride); + #ifdef DEBUG_PAR_FOR + std::cout << "** yask_parallel_for: [" << begin << "..." << end << ") by " << stride << + ": " << niter << " iters\n"; + #endif + + // Only 1 value. + if (niter == 1) { + visitor(begin, end, 0); + return; + } -#ifndef _OPENMP - // Canonical loop. - for (idx_t i = 0; i < end; i += stride) { + #ifndef _OPENMP + // Canonical sequential loop. + for (idx_t i = begin; i < end; i += stride) { idx_t stop = std::min(i + stride, end); idx_t tn = omp_get_thread_num(); visitor(i, stop, tn); } -#else - // Non-nested. + #else + + // Non-nested parallel. if (omp_get_max_active_levels() < 2 || - !yask_num_threads[0] || !yask_num_threads[1]) { + yask_num_threads[0] <= 0 || + yask_num_threads[1] <= 1 || + niter <= yask_num_threads[0]) { - if (yask_num_threads[0]) + if (yask_num_threads[0] > 0) omp_set_num_threads(yask_num_threads[0]); -#pragma omp parallel for schedule(static) - for (idx_t i = 0; i < end; i += stride) { + #pragma omp parallel for schedule(static) + for (idx_t i = begin; i < end; i += stride) { idx_t stop = std::min(i + stride, end); idx_t tn = omp_get_thread_num(); visitor(i, stop, tn); } } - // Nested. + // Nested parallel. else { // Number of outer threads. - idx_t nthr = yask_num_threads[0]; - omp_set_num_threads(nthr); - - // Number of iterations in canonical loop. - idx_t niter = CEIL_DIV(end - begin, stride); - - // Num iters per outer thread. - idx_t niters_per_thr = CEIL_DIV(niter, nthr); + idx_t nthr0 = yask_num_threads[0]; + assert(nthr0 > 0); + omp_set_num_threads(nthr0); - // Outer parallel loop. -#pragma omp parallel for schedule(static) - for (idx_t n = 0; n < nthr; n++) { + // Outer parallel region. 
+ #pragma omp parallel + { + idx_t n0 = omp_get_thread_num(); // Calculate begin and end points for this thread. - idx_t tbegin = n * niters_per_thr * stride; - idx_t tend = std::min(end, tbegin + niters_per_thr * stride); - - // Set number of threads for the nested OMP loop. - idx_t tnthr = yask_num_threads[1]; - omp_set_num_threads(tnthr); - - // Inner parallel loop over elements. -#pragma omp parallel for schedule(static) - for (idx_t i = tbegin; i < tend; i += stride) { - idx_t stop = std::min(i + stride, end); - idx_t thread_num = n * tnthr + omp_get_thread_num(); - visitor(i, stop, thread_num); + idx_t tbegin = div_equally_cumu_size_n(niter, nthr0, n0 - 1) * stride; + idx_t tend = div_equally_cumu_size_n(niter, nthr0, n0) * stride; + + #ifdef DEBUG_PAR_FOR + #pragma omp critical + std::cout << "** outer thread " << n0 << ": [" << tbegin << "..." << tend << ") by " << + stride << "\n" << std::flush; + #endif + assert(tend >= tbegin); + + // Nothing to do? + if (tend <= tbegin) { + } + + // Only need one in this thread? + else if (tend - tbegin == 1) + visitor(tbegin, tend, n0); + + else { + + // Set number of threads for the nested OMP loop. + // (Doesn't seem to work w/g++ 8.2.0: just starts 1 nested + // thread if nthr0 > 1.) + idx_t nthr1 = yask_num_threads[1]; + assert(nthr1 > 1); + omp_set_num_threads(nthr1); + + #ifdef DEBUG_PAR_FOR + // Test OMP region w/o for loop. + #pragma omp parallel + { + idx_t n1 = omp_get_thread_num(); + idx_t thread_num = n0 * nthr1 + n1; + #pragma omp critical + std::cout << "** thread " << thread_num << + "(" << n0 << ":" << n1 << + ")\n" << std::flush; + } + #endif + + // Inner parallel loop over elements. 
+ #pragma omp parallel for schedule(static) + for (idx_t i = tbegin; i < tend; i += stride) { + idx_t stop = std::min(i + stride, tend); + idx_t n1 = omp_get_thread_num(); + idx_t thread_num = n0 * nthr1 + n1; + #ifdef DEBUG_PAR_FOR + #pragma omp critical + std::cout << "** thread " << thread_num << + "(" << n0 << ":" << n1 << + "): [" << i << "..." << stop << ") by " << + stride << "\n" << std::flush; + #endif + visitor(i, stop, thread_num); + } } } } -#endif + #endif + } + + // Sequential version of yask_parallel_for(). + inline void yask_for(idx_t begin, idx_t end, idx_t stride, + std::function visitor) { + if (end <= begin) + return; + + // Canonical sequential loop. + for (idx_t i = begin; i < end; i += stride) { + idx_t stop = std::min(i + stride, end); + idx_t tn = omp_get_thread_num(); + visitor(i, stop, tn); + } } + // Set that retains order of things added. // Or, vector that allows insertion if element doesn't exist. diff --git a/src/common/fd_coeff2.cpp b/src/common/fd_coeff2.cpp index eba4c16c..c501e226 100644 --- a/src/common/fd_coeff2.cpp +++ b/src/common/fd_coeff2.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/common/idiv.hpp b/src/common/idiv.hpp index b8a8ad07..2352b161 100644 --- a/src/common/idiv.hpp +++ b/src/common/idiv.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -51,7 +51,7 @@ namespace yask 
{ template inline T idiv_flr(T a, T b) { - assert(b); + host_assert(b); //return (a<0 ? a-(b-1) : a) / b; //return (a - (a<0 ? b-1 : 0)) / b; return (a + (a>>(sizeof(a)*8-1)) * (b-1)) / b; @@ -59,25 +59,25 @@ namespace yask { template inline T ceil_idiv_flr(T a, T b) { - assert(b); + host_assert(b); return idiv_flr(a + b - 1, b); } template inline T round_up_flr(T a, T b) { - assert(b); + host_assert(b); return (ceil_idiv_flr(a, b) * b); } template inline T round_down_flr(T a, T b) { - assert(b); + host_assert(b); return (idiv_flr(a, b) * b); } template inline T imod_flr(T a, T b) { - assert(b); + host_assert(b); //return ((a % b) + b) % b; //return ((a < 0) ? ((a % b) + b) : a) % b; //T c = a % b; return (c < 0) ? c + b : c; diff --git a/src/common/output.cpp b/src/common/output.cpp index 876ff9a1..434fcb9c 100644 --- a/src/common/output.cpp +++ b/src/common/output.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -140,6 +140,5 @@ namespace yask { assert(p.get()); return p; } - } diff --git a/src/common/swig/yask_common_api.i b/src/common/swig/yask_common_api.i index f679efd8..2fd11e68 100644 --- a/src/common/swig/yask_common_api.i +++ b/src/common/swig/yask_common_api.i @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/common/tests/combo_test.cpp b/src/common/tests/combo_test.cpp index 623400cb..59b11533 100644 --- 
a/src/common/tests/combo_test.cpp +++ b/src/common/tests/combo_test.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -30,42 +30,43 @@ IN THE SOFTWARE. #include "combo.hpp" #include +#include using namespace std; using namespace yask; int main(int argc, char** argv) { - int n = 5; - - int exnc[n+2] = { 1, 5, 10, 10, 5, 1, 0 }; + constexpr int n = 5; + int exnc[n+2] = { 1, 5, 10, 10, 5, 1, 0 }; // expected num combos. for (int k = 0; k <= n+1; k++) { - int nc = n_choose_k(n,k); + + // Num combos. + int nc = n_choose_k(n, k); cout << "choose(" << n << ", " << k << ") = " << nc << endl; assert(nc == exnc[k]); - - vector> cvv; - for (int r = 0; r < nc; r++) { - auto cv = n_choose_k_set(n, k, r); - cout << " combo #" << r << " = "; - assert(cv.size() == (size_t)k); - for (int i = 0; i < k; i++) { - cout << " " << cv[i]; - assert(cv[i] >= 0); - assert(cv[i] < n); - // Make sure this element is unique and in order. - if (i > 0) - assert(cv[i] > cv[i-1]); + // Each combo. + vector cv; + for (int r = 0; r < nc; r++) { + auto cmask = n_choose_k_set(n, k, r); + cout << " combo #" << r << " ="; + int nset = 0; + for (int i = 0; i < n; i++) { + if (is_bit_set(cmask, i)) { + cout << " " << i; + nset++; + } } - cout << endl; + cout << " (0x" << hex << cmask << dec << ")\n"; + assert(nset == k); // Make sure this set is unique. 
- for (size_t i = 0; i < cvv.size(); i++) { - auto& cvi = cvv[i]; - assert(cv != cvi); + for (size_t i = 0; i < cv.size(); i++) { + auto& cvi = cv[i]; + assert(cmask != cvi); } - cvv.push_back(cv); + cv.push_back(cmask); } } return 0; diff --git a/src/common/tests/tuple_test.cpp b/src/common/tests/tuple_test.cpp index c753482d..d23e01fa 100644 --- a/src/common/tests/tuple_test.cpp +++ b/src/common/tests/tuple_test.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -40,7 +40,7 @@ void ttest(bool first_inner) { t1.set_first_inner(first_inner); t1.add_dim_back("x", 3); t1.add_dim_back("y", 4); - assert(t1._get_num_dims() == 2); + assert(t1.get_num_dims() == 2); assert(t1[0] == 3); assert(t1[1] == 4); assert(t1["x"] == 3); @@ -87,55 +87,85 @@ void ttest(bool first_inner) { } } + // Test w/increasing number of dims. 
for (int d = 0; d <= 3; d++) { IntTuple t2; if (d > 0) t2.add_dim_back("x", 3); if (d > 1) t2.add_dim_back("y", 4); - if (d > 2) t2.add_dim_back("z", 3); + if (d > 2) t2.add_dim_back("z", 5); + auto n = t2.product(); os << d << "-d sequential visit test...\n"; j = 0; + size_t sumk = 0; t2.visit_all_points ([&](const IntTuple& ofs, size_t k) { - auto i = t2.layout(ofs); - os << " offset at " << ofs.make_dim_val_str() << " = " << i << endl; - - if (first_inner) { - assert(i == j); - assert(i == k); - } - j++; - return true; - }); - assert(int(j) == t2.product()); - - os << d << "-d parallel visit test...\n"; - omp_set_nested(1); - omp_set_max_active_levels(2); - yask_num_threads[0] = 4; - yask_num_threads[1] = 2; - j = 0; - t2.visit_all_points_in_parallel - ([&](const IntTuple& ofs, size_t k) { - - auto i = t2.layout(ofs); -#pragma omp critical - { - os << " offset at " << ofs.make_dim_val_str() << " = " << i << endl; - j++; - } - - if (first_inner) - assert(i == k); - return true; - }); - assert(int(j) == t2.product()); + os << " offset at " << ofs.make_dim_val_str() << flush; + for (int d1 = 0; d1 < d; d1++) { + assert(ofs[d1] >= 0); + assert(ofs[d1] < t2[d1]); + } + auto i = t2.layout(ofs); + os << " = " << i << endl; + assert(int(k) < n); + assert(i == j); + assert(i == k); + j++; + sumk += k; + return true; // continue. + }); + assert(int(j) == n); + assert(int(sumk) == n * (n-1) / 2); + + // Test w/different num threads. 
+ for (int t0 : {1, 2, 3}) { + for (int t1 : {1, 2}) { + + os << d << "-d parallel visit test...\n"; + omp_set_max_active_levels(2); + yask_num_threads[0] = t0; + yask_num_threads[1] = t1; + os << "using " << t0 << " * " << t1 << " thread(s)\n"; + assert(t0 * t1 == yask_get_num_threads()); + j = 0; + sumk = 0; + t2.visit_all_points_in_parallel + ([&](const IntTuple& ofs, size_t k) { + + assert(int(k) < n); + auto i = t2.layout(ofs); + #pragma omp critical + { + os << " offset at " << ofs.make_dim_val_str() << " = " << i << endl; + j++; + sumk += k; + } + assert(i == k); + return true; + }); + assert(int(j) == n); + assert(int(sumk) == n * (n-1) / 2); + } + } } } int main(int argc, char** argv) { + + // Test some functions that tuples depend on. + assert(div_equally_size_n(6, 4, 0) == 2); + assert(div_equally_size_n(6, 4, 1) == 2); + assert(div_equally_size_n(6, 4, 2) == 1); + assert(div_equally_size_n(6, 4, 3) == 1); + assert(div_equally_cumu_size_n(6, 4, -1) == 0); + assert(div_equally_cumu_size_n(6, 4, 0) == 2); + assert(div_equally_cumu_size_n(6, 4, 1) == 4); + assert(div_equally_cumu_size_n(6, 4, 2) == 5); + assert(div_equally_cumu_size_n(6, 4, 3) == 6); + + // Test tuples. ttest(true); ttest(false); cout << "End of YASK tuple test.\n"; diff --git a/src/common/tuple.cpp b/src/common/tuple.cpp index 5d7a54ed..f71e78da 100644 --- a/src/common/tuple.cpp +++ b/src/common/tuple.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -25,6 +25,8 @@ IN THE SOFTWARE. ///////// Tuple implementation. +// See tuple.hpp for method documentation. 
+ #include "yask_common_api.hpp" #include "tuple.hpp" @@ -65,8 +67,8 @@ namespace yask { } template - const std::vector Tuple::get_dim_names() const { - std::vector names; + const string_vec Tuple::get_dim_names() const { + string_vec names; for (auto& i : _q) names.push_back(i._get_name()); return names; @@ -83,13 +85,15 @@ namespace yask { } } template - void Tuple::add_dim_front(const std::string& dim, const T& val) { + void Tuple::add_dim_at(int posn, const std::string& dim, const T& val) { auto* p = lookup(dim); if (p) *p = val; else { + assert(posn >= 0); + assert(posn <= int(_q.size())); Scalar sv(dim, val); - _q.insert(_q.begin(), sv); + _q.insert(_q.begin() + posn, sv); } } @@ -145,6 +149,7 @@ namespace yask { return true; } + // Returns true only if all dims and values are same. template bool Tuple::operator==(const Tuple& rhs) const { @@ -160,6 +165,8 @@ namespace yask { return true; } + // Not necessarily a meaningful less-than operator, but + // works for ordering sets, map keys, etc. template bool Tuple::operator<(const Tuple& rhs) const { if (size() < rhs.size()) return true; @@ -236,9 +243,9 @@ namespace yask { // Loop thru dims. int start_dim = _first_inner ? 0 : size()-1; - int end_dim = _first_inner ? size() : -1; + int stop_dim = _first_inner ? size() : -1; int step_dim = _first_inner ? 1 : -1; - for (int di = start_dim; di != end_dim; di += step_dim) { + for (int di = start_dim; di != stop_dim; di += step_dim) { auto& i = _q.at(di); //auto& dim = i._get_name(); size_t dsize = size_t(i.get_val()); @@ -265,7 +272,7 @@ namespace yask { // For some reason, copying *this and erasing // the element in newt._q causes an exception. 
Tuple newt; - for (int i = 0; i < _get_num_dims(); i++) { + for (int i = 0; i < get_num_dims(); i++) { if (i != posn) newt.add_dim_back(get_dim_name(i), get_val(i)); } @@ -323,8 +330,8 @@ namespace yask { template std::string Tuple::make_dim_val_offset_str(std::string separator, - std::string prefix, - std::string suffix) const { + std::string prefix, + std::string suffix) const { std::ostringstream oss; int n = 0; for (auto i : _q) { @@ -345,8 +352,8 @@ namespace yask { // Return a "compact" set of K factors of N. template - Tuple Tuple::get_compact_factors(idx_t N) const { - int K = _get_num_dims(); + Tuple Tuple::get_compact_factors(T N) const { + T K = get_num_dims(); // Keep track of "best" result, where the best is most compact. Tuple best; @@ -363,8 +370,8 @@ namespace yask { return *this; // already done. // Make list of factors of N. - vector facts; - for (idx_t n = 1; n <= N; n++) + vector facts; + for (T n = 1; n <= N; n++) if (N % n == 0) facts.push_back(n); @@ -430,7 +437,7 @@ namespace yask { break; // done. } // keep or not. - assert(best.size() == K); + assert(best.get_num_dims() == int(K)); assert(best.product() == N); return best; } diff --git a/src/common/tuple.hpp b/src/common/tuple.hpp index 2a987048..5739c28c 100644 --- a/src/common/tuple.hpp +++ b/src/common/tuple.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -136,8 +136,8 @@ namespace yask { std::vector> _q; // First-inner vars control ordering. Example: dims x, y, z. - // If _first_inner == true, x is unit stride (col major). - // If _first_inner == false, z is unit stride (row major). + // If _first_inner == true, x is unit stride (col major, like fortran). 
+ // If _first_inner == false, z is unit stride (row major, like C). // This setting affects [un]layout() and visit_all_points(). bool _first_inner = true; // whether first dim is used for inner loop. @@ -153,12 +153,12 @@ namespace yask { size_t size() const { return _q.size(); } - int _get_num_dims() const { + int get_num_dims() const { return int(_q.size()); } // Return all dim names. - const std::vector get_dim_names() const; + const string_vec get_dim_names() const; // Get iteratable contents. const std::vector>& get_dims() const { @@ -233,6 +233,14 @@ namespace yask { return get_val(i); } + // Return values in range of dim posns (all must exist). + std::vector get_vals(int start, int num) const { + std::vector res(num, 0); + for (int i = 0; i < num; i++) + res[i] = get_val(start + i); + return res; + } + ////// Methods to get things by name. // Return dim posn or -1 if it doesn't exist. @@ -298,9 +306,15 @@ namespace yask { void add_dim_back(const Scalar& sc) { add_dim_back(sc._get_name(), sc.get_val()); } - void add_dim_front(const std::string& dim, const T& val); + void add_dim_at(int posn, const std::string& dim, const T& val); + void add_dim_at(int posn, const Scalar& sc) { + add_dim_at(posn, sc._get_name(), sc.get_val()); + } + void add_dim_front(const std::string& dim, const T& val) { + add_dim_at(0, dim, val); + } void add_dim_front(const Scalar& sc) { - add_dim_front(sc._get_name(), sc.get_val()); + add_dim_at(0, sc._get_name(), sc.get_val()); } // Set value by dim posn (posn i must exist). @@ -373,6 +387,23 @@ namespace yask { va_end(args); } + // Set values from starting point; positions must exist. + // Values before start or after size of vals are unchanged. 
+ void set_vals(int start, const std::vector& vals) { + int i = 0; + for (auto v : vals) { + set_val(i + start, v); + i++; + } + } + void set_vals(int start, const std::initializer_list& vals) { + int i = 0; + for (auto v : vals) { + set_val(i + start, v); + i++; + } + } + // Copy 'this', then add dims and values from 'rhs' that are NOT // in 'this'. Return resulting union. // Similar to set_vals(rhs, true), but does not change existing @@ -406,13 +437,13 @@ namespace yask { return !((*this) < rhs); } - // Convert n_d 'offsets' to 1D offset using values in 'this' as sizes of n_d space. + // Convert N-d 'offsets' to 1D offset using values in 'this' as sizes of N-d space. // If 'strict_rhs', RHS dims must be same and in same order as this; // else, only matching ones are considered and missing offsets are zero (0). // If '_first_inner', first dim varies most quickly; else last dim does. size_t layout(const Tuple& offsets, bool strict_rhs=true) const; - // Convert 1D 'offset' to n_d offsets using values in 'this' as sizes of n_d space. + // Convert 1D 'offset' to N-d offsets using values in 'this' as sizes of N-d space. Tuple unlayout(size_t offset) const; // Create a new Tuple with the given dimension removed. @@ -427,9 +458,12 @@ namespace yask { // Create a new Tuple with the given dimension removed. Tuple remove_dim(int posn) const; - // reductions. - // Apply function over all elements, returning one value. - T reduce(std::function reducer) const { + // Reductions. + // Apply 'reducer' to first pair of elements. + // Then, apply 'reducer' result of that and next element. + // Repeat for all elements. + // Returns final value or 0 if no elements. + inline T reduce(std::function reducer) const { T result = 0; int n = 0; for (auto i : _q) { @@ -439,13 +473,11 @@ namespace yask { } return result; } + + // These reducers return 0 if no elements. T sum() const { return reduce([&](T lhs, T rhs){ return lhs + rhs; }); } - T product() const { - return _q.size() ? 
- reduce([&](T lhs, T rhs){ return lhs * rhs; }) : 1; - } T max() const { return reduce([&](T lhs, T rhs){ return std::max(lhs, rhs); }); } @@ -453,13 +485,19 @@ namespace yask { return reduce([&](T lhs, T rhs){ return std::min(lhs, rhs); }); } - // pair-wise functions. + // These reducers return 1 if no elements. + T product() const { + return _q.size() ? + reduce([&](T lhs, T rhs){ return lhs * rhs; }) : 1; + } + + // Pair-wise functions. // Apply function to each pair, creating a new Tuple. - // if strict_rhs==true, RHS elements must be same as this; - // else, only matching ones are considered. - Tuple combine_elements(std::function combiner, - const Tuple& rhs, - bool strict_rhs=true) const { + // if strict_rhs==true, RHS size and element names must be same as this; + // else, only matching ones (by name) are considered. + inline Tuple combine_elements(std::function combiner, + const Tuple& rhs, + bool strict_rhs=true) const { Tuple newt = *this; if (strict_rhs) { assert(are_dims_same(rhs, true)); @@ -485,28 +523,28 @@ namespace yask { } Tuple add_elements(const Tuple& rhs, bool strict_rhs=true) const { return combine_elements([&](T lhs, T rhs){ return lhs + rhs; }, - rhs, strict_rhs); + rhs, strict_rhs); } Tuple sub_elements(const Tuple& rhs, bool strict_rhs=true) const { return combine_elements([&](T lhs, T rhs){ return lhs - rhs; }, - rhs, strict_rhs); + rhs, strict_rhs); } Tuple mult_elements(const Tuple& rhs, bool strict_rhs=true) const { return combine_elements([&](T lhs, T rhs){ return lhs * rhs; }, - rhs, strict_rhs); + rhs, strict_rhs); } Tuple max_elements(const Tuple& rhs, bool strict_rhs=true) const { return combine_elements([&](T lhs, T rhs){ return std::max(lhs, rhs); }, - rhs, strict_rhs); + rhs, strict_rhs); } Tuple min_elements(const Tuple& rhs, bool strict_rhs=true) const { return combine_elements([&](T lhs, T rhs){ return std::min(lhs, rhs); }, - rhs, strict_rhs); + rhs, strict_rhs); } - // Apply func to each element, creating a new Tuple. 
+ // Apply 'func' to each element and 'rhs', creating a new Tuple. Tuple map_elements(std::function func, - T rhs) const { + T rhs) const { Tuple newt = *this; for (size_t i = 0; i < _q.size(); i++) { auto& tval = _q[i].get_val(); @@ -515,6 +553,7 @@ namespace yask { } return newt; } + // Apply 'func' to each element, creating a new Tuple. Tuple map_elements(std::function func) const { Tuple newt = *this; for (size_t i = 0; i < _q.size(); i++) { @@ -526,23 +565,23 @@ namespace yask { } Tuple add_elements(T rhs) const { return map_elements([&](T lhs, T rhs){ return lhs + rhs; }, - rhs); + rhs); } Tuple sub_elements(T rhs) const { return map_elements([&](T lhs, T rhs){ return lhs - rhs; }, - rhs); + rhs); } Tuple mult_elements(T rhs) const { return map_elements([&](T lhs, T rhs){ return lhs * rhs; }, - rhs); + rhs); } Tuple max_elements(T rhs) const { return map_elements([&](T lhs, T rhs){ return std::max(lhs, rhs); }, - rhs); + rhs); } Tuple min_elements(T rhs) const { return map_elements([&](T lhs, T rhs){ return std::min(lhs, rhs); }, - rhs); + rhs); } Tuple neg_elements() const { return map_elements([&](T in){ return -in; }); @@ -553,177 +592,161 @@ namespace yask { // make string like "4x3x2" or "4, 3, 2". std::string make_val_str(std::string separator=", ", - std::string prefix="", - std::string suffix="") const; + std::string prefix="", + std::string suffix="") const; // make string like "x, y, z" or "int x, int y, int z". std::string make_dim_str(std::string separator=", ", - std::string prefix="", - std::string suffix="") const; + std::string prefix="", + std::string suffix="") const; // make string like "x=4, y=3, z=2". std::string make_dim_val_str(std::string separator=", ", - std::string infix="=", - std::string prefix="", - std::string suffix="") const; + std::string infix="=", + std::string prefix="", + std::string suffix="") const; // make string like "x+4, y, z-2". 
std::string make_dim_val_offset_str(std::string separator=", ", - std::string prefix="", - std::string suffix="") const; + std::string prefix="", + std::string suffix="") const; // Return a "compact" set of K factors of N, // a set of factors with largest factor as small as possible, // where K is the size of 'this'. // Any non-zero numbers in 'this' will be kept if possible. - Tuple get_compact_factors(idx_t N) const; - - // Call the 'visitor' lambda function at every point in the space defined by 'this'. - // 'idx' parameter contains sequentially-numbered index. - // Visitation order is with first dimension in unit stride, i.e., a conceptual - // "outer loop" iterates through last dimension, ..., and an "inner loop" iterates - // through first dimension. If '_first_inner' is false, it is done the opposite way. - // Visitor should return 'true' to keep going or 'false' to stop. - void visit_all_points(std::function visitor) const { - - // Init lambda fn arg with *this to get dim names. - // Values will get set during scan. - Tuple tp(*this); - - // 0-D? - if (!_q.size()) - visitor(tp, 0); - - // Call recursive version. - // Set begin/step dims depending on nesting. - else if (_first_inner) - _visit_all_points(visitor, size()-1, -1, tp); - else - _visit_all_points(visitor, 0, 1, tp); - } - - // Call the 'visitor' lambda function at every point in the space defined by 'this'. - // 'idx' parameter contains sequentially-numbered index. - // Visitation order is not predictable. - // Visitor return value only stops visit on one thread. - void visit_all_points_in_parallel(std::function visitor) const { - - // 0-D? - if (!_q.size()) { - Tuple tp(*this); - visitor(tp, 0); + Tuple get_compact_factors(T N) const; + + // Advance Tuple 'tp' containing indices in the space defined by + // 'this' to the next logical index. + // Input 'tp' must contain valid indices, i.e., each value must + // be between 0 and N-1, where N is the value in the corresponding + // dim in 'this'. 
+ // If 'tp' is at last index, "wraps-around" to all zeros. + inline void next_index(Tuple& tp) const { + const int nd = get_num_dims(); + const int inner_dim = _first_inner ? 0 : nd-1; + const int dim_step = _first_inner ? 1 : -1; + + // Increment inner dim. + tp[inner_dim]++; + + // Wrap around indices as needed. + // First test is redundant, but keeps us from entering loop most times. + if (tp[inner_dim] >= get_val(inner_dim)) { + for (int j = 0, k = inner_dim; j < nd; j++, k += dim_step) { + + // If too far in dim 'k', set idx to 0 and increment idx in next dim. + if (tp[k] >= get_val(k)) { + tp[k] = 0; + int nxt_dim = k + dim_step; + auto* p = tp.lookup(nxt_dim); + if (p) + (*p)++; + } + else + break; + } } - - // Call order-independent version. - // Set begin/end/step dims depending on nesting. - // TODO: set this depending on dim sizes. - else if (_first_inner) - _visit_all_points_in_par(visitor, size()-1, -1); - else - _visit_all_points_in_par(visitor, 0, 1); } + + // Call the 'visitor' lambda function at every point sequentially in + // the space defined by 'this'. 'idx' parameter contains + // sequentially-numbered index. Visitation order is with first + // dimension in unit stride, i.e., a conceptual "outer loop" + // iterates through last dimension, ..., and an "inner loop" + // iterates through first dimension. If '_first_inner' is false, it + // is done the opposite way. Visitor should return 'true' to keep + // going or 'false' to stop. Returns 'false' if any visitor + // returned 'false' and visitation was stopped; otherwise 'true'. + // Example: + // sizes_tuple.visit_all_points([&](const Tuple& pt, size_t idx) { ... }); + bool visit_all_points(std::function visitor) const { + Tuple tp(*this); + tp.set_vals_same(0); - protected: + // Total number of points to visit. + idx_t ne = product(); - // Visit elements recursively. 
- bool _visit_all_points(std::function visitor, - int cur_dim_num, int step, Tuple& tp) const { - auto& sc = _q.at(cur_dim_num); - auto dsize = sc.get_val(); - int last_dim_num = (step > 0) ? size()-1 : 0; - - // If no more dims, iterate along current dimension and call - // visitor. - if (cur_dim_num == last_dim_num) { - - // Get unique index to first position. - tp.set_val(cur_dim_num, 0); - size_t idx0 = layout(tp); - - // Loop through points. - for (T i = 0; i < dsize; i++) { - tp.set_val(cur_dim_num, i); - bool ok = visitor(tp, idx0 + i); - - // Leave if visitor returns false. - if (!ok) - return false; - } + // 1 point? + if (ne <= 1) { + bool ok = visitor(tp, 0); + return ok; } - // Else, iterate along current dimension and recurse to - // next/prev dimension. - else { - for (T i = 0; i < dsize; i++) { - tp.set_val(cur_dim_num, i); + // Visit each point in sequential order. + for (T i = 0; i < ne; i++) { - // Recurse. - bool ok = _visit_all_points(visitor, cur_dim_num + step, step, tp); + // Call visitor. + bool ok = visitor(tp, i); + if (!ok) + return false; - // Leave if visitor returns false. - if (!ok) - return false; - } + // Jump to next index. + next_index(tp); } return true; } - // First call from public visit_all_points_in_parallel(visitor). - bool _visit_all_points_in_par(std::function visitor, - int cur_dim_num, int step) const { -#ifdef _OPENMP - auto nd = _get_num_dims(); + // Call the 'visitor' lambda function at every point in the space defined by 'this'. + // 'idx' parameter contains sequentially-numbered index. + // Visitation order is not predictable. + // Visitation concurrency is not predicable. + // Visitor return value is ignored. + void visit_all_points_in_parallel(std::function visitor) const { + // Total number of points to visit. + idx_t ne = product(); - // If one dim, parallelize across it. - if (nd == 1) { - assert(cur_dim_num == 0); - auto dsize = get_val(cur_dim_num); + // 1 point? 
+ if (ne <= 1) { Tuple tp(*this); - - // Loop through points. - // Each thread gets its own copy of 'tp', which - // gets updated with the loop index. - // TODO: convert to yask_parallel_for(). -#pragma omp parallel for firstprivate(tp) - for (T i = 0; i < dsize; i++) { - tp.set_val(cur_dim_num, i); - visitor(tp, i); - } - } - - // If >1 dim, parallelize over outer dims only, - // streaming across inner dim in each thread. - // This is to maximize HW prefetch benefit. - else { - - // Total number of elements to visit. - T ne = product(); - - // Number of elements in last dim. - int last_dim_num = (step > 0) ? nd-1 : 0; - T nel = get_val(last_dim_num); - - // Parallel loop over elements w/stride = size of - // last dim. - yask_parallel_for(0, ne, nel, - [&](idx_t start, idx_t stop, idx_t thread_num) { - - // Convert linear index to n-dimensional tuple. - Tuple tp = unlayout(start); - - // Visit points in last dim. - _visit_all_points(visitor, last_dim_num, step, tp); - }); + tp.set_vals_same(0); + visitor(tp, 0); + return; } - return true; -#else - // Call recursive version to handle all dims. - Tuple tp(*this); - return _visit_all_points(visitor, cur_dim_num, step, tp); -#endif + #ifdef _OPENMP + + // Num threads to be started. + idx_t nthr = yask_get_num_threads(); + + // Start sequential visits in parallel. + // (Not guaranteed that each tnum will be unique in every OMP + // impl, so don't rely on it.) + yask_parallel_for + (0, nthr, 1, + [&](idx_t n, idx_t np1, idx_t tnum) { + + // Start and stop indices for this thread. + idx_t start = div_equally_cumu_size_n(ne, nthr, n - 1); + idx_t stop = div_equally_cumu_size_n(ne, nthr, n); + assert(stop >= start); + if (stop <= start) + return; // from lambda. + + // Make tuple for this thread. + Tuple tp = *this; + + // Convert 1st linear index to n-dimensional tuple. + tp = unlayout(start); + + // Visit each point in sequential order. + for (T i = start; i < stop; i++) { + + // Call visitor. 
+ visitor(tp, i); + + // Jump to next index. + next_index(tp); + } + }); + + #else + // No OMP; use sequential version. + visit_all_points(visitor); + #endif } }; // Tuple. @@ -745,8 +768,10 @@ namespace std { public : size_t operator()(const yask::Tuple &x ) const { size_t h = 0; - for (int i = 0; i < x._get_num_dims(); i++) { - h ^= size_t(i) ^ std::hash()(x.get_val(i)) ^ std::hash()(x.get_dim_name(i)); + for (int i = 0; i < x.get_num_dims(); i++) { + h ^= size_t(i) ^ + std::hash()(x.get_val(i)) ^ + std::hash()(x.get_dim_name(i)); } return h; } diff --git a/src/common/yask_assert.hpp b/src/common/yask_assert.hpp index 1368b9e8..de57d8c0 100644 --- a/src/common/yask_assert.hpp +++ b/src/common/yask_assert.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -23,13 +23,40 @@ IN THE SOFTWARE. *****************************************************************************/ #pragma once +#include + +// Hack macros for the C preprocessor. +#define YSTR1(s) #s +#define YSTR2(s) YSTR1(s) +#define YPRAGMA(x) _Pragma(#x) +#define YCAT(a, ...) YPRIM_CAT(a, __VA_ARGS__) +#define YPRIM_CAT(a, ...) a ## __VA_ARGS__ +#define YIIF(c) YPRIM_CAT(IIF_, c) +#define YIIF_0(t, ...) __VA_ARGS__ +#define YIIF_1(t, ...) t // Control assert() by turning on with CHECK instead of turning off with // NDEBUG. This makes it off by default. #ifdef CHECK + +// Temporarily replace assert() with printf() when offloading, but +// this doesn't cause program to halt. +// Also define host_assert() to be a stub. +#if defined(USE_OFFLOAD) && !defined(USE_OFFLOAD_X86) +#define assert(expr) \ + ((expr) ? 
\ + ((void)0) : \ + ((void)printf("YASK: ***** assertion '%s' failed at %s:%i\n", \ + YSTR1(expr), __FILE__, __LINE__))) +#define host_assert(expr) ((void)0) +#else #include +#define host_assert(expr) assert(expr) +#endif + #else #define assert(expr) ((void)0) +#define host_assert(expr) ((void)0) #define NDEBUG #endif diff --git a/src/compiler/Makefile b/src/compiler/Makefile index 2921d17c..4f993435 100644 --- a/src/compiler/Makefile +++ b/src/compiler/Makefile @@ -1,6 +1,6 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -37,7 +37,9 @@ YC_TEST_SRC_DIR := $(abspath ./tests) YC_OUT_BASE := $(BUILD_OUT_DIR)/compiler YC_SWIG_OUT_DIR := $(YC_OUT_BASE)/swig YC_OBJ_DIR := $(YC_OUT_BASE)/obj -YC_STENCIL_DIR := $(SRC_DIR)/stencils +YC_STENCIL_DIR := $(SRC_DIR)/stencils +YC_STENCIL_DIRS := $(shell find -L $(YC_STENCIL_DIR) -type d) +YC_STENCIL_OBJ_DIR := $(YC_OUT_BASE)/obj/stencils # Compiler files. 
YC_MODULE := $(YC_BASE) @@ -46,20 +48,22 @@ YC_PY_LIB := $(PY_OUT_DIR)/_$(YC_MODULE)$(SO_SUFFIX) YC_PY_MOD := $(PY_OUT_DIR)/$(YC_MODULE).py YC_TEST_EXEC := $(BIN_OUT_DIR)/$(YC_BASE)_api_test.exe YC_TEST_EXEC_WITH_EXCEPTION := $(BIN_OUT_DIR)/$(YC_BASE)_api_exception_test.exe -YC_SRC_NAMES := Expr ExprUtils Var Settings Eqs Print Vec Cpp CppIntrin YaskKernel Solution -YC_STENCIL_NAMES:= $(notdir $(patsubst %.cpp,%,$(wildcard $(YC_STENCIL_DIR)/*.cpp))) +YC_SRC_NAMES := Expr VarPoint ExprUtils Var Settings Eqs Print Vec Cpp CppIntrin YaskKernel Solution YC_OBJS := $(addprefix $(YC_OBJ_DIR)/,$(addsuffix .o,$(YC_SRC_NAMES) $(COMM_SRC_NAMES))) -YC_STENCIL_OBJS := $(addprefix $(YC_OBJ_DIR)/,$(addsuffix .o,$(YC_STENCIL_NAMES))) YC_INC_DIRS := $(INC_DIR) $(YC_LIB_SRC_DIR) $(COMM_DIR) $(COEFF_DIR) -YC_INC_GLOB := $(wildcard $(addsuffix /*.hpp,$(YC_INC_DIRS))) -YC_STENCIL_INC_GLOB := $(wildcard $(YC_STENCIL_DIR)/*.hpp $(YC_STENCIL_DIR)/*/*.hpp) +YC_INC_FILES := $(wildcard $(addsuffix /*.hpp,$(YC_INC_DIRS))) +YC_STENCIL_SRC_FILES := $(foreach dir,$(YC_STENCIL_DIRS),$(wildcard $(dir)/*.cpp)) +YC_STENCIL_INC_FILES := $(foreach dir,$(YC_STENCIL_DIRS),$(wildcard $(dir)/*.hpp)) +YC_STENCIL_NAMES := $(notdir $(basename $(YC_STENCIL_SRC_FILES))) +YC_STENCIL_OBJS := $(addprefix $(YC_STENCIL_OBJ_DIR)/,$(addsuffix .o,$(YC_STENCIL_NAMES))) # Compiler and default flags. -YC_CXX ?= g++ # usually faster than icpc for building the compiler. +YC_CXX := $(CXX) YC_CXXOPT := -O2 -YC_CXXFLAGS := -g -std=c++11 $(YC_CXXOPT) -Wall -YC_CXXFLAGS += -Wno-unknown-pragmas -Wno-unused-variable -Wno-unused-but-set-variable +YC_CXXFLAGS := -g -std=c++17 -Wall +YC_CXXFLAGS += -Wno-unknown-pragmas -Wno-unused-variable YC_CXXFLAGS += $(INC_CXXFLAGS) +YC_CXXFLAGS += $(YC_CXXOPT) YC_INC_CXXFLAGS := $(addprefix -I,$(YC_INC_DIRS)) # Add user-defined flags. 
@@ -69,29 +73,23 @@ YC_CXXFLAGS += $(EXTRA_YC_CXXFLAGS) YC_LD := $(YC_CXX) YC_LFLAGS := -lrt -Wl,-rpath=$(LIB_OUT_DIR) -L$(LIB_OUT_DIR) -l$(YC_BASE) -# Compile rules. -# For example stencils, -# - Build at O0 to avoid C++ compiler wasting time optimizing them. -# - Set macro to use internal DSL instead of conflicting API operators. +# Source dirs. +VPATH := $(patsubst $space,$colon,$(YC_STENCIL_DIRS) $(COMM_DIR) $(COEFF_DIR) $(YC_LIB_SRC_DIR)) -$(YC_OBJ_DIR)/%.o: $(YC_STENCIL_DIR)/%.cpp $(INC_GLOB) $(YC_STENCIL_INC_GLOB) - $(MKDIR) $(YC_OBJ_DIR) - $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ -DUSE_INTERNAL_DSL -O0 -c -o $@ $< - @ls -l $@ +# Compile rules for non-stencil files. -$(YC_OBJ_DIR)/%.o: $(COMM_DIR)/%.cpp $(YC_INC_GLOB) - $(MKDIR) $(YC_OBJ_DIR) +$(YC_OBJ_DIR)/%.o: %.cpp $(YC_INC_FILES) + $(call MK_DIR,$(dir $@)) $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) $(YC_INC_CXXFLAGS) -x c++ -fPIC -c -o $@ $< @ls -l $@ -$(YC_OBJ_DIR)/%.o: $(COEFF_DIR)/%.cpp $(YC_INC_GLOB) - $(MKDIR) $(YC_OBJ_DIR) - $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) $(YC_INC_CXXFLAGS) -x c++ -fPIC -c -o $@ $< - @ls -l $@ +# Compile rules for example stencils, +# - Build at O0 to avoid C++ compiler wasting time optimizing them. +# - Set macro to use internal DSL instead of conflicting API operators. -$(YC_OBJ_DIR)/%.o: $(YC_LIB_SRC_DIR)/%.cpp $(YC_INC_GLOB) - $(MKDIR) $(YC_OBJ_DIR) - $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) $(YC_INC_CXXFLAGS) -x c++ -fPIC -c -o $@ $< +$(YC_STENCIL_OBJ_DIR)/%.o: %.cpp $(INC_FILES) $(YC_STENCIL_INC_FILES) + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -O0 -x c++ -DUSE_INTERNAL_DSL -c -o $@ $< @ls -l $@ ######## Primary targets. 
@@ -104,12 +102,12 @@ compiler: $(YC_EXEC) $(MAKE) old-code-check $(YC_LIB): $(YC_OBJS) - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -shared -o $@ $^ @ls -l $@ $(YC_EXEC): compiler_main.cpp $(YC_STENCIL_OBJS) $(YC_LIB) - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) $(CXX_PREFIX) $(YC_LD) $(YC_CXXFLAGS) $(YC_INC_CXXFLAGS) $^ $(YC_LFLAGS) -o $@ @ls -l $@ $(MAKE) echo-settings @@ -125,21 +123,23 @@ api: compiler $(YC_PY_LIB) # Also builds $(YC_PY_MOD) $(YC_SWIG_OUT_DIR)/yask_compiler_api_wrap.cpp: $(YC_SWIG_DIR)/yask*.i $(INC_DIR)/*.hpp $(SWIG) -version - $(MKDIR) $(YC_SWIG_OUT_DIR) $(PY_OUT_DIR) + $(call MK_DIR,$(dir $@)) + $(call MK_DIR,$(PY_OUT_DIR)) $(SWIG) -v -DYC_MODULE=$(YC_MODULE) -cppext cpp \ -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig -I$(COEFF_DIR) \ -c++ -python -o $@ -outdir $(PY_OUT_DIR) -builtin $< + $(SWIG_PATCH) $< # Turn off asserts to work around known SWIG issue: # https://github.com/swig/swig/issues/773 $(YC_OBJ_DIR)/yask_compiler_api_wrap.o: $(YC_SWIG_OUT_DIR)/yask_compiler_api_wrap.cpp - $(MKDIR) $(YC_OBJ_DIR) + $(call MK_DIR,$(dir $@)) $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -x c++ \ $(SWIG_GCCFLAGS) $(DBL_EPSILON_CXXFLAG) -DNDEBUG $(PYINC) -fPIC -c -o $@ $< @ls -l $@ $(YC_PY_LIB): $(YC_OBJS) $(YC_OBJ_DIR)/yask_compiler_api_wrap.o - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) -shared -o $@ $^ @ls -l $@ @@ -147,7 +147,7 @@ $(YC_PY_LIB): $(YC_OBJS) $(YC_OBJ_DIR)/yask_compiler_api_wrap.o # Build C++ compiler API test. $(YC_TEST_EXEC): $(YC_TEST_SRC_DIR)/yask_compiler_api_test.cpp $(YC_LIB) - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) $< $(YC_LFLAGS) -o $@ @ls -l $@ @@ -169,7 +169,7 @@ cxx-yc-api-test: $(YC_TEST_EXEC) # Build C++ compiler API test with exception. 
$(YC_TEST_EXEC_WITH_EXCEPTION): $(YC_TEST_SRC_DIR)/yask_compiler_api_exception_test.cpp $(YC_LIB) - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) $(CXX_PREFIX) $(YC_CXX) $(YC_CXXFLAGS) $< $(YC_LFLAGS) -o $@ @ls -l $@ diff --git a/src/compiler/compiler_main.cpp b/src/compiler/compiler_main.cpp index fbbbb982..ddb4a210 100644 --- a/src/compiler/compiler_main.cpp +++ b/src/compiler/compiler_main.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -92,13 +92,12 @@ void usage(const string& cmd, " -target \n" " Set the output format (required).\n" " Supported formats:\n" - " avx YASK stencil classes for CORE AVX ISA (256-bit HW SIMD vectors).\n" - " avx2 YASK stencil classes for CORE AVX2 ISA (256-bit HW SIMD vectors).\n" - " avx512 YASK stencil classes for CORE AVX-512 ISA (512-bit HW SIMD vectors).\n" - " avx512lo YASK stencil classes for CORE AVX-512 ISA (256-bit HW SIMD vectors).\n" - " knc YASK stencil classes for Knights-Corner ISA (512-bit HW SIMD vectors).\n" - " knl YASK stencil classes for Knights-Landing (MIC) AVX-512 ISA (512-bit HW SIMD vectors).\n" - " intel64 YASK stencil classes for generic C++ (no explicit HW SIMD vectors).\n" + " avx YASK code for CORE AVX ISA (256-bit HW SIMD vectors).\n" + " avx2 YASK code for CORE AVX2 ISA (256-bit HW SIMD vectors).\n" + " avx512 YASK code classes for CORE AVX-512 ISA (512-bit HW SIMD vectors).\n" + " avx512-ymm YASK code for CORE AVX-512 ISA (256-bit HW SIMD vectors).\n" + " knl YASK code for Knights-Landing (MIC) AVX-512 ISA (512-bit HW SIMD vectors).\n" + " intel64 YASK code for generic C++ with 64-bit indices (no explicit HW SIMD vectors).\n" " pseudo Human-readable scalar pseudo-code.\n" " 
pseudo-long Human-readable scalar pseudo-code with intermediate variables.\n" " dot DOT-language description.\n" @@ -107,25 +106,62 @@ void usage(const string& cmd, " -elem-bytes \n" " Set number of bytes in each FP element (default=" << settings._elem_bytes << ").\n" " Currently, only 4 (single-precision) and 8 (double) are allowed.\n" - " -domain-dims ,,...\n" - " Explicitly name the domain dimensions and set their order.\n" - " In addition, domain dimensions are added when YASK variables are encountered\n" - " in the stencil DSL code.\n" - " Either way, the last unique domain dimension specified will become the 'inner' or\n" - " 'unit-stride' dimension in memory layouts. Thus, this option can be used to override\n" - " the default layout order.\n" - " The domain-dimension order also affects loop nesting and default rank layout.\n" - " -step-dim \n" - " Explicitly set the step dimension.\n" - " By default, the step dimension is defined when YASK variables are encountered\n" - " in the stencil DSL code.\n" " -fold =,...\n" " Set number of elements in each given dimension in a vector block.\n" " Default depends on -elem-bytes setting, domain-dimension order, and print format (below).\n" " If product of fold lengths does not equal SIMD vector length for print\n" " formats with explicit lengths, lengths will adjusted as needed.\n" " -cluster =,...\n" - " Set number of vectors to evaluate in each dimension.\n" + " Set number of vectors to evaluate per inner-loop iteration in each dimension.\n" + " -p \n" + " Write formatted output to .\n" + //" -ps Print stats for all folding options for given vector length.\n" + "\n" + "Advanced options for experimentation or debug:\n" + " -step-dim \n" + " Explicitly set the name of the step dimension, e.g., 't'.\n" + " By default, the step dimension is defined implicitly when YASK variables are encountered\n" + " in the stencil DSL code.\n" + " -domain-dims ,,...\n" + " Explicitly name the domain dimensions and set their order, 
e.g., 'x,y,z'.\n" + " In addition, domain dimensions are added implicitly when YASK variables are encountered\n" + " in the stencil DSL code.\n" + " The domain-dimension order determines array memory layout, default loop nesting, and\n" + " MPI rank layout. Thus, this option can be used to override those traits compared to\n" + " what would be obtained from the DSL code only.\n" + " -inner-loop-dim \n" + " Specify the domain dimension used for the inner-most stencil-computation loop.\n" + " The default is the last domain dimension specified via -domain_dims or in the\n" + " stencil DSL code.\n" + " For this option, a numerical index is allowed: '1' is the first domain-dim, etc.\n" + " -min-buffer-len \n" + " Create buffers in the inner loop if at least points could be stored in it\n" + " (default=" << settings._min_buffer_len << ").\n" + " -read-ahead-dist \n" + " Number of iterations to read ahead into the inner-loop buffers\n" + " (default=" << settings._read_ahead_dist << ").\n" + " [-no]-inner-misc-layout\n" + " Set YASK-var memory layout so that the misc dim(s) are the inner-most dim(s)\n" + " instead of the outer-most (default=" << settings._inner_misc << ").\n" + " This effectively creates an AoSoA-style layout instead of an SoAoA one,\n" + " where the last 'A' is the SIMD vector.\n" + " If the SIMD-vector length is 1, the last domain dim will always be in\n" + " the inner-most layout dim, even if this contradicts this setting.\n" + " This setting may help decrease the number of memory streams for complex\n" + " kernels when misc dims are used to consolidate vars.\n" + " This disallows dynamically changing the 'misc' dim sizes from the kernel APIs.\n" + " [-no]-outer-domain-layout\n" + " Set YASK-var memory layout so that the first domain dim is the outer-most\n" + " dim, even if the var contains step or misc dims (default=" << settings._outer_domain << ").\n" + " This setting may be useful for run-time allocators that automatically partition\n" + " array 
layouts across NUMA nodes.\n" + " If the SIMD-vector length is 1, the last domain dim will always be in\n" + " the inner-most layout dim, possibly overriding this setting.\n" + " -[no]-fus\n" + " Make first dimension of fold unit stride (default=" << settings._first_inner << ").\n" + " This controls the intra-vector memory layout.\n" + " The order of dimensions within a folded vector is not necessarily the same as the\n" + " order of the dimensions in the YASK-var memory layouts as described above.\n" " -l1-prefetch-dist \n" " Set L1 prefetch distance to iterations ahead. Use zero (0) to disable.\n" " -l2-prefetch-dist \n" @@ -142,11 +178,11 @@ void usage(const string& cmd, " names are '" << settings._eq_bundle_basename_default << "_0', " << settings._eq_bundle_basename_default << "_1', etc.\n" " This option allows more control over this bundling.\n" - " Example: \"-eq-bundles a=foo,b=b[aeiou]r\" creates one or more eq-bundles named 'a_0', 'a_1', etc.\n" + " Example: \"-eq-bundles a=foo,b=b[ae]r\" creates one or more eq-bundles named 'a_0', 'a_1', etc.\n" " containing updates to each var whose name contains 'foo' and one or more eq-bundles\n" - " named 'b_0', 'b_1', etc. containing updates to each var whose name matches 'b[aeiou]r'.\n" + " named 'b_0', 'b_1', etc. 
containing updates to each var whose name contains 'bar' or 'ber'.\n" " Standard regex-format tokens in will be replaced based on matches to .\n" - " Example: \"-eq-bundles 'g_$&=b[aeiou]r'\" with vars 'bar_x', 'bar_y', 'ber_x', and 'ber_y'\n" + " Example: \"-eq-bundles 'g_$&=b[ae]r'\" with vars 'bar_x', 'bar_y', 'ber_x', and 'ber_y'\n" " would create eq-bundle 'g_bar_0' for vars 'bar_x' and 'bar_y' and eq-bundle 'g_ber_0' for\n" " vars 'ber_x' and 'ber_y' because '$&' is substituted by the string that matches the regex.\n" " [-no]-bundle-scratch\n" @@ -158,19 +194,10 @@ void usage(const string& cmd, " -step-alloc \n" " Specify the size of the step-dimension memory allocation on all vars.\n" " By default, allocations are calculated automatically for each var.\n" - " [-no]-interleave-misc\n" - " Allocate YASK vars with the 'misc' dims as the inner-most dims (default=" << settings._inner_misc << ").\n" - " This disallows dynamcally changing the 'misc' dim sizes during run-time.\n" - " -fus\n" - " Make first dimension of fold unit stride (default=" << settings._first_inner << ").\n" - " This controls the intra-vector memory layout.\n" - " -lus\n" - " Make last dimension of fold unit stride (default=" << (!settings._first_inner) << ").\n" - " This controls the intra-vector memory layout.\n" " [-no]-ul\n" - " Do [not] generate simple unaligned loads (default=" << settings._allow_unaligned_loads << ").\n" - " [Advanced] To use this correctly, only 1D folds are allowed, and\n" - " the memory layout used by YASK must have that same dimension in unit stride.\n" + " [Advanced] Do [not] generate simple unaligned loads (default=" << settings._allow_unaligned_loads << ").\n" + " To use this correctly, only 1D folds are allowed, and\n" + " the array memory layout must have that same dimension in unit stride.\n" " [-no]-opt-comb\n" " Do [not] combine commutative operations (default=" << settings._do_comb << ").\n" " [-no]-opt-reorder\n" @@ -186,15 +213,21 @@ void usage(const 
string& cmd, " Set heuristic for max single expression-size (default=" << settings._max_expr_size << ").\n" " -min-es \n" " Set heuristic for min expression-size for reuse (default=" << settings._min_expr_size << ").\n" + " [-no]-use-ptrs\n" + " Generate inner-loop kernel code using data pointers & strides, avoiding function calls\n" + " (default=" << settings._use_ptrs << ").\n" + " [-no]-use-safe-ptrs\n" + " Generate kernel code with pointer parameters to base addresses for each YASK var\n" + " (default=" << settings._use_offsets << ").\n" + " This is a workaround for offload-device drivers that don't allow negative indices from\n" + " a pointer that is a kernel argument.\n" + " [-no]-early-loads\n" + " Generate aligned loads before they are needed (default=" << settings._early_loads << ").\n" " [-no]-find-deps\n" " Find dependencies between stencil equations (default=" << settings._find_deps << ").\n" " [-no]-print-eqs\n" " Print each equation when defined (default=" << settings._print_eqs << ").\n" "\n" - " -p \n" - " Write formatted output to .\n" - //" -ps Print stats for all folding options for given vector length.\n" - "\n" "Examples:\n" " " << cmd << " -stencil 3axis -radius 2 -fold x=4,y=4 -target pseudo -p - # '-' for stdout\n" " " << cmd << " -stencil awp -elem-bytes 8 -fold x=4,y=2 -target avx2 -p stencil_code.hpp\n" @@ -205,8 +238,16 @@ void usage(const string& cmd, // Parse command-line and set global cmd-line option vars. // Exits on error. 
void parse_opts(int argc, const char* argv[], - CompilerSettings& settings) + CompilerSettings& settings, + bool show_invo = false) { + if (show_invo) { + cout << "YASK compiler invocation:"; + for (int argi = 0; argi < argc; argi++) + cout << " " << argv[argi]; + cout << endl; + } + if (argc <= 1) usage(argv[0], settings); @@ -221,7 +262,7 @@ void parse_opts(int argc, const char* argv[], else if (opt == "-fus") settings._first_inner = true; - else if (opt == "-lus") + else if (opt == "-no-fus") settings._first_inner = false; else if (opt == "-ul") settings._allow_unaligned_loads = true; @@ -259,11 +300,27 @@ void parse_opts(int argc, const char* argv[], settings._print_eqs = true; else if (opt == "-no-print-eqs") settings._print_eqs = false; - else if (opt == "-interleave-misc") + else if (opt == "-inner-misc-layout") settings._inner_misc = true; - else if (opt == "-no-interleave-misc") + else if (opt == "-no-inner-misc-layout") settings._inner_misc = false; - + else if (opt == "-outer-domain-layout") + settings._outer_domain = true; + else if (opt == "-no-outer-domain-layout") + settings._outer_domain = false; + else if (opt == "-use-ptrs") + settings._use_ptrs = true; + else if (opt == "-no-use-ptrs") + settings._use_ptrs = false; + else if (opt == "-use-safe-ptrs") + settings._use_offsets = true; + else if (opt == "-no-use-safe-ptrs") + settings._use_offsets = false; + else if (opt == "-early-loads") + settings._early_loads = true; + else if (opt == "-no-early-loads") + settings._early_loads = false; + // add any more options w/o values above. // options w/a value. 
@@ -289,6 +346,8 @@ void parse_opts(int argc, const char* argv[], settings._eq_bundle_targets = argop; else if (opt == "-step-dim") settings._step_dim = argop; + else if (opt == "-inner-loop-dim") + settings._inner_loop_dim = argop; else if (opt == "-domain-dims") { settings._domain_dims.clear(); @@ -342,6 +401,10 @@ void parse_opts(int argc, const char* argv[], settings._halo_size = val; else if (opt == "-step-alloc") settings._step_alloc = val; + else if (opt == "-min-buffer-len") + settings._min_buffer_len = val; + else if (opt == "-read-ahead-dist") + settings._read_ahead_dist = val; // add any more options w/int values here. @@ -373,13 +436,13 @@ int main(int argc, const char* argv[]) { cout << "YASK -- Yet Another Stencil Kit\n" "YASK Stencil Compiler Utility\n" - "Copyright (c) 2014-2021, Intel Corporation.\n" + "Copyright (c) 2014-2022, Intel Corporation.\n" "Version: " << yask_get_version_string() << endl; try { // Parse options. CompilerSettings settings; - parse_opts(argc, argv, settings); + parse_opts(argc, argv, settings, true); // Find the requested stencil in the registry. auto& stencils = yc_solution_base::get_registry(); diff --git a/src/compiler/lib/Cpp.cpp b/src/compiler/lib/Cpp.cpp index 0ed5d39e..23ef8f38 100644 --- a/src/compiler/lib/Cpp.cpp +++ b/src/compiler/lib/Cpp.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -32,6 +32,7 @@ namespace yask { /////////// Scalar code ///////////// // Format a real, preserving precision. + // Also used for vector code, assuming automatic broadcast. string CppPrintHelper::format_real(double v) { // Int representation equivalent? @@ -51,19 +52,20 @@ namespace yask { // Make call for a point. 
// This is a utility function used for both reads and writes. string CppPrintHelper::make_point_call(ostream& os, - const VarPoint& gp, - const string& fname, - string opt_arg) { + const VarPoint& gp, + const string& fname, + string opt_arg) { // Get/set local vars. - string var_ptr = get_local_var(os, get_var_ptr(gp), _var_ptr_restrict_type); - string step_arg_var = get_local_var(os, gp.make_step_arg_str(var_ptr, _dims), _step_val_type); + string var_ptr = get_local_var(os, get_var_ptr(gp), _var_ptr_type, "expr"); + string sas = gp.make_step_arg_str(var_ptr, _dims); + string step_arg = sas.length() ? get_local_var(os, sas, _step_val_type, "step") : "0"; string res = var_ptr + "->" + fname + "("; if (opt_arg.length()) res += opt_arg + ", "; string args = gp.make_arg_str(); - res += "{" + args + "}, " + step_arg_var + ", __LINE__)"; + res += "{" + args + "}, " + step_arg + ")"; return res; } @@ -74,451 +76,837 @@ namespace yask { // Return code to update a var point. string CppPrintHelper::write_to_point(ostream& os, const VarPoint& gp, - const string& val) { + const string& val) { return make_point_call(os, gp, "write_elem", val); } /////////// Vector code ///////////// - // Read from a single point. - // Return code for read. - string CppVecPrintHelper::read_from_scalar_point(ostream& os, const VarPoint& gp, - const VarMap* v_map) { - - // Use default var-map if not provided. - if (!v_map) - v_map = &_vec2elem_map; - - // Determine type to avoid virtual call. - bool folded = gp.is_var_foldable(); - string gtype = folded ? "YkVecVar" : "YkElemVar"; + // Create call for a point. + // This is a utility function used for most var accesses. + string CppVecPrintHelper::make_point_call_vec(ostream& os, + const VarPoint& gp, + const string& func_name, + const string& first_arg, + const string& last_arg, + bool is_vec_norm, + const VarMap* var_map) { + + // Vec-norm accesses must be from folded var. 
+ if (is_vec_norm) + assert(gp.is_var_foldable()); + + // Var map is required for non-vec accesses to get elem indices. + else + assert(var_map); // Get/set local vars. - string var_ptr = get_local_var(os, get_var_ptr(gp), _var_ptr_restrict_type); - string step_arg_var = get_local_var(os, gp.make_step_arg_str(var_ptr, _dims), - _step_val_type); - - // Assume that broadcast will be handled automatically by - // operator overloading in kernel code. - // Specify that any indices should use element vars. - string str = var_ptr + "->" + gtype + "::read_elem("; - string args = gp.make_arg_str(v_map); - str += "{" + args + "}, " + step_arg_var + ",__LINE__)"; - return str; + string var_ptr = get_local_var(os, get_var_ptr(gp), + CppPrintHelper::_var_ptr_type, "expr"); + string sas = gp.make_step_arg_str(var_ptr, _dims); + string step_arg = sas.length() ? + get_local_var(os, sas, CppPrintHelper::_step_val_type, "step") : "0"; + + string res = var_ptr + "->" + func_name + "("; + if (first_arg.length()) + res += first_arg + ", "; + string args = is_vec_norm ? + gp.make_norm_arg_str(_dims, var_map) : gp.make_arg_str(var_map); + res += "{" + args + "} ," + step_arg; + if (last_arg.length()) + res += ", " + last_arg; + res += ")"; + return res; } + + // Make base point: + // domain indices = local-offset; + // misc indices = min-val (local-offset); + // other indices = those from 'gp'. + var_point_ptr CppVecPrintHelper::make_var_base_point(const VarPoint& gp) { + var_point_ptr bgp = gp.clone_var_point(); + auto* var = bgp->_get_var(); + assert(var); + for (auto& dim : bgp->get_dims()) { + auto& dname = dim->_get_name(); + auto type = dim->get_type(); - // Read from multiple points that are not vectorizable. - // Return var name. 
- string CppVecPrintHelper::print_non_vec_read(ostream& os, const VarPoint& gp) { - print_point_comment(os, gp, "Construct folded vector from non-folded"); + if (type == DOMAIN_INDEX || type == MISC_INDEX) { + auto* lofs = lookup_offset(*var, dname); + assert(lofs); + bgp->set_arg_expr(dname, *lofs); + } + } + return bgp; + } - // Make a vec var. - string mv_name = make_var_name(); - os << _line_prefix << get_var_type() << " " << mv_name << _line_suffix; + // Make inner-loop base point: + // domain dim offset = 0; + // misc indices = min-val (local-offset); + // other indices = those from 'gp'. + var_point_ptr CppVecPrintHelper::make_inner_loop_base_point(const VarPoint& gp) { + var_point_ptr bgp = gp.clone_var_point(); + for (auto& dim : gp.get_dims()) { + auto& dname = dim->_get_name(); + auto type = dim->get_type(); + bool use_domain = (type == DOMAIN_INDEX) && + (!_settings._use_many_ptrs || dname == _dims._inner_layout_dim); + bool use_misc = type == MISC_INDEX; - // Loop through all points in the vector fold. - get_fold().visit_all_points([&](const IntTuple& vec_point, - size_t pelem){ + // Set domain dims to current index only, + // i.e., no offset. + if (use_domain) { + IntScalar idi(dname, 0); + bgp->set_arg_offset(idi); + } - // Example: vec_point contains x=0, y=2, z=1, where each val - // is the offset in the given fold dim. We want to map - // x=>x_elem, y=>(y_elem+2), z=>(z_elem+1) in var-point - // index args. - VarMap v_map; - for (auto& dim : vec_point) { - auto& dname = dim._get_name(); - int dofs = dim.get_val(); + // Set misc indices to their min value. + else if (use_misc) { + auto* var = gp._get_var(); + auto min_val = var->get_min_indices()[dname]; + IntScalar idi(dname, min_val); + bgp->set_arg_const(idi); + } + } + return bgp; + } + + // Print creation of var-base pointer of 'gp'. + void CppVecPrintHelper::print_var_base_ptr(ostream& os, const VarPoint& gp) { + if (!_settings._use_ptrs) + return; + + // Got a pointer to it already? 
+ auto* p = lookup_var_base_ptr(gp); + if (!p) { + + // Make base point (misc & domain indices set to min + // values). There will be one pointer for every unique + // var/step-arg combo. + auto bgp = make_var_base_point(gp); + auto* var = bgp->_get_var(); + assert(var); + + // Make and save ptr var for future use. + string ptr_name = make_var_name(var->_get_name() + "_var_base_ptr"); + _var_base_ptrs[*bgp] = ptr_name; + + // Print pointer definition. + print_point_comment(os, *bgp, "Create var-base pointer"); + + // Get pointer to var using normalized indices. + // Ignore out-of-range errors because we might get a base pointer to an + // element before the allocated range. + // TODO: is this still true with local offsets? + bool folded = var->is_foldable(); + auto vp = folded ? + make_point_call_vec(os, *bgp, "get_vec_ptr_norm", "", "false", true) : + make_point_call_vec(os, *bgp, "get_elem_ptr_local", "", "false", false, &_vec2elem_local_map); + + // Ptr should provide unique access if all accesses are through pointers. + // TODO: check for reusing time-slots, e.g., p(t+1) aliased to p(t-1). + // TODO: check for non-ptr accesses via read/write calls. + bool is_unique = false; // !_settings._allow_unaligned_loads; + string type = is_unique ? _var_ptr_restrict_type : _var_ptr_type; + + // Print type and value. + os << _line_prefix << type << " " << ptr_name << " = " << vp << _line_suffix; + } + } - auto& ename = _vec2elem_map.at(dname); - if (dofs == 0) - v_map[dname] = ename; - else { - v_map[dname] = "(" + ename + "+" + to_string(dofs) + ")"; + // Print creation of stride and local-offset vars. + // Save var names for later use. + void CppVecPrintHelper::print_strides(ostream& os, const VarPoint& gp) { + auto* vp = gp._get_var(); + assert(vp); + auto& var = *vp; + const auto& vname = var.get_name(); + + // Already done this var? + if (_ptr_ofs.count(vname)) + return; + + // Index-invariant pointer offset. 
+ string po_var = make_var_name(vname + "_ptr_ofs"); + _ptr_ofs[vname] = po_var; + string po_expr = "idx_t(0)"; + string po_deco = "const"; + + for (int dnum = 0; dnum < var.get_num_dims(); dnum++) { + auto& dim = var.get_dims().at(dnum); + const auto& dname = dim->_get_name(); + auto dtype = dim->get_type(); + bool is_step = dtype == STEP_INDEX; + bool is_misc = dtype == MISC_INDEX; + bool is_inner = dname == _dims._inner_layout_dim; + + auto key = VarDimKey(vname, dname); + if (!is_step) { + os << endl << " // Stride for var '" << vname << + "' in dim '" << dname << "'.\n"; + + string str_var = make_var_name(vname + "_" + dname + "_stride"); + _strides[key] = str_var; + assert(lookup_stride(var, dname)); + + // Get ptr to var core. + string var_ptr = get_local_var(os, get_var_ptr(var), + CppPrintHelper::_var_ptr_restrict_type, + vname + "_core"); + + // Determine stride. + // Default is to obtain dynamic value from var. + string slookup = var_ptr + "->_vec_strides[" + to_string(dnum) + "]"; + string stride = slookup; + string sdeco = "const"; + + // Under certain conditions, strides are known fixed values. + // Must be last dim or followed only by fixed-size misc dims. + bool is_fixed = true; + string fstride; + + // Loop in layout order. + bool dfound = false; + for (int j = 0; j < var.get_num_dims(); j++) { + auto& dimj = var.get_layout_dims().at(j); + auto& dnj = dimj->_get_name(); + auto typej = dimj->get_type(); + + // If misc, must also use inner-misc setting, which disallows + // resizing via APIs. + bool is_miscj = (typej == MISC_INDEX) && _settings._inner_misc; + + // Multply the strides of the dims following the current one. 
+ if (dfound) { + if (is_miscj) { + auto min_idx = var.get_min_indices()[dnj]; + auto max_idx = var.get_max_indices()[dnj]; + auto sz = max_idx - min_idx + 1; + os << " // Indices for inner-dim '" << dnj << "' range from " << + min_idx << " to " << max_idx << ": " << sz << " value(s).\n"; + if (fstride.length()) + fstride += " * "; + fstride += to_string(sz); + } + else { + os << " // Size of inner-dim '" << dnj << "' is not known at " + "compile-time.\n"; + is_fixed = false; // Not a fixed value. + } } + if (dnj == dname) + dfound = true; + } + if (is_fixed) { + sdeco = "constexpr"; + if (fstride.length()) + stride = fstride; + else + stride = "1"; } - // Read or reuse. - string stmt = read_from_scalar_point(os, gp, &v_map); - auto* varname = lookup_elem_var(stmt); - if (!varname) { - - // Read val into a new scalar var. - string vname = make_var_name(); - os << _line_prefix << "real_t " << vname << - " = " << stmt << _line_suffix; - varname = save_elem_var(stmt, vname); + // Print final assignment. + os << _line_prefix << sdeco << " idx_t " << str_var << " = " << + stride << _line_suffix; + if (stride != slookup) + os << _line_prefix << "host_assert(" << slookup << + " == " << str_var << ")" << _line_suffix; + + // Offset to be subtracted from index. + os << endl << " // Index offset for var '" << vname << + "' in dim '" << dname << "'.\n"; + string ofs_var = make_var_name(vname + "_" + dname + "_ofs"); + _offsets[key] = ofs_var; + assert(lookup_offset(var, dname)); + string ofs_deco = "constexpr"; + + string ofs; // Offset value. + string ofs_expr = var_ptr + "->_local_offsets[" + to_string(dnum) + "]"; + if (var.is_scratch() && dtype == DOMAIN_INDEX) { + + // Lookup needed for domain dim in scratch var because scratch + // vars are "relocated" to location of current block. 
+ os << " // Local offset varies because '" << vname << + "' is a scratch var.\n"; + ofs = ofs_expr; + ofs_deco = "const"; } + else if (dtype == MISC_INDEX) { - // Output translated expression for this element. - os << _line_prefix << mv_name << "[" << pelem << "] = " << - *varname << "; // for offset " << vec_point.make_dim_val_str() << - _line_suffix; + // Need min value for misc indices. + os << " // Local offset is minimum misc-index.\n"; + ofs = "idx_t(" + to_string(var.get_min_indices()[dname]) + ")"; + } + else { + os << " // Local offset is zero.\n"; + ofs = "idx_t(0)"; + } - return true; - }); // end of lambda. - return mv_name; - } + // Offset includes local offset and pad. + if (_settings._use_offsets) { + string pad_expr = var_ptr + "->_actl_left_pads[" + to_string(dnum) + "]"; + ofs_expr += " - " + pad_expr; + if (dtype != MISC_INDEX) { + os << " // Offset is adjusted by actual allocated padding.\n"; + ofs += " - " + pad_expr; + } + ofs_deco = "const"; + } - // Print call for a point. - // This is a utility function used for reads & writes. - string CppVecPrintHelper::print_vec_point_call(ostream& os, - const VarPoint& gp, - const string& func_name, - const string& first_arg, - const string& last_arg, - bool is_norm) { + os << _line_prefix << ofs_deco << " idx_t " << ofs_var << " = " << + ofs << _line_suffix; + if (ofs != ofs_expr) + os << _line_prefix << "host_assert(" << ofs_expr << + " == " << ofs_var << ")" << _line_suffix; - // Get/set local vars. - string var_ptr = get_local_var(os, get_var_ptr(gp), CppPrintHelper::_var_ptr_restrict_type); - string step_arg_var = get_local_var(os, gp.make_step_arg_str(var_ptr, _dims), - CppPrintHelper::_step_val_type); + // Build total offset expr. + po_expr += string(" - (") + ofs_var + " * " + str_var + ")"; + } + } - string res = var_ptr + "->" + func_name + "("; - if (first_arg.length()) - res += first_arg + ", "; - string args = is_norm ? 
gp.make_norm_arg_str(_dims) : gp.make_arg_str(); - res += "{" + args + "} ," + step_arg_var; - if (last_arg.length()) - res += ", " + last_arg; - res += ")"; - return res; + os << "\n // Offset from base ptr to 0th position in var '" << vname << "'.\n" << + _line_prefix << po_deco << " idx_t " << po_var << " = " << + po_expr << _line_suffix; } - // Print code to set pointers of aligned reads. - void CppVecPrintHelper::print_base_ptrs(ostream& os) { - const string& idim = _dims._inner_dim; - - // A set for the aligned reads & writes. - VarPointSet gps; - - // Aligned reads as determined by VecInfoVisitor. - gps = _vv._aligned_vecs; - - // Writes (assume aligned). + // Print creation of inner-loop ptrs. + // To be used before the inner-loop starts. + void CppVecPrintHelper::print_inner_loop_prefix(ostream& os) { + get_point_stats(); + + // A set for both aligned reads & writes. + VarPointSet gps = _aligned_reads; gps.insert(_vv._vec_writes.begin(), _vv._vec_writes.end()); // Loop through all aligned read & write points. for (auto& gp : gps) { - // Can we use a pointer? - if (gp.get_loop_type() != VarPoint::LOOP_OFFSET) + // Can we use a loop pointer? + auto dep_type = gp.get_var_dep(); + if (dep_type != VarPoint::INNER_LOOP_OFFSET) continue; + auto* vp = gp._get_var(); + assert(vp); + auto& var = *vp; + const auto& vname = var.get_name(); + + // Doesn't already exist? + if (!lookup_inner_loop_base_ptr(gp)) { + const auto* vbp = lookup_var_base_ptr(gp); + if (vbp) { + + // Make base point (domain offset = 0; inner-misc indices = min-val). + auto bgp = make_inner_loop_base_point(gp); + + // Get temp var for ptr. + string ptr_name = make_var_name(vname + "_inner_loop_ptr"); + + // Save for future use. + _inner_loop_base_ptrs[*bgp] = ptr_name; + + // Print pointer creation. 
+ auto ofs_expr = get_var_base_ptr_offset(os, *bgp); + os << "\n // Pointer to " << bgp->make_str() << " in loop\n"; + os << _line_prefix << _var_ptr_type << " " << ptr_name << " = " << + *vbp << " + " << ofs_expr << _line_suffix; + } + } + } + } - // Make base point (misc & inner-dim indices = 0). - auto bgp = make_base_point(gp); - - // Not already saved? - if (!lookup_point_ptr(*bgp)) { - - // Get temp var for ptr. - string ptr_name = make_var_name(); + //#define DEBUG_BUFFERS + + // Collect some stats on read points. + // These are used to create buffers and prefetches. + void CppVecPrintHelper::get_point_stats() { - // Save for future use. - save_point_ptr(*bgp, ptr_name); + // Done if there are at least as many reads as original. + if (_aligned_reads.size() >= _vv._aligned_vecs.size()) + return; + + const string& ildim = _settings._inner_loop_dim; + const string& sdim = _dims._step_dim; + + // Loop through all aligned read points. + for (auto& gp : _vv._aligned_vecs) { + Var* vp = const_cast(gp._get_var()); // Need to modify var. + assert(vp); + auto& var = *vp; + const auto& vname = var.get_name(); + + #ifdef DEBUG_BUFFERS + cout << "*** Getting stats for " << gp.make_str() << endl; + #endif + + // Add to read set. + _aligned_reads.insert(gp); + + // Get const offsets for this point. + auto& offsets = gp.get_arg_offsets(); + + // Get offset in step dim, if any. + auto* sofs = offsets.lookup(sdim); + + // Is there also a write to this var that might overwrite + // a read at this step-dim offset? + // This would be true only with immediate replacement (writeback) + // optimization. + bool is_write = false; + if (sofs) { + + auto sdi = var.get_step_dim_info(); + if (sdi.writeback_ofs.count(_stage_name) && + sdi.writeback_ofs.at(_stage_name) == *sofs) { + is_write = true; + #ifdef DEBUG_BUFFERS + cout << "** Found writeback to " << vname << " over ofs " << *sofs << endl; + #endif + } } - // Collect some stats for reads using this ptr. 
- if (_vv._aligned_vecs.count(gp)) { - auto* p = lookup_point_ptr(*bgp); - assert(p); + // Get offset in inner-loop dim. + // E.g., A(t, x+1, y+4, z-2) => 4 if ildim = 'y'. + auto* ofs = offsets.lookup(ildim); + if (ofs) { - // Get const offsets. - auto& offsets = gp.get_arg_offsets(); + // Vec offset. + auto vofs = *ofs / _dims._fold[ildim]; - // Get offset in inner dim. - // E.g., A(t, x+1, y+4) => 4. - auto* ofs = offsets.lookup(idim); + // Make a copy of this point w/inner-loop index=0. + // E.g., A(t, x+1, y+4, z-2, 5) => A(t, x+1, y, z-2, 5) if ildim = 'y'. + auto key = gp.clone_var_point(); + IntScalar idi(ildim, 0); + key->set_arg_offset(idi); - // Remember lowest inner-dim offset from this ptr. - if (ofs && (!_ptr_ofs_lo.count(*p) || _ptr_ofs_lo[*p] > *ofs)) - _ptr_ofs_lo[*p] = *ofs; + // Remember key for this point. + _inner_loop_key[gp] = key; + // Remember lowest inner-loop dim offset from this key. + if (!_pt_inner_loop_lo.count(*key) || _pt_inner_loop_lo.at(*key) > vofs) + _pt_inner_loop_lo[*key] = vofs; + auto lo = _pt_inner_loop_lo.at(*key); + // Remember highest one. - if (ofs && (!_ptr_ofs_hi.count(*p) || _ptr_ofs_hi[*p] < *ofs)) - _ptr_ofs_hi[*p] = *ofs; - } - } - - // Loop through all aligned read & write points. - set done; - for (auto& gp : gps) { + if (!_pt_inner_loop_hi.count(*key) || _pt_inner_loop_hi.at(*key) < vofs) + _pt_inner_loop_hi[*key] = vofs; + auto hi = _pt_inner_loop_hi.at(*key); + + // Need a buffer? (This will change as new points are + // discovered.) Length will cover range of vecs needed. + // The num of vecs stepped in the inner loop is subtracted + // because we don't need to put the vecs read in the current + // loop iteration in the buffer (until it's shifted at the + // end of the loop.) Then, the length may then be increased + // if reading ahead unless we're also writing back, in which + // case read-ahead can't be used. 
+ auto len = hi - lo + 1; + len -= _inner_loop_vec_step; + #ifdef DEBUG_BUFFERS + cout << "*** Buffer for " << key->make_str() << + " has non-read-ahead length " << len << endl; + #endif + auto mbl = max(_settings._min_buffer_len, 1); + + // Add read-ahead if requested and allowed. + auto rad = _settings._read_ahead_dist; + auto ralv = _inner_loop_vec_step * rad; + auto rale = _inner_loop_elem_step * rad; + if (rad > 0 && !is_write && (len + ralv) >= mbl) { + #ifdef DEBUG_BUFFERS + cout << " *** Adding " << ralv << " vecs to buffer for read-ahead\n"; + #endif + + // Add more read points to read set. + // These may not be in the original set because they are + // for reading ahead. + // If some already exist, it will not hurt to re-add them. + auto ofs = lo + len + 1; + for (int i = 0; i < ralv; i++, ofs++) { + auto rap = key->clone_var_point(); + auto eofs = ofs * _dims._fold[ildim]; + IntScalar idi(ildim, eofs); // At end of buffer. + rap->set_arg_offset(idi); + #ifdef DEBUG_READ_AHEAD + cout << " *** Adding read point " << rap->make_str() << endl; + #endif + _aligned_reads.insert(*rap); // Save new read point. + _inner_loop_key[*rap] = key; // Save its key. + } - // Make base point (inner-dim index = 0). - auto bgp = make_base_point(gp); + // Increase buf len. + len += ralv; - // Got a pointer? - auto* p = lookup_point_ptr(*bgp); - if (!p) - continue; + // Increase var allocation for read-ahead (in elements, + // not vecs). TODO: be more accurate about when to + // increase pad; this assumes it extends beyond halo + // region. + var.update_read_ahead_pad(rale); + } - // Make code for pointer and prefetches. - if (!done.count(*p)) { + // Remember buf len using key if above threshold. + if (len >= mbl) + _pt_buf_len[*key] = len; + } + } + } - // Print pointer creation. - print_point_ptr(os, *p, *bgp); + // Print all aligned loads. + void CppVecPrintHelper::print_early_loads(ostream& os) { + get_point_stats(); - // Print prefetch(es) for this ptr if a read. 
- if (_vv._aligned_vecs.count(gp)) - print_prefetches(os, false, *p); + os << "\n // Issuing all aligned loads early (before needed).\n"; - done.insert(*p); - } + // Loop through all aligned read points. + // TODO: ignore points in buffer. + for (auto& gp : _aligned_reads) { + read_from_point(os, gp); } + os << "\n // Done issuing all aligned loads early.\n"; } - // Print prefetches for each base pointer. - // 'level': cache level. - // 'ahead': prefetch PF distance ahead instead of up to PF dist. - // TODO: add handling of misc dims. - void CppVecPrintHelper::print_prefetches(ostream& os, - bool ahead, string ptr_var) { + // Print buffer-code for each inner-loop base pointer. + // 'in_loop': just shift and load last one. + void CppVecPrintHelper::print_buffer_code(ostream& os, bool in_loop) { + get_point_stats(); - // cluster mult in inner dim. - const string& idim = _dims._inner_dim; - string imult = "CMULT_" + PrinterBase::all_caps(idim); + set done; + const string& ildim = _settings._inner_loop_dim; + + // Loop through all aligned read points. + // TODO: can we just loop thru _inner_loop_key? + for (auto& gp : _aligned_reads) { + if (_inner_loop_key.count(gp) == 0) + continue; - for (int level = 1; level <= 2; level++) { + // Only need buffer for unique point along inner-loop. + auto key = _inner_loop_key[gp]; + if (done.count(*key)) + continue; - os << "\n // Prefetch to L" << level << " cache if enabled.\n"; - os << _line_prefix << "#if PFD_L" << level << " > 0\n"; + // Need a buffer? + if (_pt_buf_len.count(*key) == 0) + continue; + + auto lo = _pt_inner_loop_lo.at(*key); + auto len = _pt_buf_len[*key]; + auto end = lo + len; + auto* vp = gp._get_var(); + assert(vp); + auto& var = *vp; + const auto& vname = var.get_name(); + + int start_ofs, stop_ofs, start_load; + string bname; + + // Before end of loop. 
+ if (in_loop) { + os << "\n // Update buffer for " << key->make_str() << endl; + os << _line_prefix << "{\n"; + assert(_pt_buf_name.count(*key)); + bname = _pt_buf_name.at(*key); + for (int i = 0; i < len - _inner_loop_vec_step; i++) + os << _line_prefix << bname << "[" << i << "] = " << + bname << "[" << (i + _inner_loop_vec_step) << "]" << _line_suffix; + start_ofs = end; + stop_ofs = end + _inner_loop_vec_step; + start_load = max(len - _inner_loop_vec_step, 0); + } - // Loop thru vec ptrs. - for (auto vp : _vec_ptrs) { - auto& ptr = vp.second; // ptr var name. + // Before start of loop. + else { + bname = make_var_name(vname + "_buf"); + os << "\n // Buffer for " << key->make_str() << " with " << ildim << " vector "; + if (len == 1) + os << "offset " << lo << "\n"; + else + os << "offsets in [" << lo << "..." << (end-1) << "]\n"; + os << _line_prefix << _var_type << " " << bname << "[" << len << "];\n"; + os << _line_prefix << "{\n"; + start_ofs = lo; + stop_ofs = end; + start_load = 0; + } - // Filter by ptr_var if provided. - if (ptr_var.length() && ptr_var != ptr) - continue; + // Load the buffer. + int i = start_load; + for (int vofs = start_ofs; vofs < stop_ofs && i < len; vofs++, i++) { + auto eofs = vofs * _dims._fold[ildim]; // Vector ofs. - // _ptr_ofs{Lo,Hi} contain first and last offsets in idim, - // NOT normalized to vector len. - string left = _dims.make_norm_str(_ptr_ofs_lo[ptr], idim); - if (left.length() == 0) left = "0"; - string right = _dims.make_norm_str(_ptr_ofs_hi[ptr], idim); + // Make pt w/needed offset. + auto ogp = gp.clone_var_point(); + const string& ildim = _settings._inner_loop_dim; // ofs dim. + IntScalar idi(ildim, eofs); + ogp->set_arg_offset(idi); - // Loop bounds. - string start, stop; - - // If fetching ahead, only need to get those following - // the previous one. - if (ahead) - start = "(PFD_L" + to_string(level) + "*" + imult + ")" + right; + // Get value at pt. 
+ string res; + if (_reuse_vars && _vec_vars.count(*ogp)) + res = _vec_vars[*ogp]; + else + res = print_aligned_vec_read(os, *ogp); - // If fetching first time, need to fetch across whole range; - // starting at left edge. - else - start = left; - start = "(" + start + ")"; - - // If fetching again, stop before next one. - if (ahead) - stop = "((PFD_L" + to_string(level) + "+1)*" + imult + ")" + right; + // Save in buf. + os << _line_prefix << bname << "[" << i << "] = " << res << _line_suffix; + } + os << _line_prefix << "} // Setting " << bname << "\n"; + + if (!in_loop) + _pt_buf_name[*key] = bname; + done.insert(*key); + } + } + + // Print prefetches for each inner-loop base pointer. + // 'in_loop' == 'true': prefetch at end of loop; otherwise before loop. + void CppVecPrintHelper::print_prefetches(ostream& os, bool in_loop) { + get_point_stats(); + + // Not currently prefetching anything before loop starts. + if (!in_loop) + return; + + const string& ildim = _settings._inner_loop_dim; + auto& imult = _inner_loop_vec_step; - // If fetching first time, stop where next "ahead" one ends. - else - stop = "(PFD_L" + to_string(level) + "*" + imult + ")" + right; - stop = "(" + stop + ")"; - - // Start loop of prefetches. - os << "\n // For pointer '" << ptr << "'\n" - "#pragma unroll(" << stop << " - " << start << ")\n" << - _line_prefix << " for (int ofs = " << start << - "; ofs < " << stop << "; ofs++) {\n"; - - // Need to print prefetch for every unique var-point read. - set done; - for (auto& gp : _vv._aligned_vecs) { - - // For the current base ptr? - auto bgp = make_base_point(gp); - auto* p = lookup_point_ptr(*bgp); - if (p && *p == ptr) { - - // Expression for this offset from inner-dim var. - string inner_expr = idim + " + ofs"; - - // Expression for ptr offset at this point. - string ofs_expr = get_ptr_offset(gp, inner_expr); - print_point_comment(os, gp, "Prefetch for "); - - // Already done? 
- if (done.count(ofs_expr)) - os << " // Already accounted for.\n"; - - else { - done.insert(ofs_expr); - - // Prefetch. - os << _line_prefix << " prefetch(&" << ptr << - "[" << ofs_expr << "])" << _line_suffix; - } + // 'level': cache level. + for (int level = 1; level <= 2; level++) { + + // Distance. + if (!_settings._prefetch_dists.count(level)) + continue; + auto pfd = _settings._prefetch_dists.at(level); + if (pfd < 1) + continue; + + os << "\n // Prefetch " << pfd << " iteration(s) ahead to L" << + level << " cache.\n" << + _line_prefix << "{\n"; + + // Loop thru inner-loop stats. + for (auto i : _pt_inner_loop_hi) { + auto& key = i.first; + auto hi = i.second; // Furthest vec read at offset in key. + + // Pts in vec. + int start_ofs = hi + (pfd - 1) * _inner_loop_vec_step + 1; + int stop_ofs = start_ofs + _inner_loop_vec_step; + for (int vofs = start_ofs; vofs < stop_ofs; vofs++) { + auto eofs = vofs * _dims._fold[ildim]; // Vector ofs. + + // Make pt w/needed offset. + auto ogp = key.clone_var_point(); + const string& ildim = _settings._inner_loop_dim; // ofs dim. + IntScalar idi(ildim, eofs); + ogp->set_arg_offset(idi); + + // Get ptr to it. + auto* p = lookup_inner_loop_base_ptr(*ogp); + if (p) { + string ptr_expr = *p; + string ptr_var = ptr_expr; + auto ofs_str = get_inner_loop_ptr_offset(os, *ogp); + if (ofs_str.length()) { + ptr_expr += " + (" + ofs_str + ")"; + ptr_var = make_var_name("vec_ptr"); + os << _line_prefix << CppPrintHelper::_var_ptr_type << " " << ptr_var << + " = " << ptr_expr << _line_suffix; } + + // Insert prefetch. + os << _line_prefix << " prefetch(" << ptr_var << + ")" << _line_suffix; } - // End loop; - os << " }\n"; + // TODO: handle case w/o ptr. } - os << _line_prefix << "#endif // L" << level << " prefetch.\n"; - } + } + os << _line_prefix << "} // L" << level << " prefetching\n"; + } // levels. } - // Make base point (misc & inner-dim indices = 0). 
- var_point_ptr CppVecPrintHelper::make_base_point(const VarPoint& gp) { - var_point_ptr bgp = gp.clone_var_point(); - for (auto& dim : gp.get_dims()) { - auto& dname = dim->_get_name(); - auto type = dim->get_type(); + // print increments of indices & pointers. + void CppVecPrintHelper::print_end_inner_loop(ostream& os) { + get_point_stats(); - // Set inner domain index to 0. - if (dname == get_dims()._inner_dim) { - IntScalar idi(dname, 0); - bgp->set_arg_const(idi); - } + auto& ild = _settings._inner_loop_dim; + os << "\n // Increment indices and pointers.\n" << + _line_prefix << ild << " += " << + _inner_loop_vec_step << _line_suffix << - // Set misc indices to their min value if they are inside - // inner domain dim. - else if (_settings._inner_misc && type == MISC_INDEX) { - auto* var = gp._get_var(); - auto min_val = var->get_min_indices()[dname]; - IntScalar idi(dname, min_val); - bgp->set_arg_const(idi); - } + _line_prefix << get_local_elem_index(ild) << " += " << + _inner_loop_elem_step << _line_suffix << + + _line_prefix << get_global_elem_index(ild) << " += " << + _inner_loop_elem_step << _line_suffix; + + for (auto& i : _inner_loop_base_ptrs) { + auto& vp = i.first; + auto& ptr = i.second; + auto* stride = lookup_stride(*vp._get_var(), _settings._inner_loop_dim); + assert(stride); + os << _line_prefix << ptr << " += " << + _inner_loop_vec_step << " * " << *stride << _line_suffix; } - return bgp; } - // Print code to set ptr_name to gp. - void CppVecPrintHelper::print_point_ptr(ostream& os, const string& ptr_name, - const VarPoint& gp) { - print_point_comment(os, gp, "Calculate pointer to "); - - // Get pointer to vector using normalized indices. - // Ignore out-of-range errors because we might get a base pointer to an - // element before the allocated range. 
- auto vp = print_vec_point_call(os, gp, "get_vec_ptr_norm", "", "false", true); - - // Ptr will be unique if: - // - Var doesn't have step dim, or - // - Var doesn't allow dynamic step allocs and the alloc size is one (TODO), or - // - Var doesn't allow dynamic step allocs and all accesses are via - // offsets from the step dim w/compatible offsets (TODO). - // TODO: must also share pointers during code gen in last 2 cases. + // Get expression for offset of 'gp' from var-base pointer. + string CppVecPrintHelper::get_var_base_ptr_offset(ostream& os, + const VarPoint& gp, + const VarMap* var_map) { auto* var = gp._get_var(); - bool is_unique = false; - //bool is_unique = (var->get_step_dim() == nullptr); - string type = is_unique ? _var_ptr_restrict_type : _var_ptr_type; + assert(var); + auto vname = var->get_name(); + string ofs_str; + int nterms = 0; + + // Const offset. + if (_ptr_ofs.count(vname)) { + ofs_str += string("(") + _ptr_ofs[vname] + ")"; + nterms++; + } - // Print type and value. - os << _line_prefix << type << " " << ptr_name << " = " << vp << _line_suffix; + // Construct the point-specific linear offset by adding the products + // of each index with the var's stride in that dim. + for (int i = 0; i < var->get_num_dims(); i++) { + + // Access in layout order. + auto& dimi = gp.get_layout_dims().at(i); + auto typei = dimi->get_type(); + bool is_step = typei == STEP_INDEX; + + // There is a separate pointer for each value of + // the step index, so we don't need to include + // that index in the offset calculation. + if (!is_step) { + string dni = dimi->_get_name(); + + // Construct offset in this dim. + string nas = (gp.get_vec_type() == VarPoint::VEC_FULL) ? + gp.make_norm_arg_str(dni, _dims, var_map) : + gp.make_arg_str(dni, var_map); + + // Get stride in this dim. + auto* stride = lookup_stride(*var, dni); + assert(stride); + + // Mult & add to offset expression. 
+ if (nterms) + ofs_str += " + "; + ofs_str += "((" + nas + ") * (" + *stride + "))"; + nterms++; + } + } + + return ofs_str; } - // Get expression for offset of 'gp' from base pointer. Base pointer - // points to vector with outer-dims == same values as in 'gp', inner-dim - // == 0 and misc dims == their min value. - string CppVecPrintHelper::get_ptr_offset(const VarPoint& gp, const string& inner_expr) { + // Get expression for offset of 'gp' from inner-loop base pointer. Base + // pointer points to vector with domain dim w/no offset or + // same values as in 'gp', and misc dims == their min value. + // Return empty string if no offset. + string CppVecPrintHelper::get_inner_loop_ptr_offset(ostream& os, + const VarPoint& gp, + const VarMap* var_map, + const string& inner_expr) { auto* var = gp._get_var(); - - // Need to create an expression for inner-dim - // and misc indices offsets. - - // Start with offset in inner-dim direction. - // This must the dim that appears before the misc dims - // in the var layout. - string idim = _dims._inner_dim; - string ofs_str = "("; - if (inner_expr.length()) - ofs_str += inner_expr; - else - ofs_str += gp.make_norm_arg_str(idim, _dims); - ofs_str += ")"; - - // Misc indices if they are inside inner-dim. - if (_settings._inner_misc) { - for (int i = 0; i < var->get_num_dims(); i++) { - auto& dimi = gp.get_dims().at(i); - auto& dni = dimi->_get_name(); - auto typei = dimi->get_type(); - if (typei == MISC_INDEX) { - - // Mult by size of remaining misc dims. 
- for (int j = i; j < var->get_num_dims(); j++) { - auto& dimj = gp.get_dims().at(j); - auto& dnj = dimj->_get_name(); - auto typej = dimj->get_type(); - if (typej == MISC_INDEX) { - auto min_idx = var->get_min_indices()[dnj]; - auto max_idx = var->get_max_indices()[dnj]; - ofs_str += " * (" + to_string(max_idx) + - " - " + to_string(min_idx) + " + 1)"; - } + assert(var); + auto vname = var->get_name(); + string ofs_str; + int nterms = 0; + + // Construct the point-specific linear offset by adding the products + // of each index with the var's stride in that dim. + for (int i = 0; i < var->get_num_dims(); i++) { + auto& dimi = gp.get_layout_dims().at(i); + auto dname = dimi->_get_name(); + auto type = dimi->get_type(); + bool use_domain = (type == DOMAIN_INDEX) && + (!_settings._use_many_ptrs || dname == _dims._inner_layout_dim); + bool use_misc = type == MISC_INDEX; + + // Need to create an expression for offsets. + if (use_domain || use_misc) { + + // Construct offset in this dim. + string nas; + + if (use_domain) { + + // Get const offset in inner dim. + // E.g., if idim == 'y', A(t, x+1, y+4) => 4. + auto& offsets = gp.get_arg_offsets(); + auto ofs = offsets[dname]; + + // Is non-zero? + if (ofs) { + if (_dims._fold_gt1.lookup(dname)) + nas = _dims.make_norm_str(ofs, dname); + else + nas = to_string(ofs); } - - // Add offset of this misc value, which must be const. - auto min_val = var->get_min_indices()[dni]; - auto val = gp.get_arg_consts()[dni]; - ofs_str += " + (" + to_string(val) + " - " + - to_string(min_val) + ")"; + + // Override? + if (dname == _dims._inner_layout_dim && inner_expr.length()) + nas = inner_expr; + } + + // Offset from min value of this misc index. + else { + assert(type == MISC_INDEX); + auto min_val = var->get_min_indices()[dname]; + nas = gp.make_arg_str(dname, var_map) + " - " + to_string(min_val); + } + + // Get stride in this dim. + auto* stride = lookup_stride(*var, dname); + assert(stride); + + // Mult & add to offset expression. 
+ if (nas.length()) { + if (nterms) + ofs_str += " + "; + ofs_str += "((" + nas + ") * (" + *stride + "))"; + nterms++; } } } + return ofs_str; } // Print any needed memory reads and/or constructions to 'os'. // Return code containing a vector of var points. string CppVecPrintHelper::read_from_point(ostream& os, const VarPoint& gp) { + get_point_stats(); string code_str; // Already done and saved. if (_reuse_vars && _vec_vars.count(gp)) code_str = _vec_vars[gp]; // do nothing. - // Can we use a vec pointer? - // Read must be aligned, and we must have a pointer. - else if (_vv._aligned_vecs.count(gp) && - gp.get_vec_type() == VarPoint::VEC_FULL && - gp.get_loop_type() == VarPoint::LOOP_OFFSET) { - - // Got a pointer to the base addr? - auto bgp = make_base_point(gp); - auto* p = lookup_point_ptr(*bgp); - if (p) { -#ifdef DEBUG_GP - cout << " //** reading from point " << gp.make_str() << " using pointer.\n"; -#endif - - // Output read using base addr. - auto ofs_str = get_ptr_offset(gp); - print_point_comment(os, gp, "Read aligned"); - code_str = make_var_name(); - os << _line_prefix << get_var_type() << " " << code_str << " = " << - *p << "[" << ofs_str << "]" << _line_suffix; - } - } - // If not done, continue based on type of vectorization. - if (!code_str.length()) { + else { - // Scalar GP? + // Scalar point? if (gp.get_vec_type() == VarPoint::VEC_NONE) { #ifdef DEBUG_GP cout << " //** reading from point " << gp.make_str() << " as scalar.\n"; #endif - code_str = read_from_scalar_point(os, gp); + code_str = read_from_scalar_point(os, gp, &_vec2elem_local_map); } - // Non-scalar but non-vectorizable GP? + // Non-scalar but non-vectorizable point? else if (gp.get_vec_type() == VarPoint::VEC_PARTIAL) { #ifdef DEBUG_GP cout << " //** reading from point " << gp.make_str() << " as partially vectorized.\n"; #endif - code_str = print_non_vec_read(os, gp); + code_str = print_partial_vec_read(os, gp); } // Everything below this should be VEC_FULL. 
// An aligned vector block? - else if (_vv._aligned_vecs.count(gp)) { + else if (_aligned_reads.count(gp)) { #ifdef DEBUG_GP cout << " //** reading from point " << gp.make_str() << " as fully vectorized and aligned.\n"; #endif @@ -565,65 +953,202 @@ namespace yask { // Return code to update a vector of var points or null string // if all writes were printed. string CppVecPrintHelper::write_to_point(ostream& os, const VarPoint& gp, - const string& val) { + const string& val) { - // Can we use a pointer? - if (gp.get_loop_type() == VarPoint::LOOP_OFFSET) { + // Use vec write. + // NB: currently, all eqs must be vectorizable on LHS, + // so we only need to handle vectorized writes. + // TODO: relax this restriction. + print_aligned_vec_write(os, gp, val); - // Got a pointer to the base addr? - auto bgp = make_base_point(gp); - auto* p = lookup_point_ptr(*bgp); - if (p) { + return ""; // no returned expression. + } + + // Print aligned memory read. + // This should be the most common type of var read. + string CppVecPrintHelper::print_aligned_vec_read(ostream& os, + const VarPoint& gp) { - // Offset. - auto ofs_str = get_ptr_offset(gp); + // Make comment and function call. + print_point_comment(os, gp, "Read aligned vector"); + string mv_name = make_var_name("vec"); - // Output write using base addr. - print_point_comment(os, gp, "Write aligned"); + // Is it already in a buffer? + bool in_buf = false; + if (_inner_loop_key.count(gp)) { + auto key = _inner_loop_key.at(gp); + if (_pt_buf_name.count(*key)) { + auto& bname = _pt_buf_name.at(*key); - if (_use_masked_writes) - os << _line_prefix << val << ".store_to_masked(" << *p << " + (" << - ofs_str << "), write_mask)" << _line_suffix; - else - os << _line_prefix << val << ".store_to(" << *p << " + (" << - ofs_str << "))" << _line_suffix; + // Offset. + const string& ildim = _settings._inner_loop_dim; // ofs dim. + auto& offsets = gp.get_arg_offsets(); + auto& ofs = offsets[ildim]; // Elem ofs. 
+ auto vofs = ofs / _dims._fold[ildim]; // Vector ofs. + auto lo = _pt_inner_loop_lo.at(*key); + auto len = _pt_buf_len.at(*key); + auto end = lo + len; + int i = vofs - lo; + if (i >= 0 && i < len) { + in_buf = true; + + // Load from buffer. + os << _line_prefix << get_var_type() << " " << mv_name << " = " << + bname << "[" << i << "]" << _line_suffix; + } + } + } + + // Do we have a pointer to the base? + // TODO: handle case with pointer to var base but no ptr to inner-loop base. + auto* p = lookup_inner_loop_base_ptr(gp); + if (p) { + + if (in_buf) + os << _line_prefix << "#ifdef CHECK\n"; + + // Ptr expression. + string ptr_expr = *p; + string ptr_var = ptr_expr; + auto ofs_str = get_inner_loop_ptr_offset(os, gp); + if (ofs_str.length()) { + ptr_expr += " + (" + ofs_str + ")"; + ptr_var = make_var_name("vec_ptr"); + os << _line_prefix << CppPrintHelper::_var_ptr_type << " " << ptr_var << + " = " << ptr_expr << _line_suffix; + } - return ""; + // Check addr. + auto rpn = make_point_call_vec(os, gp, "get_vec_ptr_norm", "", "", true); + os << _line_prefix << "host_assert(" << + ptr_var << " == " << rpn << ")" << _line_suffix; + + // Output load. + // We don't use masked loads because several aligned loads might + // be combined to make a simulated unaligned load. + if (!in_buf) { + os << _line_prefix << get_var_type() << " " << mv_name << _line_suffix; + os << _line_prefix << mv_name << ".load_from(" << ptr_var << ")" << + _line_suffix; + } + + // Check value. + else { + os << _line_prefix << "host_assert(" << mv_name << " == *" << + ptr_var << ")" << _line_suffix << + _line_prefix << "#endif // CHECK\n"; } + + } else if (!in_buf) { + + // If no buffer or pointer, use function call. + auto rvn = make_point_call_vec(os, gp, "read_vec_norm", "", "", true); + os << _line_prefix << get_var_type() << " " << mv_name << " = " << + rvn << _line_suffix; } + + return mv_name; + } - // If no pointer, use vec write. 
- // NB: currently, all eqs must be vectorizable on LHS, - // so we only need to handle vectorized writes. - // TODO: relax this restriction. - print_aligned_vec_write(os, gp, val); + // Read from a single point. + // Return code for read. + string CppVecPrintHelper::read_from_scalar_point(ostream& os, const VarPoint& gp, + const VarMap* var_map) { + assert(var_map); + auto* var = gp._get_var(); + assert(!var->is_foldable()); // Assume all scalar reads are from non-vec vars. + + // Do we have a pointer to the base? + auto* p = lookup_inner_loop_base_ptr(gp); + if (p) { + + // Ptr expression. + string ptr_expr = *p; + string ptr_var = ptr_expr; + auto ofs_str = get_inner_loop_ptr_offset(os, gp, var_map); + if (ofs_str.length()) { + ptr_expr += " + (" + ofs_str + ")"; + ptr_var = make_var_name("elem_ptr"); + os << _line_prefix << CppPrintHelper::_var_ptr_type << " " << ptr_var << + " = " << ptr_expr << _line_suffix; + } + + // Check addr. + auto rp = make_point_call_vec(os, gp, "get_elem_ptr_local", "", "", false, var_map); + os << _line_prefix << "host_assert(" << + ptr_var << " == " << rp << ")" << _line_suffix; + + // Return expr. + return string("*(") + ptr_var + ")"; + } - return ""; // no returned expression. + else + return make_point_call_vec(os, gp, "read_elem_local", "", "", false, var_map); } + // Read from multiple points that are not vectorized. + // Return var name. + string CppVecPrintHelper::print_partial_vec_read(ostream& os, const VarPoint& gp) { + print_point_comment(os, gp, "Construct folded vector from non-folded data"); - // Print aligned memory read. - string CppVecPrintHelper::print_aligned_vec_read(ostream& os, const VarPoint& gp) { + // Make a vec var. + string mv_name = make_var_name("vec"); + os << _line_prefix << get_var_type() << " " << mv_name << _line_suffix; - print_point_comment(os, gp, "Read aligned"); - auto rvn = print_vec_point_call(os, gp, "read_vec_norm", "", "__LINE__", true); + // Loop through all points in the vector fold. 
+ get_fold().visit_all_points([&](const IntTuple& vec_point, + size_t pelem){ - // Read memory. - string mv_name = make_var_name(); - os << _line_prefix << get_var_type() << " " << mv_name << " = " << rvn << _line_suffix; + // Example: vec_point contains x=0, y=2, z=1, where each val + // is the offset in the given fold dim. We want to map + // x=>x_elem, y=>(y_elem+2), z=>(z_elem+1) in var-point + // index args. + VarMap v_map; + for (auto& dim : vec_point) { + auto& dname = dim._get_name(); + int dofs = dim.get_val(); + + auto& ename = _vec2elem_local_map.at(dname); + if (dofs == 0) + v_map[dname] = ename; + else { + v_map[dname] = "(" + ename + "+" + to_string(dofs) + ")"; + } + } + + // Read or reuse. + string stmt = read_from_scalar_point(os, gp, &v_map); + auto* varname = lookup_elem_var(stmt); + if (!varname) { + + // Read val into a new scalar var. + string vname = make_var_name("scalar"); + os << _line_prefix << "real_t " << vname << + " = " << stmt << _line_suffix; + varname = save_elem_var(stmt, vname); + } + + // Output translated expression for this element. + os << _line_prefix << mv_name << "[" << pelem << "] = " << + *varname << "; // for offset " << vec_point.make_dim_val_str() << + _line_suffix; + + return true; + }); // end of lambda. return mv_name; } // Print unaliged memory read. // Assumes this results in same values as print_unaligned_vec(). + // TODO: use pointer. string CppVecPrintHelper::print_unaligned_vec_read(ostream& os, const VarPoint& gp) { print_point_comment(os, gp, "Read unaligned"); os << " // NOTICE: Assumes constituent vectors are consecutive in memory!" << endl; // Make a var. - string mv_name = make_var_name(); + string mv_name = make_var_name("unaligned_vec"); os << _line_prefix << get_var_type() << " " << mv_name << _line_suffix; - auto vp = print_vec_point_call(os, gp, "get_elem_ptr", "", "true", false); + auto vp = make_point_call_vec(os, gp, "get_elem_ptr", "", "true", false); // Read memory. 
os << _line_prefix << mv_name << @@ -632,15 +1157,47 @@ namespace yask { } // Print aligned memory write. - string CppVecPrintHelper::print_aligned_vec_write(ostream& os, const VarPoint& gp, - const string& val) { - print_point_comment(os, gp, "Write aligned"); - auto vn = print_vec_point_call(os, gp, "write_vec_norm_masked", val, "write_mask, __LINE__", true); - // without mask: auto vn = print_vec_point_call(os, gp, "write_vec_norm", val, "__LINE__", true); - - // Write temp var to memory. - os << vn; - return val; + void CppVecPrintHelper::print_aligned_vec_write(ostream& os, const VarPoint& gp, + const string& val) { + + print_point_comment(os, gp, "Write aligned vector"); + + // Got a pointer to the base addr? + auto* p = lookup_inner_loop_base_ptr(gp); + if (p) { + + // Ptr expression. + string ptr_expr = *p; + string ptr_var = ptr_expr; + auto ofs_str = get_inner_loop_ptr_offset(os, gp); + if (ofs_str.length()) { + ptr_expr += " + (" + ofs_str + ")"; + ptr_var = make_var_name("var_ptr"); + os << _line_prefix << CppPrintHelper::_var_ptr_type << " " << ptr_var << + " = " << ptr_expr << _line_suffix; + } + + // Check addr. + auto rpn = make_point_call_vec(os, gp, "get_vec_ptr_norm", "", "", true); + os << _line_prefix << "host_assert(" << + ptr_var << " == " << rpn << ")" << _line_suffix; + + // Output store. + os << _line_prefix << val; + if (_write_mask.length()) + os << ".store_to_masked(" << ptr_expr << ", " << _write_mask << ")"; + else + os << ".store_to(" << ptr_expr << ")"; + os << _line_suffix; + } + + else { + + // If no pointer, use function call. + string fn = _write_mask.length() ? "write_vec_norm_masked" : "write_vec_norm"; + auto vn = make_point_call_vec(os, gp, fn, val, _write_mask, true); + os << _line_prefix << vn << _line_suffix; + } } // Print conversion from memory vars to point var gp if needed. @@ -650,7 +1207,7 @@ namespace yask { print_point_comment(os, gp, "Construct unaligned"); // Declare var. 
- string pv_name = make_var_name(); + string pv_name = make_var_name("unaligned_vec"); os << _line_prefix << get_var_type() << " " << pv_name << _line_suffix; // Contruct it. @@ -689,51 +1246,88 @@ namespace yask { } } + // Print some rank info. + void CppVecPrintHelper::print_rank_data(ostream& os) { + auto& fold = get_fold(); + os << "\n // Rank data.\n"; + int i = 0; + for (auto& dim : fold) { + auto& dname = dim._get_name(); + string rdoname = _rank_domain_offset_prefix + dname; + os << " const idx_t " << rdoname << + " = core_data->_common_core._rank_domain_offsets[" << i << "];\n"; + i++; + } + } + // Print init of element indices. - // Fill _vec2elem_map as side-effect. + // Fill _vec2elem_*_map as side-effect. void CppVecPrintHelper::print_elem_indices(ostream& os) { auto& fold = get_fold(); - os << "\n // Element indices derived from vector indices.\n"; - int i = 0; + os << "\n // Element indices derived from vector indices" + " (only used for non-vectorized vars).\n"; for (auto& dim : fold) { auto& dname = dim._get_name(); - string ename = dname + _elem_suffix; string cap_dname = PrinterBase::all_caps(dname); - os << " idx_t " << ename << - " = _context->rank_domain_offsets[" << i << "] + (" << - dname << " * VLEN_" << cap_dname << ");\n"; - _vec2elem_map[dname] = ename; - i++; + string elname = dname + _elem_suffix_local; + string egname = dname + _elem_suffix_global; + string rdoname = _rank_domain_offset_prefix + dname; + os << " idx_t " << elname << + " = " << dname << " * VLEN_" << cap_dname << ";\n" + " idx_t " << egname << " = " << rdoname << " + " << elname << ";\n"; + _vec2elem_local_map[dname] = elname; + _vec2elem_global_map[dname] = egname; } } - - // Print invariant var-access vars for non-time loop(s). - string CppStepVarPrintVisitor::visit(VarPoint* gp) { - - // Pointer to var. - string var_ptr = _cvph.get_local_var(_os, get_var_ptr(*gp), CppPrintHelper::_var_ptr_restrict_type); + + // Print loop-invariant meta values for each VarPoint. 
+ string CppPreLoopPrintMetaVisitor::visit(VarPoint* gp) { + assert(gp); + + // Pointer to this var's core. + string varp = get_var_ptr(*gp); + string vname = gp->get_var_name(); + if (!_cvph.is_local_var(varp)) + _os << "\n // Pointer to core of var '" << vname << "'.\n"; + string var_ptr = _cvph.get_local_var(_os, varp, + CppPrintHelper::_var_ptr_restrict_type, + vname + "_core"); - // Time var. + // Step var for this access, if any. auto& dims = _cvph.get_dims(); - _cvph.get_local_var(_os, gp->make_step_arg_str(var_ptr, dims), - CppPrintHelper::_step_val_type); - return ""; + string sas = gp->make_step_arg_str(var_ptr, dims); + if (sas.length()) { + if (!_cvph.is_local_var(sas)) + _os << "\n // Step index for var '" << vname << "'.\n"; + _cvph.get_local_var(_os, sas, CppPrintHelper::_step_val_type, + vname + "_step_idx"); + } + + // Print strides and local offsets for this var. + _cvph.print_strides(_os, *gp); + + // Make and print a var-base pointer for this access. + _cvph.print_var_base_ptr(_os, *gp); + + return ""; } - // Print invariant var-access vars for an inner loop. - string CppLoopVarPrintVisitor::visit(VarPoint* gp) { + // Print loop-invariant data values for each VarPoint. + // TODO: fix warning from loading invariant real_vec_t outside of OMP device region. + string CppPreLoopPrintDataVisitor::visit(VarPoint* gp) { + assert(gp); - // Retrieve prior analysis of this var point. - auto loop_type = gp->get_loop_type(); + // Retrieve prior dependence analysis of this var point. + auto dep_type = gp->get_var_dep(); // If invariant, we can load now. - if (loop_type == VarPoint::LOOP_INVARIANT) { + if (dep_type == VarPoint::DOMAIN_VAR_INVARIANT) { // Not already loaded? if (!_cvph.lookup_point_var(*gp)) { string expr = _ph.read_from_point(_os, *gp); string res; - make_next_temp_var(res, gp) << expr << _ph.get_line_suffix(); + make_next_temp_var(res, gp, "expr", "") << expr << _ph.get_line_suffix(); // Save for future use. 
_cvph.save_point_var(*gp, res); diff --git a/src/compiler/lib/Cpp.hpp b/src/compiler/lib/Cpp.hpp index af0f399a..a579a992 100644 --- a/src/compiler/lib/Cpp.hpp +++ b/src/compiler/lib/Cpp.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -33,7 +33,7 @@ IN THE SOFTWARE. #include "Var.hpp" namespace yask { - + /////////// Scalar code ///////////// // Outputs C++ scalar code for YASK. @@ -47,12 +47,10 @@ namespace yask { CppPrintHelper(const CompilerSettings& settings, const Dimensions& dims, const CounterVisitor* cv, - const string& var_prefix, const string& var_type, const string& line_prefix, const string& line_suffix) : - PrintHelper(settings, dims, cv, var_prefix, var_type, - line_prefix, line_suffix) { } + PrintHelper(settings, dims, cv, var_type, line_prefix, line_suffix) { } virtual ~CppPrintHelper() { } // Format a real, preserving precision. @@ -65,31 +63,35 @@ namespace yask { } // Format a pointer to a var. - virtual string get_var_ptr(const VarPoint& gp) { - const auto* var = gp._get_var(); - string gname = var->_get_name(); - string expr = "(static_cast<_context_type::" + gname + "_type*>(_context_data->"; - if (var->is_scratch()) - expr += gname + "_list[region_thread_idx]"; + virtual string get_var_ptr(const Var& var) { + string gname = var._get_name(); + string expr; + if (var.is_scratch()) + expr = "thread_core_data."; else - expr += gname + "_ptr"; - expr += ".get()->gbp()))"; + expr = "core_data->"; + expr += "var_" + gname + "_core_p.get()"; return expr; } + virtual string get_var_ptr(const VarPoint& gp) { + const auto* var = gp._get_var(); + assert(var); + return get_var_ptr(*var); + } // Make call for a point. 
// This is a utility function used for both reads and writes. virtual string make_point_call(ostream& os, - const VarPoint& gp, - const string& fname, - string opt_arg = ""); + const VarPoint& gp, + const string& fname, + string opt_arg = ""); // Return a var-point reference. virtual string read_from_point(ostream& os, const VarPoint& gp) override; // Return code to update a var point. virtual string write_to_point(ostream& os, const VarPoint& gp, - const string& val) override; + const string& val) override; }; /////////// Vector code ///////////// @@ -98,31 +100,54 @@ namespace yask { class CppVecPrintHelper : public CppPrintHelper, public VecPrintHelper { - public: - CppVecPrintHelper(VecInfoVisitor& vv, - const CompilerSettings& settings, - const Dimensions& dims, - const CounterVisitor* cv, - const string& var_prefix, - const string& var_type, - const string& line_prefix, - const string& line_suffix) : - CppPrintHelper(settings, dims, cv, - var_prefix, var_type, line_prefix, line_suffix), - VecPrintHelper(vv) { } - protected: - // Vars for tracking pointers to var values. - map _vec_ptrs; // pointers to var vecs. value: ptr-var name. - map _ptr_ofs_lo; // lowest read offset from _vec_ptrs in inner dim. - map _ptr_ofs_hi; // highest read offset from _vec_ptrs in inner dim. + // Name of current stage; + string _stage_name; + + // Name of ptr to lowest-allocated vec for a given point in that var. + // There is a unique ptr for each step-arg per var. + // Thus, there is a many->one mapping for points that vary only by domain and/or misc indices. + // Key: point expr; value: ptr-var name. + map _var_base_ptrs; + + // Name of ptr to current point in inner-loop dim. + // Inner-layout dim index has no offset, and misc-dim indices are at min-value. + // Key: point expr; value: ptr-var name. + map _inner_loop_base_ptrs; + + // Vars for tracking other info about vars. + typedef pair VarDimKey; // var and dim names. 
+ map _strides; // var containing stride expr for given dim in var. + map _offsets; // var containing offset expr for given dim in var. + map _ptr_ofs; // var containing offset expr for key var. + map _inner_loop_key; // offsets perpendicular to inner-loop dim for var. + VarPointSet _aligned_reads; // _vv._aligned_vecs plus those for read-ahead. + + // Read stats. + // Key: point w/no inner-loop offset. + // Offsets and legths in vec-lengths in inner-loop dim. + // TODO: convert to one map with struct value. + map _pt_inner_loop_lo; // lowest read offset in inner-loop dim. + map _pt_inner_loop_hi; // highest read offset in inner-loop dim. + map _pt_buf_len; // buffer length (only exists if needed). + map _pt_buf_name; // buffer name. // Element indices. - string _elem_suffix = "_elem"; - VarMap _vec2elem_map; // maps vector indices to elem indices; filled by print_elem_indices. + string _elem_suffix_global = "_global_elem"; + string _elem_suffix_local = "_local_elem"; + VarMap _vec2elem_local_map, _vec2elem_global_map; + + // Rank vars. + string _rank_domain_offset_prefix = "rank_domain_offset_"; - bool _use_masked_writes = true; + // Set to var name of write mask if/when used. + string _write_mask = ""; + + // Inner-loop steps. + bool _is_using_cluster = false; + int _inner_loop_vec_step = 1; + int _inner_loop_elem_step = 1; // A simple constant. virtual string add_const_expr(ostream& os, double v) override { @@ -137,20 +162,21 @@ namespace yask { // Print a comment about a point. // This is a utility function used for both reads and writes. virtual void print_point_comment(ostream& os, const VarPoint& gp, - const string& verb) const { + const string& verb) const { - os << endl << " // " << verb << " vector starting at " << + os << endl << " // " << verb << " at " << gp.make_str() << "." << endl; } - // Return code for a vectorized point. + // Return code for a var function call at a point. // This is a utility function used for both reads and writes. 
- virtual string print_vec_point_call(ostream& os, - const VarPoint& gp, - const string& func_name, - const string& first_arg, - const string& last_arg, - bool is_norm); + virtual string make_point_call_vec(ostream& os, + const VarPoint& gp, + const string& func_name, + const string& first_arg, + const string& last_arg, + bool is_vector_normalized, + const VarMap* var_map = 0); // Print aligned memory read. virtual string print_aligned_vec_read(ostream& os, const VarPoint& gp) override; @@ -160,8 +186,8 @@ namespace yask { virtual string print_unaligned_vec_read(ostream& os, const VarPoint& gp) override; // Print aligned memory write. - virtual string print_aligned_vec_write(ostream& os, const VarPoint& gp, - const string& val) override; + virtual void print_aligned_vec_write(ostream& os, const VarPoint& gp, + const string& val) override; // Print conversion from memory vars to point var gp if needed. // This calls print_unaligned_vec_ctor(), which can be overloaded @@ -176,11 +202,11 @@ namespace yask { // Read from a single point to be broadcast to a vector. // Return code for read. virtual string read_from_scalar_point(ostream& os, const VarPoint& gp, - const VarMap* v_map=0) override; + const VarMap* var_map) override; // Read from multiple points that are not vectorizable. // Return var name. - virtual string print_non_vec_read(ostream& os, const VarPoint& gp) override; + virtual string print_partial_vec_read(ostream& os, const VarPoint& gp) override; // Print construction for one point var pv_name from elems. // This version prints inefficient element-by-element assignment. @@ -190,20 +216,51 @@ namespace yask { } // Get offset from base pointer. 
- virtual string get_ptr_offset(const VarPoint& gp, - const string& inner_expr = ""); + virtual string get_var_base_ptr_offset(ostream& os, const VarPoint& gp, + const VarMap* var_map = 0); + virtual string get_inner_loop_ptr_offset(ostream& os, const VarPoint& gp, + const VarMap* var_map = 0, + const string& inner_ofs = ""); public: - + CppVecPrintHelper(VecInfoVisitor& vv, + const CompilerSettings& settings, + const Dimensions& dims, + const CounterVisitor* cv, + const string& var_type, + const string& line_prefix, + const string& line_suffix) : + CppPrintHelper(settings, dims, cv, + var_type, line_prefix, line_suffix), + VecPrintHelper(vv) { + set_using_cluster(false); + } + // Whether to use masks during write. - virtual void set_use_masked_writes(bool do_use) { - _use_masked_writes = do_use; + virtual void set_write_mask(string mask_var) { + _write_mask = mask_var; } - virtual bool get_use_masked_writes() const { - return _use_masked_writes; + virtual string get_write_mask() const { + return _write_mask; } - // Print any needed memory reads and/or constructions to 'os'. + // Set step lengths. + virtual void set_using_cluster(bool use) { + _is_using_cluster = use; + const string& ildim = _settings._inner_loop_dim; + _inner_loop_vec_step = use ? _dims._cluster_mults[ildim] : 1; + _inner_loop_elem_step = _inner_loop_vec_step * _dims._fold[ildim]; + } + + // Set stage name. + virtual void set_stage_name(const string& sname) { + _stage_name = sname; + } + + // Collect some stats on points. + virtual void get_point_stats(); + + // Print any needed memory reads and/or constructions to 'os'. // Return code containing a vector of var points. virtual string read_from_point(ostream& os, const VarPoint& gp) override; @@ -212,69 +269,116 @@ namespace yask { // if all writes were printed. virtual string write_to_point(ostream& os, const VarPoint& gp, const string& val) override; - // Print code to set pointers of aligned reads. 
- virtual void print_base_ptrs(ostream& os); + // Make var base point (first allocated point). + virtual var_point_ptr make_var_base_point(const VarPoint& gp); + + // Make inner-loop base point (no inner-layout offset; misc-dim indices = min-val). + virtual var_point_ptr make_inner_loop_base_point(const VarPoint& gp); + + // Print code to create base pointers for aligned reads. + virtual void print_var_base_ptr(ostream& os, const VarPoint& gp); - // Make base point (misc & inner-dim indices = 0). - virtual var_point_ptr make_base_point(const VarPoint& gp); + // Print things needed before inner loop. + virtual void print_inner_loop_prefix(ostream& os); - // Print prefetches for each base pointer. - // Print only 'ptr_var' if provided. - virtual void print_prefetches(ostream& os, bool ahead, string ptr_var = ""); + // Print all aligned loads before they're needed. + virtual void print_early_loads(ostream& os); + // Print prefetches for each inner-loop base pointer. + // 'in_loop': prefetch PF distance ahead instead of up to PF dist. + virtual void print_prefetches(ostream& os, bool in_loop); + + // Print buffer-code for each inner-loop base pointer. + // 'in_loop': just shift and load last one. + virtual void print_buffer_code(ostream& os, bool in_loop); + + // print init of rank constants. + virtual void print_rank_data(ostream& os); + // print init of un-normalized indices. virtual void print_elem_indices(ostream& os); + // print increments of indices & pointers. + virtual void print_end_inner_loop(ostream& os); + // get un-normalized index. - virtual const string& get_elem_index(const string& dname) const { - return _vec2elem_map.at(dname); + virtual const string& get_local_elem_index(const string& dname) const { + return _vec2elem_local_map.at(dname); + } + virtual const string& get_global_elem_index(const string& dname) const { + return _vec2elem_global_map.at(dname); } - // Print code to set ptr_name to gp. 
- virtual void print_point_ptr(ostream& os, const string& ptr_name, const VarPoint& gp); + // Print strides for 'gp'. + virtual void print_strides(ostream& os, const VarPoint& gp); // Access cached values. - virtual void save_point_ptr(const VarPoint& gp, string var) { - _vec_ptrs[gp] = var; + virtual string* lookup_var_base_ptr(const VarPoint& gp) { + auto bgp = make_var_base_point(gp); + if (_var_base_ptrs.count(*bgp)) + return &_var_base_ptrs.at(*bgp); + return 0; + } + virtual string* lookup_inner_loop_base_ptr(const VarPoint& gp) { + auto bgp = make_inner_loop_base_point(gp); + if (_inner_loop_base_ptrs.count(*bgp)) + return &_inner_loop_base_ptrs.at(*bgp); + return 0; } - virtual string* lookup_point_ptr(const VarPoint& gp) { - if (_vec_ptrs.count(gp)) - return &_vec_ptrs.at(gp); + virtual string* lookup_stride(const Var& var, const string& dim) { + auto key = VarDimKey(var.get_name(), dim); + if (_strides.count(key)) + return &_strides.at(key); + return 0; + } + virtual string* lookup_offset(const Var& var, const string& dim) { + auto key = VarDimKey(var.get_name(), dim); + if (_offsets.count(key)) + return &_offsets.at(key); return 0; } }; - // Outputs the time-invariant variables. - class CppStepVarPrintVisitor : public PrintVisitorBase { + // Outputs loop-invariant values. + class CppPreLoopPrintVisitor : public PrintVisitorBase { protected: CppVecPrintHelper& _cvph; public: - CppStepVarPrintVisitor(ostream& os, + CppPreLoopPrintVisitor(ostream& os, CppVecPrintHelper& ph, const VarMap* var_map = 0) : PrintVisitorBase(os, ph, var_map), - _cvph(ph) { } - - // A var access. - virtual string visit(VarPoint* gp); + _cvph(ph) { + _visit_equals_lhs = true; + _visit_var_point_args = true; + _visit_conds = true; + } virtual string get_var_ptr(VarPoint& gp) { return _cvph.get_var_ptr(gp); } -}; + }; - // Outputs the loop-invariant variables for an inner loop. 
- class CppLoopVarPrintVisitor : public PrintVisitorBase { - protected: - CppVecPrintHelper& _cvph; + // Meta values such as strides and pointers. + class CppPreLoopPrintMetaVisitor : public CppPreLoopPrintVisitor { + public: + CppPreLoopPrintMetaVisitor(ostream& os, + CppVecPrintHelper& ph, + const VarMap* var_map = 0) : + CppPreLoopPrintVisitor(os, ph, var_map) { } + // A var access. + virtual string visit(VarPoint* gp); + }; + + // Data values. + class CppPreLoopPrintDataVisitor : public CppPreLoopPrintVisitor { public: - CppLoopVarPrintVisitor(ostream& os, - CppVecPrintHelper& ph, - const VarMap* var_map = 0) : - PrintVisitorBase(os, ph, var_map), - _cvph(ph) { } + CppPreLoopPrintDataVisitor(ostream& os, + CppVecPrintHelper& ph, + const VarMap* var_map = 0) : + CppPreLoopPrintVisitor(os, ph, var_map) { } // A var access. virtual string visit(VarPoint* gp); @@ -285,7 +389,9 @@ namespace yask { protected: EqStages& _eq_stages; // stages of bundles w/o inter-dependencies. EqBundles& _cluster_eq_bundles; // eq-bundles for scalar and vector. - string _context, _context_base, _context_hook; // class names; + string _stencil_prefix; + string _context, _context_hook; // class names; + string _core_t, _thread_core_t; // core struct names; // Print an expression as a one-line C++ comment. void add_comment(ostream& os, EqBundle& eq); @@ -296,11 +402,15 @@ namespace yask { virtual CppVecPrintHelper* new_cpp_vec_print_helper(VecInfoVisitor& vv, CounterVisitor& cv) { return new CppVecPrintHelper(vv, _settings, _dims, &cv, - "temp", "real_vec_t", " ", ";\n"); + "real_vec_t", " ", ";\n"); } // Print extraction of indices. - virtual void print_indices(ostream& os) const; + virtual void print_indices(ostream& os, + bool print_step = true, + bool print_domain = true, + const string prefix = "", + const string inner_var_prefix = "") const; // Print pieces of YASK output. 
virtual void print_macros(ostream& os); @@ -308,7 +418,6 @@ namespace yask { virtual void print_eq_bundles(ostream& os); virtual void print_context(ostream& os); - public: YASKCppPrinter(StencilSolution& stencil, EqBundles& eq_bundles, @@ -319,9 +428,11 @@ namespace yask { _cluster_eq_bundles(cluster_eq_bundles) { // name of C++ struct. - _context = "StencilContext_" + _stencil._get_name(); - _context_base = _context + "_data"; - _context_hook = _context + "_hook"; + _stencil_prefix = "stencil_" + _stencil._get_name() + "_"; + _context = _stencil_prefix + "context_t"; + _context_hook = _stencil_prefix + "hook_t"; + _core_t = _stencil_prefix + "core_t"; + _thread_core_t = _stencil_prefix + "thread_core_t"; } virtual ~YASKCppPrinter() { } diff --git a/src/compiler/lib/CppIntrin.cpp b/src/compiler/lib/CppIntrin.cpp index 9c8f7805..76f1f955 100644 --- a/src/compiler/lib/CppIntrin.cpp +++ b/src/compiler/lib/CppIntrin.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -41,6 +41,7 @@ void CppIntrinPrintHelper::try_align(ostream& os, // Find case(s) that can use valignd. // Try all possible combinations of 2 aligned vectors, including // each vector paired w/itself. + // TODO: develop a more targeted algorithm instead of using a blind search. for (auto mi = aligned_vecs.begin(); mi != aligned_vecs.end(); mi++) { auto& mv1 = *mi; for (auto mj = aligned_vecs.begin(); mj != aligned_vecs.end(); mj++) { @@ -190,6 +191,7 @@ void CppIntrinPrintHelper::try_perm1(ostream& os, size_t nelems = elems.size(); // Try a permute of each aligned vector. + // TODO: develop a more targeted algorithm instead of using a blind search. 
for (auto mi = aligned_vecs.begin(); mi != aligned_vecs.end(); mi++) { auto mv = *mi; @@ -291,6 +293,7 @@ void CppIntrinPrintHelper::try_perm2(ostream& os, // Find case(s) that can use perm2. Try all possible combinations // of 2 aligned vectors, but NOT including each vector paired // w/itself. (For that, we can use perm1.) + // TODO: develop a more targeted algorithm instead of using a blind search. for (auto mi = aligned_vecs.begin(); mi != aligned_vecs.end(); mi++) { auto& mv1 = *mi; for (auto mj = aligned_vecs.begin(); mj != aligned_vecs.end(); mj++) { diff --git a/src/compiler/lib/CppIntrin.hpp b/src/compiler/lib/CppIntrin.hpp index 84985bdc..568e31b7 100644 --- a/src/compiler/lib/CppIntrin.hpp +++ b/src/compiler/lib/CppIntrin.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -45,12 +45,11 @@ namespace yask { const CompilerSettings& settings, const Dimensions& dims, const CounterVisitor* cv, - const string& var_prefix, const string& var_type, const string& line_prefix, const string& line_suffix) : CppVecPrintHelper(vv, settings, dims, cv, - var_prefix, var_type, line_prefix, line_suffix) { } + var_type, line_prefix, line_suffix) { } // Dtor. virtual ~CppIntrinPrintHelper() { } @@ -115,34 +114,6 @@ namespace yask { }; - // Specialization for KNC. - class CppKncPrintHelper : public CppIntrinPrintHelper { - protected: - - // Try all applicable strategies. 
- virtual void try_strategies(ostream& os, - const string& pv_name, - size_t nelems_target, - const VecElemList& elems, - set& done_elems, - const VarPointSet& aligned_vecs) { - try_align(os, pv_name, nelems_target, elems, done_elems, aligned_vecs, true); - try_perm1(os, pv_name, nelems_target, elems, done_elems, aligned_vecs); - } - - public: - CppKncPrintHelper(VecInfoVisitor& vv, - const CompilerSettings& settings, - const Dimensions& dims, - const CounterVisitor* cv, - const string& var_prefix, - const string& var_type, - const string& line_prefix, - const string& line_suffix) : - CppIntrinPrintHelper(vv, settings, dims, cv, - var_prefix, var_type, line_prefix, line_suffix) { } - }; - // Specialization for KNL, SKX, etc. class CppAvx512PrintHelper : public CppIntrinPrintHelper { protected: @@ -164,12 +135,11 @@ namespace yask { const CompilerSettings& settings, const Dimensions& dims, const CounterVisitor* cv, - const string& var_prefix, const string& var_type, const string& line_prefix, const string& line_suffix) : CppIntrinPrintHelper(vv, settings, dims, cv, - var_prefix, var_type, line_prefix, line_suffix) { } + var_type, line_prefix, line_suffix) { } }; // Specialization for AVX, AVX2. @@ -191,43 +161,20 @@ namespace yask { const CompilerSettings& settings, const Dimensions& dims, const CounterVisitor* cv, - const string& var_prefix, const string& var_type, const string& line_prefix, const string& line_suffix) : CppIntrinPrintHelper(vv, settings, dims, cv, - var_prefix, var_type, line_prefix, line_suffix) { } - }; - - // Print KNC intrinsic code. 
- class YASKKncPrinter : public YASKCppPrinter { - protected: - virtual CppVecPrintHelper* new_cpp_vec_print_helper(VecInfoVisitor& vv, - CounterVisitor& cv) { - return new CppKncPrintHelper(vv, _settings, _dims, &cv, - "temp", "real_vec_t", " ", ";\n"); - } - - public: - YASKKncPrinter(StencilSolution& stencil, - EqBundles& eq_bundles, - EqStages& eq_stages, - EqBundles& cluster_eq_bundles) : - YASKCppPrinter(stencil, eq_bundles, eq_stages, cluster_eq_bundles) { } - - virtual int num_vec_elems() const { return 64 / _settings._elem_bytes; } - - // Whether multi-dim folding is efficient. - virtual bool is_folding_efficient() const { return true; } + var_type, line_prefix, line_suffix) { } }; // Print 256-bit AVX intrinsic code. class YASKAvx256Printer : public YASKCppPrinter { protected: virtual CppVecPrintHelper* new_cpp_vec_print_helper(VecInfoVisitor& vv, - CounterVisitor& cv) { + CounterVisitor& cv) override { return new CppAvx256PrintHelper(vv, _settings, _dims, &cv, - "temp", "real_vec_t", " ", ";\n"); + "real_vec_t", " ", ";\n"); } public: @@ -237,7 +184,9 @@ namespace yask { EqBundles& cluster_eq_bundles) : YASKCppPrinter(stencil, eq_bundles, eq_stages, cluster_eq_bundles) { } - virtual int num_vec_elems() const { return 32 / _settings._elem_bytes; } + virtual int num_vec_elems() const override { + return 32 / _settings._elem_bytes; + } }; // Print 512-bit AVX intrinsic code. @@ -245,9 +194,9 @@ namespace yask { protected: bool _is_lo; virtual CppVecPrintHelper* new_cpp_vec_print_helper(VecInfoVisitor& vv, - CounterVisitor& cv) { + CounterVisitor& cv) override { return new CppAvx512PrintHelper(vv, _settings, _dims, &cv, - "temp", "real_vec_t", " ", ";\n"); + "real_vec_t", " ", ";\n"); } public: @@ -259,12 +208,12 @@ namespace yask { YASKCppPrinter(stencil, eq_bundles, eq_stages, cluster_eq_bundles), _is_lo(is_lo) { } - virtual int num_vec_elems() const { + virtual int num_vec_elems() const override { return (_is_lo ? 
32 : 64) / _settings._elem_bytes; } // Whether multi-dim folding is efficient. - virtual bool is_folding_efficient() const { return true; } + virtual bool is_folding_efficient() const override { return true; } }; } // namespace yask. diff --git a/src/compiler/lib/Eqs.cpp b/src/compiler/lib/Eqs.cpp index bae7661f..2931d6f3 100644 --- a/src/compiler/lib/Eqs.cpp +++ b/src/compiler/lib/Eqs.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -34,6 +34,8 @@ IN THE SOFTWARE. namespace yask { + /////// Some locally-defined specialized visitors. + // A visitor to collect vars and points visited in a set of eqs. // For each eq, there are accessors for its output var and point // and its input vars and points. @@ -157,6 +159,195 @@ namespace yask { } }; + // Visitor for determining vectorization potential of var points. + // Vectorization depends not only on the dims of the var itself + // but also on how the var is indexed at each point. + class SetVecVisitor : public ExprVisitor { + const Dimensions& _dims; + + public: + SetVecVisitor(const Dimensions& dims) : + _dims(dims) { + _visit_equals_lhs = true; + _visit_var_point_args = true; + _visit_conds = true; + } + + // Check each var point in expr. + virtual string visit(VarPoint* gp) { + auto* var = gp->_get_var(); + + // Folded dims in the solution. + int soln_nfd = _dims._fold_gt1.size(); + + // Folded dims in this var. + int var_nfd = var->get_num_foldable_dims(); + assert(var_nfd <= soln_nfd); + + // Degenerate case with no folding in soln: we still mark points + // using vars with some domain dims as vectorizable. 
+ if (soln_nfd == 0 && var->is_foldable()) + gp->set_vec_type(VarPoint::VEC_FULL); + + // No foldable dims. + else if (var_nfd == 0) + gp->set_vec_type(VarPoint::VEC_NONE); + + else { + assert(var_nfd > 0); + + // Amount of vectorization allowed primarily depends on number + // of folded dimensions in the var accessed at this point. + // Vectorization is only possible if each access to a vectorized + // dim is a simple offset. For example, in var dim 'x', the + // index in the corresponding posn must be 'x', 'x+n', or 'x-n'. + // TODO: is this redundant with expr analysis? + int fdoffsets = 0; + for (auto fdim : _dims._fold_gt1) { + auto& fdname = fdim._get_name(); + if (gp->get_arg_offsets().lookup(fdname)) + fdoffsets++; + } + assert(fdoffsets <= var_nfd); + + // All folded dims are vectorizable? + if (fdoffsets == soln_nfd) { + assert(var->is_foldable()); + gp->set_vec_type(VarPoint::VEC_FULL); // all good. + } + + // Some dims are vectorizable? + else if (fdoffsets > 0) + gp->set_vec_type(VarPoint::VEC_PARTIAL); + + // No dims are vectorizable. + else + gp->set_vec_type(VarPoint::VEC_NONE); + + } + + // Also check args of this var point. + return ExprVisitor::visit(gp); + } + }; + + // Visitor to find set of all referenced index vars. + class FindIndicesVisitor : public ExprVisitor { + + public: + set vars_used; + + FindIndicesVisitor() { + _visit_equals_lhs = true; + _visit_var_point_args = true; + _visit_conds = true; + } + + // Check each index expr; + virtual string visit(IndexExpr* ie) { + vars_used.insert(ie->_get_name()); + return ""; + } + }; + + // Visitor for determining inner-loop accesses of var points. 
+ class SetLoopVisitor : public ExprVisitor { + const Dimensions& _dims; + const CompilerSettings& _settings; + + public: + SetLoopVisitor(const Dimensions& dims, + const CompilerSettings& settings) : + _dims(dims), _settings(settings) { + _visit_equals_lhs = true; + _visit_var_point_args = true; + _visit_conds = true; + } + + // Check each var point in expr. + virtual string visit(VarPoint* gp) { + + // Info from var. + auto* var = gp->_get_var(); + auto gdims = var->get_dim_names(); + + // Inner-loop var. + auto& idim = _settings._inner_loop_dim; + + // Access type. + // Assume invariant, then check below. + VarPoint::VarDepType lt = VarPoint::DOMAIN_VAR_INVARIANT; + + // Check every point arg. + auto& args = gp->get_args(); + for (size_t ai = 0; ai < args.size(); ai++) { + auto& arg = args.at(ai); + assert(ai < gdims.size()); + + // Get set of indices used by this arg expr. + FindIndicesVisitor fvv; + arg->accept(&fvv); + + // Does this arg refer to any domain dim? + if (lt == VarPoint::DOMAIN_VAR_INVARIANT) { + for (auto d : _dims._domain_dims) { + auto& dname = d._get_name(); + + if (dname != idim && fvv.vars_used.count(dname)) { + lt = VarPoint::DOMAIN_VAR_DEPENDENT; + break; // out of dim loop; no need to continue. + } + } + } + + // Does this arg refer to idim? + if (fvv.vars_used.count(idim)) { + + // Is it in the idim posn and a simple offset? + int offset = 0; + if (gdims.at(ai) == idim && + arg->is_offset_from(idim, offset)) { + lt = VarPoint::INNER_LOOP_OFFSET; + } + + // Otherwise, this arg uses idim, but not + // in a simple way. + else { + lt = VarPoint::INNER_LOOP_COMPLEX; + break; // out of arg loop; no need to continue. + } + } + } + gp->set_var_dep(lt); + return ""; + } + }; + + // Visitor that will shift each var point by an offset. 
+ class OffsetVisitor: public ExprVisitor { + IntTuple _ofs; + + public: + OffsetVisitor(const IntTuple& ofs) : + _ofs(ofs) { + _visit_equals_lhs = true; + _visit_var_point_args = true; + _visit_conds = true; + } + + // Visit a var point. + virtual string visit(VarPoint* gp) { + + // Shift var _ofs points. + auto ofs0 = gp->get_arg_offsets(); + IntTuple new_loc = ofs0.add_elements(_ofs, false); + gp->set_arg_offsets(new_loc); + return ""; + } + }; + + ////////// Methods. + // Analyze group of equations. // Sets _step_dir in dims. // Finds dependencies based on all eqs if 'settings._find_deps', setting @@ -165,9 +356,9 @@ namespace yask { // TODO: split this into smaller functions. // BIG-TODO: replace dependency algorithms with integration of a polyhedral // library. - void Eqs::analyze_eqs(CompilerSettings& settings, - Dimensions& dims, - ostream& os) { + void Eqs::analyze_eqs(const CompilerSettings& settings, + Dimensions& dims, + ostream& os) { auto& step_dim = dims._step_dim; // Gather initial stats from all eqs. @@ -211,7 +402,7 @@ namespace yask { // LHS must have all domain dims. for (auto& dd : dims._domain_dims) { auto& dname = dd._get_name(); - num_expr_ptr dexpr = op1->get_arg(dname); + auto dexpr = op1->get_arg(dname); if (!dexpr) THROW_YASK_EXCEPTION("Error: var equation " + eq1->make_quoted_str() + " does not use domain-dimension '" + dname + @@ -239,7 +430,8 @@ namespace yask { if (dn == step_dim) { } - // LHS must have simple indices in domain dims. + // LHS must have simple indices in domain dims, e.g., + // 'x', 'y'. else if (dims._domain_dims.lookup(dn)) { // Make expected arg, e.g., 'x'. @@ -546,68 +738,9 @@ namespace yask { topo_sort(); } - // Visitor for determining vectorization potential of var points. - // Vectorization depends not only on the dims of the var itself - // but also on how the var is indexed at each point. 
- class SetVecVisitor : public ExprVisitor { - const Dimensions& _dims; - - public: - SetVecVisitor(const Dimensions& dims) : - _dims(dims) { - _visit_equals_lhs = true; - _visit_var_point_args = true; - _visit_conds = true; - } - - // Check each var point in expr. - virtual string visit(VarPoint* gp) { - auto* var = gp->_get_var(); - - // Never vectorize scalars. - if (var->get_num_dims() == 0) { - gp->set_vec_type(VarPoint::VEC_NONE); - return ""; // Also, no args to visit. - } - - // Amount of vectorization allowed primarily depends on number - // of folded dimensions in the var accessed at this point. - int var_nfd = var->get_num_foldable_dims(); - int soln_nfd = _dims._fold_gt1.size(); - assert(var_nfd <= soln_nfd); - - // Vectorization is only possible if each access to a vectorized - // dim is a simple offset. For example, in var dim 'x', the - // index in the corresponding posn must be 'x', 'x+n', or 'x-n'. - int fdoffsets = 0; - for (auto fdim : _dims._fold_gt1) { - auto& fdname = fdim._get_name(); - if (gp->get_arg_offsets().lookup(fdname)) - fdoffsets++; - } - assert(fdoffsets <= var_nfd); - - // All folded dims are vectorizable? - // NB: this will always be the case when there is - // no folding in the soln. - if (fdoffsets == soln_nfd) - gp->set_vec_type(VarPoint::VEC_FULL); // all good. - - // Some dims are vectorizable? - else if (fdoffsets > 0) - gp->set_vec_type(VarPoint::VEC_PARTIAL); - - // Uses no folded dims, so scalar only. - else - gp->set_vec_type(VarPoint::VEC_NONE); - - // Also check args of this var point. - return ExprVisitor::visit(gp); - } - }; - // Determine which var points can be vectorized. - void Eqs::analyze_vec(const Dimensions& dims) { + void Eqs::analyze_vec(const CompilerSettings& settings, + const Dimensions& dims) { // Send a 'SetVecVisitor' to each point in // the current equations. @@ -615,82 +748,13 @@ namespace yask { visit_eqs(&svv); } - // Visitor to find referenced vars. 
- class FindVarsVisitor : public ExprVisitor { - - public: - set vars_used; - - // Check each index expr; - virtual string visit(IndexExpr* ie) { - vars_used.insert(ie->_get_name()); - return ""; - } - }; - - // Visitor for determining inner-loop accesses of var points. - class SetLoopVisitor : public ExprVisitor { - const Dimensions& _dims; - - public: - SetLoopVisitor(const Dimensions& dims) : - _dims(dims) { - _visit_equals_lhs = true; - } - - // Check each var point in expr. - virtual string visit(VarPoint* gp) { - - // Info from var. - auto* var = gp->_get_var(); - auto gdims = var->get_dim_names(); - - // Check loop in this dim. - auto idim = _dims._inner_dim; - - // Access type. - // Assume invariant, then check below. - VarPoint::LoopType lt = VarPoint::LOOP_INVARIANT; - - // Check every point arg. - auto& args = gp->get_args(); - for (size_t ai = 0; ai < args.size(); ai++) { - auto& arg = args.at(ai); - assert(ai < gdims.size()); - - // Get set of vars used. - FindVarsVisitor fvv; - arg->accept(&fvv); - - // Does this arg refer to idim? - if (fvv.vars_used.count(idim)) { - - // Is it in the idim posn and a simple offset? - int offset = 0; - if (gdims.at(ai) == idim && - arg->is_offset_from(idim, offset)) { - lt = VarPoint::LOOP_OFFSET; - } - - // Otherwise, this arg uses idim, but not - // in a simple way. - else { - lt = VarPoint::LOOP_OTHER; - break; // no need to continue. - } - } - } - gp->set_loop_type(lt); - return ""; - } - }; - // Determine loop access behavior of var points. - void Eqs::analyze_loop(const Dimensions& dims) { + void Eqs::analyze_loop(const CompilerSettings& settings, + const Dimensions& dims) { // Send a 'SetLoopVisitor' to each point in // the current equations. - SetLoopVisitor slv(dims); + SetLoopVisitor slv(dims, settings); visit_eqs(&slv); } @@ -873,27 +937,6 @@ namespace yask { cv.print_stats(os, msg); } - // Visitor that will shift each var point by an offset. 
- class OffsetVisitor: public ExprVisitor { - IntTuple _ofs; - - public: - OffsetVisitor(const IntTuple& ofs) : - _ofs(ofs) { - _visit_equals_lhs = true; - } - - // Visit a var point. - virtual string visit(VarPoint* gp) { - - // Shift var _ofs points. - auto ofs0 = gp->get_arg_offsets(); - IntTuple new_loc = ofs0.add_elements(_ofs, false); - gp->set_arg_offsets(new_loc); - return ""; - } - }; - // Replicate each equation at the non-zero offsets for // each vector in a cluster. void EqBundle::replicate_eqs_in_cluster(Dimensions& dims) @@ -904,7 +947,7 @@ namespace yask { // Loop thru points in cluster. dims._cluster_mults.visit_all_points([&](const IntTuple& cluster_index, - size_t idx) { + size_t idx) { // Don't need copy of one at origin. if (cluster_index.sum() > 0) { @@ -1071,19 +1114,24 @@ namespace yask { #endif // First, set halos based only on immediate accesses. - for (auto& bp : get_all()) { - auto pname = bp->_get_name(); - - for (auto& eq : bp->get_eqs()) { + // Loop thru stages. + for (auto& sp : get_all()) { + auto stage_name = sp->_get_name(); + + // Loop thru equations in this stage. + for (auto& eq : sp->get_eqs()) { // Get all var points touched by this eq. auto& all_pts1 = pv.get_all_pts().at(eq.get()); + auto& out_pt1 = pv.get_output_pts().at(eq.get()); // Update stats of each var accessed in 'eq'. for (auto ap : all_pts1) { auto* g = ap->_get_var(); // var for point 'ap'. 
- g->update_halo(pname, ap->get_arg_offsets()); + g->update_halo(stage_name, ap->get_arg_offsets()); } + auto* g = out_pt1->_get_var(); + g->update_write_points(stage_name, out_pt1->get_arg_offsets()); } } diff --git a/src/compiler/lib/Eqs.hpp b/src/compiler/lib/Eqs.hpp index 00d02671..96c04495 100644 --- a/src/compiler/lib/Eqs.hpp +++ b/src/compiler/lib/Eqs.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -28,6 +28,7 @@ IN THE SOFTWARE. #pragma once #include "Expr.hpp" +#include "VarPoint.hpp" #include "Settings.hpp" using namespace std; @@ -365,15 +366,17 @@ namespace yask { } // Find dependencies based on all eqs. - virtual void analyze_eqs(CompilerSettings& settings, - Dimensions& dims, - std::ostream& os); + virtual void analyze_eqs(const CompilerSettings& settings, + Dimensions& dims, + std::ostream& os); // Determine which var points can be vectorized. - virtual void analyze_vec(const Dimensions& dims); + virtual void analyze_vec(const CompilerSettings& settings, + const Dimensions& dims); // Determine how var points are accessed in a loop. - virtual void analyze_loop(const Dimensions& dims); + virtual void analyze_loop(const CompilerSettings& settings, + const Dimensions& dims); // Update var access stats. virtual void update_var_stats(); @@ -658,7 +661,7 @@ namespace yask { // Container for multiple equation stages. class EqStages : public DepGroup { protected: - string _base_name = "stencil_stage"; + string _base_name = "stage"; // Bundle index. 
int _idx = 0; diff --git a/src/compiler/lib/Expr.cpp b/src/compiler/lib/Expr.cpp index 0aa3ac50..b6fd920c 100644 --- a/src/compiler/lib/Expr.cpp +++ b/src/compiler/lib/Expr.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -37,6 +37,9 @@ namespace yask { yc_var* VarPoint::get_var() { return _var; } + const yc_var* VarPoint::get_var() const { + return _var; + } //node_factory API methods. yc_index_node_ptr @@ -518,20 +521,6 @@ namespace yask { return ev->visit(this); } - // EqualsExpr methods. - bool EqualsExpr::is_scratch() { - Var* gp = _get_var(); - return gp && gp->is_scratch(); - } - bool EqualsExpr::is_same(const Expr* other) const { - auto p = dynamic_cast(other); - return p && - _lhs->is_same(p->_lhs.get()) && - _rhs->is_same(p->_rhs.get()) && - are_exprs_same(_cond, p->_cond) && // might be null. - are_exprs_same(_step_cond, p->_step_cond); // might be null. - } - // Commutative methods. bool CommutativeExpr::is_same(const Expr* other) const { auto p = dynamic_cast(other); @@ -605,243 +594,6 @@ namespace yask { return false; } - // VarPoint methods. - VarPoint::VarPoint(Var* var, const num_expr_ptr_vec& args) : - _var(var), _args(args) { - - // Check for correct number of args. - size_t nd = var->get_dims().size(); - if (nd != args.size()) { - FORMAT_AND_THROW_YASK_EXCEPTION("Error: attempt to create a var point in " << - nd << "-D var '" << get_var_name() << "' with " << - args.size() << " indices"); - } - - // Eval each arg. 
-#ifdef DEBUG_GP - cout << "Creating var point " << make_quoted_str() << "...\n"; -#endif - auto dims = var->get_dims(); - for (size_t i = 0; i < nd; i++) { - auto dim = dims.at(i); - auto dname = dim->_get_name(); - auto arg = args.at(i); - assert(arg); -#ifdef DEBUG_GP - cout << " Arg " << arg->make_quoted_str() << - " at dim '" << dname << "'\n"; -#endif - int offset = 0; - - // A compile-time const? - if (arg->is_const_val()) { -#ifdef DEBUG_GP - cout << " is const val " << arg->get_int_val() << endl; -#endif - IntScalar c(dname, arg->get_int_val()); - set_arg_const(c); - } - - // A simple offset? - else if (arg->is_offset_from(dname, offset)) { -#ifdef DEBUG_GP - cout << " has offset " << offset << endl; -#endif - IntScalar o(dname, offset); - set_arg_offset(o); - } - } - _update_str(); - } - const num_expr_ptr VarPoint::get_arg(const string& dim) const { - for (int di = 0; di < _var->get_num_dims(); di++) { - auto& dn = _var->get_dim_name(di); // name of this dim. - if (dim == dn) - return _args.at(di); - } - return nullptr; - } - const string& VarPoint::get_var_name() const { - return _var->_get_name(); - } - bool VarPoint::is_var_foldable() const { - return _var->is_foldable(); - } - string VarPoint::make_arg_str(const VarMap* var_map) const { - string str; - int i = 0; - for (auto arg : _args) { - if (i++) str += ", "; - str += arg->make_str(var_map); - } - return str; - } - string VarPoint::_make_str(const VarMap* var_map) const { - string str = _var->_get_name() + "(" + - make_arg_str(var_map) + ")"; - return str; - } - string VarPoint::make_logical_var_str(const VarMap* var_map) const { - string str = _var->_get_name(); - if (_consts.size()) - str += "(" + _consts.make_dim_val_str() + ")"; - return str; - } - const index_expr_ptr_vec& VarPoint::get_dims() const { - return _var->get_dims(); - } - - // Make string like "x+(4/VLEN_X)" from - // original arg "x+4" in 'dname' dim. - // This object has numerators; 'fold' object has denominators. 
- // Args w/o simple offset are not modified. - string VarPoint::make_norm_arg_str(const string& dname, - const Dimensions& dims, - const VarMap* var_map) const { - string res; - - // Const offset? - auto* ofs = _offsets.lookup(dname); - - // Zero offset? - if (ofs && *ofs == 0) - res = dname; - - // dname exists in fold? - else if (ofs && dims._fold.lookup(dname)) - res = "(" + dname + dims.make_norm_str(*ofs, dname) + ")"; - - // Otherwise, just find and format arg as-is. - else { - auto& gdims = _var->get_dims(); - for (size_t i = 0; i < gdims.size(); i++) { - auto gdname = gdims[i]->_get_name(); - if (gdname == dname) - res += _args.at(i)->make_str(var_map); - } - } - - return res; - } - - // Make string like "x+(4/VLEN_X), y, z-(2/VLEN_Z)" from - // original args "x+4, y, z-2". - // This object has numerators; norm object has denominators. - // Args w/o simple offset are not modified. - string VarPoint::make_norm_arg_str(const Dimensions& dims, - const VarMap* var_map) const { - - string res; - auto& gd = _var->get_dims(); - for (size_t i = 0; i < gd.size(); i++) { - if (i) - res += ", "; - auto dname = gd[i]->_get_name(); - res += make_norm_arg_str(dname, dims, var_map); - } - return res; - } - - // Make string like "g->_wrap_step(t+1)" from original arg "t+1" - // if var uses step dim, "0" otherwise. - // If var doesn't allow dynamic alloc, set to fixed value. - string VarPoint::make_step_arg_str(const string& var_ptr, const Dimensions& dims) const { - - auto& gd = _var->get_dims(); - for (size_t i = 0; i < gd.size(); i++) { - auto dname = gd[i]->_get_name(); - auto& arg = _args.at(i); - if (dname == dims._step_dim) { - if (_var->is_dynamic_step_alloc()) - return var_ptr + "->_wrap_step(" + arg->make_str() + ")"; - else { - auto step_alloc = _var->get_step_alloc_size(); - if (step_alloc == 1) - return "0"; // 1 alloc => always index 0. 
- else - return "imod_flr(" + arg->make_str() + ", " + - to_string(step_alloc) + ")"; - } - } - } - return "0"; - } - - // Set given arg to given offset; ignore if not in step or domain var dims. - void VarPoint::set_arg_offset(const IntScalar& offset) { - - // Find dim in var. - auto gdims = _var->get_dims(); - for (size_t i = 0; i < gdims.size(); i++) { - auto gdim = gdims[i]; - - // Must be domain or step dim. - if (gdim->get_type() == MISC_INDEX) - continue; - - auto dname = gdim->_get_name(); - if (offset._get_name() == dname) { - - // Make offset equation. - int ofs = offset.get_val(); - auto ie = gdim->clone(); - num_expr_ptr nep; - if (ofs > 0) { - auto op = make_shared(ofs); - nep = make_shared(ie, op); - } - else if (ofs < 0) { - auto op = make_shared(-ofs); - nep = make_shared(ie, op); - } - else // 0 offset. - nep = ie; - - // Replace in args. - _args[i] = nep; - - // Set offset. - _offsets.add_dim_back(dname, ofs); - - // Remove const if it exists. - _consts = _consts.remove_dim(dname); - - break; - } - } - _update_str(); - } - - // Set given arg to given const; - void VarPoint::set_arg_const(const IntScalar& val) { - - // Find dim in var. - auto gdims = _var->get_dims(); - for (size_t i = 0; i < gdims.size(); i++) { - auto gdim = gdims[i]; - - auto dname = gdim->_get_name(); - if (val._get_name() == dname) { - - // Make const expr. - int v = val.get_val(); - auto vp = make_shared(v); - - // Replace in args. - _args[i] = vp; - - // Set const - _consts.add_dim_back(dname, v); - - // Remove offset if it exists. - _offsets = _offsets.remove_dim(dname); - - break; - } - } - _update_str(); - } - // Is this expr a simple offset? bool IndexExpr::is_offset_from(string dim, int& offset) { @@ -917,7 +669,7 @@ namespace yask { ostringstream oss; CompilerSettings _dummy_settings; Dimensions _dummy_dims; - PrintHelper ph(_dummy_settings, _dummy_dims, NULL, "temp", "", "", ""); // default helper. 
+ PrintHelper ph(_dummy_settings, _dummy_dims, NULL, "", "", ""); // default helper. CompilerSettings settings; // default settings. PrintVisitorTopDown pv(oss, ph, var_map); string res = accept(&pv); diff --git a/src/compiler/lib/Expr.hpp b/src/compiler/lib/Expr.hpp index b83a56e1..20fb5133 100644 --- a/src/compiler/lib/Expr.hpp +++ b/src/compiler/lib/Expr.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -31,11 +31,12 @@ IN THE SOFTWARE. #include "yask_compiler_api.hpp" +#include +#include #include #include #include #include -#include #include #include @@ -82,7 +83,9 @@ namespace yask { class StencilSolution; struct Dimensions; - typedef map VarMap; // map used when substituting vars. + // Type to convert original index vars (e.g., 'x', 'y') to + // different names (e.g., 'x_elem_local', 'y_elem_local'). + typedef map VarMap; //// Classes to implement parts of expressions. @@ -333,7 +336,7 @@ namespace yask { virtual double get_value() const { return _f; } }; - // Any expression that returns a real (not from a var). + // Any expression that returns a real (not from a YASK var). // This is an expression leaf-node. class CodeExpr : public NumExpr { protected: @@ -771,274 +774,4 @@ namespace yask { } }; - // One specific point in a var. - // This is an expression leaf-node. - class VarPoint : public NumExpr, - public virtual yc_var_point_node { - - public: - - // What kind of vectorization can be done on this point. - // Set via Eqs::analyze_vec(). - enum VecType { VEC_UNSET, - VEC_FULL, // vectorizable in all folded dims. - VEC_PARTIAL, // vectorizable in some folded dims. - VEC_NONE // vectorizable in no folded dims. 
- }; - - // Analysis of this point for accesses via loops through the inner dim. - // Set via Eqs::analyze_loop(). - enum LoopType { LOOP_UNSET, - LOOP_INVARIANT, // not dependent on inner dim. - LOOP_OFFSET, // only dep on inner dim +/- const in inner-dim posn. - LOOP_OTHER // dep on inner dim in another way. - }; - - protected: - Var* _var = 0; // the var this point is from. - - // Index exprs for each dim, e.g., - // "3, x-5, y*2, z+4" for dims "n, x, y, z". - num_expr_ptr_vec _args; - - // Vars below are calculated from above. - - // Simple offset for each expr that is dim +/- offset, e.g., - // "x=-5, z=4" from above example. - // Includes zero offsets. - // Set in ctor and modified via set_arg_offset/Const(). - IntTuple _offsets; - - // Simple value for each expr that is a const, e.g., - // "n=3" from above example. - // Set in ctor and modified via set_arg_offset/Const(). - IntTuple _consts; - - VecType _vec_type = VEC_UNSET; // allowed vectorization. - LoopType _loop_type = LOOP_UNSET; // analysis for looping. - - // Cache the string repr. - string _def_str; - void _update_str() { - _def_str = _make_str(); - } - string _make_str(const VarMap* var_map = 0) const; - - public: - - // Construct a point given a var and an arg for each dim. - VarPoint(Var* var, const num_expr_ptr_vec& args); - - // Dtor. - virtual ~VarPoint() {} - - // Get parent var info. - const Var* _get_var() const { return _var; } - Var* _get_var() { return _var; } - virtual const string& get_var_name() const; - virtual bool is_var_foldable() const; - virtual const index_expr_ptr_vec& get_dims() const; - - // Accessors. 
- virtual const num_expr_ptr_vec& get_args() const { return _args; } - virtual const IntTuple& get_arg_offsets() const { return _offsets; } - virtual const IntTuple& get_arg_consts() const { return _consts; } - virtual VecType get_vec_type() const { - assert(_vec_type != VEC_UNSET); - return _vec_type; - } - virtual void set_vec_type(VecType vt) { - _vec_type = vt; - } - virtual LoopType get_loop_type() const { - assert(_loop_type != LOOP_UNSET); - return _loop_type; - } - virtual void set_loop_type(LoopType vt) { - _loop_type = vt; - } - - // Get arg for 'dim' or return null if none. - virtual const num_expr_ptr get_arg(const string& dim) const; - - // Set given arg to given offset; ignore if not in step or domain var dims. - virtual void set_arg_offset(const IntScalar& offset); - - // Set given args to be given offsets. - virtual void set_arg_offsets(const IntTuple& offsets) { - for (auto ofs : offsets) - set_arg_offset(ofs); - } - - // Set given arg to given const. - virtual void set_arg_const(const IntScalar& val); - - // Some comparisons. - bool operator==(const VarPoint& rhs) const { - return _def_str == rhs._def_str; - } - bool operator<(const VarPoint& rhs) const { - return _def_str < rhs._def_str; - } - - // Take ev to each value. - virtual string accept(ExprVisitor* ev); - - // Check for equivalency. - virtual bool is_same(const Expr* other) const { - auto p = dynamic_cast(other); - return p && *this == *p; - } - - // Check for same logical var. - // A logical var is defined by the var itself - // and any const indices. - virtual bool is_same_logical_var(const VarPoint& rhs) const { - return _var == rhs._var && _consts == rhs._consts; - } - - // String w/name and parens around args, e.g., 'u(x, y+2)'. - // Apply substitutions to indices using 'var_map' if provided. 
- virtual string make_str(const VarMap* var_map = 0) const { - if (var_map) - return _make_str(var_map); - return _def_str; - } - - // String w/name and parens around const args, e.g., 'u(n=4)'. - // Apply substitutions to indices using 'var_map' if provided. - virtual string make_logical_var_str(const VarMap* var_map = 0) const; - - // String w/just comma-sep args, e.g., 'x, y+2'. - // Apply substitutions to indices using 'var_map' if provided. - virtual string make_arg_str(const VarMap* var_map = 0) const; - - // String v/vec-normalized args, e.g., 'x, y+(2/VLEN_Y)'. - // This object has numerators; 'fold' object has denominators. - // Apply substitutions to indices using 'var_map' if provided. - virtual string make_norm_arg_str(const Dimensions& dims, - const VarMap* var_map = 0) const; - - // Make string like "x+(4/VLEN_X)" from original arg "x+4" in 'dname' dim. - // This object has numerators; 'fold' object has denominators. - // Apply substitutions to indices using 'var_map' if provided. - virtual string make_norm_arg_str(const string& dname, - const Dimensions& dims, - const VarMap* var_map = 0) const; - - // Make string like "g->_wrap_step(t+1)" from original arg "t+1" - // if var uses step dim, "0" otherwise. - virtual string make_step_arg_str(const string& var_ptr, const Dimensions& dims) const; - - // Create a deep copy of this expression, - // except pointed-to var is not copied. - virtual num_expr_ptr clone() const { return make_shared(*this); } - virtual var_point_ptr clone_var_point() const { return make_shared(*this); } - - // APIs. - virtual yc_var* get_var(); - }; - - // Equality operator for a var point. - // This defines the LHS as equal to the RHS; it is NOT - // a comparison operator; it is NOT an assignment operator. - // It also holds an optional condition. 
- class EqualsExpr : public Expr, - public virtual yc_equation_node { - protected: - var_point_ptr _lhs; - num_expr_ptr _rhs; - bool_expr_ptr _cond; - bool_expr_ptr _step_cond; - - public: - EqualsExpr(var_point_ptr lhs, num_expr_ptr rhs, - bool_expr_ptr cond = nullptr, - bool_expr_ptr step_cond = nullptr) : - _lhs(lhs), _rhs(rhs), _cond(cond), _step_cond(step_cond) { } - EqualsExpr(const EqualsExpr& src) : - _lhs(src._lhs->clone_var_point()), - _rhs(src._rhs->clone()) { - if (src._cond) - _cond = src._cond->clone(); - else - _cond = nullptr; - if (src._step_cond) - _step_cond = src._step_cond->clone(); - else - _step_cond = nullptr; - } - - var_point_ptr& _get_lhs() { return _lhs; } - const var_point_ptr& _get_lhs() const { return _lhs; } - num_expr_ptr& _get_rhs() { return _rhs; } - const num_expr_ptr& _get_rhs() const { return _rhs; } - bool_expr_ptr& _get_cond() { return _cond; } - const bool_expr_ptr& _get_cond() const { return _cond; } - void _set_cond(bool_expr_ptr cond) { _cond = cond; } - bool_expr_ptr& _get_step_cond() { return _step_cond; } - const bool_expr_ptr& _get_step_cond() const { return _step_cond; } - void _set_step_cond(bool_expr_ptr step_cond) { _step_cond = step_cond; } - virtual string accept(ExprVisitor* ev); - static string expr_op_str() { return "EQUALS"; } - static string cond_op_str() { return "IF_DOMAIN"; } - static string step_cond_op_str() { return "IF_STEP"; } - - // Get pointer to var on LHS or NULL if not set. - virtual Var* _get_var() { - if (_lhs.get()) - return _lhs->_get_var(); - return NULL; - } - - // LHS is scratch var. - virtual bool is_scratch(); - - // Check for equivalency. - virtual bool is_same(const Expr* other) const; - - // Create a deep copy of this expression. - virtual equals_expr_ptr clone() const { return make_shared(*this); } - virtual yc_equation_node_ptr clone_ast() const { - return clone(); - } - - // APIs. 
- virtual yc_var_point_node_ptr get_lhs() { return _lhs; } - virtual yc_number_node_ptr get_rhs() { return _rhs; } - virtual yc_bool_node_ptr get_cond() { return _cond; } - virtual yc_bool_node_ptr get_step_cond() { return _step_cond; } - virtual void set_cond(yc_bool_node_ptr cond) { - if (cond) { - auto p = dynamic_pointer_cast(cond); - assert(p); - _cond = p; - } else - _cond = nullptr; - } - virtual void set_step_cond(yc_bool_node_ptr step_cond) { - if (step_cond) { - auto p = dynamic_pointer_cast(step_cond); - assert(p); - _step_cond = p; - } else - _step_cond = nullptr; - } - }; - - typedef set VarPointSet; - typedef set var_point_ptr_set; - typedef vector VarPointVec; - -} // namespace yask. - -// Define hash function for VarPoint for unordered_{set,map}. -namespace std { - using namespace yask; - - template <> struct hash { - size_t operator()(const VarPoint& k) const { - return hash{}(k.make_str()); - } - }; -} + } // namespace yask. diff --git a/src/compiler/lib/ExprUtils.cpp b/src/compiler/lib/ExprUtils.cpp index 27deb23b..2e2b021e 100644 --- a/src/compiler/lib/ExprUtils.cpp +++ b/src/compiler/lib/ExprUtils.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/compiler/lib/ExprUtils.hpp b/src/compiler/lib/ExprUtils.hpp index c8d88212..d1ed5eac 100644 --- a/src/compiler/lib/ExprUtils.hpp +++ b/src/compiler/lib/ExprUtils.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
associated documentation files (the "Software"), to diff --git a/src/compiler/lib/Parse.hpp b/src/compiler/lib/Parse.hpp index 62445ec7..d9348dfb 100644 --- a/src/compiler/lib/Parse.hpp +++ b/src/compiler/lib/Parse.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/compiler/lib/Print.cpp b/src/compiler/lib/Print.cpp index 1a67fa37..d8bead8d 100644 --- a/src/compiler/lib/Print.cpp +++ b/src/compiler/lib/Print.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -33,12 +33,14 @@ namespace yask { ////////////// Print visitors /////////////// // Declare a new temp var and set 'res' to it. + // Prepend 'prefix' to name of var. // Print LHS of assignment to it. // 'ex' is used as key to save name of temp var and to write a comment. // If 'comment' is set, use it for the comment. // Return stream to continue w/RHS. 
- ostream& PrintVisitorBase::make_next_temp_var(string& res, Expr* ex, string comment) { - res = _ph.make_var_name(); + ostream& PrintVisitorBase::make_next_temp_var(string& res, Expr* ex, + string prefix, string comment) { + res = _ph.make_var_name(prefix); if (ex) { _temp_vars[ex] = res; if (comment.length() == 0) @@ -225,7 +227,7 @@ namespace yask { if (too_small) res = td_res; else - make_next_temp_var(res, ex) << td_res << _ph.get_line_suffix(); + make_next_temp_var(res, ex, "expr", "") << td_res << _ph.get_line_suffix(); } // otherwise, there are common subexprs, and top-down is not forced, @@ -274,7 +276,7 @@ namespace yask { // temp2 = -temp1; // with 'temp2' returned. string rhs = ue->_get_rhs()->accept(this); - make_next_temp_var(res, ue) << ue->get_op_str() << rhs << _ph.get_line_suffix(); + make_next_temp_var(res, ue, "expr", "") << ue->get_op_str() << rhs << _ph.get_line_suffix(); return res; } @@ -290,13 +292,14 @@ namespace yask { // Expand both sides, then apply operator to result. // Example: '(a * b) / (c * d)' might output the following: - // temp1 = a * b; - // temp2 = b * c; - // temp3 = temp1 / temp2; - // with 'temp3' returned. + // expr1 = a * b; + // expr2 = b * c; + // expr3 = expr1 / expr2; + // with 'expr3' returned. string lhs = be->_get_lhs()->accept(this); string rhs = be->_get_rhs()->accept(this); - make_next_temp_var(res, be) << lhs << ' ' << be->get_op_str() << ' ' << rhs << _ph.get_line_suffix(); + make_next_temp_var(res, be, "expr", "") << + lhs << ' ' << be->get_op_str() << ' ' << rhs << _ph.get_line_suffix(); return res; } @@ -343,8 +346,8 @@ namespace yask { // Make 2 temp vars. string res2; - make_next_temp_var(res, fe) << "0" << _ph.get_line_suffix(); - make_next_temp_var(res2, paired) << "0" << _ph.get_line_suffix(); + make_next_temp_var(res, fe, "arg0", "") << "0" << _ph.get_line_suffix(); + make_next_temp_var(res2, paired, "arg1", "") << "0" << _ph.get_line_suffix(); // Call function to set both. 
_os << _ph.get_line_prefix() << @@ -367,7 +370,7 @@ namespace yask { args += ", "; args += ep->accept(this); } - make_next_temp_var(res, fe) << _func_prefix << fe->get_op_str() << + make_next_temp_var(res, fe, "res", "") << _func_prefix << fe->get_op_str() << "(" << args << ")" << _ph.get_line_suffix(); } return res; @@ -418,7 +421,7 @@ namespace yask { // Output this step. string tmp; - make_next_temp_var(tmp, ex, ex_str) << lhs << ' ' << ce->get_op_str() << ' ' << + make_next_temp_var(tmp, ex, "expr", ex_str) << lhs << ' ' << ce->get_op_str() << ' ' << op_str << _ph.get_line_suffix(); lhs = tmp; // result returned and/or used in next iteration. } @@ -438,7 +441,7 @@ namespace yask { // Assign RHS to a temp var. string tmp; - make_next_temp_var(tmp, rp) << rhs << _ph.get_line_suffix(); + make_next_temp_var(tmp, rp, "expr", "") << rhs << _ph.get_line_suffix(); // Comment about update. var_point_ptr gpp = ee->_get_lhs(); @@ -652,7 +655,7 @@ namespace yask { CounterVisitor cv; eq->visit_eqs(&cv); - PrintHelper ph(_settings, _dims, &cv, "temp", "real", " ", ".\n"); + PrintHelper ph(_settings, _dims, &cv, "real", " ", ".\n"); if (eq->cond.get()) { string cond_str = eq->cond->make_str(); diff --git a/src/compiler/lib/Print.hpp b/src/compiler/lib/Print.hpp index 5cfc4e82..536a16b8 100644 --- a/src/compiler/lib/Print.hpp +++ b/src/compiler/lib/Print.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -42,8 +42,7 @@ namespace yask { const CompilerSettings& _settings; // compiler settings. const Dimensions& _dims; // problem dims. const CounterVisitor* _cv; // counter info. - string _var_prefix; // first part of var name. 
- string _var_type; // type, if any, of var. + string _var_type; // type of vars to be created. string _line_prefix; // prefix for each line. string _line_suffix; // suffix for each line. VarMap _local_vars; // map from expression strings to local var names. @@ -52,13 +51,11 @@ namespace yask { PrintHelper(const CompilerSettings& settings, const Dimensions& dims, const CounterVisitor* cv, - const string& var_prefix, const string& var_type, const string& line_prefix, const string& line_suffix) : _var_num(1), _settings(settings), _dims(dims), _cv(cv), - _var_prefix(var_prefix), _var_type(var_type), - _line_prefix(line_prefix), _line_suffix(line_suffix) { } + _var_type(var_type), _line_prefix(line_prefix), _line_suffix(line_suffix) { } virtual ~PrintHelper() { } @@ -95,14 +92,19 @@ namespace yask { } // Make and return next var name. - virtual string make_var_name() { - return _var_prefix + to_string(_var_num++); + virtual string make_var_name(string prefix) { + return prefix + "_temp" + to_string(_var_num++); + } + + // Determine if local var exists for 'expr'. + virtual bool is_local_var(const string& expr) const { + return _local_vars.count(expr) != 0; } // If var exists for 'expr', return it. // If not, create var of 'type' in 'os' and return it. virtual string get_local_var(ostream& os, const string& expr, - string type = "") { + string type, string prefix) { if (_local_vars.count(expr)) return _local_vars.at(expr); @@ -110,7 +112,7 @@ namespace yask { // Make a var. if (!type.length()) type = _var_type; - string v_name = make_var_name(); + string v_name = make_var_name(prefix); os << _line_prefix << type << " " << v_name << " = " << expr << _line_suffix; _local_vars[expr] = v_name; @@ -176,11 +178,13 @@ namespace yask { map _temp_vars; // Declare a new temp var and set 'res' to it. + // Prepend 'prefix' to name of var. // Print LHS of assignment to it. // 'ex' is used as key to save name of temp var and to write a comment. 
// If 'comment' is set, use it for the comment. // Return stream to continue w/RHS. - virtual ostream& make_next_temp_var(string& res, Expr* ex, string comment = ""); + virtual ostream& make_next_temp_var(string& res, Expr* ex, + string prefix, string comment); public: // os is used for printing intermediate results as needed. diff --git a/src/compiler/lib/Settings.cpp b/src/compiler/lib/Settings.cpp index db4358ce..49008514 100644 --- a/src/compiler/lib/Settings.cpp +++ b/src/compiler/lib/Settings.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -112,18 +112,44 @@ namespace yask { if (_step_dim.length() == 0) { THROW_YASK_EXCEPTION("Error: no step dimension defined"); } - if (!_domain_dims._get_num_dims()) { + if (!_domain_dims.get_num_dims()) { THROW_YASK_EXCEPTION("Error: no domain dimension(s) defined"); } // Set specific positional dims. - _outer_dim = _domain_dims.get_dim_name(0); - _inner_dim = _domain_dims.get_dim_name(_domain_dims._get_num_dims() - 1); - string _near_inner_dim = _domain_dims._get_num_dims() >= 2 ? - _domain_dims.get_dim_name(_domain_dims._get_num_dims() - 2) : _outer_dim; - - os << "Step dimension: " << _step_dim << endl; - os << "Domain dimension(s): " << _domain_dims.make_dim_str() << endl; + auto ndd = _domain_dims.get_num_dims(); + _outer_layout_dim = _domain_dims.get_dim_name(0); + _inner_layout_dim = _domain_dims.get_dim_name(ndd - 1); + string _near_inner_dim = _domain_dims.get_num_dims() >= 2 ? 
+ _domain_dims.get_dim_name(_domain_dims.get_num_dims() - 2) : _outer_layout_dim; + if (settings._inner_loop_dim.length()) { + if (isdigit(settings._inner_loop_dim[0])) { + int dn = atoi(settings._inner_loop_dim.c_str()); + if (dn < 1) { + os << "Note: adjusting inner-loop-dim " << dn << " to 1.\n"; + dn = 1; + } + if (dn > ndd) { + os << "Note: adjusting inner-loop-dim " << dn << " to " << ndd << ".\n"; + dn = ndd; + } + settings._inner_loop_dim = _domain_dims.get_dim_name(dn - 1); + _inner_loop_dim_num = dn; + } + int dp = _domain_dims.lookup_posn(settings._inner_loop_dim); + if (dp < 0) { + os << "Warning: inner-loop-dim '" << settings._inner_loop_dim << + "' ignored because it's not a domain dim.\n"; + settings._inner_loop_dim.clear(); + } else + _inner_loop_dim_num = dp + 1; + } + if (!settings._inner_loop_dim.length()) { + settings._inner_loop_dim = _inner_layout_dim; + _inner_loop_dim_num = ndd; + } + assert(_inner_loop_dim_num > 0); + assert(_inner_loop_dim_num <= ndd); // Extract domain fold lengths based on cmd-line options. IntTuple fold_opts; @@ -142,8 +168,9 @@ namespace yask { _fold.set_val(dname, sz); fold_opts.add_dim_back(dname, sz); } - os << " Number of SIMD elements: " << vlen << endl; - if (fold_opts._get_num_dims()) + os << "Folding and clustering:\n" + " Number of SIMD elements: " << vlen << endl; + if (fold_opts.get_num_dims()) os << " Requested vector-fold dimension(s) and point-size(s): " << _fold.make_dim_val_str(" * ") << endl; else @@ -152,20 +179,20 @@ namespace yask { // If needed, adjust folding to exactly cover vlen unless vlen is 1. // If vlen is 1, we will allow any folding. if (vlen > 1 && _fold.product() != vlen) { - if (fold_opts._get_num_dims()) - os << "Notice: adjusting requested fold to achieve SIMD length of " << + if (fold_opts.get_num_dims()) + os << "Note: adjusting requested fold to achieve SIMD length of " << vlen << ".\n"; // If 1D, there is only one option. 
- if (_domain_dims._get_num_dims() == 1) - _fold[_inner_dim] = vlen; + if (_domain_dims.get_num_dims() == 1) + _fold[_inner_layout_dim] = vlen; // If 2D+, adjust folding. else { // Determine inner-dim size separately because // vector-folding works best when folding is - // applied in non-inner dims. + // applied in non-inner-loop dims. int inner_sz = 1; // If specified dims are within vlen, try to use @@ -173,7 +200,7 @@ namespace yask { if (fold_opts.product() < vlen) { // Inner-dim fold-size requested and a factor of vlen? - auto* p = fold_opts.lookup(_inner_dim); + auto* p = fold_opts.lookup(settings._inner_loop_dim); if (p && (vlen % *p == 0)) inner_sz = *p; } @@ -195,13 +222,13 @@ namespace yask { IntTuple inner_opts; for (auto& dim : _domain_dims) { auto& dname = dim._get_name(); - if (dname == _inner_dim) + if (dname == settings._inner_loop_dim) continue; auto* p = fold_opts.lookup(dname); int sz = p ? *p : 0; // 0 => not specified. inner_opts.add_dim_front(dname, sz); // favor more inner ones. } - assert(inner_opts._get_num_dims() == _domain_dims._get_num_dims() - 1); + assert(inner_opts.get_num_dims() == _domain_dims.get_num_dims() - 1); // Get final size of non-inner dims. inner_folds = inner_opts.get_compact_factors(upper_sz); @@ -210,14 +237,14 @@ namespace yask { // Put them into the fold. for (auto& dim : _domain_dims) { auto& dname = dim._get_name(); - if (dname == _inner_dim) + if (dname == settings._inner_loop_dim) _fold[dname] = inner_sz; else if (inner_folds.lookup(dname)) _fold[dname] = inner_folds[dname]; else _fold[dname] = 1; } - assert(_fold._get_num_dims() == _domain_dims._get_num_dims()); + assert(_fold.get_num_dims() == _domain_dims.get_num_dims()); } // Check it. @@ -240,6 +267,46 @@ namespace yask { _fold.set_first_inner(settings._first_inner); _fold_gt1.set_first_inner(settings._first_inner); + + // Order all dims for layout. + // Start w/all domain dims. + _layout_dims = _domain_dims; + + // Insert step dim. 
+ _layout_dims.add_dim_front(_step_dim, 0); + + // Insert misc dims depending on setting. + for (int i = 0; i < _misc_dims.get_num_dims(); i++) { + auto& mdim = _misc_dims.get_dim(i); + if (settings._inner_misc) + _layout_dims.add_dim_back(mdim); + else + _layout_dims.add_dim_at(i, mdim); + } + + // Move outer layout domain dim if requested. + if (settings._outer_domain) { + _layout_dims = _layout_dims.remove_dim(_outer_layout_dim); + _layout_dims.add_dim_front(_outer_layout_dim, 0); + } + + // Move inner layout domain dim if no explicit SIMD. + // This will help enable implicit SIMD when possible. + if (_fold.product() <= 1) { + _layout_dims = _layout_dims.remove_dim(_inner_layout_dim); + _layout_dims.add_dim_back(_inner_layout_dim, 0); + } + + os << "Step dimension: " << _step_dim << endl; + os << "Domain dimension(s): " << _domain_dims.make_dim_str() << endl; + if (_misc_dims.get_num_dims()) + os << "Misc dimension(s): " << _misc_dims.make_dim_str() << endl; + else + os << "No misc dimensions used\n"; + os << "Dimension(s) in layout order: " << _layout_dims.make_dim_str() << endl; + os << "Inner-loop dimension: " << settings._inner_loop_dim << endl; + + // Checks for unaligned loads. if (settings._allow_unaligned_loads) { if (_fold_gt1.size() > 1) { @@ -278,10 +345,6 @@ namespace yask { _cluster_mults.make_dim_val_str(" * ") << endl; os << " Cluster dimension(s) and point-size(s): " << _cluster_pts.make_dim_val_str(" * ") << endl; - if (_misc_dims._get_num_dims()) - os << "Misc dimension(s): " << _misc_dims.make_dim_str() << endl; - else - os << "No misc dimensions used\n"; } // Make string like "+(4/VLEN_X)" or "-(2/VLEN_Y)" or "" if ofs==zero. 
diff --git a/src/compiler/lib/Settings.hpp b/src/compiler/lib/Settings.hpp index 442ff392..f1e05361 100644 --- a/src/compiler/lib/Settings.hpp +++ b/src/compiler/lib/Settings.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -41,16 +41,20 @@ namespace yask { int _elem_bytes = 4; // bytes in an FP element. string _step_dim; // explicit step dim. vector _domain_dims; // explicit domain dims. + string _inner_loop_dim; // explicit inner-loop dim. + int _min_buffer_len = 1; // min length of an inner-loop buffer. + int _read_ahead_dist = 0; // iterations to read ahead. IntTuple _fold_options; // vector fold. IntTuple _cluster_options; // cluster multipliers. map _prefetch_dists; bool _first_inner = true; // first dimension of fold is unit step. - string _eq_bundle_basename_default = "stencil_bundle"; + string _eq_bundle_basename_default = "bundle"; bool _allow_unaligned_loads = false; bool _bundle_scratch = true; int _halo_size = 0; // 0 => calculate each halo automatically. int _step_alloc = 0; // 0 => calculate each step allocation automatically. - bool _inner_misc = false; + bool _inner_misc = true; + bool _outer_domain = false; int _max_expr_size = 50; int _min_expr_size = 2; bool _do_cse = true; // do common-subexpr elim. @@ -62,6 +66,10 @@ namespace yask { string _var_regex; // vars to update. bool _find_deps = true; bool _print_eqs = false; + bool _use_ptrs = true; // enable access via pointers & strides. + bool _use_many_ptrs = false; // make pointer for almost every point. + bool _use_offsets = false; // compute offsets from var alloc start. + bool _early_loads = true; // issue loads early in the inner loop. }; // Stencil dimensions. 
@@ -69,9 +77,11 @@ namespace yask { string _step_dim; // step dimension, usually time. IntTuple _domain_dims; // domain dims, usually spatial (with zero value). IntTuple _stencil_dims; // both step and domain dims. - string _inner_dim; // domain dim that will be used in the inner loop. - string _outer_dim; // domain dim that will be used in the outer loop. IntTuple _misc_dims; // misc dims that are not the step or domain. + int _inner_loop_dim_num = 0; // stencil-dim index of inner-loop-dim. + string _inner_layout_dim; // inner-most domain dim in mem array layout. + string _outer_layout_dim; // outer-most domain dim in mem array layout. + IntTuple _layout_dims; // all dims in array-layout order. // Following contain only domain dims. IntTuple _scalar; // points in scalar (value 1 in each). diff --git a/src/compiler/lib/Solution.cpp b/src/compiler/lib/Solution.cpp index 6dc77198..7039428e 100644 --- a/src/compiler/lib/Solution.cpp +++ b/src/compiler/lib/Solution.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -81,29 +81,6 @@ namespace yask { " is not 1 or 2."); if (_settings._prefetch_dists.count(level)) return _settings._prefetch_dists.at(level); - else if (is_target_set()) { - auto target = get_target(); - - // Defaults for various targets. 
- if (target == "knc") { - if (level == 1) - return 1; - else - return 2; - } - else if (target == "knl") { - if (level == 1) - return 1; - else - return 0; - } - else { - if (level == 1) - return 0; - else - return 2; - } - } return 0; } void StencilSolution::set_prefetch_dist(int level, @@ -112,7 +89,7 @@ namespace yask { if (distance < 0) THROW_YASK_EXCEPTION("Error: prefetch-distance " + to_string(distance) + - " is not positive."); + " is not zero or positive."); _settings._prefetch_dists[level] = distance; } yc_solution_base::soln_map& yc_solution_base::get_registry() { @@ -161,16 +138,16 @@ namespace yask { void StencilSolution::analyze_solution(int vlen, bool is_folding_efficient) { - // Find all the stencil dimensions from the vars. - // Create the final folds and clusters from the cmd-line options. + // Find all the stencil dimensions in the settings and/or vars. + // Create the final folds and clusters. _dims.set_dims(_vars, _settings, vlen, is_folding_efficient, *_dos); - // Determine which vars can be folded. - _vars.set_folding(_dims); + // Count dim types in each var and determine foldability. + _vars.set_dim_counts(_dims); // Determine which var points can be vectorized and analyze inner-loop accesses. - _eqs.analyze_vec(_dims); - _eqs.analyze_loop(_dims); + _eqs.analyze_vec(_settings, _dims); + _eqs.analyze_loop(_settings, _dims); // Find dependencies between equations. _eqs.analyze_eqs(_settings, _dims, *_dos); @@ -219,8 +196,11 @@ namespace yask { else if (target == "hsw" || target == "bdw") target = "avx2"; else if (target == "avx512f" || target == "skx" || - target == "skl" || target == "clx") + target == "skl" || target == "clx" || + target == "avx512-zmm" || target == "avx512hi") target = "avx512"; + else if (target == "avx512lo") + target = "avx512-ymm"; // Ensure all intermediate data is clean. _free(true); @@ -233,13 +213,11 @@ namespace yask { // Data itself will be created in analyze_solution(). 
if (target == "intel64") _printer = new YASKCppPrinter(*this, *_eq_bundles, *_eq_stages, *_cluster_eq_bundles); - else if (target == "knc") - _printer = new YASKKncPrinter(*this, *_eq_bundles, *_eq_stages, *_cluster_eq_bundles); else if (target == "avx" || target == "avx2") _printer = new YASKAvx256Printer(*this, *_eq_bundles, *_eq_stages, *_cluster_eq_bundles); else if (target == "avx512" || target == "knl") _printer = new YASKAvx512Printer(*this, *_eq_bundles, *_eq_stages, *_cluster_eq_bundles); - else if (target == "avx512lo") + else if (target == "avx512-ymm") _printer = new YASKAvx512Printer(*this, *_eq_bundles, *_eq_stages, *_cluster_eq_bundles, true); else if (target == "dot") _printer = new DOTPrinter(*this, *_cluster_eq_bundles, false); diff --git a/src/compiler/lib/Solution.hpp b/src/compiler/lib/Solution.hpp index 2ba920dd..21754171 100644 --- a/src/compiler/lib/Solution.hpp +++ b/src/compiler/lib/Solution.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -84,12 +84,7 @@ namespace yask { void _free(bool free_printer); public: - StencilSolution(const string& name) : - _name(name) { - yask_output_factory ofac; - auto so = ofac.new_stdout_output(); - set_debug_output(so); - } + StencilSolution(const string& name) : _name(name) { } virtual ~StencilSolution() { _free(true); } // Identification. @@ -125,7 +120,13 @@ namespace yask { _debug_output = debug; // to share ownership of referent. 
_dos = &_debug_output->get_ostream(); } - virtual yask_output_ptr get_debug_output() const { + virtual yask_output_ptr get_debug_output() { + if (!_debug_output.get()) { + yask_output_factory ofac; + auto so = ofac.new_stdout_output(); + set_debug_output(so); + } + assert(_debug_output.get()); return _debug_output; } virtual void set_name(std::string name) override { diff --git a/src/compiler/lib/Var.cpp b/src/compiler/lib/Var.cpp index 2adf0375..e8b053c1 100644 --- a/src/compiler/lib/Var.cpp +++ b/src/compiler/lib/Var.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -129,40 +129,51 @@ namespace yask { _dims = dims; } + // Simple accessors. + CompilerSettings& Var::get_settings() { return _soln->get_settings(); } + const Dimensions& Var::get_soln_dims() { return _soln->get_dims(); } + // Determine whether var can be folded. - void Var::set_folding(const Dimensions& dims) { + void Var::set_dim_counts(const Dimensions& dims) { + // Find num of dim types in this var. + _num_step_dims = 0; + _num_domain_dims = 0; + _num_misc_dims = 0; _num_foldable_dims = 0; + for (auto gdim : _dims) { + auto& dname = gdim->_get_name(); + auto dtype = gdim->get_type(); - // Never fold scalars, even if there is no vectorization. - if (get_num_dims() == 0) { - _is_foldable = false; - return; - } + if (dtype == STEP_INDEX) + _num_step_dims++; - // Find the number of folded dims used in this var. - for (auto fdim : dims._fold_gt1) { - auto& fdname = fdim._get_name(); - - // Search for dim in var. 
- bool found = false; - for (auto gdim : _dims) { - auto& gdname = gdim->_get_name(); - if (fdname == gdname) { - found = true; - break; - } + else if (dtype == DOMAIN_INDEX) { + _num_domain_dims++; + if (dims._fold_gt1.lookup(dname)) + _num_foldable_dims++; } - if (found) - _num_foldable_dims++; + + else if (dtype == MISC_INDEX) { + _num_misc_dims++; + } + + else + assert(0 && "internal error: unknown dim type"); } - // Can fold if ALL fold dims >1 are used in this var. + // Never fold vars without domain dims, even if there is no vectorization. + if (_num_domain_dims == 0) + _is_foldable = false; + + // Otherwise, can fold if ALL vec dims are used in this var. + else { - // NB: this will always be true if there is no vectorization, i.e., - // both are zero. We do this because the compiler expects stencils - // to be vectorizable. - _is_foldable = _num_foldable_dims == int(dims._fold_gt1.size()); + // NB: this will be true if there is no vectorization, i.e., + // both are zero. We do this because the compiler expects stencils + // to be vectorizable. + _is_foldable = _num_foldable_dims == int(dims._fold_gt1.size()); + } } // Determine whether halo sizes are equal. @@ -280,6 +291,14 @@ namespace yask { update_l1_dist(l1_dist); } + // Update write stages and offsets. + void Var::update_write_points(const string& stage_name, const IntTuple& offsets) { + auto& sdims = get_soln_dims(); + auto* sofs = offsets.lookup(sdims._step_dim); + if (sofs) + _write_points[stage_name] = *sofs; + } + // Update const indices based on 'indices'. void Var::update_const_indices(const IntTuple& indices) { @@ -304,36 +323,31 @@ namespace yask { } // Determine how many values in step-dim are needed. - int Var::get_step_dim_size() const + Var::StepDimInfo Var::get_step_dim_info() const { - // Specified by API. - if (_step_alloc > 0) - return _step_alloc; - + StepDimInfo sdi; + // No step-dim index used. 
auto step_dim = get_step_dim(); if (!step_dim) - return 1; + return sdi; - // Specified on cmd line. - if (_soln->get_settings()._step_alloc > 0) - return _soln->get_settings()._step_alloc; - // No info stored? if (_halos.size() == 0) - return 1; + return sdi; // Need the max across all stages. int max_sz = 1; - // Loop thru each stage w/halos. + // Loop thru each stage w/halos, including halos w/size zero. for (auto& hi : _halos) { -#ifdef DEBUG_HALOS - auto& pname = hi.first; -#endif + auto& stage_name = hi.first; auto& h2 = hi.second; - // First and last step-dim found. + // Written? + bool is_written = false; + + // First (lowest) and last (highest) step-dim offset. const int unset = -9999; int first_ofs = unset, last_ofs = unset; @@ -347,11 +361,15 @@ namespace yask { auto ofs = j.first; auto& halo = j.second; // halo tuple at step-val 'ofs'. + // Written here? + if (_write_points.count(stage_name) && _write_points.at(stage_name) == ofs) + is_written = true; + // Any existing value? if (halo.size()) { #ifdef DEBUG_HALOS cout << "** var " << _name << " has halo " << halo.make_dim_val_str() << - " at ofs " << ofs << " in stage " << pname << endl; + " at ofs " << ofs << " in stage " << stage_name << endl; #endif // Update vars. @@ -373,26 +391,38 @@ namespace yask { if (last_ofs != unset && first_ofs != unset && last_ofs != first_ofs) { // Default step-dim size is range of step offsets. - // For example, if equation touches 't' through 't+2', - // 'sz' is 3. + // For example, if equation touches 't-1' through 't+2', + // 'sz' is 4. int sz = last_ofs - first_ofs + 1; - // First and last largest halos. - int first_max_halo = 0, last_max_halo = 0; - for (auto& i : h2) { - //auto left = i.first; - auto& h3 = i.second; // map of step-dims to halos. + // Check for possible writeback. 
+ if (is_written) { - if (h3.count(first_ofs) && h3.at(first_ofs).size()) - first_max_halo = max(first_max_halo, h3.at(first_ofs).max()); - if (h3.count(last_ofs) && h3.at(last_ofs).size()) - last_max_halo = max(last_max_halo, h3.at(last_ofs).max()); - } + // First and last largest halos. + int first_max_halo = 0, last_max_halo = 0; + for (auto& i : h2) { + //auto left = i.first; + auto& h3 = i.second; // map of step-dims to halos. + + if (h3.count(first_ofs) && h3.at(first_ofs).size()) + first_max_halo = max(first_max_halo, h3.at(first_ofs).max()); + if (h3.count(last_ofs) && h3.at(last_ofs).size()) + last_max_halo = max(last_max_halo, h3.at(last_ofs).max()); + } - // If first and last halos are zero, we can further optimize storage by - // immediately reusing memory location. - if (sz > 1 && first_max_halo == 0 && last_max_halo == 0) - sz--; + // If first and last halos are zero, we can further optimize + // storage by immediately reusing memory location. + if (sz > 1 && first_max_halo == 0 && last_max_halo == 0) { + int write_ofs = _write_points.at(stage_name); + sz--; + if (last_ofs == write_ofs) // forward step. + sdi.writeback_ofs[stage_name] = first_ofs; // replace lowest read. + else if (first_ofs == write_ofs) // backward step. + sdi.writeback_ofs[stage_name] = last_ofs; // replace highest read. + else + assert(false && "write ofs is neither first or last"); + } + } // Keep max so far. max_sz = max(max_sz, sz); @@ -400,7 +430,16 @@ namespace yask { } // stages. - return max_sz; + // Start with the max size computed across all stages. + sdi.step_dim_size = max_sz; + + // Specified on cmd line. + if (_soln->get_settings()._step_alloc > 0) + sdi.step_dim_size = _soln->get_settings()._step_alloc; + // Override by API; applied last so it has the highest precedence. + if (_step_alloc > 0) + sdi.step_dim_size = _step_alloc; + return sdi; } // Description of this var.
diff --git a/src/compiler/lib/Var.hpp b/src/compiler/lib/Var.hpp index e483074b..0ed2b533 100644 --- a/src/compiler/lib/Var.hpp +++ b/src/compiler/lib/Var.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -28,6 +28,7 @@ IN THE SOFTWARE. #pragma once #include "Expr.hpp" +#include "VarPoint.hpp" using namespace std; @@ -35,16 +36,18 @@ namespace yask { // Fwd decl. struct Dimensions; + class CompilerSettings; // A class for a Var. // This is a generic container for all variables to be accessed // from the kernel. A 0-D var is a scalar, a 1-D var is an array, etc. - // Dims can be the step dim, a domain dim, or anything else. + // Dims can be the step dim, a domain dim, or a misc dim. class Var : public virtual yc_var { protected: string _name; // name of this var. - index_expr_ptr_vec _dims; // dimensions of this var. + index_expr_ptr_vec _dims; // dimensions of this var in param order. + index_expr_ptr_vec _layout_dims; // dimensions of this var in layout order. bool _is_scratch = false; // true if a temp var. // Step-dim info. @@ -54,8 +57,12 @@ namespace yask { // Ptr to solution that this var belongs to (its parent). StencilSolution* _soln = 0; - // How many dims are foldable. - int _num_foldable_dims = -1; // -1 => unknown. + // How many dims of various types. + // -1 => unknown. + int _num_step_dims = -1; + int _num_domain_dims = -1; + int _num_misc_dims = -1; + int _num_foldable_dims = -1; // Whether this var can be vector-folded. bool _is_foldable = false; @@ -72,9 +79,25 @@ namespace yask { // int key: step-dim offset or 0 if no step-dim. map>> _halos; + // Extra padding needed for read-ahead. + int _read_ahead_pad = 0; + + // Key: stage. 
+ // Val: step-index for write in stage. + map<string, int> _write_points; + // Greatest L1 dist of any halo point that accesses this var. int _l1_dist = 0; + virtual void _check_ok() const { + assert(_num_step_dims >= 0); + assert(_num_step_dims <= 1); + assert(_num_domain_dims >= 0); + assert(_num_misc_dims >= 0); + assert(_num_foldable_dims >= 0); + assert(_num_foldable_dims <= _num_domain_dims); + } + public: // Ctors. Var(string name, @@ -90,14 +113,29 @@ namespace yask { void set_name(const string& name) { _name = name; } string get_descr() const; - // Access dims. + // Access dims for this var (not for soln). + // They are returned in declaration order (not necessarily layout order). virtual const index_expr_ptr_vec& get_dims() const { return _dims; } + IntTuple get_dims_tuple() const { + IntTuple gdims; + for (const auto& dim : _dims) { + const auto& dname = dim->_get_name(); + gdims.add_dim_back(dname, 0); + } + return gdims; + } + virtual const index_expr_ptr_vec& get_layout_dims() const { return _layout_dims; } + virtual index_expr_ptr_vec& get_layout_dims() { return _layout_dims; } // Step dim or null if none. virtual const index_expr_ptr get_step_dim() const { - for (auto d : _dims) - if (d->get_type() == STEP_INDEX) - return d; + _check_ok(); + if (_num_step_dims > 0) { + for (auto d : _dims) + if (d->get_type() == STEP_INDEX) + return d; + assert(false && "internal error: step dim not found"); + } return nullptr; } @@ -107,14 +145,30 @@ namespace yask { // Access to solution. virtual StencilSolution* _get_soln() { return _soln; } virtual void set_soln(StencilSolution* soln) { _soln = soln; } + virtual CompilerSettings& get_settings(); + virtual const Dimensions& get_soln_dims(); + // Get dim-type counts.
+ virtual int get_num_step_dims() const { + _check_ok(); + return _num_step_dims; + } + virtual int get_num_domain_dims() const { + _check_ok(); + return _num_domain_dims; + } + virtual int get_num_misc_dims() const { + _check_ok(); + return _num_misc_dims; + } + // Get foldablity. virtual int get_num_foldable_dims() const { - assert(_num_foldable_dims >= 0); + _check_ok(); return _num_foldable_dims; } virtual bool is_foldable() const { - assert(_num_foldable_dims >= 0); + _check_ok(); return _is_foldable; } @@ -153,12 +207,23 @@ namespace yask { return h; } + // Extra padding. + virtual void set_read_ahead_pad(int n) { + _read_ahead_pad = n; + } + virtual void update_read_ahead_pad(int n) { + _read_ahead_pad = max(_read_ahead_pad, n); + } + virtual int get_read_ahead_pad() const { + return _read_ahead_pad; + } + // Get max L1 dist of halos. virtual int get_l1_dist() const { return _l1_dist; } - // Determine whether dims are same. + // Determine whether dims are same as 'other' var. virtual bool are_dims_same(const Var& other) const { if (_dims.size() != other._dims.size()) return false; @@ -173,10 +238,14 @@ namespace yask { } // Determine how many values in step-dim are needed. - virtual int get_step_dim_size() const; + struct StepDimInfo { + int step_dim_size = 1; + map writeback_ofs; + }; + virtual StepDimInfo get_step_dim_info() const; - // Determine whether var can be folded. - virtual void set_folding(const Dimensions& dims); + // Determine dim-type counts and whether var can be folded. + virtual void set_dim_counts(const Dimensions& dims); // Determine whether halo sizes are equal. virtual bool is_halo_same(const Var& other) const; @@ -187,6 +256,12 @@ namespace yask { // Update halos and L1 dist based on each value in 'offsets'. virtual void update_halo(const string& stage_name, const IntTuple& offsets); + // Stage(s) with writes. 
+ virtual const map& get_write_points() const { + return _write_points; + } + virtual void update_write_points(const string& stage_name, const IntTuple& offsets); + // Update L1 dist. virtual void update_l1_dist(int l1_dist) { _l1_dist = max(_l1_dist, l1_dist); @@ -209,7 +284,7 @@ namespace yask { assert(dp); return dp->_get_name(); } - virtual std::vector get_dim_names() const; + virtual string_vec get_dim_names() const; virtual bool is_dynamic_step_alloc() const { return !_is_step_alloc_fixed; @@ -220,7 +295,8 @@ namespace yask { } virtual idx_t get_step_alloc_size() const { - return get_step_dim_size(); + auto sdi = get_step_dim_info(); + return sdi.step_dim_size; } virtual void set_step_alloc_size(idx_t size) { @@ -279,10 +355,10 @@ namespace yask { _vars.insert(p); } - // Determine whether each var can be folded. - virtual void set_folding(const Dimensions& dims) { + // Determine dim-type counts and whether each var can be folded. + virtual void set_dim_counts(const Dimensions& dims) { for (auto gp : _vars) - gp->set_folding(dims); + gp->set_dim_counts(dims); } }; diff --git a/src/compiler/lib/VarPoint.cpp b/src/compiler/lib/VarPoint.cpp new file mode 100644 index 00000000..5fcc670c --- /dev/null +++ b/src/compiler/lib/VarPoint.cpp @@ -0,0 +1,327 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kit +Copyright (c) 2014-2022, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the 
Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +///////// VarPoint and EqualsExpr expression nodes. + +#include "Print.hpp" +#include "ExprUtils.hpp" +#include "Parse.hpp" +#include "Print.hpp" +#include "CppIntrin.hpp" + +namespace yask { + + // VarPoint methods. + VarPoint::VarPoint(Var* var, const num_expr_ptr_vec& args) : + _var(var), _args(args) { + assert(var); + + // Check for correct number of args. + size_t nd = var->get_dims().size(); + if (nd != args.size()) { + FORMAT_AND_THROW_YASK_EXCEPTION("Error: attempt to create a var point in " << + nd << "-D var '" << get_var_name() << "' with " << + args.size() << " indices"); + } + + // Eval each arg. +#ifdef DEBUG_GP + cout << "Creating var point " << make_quoted_str() << "...\n"; +#endif + auto dims = var->get_dims(); + for (size_t i = 0; i < nd; i++) { + auto dim = dims.at(i); + auto dname = dim->_get_name(); + auto arg = args.at(i); + assert(arg); +#ifdef DEBUG_GP + cout << " Arg " << arg->make_quoted_str() << + " at dim '" << dname << "'\n"; +#endif + int offset = 0; + + // A compile-time const? + if (arg->is_const_val()) { +#ifdef DEBUG_GP + cout << " is const val " << arg->get_int_val() << endl; +#endif + IntScalar c(dname, arg->get_int_val()); + set_arg_const(c); + } + + // A simple offset? 
+ else if (arg->is_offset_from(dname, offset)) { +#ifdef DEBUG_GP + cout << " has offset " << offset << endl; +#endif + IntScalar o(dname, offset); + set_arg_offset(o); + } + } + _update_str(); + } + const num_expr_ptr VarPoint::get_arg(const string& dim) const { + for (int di = 0; di < _var->get_num_dims(); di++) { + auto& dn = _var->get_dim_name(di); // name of dim at this posn. + if (dim == dn) + return _args.at(di); + } + return nullptr; + } + const string& VarPoint::get_var_name() const { + return _var->_get_name(); + } + bool VarPoint::is_var_foldable() const { + return _var->is_foldable(); + } + string VarPoint::make_arg_str(const VarMap* var_map) const { + string str; + int i = 0; + for (auto arg : _args) { + if (i++) str += ", "; + str += arg->make_str(var_map); + } + return str; + } + string VarPoint::make_arg_str(const string& dim, + const VarMap* var_map) const { + int i = 0; + auto arg = get_arg(dim); + assert(arg.get()); + string str = arg->make_str(var_map); + return str; + } + string VarPoint::_make_str(const VarMap* var_map) const { + string str = _var->_get_name() + "(" + + make_arg_str(var_map) + ")"; + return str; + } + string VarPoint::make_logical_var_str(const VarMap* var_map) const { + string str = _var->_get_name(); + if (_consts.size()) + str += "(" + _consts.make_dim_val_str() + ")"; + return str; + } + const index_expr_ptr_vec& VarPoint::get_dims() const { + return _var->get_dims(); + } + const index_expr_ptr_vec& VarPoint::get_layout_dims() const { + return _var->get_layout_dims(); + } + + // Make normalized string like "x+(4/VLEN_X)" from + // original arg "x+4" in 'dname' dim. + // Args w/o simple offset are not modified. + string VarPoint::make_norm_arg_str(const string& dname, + const Dimensions& dims, + const VarMap* var_map) const { + string res; + + // Const offset? + auto* ofs = _offsets.lookup(dname); + + // Zero offset? + if (ofs && *ofs == 0) + res = dname; + + // dname exists in fold? 
+ else if (ofs && dims._fold_gt1.lookup(dname)) + res = "(" + dname + dims.make_norm_str(*ofs, dname) + ")"; + + // Otherwise, just find and format arg as-is. + else { + auto& gdims = _var->get_dims(); + for (size_t i = 0; i < gdims.size(); i++) { + auto gdname = gdims[i]->_get_name(); + if (gdname == dname) { + res = _args.at(i)->make_str(var_map); + break; + } + } + } + assert(res.length()); + return res; + } + + // Make string like "x+(4/VLEN_X), y, z-(2/VLEN_Z)" from + // original args "x+4, y, z-2". + // This object has numerators; norm object has denominators. + // Args w/o simple offset are not modified. + string VarPoint::make_norm_arg_str(const Dimensions& dims, + const VarMap* var_map) const { + + string res; + auto& gd = _var->get_dims(); + for (size_t i = 0; i < gd.size(); i++) { + if (i) + res += ", "; + auto dname = gd[i]->_get_name(); + res += make_norm_arg_str(dname, dims, var_map); + } + return res; + } + + // Make string like "g->_wrap_step(t+1)" from original arg "t+1" + // if var uses step dim, "" otherwise. + // If var doesn't allow dynamic alloc, set to fixed value. + string VarPoint::make_step_arg_str(const string& var_ptr, + const Dimensions& dims) const { + + auto& gd = _var->get_dims(); + for (size_t i = 0; i < gd.size(); i++) { + auto dname = gd[i]->_get_name(); + auto& arg = _args.at(i); + if (dname == dims._step_dim) { + if (_var->is_dynamic_step_alloc()) + return var_ptr + "->_wrap_step(" + arg->make_str() + ")"; + else { + auto step_alloc = _var->get_step_alloc_size(); + if (step_alloc == 1) + return "0"; // 1 alloc => always index 0. + else + return "imod_flr(" + arg->make_str() + ", " + + to_string(step_alloc) + ")"; + } + } + } + return ""; + } + + // Set given arg to given offset; ignore if not in step or domain var dims. + void VarPoint::set_arg_offset(const IntScalar& offset) { + + // Find dim in var. 
+ auto gdims = _var->get_dims(); + for (size_t i = 0; i < gdims.size(); i++) { + auto gdim = gdims[i]; + + // Must be domain or step dim. + if (gdim->get_type() == MISC_INDEX) + continue; + + auto dname = gdim->_get_name(); + if (offset._get_name() == dname) { + + // Make offset equation. + int ofs = offset.get_val(); + auto ie = gdim->clone(); + num_expr_ptr nep; + if (ofs > 0) { + auto op = make_shared(ofs); + nep = make_shared(ie, op); + } + else if (ofs < 0) { + auto op = make_shared(-ofs); + nep = make_shared(ie, op); + } + else // 0 offset. + nep = ie; + + // Replace in args. + _args[i] = nep; + + // Set offset. + _offsets.add_dim_back(dname, ofs); + + // Remove const if it exists. + _consts = _consts.remove_dim(dname); + + break; + } + } + _update_str(); + } + + // Set given arg to given const; + void VarPoint::set_arg_const(const IntScalar& val) { + + // Find dim in var. + auto gdims = _var->get_dims(); + for (size_t i = 0; i < gdims.size(); i++) { + auto gdim = gdims[i]; + + auto dname = gdim->_get_name(); + if (val._get_name() == dname) { + + // Make const expr. + int v = val.get_val(); + auto vp = make_shared(v); + + // Replace in args. + _args[i] = vp; + + // Set const + _consts.add_dim_back(dname, v); + + // Remove offset if it exists. + _offsets = _offsets.remove_dim(dname); + + break; + } + } + _update_str(); + } + + // Set given arg to given expr. + void VarPoint::set_arg_expr(const string& expr_dim, const string& expr) { + + // Find dim in var. + auto gdims = _var->get_dims(); + for (size_t i = 0; i < gdims.size(); i++) { + auto gdim = gdims[i]; + auto dname = gdim->_get_name(); + if (expr_dim == dname) { + + // Make expr node. + auto ep = make_shared(expr); + + // Replace in args. + _args[i] = ep; + + // Remove const and/or offset if either exists. + _consts = _consts.remove_dim(dname); + _offsets = _offsets.remove_dim(dname); + + break; + } + } + _update_str(); + } + + // EqualsExpr methods. 
+ bool EqualsExpr::is_scratch() { + Var* gp = _get_var(); + return gp && gp->is_scratch(); + } + bool EqualsExpr::is_same(const Expr* other) const { + auto p = dynamic_cast<const EqualsExpr*>(other); + return p && + _lhs->is_same(p->_lhs.get()) && + _rhs->is_same(p->_rhs.get()) && + are_exprs_same(_cond, p->_cond) && // might be null. + are_exprs_same(_step_cond, p->_step_cond); // might be null. + } + +} // namespace yask. diff --git a/src/compiler/lib/VarPoint.hpp b/src/compiler/lib/VarPoint.hpp new file mode 100644 index 00000000..ea64a0d3 --- /dev/null +++ b/src/compiler/lib/VarPoint.hpp @@ -0,0 +1,314 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kit +Copyright (c) 2014-2022, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +///////// VarPoint and EqualsExpr expression nodes.
+ +#pragma once + +namespace yask { + + // One specific point in a var. + // This is an expression leaf-node. + class VarPoint : public NumExpr, + public virtual yc_var_point_node { + + public: + + // What kind of vectorization can be done on this point. + // Set via Eqs::analyze_vec(). + enum VecType { VEC_UNSET, + VEC_FULL, // vectorizable in all vec dims. + VEC_PARTIAL, // vectorizable in some vec dims. + VEC_NONE // vectorizable in no vec dims. + }; + + // Analysis of this point for accesses via loops through the domain dims. + // Set via Eqs::analyze_loop(). + enum VarDepType { DOMAIN_VAR_UNSET, // bogus value. + DOMAIN_VAR_INVARIANT, // not dependent on any domain dim. + DOMAIN_VAR_DEPENDENT, // dependent on some domain dim, but not inner-loop dim. + INNER_LOOP_OFFSET, // dependent on simple offset in inner-loop dim. + INNER_LOOP_COMPLEX // dependent on inner-loop dim in another way. + }; + + protected: + Var* _var = 0; // the var this point is from. + + // Index exprs for each dim, e.g., + // "3, x-5, y*2, z+4" for dims "n, x, y, z". + num_expr_ptr_vec _args; + + // Vars below are calculated from above. + + // Simple offset for each expr that is dim +/- offset, e.g., + // "x=-5, z=4" from above example ('y*2' is not an offset expr). + // Includes zero offsets. + // Set in ctor and modified via set_arg_offset/Const(). + IntTuple _offsets; + + // Simple value for each expr that is a const, e.g., + // "n=3" from above example. + // Set in ctor and modified via set_arg_offset/Const(). + IntTuple _consts; + + VecType _vec_type = VEC_UNSET; // allowed vectorization. + VarDepType _var_dep = DOMAIN_VAR_UNSET; // analysis for looping. + + // Cache the string repr. + string _def_str; + void _update_str() { + _def_str = _make_str(); + } + string _make_str(const VarMap* var_map = 0) const; + + public: + + // Construct a point given a var and an arg for each dim. + VarPoint(Var* var, const num_expr_ptr_vec& args); + + // Dtor. 
+ virtual ~VarPoint() {} + + // Get parent var info. + const Var* _get_var() const { return _var; } + Var* _get_var() { return _var; } + virtual const string& get_var_name() const; + virtual bool is_var_foldable() const; + virtual const index_expr_ptr_vec& get_dims() const; + virtual const index_expr_ptr_vec& get_layout_dims() const; + + // Accessors. + virtual const num_expr_ptr_vec& get_args() const { return _args; } + virtual const IntTuple& get_arg_offsets() const { return _offsets; } + virtual const IntTuple& get_arg_consts() const { return _consts; } + virtual VecType get_vec_type() const { + assert(_vec_type != VEC_UNSET); + return _vec_type; + } + virtual void set_vec_type(VecType vt) { + _vec_type = vt; + } + virtual VarDepType get_var_dep() const { + assert(_var_dep != DOMAIN_VAR_UNSET); + return _var_dep; + } + virtual void set_var_dep(VarDepType vt) { + _var_dep = vt; + } + + // Get arg for 'dim' or return null if none. + virtual const num_expr_ptr get_arg(const string& dim) const; + + // Set given arg to given offset; ignore if not in step or domain var dims. + virtual void set_arg_offset(const IntScalar& offset); + + // Set given args to be given offsets. + virtual void set_arg_offsets(const IntTuple& offsets) { + for (auto ofs : offsets) + set_arg_offset(ofs); + } + + // Set given arg to given const. + virtual void set_arg_const(const IntScalar& val); + + // Set given arg to given expr. + virtual void set_arg_expr(const string& expr_dim, const string& expr); + + // Some comparisons. + bool operator==(const VarPoint& rhs) const { + return _def_str == rhs._def_str; + } + bool operator<(const VarPoint& rhs) const { + return _def_str < rhs._def_str; + } + + // Take ev to each value. + virtual string accept(ExprVisitor* ev); + + // Check for equivalency. + virtual bool is_same(const Expr* other) const { + auto p = dynamic_cast(other); + return p && *this == *p; + } + + // Check for same logical var. 
A logical var is defined by the var + // itself and any const indices. + virtual bool is_same_logical_var(const VarPoint& rhs) const { + return _var == rhs._var && _consts == rhs._consts; + } + + // String w/name and parens around args, e.g., 'u(x, y+2)'. + // Apply substitutions to indices using 'var_map' if provided. + virtual string make_str(const VarMap* var_map = 0) const { + if (var_map) + return _make_str(var_map); + return _def_str; + } + + // String w/name and parens around const args, e.g., 'u(n=4)'. + // Apply substitutions to indices using 'var_map' if provided. + virtual string make_logical_var_str(const VarMap* var_map = 0) const; + + // String w/just comma-sep args, e.g., 'x, y+2'. + // Apply substitutions to indices using 'var_map' if provided. + virtual string make_arg_str(const VarMap* var_map = 0) const; + + // String w/just comma-sep args, e.g., 'y+2' in 'dname' dim. + // Apply substitutions to indices using 'var_map' if provided. + virtual string make_arg_str(const string& dname, + const VarMap* var_map = 0) const; + + // String v/vec-normalized args, e.g., 'x, y+(2/VLEN_Y)'. + // Apply substitutions to indices using 'var_map' if provided. + virtual string make_norm_arg_str(const Dimensions& dims, + const VarMap* var_map = 0) const; + + // Make string like "x+(4/VLEN_X)" from original arg "x+4" in 'dname' dim. + // Apply substitutions to indices using 'var_map' if provided. + virtual string make_norm_arg_str(const string& dname, + const Dimensions& dims, + const VarMap* var_map = 0) const; + + // Make string like "g->_wrap_step(t+1)" from original arg "t+1" + // if var uses step dim, "" otherwise. + virtual string make_step_arg_str(const string& var_ptr, const Dimensions& dims) const; + + // Create a deep copy of this expression, + // except pointed-to var is not copied. + virtual num_expr_ptr clone() const { + return make_shared(*this); + } + virtual var_point_ptr clone_var_point() const { + return make_shared(*this); + } + + // APIs. 
+ virtual yc_var* get_var(); + virtual const yc_var* get_var() const; + }; + + // Equality operator for a var point. + // This defines the LHS as equal to the RHS; it is NOT + // a comparison operator; it is NOT an assignment operator. + // It also holds an optional condition. + class EqualsExpr : public Expr, + public virtual yc_equation_node { + protected: + var_point_ptr _lhs; + num_expr_ptr _rhs; + bool_expr_ptr _cond; + bool_expr_ptr _step_cond; + + public: + EqualsExpr(var_point_ptr lhs, num_expr_ptr rhs, + bool_expr_ptr cond = nullptr, + bool_expr_ptr step_cond = nullptr) : + _lhs(lhs), _rhs(rhs), _cond(cond), _step_cond(step_cond) { } + EqualsExpr(const EqualsExpr& src) : + _lhs(src._lhs->clone_var_point()), + _rhs(src._rhs->clone()) { + if (src._cond) + _cond = src._cond->clone(); + else + _cond = nullptr; + if (src._step_cond) + _step_cond = src._step_cond->clone(); + else + _step_cond = nullptr; + } + + var_point_ptr& _get_lhs() { return _lhs; } + const var_point_ptr& _get_lhs() const { return _lhs; } + num_expr_ptr& _get_rhs() { return _rhs; } + const num_expr_ptr& _get_rhs() const { return _rhs; } + bool_expr_ptr& _get_cond() { return _cond; } + const bool_expr_ptr& _get_cond() const { return _cond; } + void _set_cond(bool_expr_ptr cond) { _cond = cond; } + bool_expr_ptr& _get_step_cond() { return _step_cond; } + const bool_expr_ptr& _get_step_cond() const { return _step_cond; } + void _set_step_cond(bool_expr_ptr step_cond) { _step_cond = step_cond; } + virtual string accept(ExprVisitor* ev); + static string expr_op_str() { return "EQUALS"; } + static string cond_op_str() { return "IF_DOMAIN"; } + static string step_cond_op_str() { return "IF_STEP"; } + + // Get pointer to var on LHS or NULL if not set. + virtual Var* _get_var() { + if (_lhs.get()) + return _lhs->_get_var(); + return NULL; + } + + // LHS is scratch var. + virtual bool is_scratch(); + + // Check for equivalency. 
+ virtual bool is_same(const Expr* other) const; + + // Create a deep copy of this expression. + virtual equals_expr_ptr clone() const { return make_shared(*this); } + virtual yc_equation_node_ptr clone_ast() const { + return clone(); + } + + // APIs. + virtual yc_var_point_node_ptr get_lhs() { return _lhs; } + virtual yc_number_node_ptr get_rhs() { return _rhs; } + virtual yc_bool_node_ptr get_cond() { return _cond; } + virtual yc_bool_node_ptr get_step_cond() { return _step_cond; } + virtual void set_cond(yc_bool_node_ptr cond) { + if (cond) { + auto p = dynamic_pointer_cast(cond); + assert(p); + _cond = p; + } else + _cond = nullptr; + } + virtual void set_step_cond(yc_bool_node_ptr step_cond) { + if (step_cond) { + auto p = dynamic_pointer_cast(step_cond); + assert(p); + _step_cond = p; + } else + _step_cond = nullptr; + } + }; + + typedef set VarPointSet; + typedef set var_point_ptr_set; + typedef vector VarPointVec; + +} // namespace yask. + +// Define hash function for VarPoint for unordered_{set,map}. +namespace std { + using namespace yask; + + template <> struct hash { + size_t operator()(const VarPoint& k) const { + return hash{}(k.make_str()); + } + }; +} diff --git a/src/compiler/lib/Vec.cpp b/src/compiler/lib/Vec.cpp index c128c511..09552011 100644 --- a/src/compiler/lib/Vec.cpp +++ b/src/compiler/lib/Vec.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -66,7 +66,7 @@ namespace yask { // Loop through all points in the vector fold. _dims._fold.visit_all_points([&](const IntTuple& vec_point, - size_t pelem){ + size_t pelem){ // Final offset in each dim is offset of var point plus // fold offset. 
diff --git a/src/compiler/lib/Vec.hpp b/src/compiler/lib/Vec.hpp index b3a43aee..d038adc7 100644 --- a/src/compiler/lib/Vec.hpp +++ b/src/compiler/lib/Vec.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -216,9 +216,8 @@ namespace yask { virtual string print_unaligned_vec_read(ostream& os, const VarPoint& gp) =0; // Print write to an aligned vector block. - // Return expression written. - virtual string print_aligned_vec_write(ostream& os, const VarPoint& gp, - const string& val) =0; + virtual void print_aligned_vec_write(ostream& os, const VarPoint& gp, + const string& val) =0; // Print conversion from existing vars to make an unaligned vector block. // Return var name. @@ -226,16 +225,16 @@ namespace yask { // Print construction for one point var pv_name from elems. virtual void print_unaligned_vec_ctor(ostream& os, const VarPoint& gp, - const string& pv_name) =0; + const string& pv_name) =0; // Read from a single point. // Return code for read. virtual string read_from_scalar_point(ostream& os, const VarPoint& gp, - const VarMap* v_map=0) =0; + const VarMap* v_map) =0; // Read from multiple points that are not vectorizable. // Return var name. - virtual string print_non_vec_read(ostream& os, const VarPoint& gp) =0; + virtual string print_partial_vec_read(ostream& os, const VarPoint& gp) =0; public: VecPrintHelper(VecInfoVisitor& vv, @@ -254,10 +253,9 @@ namespace yask { } // Access cached values. 
- virtual const string* save_point_var(const VarPoint& gp, const string& var) { + virtual void save_point_var(const VarPoint& gp, const string& var) { _vec_vars[gp] = var; - return &_vec_vars.at(gp); - } + } virtual const string* lookup_point_var(const VarPoint& gp) { if (_vec_vars.count(gp)) return &_vec_vars.at(gp); diff --git a/src/compiler/lib/Visitor.hpp b/src/compiler/lib/Visitor.hpp index 0cd53bd8..54ae534a 100644 --- a/src/compiler/lib/Visitor.hpp +++ b/src/compiler/lib/Visitor.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -28,6 +28,7 @@ IN THE SOFTWARE. #pragma once #include "Expr.hpp" +#include "VarPoint.hpp" namespace yask { @@ -99,7 +100,8 @@ namespace yask { return res; } - // Visit RHS of equals and LHS and conditions per flags. + // Visit RHS of equals always. + // Visit LHS and/or conditions per flags. virtual string visit(EqualsExpr* ee) { if (_visit_equals_lhs) ee->_get_lhs()->accept(this); diff --git a/src/compiler/lib/YaskKernel.cpp b/src/compiler/lib/YaskKernel.cpp index 801cbb9f..427f3363 100644 --- a/src/compiler/lib/YaskKernel.cpp +++ b/src/compiler/lib/YaskKernel.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -30,12 +30,32 @@ IN THE SOFTWARE. namespace yask { // Print extraction of indices. 
- void YASKCppPrinter::print_indices(ostream& os) const { - os << endl << " // Extract individual indices.\n"; + void YASKCppPrinter::print_indices(ostream& os, + bool print_step, bool print_domain, + const string var_prefix, + const string inner_var_prefix) const { + if (print_step && print_domain) + os << "\n // Extract index for each stencil dim.\n"; + else if (print_step) + os << "\n // Extract index for the step dim.\n"; + else if (print_domain) + os << "\n // Extract index for each domain dim.\n"; + else + return; int i = 0; for (auto& dim : _dims._stencil_dims) { auto& dname = dim._get_name(); - os << " idx_t " << dname << " = idxs[" << i << "];\n"; + bool is_step = dname == _dims._step_dim; + bool is_inner = dname == _settings._inner_loop_dim; + if ((print_step && is_step) || + (print_domain && !is_step)) { + if (inner_var_prefix.length() && is_inner) + os << " idx_t " << dname << " = " << inner_var_prefix << i << ";\n"; + else if (var_prefix.length()) + os << " idx_t " << dname << " = " << var_prefix << i << ";\n"; + else + os << " idx_t " << dname << " = idxs[" << i << "];\n"; + } i++; } } @@ -44,7 +64,7 @@ namespace yask { void YASKCppPrinter::add_comment(ostream& os, EqBundle& eq) { // Use a simple human-readable visitor to create a comment. - PrintHelper ph(_settings, _dims, NULL, "temp", "", " // ", ".\n"); + PrintHelper ph(_settings, _dims, NULL, "", " // ", ".\n"); PrintVisitorTopDown commenter(os, ph); eq.visit_eqs(&commenter); } @@ -52,17 +72,20 @@ namespace yask { // Print YASK code in new stencil context class. void YASKCppPrinter::print(ostream& os) { + string sname = _stencil.get_long_name(); os << "// Automatically-generated code; do not edit.\n" - "\n////// YASK implementation of the '" << _stencil._get_name() << + "\n////// YASK implementation of the '" << sname << "' stencil //////\n"; // Macros. 
- os << "\n#ifdef DEFINE_MACROS\n"; + os << "\n#if defined(DEFINE_MACROS) && !defined(MACROS_DONE)\n" + "#define MACROS_DONE\n"; print_macros(os); os << "\n#endif // DEFINE_MACROS\n"; // Stencil-context code. - os << "\n#ifdef DEFINE_CONTEXT\n" + os << "\n#if defined(DEFINE_CONTEXT) && !defined(CONTEXT_DONE)\n" + "#define CONTEXT_DONE\n" "namespace yask {" << endl; // First, create a class to hold the data (vars). @@ -83,34 +106,59 @@ namespace yask { // in favor of consts or templates. void YASKCppPrinter::print_macros(ostream& os) { + string sname = _stencil._get_name(); os << "// Stencil solution:\n" - "#define YASK_STENCIL_NAME \"" << _stencil._get_name() << "\"\n" + "#define YASK_STENCIL_NAME \"" << sname << "\"\n" "#define YASK_STENCIL_CONTEXT " << _context << endl; - - os << "\n// target:\n" + os << "\n// Target:\n" "#define YASK_TARGET \"" << _settings._target << "\"\n" - "#define REAL_BYTES (" << _settings._elem_bytes << ")\n"; + "#define REAL_BYTES " << _settings._elem_bytes << endl; - os << "\n// Number of domain dimensions:\n" - "#define NUM_DOMAIN_DIMS " << _dims._domain_dims.size() << "\n"; - int i = 0; - for (auto& dim : _dims._domain_dims) { + auto nsdims = _dims._stencil_dims.size(); + os << "\n// Dimensions:\n" + "#define STEP_DIM " << _dims._step_dim << endl << + "#define INNER_LOOP_DIM " << _settings._inner_loop_dim << endl; + os << "#define NUM_DOMAIN_DIMS " << _dims._domain_dims.size() << endl; + for (int i = 0; i < _dims._domain_dims.get_num_dims(); i++) { + auto& dim = _dims._domain_dims(i); auto& dname = dim._get_name(); - os << "#define DOMAIN_DIM_IDX_" << dname << " (" << (i++) << ")\n"; + os << "#define DOMAIN_DIM_IDX_" << dname << " " << i << endl; } - i = 0; - for (auto& dim : _dims._stencil_dims) { + os << "#define NUM_STENCIL_DIMS " << nsdims << endl; + for (int i = 0; i < _dims._stencil_dims.get_num_dims(); i++) { + auto& dim = _dims._stencil_dims(i); auto& dname = dim._get_name(); - os << "#define STENCIL_DIM_IDX_" << dname << " (" 
<< (i++) << ")\n"; + os << "#define STENCIL_DIM_IDX_" << dname << " " << i << endl; } + os << "#define STENCIL_DIM_IDX_INNER_LOOP " << _dims._inner_loop_dim_num << endl; + os << "#define DOMAIN_LOOP_DIMS "; + bool need_comma = false; + for (int i = 0; i < _dims._domain_dims.get_num_dims(); i++) { + auto& dim = _dims._domain_dims(i); + auto& dname = dim._get_name(); + if (need_comma) + os << ","; + os << (i+1); + need_comma = true; + } + os << "\n#define PICO_BLOCK_OUTER_LOOP_DIMS "; + need_comma = false; + for (int i = 0; i < _dims._domain_dims.get_num_dims(); i++) { + if (i+1 != _dims._inner_loop_dim_num) { + auto& dim = _dims._domain_dims(i); + auto& dname = dim._get_name(); + if (need_comma) + os << ","; + os << (i+1); + need_comma = true; + } + } + os << "\n#define PICO_BLOCK_INNER_LOOP_DIM " << _dims._inner_loop_dim_num << "\n"; int gdims = 0; for (auto gp : _vars) { int ndims = gp->get_num_dims(); gdims = max(gdims, ndims); } - auto nsdims = _dims._stencil_dims.size(); - os << "\n// Number of stencil dimensions (step and domain):\n" - "#define NUM_STENCIL_DIMS " << nsdims << endl; os << "\n// Max number of var dimensions:\n" "#define NUM_VAR_DIMS " << gdims << endl; os << "\n// Max of stencil and var dims:\n" @@ -120,17 +168,25 @@ namespace yask { "#define NUM_STENCIL_EQS " << _stencil.get_num_equations() << endl; // Vec/cluster lengths. 
- auto nvec = _dims._fold_gt1._get_num_dims(); + auto nvec = _dims._fold_gt1.get_num_dims(); os << "\n// One vector fold: " << _dims._fold.make_dim_val_str(" * ") << endl; for (auto& dim : _dims._fold) { auto& dname = dim._get_name(); string uc_dim = all_caps(dname); - os << "#define VLEN_" << uc_dim << " (" << dim.get_val() << ")" << endl; + os << "#define VLEN_" << uc_dim << " " << dim.get_val() << endl; } os << "namespace yask {\n" + "\n // Number of points or multipliers in domain dims.\n" " constexpr idx_t fold_pts[]{ " << _dims._fold.make_val_str() << " };\n" + " constexpr idx_t cluster_pts[]{ " << _dims._cluster_pts.make_val_str() << " };\n" + " constexpr idx_t cluster_mults[]{ " << _dims._cluster_mults.make_val_str() << " };\n" + "\n // Number of points or multipliers in stencil dims.\n" + " constexpr idx_t stencil_fold_pts[]{ 1, " << _dims._fold.make_val_str() << " };\n" + " constexpr idx_t stencil_cluster_pts[]{ 1, " << _dims._cluster_pts.make_val_str() << " };\n" + " constexpr idx_t stencil_cluster_mults[]{ 1, " << _dims._cluster_mults.make_val_str() << " };\n" "}\n"; - os << "#define VLEN (" << _dims._fold.product() << ")" << endl; + os << "#define VLEN (" << _dims._fold.product() << ")\n" + "#define CPTS (" << _dims._cluster_pts.product() << ")\n"; os << "#define FIRST_FOLD_INDEX_IS_UNIT_STRIDE (" << (_dims._fold.is_first_inner() ? 1 : 0) << ")" << endl; os << "#define NUM_VEC_FOLD_DIMS (" << nvec << ")" << endl; @@ -183,42 +239,38 @@ namespace yask { } } - // Print YASK data class. + // A handy macro to create some local vars based on a Var ptr. 
+ #define VAR_DECLS(gp) \ + int ndims = gp->get_num_dims(); \ + auto gdims = gp->get_dims_tuple(); \ + string var = gp->_get_name(); \ + string vprefix = "var_" + var; \ + string base_t = vprefix + "_base_t"; \ + string ptr_t = vprefix + "_base_ptr_t"; \ + string core_t = vprefix + "_core_t"; \ + string base_ptr = vprefix + "_base_p"; \ + string core_ptr = vprefix + "_core_p"; \ + string var_ptr = vprefix + "_p"; \ + string var_list = vprefix + "_list"; \ + string var_dim_names = vprefix + "_dim_names" + + // Print YASK var types and core-data class. void YASKCppPrinter::print_data(ostream& os) { - // get stats. - CounterVisitor cve; - _eq_bundles.visit_eqs(&cve); - - os << "\n ////// Stencil-specific data //////" << endl << - "class " << _context_base << " : public StencilContext {\n" - "public:\n"; - - // APIs. - os << "\n virtual std::string get_target() const override {\n" - " return \"" << _settings._target << "\";\n" - " }\n" - "\n virtual int get_element_bytes() const override {\n" - " return " << _settings._elem_bytes << ";\n" - " }\n"; - - // Save data for ctor and new-var method. - string ctor_code, ctor_list, new_var_code, scratch_code; - set new_var_dims; - - // Vars. - os << "\n ///// Var(s)." << endl; + // Dim layout order to be applied to all vars, + // regardless of their dim-declaration order. + os << "\n ///// Stencil var type(s).\n" + " // General array layout order (outer-to-inner): "; + for (size_t i = 0; i < _dims._layout_dims.size(); i++) { + if (i > 0) + os << ", "; + os << _dims._layout_dims.get_dim(i)._get_name(); + } + os << ".\n"; + + // Var types. for (auto gp : _vars) { - string var = gp->_get_name(); - int ndims = gp->get_num_dims(); - - // Tuple version of dims. 
- IntTuple gdims; - for (int dn = 0; dn < ndims; dn++) { - auto& dim = gp->get_dims()[dn]; - auto& dname = dim->_get_name(); - gdims.add_dim_back(dname, 0); - } + VAR_DECLS(gp); os << "\n // The "; if (ndims) @@ -233,9 +285,10 @@ namespace yask { else os << "not updated by any equation (read-only).\n"; if (ndims) { - os << " // Dimensions: "; + os << " // Dimensions in parameter (declaration) order: "; for (int dn = 0; dn < ndims; dn++) { - if (dn) os << ", "; + if (dn) + os << ", "; auto& dim = gp->get_dims()[dn]; auto& dname = dim->_get_name(); os << "'" << dname << "'(#" << (dn+1) << ")"; @@ -244,128 +297,522 @@ namespace yask { } // Use vector-folded layout if possible. + // Possible when a var contains all of the dims with + // vec-len > 1. bool folded = gp->is_foldable(); - string gtype = folded ? "YkVecVar" : "YkElemVar"; + string vtype = folded ? "YkVecVar" : "YkElemVar"; + string ctype = vtype + "Core"; + // Create the template params. // Type-name in kernel is 'VAR_TYPE'. - string type_name = gtype + " vlens; - vector misc_posns; - - // 1-D or more. - if (ndims) { - for (int dn = 0; dn < ndims; dn++) { - auto& dim = gp->get_dims()[dn]; - auto& dname = dim->_get_name(); - auto dtype = dim->get_type(); - bool defer = false; // add dim later. + { + string templ; + bool got_step = false; + + // 1-D or more. + if (ndims) { + int ndone = 0; + auto& ldims = gp->get_layout_dims(); + os << " // Dimensions in layout order: "; + + // Preferred order. + for (size_t i = 0; i < _dims._layout_dims.size(); i++) { + auto& dname = _dims._layout_dims.get_dim(i)._get_name(); + + // Find in this var (if it exists). + for (int dn = 0; dn < ndims; dn++) { + auto& dim = gp->get_dims()[dn]; + if (dname != dim->_get_name()) + continue; + auto dtype = dim->get_type(); + + // Update layout list in var. + ldims.push_back(dim); + + // Step dim? 
+ if (dtype == STEP_INDEX) { + assert(dname == _dims._step_dim); + if (dn > 0) { + THROW_YASK_EXCEPTION("Error: cannot create var '" + var + + "' with dimensions '" + gdims.make_dim_str() + + "' because '" + dname + "' must be first dimension"); + } + got_step = true; + } + + // Inner domain dim? + else if (dname == _dims._inner_layout_dim) { + assert(dtype == DOMAIN_INDEX); + } + + // Add this index position to layout template. + templ += to_string(dn+1); + if (ndone) + os << ", "; + os << dname; + ndone++; + + break; // This dim has been found. + } // dims in parameter order. + } // dims in layout order. + assert(ndims == ndone); + assert(ndims == int(ldims.size())); + os << ".\n"; + + } // not scalar. + + // Scalar. + else + templ += "0d"; // Trivial scalar layout. - // Step dim? - // If this exists, it will get placed near to the end, - // just before the inner & misc dims. - if (dtype == STEP_INDEX) { - assert(dname == _dims._step_dim); - if (dn > 0) { - THROW_YASK_EXCEPTION("Error: cannot create var '" + var + - "' with dimensions '" + gdims.make_dim_str() + - "' because '" + dname + "' must be first dimension"); - } - if (folded) { - step_posn = dn + 1; - defer = true; - } + // Add step-dim flag. + if (got_step) + templ += ", true"; + else + templ += ", false"; + + // Vector lengths. + if (folded) { + for (int dn = 0; dn < ndims; dn++) { + auto& dim = gp->get_dims()[dn]; + auto& dname = dim->_get_name(); + + // Add vector len to list. + auto* p = _dims._fold.lookup(dname); + int dval = p ? *p : 1; + templ += ", " + to_string(dval); } + } - // Inner domain dim? - // If this exists, it will get placed at or near the end. - else if (dname == _dims._inner_dim) { - assert(dtype == DOMAIN_INDEX); - if (folded) { - inner_posn = dn + 1; - defer = true; - } - } + templ.insert(0, " " << ptr_t << ";\n" << + " typedef " << ctype << " " << core_t << ";\n"; + } // vars. - // Add index position to layout. 
- if (!defer) { - int other_posn = dn + 1; - type_name += to_string(other_posn); + // Types with data needed in kernels. + { + os << "\n // Per-thread data needed in kernel(s).\n" + "struct " << _thread_core_t << " {\n"; + + bool found = false; + for (auto gp : _vars) { + VAR_DECLS(gp); + if (gp->is_scratch()) { + if (!found) + os << "\n // Pointer(s) to scratch-var core data.\n"; + os << " synced_ptr<" << core_t << "> " << core_ptr << ";\n"; + found = true; + } + } + if (!found) + os << "\n // No per-thread data needed for this stencil.\n"; + os << "}; // " << _thread_core_t << endl; + } + { + os << "\n // Data needed in kernel(s).\n" + "struct " << _core_t << " : public StencilCoreBase {\n"; + + os << "\n // Pointer(s) to var core data.\n"; + for (auto gp : _vars) { + VAR_DECLS(gp); + if (!gp->is_scratch()) + os << " synced_ptr<" << core_t << "> " << core_ptr << ";\n"; + } + os << "\n // List of pointer(s) to per-thread data.\n" + " synced_ptr<" << _thread_core_t << "> _thread_core_list;\n" + "}; // " << _core_t << endl; + } + } + + // Print YASK equation bundles. + void YASKCppPrinter::print_eq_bundles(ostream& os) { + + for (auto& bp : _eq_stages.get_all()) { + string stage_name = bp->_get_name(); + os << "\n //////// Stencil "; + if (bp->is_scratch()) + os << "scratch-"; + os << "stage '" << stage_name << "' //////\n"; + + // Bundles in this stage; + for (auto& eq : bp->get_bundles()) { + + // Find equation index. + // TODO: remove need for this. + int ei = 0; + for (; ei < _eq_bundles.get_num(); ei++) { + if (eq == _eq_bundles.get_all().at(ei)) + break; + } + assert(ei < _eq_bundles.get_num()); + string eg_name = eq->_get_name(); + string eg_desc = eq->get_descr(); + string egs_name = _stencil_prefix + eg_name; + + // Stats for this eq_bundle. 
+ CounterVisitor stats; + eq->visit_eqs(&stats); + + os << endl << " ////// Stencil " << eg_desc << " //////\n" << + "\n struct " << egs_name << " {\n" + " const char* _name = \"" << eg_name << "\";\n" + " const int _scalar_fp_ops = " << stats.get_num_ops() << ";\n" + " const int _scalar_points_read = " << stats.get_num_reads() << ";\n" + " const int _scalar_points_written = " << stats.get_num_writes() << ";\n" + " const bool _is_scratch = " << (eq->is_scratch() ? "true" : "false") << ";\n"; + + // Example computation. + os << endl << " // " << stats.get_num_ops() << " FP operation(s) per point:\n"; + add_comment(os, *eq); + + // Domain condition. + { + os << "\n // Determine whether " << egs_name << " is valid at the domain indices " << + _dims._stencil_dims.make_dim_str() << ".\n" + " // Return true if indices are within the valid sub-domain or false otherwise.\n" + " ALWAYS_INLINE static bool is_in_valid_domain(const " << + _core_t << "* core_data, const Indices& idxs) {" + " host_assert(core_data);\n"; + print_indices(os); + if (eq->cond) + os << " return " << eq->cond->make_str() << ";\n"; + else + os << " return true; // full domain.\n"; + os << " }\n"; + + os << "\n // Return whether there is a sub-domain expression.\n" + " ALWAYS_INLINE static bool is_sub_domain_expr() {\n" + " return " << (eq->cond ? "true" : "false") << + ";\n }\n"; + + os << "\n // Return human-readable description of sub-domain.\n" + " inline std::string get_domain_description() const {\n"; + if (eq->cond) + os << " return \"" << eq->cond->make_str() << "\";\n"; + else + os << " return \"true\"; // full domain.\n"; + os << " }\n"; + } + + // Step condition. 
+ { + os << endl << " // Determine whether " << egs_name << + " is valid at the step input_step_index.\n" << + " // Return true if valid or false otherwise.\n" + " ALWAYS_INLINE static bool is_in_valid_step(const " << + _core_t << "* core_data, idx_t input_step_index) {" + " host_assert(core_data);\n"; + if (eq->step_cond) { + os << " idx_t " << _dims._step_dim << " = input_step_index;\n" + "\n // " << eq->step_cond->make_str() << "\n"; + + // C++ scalar print assistant. + CounterVisitor cv; + eq->step_cond->accept(&cv); + CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "real_t", " ", ";\n"); + + // Generate the code. + PrintVisitorTopDown pcv(os, *sp); + string expr = eq->step_cond->accept(&pcv); + os << " return " << expr << ";\n"; } + else + os << " return true; // any step.\n"; + os << " }\n"; + + os << "\n // Return whether there is a step-condition expression.\n" + " ALWAYS_INLINE static bool is_step_cond_expr() {\n" + " return " << (eq->step_cond ? "true" : "false") << + ";\n }\n"; + + os << "\n // Return human-readable description of step condition.\n" + " inline std::string get_step_cond_description() const {\n"; + if (eq->step_cond) + os << " return \"" << eq->step_cond->make_str() << "\";\n"; + else + os << " return \"true\"; // any step.\n"; + os << " }\n"; + } - // Add vector len to list. - if (folded) { - auto* p = _dims._fold.lookup(dname); - int dval = p ? *p : 1; - vlens.push_back(dval); + // LHS step index. 
+ { + os << endl; + if (eq->step_expr) + os << " // Set 'output_step_index' to the step that an update" + " occurs when calling one of the calc_*() methods with" + " 'input_step_index' and return 'true'.\n"; + else + os << "// Return 'false' because this bundle does not update" + " vars with the step dimension.\n"; + os << " ALWAYS_INLINE static bool get_output_step_index(idx_t input_step_index," + " idx_t& output_step_index) {\n"; + if (eq->step_expr) { + os << " idx_t " << _dims._step_dim << " = input_step_index;\n" + " output_step_index = " << eq->step_expr->make_str() << ";\n" + " return true;\n"; } + else + os << " return false;\n"; + os << " }\n"; } + + // Scalar code. + { + // Stencil-calculation code. + // Function header. + os << endl << " // Calculate one scalar result relative to indices " << + _dims._stencil_dims.make_dim_str() << ".\n" + " // There are approximately " << stats.get_num_ops() << + " FP operation(s) per invocation.\n" + " static void calc_scalar(" << + _core_t << "* core_data, int core_idx, const Indices& idxs) {\n" + " host_assert(core_data);\n" + " host_assert(core_data->_thread_core_list.get());\n" + " auto& thread_core_data = core_data->_thread_core_list[core_idx];\n"; + print_indices(os); - // Add deferred posns at end. - if (step_posn) - type_name += to_string(step_posn); - if (inner_posn) - type_name += to_string(inner_posn); - for (auto mp : misc_posns) - type_name += to_string(mp); - } + // C++ scalar print assistant. + CounterVisitor cv; + eq->visit_eqs(&cv); + CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "real_t", " ", ";\n"); - // Scalar. - else - type_name += "0d"; // Trivial scalar layout. + // Generate the code. + PrintVisitorBottomUp pcv(os, *sp); + eq->visit_eqs(&pcv); - // Add step-dim flag. - if (step_posn) - type_name += ", true"; - else - type_name += ", false"; + // End of function. + os << "} // calc_scalar." << endl; - // Add vec lens. 
- if (folded) { - for (auto i : vlens) - type_name += ", " + to_string(i); - } + delete sp; + } - type_name += ">"; + // Vector/Cluster code. + for (bool do_cluster : { false, true }) { - // Typedef. - string type_def = var + "_type"; - string ptr_type_def = var + "_ptr_type"; - os << " typedef " << type_name << " " << type_def << ";\n" << - " typedef std::shared_ptr<" << type_def << "> " << ptr_type_def << ";\n" - " VarDimNames " + var + "_dim_names;\n"; + // Cluster eq_bundle at same 'ei' index. + // This should be the same eq-bundle because it was copied from the + // scalar one. + auto& vceq = do_cluster ? + _cluster_eq_bundles.get_all().at(ei) : eq; + assert(eg_desc == vceq->get_descr()); - ctor_code += "\n // Var '" + var + "'.\n"; - ctor_code += " " + var + "_dim_names = {" + + // Create vector info for this eq_bundle. The visitor is + // accepted at all nodes in the cluster AST; for each var + // access node in the AST, the vectors needed are determined + // and saved in the visitor. + VecInfoVisitor vv(_dims); + vceq->visit_eqs(&vv); + + // Collect stats. + CounterVisitor cv; + vceq->visit_eqs(&cv); + int num_results = do_cluster ? + _dims._cluster_pts.product() : + _dims._fold.product(); + + // Vector/cluster vars. + string idim = _settings._inner_loop_dim; + string vcstr = do_cluster ? "cluster" : "vector"; + string funcstr = "calc_" + vcstr + "s"; + string nvecs = do_cluster ? "CMULT_" + all_caps(idim) : "1"; + string nelems = (do_cluster ? nvecs + " * ": "") + "VLEN_" + all_caps(idim); + string write_mask = do_cluster ? "" : "write_mask"; + + // Loop-calculation code. + // Function header. 
+ os << endl << " // Calculate a nano-block of " << vcstr << "s bounded by 'norm_nb_idxs'.\n"; + if (do_cluster) + os << " // Each cluster calculates '" << _dims._cluster_pts.make_dim_val_str(" * ") << + "' point(s) containing " << _dims._cluster_mults.product() << " '" << + _dims._fold.make_dim_val_str(" * ") << "' vector(s).\n"; + else + os << " // Each vector calculates '" << _dims._fold.make_dim_val_str(" * ") << + "' point(s).\n"; + os << " // Indices must be rank-relative (not global).\n" + " // Indices must be normalized, i.e., already divided by VLEN_*.\n" + " // SIMD calculations use " << vv.get_num_points() << + " vector block(s) created from " << vv.get_num_aligned_vecs() << + " aligned vector-block(s).\n" + " // There are approximately " << (stats.get_num_ops() * num_results) << + " FP operation(s) per inner-loop iteration.\n" << + " static void " << funcstr << "(" << + _core_t << "* core_data, int core_idx, int block_thread_idx," + " int thread_limit, ScanIndices& norm_nb_idxs"; + if (!do_cluster) + os << ", bit_mask_t " << write_mask; + os << ") {\n" + " FORCE_INLINE_RECURSIVE {\n" + " assert(core_data);\n" + " assert(core_data->_thread_core_list.get());\n" + " auto& thread_core_data = core_data->_thread_core_list[core_idx];\n" + " const Indices& idxs = norm_nb_idxs.start;\n"; + print_indices(os, true, false); // Just step index. + + // C++ vector print assistant. + auto* vp = new_cpp_vec_print_helper(vv, cv); + vp->set_write_mask(write_mask); + vp->set_using_cluster(do_cluster); + vp->set_stage_name(stage_name); + vp->get_point_stats(); + + // Print loop-invariant meta values. + // Store them in the CppVecPrintHelper for later use in the loop body. + os << "\n ////// Loop-invariant meta values.\n"; + CppPreLoopPrintMetaVisitor plpmv(os, *vp); + vceq->visit_eqs(&plpmv); + vp->print_rank_data(os); + + // Print loop-invariant data values. + // Store them in the CppVecPrintHelper for later use in the loop body. 
+ CppPreLoopPrintDataVisitor plpdv(os, *vp); + vceq->visit_eqs(&plpdv); + + // Inner-loop strides. + // Will be 1 for vectors and cluster-mults for clusters. + string inner_strides = do_cluster ? + "stencil_cluster_mults[dn]" : + "idx_t(1)"; + + // Computation loops. + // Include generated loops. + os << + "\n // Nano loops.\n" + "#define NANO_BLOCK_LOOP_INDICES norm_nb_idxs\n" + "\n // Start Nano loop(s).\n" + "#define NANO_BLOCK_USE_LOOP_PART_0\n" + "#include \"yask_nano_block_loops.hpp\"\n"; + os << + "\n // Pico loops inside nano loops.\n" + " // Use macros to get values directly from nano loops.\n" + "#define PICO_BLOCK_BEGIN(dn) NANO_BLOCK_BODY_START(dn)\n" + "#define PICO_BLOCK_END(dn) NANO_BLOCK_BODY_STOP(dn)\n" + "#define PICO_BLOCK_STRIDE(dn) " << inner_strides << "\n"; + os << + "\n // Start Pico outer-loop(s).\n" + "#define PICO_BLOCK_USE_LOOP_PART_0\n" + "#include \"yask_pico_block_loops.hpp\"\n"; + + // Get named domain indices directly from scalar vars. + print_indices(os, false, true, "pico_block_start_", "pico_block_begin_"); + vp->print_elem_indices(os); + + // Create inner-loop base ptrs. + os << "\n // Set up for inner loop.\n"; + vp->print_inner_loop_prefix(os); + + // Initial prefetches, if any. + vp->print_prefetches(os, false); + + // Create and init buffers, if any. + vp->print_buffer_code(os, false); + + auto& ild = _settings._inner_loop_dim; + os << + "\n // Start Pico inner-loop for dim '" << ild << "'.\n" + "#define PICO_BLOCK_USE_LOOP_PART_1\n" + "#include \"yask_pico_block_loops.hpp\"\n"; + + // Issue loads early. + if (_settings._early_loads) + vp->print_early_loads(os); + + // Generate loop body using vars stored in print helper. + // Visit all expressions to cover the whole vector/cluster. + PrintVisitorBottomUp pcv(os, *vp); + vceq->visit_eqs(&pcv); + + // Insert prefetches using vars stored in print helper for next iteration. + vp->print_prefetches(os, true); + + // Shift and fill buffers. 
+ vp->print_buffer_code(os, true); + + // Increment indices, etc. + vp->print_end_inner_loop(os); + + // End of loops. + os << + "\n ////// Loop endings.\n" + "#define PICO_BLOCK_USE_LOOP_PART_2\n" + "#include \"yask_pico_block_loops.hpp\"\n" + "#define NANO_BLOCK_USE_LOOP_PART_1\n" + "#include \"yask_nano_block_loops.hpp\"\n"; + + // End of recursive block & function. + os << "} } // " << funcstr << ".\n"; + delete vp; + } // cluster/vector + + os << "}; // " << egs_name << ".\n" // end of struct. + " static_assert(std::is_trivially_copyable<" << egs_name << ">::value," + "\"Needed for OpenMP offload\");\n"; + + } // stencil eq_bundles. + } // stages. + } + + // Print derived YASK context. + void YASKCppPrinter::print_context(ostream& os) { + + os << "\n ////// User-provided code //////" << endl << + "struct " << _context_hook << " {\n" + " static void call_after_new_solution(yk_solution& kernel_soln) {\n" + " // Code provided by user.\n"; + for (auto& code : _stencil.get_kernel_code()) + os << " " << code << "\n"; + os << " }\n" + "};\n"; + + os << "\n ////// Overall stencil-specific context //////" << endl << + "class " << _context << " : public StencilContext {\n" + " protected:\n"; + + // Save code to be added later. + string ctor_code, new_var_code, scratch_code, core_code; + set new_var_dims; + + // Vars. + os << "\n ///// Var(s)." << endl; + for (auto gp : _vars) { + VAR_DECLS(gp); + + string header = "\n // Var '" + var + "'.\n"; + os << header; + ctor_code += header; + + os << " VarDimNames " << var_dim_names << ";\n"; + ctor_code += " " + var_dim_names + " = {" + gdims.make_dim_str(", ", "\"", "\"") + "};\n"; - string gbp = var + "_base_ptr"; - string init_code = " " + var + "_ptr_type " + gbp + " = std::make_shared<" + type_def + - ">(*this, \"" + var + "\", " + var + "_dim_names);\n" - " assert(" + gbp + ");\n" - " " + var + "_ptr = std::make_shared(" + gbp + ");\n" - " assert(" + var + "_ptr->gbp());\n"; - - // Vars. 
- if (gp->is_scratch()) { - - // Collection of scratch vars. - os << " VarPtrs " << var << "_list;\n"; - ctor_code += " add_scratch(" + var + "_list);\n"; - } - else { + + // Code to create a local base ptr and set pre-defined generic ptr. + string init_code = + " " + ptr_t + " " + base_ptr + + " = std::allocate_shared<" + base_t + ">" + "(yask_allocator<" + base_t + ">(), *this, \"" + var + "\", " + var_dim_names + ");\n" + " host_assert(" + base_ptr + ");\n" + " " + var_ptr + " = std::make_shared(" + base_ptr + ");\n" + " host_assert(" + var_ptr + "->gbp());\n"; + + if (!gp->is_scratch()) { // Var ptr declaration. // Default ctor gives null ptr. - os << " YkVarPtr " << var << "_ptr;\n"; + os << " YkVarPtr " << var_ptr << ";\n"; + } + else { + + // List of scratch vars, one for each thread. + os << " VarPtrs " << var_list << ";\n"; + ctor_code += " add_scratch(" + var_list + ");\n"; } // Alloc-setting code. @@ -386,9 +833,15 @@ namespace yask { _settings._halo_size : gp->get_halo_size(dname, left); os << " const idx_t " << hvar << " = " << hval << "; // default halo size in '" << dname << "' dimension.\n"; - init_code += " " + var + "_ptr->set" + bstr + "size(\"" + dname + + init_code += " " + var_ptr + "->set" + bstr + "size(\"" + dname + "\", " + hvar + ");\n"; } + + // Extra padding for read-ahead. + if (dname == _settings._inner_loop_dim && + gp->get_read_ahead_pad() > 0) + init_code += " " + var_ptr + "->update_right_extra_pad_size(\"" + dname + + "\", " + to_string(gp->get_read_ahead_pad()) + "); // For read-ahead.\n"; } // non-domain dimension. 
@@ -397,9 +850,24 @@ namespace yask { string ovar = var + "_ofs_" + dname; int aval = 1; int oval = 0; + string comment; if (dtype == STEP_INDEX) { - aval = gp->get_step_dim_size(); - init_code += " " + var + "_base_ptr->_set_dynamic_step_alloc(" + + auto sdi = gp->get_step_dim_info(); + aval = sdi.step_dim_size; + auto& wb_ofs = sdi.writeback_ofs; + for (auto& i : gp->get_write_points()) { + auto& ws = i.first; + int sofs = i.second; + comment += " Written in stage '" + ws + + "' at step-offset " + to_string(sofs) + + " with writeback (immediate replacement)"; + if (!wb_ofs.count(ws)) + comment += " NOT allowed."; + else + comment += " allowed over read at step-offset " + + to_string(wb_ofs.at(ws)) + "."; + } + init_code += " " + base_ptr + "->_set_dynamic_step_alloc(" + (gp->is_dynamic_step_alloc() ? "true" : "false") + ");\n"; } else { @@ -411,13 +879,14 @@ namespace yask { } } os << " const idx_t " << avar << " = " << aval << - "; // default allocation in '" << dname << "' dimension.\n"; - init_code += " " + var + "_ptr->_set_alloc_size(\"" + dname + + "; // Default allocation in '" << dname << "' dimension." << + comment << "\n"; + init_code += " " + var_ptr + "->_set_alloc_size(\"" + dname + "\", " + avar + ");\n"; if (oval) { os << " const idx_t " << ovar << " = " << oval << "; // first index in '" << dname << "' dimension.\n"; - init_code += " " + var + "_ptr->_set_local_offset(\"" + dname + + init_code += " " + var_ptr + "->_set_local_offset(\"" + dname + "\", " + ovar + ");\n"; } } @@ -428,12 +897,12 @@ namespace yask { auto l1var = var + "_l1_norm"; os << " const int " << l1var << " = " << gp->get_l1_dist() << "; // Max L1-norm of MPI neighbor for halo exchanges.\n"; - init_code += " " + var + "_ptr->set_halo_exchange_l1_norm(" + + init_code += " " + var_ptr + "->set_halo_exchange_l1_norm(" + l1var + ");\n"; } // Allow dynamic misc alloc setting if not interleaved. 
- init_code += " " + var + "_base_ptr->_set_dynamic_misc_alloc(" + + init_code += " " + base_ptr + "->_set_dynamic_misc_alloc(" + (_settings._inner_misc ? "false" : "true") + ");\n"; @@ -442,22 +911,35 @@ namespace yask { // Var init. ctor_code += init_code; - ctor_code += " add_var(" + var + "_ptr, true, "; + ctor_code += " add_var(" + var_ptr + ", true, "; if (_eq_bundles.get_output_vars().count(gp)) ctor_code += "true /* is an output var */"; else ctor_code += "false /* is not an output var */"; ctor_code += ");\n"; + + // Core init for this var. + core_code += + " auto* " + core_ptr + " = static_cast<" + core_t + "*>(" + var_ptr + "->corep());\n" + " cxt_cd->" + core_ptr + ".set_and_sync(" + core_ptr + ");\n"; } - // For scratch, make code for one vec element. + // For scratch, make code to fill vector. else { - scratch_code += " " + var + "_list.clear();\n" - " for (int i = 0; i < num_threads; i++) {\n" - " YkVarPtr " + var + "_ptr;\n" + + scratch_code += + " " + var_list + ".resize(num_threads);\n" + " for (int i = 0; i < num_threads; i++) {\n" + + + // Make scratch var for 'i'th thread. + " YkVarPtr " + var_ptr + ";\n" + init_code + - " " + var + "_base_ptr->set_scratch(true);\n" + - " " + var + "_list.push_back(" + var + "_ptr);\n" + " " + base_ptr + "->set_scratch(true);\n" + + " " + var_list + "[i] = " + var_ptr + ";\n" + + + // Init core ptr for this var. + " auto* cp = static_cast<" + core_t + "*>(" + var_ptr + "->corep());\n" + " _core_data._thread_core_list[i]." 
+ core_ptr + ".set_and_sync(cp);\n" + " }\n"; } @@ -466,24 +948,47 @@ namespace yask { if (!new_var_dims.count(new_var_key)) { new_var_dims.insert(new_var_key); bool first_var = new_var_code.length() == 0; - if (gdims._get_num_dims()) + if (gdims.get_num_dims()) new_var_code += "\n // Vars with '" + new_var_key + "' dim(s).\n"; else new_var_code += "\n // Scalar vars.\n"; if (!first_var) new_var_code += " else"; - new_var_code += " if (dims == " + var + "_dim_names)\n" - " gp = std::make_shared<" + type_def + ">(*this, name, dims);\n"; + new_var_code += " if (dims == " + var_dim_names + ")\n" + " gp = std::allocate_shared<" + base_t + + ">(yask_allocator<" + base_t + ">(), *this, name, dims);\n"; } - } // vars. + os << "\n // Core data used in kernel(s).\n" + " " << _core_t << " _core_data;\n" << + " std::vector<" << _thread_core_t << ", yask_allocator<" << + _thread_core_t << ">> _thread_cores;\n"; + + // Stencil eq_bundle objects. + os << endl << " // Stencil equation-bundles." << endl; + for (auto& eg : _eq_bundles.get_all()) { + string eg_name = eg->_get_name(); + os << " StencilBundleTempl<" << _stencil_prefix << eg_name << ", " << + _core_t << "> " << eg_name << ";" << endl; + } + + os << "\n public:\n"; + // Ctor. 
{ os << "\n // Constructor.\n" << - " " << _context_base << "(KernelEnvPtr env, KernelSettingsPtr settings) :" - " StencilContext(env, settings)" << ctor_list << - " {\n name = \"" << _stencil._get_name() << "\";\n" + " " << _context << "(KernelEnvPtr kenv, " + "KernelSettingsPtr ksettings, " + "KernelSettingsPtr user_settings) : " << + " StencilContext(kenv, ksettings, user_settings)"; + for (auto& eg : _eq_bundles.get_all()) { + string eg_name = eg->_get_name(); + os << ",\n " << eg_name << "(this)"; + } + os << " {\n" + " STATE_VARS(this);\n" + " name = \"" << _stencil._get_name() << "\";\n" " long_name = \"" << _stencil.get_long_name() << "\";\n"; os << "\n // Create vars (but do not allocate data in them).\n" << @@ -491,424 +996,164 @@ namespace yask { "\n // Update vars with context info.\n" " update_var_info(false);\n"; - // end of ctor. - os << " } // ctor" << endl; - } - - // New-var method. - os << "\n // Make a new var iff its dims match any in the stencil.\n" - " // Returns pointer to the new var or nullptr if no match.\n" - " virtual VarBasePtr new_stencil_var(const std::string& name," - " const VarDimNames& dims) override {\n" - " VarBasePtr gp;\n" << - new_var_code << - " return gp;\n" - " } // new_stencil_var\n"; - - // Scratch-vars method. - os << "\n // Make new scratch vars.\n" - " virtual void make_scratch_vars(int num_threads) override {\n" << - scratch_code << - " } // new_scratch_vars\n"; - - os << "}; // " << _context_base << endl; - } - - // Print YASK equation bundles. - void YASKCppPrinter::print_eq_bundles(ostream& os) { - - for (int ei = 0; ei < _eq_bundles.get_num(); ei++) { - - // Scalar eq_bundle. 
- auto& eq = _eq_bundles.get_all().at(ei); - string eg_name = eq->_get_name(); - string eg_desc = eq->get_descr(); - string egs_name = "StencilBundle_" + eg_name; - - os << endl << " ////// Stencil " << eg_desc << " //////\n" << - "\n class " << egs_name << " : public StencilBundleBase {\n" - " protected:\n" - " typedef " << _context_base << " _context_type;\n" - " _context_type* _context_data = 0;\n" - " public:\n"; - - // Stats for this eq_bundle. - CounterVisitor stats; - eq->visit_eqs(&stats); + // Push eq-bundle pointers to list. + for (auto& eg : _eq_bundles.get_all()) { + string eg_name = eg->_get_name(); - // Example computation. - os << endl << " // " << stats.get_num_ops() << " FP operation(s) per point:" << endl; - add_comment(os, *eq); + os << "\n // Configure '" << eg_name << "'.\n"; + + // Only want non-scratch bundles in st_bundles. + // Each scratch bundle will be added to its + // parent bundle. + if (!eg->is_scratch()) + os << " st_bundles.push_back(&" << eg_name << ");\n"; + + // Add scratch-bundle deps in proper order. + auto& sdeps = _eq_bundles.get_scratch_deps(eg); + for (auto& eg2 : _eq_bundles.get_all()) { + if (sdeps.count(eg2)) { + string eg2_name = eg2->_get_name(); + os << " " << eg_name << + ".add_scratch_child(&" << eg2_name << ");\n"; + } + } - // Stencil-bundle ctor. - { - os << " " << egs_name << "(" << _context_base << "* context) :\n" - " StencilBundleBase(context),\n" - " _context_data(context) {\n" - " _name = \"" << eg_name << "\";\n" - " _scalar_fp_ops = " << stats.get_num_ops() << ";\n" - " _scalar_points_read = " << stats.get_num_reads() << ";\n" - " _scalar_points_written = " << stats.get_num_writes() << ";\n" - " _is_scratch = " << (eq->is_scratch() ? "true" : "false") << ";\n"; + // Add deps between bundles. + for (auto& dep : _eq_bundles.get_deps(eg)) { + string dep_name = dep->_get_name(); + os << " " << eg_name << + ".add_dep(&" << dep_name << ");\n"; + } + // Populate the var lists in the StencilBundleBase objs. 
// I/O vars. - os << "\n // The following var(s) are read by " << egs_name << endl; - for (auto gp : eq->get_input_vars()) { + os << "\n // The following var(s) are read by '" << eg_name << "'.\n"; + for (auto gp : eg->get_input_vars()) { + VAR_DECLS(gp); if (gp->is_scratch()) - os << " input_scratch_vecs.push_back(&_context_data->" << gp->_get_name() << "_list);\n"; + os << " " << eg_name << ".input_scratch_vecs.push_back(&" << var_list << ");\n"; else - os << " input_var_ptrs.push_back(_context_data->" << gp->_get_name() << "_ptr);\n"; + os << " " << eg_name << ".input_var_ptrs.push_back(" << var_ptr << ");\n"; } - os << "\n // The following var(s) are written by " << egs_name; - if (eq->step_expr) - os << " at " << eq->step_expr->make_quoted_str(); + os << "\n // The following var(s) are written by '" << eg_name << "'"; + if (eg->step_expr) + os << " at " << eg->step_expr->make_quoted_str(); os << ".\n"; - for (auto gp : eq->get_output_vars()) { + for (auto gp : eg->get_output_vars()) { + VAR_DECLS(gp); if (gp->is_scratch()) - os << " output_scratch_vecs.push_back(&_context_data->" << gp->_get_name() << "_list);\n"; + os << " " << eg_name << ".output_scratch_vecs.push_back(&" << var_list << ");\n"; else - os << " output_var_ptrs.push_back(_context_data->" << gp->_get_name() << "_ptr);\n"; + os << " " << eg_name << ".output_var_ptrs.push_back(" << var_ptr << ");\n"; } - os << " } // Ctor." << endl; - } + } // bundles. - // Domain condition. 
- { - os << "\n // Determine whether " << egs_name << " is valid at the domain indices " << - _dims._stencil_dims.make_dim_str() << ".\n" - " // Return true if indices are within the valid sub-domain or false otherwise.\n" - " virtual bool is_in_valid_domain(const Indices& idxs) const final {\n"; - print_indices(os); - if (eq->cond) - os << " return " << eq->cond->make_str() << ";\n"; - else - os << " return true; // full domain.\n"; - os << " }\n"; - - os << "\n // Return whether there is a sub-domain expression.\n" - " virtual bool is_sub_domain_expr() const {\n" - " return " << (eq->cond ? "true" : "false") << - ";\n }\n"; - - os << "\n // Return human-readable description of sub-domain.\n" - " virtual std::string get_domain_description() const {\n"; - if (eq->cond) - os << " return \"" << eq->cond->make_str() << "\";\n"; - else - os << " return \"true\"; // full domain.\n"; - os << " }\n"; - } - - // Step condition. - { - os << endl << " // Determine whether " << egs_name << - " is valid at the step input_step_index.\n" << - " // Return true if valid or false otherwise.\n" - " virtual bool is_in_valid_step(idx_t input_step_index) const final {\n"; - if (eq->step_cond) { - os << " idx_t " << _dims._step_dim << " = input_step_index;\n" - "\n // " << eq->step_cond->make_str() << "\n"; - - // C++ scalar print assistant. - CounterVisitor cv; - eq->step_cond->accept(&cv); - CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n"); - - // Generate the code. - PrintVisitorTopDown pcv(os, *sp); - string expr = eq->step_cond->accept(&pcv); - os << " return " << expr << ";\n"; - } - else - os << " return true; // any step.\n"; - os << " }\n"; - - os << "\n // Return whether there is a step-condition expression.\n" - " virtual bool is_step_cond_expr() const {\n" - " return " << (eq->step_cond ? 
"true" : "false") << - ";\n }\n"; - - os << "\n // Return human-readable description of step condition.\n" - " virtual std::string get_step_cond_description() const {\n"; - if (eq->step_cond) - os << " return \"" << eq->step_cond->make_str() << "\";\n"; - else - os << " return \"true\"; // any step.\n"; - os << " }\n"; - } - - // LHS step index. - { - os << endl; - if (eq->step_expr) - os << " // Set 'output_step_index' to the step that an update" - " occurs when calling one of the calc_*() methods with" - " 'input_step_index' and return 'true'.\n"; - else - os << "// Return 'false' because this bundle does not update" - " vars with the step dimension.\n"; - os << " virtual bool get_output_step_index(idx_t input_step_index," - " idx_t& output_step_index) const final {\n"; - if (eq->step_expr) { - os << " idx_t " << _dims._step_dim << " = input_step_index;\n" - " output_step_index = " << eq->step_expr->make_str() << ";\n" - " return true;\n"; + // Stages. + os << "\n // Create stencil stage(s).\n"; + for (auto& bp : _eq_stages.get_all()) { + if (bp->is_scratch()) + continue; + string bp_name = bp->_get_name(); + os << " auto " << bp_name << " = std::make_shared(this, \"" << + bp_name << "\");\n"; + for (auto& eg : bp->get_bundles()) { + if (eg->is_scratch()) + continue; + string eg_name = eg->_get_name(); + os << " " << bp_name << "->push_back(&" << eg_name << ");\n"; } - else - os << " return false;\n"; - os << " }\n"; - } - - // Scalar code. - { - // Stencil-calculation code. - // Function header. - os << endl << " // Calculate one scalar result relative to indices " << - _dims._stencil_dims.make_dim_str() << ".\n" - " // There are approximately " << stats.get_num_ops() << - " FP operation(s) per invocation.\n" - " virtual void calc_scalar(int region_thread_idx, const Indices& idxs) {\n"; - print_indices(os); - - // C++ scalar print assistant. 
- CounterVisitor cv; - eq->visit_eqs(&cv); - CppPrintHelper* sp = new CppPrintHelper(_settings, _dims, &cv, "temp", "real_t", " ", ";\n"); - - // Generate the code. - PrintVisitorBottomUp pcv(os, *sp); - eq->visit_eqs(&pcv); - - // End of function. - os << "} // calc_scalar." << endl; - - delete sp; + os << " st_stages.push_back(" << bp_name << ");\n"; } - // Vector/Cluster code. - for (bool do_cluster : { false, true }) { - - // Cluster eq_bundle at same 'ei' index. - // This should be the same eq-bundle because it was copied from the - // scalar one. - auto& vceq = do_cluster ? - _cluster_eq_bundles.get_all().at(ei) : eq; - assert(eg_desc == vceq->get_descr()); - - // Create vector info for this eq_bundle. - // The visitor is accepted at all nodes in the cluster AST; - // for each var access node in the AST, the vectors - // needed are determined and saved in the visitor. - VecInfoVisitor vv(_dims); - vceq->visit_eqs(&vv); - - // Collect stats. - CounterVisitor cv; - vceq->visit_eqs(&cv); - int num_results = do_cluster ? - _dims._cluster_pts.product() : - _dims._fold.product(); - - // Vector/cluster vars. - string idim = _dims._inner_dim; - string vcstr = do_cluster ? "cluster" : "vector"; - string funcstr = "calc_loop_of_" + vcstr + "s"; - string nvecs = do_cluster ? "CMULT_" + all_caps(idim) : "1"; - string nelems = (do_cluster ? nvecs + " * ": "") + "VLEN_" + all_caps(idim); - - // Loop-calculation code. - // Function header. 
- string istart = "start_" + idim; - string istop = "stop_" + idim; - string istep = "step_" + idim; - string iestep = "step_" + idim + "_elem"; - os << endl << " // Calculate a series of " << vcstr << "s iterating in +'" << idim << - "' direction from " << _dims._stencil_dims.make_dim_str() << - " indices in 'idxs' to '" << istop << "'.\n"; - if (do_cluster) - os << " // Each cluster calculates '" << _dims._cluster_pts.make_dim_val_str(" * ") << - "' point(s) containing " << _dims._cluster_mults.product() << " '" << - _dims._fold.make_dim_val_str(" * ") << "' vector(s).\n"; - else - os << " // Each vector calculates '" << _dims._fold.make_dim_val_str(" * ") << - "' point(s).\n"; - os << " // Indices must be rank-relative (not global).\n" - " // Indices must be normalized, i.e., already divided by VLEN_*.\n" - " // SIMD calculations use " << vv.get_num_points() << - " vector block(s) created from " << vv.get_num_aligned_vecs() << - " aligned vector-block(s).\n" - " // There are approximately " << (stats.get_num_ops() * num_results) << - " FP operation(s) per iteration.\n" << - " void " << funcstr << "(int region_thread_idx, int block_thread_idx," - " const Indices& idxs, idx_t " << istop; - if (!do_cluster) - os << ", idx_t write_mask"; - os << ") {\n"; - print_indices(os); - os << " idx_t " << istart << " = " << idim << ";\n"; - os << " idx_t " << istep << " = " << nvecs << "; // number of vectors per iter.\n"; - os << " idx_t " << iestep << " = " << nelems << "; // number of elements per iter.\n"; - - // C++ vector print assistant. - CppVecPrintHelper* vp = new_cpp_vec_print_helper(vv, cv); - vp->set_use_masked_writes(!do_cluster); - vp->print_elem_indices(os); - - // Start forced-inline code. - os << "\n // Force inlining if possible.\n" - "#if !defined(DEBUG) && defined(__INTEL_COMPILER)\n" - "#pragma forceinline recursive\n" - "#endif\n" - " {\n"; - - // Print time-invariants. 
- os << "\n // Invariants within a step.\n"; - CppStepVarPrintVisitor svv(os, *vp); - vceq->visit_eqs(&svv); - - // Print loop-invariants. - os << "\n // Inner-loop invariants.\n"; - CppLoopVarPrintVisitor lvv(os, *vp); - vceq->visit_eqs(&lvv); - - // Print pointers and pre-loop prefetches. - vp->print_base_ptrs(os); - - // Actual computation loop. - os << "\n // Inner loop.\n"; - if (_dims._fold.product() == 1) - os << " // Specifying SIMD here because there is no explicit vectorization.\n" - "#pragma omp simd\n"; - os << " for (idx_t " << idim << " = " << istart << "; " << - idim << " < " << istop << "; " << - idim << " += " << istep << ", " << - vp->get_elem_index(idim) << " += " << iestep << ") {\n"; - - // Generate loop body using vars stored in print helper. - // Visit all expressions to cover the whole vector/cluster. - PrintVisitorBottomUp pcv(os, *vp); - vceq->visit_eqs(&pcv); - - // Insert prefetches using vars stored in print helper for next iteration. - vp->print_prefetches(os, true); - - // End of loop. - os << " } // '" << idim << "' loop.\n"; - - // End forced-inline code. - os << " } // Forced-inline block.\n"; - - // End of function. - os << "} // " << funcstr << ".\n"; - delete vp; - } - - os << "}; // " << egs_name << ".\n"; // end of class. - - } // stencil eq_bundles. - } - - // Print final YASK context. 
- void YASKCppPrinter::print_context(ostream& os) { - - os << "\n ////// User-provided code //////" << endl << - "struct " << _context_hook << " {\n" - " static void call_after_new_solution(yk_solution& kernel_soln) {\n" - " // Code provided by user.\n"; - for (auto& code : _stencil.get_kernel_code()) - os << " " << code << "\n"; - os << " }\n" - "};\n"; - - os << "\n ////// Overall stencil-specific context //////" << endl << - "struct " << _context << " : public " << _context_base << " {" << endl; + os << "\n // Alloc core on offload device.\n" + " auto* cxt_cd = &_core_data;\n" + " offload_map_alloc(cxt_cd, 1);\n"; + + os << "\n // Call code provided by user.\n" << + _context_hook << "::call_after_new_solution(*this);\n"; - // Stencil eq_bundle objects. - os << endl << " // Stencil equation-bundles." << endl; - for (auto& eg : _eq_bundles.get_all()) { - string eg_name = eg->_get_name(); - os << " StencilBundle_" << eg_name << " " << eg_name << ";" << endl; - } - - // Ctor. - os << "\n // Constructor.\n" << - " " << _context << "(KernelEnvPtr env, KernelSettingsPtr settings) : " << - _context_base << "(env, settings)"; - for (auto& eg : _eq_bundles.get_all()) { - string eg_name = eg->_get_name(); - os << ",\n " << eg_name << "(this)"; + // end of ctor. + os << " } // ctor" << endl; } - os << " {\n"; - // Push eq-bundle pointers to list. - os << "\n // Stencil bundles.\n"; - for (auto& eg : _eq_bundles.get_all()) { - string eg_name = eg->_get_name(); - - // Only want non-scratch bundles in st_bundles. - // Each scratch bundles will be added to its - // parent bundle. - if (!eg->is_scratch()) - os << " st_bundles.push_back(&" << eg_name << ");\n"; - - // Add scratch-bundle deps in proper order. - auto& sdeps = _eq_bundles.get_scratch_deps(eg); - for (auto& eg2 : _eq_bundles.get_all()) { - if (sdeps.count(eg2)) { - string eg2_name = eg2->_get_name(); - os << " " << eg_name << - ".add_scratch_child(&" << eg2_name << ");\n"; - } - } - - } // eq-bundles. - - // Deps. 
- os << "\n // Stencil bundle inter-dependencies.\n"; - for (auto& eg : _eq_bundles.get_all()) { - string eg_name = eg->_get_name(); + // Dtor. + os << "\n // Dtor.\n" + " virtual ~" << _context << "() {\n" + " STATE_VARS(this);\n" + " auto* cxt_cd = &_core_data;\n" + " offload_map_free(cxt_cd, 1);\n" << + " auto* tcl = _thread_cores.data();\n" + " auto nt = _thread_cores.size();\n" + " if (tcl && nt) offload_map_free(tcl, nt);\n" + " }\n"; - // Add deps between bundles. - for (auto& dep : _eq_bundles.get_deps(eg)) { - string dep_name = dep->_get_name(); - os << " " << eg_name << - ".add_dep(&" << dep_name << ");\n"; - } - } // bundles. + // New-var method. + os << "\n // Make a new var iff its dims match any in the stencil.\n" + " // Returns pointer to the new var or nullptr if no match.\n" + " VarBasePtr new_stencil_var(const std::string& name," + " const VarDimNames& dims) override {\n" + " VarBasePtr gp;\n" << + new_var_code << + " return gp;\n" + " } // new_stencil_var\n"; - // Stages. - os << "\n // Stencil stages.\n"; - for (auto& bp : _eq_stages.get_all()) { - if (bp->is_scratch()) - continue; - string bp_name = bp->_get_name(); - os << " auto " << bp_name << " = std::make_shared(this, \"" << - bp_name << "\");\n"; - for (auto& eg : bp->get_bundles()) { - if (eg->is_scratch()) - continue; - string eg_name = eg->_get_name(); - os << " " << bp_name << "->push_back(&" << eg_name << ");\n"; - } - os << " st_stages.push_back(" << bp_name << ");\n"; + // Core methods. 
+ { + os << "\n // Set the core pointers of the non-scratch vars and copy some other info.\n" + " void set_core() override {\n" + " auto* cxt_cd = &_core_data;\n" + " cxt_cd->_common_core.set_core(this);\n" + " offload_copy_to_device(cxt_cd, 1);\n" << + core_code << + " }\n"; + os << "\n // Access the core data.\n" + " StencilCoreBase* corep() override {\n" + " return &_core_data;\n" + " }\n"; } - os << "\n // Call code provided by user.\n" << - _context_hook << "::call_after_new_solution(*this);\n"; - - os << " } // Ctor.\n"; + // Scratch-vars method. + os << "\n // Make new scratch vars for each thread and sync offload core ptr.\n" + " // Does not allocate data for vars.\n" + " // Must call set_core() before this.\n" + " void make_scratch_vars(int num_threads) override {\n" + " STATE_VARS(this);\n" + " TRACE_MSG(\"num threads: \" << num_threads);\n" + "\n // Release old device data for thread array.\n" + " auto* tcl = _thread_cores.data();\n" + " auto old_nt = _thread_cores.size();\n" + " if (tcl && old_nt) offload_map_free(tcl, old_nt);\n" + "\n // Make new array.\n" + " _thread_cores.resize(num_threads);\n" + " tcl = num_threads ? _thread_cores.data() : 0;\n" + " TRACE_MSG(\"data at \" << (void*)tcl);\n" + " if (tcl)\n" + " offload_map_alloc(tcl, num_threads);\n" + " _core_data._thread_core_list.set_and_sync(tcl);\n" + "\n // Create scratch var(s) and set core ptr(s).\n" << + scratch_code << + " } // make_scratch_vars\n"; + + // APIs. + os << "\n virtual std::string get_target() const override {\n" + " return \"" << _settings._target << "\";\n" + " }\n" + "\n virtual int get_element_bytes() const override {\n" + " return " << _settings._elem_bytes << ";\n" + " }\n"; // Dims creator. 
os << "\n // Create Dims object.\n" " static DimsPtr new_dims() {\n" " auto p = std::make_shared();\n"; - for (int i = 0; i < _dims._fold_gt1._get_num_dims(); i++) + for (int i = 0; i < _dims._fold_gt1.get_num_dims(); i++) os << " p->_vec_fold_layout.set_size(" << i << ", " << _dims._fold_gt1[i] << "); // '" << _dims._fold_gt1.get_dim_name(i) << "'\n"; os << " p->_step_dim = \"" << _dims._step_dim << "\";\n" - " p->_inner_dim = \"" << _dims._inner_dim << "\";\n"; + " p->_inner_layout_dim = \"" << _dims._inner_layout_dim << "\";\n" + " p->_inner_loop_dim = \"" << _settings._inner_loop_dim << "\";\n"; for (auto& dim : _dims._domain_dims) { auto& dname = dim._get_name(); os << " p->_domain_dims.add_dim_back(\"" << dname << "\", 0);\n"; @@ -926,6 +1171,7 @@ namespace yask { auto& dval = dim.get_val(); os << " p->_fold_pts.add_dim_back(\"" << dname << "\", " << dval << ");\n"; } + os << " p->_fold_sizes.set_from_tuple(p->_fold_pts);\n"; for (auto& dim : _dims._fold_gt1) { auto& dname = dim._get_name(); auto& dval = dim.get_val(); diff --git a/src/compiler/swig/yask_compiler_api.i b/src/compiler/swig/yask_compiler_api.i index c8e45fe6..49e5e5d4 100644 --- a/src/compiler/swig/yask_compiler_api.i +++ b/src/compiler/swig/yask_compiler_api.i @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/compiler/tests/yask_compiler_api_exception_test.cpp b/src/compiler/tests/yask_compiler_api_exception_test.cpp index 8148ac2f..d79304bf 100644 --- a/src/compiler/tests/yask_compiler_api_exception_test.cpp +++ b/src/compiler/tests/yask_compiler_api_exception_test.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: 
Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -59,51 +59,17 @@ int main() { // Create a var. auto g1 = soln->new_var("test_var", {t, x, y, z}); - // Create an equation for the var. - - auto n1 = fac.new_const_number_node(3.14); - cout << n1->format_simple() << endl; - - auto n2 = fac.new_negate_node(n1); - cout << n2->format_simple() << endl; - // Exception test - cout << "Exception Test: Call 'new_relative_var_point' with wrong argument.\n"; + cout << "Exception Test: Call 'new_var_point' with too many arguments.\n"; try { - auto n3 = g1->new_relative_var_point({0, +1, 0, -2, 1}); + auto n3 = g1->new_var_point({t, x, y, z, x}); } catch (yask_exception& e) { cout << "YASK threw an expected exception.\n"; cout << e.get_message() << endl; - cout << "Exception Test: Catch exception correctly.\n"; + cout << "Exception Test: Caught exception correctly.\n"; num_exception++; } - auto n3 = g1->new_relative_var_point({0, +1, 0, -2}); - cout << n3->format_simple() << endl; - - auto n4a = fac.new_add_node(n2, n3); - auto n4b = fac.new_add_node(n4a, n1); - cout << n4b->format_simple() << endl; - - auto n5 = g1->new_relative_var_point({0, +1, -1, 0}); - cout << n5->format_simple() << endl; - - auto n6 = fac.new_multiply_node(n4b, n5); - cout << n6->format_simple() << endl; - - auto n_lhs = g1->new_relative_var_point({+1, 0, 0, 0}); - cout << n_lhs->format_simple() << endl; - - auto n_eq = fac.new_equation_node(n_lhs, n6); - cout << n_eq->format_simple() << endl; - - cout << "Solution '" << soln->get_name() << "' contains " << - soln->get_num_vars() << " var(s), and " << - soln->get_num_equations() << " equation(s)." << endl; - - // Number of bytes in each FP value. 
- soln->set_element_bytes(4); - // Exception test cout << "Exception Test: Call 'new_file_output' with invalid dir.\n"; try { @@ -111,48 +77,28 @@ int main() { } catch (yask_exception& e) { cout << "YASK threw an expected exception.\n"; cout << e.get_message() << endl; - cout << "Exception Test: Catch exception correctly.\n"; + cout << "Exception Test: Caught exception correctly.\n"; num_exception++; } - // Generate DOT output. - auto dot_file = ofac.new_file_output("yc-api-test-with-exception-cxx.dot"); - soln->format("dot", dot_file); - cout << "DOT-format written to '" << dot_file->get_filename() << "'.\n"; - - // Generate YASK output. - auto yask_file = ofac.new_file_output("yc-api-test-with-exception-cxx.hpp"); - soln->format("avx", yask_file); - cout << "YASK-format written to '" << yask_file->get_filename() << "'.\n"; - // Exception test - cout << "Exception Test: Call 'format' with wrong format.\n"; + cout << "Exception Test: Call 'set_target' with bad target.\n"; try { - soln->format("wrong_format", dot_file); + soln->set_target("bad_target"); } catch (yask_exception& e) { cout << "YASK threw an expected exception.\n"; cout << e.get_message() << endl; - cout << "Exception Test: Catch exception correctly.\n"; + cout << "Exception Test: Caught exception correctly.\n"; num_exception++; } - // TODO: better to have exception test for the methods below - // Eqs::find_deps (<-EqGroups::make_eq_groups<-StencilSolution::analyze_solution<-StencilSolution::format()) - // EqGroups::sort (<-EqGroups::make_eq_groups<-StencilSolution::analyze_solution<-StencilSolution::format()) - // VarPoint::VarPoint - // cast_expr - // NumExpr::get_num_val, NumExpr::get_int_val, NumExpr::get_bool_val - // Dimensions::set_dims (<-StencilSolution::analyze_solution<-StencilSolution::format) - // ArgParser::parse_key_value_pairs - // YASKCppPrinter::print_data (<-YASKCppPrinter::print<-StencilSolution::format) - // Check whether program handles exceptions or not. 
if (num_exception != 3) { cerr << "Error: unexpected number of exceptions: " << num_exception << endl; exit(1); } else - cout << "End of YASK compiler API test with exception.\n"; + cout << "End of YASK compiler API test with exceptions.\n"; return 0; } diff --git a/src/compiler/tests/yask_compiler_api_exception_test.py b/src/compiler/tests/yask_compiler_api_exception_test.py index 556ba1a5..1223c655 100755 --- a/src/compiler/tests/yask_compiler_api_exception_test.py +++ b/src/compiler/tests/yask_compiler_api_exception_test.py @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -50,49 +50,18 @@ # Create a var. g1 = soln.new_var("test_var", [t, x, y, z]) - - # Create an expression for the new value. - # This will average some of the neighboring points around the - # current stencil application point in the current timestep. - n0 = g1.new_relative_var_point([0, 0, 0, 0]) # center-point at this timestep. + n0 = g1.new_var_point([t, x, y, z]) # center-point at this timestep. # Exception test - print("Exception Test: Call 'new_relative_var_point' with wrong argument.") + print("Exception Test: Calling 'new_var_point' with too many arguments.") try: - n1 = nfac.new_add_node(n0, g1.new_relative_var_point([0, -1, 0, 0, 1])) # left. + n1 = g1.new_var_point([t, x, y, z, x]) except RuntimeError as e: print ("YASK threw an expected exception.") print (format(e)) - print ("Exception Test: Catch exception correctly.") + print ("Exception Test: Caught exception correctly.") num_exception = num_exception + 1 - # Create an expression using points in g1. 
- # This will average some of the neighboring points around the - # current stencil application point in the current timestep. - n1 = (g1.new_var_point([t, x, y, z ]) + # center. - g1.new_var_point([t, x-1, y, z ]) + # left. - g1.new_var_point([t, x+1, y, z ]) + # right. - g1.new_var_point([t, x, y-1, z ]) + # above. - g1.new_var_point([t, x, y+1, z ]) + # below. - g1.new_var_point([t, x, y, z-1]) + # in front. - g1.new_var_point([t, x, y, z ])) # behind. - n2 = n1 / 7 # ave of the 7 points. - - # Create an equation to define the value at the next timestep. - n3 = g1.new_relative_var_point([1, 0, 0, 0]) # center-point at next timestep. - n4 = nfac.new_equation_node(n3, n2) # equate to expr n2. - print("Equation before formatting: " + n4.format_simple()) - print("Solution '" + soln.get_name() + "' contains " + - str(soln.get_num_vars()) + " var(s), and " + - str(soln.get_num_equations()) + " equation(s).") - for var in soln.get_vars() : - print("Var " + var.get_name() + - " has the following dim(s): " + - repr(var.get_dim_names())); - - # Number of bytes in each FP value. - soln.set_element_bytes(4) - # Exception test print("Exception Test: Call 'new_file_output' with invalid dir.") try: @@ -100,26 +69,17 @@ except RuntimeError as e: print ("YASK threw an expected exception.") print (format(e)) - print ("Exception Test: Catch exception correctly.") + print ("Exception Test: Caught exception correctly.") num_exception = num_exception + 1 - # Generate DOT output. - dot_file = ofac.new_file_output("yc-api-test-with-exception-py.dot") - soln.format("dot", dot_file) - print("DOT-format written to '" + dot_file.get_filename() + "'.") - - # Generate YASK output. 
- yask_file = ofac.new_file_output("yc-api-test-with-exception-py.hpp") - soln.format("avx", yask_file) - print("YASK-format written to '" + yask_file.get_filename() + "'.") - # Exception test + print("Exception Test: Call 'set_target' with invalid target.") try: - soln.format("wrong_format", dot_file) + soln.set_target("bad_target") except RuntimeError as e: print ("YASK threw an expected exception.") print (format(e)) - print ("Exception Test: Catch exception correctly.") + print ("Exception Test: Caught exception correctly.") num_exception = num_exception + 1 # Check whether program handles exceptions or not. @@ -127,4 +87,4 @@ print("There is a problem in exception test.") exit(1) else: - print("End of YASK compiler API test with exception.") + print("End of YASK compiler API test with exceptions.") diff --git a/src/compiler/tests/yask_compiler_api_test.cpp b/src/compiler/tests/yask_compiler_api_test.cpp index 90d48ad0..bbf4610d 100644 --- a/src/compiler/tests/yask_compiler_api_test.cpp +++ b/src/compiler/tests/yask_compiler_api_test.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -119,12 +119,14 @@ int main() { // Generate DOT output. auto dot_file = ofac.new_file_output("yc-api-test-cxx.dot"); - soln->format("dot", dot_file); + soln->set_target("dot"); + soln->output_solution(dot_file); cout << "DOT-format written to '" << dot_file->get_filename() << "'.\n"; // Generate YASK output. 
auto yask_file = ofac.new_file_output("yc-api-test-cxx.hpp"); - soln->format("avx", yask_file); + soln->set_target("avx"); + soln->output_solution(yask_file); cout << "YASK-format written to '" << yask_file->get_filename() << "'.\n"; cout << "End of YASK compiler API test.\n"; diff --git a/src/compiler/tests/yask_compiler_api_test.py b/src/compiler/tests/yask_compiler_api_test.py index 3d6b159e..a3228ec0 100755 --- a/src/compiler/tests/yask_compiler_api_test.py +++ b/src/compiler/tests/yask_compiler_api_test.py @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -48,16 +48,14 @@ # Example index expression. e0 = x + 3; - print("Simple index expression: " + e0.format_simple()); + print("Simple index expression: ", e0.format_simple()); # Create a var. g1 = soln.new_var("test_var", [t, x, y, z]) # Create simple expressions to reference a point in g1. - n0r = g1.new_relative_var_point([0, 0, 0, 0]) # center-point at this timestep. - print("Simple var-access expression: " + n0r.format_simple()); n0 = g1.new_var_point([t, x, y, z]) # center-point at this timestep. - print("Simple var-access expression: " + n0.format_simple()); + print("Simple var-access expression: ", n0.format_simple()); # Create a more complex expression using points in g1. # This will average some of the neighboring points around the @@ -79,7 +77,7 @@ # values of each point. sn0 = sg1.new_var_point([x, y, z]) # LHS of eq is a point on scratch-var sn1 = nfac.new_equation_node(sn0, n2) # equate to expr n2. 
- print("Scratch-var equation before formatting: " + sn1.format_simple()) + print("Scratch-var equation before formatting: ", sn1.format_simple()) # Use values in scratch var to make a new eq. sn2 = (sg1.new_var_point([x+1, y, z ]) + @@ -98,35 +96,36 @@ # Create an equation to define the value at the next timestep # using sn5 in sub-domain sd0 and -sn5 otherwise. n4a = nfac.new_equation_node(n3, sn5, sd0) - print("Main-var interior equation before formatting: " + n4a.format_simple()) + print("Main-var interior equation before formatting: ", n4a.format_simple()) n4b = nfac.new_equation_node(n3, sn5n, sd0n) - print("Main-var edge equation before formatting: " + n4b.format_simple()) + print("Main-var edge equation before formatting: ", n4b.format_simple()) # Print some info about the solution. - print("Solution '" + soln.get_name() + "' contains " + - str(soln.get_num_vars()) + " var(s), and " + - str(soln.get_num_equations()) + " equation(s).") + print("Solution '", soln.get_name(), "' contains ", + soln.get_num_vars(), " var(s), and ", + soln.get_num_equations(), " equation(s).") for var in soln.get_vars() : - print("Var " + var.get_name() + - " has the following dim(s): " + - repr(var.get_dim_names())); + print("Var ", var.get_name(), " has the following dim(s): ", + var.get_dim_names()); # Number of bytes in each FP value. soln.set_element_bytes(4) # Generate DOT output. dot_file = ofac.new_file_output("yc-api-test-py.dot") - soln.format("dot", dot_file) - print("DOT-format written to '" + dot_file.get_filename() + "'.") + soln.set_target("dot") + soln.output_solution(dot_file) + print("DOT-format written to '", dot_file.get_filename(), "'.") # Generate YASK output. 
yask_file = ofac.new_file_output("yc-api-test-py.hpp") - soln.format("avx", yask_file) - print("YASK-format written to '" + yask_file.get_filename() + "'.") + soln.set_target("avx") + soln.output_solution(yask_file) + print("YASK-format written to '", yask_file.get_filename(), "'.") print("Equations:") for eq in soln.get_equations() : - print(" " + eq.format_simple()) + print(" ", eq.format_simple()) - print("Debug output captured:\n" + do.get_string()) + print("Debug output captured:\n", do.get_string()) print("End of YASK compiler API test.") diff --git a/src/contrib/coefficients/fd_coeff.cpp b/src/contrib/coefficients/fd_coeff.cpp index 432510dd..643bb5d2 100644 --- a/src/contrib/coefficients/fd_coeff.cpp +++ b/src/contrib/coefficients/fd_coeff.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -26,8 +26,10 @@ IN THE SOFTWARE. // Finite-differences coefficients code. // Contributed by Jeremy Tillay. 
-#include +#include #include +#include +#include #include #include "fd_coeff.hpp" @@ -51,7 +53,7 @@ namespace yask { void fd_coeff(double *coeff, const double eval_point, const int order, const double *points, const int num_points) { double c1, c2, c3; - double x_0=eval_point; + double x_0 = eval_point; int m_idx = (order+1)*num_points; int n_idx = num_points; @@ -63,25 +65,38 @@ namespace yask { d[0]=1.0; c1 = 1.0; - for(int n=1; n<=num_points-1;++n){ + for (int n=1; n<=num_points-1; ++n){ c2=1.0; for(int v=0; v<=n-1; ++v){ c3 = points[n] - points[v]; c2 = c2*c3; - for(int m=0; m<=min(n, order); ++m){ - d[m*m_idx+n*n_idx + v] = (points[n]-x_0)*d[m*m_idx + (n-1)*n_idx + v] - m*d[(m-1)*m_idx + (n-1)*n_idx + v]; - d[m*m_idx + n*n_idx + v] *= 1.0/c3; + for (int m=0; m<=min(n, order); ++m) { + double dval = (points[n] - x_0) * d[m * m_idx + (n-1) * n_idx + v]; + if (m > 0) + dval -= m * d[(m-1) * m_idx + (n-1) * n_idx + v]; + dval *= 1.0/c3; + if (dval == -0.0) + dval = 0.0; + assert(isfinite(dval)); + d[m * m_idx + n * n_idx + v] = dval; } } - for(int m=0; m<= min(n, order); ++m){ - d[m*m_idx+n*n_idx+n] = m*d[(m-1)*m_idx+(n-1)*n_idx+(n-1)] - (points[n-1]-x_0)*d[m*m_idx+(n-1)*n_idx+n-1]; - d[m*m_idx+n*n_idx+n] *= c1/c2; + for (int m=0; m<= min(n, order); ++m) { + double dval = 0.0; + if (m > 0) + dval += m * d[(m-1) * m_idx + (n-1) * n_idx + (n-1)]; + dval -= (points[n-1] - x_0) * d[m * m_idx + (n-1) * n_idx + n-1]; + dval *= c1/c2; + if (dval == -0.0) + dval = 0.0; + assert(isfinite(dval)); + d[m*m_idx + n*n_idx + n] = dval; } c1=c2; } for(int i=0; i $@ @ls -l $@ ######## Primary targets. 
@@ -521,82 +623,82 @@ default: kernel kernel: $(YK_EXEC) $(YK_SCRIPT) $(MAKE_REPORT_FILE) @echo "*** Binary" $(YK_EXEC) "has been built."; \ echo "*** See" $(MAKE_REPORT_FILE) "for build information."; \ - echo "*** Run command:" $(YK_SCRIPT) $(YK_SCRIPT_OPTS) "[options]" + echo "*** Run command:" $(YK_SCRIPT) $(YK_SCRIPT_FLAGS) "[options]" @$(YK_VTUNE_CMD) $(YK_LIB): $(YK_OBJS) $(YK_EXT_OBJS) - $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) -shared -o $@ $^ $(YK_LIBS) + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) $(YK_SO_FLAGS) -o $@ $^ $(YK_LIBS) @ls -l $@ $(YK_EXEC): yask_main.cpp $(YK_LIB) - $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_LD) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ $(YK_LIBS) + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ $(YK_LIBS) @ls -l $@ $(YK_SCRIPT): ./yask.sh - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) cp $< $@ chmod a+rx $@ @ls -l $@ +scripts: $(YK_SCRIPT) + $(MAKE_REPORT_FILE): $(YK_LIB) - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) @echo MAKEFLAGS="\"$(MAKEFLAGS)"\" > $@ 2>&1 - $(MAKE) echo-settings >> $@ 2>&1 # Generated source files. 
$(YK_GEN_DIR)/yask_rank_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) - $(PERL) $< -output $@ $(RANK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_RANK_LOOP_OPTS) "$(RANK_LOOP_CODE)" + $(call MK_DIR,$(dir $@)) + $(PERL) $< -macro_file $(YK_CODE_FILE) -output $@ $(RANK_LOOP_FLAGS) $(EXTRA_LOOP_FLAGS) $(EXTRA_RANK_LOOP_FLAGS) "$(RANK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_region_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) - $(PERL) $< -output $@ $(REGION_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_REGION_LOOP_OPTS) "$(REGION_LOOP_CODE)" +$(YK_GEN_DIR)/yask_mega_block_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) + $(call MK_DIR,$(dir $@)) + $(PERL) $< -macro_file $(YK_CODE_FILE) -output $@ $(MEGA_BLOCK_LOOP_FLAGS) $(EXTRA_LOOP_FLAGS) $(EXTRA_MEGA_BLOCK_LOOP_FLAGS) "$(MEGA_BLOCK_LOOP_CODE)" $(YK_GEN_DIR)/yask_block_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) - $(PERL) $< -output $@ $(BLOCK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_BLOCK_LOOP_OPTS) "$(BLOCK_LOOP_CODE)" + $(call MK_DIR,$(dir $@)) + $(PERL) $< -macro_file $(YK_CODE_FILE) -output $@ $(BLOCK_LOOP_FLAGS) $(EXTRA_LOOP_FLAGS) $(EXTRA_BLOCK_LOOP_FLAGS) "$(BLOCK_LOOP_CODE)" + +$(YK_GEN_DIR)/yask_micro_block_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) + $(call MK_DIR,$(dir $@)) + $(PERL) $< -macro_file $(YK_CODE_FILE) -output $@ $(MICRO_BLOCK_LOOP_FLAGS) $(EXTRA_LOOP_FLAGS) $(EXTRA_MICRO_BLOCK_LOOP_FLAGS) "$(MICRO_BLOCK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_mini_block_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) - $(PERL) $< -output $@ $(MINI_BLOCK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_MINI_BLOCK_LOOP_OPTS) "$(MINI_BLOCK_LOOP_CODE)" +$(YK_GEN_DIR)/yask_nano_block_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) + $(call MK_DIR,$(dir $@)) + $(PERL) $< -macro_file $(YK_CODE_FILE) -output $@ $(NANO_BLOCK_LOOP_FLAGS) $(EXTRA_LOOP_FLAGS) $(EXTRA_NANO_BLOCK_LOOP_FLAGS) "$(NANO_BLOCK_LOOP_CODE)" -$(YK_GEN_DIR)/yask_sub_block_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) 
- $(PERL) $< -output $@ $(SUB_BLOCK_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_SUB_BLOCK_LOOP_OPTS) "$(SUB_BLOCK_LOOP_CODE)" +$(YK_GEN_DIR)/yask_pico_block_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) + $(call MK_DIR,$(dir $@)) + $(PERL) $< -macro_file $(YK_CODE_FILE) -output $@ $(PICO_BLOCK_LOOP_FLAGS) $(EXTRA_LOOP_FLAGS) $(EXTRA_PICO_BLOCK_LOOP_FLAGS) "$(PICO_BLOCK_LOOP_CODE)" $(YK_GEN_DIR)/yask_misc_loops.hpp: $(GEN_LOOPS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) - $(PERL) $< -output $@ $(MISC_LOOP_OPTS) $(EXTRA_LOOP_OPTS) $(EXTRA_MISC_LOOP_OPTS) "$(MISC_LOOP_CODE)" + $(call MK_DIR,$(dir $@)) + $(PERL) $< -macro_file $(YK_CODE_FILE) -output $@ $(MISC_LOOP_FLAGS) $(EXTRA_LOOP_FLAGS) $(EXTRA_MISC_LOOP_FLAGS) "$(MISC_LOOP_CODE)" $(YK_GEN_DIR)/yask_layout_macros.hpp: $(GEN_LAYOUTS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) + $(call MK_DIR,$(dir $@)) $(PERL) $< -m $(NVDIMS) > $@ - @- gindent -fca $@ || \ - indent -fca $@ || \ - echo "note:" $@ "is not properly indented because indent program failed or was not found." + @- $(INDENT) $@ $(YK_GEN_DIR)/yask_layouts.hpp: $(GEN_LAYOUTS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) + $(call MK_DIR,$(dir $@)) $(PERL) $< -d $(NVDIMS) > $@ - @- gindent -fca $@ || \ - indent -fca $@ || \ - echo "note:" $@ "is not properly indented because indent program failed or was not found." + @- $(INDENT) $@ $(YK_GEN_DIR)/yask_var_code.hpp: $(GEN_LAYOUTS) $(YK_CODE_FILE) - $(MKDIR) $(YK_GEN_DIR) + $(call MK_DIR,$(dir $@)) $(PERL) $< -v $(NVDIMS) > $@ # NB: '$(BASH) -o pipefail' ensures failure of command before '| tee' is visible. $(YK_CODE_FILE): $(YC_EXEC) - $(MKDIR) $(dir $@) + $(call MK_DIR,$(dir $@)) @- rm -f $@ $(BASH) -o pipefail -c \ "$(RUN_PREFIX) $< $(YC_FLAGS) -p $@ $(EXTRA_YC_FLAGS) 2>&1 | tee $(YC_REPORT_FILE)" - @- gindent -fca $@ || \ - indent -fca $@ || \ - echo "note:" $@ "is not properly indented because no indent program was found." + @- $(INDENT) $@ headers: $(YK_GEN_HEADERS) @ echo 'Header files generated.' 
@@ -614,84 +716,24 @@ py-api: $(YK_PY_LIB) # This also creates $(YK_PY_MOD) as a side-effect. $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.cpp: $(YK_SWIG_DIR)/yask*.i $(INC_DIR)/*.hpp $(SWIG) -version - $(MKDIR) $(YK_SWIG_OUT_DIR) $(PY_OUT_DIR) + $(call MK_DIR,$(dir $@)) + $(call MK_DIR,$(PY_OUT_DIR)) $(SWIG) -v -DYK_MODULE=$(YK_PY_MOD_BASE) -cppext cpp \ -DUSE_MPI -DMPI_VERSION=3 -DMPI_Comm=int \ -I$(INC_DIR) -I$(COMM_DIR) -I$(COMM_DIR)/swig -I$(COEFF_DIR) \ -c++ -python -o $@ -outdir $(PY_OUT_DIR) -builtin $< + $(SWIG_PATCH) $< $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.o: $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.cpp - $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) -x c++ $(SWIG_CXXFLAGS) $(PYINC) -fPIC -c -o $@ $< + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) -x c++ $(SWIG_CXXFLAGS) $(PYINC) -fPIC -c -o $@ $< @ls -l $@ $(YK_PY_LIB): $(YK_OBJS) $(YK_EXT_OBJS) $(YK_SWIG_OUT_DIR)/yask_kernel_api_wrap.o - $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) -shared -o $@ $^ $(YK_LIBS) - @ls -l $@ - -# Simple tests. - -$(YK_VAR_TEST_EXEC): $(YK_TEST_SRC_DIR)/var_test.cpp $(YK_LIB) - $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ - @ls -l $@ - -#### API tests. - -# Build C++ kernel tests. -$(YK_API_TEST_EXEC): $(YK_TEST_SRC_DIR)/yask_kernel_api_test.cpp $(YK_LIB) - $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ - @ls -l $@ - -# Run C++ tests. -cxx-yk-var-test: $(YK_VAR_TEST_EXEC) - @echo '*** Running the C++ YASK var test...' - $(RUN_PREFIX) $< - -cxx-yk-api-test: $(YK_API_TEST_EXEC) - @echo '*** Running the C++ YASK kernel API test...' - $(RUN_PREFIX) $< - -# Run Python kernel API test. -py-yk-api-test: $(YK_TEST_SRC_DIR)/yask_kernel_api_test.py $(YK_PY_LIB) - @echo '*** Running the Python YASK kernel API test...' - $(RUN_PYTHON) $< - -# Build C++ kernel tests with exceptions. 
-$(YK_API_TEST_EXEC_WITH_EXCEPTION): $(YK_TEST_SRC_DIR)/yask_kernel_api_exception_test.cpp $(YK_LIB) - $(MKDIR) $(dir $@) - $(CXX_PREFIX) $(YK_CXX) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) $(YK_SO_FLAGS) -o $@ $^ $(YK_LIBS) @ls -l $@ -# Run C++ tests with exceptions. -cxx-yk-api-test-with-exception: $(YK_API_TEST_EXEC_WITH_EXCEPTION) - @echo '*** Running the C++ YASK kernel API test with exception...' - $(RUN_PREFIX) $< - -# Run Python kernel API test with exceptions. -py-yk-api-test-with-exception: $(YK_TEST_SRC_DIR)/yask_kernel_api_exception_test.py $(YK_PY_LIB) - @echo '*** Running the Python YASK kernel API test with exception...' - $(RUN_PYTHON) $< - -### API tests for the compiler. -# These targets run the tests in the compiler directory, -# then they move the output files into the correct location -# for the kernel. - -# Run Python compiler API test to create stencil-code file. -py-yc-api-test: - $(MAKE) -C $(YC_SRC_DIR) $@ - $(MKDIR) $(YK_GEN_DIR) - mv $(YC_SRC_DIR)/yc-api-test-py.hpp $(YK_CODE_FILE) - -# Run C++ compiler API test to create stencil-code file. -cxx-yc-api-test: - $(MAKE) -C $(YC_SRC_DIR) $@ - $(MKDIR) $(YK_GEN_DIR) - mv $(YC_SRC_DIR)/yc-api-test-cxx.hpp $(YK_CODE_FILE) - ######## Misc targets # Generate the code file using the built-in compiler. @@ -714,106 +756,6 @@ api-no-yc: py-api-no-yc: $(MAKE) $(NO_YC_MAKE_FLAGS) py-api -# Validation runs for each binary. 
-DEF_TEST_ARGS := -ranks $(ranks) -stencil $(YK_STENCIL) -arch $(YK_ARCH) -v \ - -trial_steps 2 -max_threads 6 -block_threads 2 \ - -ep 0 -mp 0 -no-allow_addl_padding - -val0 := $(DEF_TEST_ARGS) -l 48 -r 32 -b 24 -rt 0 $(EXTRA_TEST_ARGS) -val1 := $(DEF_TEST_ARGS) -l 48 -r 32 -b 24 -mb 16 -rt 1 $(EXTRA_TEST_ARGS) -val2 := $(DEF_TEST_ARGS) -l 63 -r 32 -b 24 -mb 16 -rt 2 $(EXTRA_TEST_ARGS) -val3 := $(DEF_TEST_ARGS) -l 63 -r 48 -b 24 -mb 16 -bt 2 $(EXTRA_TEST_ARGS) -val4a := $(DEF_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -bind_block_threads $(EXTRA_TEST_ARGS) -val4b := $(DEF_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -use_shm -overlap_comms $(EXTRA_TEST_ARGS) -val4c := $(DEF_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -use_shm -no-overlap_comms $(EXTRA_TEST_ARGS) -ranks := 2 - -# Run the kernel binary using several combos of sizes and ranks. -yk-tests: - $(YK_SCRIPT) $(val0) - $(YK_SCRIPT) $(val1) - $(YK_SCRIPT) $(val2) - $(YK_SCRIPT) $(val3) - $(YK_SCRIPT) $(val4a) - -yk-mpi-tests: - $(YK_SCRIPT) $(val4b) - $(YK_SCRIPT) $(val4c) - -# Run the default YASK compiler and kernel. -yc-and-yk-test: $(YK_EXEC) $(YK_SCRIPT) - $(MAKE) ranks=1 yk-tests - if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi - -# Run the YASK kernel test without implicity using the YASK compiler. -yk-test-no-yc: kernel-no-yc $(YK_SCRIPT) - $(MAKE) ranks=1 yk-tests - if (( $(ranks) > 1 )); then $(MAKE) yk-tests; fi - -# Run the kernel API tests for C++ and Python with and w/o expected exceptions. -api-tests: - $(MAKE) clean; $(MAKE) cxx-yk-api-test real_bytes=8 stencil=iso3dfd - $(MAKE) clean; $(MAKE) py-yk-api-test stencil=iso3dfd - $(MAKE) clean; $(MAKE) cxx-yk-api-test-with-exception real_bytes=8 stencil=iso3dfd - $(MAKE) clean; $(MAKE) py-yk-api-test-with-exception stencil=iso3dfd - -# Run several stencils using built-in validation. -# NB: set arch var as applicable. -# NB: save some compile time by using YK_CXXOPT=-O2 or -O1. -# These tests are focused on the kernel and not the compiler. 
-# For testing both the kernel and compiler in various combinations, -# run the tests from the top-level Makefile. -STENCIL_TEST_ARGS := yc-and-yk-test real_bytes=8 allow_new_var_types=0 check=1 trace=1 -stencil-tests: - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_empty step_dim=t domain_dims=d1,d2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_empty_2d - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_1d fold=x=4 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_2d fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_2d fold=x=2,y=2 mpi=0 ranks=1 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_2d arch=intel64 fold=x=3,y=2 domain_dims=y,x - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_3d fold=x=2,z=2 domain_dims=z,y,x - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_partial_3d fold=x=2,z=2 domain_dims=x,z,y - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_stream_3d radius=5 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_reverse_2d radius=1 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_func_1d fold=x=4 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_stages_1d fold=x=4 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_stages_2d fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_stages_3d fold=y=2,x=2 domain_dims=x,z,y - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_misc_2d fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_misc_2d fold=x=2,y=2 EXTRA_YC_FLAGS=-interleave-misc - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_step_cond_1d fold=x=4 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_boundary_1d fold=x=4 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_boundary_2d fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_boundary_3d fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_scratch_1d 
fold=x=4 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_scratch_3d fold=x=2,z=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_scratch_boundary_1d fold=x=4 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=iso3dfd radius=3 fold=x=2,y=2 domain_dims=z,x,y - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=iso3dfd_sponge radius=3 fold=x=2,z=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=3axis fold=x=2,y=2 cluster=x=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=3axis fold=x=2,y=2 cluster=z=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=3axis_with_diags fold=x=2,z=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=3plane fold=y=2,z=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=cube fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=ssg fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=ssg_merged fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=awp fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=awp_abc fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=fsg_abc fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=fsg_merged fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=tti fold=x=2,y=2 - $(MAKE) clean; $(MAKE) $(STENCIL_TEST_ARGS) stencil=test_4d fold=w=2,x=2 - -all-tests: - $(MAKE) clean; $(MAKE) cxx-yk-var-test stencil=test_3d fold=x=4,y=2 - $(MAKE) stencil-tests - $(MAKE) api-tests - -all: - $(MAKE) kernel - $(MAKE) api - $(MAKE) all-tests - # Remove intermediate files. # Should not trigger remake of stencil compiler. # Make this target before rebuilding YASK with any new parameters. @@ -832,116 +774,404 @@ realclean: clean rm -fv $(BUILD_OUT_DIR)/*report.txt - find . -name '*.pyc' -print -delete - find . 
-name '*~' -print -delete - rm -rf logs echo-settings: - @echo "Build environment for" $(YK_EXEC) on `date`; \ + @echo "Build environment, `date`"; \ echo TARGET=$(TARGET); \ - echo BLOCK_LOOP_CODE="\"$(BLOCK_LOOP_CODE)\""; \ - echo BLOCK_LOOP_INNER_MODS="\"$(BLOCK_LOOP_INNER_MODS)\""; \ - echo BLOCK_LOOP_OPTS="\"$(BLOCK_LOOP_OPTS)\""; \ - echo BLOCK_LOOP_ORDER="\"$(BLOCK_LOOP_ORDER)\""; \ - echo BLOCK_LOOP_OUTER_MODS="\"$(BLOCK_LOOP_OUTER_MODS)\""; \ - echo BLOCK_LOOP_OUTER_VARS="\"$(BLOCK_LOOP_OUTER_VARS)\""; \ echo DEF_ARGS="\"$(DEF_ARGS)\""; \ echo EXTRA_MACROS="\"$(EXTRA_MACROS)\""; \ echo EXTRA_YC_FLAGS="\"$(EXTRA_YC_FLAGS)\""; \ echo EXTRA_YK_CXXFLAGS="\"$(EXTRA_YK_CXXFLAGS)\""; \ echo ISA=$(ISA); \ echo MACROS="\"$(MACROS)\""; \ - echo MINI_BLOCK_LOOP_CODE="\"$(MINI_BLOCK_LOOP_CODE)\""; \ - echo MINI_BLOCK_LOOP_INNER_MODS="\"$(MINI_BLOCK_LOOP_INNER_MODS)\""; \ - echo MINI_BLOCK_LOOP_OPTS="\"$(MINI_BLOCK_LOOP_OPTS)\""; \ - echo MINI_BLOCK_LOOP_ORDER="\"$(MINI_BLOCK_LOOP_ORDER)\""; \ - echo MINI_BLOCK_LOOP_OUTER_MODS="\"$(MINI_BLOCK_LOOP_OUTER_MODS)\""; \ - echo MINI_BLOCK_LOOP_OUTER_VARS="\"$(MINI_BLOCK_LOOP_OUTER_VARS)\""; \ - echo MISC_LOOP_CODE="\"$(MISC_LOOP_CODE)\""; \ - echo MISC_LOOP_INNER_MODS="\"$(MISC_LOOP_INNER_MODS)\""; \ - echo MISC_LOOP_OPTS="\"$(MISC_LOOP_OPTS)\""; \ - echo MISC_LOOP_ORDER="\"$(MISC_LOOP_ORDER)\""; \ - echo MISC_LOOP_OUTER_MODS="\"$(MISC_LOOP_OUTER_MODS)\""; \ - echo MISC_LOOP_OUTER_VARS="\"$(MISC_LOOP_OUTER_VARS)\""; \ echo OMPFLAGS="\"$(OMPFLAGS)\""; \ - echo RANK_LOOP_CODE="\"$(RANK_LOOP_CODE)\""; \ - echo RANK_LOOP_INNER_MODS="\"$(RANK_LOOP_INNER_MODS)\""; \ - echo RANK_LOOP_OPTS="\"$(RANK_LOOP_OPTS)\""; \ - echo RANK_LOOP_ORDER="\"$(RANK_LOOP_ORDER)\""; \ - echo RANK_LOOP_OUTER_MODS="\"$(RANK_LOOP_OUTER_MODS)\""; \ - echo RANK_LOOP_OUTER_VARS="\"$(RANK_LOOP_OUTER_VARS)\""; \ - echo REGION_LOOP_CODE="\"$(REGION_LOOP_CODE)\""; \ - echo REGION_LOOP_INNER_MODS="\"$(REGION_LOOP_INNER_MODS)\""; \ - echo 
REGION_LOOP_OPTS="\"$(REGION_LOOP_OPTS)\""; \ - echo REGION_LOOP_ORDER="\"$(REGION_LOOP_ORDER)\""; \ - echo REGION_LOOP_OUTER_MODS="\"$(REGION_LOOP_OUTER_MODS)\""; \ - echo REGION_LOOP_OUTER_VARS="\"$(REGION_LOOP_OUTER_VARS)\""; \ - echo SUB_BLOCK_LOOP_CODE="\"$(SUB_BLOCK_LOOP_CODE)\""; \ - echo SUB_BLOCK_LOOP_INNER_MODS="\"$(SUB_BLOCK_LOOP_INNER_MODS)\""; \ - echo SUB_BLOCK_LOOP_INNER_VARS="\"$(SUB_BLOCK_LOOP_INNER_VARS)\""; \ - echo SUB_BLOCK_LOOP_OPTS="\"$(SUB_BLOCK_LOOP_OPTS)\""; \ - echo SUB_BLOCK_LOOP_ORDER="\"$(SUB_BLOCK_LOOP_ORDER)\""; \ - echo SUB_BLOCK_LOOP_OUTER_MODS="\"$(SUB_BLOCK_LOOP_OUTER_MODS)\""; \ - echo SUB_BLOCK_LOOP_OUTER_VARS="\"$(SUB_BLOCK_LOOP_OUTER_VARS)\""; \ + echo VEC_MACROS="\"$(VEC_MACROS)\""; \ echo YASK_OUTPUT_DIR=$(YASK_OUTPUT_DIR); \ echo YC_FLAGS="\"$(YC_FLAGS)\""; \ echo YC_STENCIL=$(YC_STENCIL); \ echo YC_TARGET=$(YC_TARGET); \ echo YK_ARCH=$(YK_ARCH); \ echo YK_BASE=$(YK_BASE); \ + echo YK_EXEC=$(YK_EXEC); \ echo YK_CXX=$(YK_CXX); \ + echo YK_CXXCMD=$(YK_CXXCMD); \ + echo YK_CXXDBG=$(YK_CXXDBG); \ echo YK_CXXFLAGS="\"$(YK_CXXFLAGS)\""; \ echo YK_CXXOPT=$(YK_CXXOPT); \ echo YK_EXT_TAG=$(YK_EXT_TAG); \ echo YK_STENCIL=$(YK_STENCIL); \ echo YK_TAG=$(YK_TAG); \ + echo YK_CXXVER=`$(YK_CXX) --version`; \ echo arch=$(arch); \ echo cluster=$(cluster); \ echo fold=$(fold); \ - echo omp_block_schedule=$(omp_block_schedule); \ - echo omp_misc_schedule=$(omp_misc_schedule); \ - echo omp_region_schedule=$(omp_region_schedule); \ + echo offload=$(offload); \ + echo offload_usm=$(offload_usm); \ echo pfd_l1=$(pfd_l1); \ echo pfd_l2=$(pfd_l2); \ echo radius=$(radius); \ echo real_bytes=$(real_bytes); \ echo stencil=$(stencil); \ echo streaming_stores=$(streaming_stores) + @echo " "; \ + echo RANK_LOOP_CODE="\"$(RANK_LOOP_CODE)\""; \ + echo RANK_LOOP_FLAGS="\"$(RANK_LOOP_FLAGS)\""; \ + echo RANK_LOOP_ORDER="\"$(RANK_LOOP_ORDER)\""; \ + echo RANK_LOOP_MODS="\"$(RANK_LOOP_MODS)\""; \ + echo MEGA_BLOCK_LOOP_CODE="\"$(MEGA_BLOCK_LOOP_CODE)\""; \ + 
echo MEGA_BLOCK_LOOP_FLAGS="\"$(MEGA_BLOCK_LOOP_FLAGS)\""; \ + echo MEGA_BLOCK_LOOP_ORDER="\"$(MEGA_BLOCK_LOOP_ORDER)\""; \ + echo MEGA_BLOCK_LOOP_MODS="\"$(MEGA_BLOCK_LOOP_MODS)\""; \ + echo BLOCK_LOOP_CODE="\"$(BLOCK_LOOP_CODE)\""; \ + echo BLOCK_LOOP_FLAGS="\"$(BLOCK_LOOP_FLAGS)\""; \ + echo BLOCK_LOOP_ORDER="\"$(BLOCK_LOOP_ORDER)\""; \ + echo BLOCK_LOOP_MODS="\"$(BLOCK_LOOP_MODS)\""; \ + echo MICRO_BLOCK_LOOP_CODE="\"$(MICRO_BLOCK_LOOP_CODE)\""; \ + echo MICRO_BLOCK_LOOP_FLAGS="\"$(MICRO_BLOCK_LOOP_FLAGS)\""; \ + echo MICRO_BLOCK_LOOP_ORDER="\"$(MICRO_BLOCK_LOOP_ORDER)\""; \ + echo MICRO_BLOCK_LOOP_MODS="\"$(MICRO_BLOCK_LOOP_MODS)\""; \ + echo NANO_BLOCK_LOOP_CODE="\"$(NANO_BLOCK_LOOP_CODE)\""; \ + echo NANO_BLOCK_LOOP_FLAGS="\"$(NANO_BLOCK_LOOP_FLAGS)\""; \ + echo NANO_BLOCK_LOOP_ORDER="\"$(NANO_BLOCK_LOOP_ORDER)\""; \ + echo NANO_BLOCK_LOOP_MODS="\"$(NANO_BLOCK_LOOP_MODS)\""; \ + echo PICO_BLOCK_LOOP_CODE="\"$(PICO_BLOCK_LOOP_CODE)\""; \ + echo PICO_BLOCK_LOOP_FLAGS="\"$(PICO_BLOCK_LOOP_FLAGS)\""; \ + echo PICO_BLOCK_LOOP_ORDER="\"$(PICO_BLOCK_LOOP_ORDER)\""; \ + echo PICO_BLOCK_LOOP_MODS="\"$(PICO_BLOCK_LOOP_MODS)\""; \ + echo MISC_LOOP_CODE="\"$(MISC_LOOP_CODE)\""; \ + echo MISC_LOOP_FLAGS="\"$(MISC_LOOP_FLAGS)\""; \ + echo MISC_LOOP_ORDER="\"$(MISC_LOOP_ORDER)\""; \ + echo MISC_LOOP_MODS="\"$(MISC_LOOP_MODS)\""; \ git status uname -a - $(YK_CXX) --version # Print stats on inner SIMD loops from asm file. -# Compile with 'EXTRA_YK_CXXFLAGS=-Fa CXX_PREFIX=' to make asm file. -# (The 'CXX_PREFIX=' part just turns off 'ccmake' because it -# won't rebuild if only '-Fa' is added.) -code-stats: kernel +code-stats: $(YK_LIB) + $(call MK_DIR,$(YK_OBJ_DIR)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) -x c++ -S -o $(YK_OBJ_DIR)/factory.s $(YK_LIB_SRC_DIR)/factory.cpp @echo "Code stats for stencil computation:" - $(PERL) $(VIEW_ASM) -p -l -f=calc_loop_of_clusters factory.s + $(PERL) $(VIEW_ASM) -p -l -f='calc_' $(YK_OBJ_DIR)/factory.s # Print some usage info. 
help: - @echo "Example performance builds of kernel cmd-line tool:"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd arch=knl # Explicit arch target"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd radius=4 YK_STENCIL=iso3dfd_rt # Specify binary stencil name"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=3axis fold='x=1,y=2,z=4' cluster='x=2'"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=3axis radius=4 real_bytes=8 # DP FP"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=awp EXTRA_YK_CXXFLAGS='-qopt-report -Fa'"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=3axis EXTRA_YK_CXXFLAGS='-qnextgen' # Use next-gen Intel compiler"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=3axis YK_CXX=clang++ mpi=0 # Use clang w/o MPI"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=3axis YK_CXX=clang++ EXTRA_YK_CXXFLAGS='-fno-color-diagnostics' mpi=0 # Use clang w/o color output"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=3axis YK_CXX=mpiCC OMPI_CXX=clang++ # Use clang w/MPI"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=awp vtune=1 # Add VTune start/stop instrumentation"; \ + @echo "Example builds of kernel cmd-line tool:"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd # Build ISO3DFD stencil with default settings"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg # Build SSG stencil with default settings"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg offload=1 # Device offload support"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg offload_usm=1 # Device offload support with unified shared memory"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg vtune=1 # Add VTune start/stop instrumentation"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg mpi=0 # No MPI support"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd radius=4 # Specify stencil radius"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd radius=2 YK_STENCIL=iso3d_r2 # Specify binary stencil name"; \ + echo " $(MAKE) clean; $(MAKE) -j 
stencil=iso3dfd arch=avx512 # Specify arch target"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg fold='x=1,y=2,z=4' # Specify folding (data layout)"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg cluster='x=2' # Specify clustering (unrolling)"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg real_bytes=8 # Use 8-byte (double) floats"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXXOPT='-O2' # Use O2 optimization"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=icpc # Use classic Intel compiler"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg YK_CXX=g++ # Use gnu compiler"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=ssg MPI_CXX=mpiCC # Specify MPI compiler"; \ echo " " - @echo "Example performance builds of kernel API for C++ and Python apps:"; \ + @echo "Example builds of kernel API for C++ and Python apps:"; \ echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd yk-api"; \ echo " $(MAKE) clean; $(MAKE) -j stencil=awp yk-api"; \ echo " " @echo "Example debug builds of kernel cmd-line tool:"; \ - echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' check=1 # No optimization, OpenMP or MPI; internal checking"; \ - echo " $(MAKE) clean; $(MAKE) -j arch=avx stencil=test_2d mpi=0 OMPFLAGS='-qopenmp-stubs' YK_CXXOPT='-O0' trace=1 # Enable tracing; run with '-trace' to get trace"; \ - echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=3axis radius=0 fold='x=1,y=1,z=1' mpi=0 YK_CXX=g++ OMPFLAGS='' YK_CXXOPT='-O0' trace_mem=1 # Trace all mem accesses with '-trace'"; \ + echo " $(MAKE) clean; $(MAKE) -j stencil=iso3dfd mpi=0 omp=0 YK_CXXOPT='-O0' check=1 # No optimization, OpenMP, or MPI; use internal checking"; \ + echo " $(MAKE) clean; $(MAKE) -j arch=avx2 stencil=test_2d YK_CXXOPT='-O0' trace=1 # Enable tracing; run with '-trace' to get trace"; \ + #echo " $(MAKE) clean; $(MAKE) -j arch=intel64 stencil=ssg radius=0 fold='x=1,y=1,z=1' YK_CXXOPT='-O0' trace_mem=1 # Trace all mem accesses with '-trace'"; 
\ echo " " - @echo "Example builds with test runs:"; \ - echo " $(MAKE) -j all # Normal full API and stencil tests"; \ - echo " $(MAKE) -j all YK_CXX=g++ mpi=0 ranks=1 # Test g++ w/o MPI"; \ - echo " $(MAKE) -j all YK_CXXOPT=-O1 ranks=4 # Test on 4 ranks" + @echo "Example regression tests (run before any git push or pull request):"; \ + echo " $(MAKE) -j all # Normal full API and stencil tests"; \ + echo " $(MAKE) -j all offload=1 # Test device offload"; \ + echo " $(MAKE) -j all offload_usm=1 # Test device offload with unified shared memory"; \ + echo " $(MAKE) -j all ranks=8 # Test with 8 MPI ranks"; \ + echo " $(MAKE) -j all YK_CXXOPT=-O1 # Test at O1 optimization"; \ + echo " $(MAKE) -j all YK_CXX=g++ # Test w/g++"; \ + echo " $(MAKE) -j all mpi=0 # Test w/o MPI"; \ + echo " $(MAKE) -j all YASK_OUTPUT_DIR=test-dir # Test using alternate output dir" + + +################################# +########### Tests ############### +################################# +# TODO: convert all testing to a separate test framework. + +# Default number of ranks for running tests. +# 4 is good because it tests in-plane diagonal exchanges for 2D and 3D tests. +# 8 would test all exchanges for 3D tests. +ifneq ($(mpi),1) +ranks := 1 +else +ranks := 4 +endif + +# Default min & max test number to run. +first_test := 0 +last_test := 999 + +# Default threads. +ifeq ($(offload),1) +outer_threads := 2 +inner_threads := 2 +else +outer_threads := 8 +inner_threads := 2 +endif + +# Makefile functions for folding and clustering. +# Disable folding and clustering for offload testing. +ifeq ($(offload),1) +FOLD = +CLUSTER = +else +FOLD = fold=$(subst $(space),$(comma),$(1)) +CLUSTER = cluster=$(subst $(space),$(comma),$(1)) +endif + +TEST_ARGS := real_bytes=8 allow_new_var_types=0 check=1 trace=1 +TEST_MAKE := $(MAKE) $(TEST_ARGS) + +### Unit tests. 
+ +$(YK_OMP_TEST_EXEC): $(YK_TEST_SRC_DIR)/openmp_test.cpp + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) $< -o $@ $(YK_LIBS) + @ls -l $@ + +cxx-yk-omp-test: + rm -f $(YK_OMP_TEST_EXEC) + $(TEST_MAKE) $(YK_OMP_TEST_EXEC) + @echo '*** Running the C++ YASK OpenMP test...' + $(RUN_PREFIX) $(YK_OMP_TEST_EXEC) + +$(YK_VAR_TEST_EXEC): $(YK_TEST_SRC_DIR)/var_test.cpp $(YK_LIB) + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ $(YK_LIBS) + @ls -l $@ + +cxx-yk-var-test: + rm -f $(YK_OMP_VAR_EXEC) + $(TEST_MAKE) $(YK_VAR_TEST_EXEC) + @echo '*** Running the C++ YASK var test...' + $(RUN_PREFIX) $(YK_VAR_TEST_EXEC) + +### API tests. + +# Build C++ kernel tests. +$(YK_API_TEST_EXEC): $(YK_TEST_SRC_DIR)/yask_kernel_api_test.cpp $(YK_LIB) + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ $(YK_LIBS) + @ls -l $@ + +# Build and run C++ tests. +cxx-yk-api-test: + $(TEST_MAKE) $(YK_API_TEST_EXEC) allow_new_var_types=1 + @echo '*** Running the C++ YASK kernel API test...' + $(RUN_PREFIX) $(YK_API_TEST_EXEC) + +# Run Python kernel API test. +py-yk-api-test: $(YK_TEST_SRC_DIR)/yask_kernel_api_test.py $(YK_PY_LIB) + @echo '*** Running the Python YASK kernel API test...' + $(RUN_PYTHON) $< + +# Build C++ kernel tests with exceptions. +$(YK_API_TEST_EXEC_WITH_EXCEPTION): $(YK_TEST_SRC_DIR)/yask_kernel_api_exception_test.cpp $(YK_LIB) + $(call MK_DIR,$(dir $@)) + $(CXX_PREFIX) $(YK_CXXCMD) $(YK_CXXFLAGS) $< $(YK_LFLAGS) -o $@ $(YK_LIBS) + @ls -l $@ + +# Run C++ tests with exceptions. +cxx-yk-api-test-with-exception: $(YK_API_TEST_EXEC_WITH_EXCEPTION) + @echo '*** Running the C++ YASK kernel API test with exception...' + $(RUN_PREFIX) $< + +# Run Python kernel API test with exceptions. +py-yk-api-test-with-exception: $(YK_TEST_SRC_DIR)/yask_kernel_api_exception_test.py $(YK_PY_LIB) + @echo '*** Running the Python YASK kernel API test with exception...' 
+ $(RUN_PYTHON) $< + +### API tests for the compiler. +# These targets run the tests in the compiler directory, +# then they move the output files into the correct location +# for the kernel. + +# Run Python compiler API test to create stencil-code file. +py-yc-api-test: + $(MAKE) -C $(YC_SRC_DIR) $@ + $(call MK_DIR,$(YK_GEN_DIR)) + mv $(YC_SRC_DIR)/yc-api-test-py.hpp $(YK_CODE_FILE) + +# Run C++ compiler API test to create stencil-code file. +cxx-yc-api-test: + $(MAKE) -C $(YC_SRC_DIR) $@ + $(call MK_DIR,$(YK_GEN_DIR)) + mv $(YC_SRC_DIR)/yc-api-test-cxx.hpp $(YK_CODE_FILE) + +##### Validation runs for each binary. +DEF_TEST_ARGS := -log_dir $(TEST_LOG_OUT_DIR) \ + -ranks $(ranks) -stencil $(YK_STENCIL) -arch $(YK_ARCH) \ + -validate -no-pre_auto_tune -no-auto_tune -no-warmup -num_trials 1 -trial_steps 2 \ + -outer_threads $(outer_threads) -inner_threads $(inner_threads) \ + -ep 0 -mp 0 -no-allow_addl_padding +DEF_MPI_TEST_ARGS := $(DEF_TEST_ARGS) -min_exterior 0 + +# Test args for 1 and >1 ranks. +test_args0 := $(DEF_TEST_ARGS) -l 48 -Mb 32 -b 24 -Mbt 0 $(EXTRA_TEST_ARGS) +test_args1 := $(DEF_TEST_ARGS) -l 64 -b 48 -mb 32 -nb 24 -pb 16 -Mbt 1 $(EXTRA_TEST_ARGS) +test_args2 := $(DEF_TEST_ARGS) -l 63 -Mb 32 -b 24 -mb 16 -Mbt 2 -bundle_allocs $(EXTRA_TEST_ARGS) +test_args3 := $(DEF_TEST_ARGS) -l 63 -Mb 48 -b 24 -mb 16 -bt 2 -no-bundle_allocs $(EXTRA_TEST_ARGS) +ifeq ($(offload),1) +test_args4 := $(DEF_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 $(EXTRA_TEST_ARGS) +else +test_args4 := $(DEF_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -bind_inner_threads $(EXTRA_TEST_ARGS) +endif + +# Test args for >1 ranks only. 
+ifeq ($(offload),1) +test_args10 := $(DEF_MPI_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -use_device_mpi -no-overlap_comms $(EXTRA_TEST_ARGS) +test_args11 := $(DEF_MPI_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -no-use_device_mpi -overlap_comms $(EXTRA_TEST_ARGS) +else +test_args10 := $(DEF_MPI_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -no-use_shm -overlap_comms $(EXTRA_TEST_ARGS) +test_args11 := $(DEF_MPI_TEST_ARGS) -l 48 -b 24 -mb 16 -bt 2 -use_shm -no-overlap_comms $(EXTRA_TEST_ARGS) +endif + +# Run the kernel binary using several combos of sizes and ranks. +yk-tests: + if (( $(first_test) <= 0 && $(last_test) >= 0 )); then $(YK_SCRIPT) $(test_args0); fi + if (( $(first_test) <= 1 && $(last_test) >= 1 )); then $(YK_SCRIPT) $(test_args1); fi + if (( $(first_test) <= 2 && $(last_test) >= 2 )); then $(YK_SCRIPT) $(test_args2); fi + if (( $(first_test) <= 3 && $(last_test) >= 3 )); then $(YK_SCRIPT) $(test_args3); fi + if (( $(first_test) <= 4 && $(last_test) >= 4 )); then $(YK_SCRIPT) $(test_args4); fi + +# These are only run w/>1 rank. +yk-mpi-tests: + if (( $(first_test) <= 10 && $(last_test) >= 10 )); then $(YK_SCRIPT) $(test_args10); fi + if (( $(first_test) <= 11 && $(last_test) >= 11 )); then $(YK_SCRIPT) $(test_args11); fi + +# Run the default YASK compiler and kernel. +# First run on 1 rank, then multiple ranks if ranks>1. +# This is the primary target for building and running stencil tests. +yc-and-yk-test: $(YK_EXEC) $(YK_SCRIPT) + $(MAKE) ranks=1 yk-tests + if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi + +# Run the YASK kernel test without implicitly using the YASK compiler. +yk-test-no-yc: kernel-no-yc $(YK_SCRIPT) + $(MAKE) ranks=1 yk-tests + if (( $(ranks) > 1 )); then $(MAKE) yk-tests yk-mpi-tests; fi + +# Run the kernel API tests for C++ and Python with and w/o expected exceptions.
+api-tests: + $(MAKE) clean; $(MAKE) cxx-yk-api-test real_bytes=8 stencil=iso3dfd + $(MAKE) clean; $(MAKE) py-yk-api-test stencil=iso3dfd + $(MAKE) clean; $(MAKE) cxx-yk-api-test-with-exception real_bytes=8 stencil=iso3dfd + $(MAKE) clean; $(MAKE) py-yk-api-test-with-exception stencil=iso3dfd + +# Run several stencils using built-in validation. +# NB: set arch var as applicable. +# NB: save some compile time by using YK_CXXOPT=-O2 or -O1. +# These tests are focused on the kernel and not the compiler. +# For testing both the kernel and compiler in various combinations, +# run the tests from the top-level Makefile. +STENCIL_TEST := $(TEST_MAKE) yc-and-yk-test + +1d-tests: + $(MAKE) clean; $(STENCIL_TEST) stencil=test_stream_1d $(call FOLD,x=4) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_1d YK_STENCIL_SUFFIX=-t1 $(call FOLD,x=4) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_1d YK_STENCIL_SUFFIX=-t2 $(call FOLD,x=4) $(call CLUSTER,x=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_1d YK_STENCIL_SUFFIX=-t3 $(call FOLD,x=4) $(call CLUSTER,x=5) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_1d YK_STENCIL_SUFFIX=-t4 $(call FOLD,x=4) $(call CLUSTER,x=5) read_ahead_dist=3 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_stages_1d $(call FOLD,x=4) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_func_1d $(call FOLD,x=4) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_step_cond_1d $(call FOLD,x=4) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_1d $(call FOLD,x=4) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_1d $(call FOLD,x=4) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_boundary_1d $(call FOLD,x=4) + +2d-tests: + $(MAKE) clean; $(STENCIL_TEST) stencil=test_empty step_dim=t domain_dims=d1,d2 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_empty_2d + $(MAKE) clean; $(STENCIL_TEST) stencil=test_2d YK_STENCIL_SUFFIX=-t1 $(call FOLD,x=2 y=2) inner_loop_dim=1 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_2d YK_STENCIL_SUFFIX=-t2 $(call FOLD,x=2 y=2) 
inner_loop_dim=2 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_2d YK_STENCIL_SUFFIX=-t3 $(call FOLD,x=2 y=2) $(call CLUSTER,y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_2d YK_STENCIL_SUFFIX=-t4 $(call FOLD,x=2 y=2) $(call CLUSTER,x=3 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_2d YK_STENCIL_SUFFIX=-t5 $(call FOLD,x=2 y=2) mpi=0 ranks=1 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_2d YK_STENCIL_SUFFIX=-t6 arch=intel64 $(call FOLD,x=3 y=2) domain_dims=y,x + $(MAKE) clean; $(STENCIL_TEST) stencil=test_reverse_2d YK_STENCIL_SUFFIX=-t1 radius=1 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_reverse_2d YK_STENCIL_SUFFIX=-t2 radius=1 read_ahead_dist=2 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_stages_2d $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=test_misc_2d YK_STENCIL_SUFFIX=-t1 $(call FOLD,x=2 y=2) inner_misc_layout=0 outer_domain_layout=0 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_misc_2d YK_STENCIL_SUFFIX=-t2 $(call FOLD,x=2 y=2) inner_misc_layout=0 outer_domain_layout=1 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_misc_2d YK_STENCIL_SUFFIX=-t3 $(call FOLD,x=2 y=2) inner_misc_layout=1 outer_domain_layout=0 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_misc_2d YK_STENCIL_SUFFIX=-t4 $(call FOLD,x=2 y=2) inner_misc_layout=1 outer_domain_layout=1 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_2d $(call FOLD,x=2 y=2) read_ahead_dist=2 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_2d $(call FOLD,x=2 y=2) + +3d-tests: + $(MAKE) clean; $(STENCIL_TEST) stencil=test_stream_3d radius=5 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_3d YK_STENCIL_SUFFIX=-t1 $(call FOLD,x=2 y=2 z=2) EXTRA_YC_FLAGS=-fus + $(MAKE) clean; $(STENCIL_TEST) stencil=test_3d YK_STENCIL_SUFFIX=-t2 $(call FOLD,x=2 y=2 z=2) EXTRA_YC_FLAGS=-no-fus + $(MAKE) clean; $(STENCIL_TEST) stencil=test_3d YK_STENCIL_SUFFIX=-t3 $(call FOLD,x=2 z=2) domain_dims=z,y,x + $(MAKE) clean; $(STENCIL_TEST) stencil=test_3d YK_STENCIL_SUFFIX=-t4 $(call FOLD,x=2 z=2) 
inner_loop_dim=2 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_3d YK_STENCIL_SUFFIX=-t5 $(call FOLD,x=2 z=2) NANO_BLOCK_LOOP_MODS=serpentine + $(MAKE) clean; $(STENCIL_TEST) stencil=test_stages_3d $(call FOLD,y=2 x=2) domain_dims=x,z,y + $(MAKE) clean; $(STENCIL_TEST) stencil=test_partial_3d YK_STENCIL_SUFFIX=-t1 $(call FOLD,x=2 z=2) read_ahead_dist=2 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_partial_3d YK_STENCIL_SUFFIX=-t2 $(call FOLD,x=2 z=2) inner_loop_dim=1 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_partial_3d YK_STENCIL_SUFFIX=-t3 $(call FOLD,x=2 z=2) domain_dims=x,z,y + $(MAKE) clean; $(STENCIL_TEST) stencil=test_boundary_3d $(call FOLD,x=2 y=2) inner_loop_dim=1 + $(MAKE) clean; $(STENCIL_TEST) stencil=test_scratch_3d $(call FOLD,x=2 z=2) inner_loop_dim=x + +# 3D tests w/specific shapes. +3d-tests2: + $(MAKE) clean; $(STENCIL_TEST) stencil=3axis YK_STENCIL_SUFFIX=-t1 $(call FOLD,x=2 y=2) $(call CLUSTER,x=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=3axis YK_STENCIL_SUFFIX=-t2 $(call FOLD,x=2 y=2) $(call CLUSTER,z=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=3axis_with_diags $(call FOLD,x=2 z=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=3plane $(call FOLD,y=2 z=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=cube $(call FOLD,x=2 y=2) + +# 3D tests w/actual seismic stencils. 
+3d-tests3: + $(MAKE) clean; $(STENCIL_TEST) stencil=iso3dfd radius=3 $(call FOLD,x=2 y=2) domain_dims=z,x,y + $(MAKE) clean; $(STENCIL_TEST) stencil=iso3dfd_sponge radius=6 $(call FOLD,x=2 z=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=ssg $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=ssg_merged $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=awp $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=awp_abc $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=tti $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=fsg_merged $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=fsg_merged_abc $(call FOLD,x=2 y=2) + +# 3D tests w/seismic stencils that may generate too many kernel parameters for offload. +3d-tests4: + $(MAKE) clean; $(STENCIL_TEST) stencil=fsg $(call FOLD,x=2 y=2) + $(MAKE) clean; $(STENCIL_TEST) stencil=fsg_abc $(call FOLD,x=2 y=2) + +4d-tests: + $(MAKE) clean; $(STENCIL_TEST) stencil=test_4d $(call FOLD,w=2 x=2) + +stencil-tests: + $(MAKE) 1d-tests + $(MAKE) 2d-tests + $(MAKE) 3d-tests + $(MAKE) 3d-tests2 + $(MAKE) 3d-tests3 + if (( $(offload) == 0 )); then $(MAKE) 3d-tests4; fi + $(MAKE) 4d-tests + +unit-tests: + $(MAKE) clean; $(MAKE) cxx-yk-omp-test + $(MAKE) clean; $(MAKE) cxx-yk-var-test stencil=test_3d $(call FOLD,x=2 y=2) + +all-tests: + $(MAKE) unit-tests + $(MAKE) stencil-tests + $(MAKE) api-tests + +# First, build a kernel and an API lib w/o any validation just to test the build env. +# Then, build and run all the tests. 
+all: + $(MAKE) kernel stencil=test_3d + $(MAKE) api stencil=test_3d + $(MAKE) all-tests + diff --git a/src/kernel/lib/alloc.cpp b/src/kernel/lib/alloc.cpp index 2af63442..17fa0648 100644 --- a/src/kernel/lib/alloc.cpp +++ b/src/kernel/lib/alloc.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -23,16 +23,185 @@ IN THE SOFTWARE. *****************************************************************************/ -// This file contains implementations of StencilContext -// specific to the allocating data. +// This file contains implementations of functions for allocating data. #include "yask_stencil.hpp" using namespace std; namespace yask { -#ifdef USE_PMEM - static inline int getnode() { + ////// Allocators and deleters ////// + + // Free device mem. + void DeleterBase::free_dev_mem(char* hostp) { + if (hostp) + offload_map_free(hostp, _nbytes); + } + + // Aligned allocation. + char* yask_aligned_alloc(std::size_t nbytes) { + + // Alignment to use based on size. + const size_t _def_alignment = CACHELINE_BYTES; + const size_t _def_big_alignment = YASK_HUGE_ALIGNMENT; + size_t align = (nbytes >= _def_big_alignment) ? + _def_big_alignment : _def_alignment; + void *p = 0; + + // Some envs have posix_memalign(), some have aligned_alloc(). + #if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 + int ret = posix_memalign(&p, align, nbytes); + if (ret) p = 0; + #else + p = aligned_alloc(align, nbytes); + #endif + + if (!p) + THROW_YASK_EXCEPTION("error: cannot allocate " + make_byte_str(nbytes) + + " aligned to " + make_byte_str(align)); + + // Return as a char* as required for shared_ptr ctor. 
+ return static_cast(p); + } + + // Reverse yask_aligned_alloc() and optional device alloc. + void AlignedDeleter::operator()(char* p) { + free_dev_mem(p); + if (p) + free(p); + } + + // NUMA allocation. + // 'numa_pref' == yask_numa_none: use default aligned alloc. + // 'numa_pref' == yask_numa_offload: use default offload alloc. + // 'numa_pref' >= 0: preferred NUMA node. + // 'numa_pref' < 0: use NUMA policy corresponding to value. + // TODO: get rid of magic-number scheme. + char* numa_alloc(std::size_t nbytes, int numa_pref) { + + void *p = 0; + + if (numa_pref == yask_numa_none) + return yask_aligned_alloc(nbytes); + + else if (numa_pref == yask_numa_offload) + return (char*)offload_alloc_host(nbytes); + + #ifdef USE_NUMA + + // Should we use the numa policy library? + #ifdef USE_NUMA_POLICY_LIB + #pragma omp single + else if (numa_available() != -1) { + numa_set_bind_policy(0); + if (numa_pref >= 0 && numa_pref <= numa_max_node()) + numa_alloc_onnode(nbytes, numa_pref); + else + numa_alloc_local(nbytes); + // Interleaved not available. + } + else + THROW_YASK_EXCEPTION("Error: explicit NUMA policy allocation is not available"); + + // Use mmap/mbind explicitly. + #else + else if (get_mempolicy(NULL, NULL, 0, 0, 0) == 0) { + + // Set mmap flags. + int mmprot = PROT_READ | PROT_WRITE; + int mmflags = MAP_PRIVATE | MAP_ANONYMOUS; + + // Get an anonymous R/W memory map. + p = mmap(0, nbytes, mmprot, mmflags, -1, 0); + + // If successful, apply the desired binding. + if (p && p != MAP_FAILED) { + if (numa_pref >= 0) { + + // Prefer given node. + unsigned long nodemask = 0x1UL << numa_pref; + mbind(p, nbytes, MPOL_PREFERRED, &nodemask, sizeof(nodemask) * 8, 0); + } + else if (numa_pref == yask_numa_interleave) { + + // Use all nodes. + unsigned long nodemask = (unsigned long)-1; + mbind(p, nbytes, MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8, 0); + } + + else{ + + // Use local node. 
+ // MPOL_LOCAL was defined in Linux 3.8, so use + // MPOL_DEFAULT as backup on old systems. + #ifdef MPOL_LOCAL + mbind(p, nbytes, MPOL_LOCAL, 0, 0, 0); + #else + mbind(p, nbytes, MPOL_DEFAULT, 0, 0, 0); + #endif + } + } + else + THROW_YASK_EXCEPTION("Error: anonymous mmap of " + make_byte_str(nbytes) + + " failed"); + } + else + THROW_YASK_EXCEPTION("Error: explicit NUMA policy allocation is not available"); + + #endif // not USE_NUMA_POLICY_LIB. + + #else + THROW_YASK_EXCEPTION("Error: NUMA allocation is not enabled; build with numa=1"); + #endif // USE_NUMA. + + // Should not get here w/null p; throw exception. + if (!p) + THROW_YASK_EXCEPTION("Error: cannot allocate " + make_byte_str(nbytes) + + " using numa-node (or policy) " + to_string(numa_pref)); + + // Check alignment. + if ((size_t(p) & (CACHELINE_BYTES - 1)) != 0) + FORMAT_AND_THROW_YASK_EXCEPTION("Error: NUMA-allocated " << p << " is not " << + CACHELINE_BYTES << "-byte aligned"); + + // Return as a char* as required for shared_ptr ctor. + return static_cast(p); + } + + // Reverse numa_alloc(). + void NumaDeleter::operator()(char* p) { + free_dev_mem(p); + + if (p && _numa_pref == yask_numa_none) { + free(p); + p = NULL; + } + else if (p && _numa_pref == yask_numa_offload) { + offload_free_host(p); + p = NULL; + } + + #ifdef USE_NUMA + #ifdef USE_NUMA_POLICY_LIB + if (p && numa_available() != -1) { + numa_free(p, _nbytes); + p = NULL; + } + #else + if (p && get_mempolicy(NULL, NULL, 0, 0, 0) == 0) { + munmap(p, _nbytes); + p = NULL; + } + #endif + #endif + if (p) { + free(p); + p = NULL; + } + } + + // Get the current NUMA node. + static int getnode() { #ifdef SYS_getcpu int node, status; status = syscall(SYS_getcpu, NULL, &node, NULL); @@ -41,39 +210,95 @@ namespace yask { return -1; // unavailable #endif } -#endif + // MPI shm allocation. + char* shm_alloc(std::size_t nbytes, + const MPI_Comm* shm_comm, MPI_Win* shm_win) { + + void *p = 0; + + // Allocate using MPI shm. 
+ #ifdef USE_MPI + assert(shm_comm); + assert(shm_win); + MPI_Info win_info; + MPI_Info_create(&win_info); + MPI_Info_set(win_info, "alloc_shared_noncontig", "true"); + MPI_Win_allocate_shared(nbytes, 1, win_info, *shm_comm, &p, shm_win); + MPI_Info_free(&win_info); + MPI_Win_lock_all(0, *shm_win); + #else + THROW_YASK_EXCEPTION("Error: MPI shm allocation is not enabled; build with mpi=1"); + #endif + + if (!p) + THROW_YASK_EXCEPTION("Error: cannot allocate " + make_byte_str(nbytes) + + " using MPI shm"); + + // Check alignment. + if ((size_t(p) & (CACHELINE_BYTES - 1)) != 0) + FORMAT_AND_THROW_YASK_EXCEPTION("Error: MPI shm-allocated " << p << " is not " << + CACHELINE_BYTES << "-byte aligned"); + + #ifdef USE_OFFLOAD + THROW_YASK_EXCEPTION("Error: mapping offload device memory to shm not yet supported; " + "use '-no-use_shm'"); + #endif + + // Return as a char* as required for shared_ptr ctor. + return static_cast(p); + } + + // Reverse shm_alloc(). + void ShmDeleter::operator()(char* p) { + free_dev_mem(p); + + #ifdef USE_MPI + assert(_shm_comm); + assert(_shm_win); + MPI_Win_unlock_all(*_shm_win); + MPI_Win_free(_shm_win); + p = NULL; + #else + THROW_YASK_EXCEPTION("Error: MPI shm deallocation is not enabled; build with mpi=1"); + #endif + } + + ///// Memory-alloc functions in StencilContext ///// + // Magic numbers for memory types in addition to those for NUMA. // TODO: get rid of magic-number scheme. constexpr int _shmem_key = 1000; - constexpr int _pmem_key = 2000; // leave space after this for pmem devices. - - // Alloc 'nbytes' for each requested mem type. - // Pointers are returned in 'data_buf'. - // 'nvars' and 'type' are only used for debug msg. - void StencilContext::_alloc_data(const map & nbytes, - const map & nvars, - map >& data_buf, + + // Alloc mem for each requested key. + // 'type' is only used for debug msg. + void StencilContext::_alloc_data(AllocMap& alloc_reqs, const std::string& type) { STATE_VARS(this); - // Loop through each mem type. 
- for (const auto& i : nbytes) { - int mem_key = i.first; - size_t nb = i.second; - size_t ng = nvars.at(mem_key); + // Loop through each request. + for (auto& i : alloc_reqs) { + auto& key = i.first; + auto& data = i.second; + auto mem_key = key.first; + auto seq_num = key.second; + TRACE_MSG("allocation req <" << mem_key << ", " << seq_num << "> for " << + make_byte_str(data.nbytes)); + if (data.nbytes == 0) + continue; // Alloc data depending on magic key. shared_ptr p; - string msg = "Allocating " + make_byte_str(nb) + - " for " + to_string(ng) + " " + type + "(s) "; + string msg = "Allocating " + make_byte_str(data.nbytes) + + " for " + to_string(data.nvars) + " " + type + "(s) "; + if (mem_key == _shmem_key) { msg += "using MPI shm"; DEBUG_MSG(msg << "..."); - p = shared_shm_alloc(nb, &env->shm_comm, &mpi_info->halo_win); + p = shared_shm_alloc(data.nbytes, &env->shm_comm, &mpi_info->halo_win); // Get pointer for each neighbor rank. -#ifdef USE_MPI + #ifdef USE_MPI int ns = int(mpi_info->neighborhood_size); for (int ni = 0; ni < ns; ni++) { int nr = mpi_info->my_neighbors.at(ni); @@ -90,17 +315,13 @@ namespace yask { TRACE_MSG("MPI shm halo buffer for rank " << nr << " is at " << baseptr << " for " << make_byte_str(sz)); } -#endif - } - else if (mem_key >= _pmem_key) { - auto dev_num = mem_key - _pmem_key; - msg += "on PMEM device " + to_string(dev_num); - DEBUG_MSG(msg << "..."); - p = shared_pmem_alloc(nb, dev_num); + #endif } else { if (mem_key == yask_numa_none) msg += "using default allocator"; + else if (mem_key == yask_numa_offload) + msg += "using default allocator for offloading"; else if (mem_key == yask_numa_local) msg += "preferring local NUMA node"; else if (mem_key == yask_numa_interleave) @@ -110,11 +331,11 @@ namespace yask { else msg += "using mem policy " + to_string(mem_key); DEBUG_MSG(msg << "..."); - p = shared_numa_alloc(nb, mem_key); + p = shared_numa_alloc(data.nbytes, mem_key); } - // Save using original key. 
- data_buf[mem_key] = p; + // Save ptr value. + data.ptr = p; TRACE_MSG("Got memory at " << static_cast(p.get())); } } @@ -123,49 +344,53 @@ namespace yask { void StencilContext::alloc_var_data() { STATE_VARS(this); - // Allocate I/O vars before read-only vars. + // Allocate read/write vars before read-only vars, and user vars + // last. This ordering plays well with NUMA allocation policies + // where closer (faster) memory is used first. VarPtrs sorted_var_ptrs; - VarPtrSet done; - for (auto op : output_var_ptrs) { - sorted_var_ptrs.push_back(op); - done.insert(op); - } - for (auto gp : var_ptrs) { - if (!done.count(gp)) + { + VarPtrSet done; + for (auto gp : output_var_ptrs) { sorted_var_ptrs.push_back(gp); + done.insert(gp); + } + for (auto gp : orig_var_ptrs) { + if (!done.count(gp)) { + sorted_var_ptrs.push_back(gp); + done.insert(gp); + } + } + for (auto gp : all_var_ptrs) { + if (!done.count(gp)) { + sorted_var_ptrs.push_back(gp); + done.insert(gp); + } + } + TRACE_MSG("var-allocation order:"); + for (auto sp : sorted_var_ptrs) { + auto name = sp->get_name(); + TRACE_MSG(" '" << name << "'" << + (output_var_map.count(name) ? " (output var)" : "")); + } } - done.clear(); - -#ifdef USE_PMEM - os << "PMEM var-allocation priority:" << endl; - for (auto sp : sorted_var_ptrs) { - os << " '" << sp->get_name() << "'"; - if (done.find(sp)!=done.end()) - os << " (output)"; - os << endl; - } -#endif - - // Base ptrs for all default-alloc'd data. - // These pointers will be shared by the ones in the var - // objects, which will take over ownership when these go - // out of scope. - // Key is preferred numa node or -1 for local. - map > _var_data_buf; -#ifdef USE_PMEM - auto preferred_numasize = opts->_numa_pref_max * 1024*1024*(size_t)1024; -#endif - - // Pass 0: assign PMEM node when preferred NUMA node is not enough. - // Pass 1: count required size for each NUMA node, allocate chunk of memory at end. - // Pass 2: distribute parts of already-allocated memory chunk. 
- for (int pass = 0; pass < 3; pass++) { + // Requests for allocation. + AllocMap alloc_reqs; + + // Pass 0: count required size for each NUMA node, allocate chunk of memory at end. + // Pass 1: distribute parts of already-allocated memory chunk. + for (int pass = 0; pass < 2; pass++) { TRACE_MSG("alloc_var_data pass " << pass << " for " << - var_ptrs.size() << " var(s)"); - - // Count bytes needed and number of vars for each NUMA node. - map npbytes, nvars; + all_var_ptrs.size() << " var(s)"); + + // Reset bytes needed and number of vars for each request. + for (auto& i : alloc_reqs) { + //auto& key = i.first; + auto& data = i.second; + data.nbytes = 0; + data.nvars = 0; + } + int seq_num = 0; // Vars. for (auto gp : sorted_var_ptrs) { @@ -174,59 +399,55 @@ namespace yask { auto& gname = gp->get_name(); auto& gb = gp->gb(); + // Bytes needed for this var. + size_t nbytes = gp->get_num_storage_bytes(); + + // NUMA policy for this var. + int numa_pref = gp->get_numa_preferred(); + // Var data. // Don't alloc if already done. if (!gp->is_storage_allocated()) { - int numa_pref = gp->get_numa_preferred(); - // Set storage if buffer has been allocated in pass 1. - if (pass == 2) { - auto p = _var_data_buf[numa_pref]; + // Determine total amount to alloc. + auto res_bytes = ROUND_UP(nbytes + _data_buf_pad, CACHELINE_BYTES); + + // If not bundling allocations, increase sequence number. + if (!actl_opts->_bundle_allocs) + seq_num++; + + // Make a request key and make or lookup data. + AllocKey req_key = make_pair(numa_pref, seq_num); + auto& req_data = alloc_reqs[req_key]; + + // Set storage if buffer has been allocated in pass 0. + if (pass == 1) { + auto p = req_data.ptr; assert(p); - gp->set_storage(p, npbytes[numa_pref]); + + // Offset into buffer is running byte count in 'npbytes'. + gp->set_storage(p, req_data.nbytes); DEBUG_MSG(gb.make_info_string()); } - // Determine padded size (also offset to next location). 
- size_t nbytes = gp->get_num_storage_bytes(); - npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, - CACHELINE_BYTES); - nvars[numa_pref]++; - - if (pass == 0) { -#ifdef USE_PMEM - if (preferred_numasize < npbytes[numa_pref]) - if (getnode() == -1) { - os << "Warning: cannot get numa_node information for PMEM allocation;" - " using default numa_pref " << endl; - } - else - - // TODO: change this behavior so that it doesn't actually - // modify the NUMA pref of the var. - gp->set_numa_preferred(_pmem_key + getnode()); -#endif - } + // Running totals. + req_data.nvars++; + req_data.nbytes += res_bytes; - if (pass == 1) + if (pass == 0) TRACE_MSG(" var '" << gname << "' needs " << make_byte_str(nbytes) << - " on NUMA node " << numa_pref); + " w/numa-pref " << numa_pref); } // Otherwise, just print existing var info. - else if (pass == 1) + else if (pass == 0) DEBUG_MSG(gb.make_info_string()); - } - // Reset the counters - if (pass == 0) { - npbytes.clear(); - nvars.clear(); - } + } // vars. - // Alloc for each node. - if (pass == 1) - _alloc_data(npbytes, nvars, _var_data_buf, "var"); + // Alloc for each "numa_pref" type. + if (pass == 0) + _alloc_data(alloc_reqs, "var"); } // var passes. }; @@ -244,7 +465,7 @@ namespace yask { mpi_interior = ext_bb; mpi_interior.bb_valid = false; -#ifdef USE_MPI + #ifdef USE_MPI map num_exchanges; // send/recv => count. map num_elems; // send/recv => count. @@ -254,386 +475,386 @@ namespace yask { // Loop thru all neighbors of this rank. mpi_info->visit_neighbors ([&](const IdxTuple& neigh_offsets, int neigh_rank, int neigh_idx) { - if (neigh_rank == MPI_PROC_NULL) - return; // from lambda fn. - - // Is vectorized exchange allowed based on domain sizes? - // Both my rank and neighbor rank must have *all* domain sizes - // of vector multiples. 
- bool vec_ok = allow_vec_exchange && - mpi_info->has_all_vlen_mults[mpi_info->my_neighbor_index] && - mpi_info->has_all_vlen_mults[neigh_idx]; - - // Determine size of MPI buffers between neigh_rank and my - // rank for each var and create those that are needed. It - // is critical that the number, size, and shape of my - // send/receive buffers match those of the receive/send - // buffers of my neighbors. Important: Current algorithm - // assumes my left neighbor's buffer sizes can be calculated - // by considering my rank's right side data and vice-versa. - // Thus, all ranks must have consistent data that contribute - // to these calculations. - for (auto& gp : orig_var_ptrs) { - auto& gb = gp->gb(); - auto& gname = gp->get_name(); - bool var_vec_ok = vec_ok; - - - // Get calculated max dist needed for this var. - int maxdist = gp->get_halo_exchange_l1_norm(); - - // Always use max dist with WF. Do this because edge - // and/or corner values may be needed in WF extensions - // even it not needed w/o WFs. - // TODO: determine if max is always needed. - if (wf_steps > 0) - maxdist = NUM_STENCIL_DIMS - 1; - - // Manhattan dist. of current neighbor. - int mandist = mpi_info->man_dists.at(neigh_idx); - - // Check distance. - if (mandist > maxdist) { - TRACE_MSG("no halo exchange needed with rank " << neigh_rank << - " (L1-norm = " << mandist << - ") for var '" << gname << - "' (max L1-norm = " << maxdist << ")"); - continue; // to next var. - } - - // Lookup first & last domain indices and calc exchange sizes - // for this var. - bool found_delta = false; - IdxTuple my_halo_sizes, neigh_halo_sizes; - IdxTuple first_inner_idx, last_inner_idx; - IdxTuple first_outer_idx, last_outer_idx; - for (auto& dim : domain_dims) { - auto& dname = dim._get_name(); - - // Only consider domain dims that are used in this var. 
- if (gp->is_dim_used(dname)) { - auto vlen = gp->_get_var_vec_len(dname); - auto lhalo = gp->get_left_halo_size(dname); - auto rhalo = gp->get_right_halo_size(dname); - - // Get domain indices for this var. If there - // are no more ranks in the given direction, - // extend the "outer" index to include the halo - // in that direction to make sure all data are - // sync'd. Critical for temporal tiling. - idx_t fidx = gp->get_first_rank_domain_index(dname); - idx_t lidx = gp->get_last_rank_domain_index(dname); - first_inner_idx.add_dim_back(dname, fidx); - last_inner_idx.add_dim_back(dname, lidx); - if (opts->is_first_rank(dname)) - fidx -= lhalo; // extend into left halo. - if (opts->is_last_rank(dname)) - lidx += rhalo; // extend into right halo. - first_outer_idx.add_dim_back(dname, fidx); - last_outer_idx.add_dim_back(dname, lidx); - - // Determine if it is possible to round the - // outer indices to vec-multiples. This will - // be required to allow full vec exchanges for - // this var. We won't do the actual rounding - // yet, because we need to see if it's safe - // in all dims. - // Need +1 and then -1 trick for last. - fidx = round_down_flr(fidx, vlen); - lidx = round_up_flr(lidx + 1, vlen) - 1; - if (fidx < gp->get_first_rank_alloc_index(dname)) - var_vec_ok = false; - if (lidx > gp->get_last_rank_alloc_index(dname)) - var_vec_ok = false; - - // Determine size of exchange in this dim. This - // will be the actual halo size plus any - // wave-front shifts. In the current - // implementation, we need the wave-front shifts - // regardless of whether there is a halo on a - // given var. This is because each - // stencil-bundle gets shifted by the WF angles - // at each step in the WF. - - // Neighbor is to the left in this dim. - if (neigh_offsets[dname] == MPIInfo::rank_prev) { - - // Number of points to be added for WFs. - auto ext = wf_shift_pts[dname]; - - // My halo on my left. 
- my_halo_sizes.add_dim_back(dname, lhalo + ext); - - // Neighbor halo on their right. - // Assume my right is same as their right. - neigh_halo_sizes.add_dim_back(dname, rhalo + ext); - - // Flag that this var has a neighbor to left or right. - found_delta = true; - } - - // Neighbor is to the right in this dim. - else if (neigh_offsets[dname] == MPIInfo::rank_next) { - - // Number of points to be added for WFs. - auto ext = wf_shift_pts[dname]; - - // My halo on my right. - my_halo_sizes.add_dim_back(dname, rhalo + ext); - - // Neighbor halo on their left. - // Assume my left is same as their left. - neigh_halo_sizes.add_dim_back(dname, lhalo + ext); - - // Flag that this var has a neighbor to left or right. - found_delta = true; - } - - // Neighbor in-line in this dim. - else { - my_halo_sizes.add_dim_back(dname, 0); - neigh_halo_sizes.add_dim_back(dname, 0); - } - - } // domain dims in this var. - } // domain dims. - - // Is buffer needed? - // Example: if this var is 2D in y-z, but only neighbors are in - // x-dim, we don't need any exchange. - if (!found_delta) { - TRACE_MSG("no halo exchange needed for var '" << gname << - "' with rank " << neigh_rank << - " because the neighbor is not in a direction" - " corresponding to a var dim"); - continue; // to next var. - } - - // Round halo sizes if vectorized exchanges allowed. - // Both self and neighbor must be vec-multiples - // and outer indices must be vec-mults or extendable - // to be so. - // TODO: add a heuristic to avoid increasing by a large factor. - if (var_vec_ok) { - for (auto& dim : domain_dims) { - auto& dname = dim._get_name(); - if (gp->is_dim_used(dname)) { - auto vlen = gp->_get_var_vec_len(dname); - - // First index rounded down. - auto fidx = first_outer_idx[dname]; - fidx = round_down_flr(fidx, vlen); - first_outer_idx.set_val(dname, fidx); - - // Last index rounded up. - // Need +1 and then -1 trick because it's last, not end. 
- auto lidx = last_outer_idx[dname]; - lidx = round_up_flr(lidx + 1, vlen) - 1; - last_outer_idx.set_val(dname, lidx); - - // sizes rounded up. - my_halo_sizes.set_val(dname, ROUND_UP(my_halo_sizes[dname], vlen)); - neigh_halo_sizes.set_val(dname, ROUND_UP(neigh_halo_sizes[dname], vlen)); - - } // domain dims in this var. - } // domain dims. - } - - // Make a buffer in both directions (send & receive). - for (int bd = 0; bd < MPIBufs::n_buf_dirs; bd++) { - - // Begin/end vars to indicate what part - // of main var to read from or write to based on - // the current neighbor being processed. - IdxTuple copy_begin = gb.get_allocs(); - IdxTuple copy_end = gb.get_allocs(); // one past last! - - // Adjust along domain dims in this var. - DOMAIN_VAR_LOOP(i, j) { - auto& dim = domain_dims.get_dim(j); - auto& dname = dim._get_name(); - if (gp->is_dim_used(dname)) { - - // Init range to whole rank domain (including - // outer halos). These may be changed below - // depending on the neighbor's direction. - copy_begin[dname] = first_outer_idx[dname]; - copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. - - // Neighbor direction in this dim. - auto neigh_ofs = neigh_offsets[dname]; - - // Min MPI exterior options. - idx_t min_ext = opts->_min_exterior; - - // Region to read from, i.e., data from inside - // this rank's domain to be put into neighbor's - // halo. So, use neighbor's halo sizes when - // calculating buffer size. - if (bd == MPIBufs::buf_send) { - - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - - // Only read slice as wide as halo from beginning. - copy_begin[dname] = first_inner_idx[dname]; - copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; - - // Adjust LHS of interior. - idx_t ext_end = ROUND_UP(first_inner_idx[dname] + - max(min_ext, neigh_halo_sizes[dname]), - dims->_fold_pts[dname]); - mpi_interior.bb_begin[j] = - max(mpi_interior.bb_begin[j], ext_end); - } - - // Neighbor is to the right. 
- else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - - // Only read slice as wide as halo before end. - copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; - copy_end[dname] = last_inner_idx[dname] + 1; - - // Adjust RHS of interior. - idx_t ext_begin = ROUND_DOWN(last_inner_idx[dname] + 1 - - max(min_ext, neigh_halo_sizes[dname]), - dims->_fold_pts[dname]); - mpi_interior.bb_end[j] = - min(mpi_interior.bb_end[j], ext_begin); - } - - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - - // Region to write to, i.e., into this rank's halo. - // So, use my halo sizes when calculating buffer sizes. - else if (bd == MPIBufs::buf_recv) { - - // Neighbor is to the left. - if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { - - // Only read slice as wide as halo before beginning. - copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; - copy_end[dname] = first_inner_idx[dname]; - } - - // Neighbor is to the right. - else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { - - // Only read slice as wide as halo after end. - copy_begin[dname] = last_inner_idx[dname] + 1; - copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; - } - - // Else, this neighbor is in same posn as I am in this dim, - // so we leave the default begin/end settings. - } - } // domain dims in this var. - } // domain dims. - - // Sizes of buffer in all dims of this var. - // Also, set begin/end value for non-domain dims. - IdxTuple buf_sizes = gb.get_allocs(); - bool buf_vec_ok = var_vec_ok; - for (auto& dname : gp->get_dim_names()) { - idx_t dsize = 1; - - // domain dim? - if (domain_dims.lookup(dname)) { - dsize = copy_end[dname] - copy_begin[dname]; - - // Check whether alignment and size are multiple of vlen. - auto vlen = gp->_get_var_vec_len(dname); - if (dsize % vlen != 0) - buf_vec_ok = false; - if (imod_flr(copy_begin[dname], vlen) != 0) - buf_vec_ok = false; - } - - // step dim? 
- // Enable copy over entire allocated range. - // May only copy one step when not using WFs. - else if (dname == step_dim) { - - // Use 0..N as a place-holder range. - // The actual values will be supplied during - // halo exchange. - dsize = gp->get_alloc_size(dname); - copy_begin[dname] = 0; - copy_end[dname] = dsize; - } - - // misc? - // Copy over entire range. - // TODO: make dirty flags for misc dims in vars. - else { - dsize = gp->get_alloc_size(dname); - copy_begin[dname] = gp->get_first_misc_index(dname); - copy_end[dname] = gp->get_last_misc_index(dname) + 1; - assert(copy_end[dname] - copy_begin[dname] == dsize); - } - - // Save computed size. - buf_sizes[dname] = dsize; - - } // all dims in this var. - - // Unique name for buffer based on var name, direction, and ranks. - string bname = gname; - if (bd == MPIBufs::buf_send) - bname += "_send_halo_from_" + to_string(me) + "_to_" + to_string(neigh_rank); - else if (bd == MPIBufs::buf_recv) - bname += "_recv_halo_from_" + to_string(neigh_rank) + "_to_" + to_string(me); - - // Does buffer have non-zero size? - if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { - TRACE_MSG("MPI buffer '" << bname << - "' not needed because there is no data to exchange"); - continue; - } - - // At this point, buf_sizes, copy_begin, and copy_end - // should be set for each dim in this var. - - // Compute last from end. - IdxTuple copy_last = copy_end.sub_elements(1); - - // Make MPI data entry for this var. - auto gbp = mpi_data.emplace(gname, state->_mpi_info); - auto& gbi = gbp.first; // iterator from pair returned by emplace(). - auto& gbv = gbi->second; // value from iterator. - auto& buf = gbv.get_buf(MPIBufs::BufDir(bd), neigh_offsets); - - // Config buffer for this var. - // (But don't allocate storage yet.) 
- buf.begin_pt = copy_begin; - buf.last_pt = copy_last; - buf.num_pts = buf_sizes; - buf.name = bname; - buf.vec_copy_ok = buf_vec_ok; - - TRACE_MSG("MPI buffer '" << buf.name << - "' configured for rank at relative offsets " << - neigh_offsets.sub_elements(1).make_dim_val_str() << " with " << - buf.num_pts.make_dim_val_str(" * ") << " = " << buf.get_size() << - " element(s) at [" << buf.begin_pt.make_dim_val_str() << - " ... " << buf.last_pt.make_dim_val_str() << - "] with vector-copy " << - (buf.vec_copy_ok ? "enabled" : "disabled")); - num_exchanges[bd]++; - num_elems[bd] += buf.get_size(); - - } // send, recv. - } // vars. - }); // neighbors. + if (neigh_rank == MPI_PROC_NULL) + return; // from lambda fn. + + // Is vectorized exchange allowed based on domain sizes? + // Both my rank and neighbor rank must have *all* domain sizes + // of vector multiples. + bool vec_ok = !actl_opts->force_scalar_exchange && + mpi_info->has_all_vlen_mults[mpi_info->my_neighbor_index] && + mpi_info->has_all_vlen_mults[neigh_idx]; + + // Determine size of MPI buffers between neigh_rank and my + // rank for each var and create those that are needed. It + // is critical that the number, size, and shape of my + // send/receive buffers match those of the receive/send + // buffers of my neighbors. Important: Current algorithm + // assumes my left neighbor's buffer sizes can be calculated + // by considering my rank's right side data and vice-versa. + // Thus, all ranks must have consistent data that contribute + // to these calculations. + for (auto& gp : orig_var_ptrs) { + auto& gb = gp->gb(); + auto& gname = gp->get_name(); + bool var_vec_ok = vec_ok; + + + // Get calculated max dist needed for this var. + int maxdist = gp->get_halo_exchange_l1_norm(); + + // Always use max dist with WF. Do this because edge + // and/or corner values may be needed in WF extensions + // even it not needed w/o WFs. + // TODO: determine if max is always needed. 
+ if (wf_steps > 0) + maxdist = NUM_STENCIL_DIMS - 1; + + // Manhattan dist. of current neighbor. + int mandist = mpi_info->man_dists.at(neigh_idx); + + // Check distance. + if (mandist > maxdist) { + TRACE_MSG("no halo exchange needed with rank " << neigh_rank << + " (L1-norm = " << mandist << + ") for var '" << gname << + "' (max L1-norm = " << maxdist << ")"); + continue; // to next var. + } + + // Lookup first & last domain indices and calc exchange sizes + // for this var. + bool found_delta = false; + IdxTuple my_halo_sizes, neigh_halo_sizes; + IdxTuple first_inner_idx, last_inner_idx; + IdxTuple first_outer_idx, last_outer_idx; + for (auto& dim : domain_dims) { + auto& dname = dim._get_name(); + + // Only consider domain dims that are used in this var. + if (gp->is_dim_used(dname)) { + auto vlen = gp->_get_var_vec_len(dname); + auto lhalo = gp->get_left_halo_size(dname); + auto rhalo = gp->get_right_halo_size(dname); + + // Get domain indices for this var. If there + // are no more ranks in the given direction, + // extend the "outer" index to include the halo + // in that direction to make sure all data are + // sync'd. Critical for temporal tiling. + idx_t fidx = gp->get_first_rank_domain_index(dname); + idx_t lidx = gp->get_last_rank_domain_index(dname); + first_inner_idx.add_dim_back(dname, fidx); + last_inner_idx.add_dim_back(dname, lidx); + if (actl_opts->is_first_rank(dname)) + fidx -= lhalo; // extend into left halo. + if (actl_opts->is_last_rank(dname)) + lidx += rhalo; // extend into right halo. + first_outer_idx.add_dim_back(dname, fidx); + last_outer_idx.add_dim_back(dname, lidx); + + // Determine if it is possible to round the + // outer indices to vec-multiples. This will + // be required to allow full vec exchanges for + // this var. We won't do the actual rounding + // yet, because we need to see if it's safe + // in all dims. + // Need +1 and then -1 trick for last. 
+ fidx = round_down_flr(fidx, vlen); + lidx = round_up_flr(lidx + 1, vlen) - 1; + if (fidx < gp->get_first_local_index(dname)) + var_vec_ok = false; + if (lidx > gp->get_last_local_index(dname)) + var_vec_ok = false; + + // Determine size of exchange in this dim. This + // will be the actual halo size plus any + // wave-front shifts. In the current + // implementation, we need the wave-front shifts + // regardless of whether there is a halo on a + // given var. This is because each + // stencil-bundle gets shifted by the WF angles + // at each step in the WF. + + // Neighbor is to the left in this dim. + if (neigh_offsets[dname] == MPIInfo::rank_prev) { + + // Number of points to be added for WFs. + auto ext = wf_shift_pts[dname]; + + // My halo on my left. + my_halo_sizes.add_dim_back(dname, lhalo + ext); + + // Neighbor halo on their right. + // Assume my right is same as their right. + neigh_halo_sizes.add_dim_back(dname, rhalo + ext); + + // Flag that this var has a neighbor to left or right. + found_delta = true; + } + + // Neighbor is to the right in this dim. + else if (neigh_offsets[dname] == MPIInfo::rank_next) { + + // Number of points to be added for WFs. + auto ext = wf_shift_pts[dname]; + + // My halo on my right. + my_halo_sizes.add_dim_back(dname, rhalo + ext); + + // Neighbor halo on their left. + // Assume my left is same as their left. + neigh_halo_sizes.add_dim_back(dname, lhalo + ext); + + // Flag that this var has a neighbor to left or right. + found_delta = true; + } + + // Neighbor in-line in this dim. + else { + my_halo_sizes.add_dim_back(dname, 0); + neigh_halo_sizes.add_dim_back(dname, 0); + } + + } // domain dims in this var. + } // domain dims. + + // Is buffer needed? + // Example: if this var is 2D in y-z, but only neighbors are in + // x-dim, we don't need any exchange. 
+ if (!found_delta) { + TRACE_MSG("no halo exchange needed for var '" << gname << + "' with rank " << neigh_rank << + " because the neighbor is not in a direction" + " corresponding to a var dim"); + continue; // to next var. + } + + // Round halo sizes if vectorized exchanges allowed. + // Both self and neighbor must be vec-multiples + // and outer indices must be vec-mults or extendable + // to be so. + // TODO: add a heuristic to avoid increasing by a large factor. + if (var_vec_ok) { + for (auto& dim : domain_dims) { + auto& dname = dim._get_name(); + if (gp->is_dim_used(dname)) { + auto vlen = gp->_get_var_vec_len(dname); + + // First index rounded down. + auto fidx = first_outer_idx[dname]; + fidx = round_down_flr(fidx, vlen); + first_outer_idx.set_val(dname, fidx); + + // Last index rounded up. + // Need +1 and then -1 trick because it's last, not end. + auto lidx = last_outer_idx[dname]; + lidx = round_up_flr(lidx + 1, vlen) - 1; + last_outer_idx.set_val(dname, lidx); + + // sizes rounded up. + my_halo_sizes.set_val(dname, ROUND_UP(my_halo_sizes[dname], vlen)); + neigh_halo_sizes.set_val(dname, ROUND_UP(neigh_halo_sizes[dname], vlen)); + + } // domain dims in this var. + } // domain dims. + } + + // Make a buffer in both directions (send & receive). + for (int bd = 0; bd < MPIBufs::n_buf_dirs; bd++) { + + // Begin/end vars to indicate what part + // of main var to read from or write to based on + // the current neighbor being processed. + IdxTuple copy_begin = gb.get_dim_tuple(); + IdxTuple copy_end = gb.get_dim_tuple(); // will set to one past last! + + // Adjust along domain dims in this var. + DOMAIN_VAR_LOOP(i, j) { + auto& dim = domain_dims.get_dim(j); + auto& dname = dim._get_name(); + if (gp->is_dim_used(dname)) { + + // Init range to whole rank domain (including + // outer halos). These may be changed below + // depending on the neighbor's direction. 
+ copy_begin[dname] = first_outer_idx[dname]; + copy_end[dname] = last_outer_idx[dname] + 1; // end = last + 1. + + // Neighbor direction in this dim. + auto neigh_ofs = neigh_offsets[dname]; + + // Min MPI exterior options. + idx_t min_ext = actl_opts->_min_exterior; + + // Region to read from, i.e., data from inside + // this rank's domain to be put into neighbor's + // halo. So, use neighbor's halo sizes when + // calculating buffer size. + if (bd == MPIBufs::buf_send) { + + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + + // Only read slice as wide as halo from beginning. + copy_begin[dname] = first_inner_idx[dname]; + copy_end[dname] = first_inner_idx[dname] + neigh_halo_sizes[dname]; + + // Adjust LHS of interior. + idx_t ext_end = ROUND_UP(first_inner_idx[dname] + + max(min_ext, neigh_halo_sizes[dname]), + dims->_fold_pts[dname]); + mpi_interior.bb_begin[j] = + max(mpi_interior.bb_begin[j], ext_end); + } + + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + + // Only read slice as wide as halo before end. + copy_begin[dname] = last_inner_idx[dname] + 1 - neigh_halo_sizes[dname]; + copy_end[dname] = last_inner_idx[dname] + 1; + + // Adjust RHS of interior. + idx_t ext_begin = ROUND_DOWN(last_inner_idx[dname] + 1 - + max(min_ext, neigh_halo_sizes[dname]), + dims->_fold_pts[dname]); + mpi_interior.bb_end[j] = + min(mpi_interior.bb_end[j], ext_begin); + } + + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } + + // Region to write to, i.e., into this rank's halo. + // So, use my halo sizes when calculating buffer sizes. + else if (bd == MPIBufs::buf_recv) { + + // Neighbor is to the left. + if (neigh_ofs == idx_t(MPIInfo::rank_prev)) { + + // Only read slice as wide as halo before beginning. 
+ copy_begin[dname] = first_inner_idx[dname] - my_halo_sizes[dname]; + copy_end[dname] = first_inner_idx[dname]; + } + + // Neighbor is to the right. + else if (neigh_ofs == idx_t(MPIInfo::rank_next)) { + + // Only read slice as wide as halo after end. + copy_begin[dname] = last_inner_idx[dname] + 1; + copy_end[dname] = last_inner_idx[dname] + 1 + my_halo_sizes[dname]; + } + + // Else, this neighbor is in same posn as I am in this dim, + // so we leave the default begin/end settings. + } + } // domain dims in this var. + } // domain dims. + + // Sizes of buffer in all dims of this var. + // Also, set begin/end value for non-domain dims. + IdxTuple buf_sizes = gb.get_dim_tuple(); + bool buf_vec_ok = var_vec_ok; + for (auto& dname : gp->get_dim_names()) { + idx_t dsize = 1; + + // domain dim? + if (domain_dims.lookup(dname)) { + dsize = copy_end[dname] - copy_begin[dname]; + + // Check whether alignment and size are multiple of vlen. + auto vlen = gp->_get_var_vec_len(dname); + if (dsize % vlen != 0) + buf_vec_ok = false; + if (imod_flr(copy_begin[dname], vlen) != 0) + buf_vec_ok = false; + } + + // step dim? + // Enable copy over entire allocated range. + // May only copy one step when not using WFs. + else if (dname == step_dim) { + + // Use 0..N as a place-holder range. + // The actual values will be supplied during + // halo exchange. + dsize = gp->get_alloc_size(dname); + copy_begin[dname] = 0; + copy_end[dname] = dsize; + } + + // misc? + // Copy over entire range. + // TODO: make dirty flags for misc dims in vars. + else { + dsize = gp->get_alloc_size(dname); + copy_begin[dname] = gp->get_first_misc_index(dname); + copy_end[dname] = gp->get_last_misc_index(dname) + 1; + assert(copy_end[dname] - copy_begin[dname] == dsize); + } + + // Save computed size. + buf_sizes[dname] = dsize; + + } // all dims in this var. + + // Unique name for buffer based on var name, direction, and ranks. 
+ string bname = gname; + if (bd == MPIBufs::buf_send) + bname += "_send_halo_from_" + to_string(me) + "_to_" + to_string(neigh_rank); + else if (bd == MPIBufs::buf_recv) + bname += "_recv_halo_from_" + to_string(neigh_rank) + "_to_" + to_string(me); + + // Does buffer have non-zero size? + if (buf_sizes.size() == 0 || buf_sizes.product() == 0) { + TRACE_MSG("MPI buffer '" << bname << + "' not needed because there is no data to exchange"); + continue; + } + + // At this point, buf_sizes, copy_begin, and copy_end + // should be set for each dim in this var. + + // Compute last from end. + IdxTuple copy_last = copy_end.sub_elements(1); + + // Make MPI data entry for this var. + auto gbp = mpi_data.emplace(gname, state->_mpi_info); + auto& gbi = gbp.first; // iterator from pair returned by emplace(). + auto& gbv = gbi->second; // value from iterator. + auto& buf = gbv.get_buf(MPIBufs::BufDir(bd), neigh_offsets); + + // Config buffer for this var. + // (But don't allocate storage yet.) + buf.begin_pt = copy_begin; + buf.last_pt = copy_last; + buf.num_pts = buf_sizes; + buf.name = bname; + buf.vec_copy_ok = buf_vec_ok; + + TRACE_MSG("MPI buffer '" << buf.name << + "' configured for rank at relative offsets " << + neigh_offsets.sub_elements(1).make_dim_val_str() << " with " << + buf.num_pts.make_dim_val_str(" * ") << " = " << buf.get_size() << + " element(s) at [" << buf.begin_pt.make_dim_val_str() << + " ... " << buf.last_pt.make_dim_val_str() << + "] with vector-copy " << + (buf.vec_copy_ok ? "enabled" : "disabled")); + num_exchanges[bd]++; + num_elems[bd] += buf.get_size(); + + } // send, recv. + } // vars. + }); // neighbors. 
TRACE_MSG("number of MPI send buffers on this rank: " << num_exchanges[int(MPIBufs::buf_send)]); TRACE_MSG("number of elements in send buffers: " << make_num_str(num_elems[int(MPIBufs::buf_send)])); TRACE_MSG("number of MPI recv buffers on this rank: " << num_exchanges[int(MPIBufs::buf_recv)]); TRACE_MSG("number of elements in recv buffers: " << make_num_str(num_elems[int(MPIBufs::buf_recv)])); // Finalize interior BB if there are multiple ranks and overlap enabled. - if (env->num_ranks > 1 && opts->overlap_comms) { + if (env->num_ranks > 1 && actl_opts->overlap_comms) { mpi_interior.update_bb("interior", this, true); TRACE_MSG("MPI interior BB: [" << mpi_interior.make_range_string(domain_dims) << "]"); } @@ -641,12 +862,6 @@ namespace yask { // At this point, we have all the buffers configured. // Now we need to allocate space for them. - // Base ptrs for all alloc'd data. - // These pointers will be shared by the ones in the var - // objects, which will take over ownership when these go - // out of scope. Key is memory type. - map > _mpi_data_buf; - // A table for send-buffer offsets for all rank pairs for every var: // [var-name][sending-rank][receiving-rank] map>> sb_ofs; @@ -657,6 +872,12 @@ namespace yask { // Make sure pad is big enough for shm locks. assert(_data_buf_pad >= sizeof(SimpleLock)); + // Requests and base ptrs for all alloc'd data. + // These pointers will be shared by the ones in the var + // objects, which will take over ownership when these go + // out of scope. Key is memory type. + AllocMap alloc_reqs; + // Allocate MPI buffers. // Pass 0: count required size, allocate chunk of memory at end. // Pass 1: distribute parts of already-allocated memory chunk. @@ -665,13 +886,18 @@ namespace yask { TRACE_MSG("alloc_mpi_data pass " << pass << " for " << mpi_data.size() << " MPI buffer set(s)"); - // Count bytes needed and number of buffers for each NUMA node. - map npbytes, nbufs; + // Reset bytes needed and number of vars for each request. 
+ for (auto& i : alloc_reqs) { + //auto& key = i.first; + auto& data = i.second; + data.nbytes = 0; + data.nvars = 0; + } + int seq_num = 0; - // Vars. Use the map to ensure same order in all ranks. - for (auto gi : var_map) { - auto& gname = gi.first; - auto& gp = gi.second; + // Vars that may have MPI buffers. + for (auto gp : orig_var_ptrs) { + auto& gname = gp->get_name(); // Are there MPI bufs for this var? if (mpi_data.count(gname) == 0) @@ -693,76 +919,90 @@ namespace yask { int nidx, MPIBufs& bufs) { - // Default is global numa pref setting for MPI - // buffer, not possible override for this var. - int numa_pref = opts->_numa_pref; - - // If neighbor can use MPI shm, set key, etc. - auto nshm_rank = mpi_info->shm_ranks.at(nidx); - if (nshm_rank != MPI_PROC_NULL) { - do_shm = true; - numa_pref = _shmem_key; - assert(nshm_rank < env->num_shm_ranks); - } - - // Send and recv. - for (int bd = 0; bd < MPIBufs::n_buf_dirs; bd++) { - auto& buf = var_mpi_data.get_buf(MPIBufs::BufDir(bd), roffsets); - if (buf.get_size() == 0) - continue; - - // Don't use my mem for the recv buf if using shm. - bool use_mine = !(bd == MPIBufs::buf_recv && nshm_rank != MPI_PROC_NULL); - - // Set storage if buffer has been allocated in pass 0. - if (pass == 1 && use_mine) { - auto base = _mpi_data_buf[numa_pref]; - auto ofs = npbytes[numa_pref]; - assert(base); - auto* rp = buf.set_storage(base, ofs); - TRACE_MSG(" MPI buf '" << buf.name << "' at " << rp << - " for " << make_byte_str(buf.get_bytes())); - - // Write test values & init lock. - *((int*)rp) = me; - *((char*)rp + buf.get_bytes() - 1) = 'Z'; - buf.shm_lock_init(); - - // Save offset. - if (nshm_rank != MPI_PROC_NULL && bd == MPIBufs::buf_send) - sb_ofs[gname].at(my_shm_rank).at(nshm_rank) = ofs; - } - - // Using shm from another rank. 
- else if (pass == 2 && !use_mine) { - char* base = (char*)mpi_info->halo_buf_ptrs[nidx]; - auto sz = mpi_info->halo_buf_sizes[nidx]; - auto ofs = sb_ofs[gname].at(nshm_rank).at(my_shm_rank); - assert(sz >= ofs + buf.get_bytes() + YASK_PAD_BYTES); - auto* rp = buf.set_storage(base, ofs); - TRACE_MSG(" MPI shm buf '" << buf.name << "' at " << rp << - " for " << make_byte_str(buf.get_bytes())); - - // Check values written by owner rank. - assert(*((int*)rp) == nrank); - assert(*((char*)rp + buf.get_bytes() - 1) == 'Z'); - assert(!buf.is_ok_to_read()); - } - - // Determine padded size (also offset to next location) - // in my mem. - if (use_mine) { - auto sbytes = buf.get_bytes(); - npbytes[numa_pref] += ROUND_UP(sbytes + _data_buf_pad, - CACHELINE_BYTES); - nbufs[numa_pref]++; - if (pass == 0) - TRACE_MSG(" MPI buf '" << buf.name << "' needs " << - make_byte_str(sbytes) << - " (mem-key = " << numa_pref << ")"); - } - } // snd/rcv. - } ); // neighbors. + // Default is global numa pref setting for MPI + // buffer, not possible to override for each var. + int numa_pref = actl_opts->_numa_pref; + + // If neighbor can use MPI shm, set key, etc. + auto nshm_rank = mpi_info->shm_ranks.at(nidx); + if (nshm_rank != MPI_PROC_NULL) { + do_shm = true; + numa_pref = _shmem_key; + assert(nshm_rank < env->num_shm_ranks); + } + + // Send and recv bufs. + for (int bd = 0; bd < MPIBufs::n_buf_dirs; bd++) { + auto& buf = var_mpi_data.get_buf(MPIBufs::BufDir(bd), roffsets); + if (buf.get_size() == 0) + continue; + + #if 0 + // If not bundling allocations, increase sequence number. + if (!actl_opts->_bundle_allocs) + seq_num++; + #endif + + // Make a request key and make or lookup data. + AllocKey req_key = make_pair(numa_pref, seq_num); + auto& req_data = alloc_reqs[req_key]; + + // Don't use my mem for the recv buf if using shm; + // instead, we will share the neighbor's send buf. 
+ bool use_mine = !(bd == MPIBufs::buf_recv && nshm_rank != MPI_PROC_NULL); + + // Set storage if buffer has been allocated in pass 0. + if (pass == 1 && use_mine) { + auto& base = req_data.ptr; + auto ofs = req_data.nbytes; + assert(base); + auto* rp = buf.set_storage(base, ofs); + TRACE_MSG(" MPI buf '" << buf.name << "' at " << rp << + " for " << make_byte_str(buf.get_bytes())); + + // Write test values & init lock. + *((int*)rp) = me; + if (buf.get_bytes() > sizeof(int)) // Room to mark end? + *((char*)rp + buf.get_bytes() - 1) = 'Z'; + buf.shm_lock_init(); + + // Save offset. + if (nshm_rank != MPI_PROC_NULL && bd == MPIBufs::buf_send) + sb_ofs[gname].at(my_shm_rank).at(nshm_rank) = ofs; + } + + // Using shm from another rank. + else if (pass == 2 && !use_mine) { + char* base = (char*)mpi_info->halo_buf_ptrs[nidx]; + auto sz = mpi_info->halo_buf_sizes[nidx]; + auto ofs = sb_ofs[gname].at(nshm_rank).at(my_shm_rank); + assert(sz >= ofs + buf.get_bytes() + YASK_PAD_BYTES); + auto* rp = buf.set_storage(base, ofs); + TRACE_MSG(" MPI shm buf '" << buf.name << "' at " << rp << + " for " << make_byte_str(buf.get_bytes())); + + // Check values written by owner rank in pass 1. + assert(*((int*)rp) == nrank); + if (buf.get_bytes() > sizeof(int)) // Room to mark end? + assert(*((char*)rp + buf.get_bytes() - 1) == 'Z'); + assert(!buf.is_ok_to_read()); + } + + // Determine padded size (also offset to next location) + // in my mem. + if (use_mine) { + auto sbytes = buf.get_bytes(); + req_data.nbytes += ROUND_UP(sbytes + _data_buf_pad, + CACHELINE_BYTES); + req_data.nvars++; + if (pass == 0) + TRACE_MSG(" MPI buf '" << buf.name << "' needs " << + make_byte_str(sbytes) << + " (mem-key = " << numa_pref << ")"); + } + + } // snd/rcv. + } ); // neighbors. // Share offsets between ranks. if (pass == 1 && do_shm) { @@ -777,43 +1017,46 @@ namespace yask { " is " << sb_ofs[gname][rn][rn2]); } } - } // vars. // Alloc for each mem type. 
if (pass == 0) - _alloc_data(npbytes, nbufs, _mpi_data_buf, "MPI buffer"); + _alloc_data(alloc_reqs, "MPI buffer"); MPI_Barrier(env->shm_comm); } // MPI passes. -#endif + #endif } - // Allocate memory for scratch vars based on number of threads and - // block sizes. + // Delete and re-create all the scratch vars. Delete and re-allocate + // memory for scratch vars based on number of threads and block sizes. + // This destroy-everything-and-start-over approach allows for the + // number of threads and/or block sizes to be changed. + // TODO: be smarter about what to redo. void StencilContext::alloc_scratch_data() { STATE_VARS(this); // Remove any old scratch data. free_scratch_data(); - // Base ptrs for all alloc'd data. + // Requests and base ptrs for all alloc'd data. // This pointer will be shared by the ones in the var // objects, which will take over ownership when it goes // out of scope. - map > _scratch_data_buf; + AllocMap alloc_reqs; // Make sure the right number of threads are set so we // have the right number of scratch vars. - int rthreads = set_region_threads(); + int rthreads, bthreads; + get_num_comp_threads(rthreads, bthreads); - // Delete any existing scratch vars. - // Create new scratch vars, but without any - // data allocated. + // Delete any existing scratch vars. Create new scratch vars, but + // without any data allocated. Update core pointers in generated + // bundles. This function is generated by the YASK compiler. make_scratch_vars(rthreads); - // Find the max mini-block size across all stages. + // Find the max micro-block size across all stages. // They can be different across stages when stage-specific // auto-tuning has been used. IdxTuple mblksize(domain_dims); @@ -821,29 +1064,36 @@ namespace yask { auto& psettings = sp->get_active_settings(); DOMAIN_VAR_LOOP(i, j) { - auto sz = round_up_flr(psettings._mini_block_sizes[i], - fold_pts[j]); + // Round up to cluster size. 
+ auto sz = round_up_flr(psettings._micro_block_sizes[i], + cluster_pts[j]); mblksize[j] = max(mblksize[j], sz); } } - TRACE_MSG("alloc_scratch_data: max mini-block size across stage(s) is " << + TRACE_MSG("max rounded-up micro-block size across stage(s) is " << mblksize.make_dim_val_str(" * ")); // Pass 0: count required size, allocate chunk of memory at end. // Pass 1: distribute parts of already-allocated memory chunk. for (int pass = 0; pass < 2; pass++) { - TRACE_MSG("alloc_scratch_data pass " << pass << " for " << + TRACE_MSG("pass " << pass << " for " << scratch_vecs.size() << " set(s) of scratch vars"); - // Count bytes needed and number of vars for each NUMA node. - map npbytes, nvars; + // Reset bytes needed and number of vars for each request. + for (auto& i : alloc_reqs) { + //auto& key = i.first; + auto& data = i.second; + data.nbytes = 0; + data.nvars = 0; + } + int seq_num = 0; // Loop through each scratch var vector. for (auto* sgv : scratch_vecs) { assert(sgv); // Loop through each scratch var in this vector. - // There will be one for each region thread. + // There will be one for each outer thread. assert(int(sgv->size()) == rthreads); int thr_num = 0; for (auto gp : *sgv) { @@ -858,7 +1108,7 @@ namespace yask { if (gp->is_dim_used(dname)) { - // Set domain size of scratch var to mini-block size. + // Set domain size of scratch var to micro-block size. gp->_set_domain_size(dname, mblksize[dname]); // Conservative allowance for WF exts and/or temporal shifts. @@ -868,35 +1118,44 @@ namespace yask { // Pads. // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, opts->_min_pad_sizes[dname]); + gp->update_extra_pad_size(dname, actl_opts->_extra_pad_sizes[dname]); + gp->update_min_pad_size(dname, actl_opts->_min_pad_sizes[dname]); } } // dims. + // If not bundling allocations, increase sequence number. 
+ if (!actl_opts->_bundle_allocs) + seq_num++; + + // Make a request key and make or lookup data. + AllocKey req_key = make_pair(numa_pref, seq_num); + auto& req_data = alloc_reqs[req_key]; + // Set storage if buffer has been allocated. if (pass == 1) { - auto p = _scratch_data_buf[numa_pref]; + auto p = req_data.ptr; assert(p); - gp->set_storage(p, npbytes[numa_pref]); + gp->set_storage(p, req_data.nbytes); TRACE_MSG(gb.make_info_string()); } // Determine size used (also offset to next location). size_t nbytes = gp->get_num_storage_bytes(); - npbytes[numa_pref] += ROUND_UP(nbytes + _data_buf_pad, - CACHELINE_BYTES); - nvars[numa_pref]++; + req_data.nbytes += ROUND_UP(nbytes + _data_buf_pad, + CACHELINE_BYTES); + req_data.nvars++; if (pass == 0) - TRACE_MSG(" scratch var '" << gname << "' for thread " << + TRACE_MSG("scratch var '" << gname << "' for thread " << thr_num << " needs " << make_byte_str(nbytes) << " on NUMA node " << numa_pref); thr_num++; + } // scratch vars. } // scratch-var vecs. // Alloc for each node. if (pass == 0) - _alloc_data(npbytes, nvars, _scratch_data_buf, "scratch var"); + _alloc_data(alloc_reqs, "scratch var"); } // scratch-var passes. 
} diff --git a/src/kernel/lib/alloc.hpp b/src/kernel/lib/alloc.hpp new file mode 100644 index 00000000..69838394 --- /dev/null +++ b/src/kernel/lib/alloc.hpp @@ -0,0 +1,200 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kit +Copyright (c) 2014-2022, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +#pragma once + +// Support for allocating data in various ways. + +// Provide the needed definitions for NUMA support. +// This is fairly convoluted because of the inconsistency of +// support on various OS releases. +// The USE_NUMA* vars are set in the Makefile. +#ifdef USE_NUMA + +// Use numa policy library? +#ifdef USE_NUMA_POLICY_LIB +#include + +// Use if available. +#elif defined(USE_NUMAIF_H) +#include + +// This is a hack, but some systems are missing . 
+#elif !defined(NUMAIF_H) +extern "C" { + extern long get_mempolicy(int *policy, const unsigned long *nmask, + unsigned long maxnode, void *addr, int flags); + extern long mbind(void *start, unsigned long len, int mode, + const unsigned long *nmask, unsigned long maxnode, unsigned flags); +} + +// Conservatively don't define MPOL_LOCAL. +#define MPOL_DEFAULT 0 +#define MPOL_PREFERRED 1 +#define MPOL_BIND 2 +#define MPOL_INTERLEAVE 3 + +#endif +#endif + +namespace yask { + + // Customer allocator for objects that may need to be offloaded. + template + struct yask_allocator { + typedef T value_type; + yask_allocator() noexcept {} + + template + yask_allocator(const yask_allocator& other) throw() {}; + + T* allocate (std::size_t n) { + return static_cast(offload_alloc_host(n * sizeof(T))); + } + void deallocate (T* p, std::size_t n) { + offload_free_host(p); + } + }; + + // Generic deleter. + struct DeleterBase { + std::size_t _nbytes; + + // Ctor saves size & device ptr. + DeleterBase(std::size_t nbytes) : + _nbytes(nbytes) { } + + // Free device mem. + void free_dev_mem(char* hostp); + }; + + // Helpers for aligned malloc and free. + extern char* yask_aligned_alloc(std::size_t nbytes); + struct AlignedDeleter : public DeleterBase { + + AlignedDeleter(std::size_t nbytes) : + DeleterBase(nbytes) { } + + // Free p. + void operator()(char* p); + }; + + // Alloc aligned data as a shared ptr. + template + std::shared_ptr shared_aligned_alloc(size_t nbytes) { + + // Alloc mem. + char* cp = yask_aligned_alloc(nbytes); + + // Map alloc to device. + offload_map_alloc(cp, nbytes); + + // Make shared ptr. + auto _base = std::shared_ptr(cp, AlignedDeleter(nbytes)); + return _base; + } + + // Helpers for NUMA malloc and free. + extern char* numa_alloc(std::size_t nbytes, int numa_pref); + struct NumaDeleter : public DeleterBase { + int _numa_pref; + + // Ctor saves data needed for freeing. 
+ NumaDeleter(std::size_t nbytes, int numa_pref) : + DeleterBase(nbytes), + _numa_pref(numa_pref) + { } + + // Free p. + void operator()(char* p); + }; + + // Allocate NUMA memory from preferred node. + template + std::shared_ptr shared_numa_alloc(size_t nbytes, int numa_pref) { + + // Alloc mem. + char* cp = numa_alloc(nbytes, numa_pref); + + // Map alloc to device. + offload_map_alloc(cp, nbytes); + + // Make shared ptr. + auto _base = std::shared_ptr(cp, NumaDeleter(nbytes, numa_pref)); + return _base; + } + + // Helpers for MPI shm malloc and free. + extern char* shm_alloc(std::size_t nbytes, + const MPI_Comm* shm_comm, MPI_Win* shm_win); + struct ShmDeleter : DeleterBase { + const MPI_Comm* _shm_comm; + MPI_Win* _shm_win; + + // Ctor saves data needed for freeing. + ShmDeleter(std::size_t nbytes, + const MPI_Comm* shm_comm, MPI_Win* shm_win): + DeleterBase(nbytes), + _shm_comm(shm_comm), + _shm_win(shm_win) + { } + + // Free p. + void operator()(char* p); + }; + + // Allocate MPI shm memory. + template + std::shared_ptr shared_shm_alloc(size_t nbytes, + const MPI_Comm* shm_comm, MPI_Win* shm_win) { + + // Alloc mem. + char* cp = shm_alloc(nbytes, shm_comm, shm_win); + + // Map alloc to device. + // Shm on device not currently supported. + offload_map_alloc(cp, nbytes); + + // Make shared ptr. + auto _base = std::shared_ptr(cp, ShmDeleter(nbytes, shm_comm, shm_win)); + return _base; + } + + // Key for allocating memory. + // Pair is mem type and sequence number. + typedef std::pair AllocKey; + + // Data for allocating mem. + struct AllocData { + std::shared_ptr ptr; + size_t nbytes = 0; + int nvars = 0; + }; + + // Map from alloc key to data. 
+ typedef std::map AllocMap; + +} + diff --git a/src/kernel/lib/auto_tuner.cpp b/src/kernel/lib/auto_tuner.cpp index 703390f4..89fb23d1 100644 --- a/src/kernel/lib/auto_tuner.cpp +++ b/src/kernel/lib/auto_tuner.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -37,132 +37,117 @@ namespace yask { const std::string& name) : ContextLinker(context), _settings(settings), - _name("auto-tuner") { + _name("auto-tuner") + { + STATE_VARS(this); assert(settings); if (name.length()) _name += "(" + name + ")"; + _prefix = string(" ") + _name + ": "; + clear(settings->_do_auto_tune); } - // Eval auto-tuner for given number of steps. - void StencilContext::eval_auto_tuner(idx_t num_steps) { + // Switch target ptr to next one. + // Sets other target-specific settings. + // Return 'true' if more to do; 'false' if done. + // TODO: replace this with a less error-prone data structure + // and algorithm that will check dependencies more cleanly. + bool AutoTuner::next_target() { STATE_VARS(this); - _at.steps_done += num_steps; - _at.timer.stop(); - if (state->_use_stage_tuners) { - for (auto& sp : st_stages) - sp->get_at().eval(); - } - else - _at.eval(); - } - - // Reset auto-tuners. - void StencilContext::reset_auto_tuner(bool enable, bool verbose) { - for (auto& sp : st_stages) - sp->get_at().clear(!enable, verbose); - _at.clear(!enable, verbose); - } + trial_secs = _settings->_tuner_trial_secs; + max_radius = _settings->_tuner_radius; - // Determine if any auto tuners are running. 
- bool StencilContext::is_auto_tuner_enabled() const { - STATE_VARS(this); - bool done = true; - if (state->_use_stage_tuners) { - for (auto& sp : st_stages) - if (!sp->get_at().is_done()) - done = false; - } else - done = _at.is_done(); - return !done; - } + outerp = 0; + min_blks = 1; + min_pts = 1; - // Apply auto-tuning immediately, i.e., not as part of normal processing. - // Will alter data in vars. - void StencilContext::run_auto_tuner_now(bool verbose) { - STATE_VARS(this); - if (!is_prepared()) - THROW_YASK_EXCEPTION("Error: run_auto_tuner_now() called without calling prepare_solution() first"); - - DEBUG_MSG("Auto-tuning..."); - YaskTimer at_timer; - at_timer.start(); - - // Temporarily disable halo exchange to tune intra-rank. - enable_halo_exchange = false; - - // Temporarily ignore step conditions to force eval of conditional - // bundles. NB: may affect perf, e.g., if stages A and B run in - // AAABAAAB sequence, perf may be [very] different if run as - // ABABAB..., esp. w/temporal tiling. TODO: work around this. - check_step_conds = false; - - // Init tuners. - reset_auto_tuner(true, verbose); - - // Reset stats. - clear_timers(); - - // Determine number of steps to run. - // If wave-fronts are enabled, run a max number of these steps. - idx_t step_dir = dims->_step_dir; // +/- 1. - idx_t stride_t = min(max(wf_steps, idx_t(1)), +AutoTuner::max_stride_t) * step_dir; - - // Run time-steps until AT converges. - for (idx_t t = 0; ; t += stride_t) { - - // Run stride_t time-step(s). - run_solution(t, t + stride_t - step_dir); - - // AT done on this rank? - if (!is_auto_tuner_enabled()) - break; + // Move to next target. + if (targetp == 0) + targeti = 0; + else + targeti++; + if (targeti >= _settings->_tuner_targets.size()) { + targetp = 0; + return false; + } + auto& target_str = _settings->_tuner_targets.at(targeti); + AT_TRACE_MSG("next target is '" << target_str << "'"); + + // Mega-blocks? 
+ if (target_str == _settings->_mega_block_str) { + targetp = &_settings->_mega_block_sizes; + outerp = &_settings->_rank_sizes; + AT_DEBUG_MSG("searching mega-block sizes..."); + } + + // Blocks? + else if (target_str == _settings->_block_str) { + targetp = &_settings->_block_sizes; + outerp = &_settings->_mega_block_sizes; + + // Set min blocks and pts. + #ifndef USE_OFFLOAD + int rt=0, bt=0; + get_num_comp_threads(rt, bt); + min_blks = max(rt / 2, 1); // At least 1 for every 2 threads. + + // Set min pts (512=8^3). + min_pts = max(min(get_num_domain_points(*outerp) / + min_blks, 512), 1); + #endif + + AT_DEBUG_MSG("searching block sizes..."); } - // Wait for all ranks to finish. - DEBUG_MSG("Waiting for auto-tuner to converge on all ranks..."); - env->global_barrier(); + // Micro-blocks? + else if (target_str == _settings->_micro_block_str) { + targetp = &_settings->_micro_block_sizes; + outerp = &_settings->_block_sizes; + AT_DEBUG_MSG("searching micro-block sizes..."); + } - // reenable normal operation. -#ifndef NO_HALO_EXCHANGE - enable_halo_exchange = true; -#endif - check_step_conds = true; + // Nano-blocks? + else if (target_str == _settings->_nano_block_str) { + targetp = &_settings->_nano_block_sizes; + outerp = &_settings->_micro_block_sizes; + AT_DEBUG_MSG("searching nano-block sizes..."); + } - // Report results. - at_timer.stop(); - DEBUG_MSG("Auto-tuner done after " << steps_done << " step(s) in " << - make_num_str(at_timer.get_elapsed_secs()) << " secs."); - if (state->_use_stage_tuners) { - for (auto& sp : st_stages) - sp->get_at().print_settings(); - } else - _at.print_settings(); - print_temporal_tiling_info(); + // Pico-blocks? + else if (target_str == _settings->_pico_block_str) { + targetp = &_settings->_pico_block_sizes; + outerp = &_settings->_nano_block_sizes; + AT_DEBUG_MSG("searching pico-block sizes..."); + } - // Reset stats. 
- clear_timers(); + else { + THROW_YASK_EXCEPTION("Error: unrecognized auto-tuner target '" + + target_str + "'"); + } + assert(targetp); + assert(outerp); + + // Reset search state. + at_state.init(this, false); + + // Get initial search center from current target. + at_state.center_sizes = *targetp; + + // Prepare for next target. + AT_TRACE_MSG("starting size: " << at_state.center_sizes.make_dim_val_str(" * ")); + AT_TRACE_MSG("starting search radius: " << at_state.radius); + adjust_settings(false); + return true; } - + // Print the best settings. void AutoTuner::print_settings() const { STATE_VARS(this); - if (tune_mini_blks()) - DEBUG_MSG(_name << ": best-mini-block-size: " << - target_sizes().remove_dim(step_posn).make_dim_val_str(" * ")); - else - DEBUG_MSG(_name << ": best-block-size: " << - target_sizes().remove_dim(step_posn).make_dim_val_str(" * ") << endl << - _name << ": mini-block-size: " << - _settings->_mini_block_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); - DEBUG_MSG(_name << ": sub-block-size: " << - _settings->_sub_block_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); - } - - // Access settings. - bool AutoTuner::tune_mini_blks() const { - return _context->get_settings()->_tune_mini_blks; + _context->print_sizes(_prefix); + _context->print_temporal_tiling_info(_prefix); + _settings->adjust_settings(_context); } // Reset the auto-tuner. @@ -176,64 +161,55 @@ namespace yask { #endif // Apply the best known settings from existing data, if any. - if (best_rate > 0.) { - target_sizes() = best_sizes; - apply(); - DEBUG_MSG(_name << ": applying size " << - best_sizes.make_dim_val_str(" * ")); - } + apply_best(); - // Reset all vars. - results.clear(); - n2big = n2small = n2far = 0; - best_rate = 0.; - radius = max_radius; + // Mark done? done = mark_done; - neigh_idx = 0; - better_neigh_found = false; - ctime = 0.; - csteps = 0; - in_warmup = true; + + // Reset all vars to be ready to start a new tuning. 
timer.clear(); steps_done = 0; - target_steps = target_sizes()[step_dim]; - center_sizes = target_sizes(); - best_sizes = target_sizes(); - - // Set min blocks to number of region threads. - int rt=0, bt=0; - get_num_comp_threads(rt, bt); - min_blks = rt; + targetp = 0; + outerp = 0; + targeti = 0; + at_state.init(this, true); } // clear. // Check whether sizes within search limits. bool AutoTuner::check_sizes(const IdxTuple& bsize) { bool ok = true; + AT_TRACE_MSG("checking size " << + bsize.make_dim_val_str(" * ")); // Too small? - if (ok && get_num_domain_points(bsize) < min_pts) { - n2small++; + if (get_num_domain_points(bsize) < min_pts) { + at_state.n2small++; + AT_TRACE_MSG("too small"); ok = false; } // Too few? - else if (ok) { - idx_t nblks = get_num_domain_points(outer_sizes()) / + else { + idx_t nblks = get_num_domain_points(*outerp) / get_num_domain_points(bsize); if (nblks < min_blks) { + AT_TRACE_MSG("too big"); ok = false; - n2big++; + at_state.n2big++; } } return ok; } - // Evaluate the previous run and take next auto-tuner step. + // This is a "call-back" routine from run_solution(). If a trial is + // over, it will evaluate that trial and set the state for the next + // auto-tuner step before returning. If a trial is not over, it will + // return to get more data. void AutoTuner::eval() { STATE_VARS(this); - // Get elapsed time and reset. + // Get elapsed time and steps; reset them. double etime = timer.get_elapsed_secs(); timer.clear(); idx_t steps = steps_done; @@ -243,62 +219,40 @@ namespace yask { if (done) return; - // Setup not done? - if (!nullop) - return; - // Cumulative stats and rate. - csteps += steps; - ctime += etime; - double rate = (ctime > 0.) ? 
(double(csteps) / ctime) : 0.; - double min_secs = _settings->_tuner_min_secs; - TRACE_MSG(_name << " eval() callback: " << steps << " step(s) in " << - etime << " secs; " << csteps << " step(s) in " << - ctime << " secs (" << rate << - " steps/sec) cumulative; best-rate = " << best_rate << - "; min-secs = " << min_secs); + at_state.csteps += steps; + at_state.ctime += etime; + double crate = (at_state.ctime > 0.) ? (double(at_state.csteps) / at_state.ctime) : 0.; + AT_TRACE_MSG("eval() callback: " << steps << " step(s) in " << + etime << " secs; " << at_state.csteps << " step(s) in " << + at_state.ctime << " secs (" << crate << + " steps/sec) cumulative; best-rate = " << at_state.best_rate << + "; trial-secs = " << trial_secs); // Still in warmup? - if (in_warmup) { + if (at_state.in_warmup) { // Warmup not done? - if (ctime < max(warmup_secs, min_secs) && csteps < warmup_steps) + if (at_state.ctime < max(warmup_secs, trial_secs) && + at_state.csteps < warmup_steps) + return; // Keep running. + + // Warmup is done. + AT_DEBUG_MSG("finished warmup for " << + at_state.csteps << " steps(s) in " << + make_num_str(at_state.ctime) << " secs"); + + // Set first target. + targetp = 0; + if (!next_target()) { + + // No targets. + clear(true); + AT_DEBUG_MSG("no enabled auto-tuner targets"); return; - - // Done. - DEBUG_MSG(_name << ": finished warmup for " << - csteps << " steps(s) in " << - make_num_str(ctime) << " secs\n" << - _name << ": tuning " << (tune_mini_blks() ? "mini-" : "") << - "block sizes..."); - in_warmup = false; - - // Restart for first real measurement. - csteps = 0; - ctime = 0; - - // Set center point for search. - center_sizes = target_sizes(); - - // Pick better starting point if needed. 
- if (!check_sizes(center_sizes)) { - for (auto dim : center_sizes) { - auto& dname = dim._get_name(); - auto& dval = dim.get_val(); - if (dname != step_dim) { - auto dmax = max(idx_t(1), outer_sizes()[dname] / 2); - center_sizes[dname] = dmax; - } - } } - - // Set vars to starting point. - best_sizes = center_sizes; - target_sizes() = center_sizes; - apply(); - TRACE_MSG(_name << ": starting size: " << center_sizes.make_dim_val_str(" * ")); - TRACE_MSG(_name << ": starting search radius: " << radius); - return; + + return; // Start first trial. } // Determine whether we've done enough. @@ -306,70 +260,72 @@ namespace yask { // If the current rate is much less than the best, // we don't need a better measurement. - if (rate > 0. && best_rate > 0. && rate < best_rate * cutoff) + if (crate > 0. && at_state.best_rate > 0. && + crate < at_state.best_rate * cutoff) rate_ok = true; // Enough time or steps to get a good measurement? - else if (ctime >= min_secs || csteps >= min_steps) + else if (at_state.ctime >= trial_secs || at_state.csteps >= trial_steps) rate_ok = true; // Return from eval if we need to do more work. if (!rate_ok) - return; + return; // Get more data for this trial. - // Save result. - results[target_sizes()] = rate; - bool is_better = rate > best_rate; + // Save current result. + at_state.results[*targetp] = crate; + bool is_better = crate > at_state.best_rate; if (is_better) { - best_sizes = target_sizes(); - best_rate = rate; - better_neigh_found = true; + at_state.best_sizes = *targetp; + at_state.best_rate = crate; + at_state.better_neigh_found = true; } // Print progress and reset vars for next time. - DEBUG_MSG(_name << ": search-dist=" << radius << ": " << - make_num_str(rate) << " steps/sec (" << - csteps << " steps(s) in " << make_num_str(ctime) << - " secs) with size " << - target_sizes().remove_dim(step_posn).make_dim_val_str(" * ") << - (is_better ? 
" -- best so far" : "")); - csteps = 0; - ctime = 0.; + AT_DEBUG_MSG("search-dist=" << at_state.radius << ": " << + make_num_str(crate) << " steps/sec (" << + at_state.csteps << " steps(s) in " << make_num_str(at_state.ctime) << + " secs) with size " << + targetp->remove_dim(step_posn).make_dim_val_str(" * ") << + (is_better ? " -- best so far" : "")); + at_state.csteps = 0; + at_state.ctime = 0.; // At this point, we have gathered perf info on the current settings. // Now, we need to determine next unevaluated point in search space. + // When found, we 'return' from this call-back function. while (true) { // Gradient-descent(GD) search: // Use the neighborhood info from MPI to track neighbors. // TODO: move to a more general place. // Valid neighbor index? - if (neigh_idx < mpi_info->neighborhood_size) { + if (at_state.neigh_idx < mpi_info->neighborhood_size) { // Convert index to offsets in each domain dim. - auto ofs = mpi_info->neighborhood_sizes.unlayout(neigh_idx); + auto ofs = mpi_info->neighborhood_sizes.unlayout(at_state.neigh_idx); // Next neighbor of center point. - neigh_idx++; + at_state.neigh_idx++; // Determine new size. - IdxTuple bsize(center_sizes); + IdxTuple bsize(at_state.center_sizes); bool ok = true; int mdist = 0; // manhattan dist from center. for (auto odim : ofs) { auto& dname = odim._get_name(); // a domain-dim name. auto& dofs = odim.get_val(); // always [0..2]. - // Min and max sizes of this dim. + // Min and max sizes in this dim. auto dmin = dims->_cluster_pts[dname]; - auto dmax = outer_sizes()[dname]; + auto dmax = (*outerp)[dname]; // Determine distance of GD neighbors. auto dist = dmin; // stride by cluster size. dist = max(dist, min_dist); - dist *= radius; + dist *= at_state.radius; - auto sz = center_sizes[dname]; + auto sz = at_state.center_sizes[dname]; // current size in 'odim'. switch (dofs) { case 0: // reduce size in 'odim'. sz -= dist; @@ -378,131 +334,305 @@ namespace yask { case 1: // keep size in 'odim'. 
break; case 2: // increase size in 'odim'. - sz += dist; + if (sz < dist) + sz = dist; + else + sz += dist; mdist++; break; default: assert(false && "internal error in tune_settings()"); } - // Don't look in far corners. - if (mdist > 2) { - n2far++; - ok = false; - break; // out of dim-loop. - } - - // Too small? - if (sz < dmin) { - n2small++; + // Don't look in all dim combos. + if (mdist > 3) { + at_state.n2far++; ok = false; break; // out of dim-loop. } // Adjustments. - sz = min(sz, dmax); + sz = max(sz, dmin); sz = ROUND_UP(sz, dmin); + sz = min(sz, dmax); // Save. bsize[dname] = sz; } // domain dims. - TRACE_MSG(_name << ": checking size " << - bsize.make_dim_val_str(" * ")); // Check sizes. if (ok && !check_sizes(bsize)) ok = false; + // Valid size and not already evaluated? + if (ok) { + if (at_state.results.count(bsize) > 0) + AT_TRACE_MSG("already evaluated"); + else { + AT_TRACE_MSG("exiting eval() with new size"); - // Valid size and not already checked? - if (ok && results.count(bsize) == 0) { - - // Run next step with this size. - target_sizes() = bsize; - break; // out of block-search loop. + // Run next step with this size. + *targetp = bsize; + adjust_settings(false); + return; + } } } // valid neighbor index. // Beyond last neighbor of current center? + // Determine next search setting. else { - // Should GD continue? - bool stop_gd = !better_neigh_found; + // Should GD continue at this radius from the new best + // point? + bool stop_gd = !at_state.better_neigh_found; // Make new center at best size so far. - center_sizes = best_sizes; + at_state.center_sizes = at_state.best_sizes; // Reset search vars. - neigh_idx = 0; - better_neigh_found = false; + at_state.neigh_idx = 0; + at_state.better_neigh_found = false; - // No new best point, so this is the end of this - // GD search. - if (stop_gd) { + // Check another point at this radius? 
+ if (!stop_gd) + AT_TRACE_MSG("continuing search from " << + at_state.center_sizes.make_dim_val_str(" * ")); - // Move to next radius. - radius /= 2; - - // Done? - if (radius < 1) { + // No new best point, so this is the end of the + // GD search at this radius. + else { - // Reset AT and disable. - clear(true); - DEBUG_MSG(_name << ": done"); - return; + // Move to next radius. + at_state.radius /= 2; + if (at_state.radius >= 1) + AT_TRACE_MSG("new search radius=" << at_state.radius); + + // No more radii for this target. + else { + + // Apply current best result for this target. + apply_best(); + + // Move to next target. + if (next_target()) { + AT_TRACE_MSG("exiting eval() with new target"); + return; + } + + // No more targets. + else { + + // Reset AT and disable. + clear(true); + AT_DEBUG_MSG("done"); + return; + } } - TRACE_MSG(_name << ": new search radius=" << radius); - } - else { - TRACE_MSG(_name << ": continuing search from " << - center_sizes.make_dim_val_str(" * ")); } } // beyond next neighbor of center. - } // search for new setting to try. + } // while(true) search for new setting to try. - // Fix settings for next step. - apply(); - TRACE_MSG(_name << ": next size " << - target_sizes().make_dim_val_str(" * ")); + THROW_YASK_EXCEPTION("internal error: exited from infinite loop"); } // eval. - // Adjust related kernel settings to prepare for a run. - // Does *not* set the settings being tuned. - void AutoTuner::apply() { + // Apply best settings if avail, and adjust other settings. + // Returns true if set. + bool AutoTuner::apply_best() { STATE_VARS(this); - - // Restore step-dim value for block. - target_sizes()[step_posn] = target_steps; - - // Change derived sizes to 0 so adjust_settings() - // will set them to the default. - if (!tune_mini_blks()) { - _settings->_block_group_sizes.set_vals_same(0); - _settings->_mini_block_sizes.set_vals_same(0); + if (at_state.best_rate > 0. 
&& targetp) { + AT_DEBUG_MSG("applying size " << + at_state.best_sizes.make_dim_val_str(" * ")); + *targetp = at_state.best_sizes; + + // Save these results as requested options. + // FIXME: won't work for stage tuning. + if (targetp == &_settings->_mega_block_sizes) + req_opts->_mega_block_sizes = *targetp; + if (targetp == &_settings->_block_sizes) + req_opts->_block_sizes = *targetp; + if (targetp == &_settings->_micro_block_sizes) + req_opts->_micro_block_sizes = *targetp; + if (targetp == &_settings->_nano_block_sizes) + req_opts->_nano_block_sizes = *targetp; + if (targetp == &_settings->_pico_block_sizes) + req_opts->_pico_block_sizes = *targetp; + + // Adjust other settings based on target. + adjust_settings(false); + return true; } - _settings->_mini_block_group_sizes.set_vals_same(0); - _settings->_sub_block_sizes.set_vals_same(0); - _settings->_sub_block_group_sizes.set_vals_same(0); - + return false; + } + + // Adjust related kernel settings to prepare for next run. + void AutoTuner::adjust_settings(bool do_print) { + STATE_VARS(this); + assert(targetp); + + // Reset non-target settings to requested ones. This is done so + // that ajustment will be applied based on requested ones and this + // target instead of adjusted ones. + if (targetp != &_settings->_mega_block_sizes) + _settings->_mega_block_sizes = req_opts->_mega_block_sizes; + if (targetp != &_settings->_block_sizes) + _settings->_block_sizes = req_opts->_block_sizes; + if (targetp != &_settings->_micro_block_sizes) + _settings->_micro_block_sizes = req_opts->_micro_block_sizes; + if (targetp != &_settings->_nano_block_sizes) + _settings->_nano_block_sizes = req_opts->_nano_block_sizes; + if (targetp != &_settings->_pico_block_sizes) + _settings->_pico_block_sizes = req_opts->_pico_block_sizes; + // Save debug output and set to null. auto saved_op = get_debug_output(); - set_debug_output(nullop); - - // Make sure everything is resized based on block size. 
- _settings->adjust_settings(); + if (!do_print) { + yask_output_factory yof; + auto nullop = yof.new_null_output(); + set_debug_output(nullop); + } + // The following sequence is the required subset of what + // is done in prepare_solution(). + + // Make sure everything is adjusted based on new target size. + _settings->adjust_settings(do_print ? this : 0); + // Update temporal blocking info. + // (Normally called from update_var_info().) _context->update_tb_info(); - // Reallocate scratch data based on new mini-block size. - // TODO: only do this when blocks have increased or - // decreased by a certain percentage. + // Reallocate scratch vars based on new micro-block size. + // TODO: only do this when needed. _context->alloc_scratch_data(); // Restore debug output. set_debug_output(saved_op); } + ///// StencilContext methods to control the auto-tuner(s). + void StencilContext::visit_auto_tuners(std::function visitor) { + STATE_VARS(this); + + if (state->_use_stage_tuners) { + for (auto& sp : st_stages) + visitor(sp->get_at()); + } else + visitor(_at); + } + void StencilContext::visit_auto_tuners(std::function visitor) const { + STATE_VARS(this); + + if (state->_use_stage_tuners) { + for (auto& sp : st_stages) + visitor(sp->get_at()); + } else + visitor(_at); + } + + // Eval auto-tuner for given number of steps. + void StencilContext::eval_auto_tuner() { + visit_auto_tuners + ([&](AutoTuner& at) + { + at.eval(); + }); + } + + // Reset auto-tuners. + void StencilContext::reset_auto_tuner(bool enable, bool verbose) { + visit_auto_tuners + ([&](AutoTuner& at) + { + at.clear(!enable, verbose); + }); + } + + // Determine if any auto tuners are running. + bool StencilContext::is_auto_tuner_enabled() const { + bool done = true; + visit_auto_tuners + ([&](const AutoTuner& at) + { + if (!at.is_done()) + done = false; + }); + return !done; + } + + // Apply auto-tuning immediately, i.e., not as part of normal processing. + // Will alter data in vars. 
+ void StencilContext::run_auto_tuner_now(bool verbose) { + STATE_VARS(this); + if (!is_prepared()) + THROW_YASK_EXCEPTION("Error: run_auto_tuner_now() called without calling prepare_solution() first"); + + DEBUG_MSG("\nAuto-tuning..."); + YaskTimer at_timer; + at_timer.start(); + + // Copy vars to device now so that automatic copy in + // run_solution() will not impact timing. + copy_vars_to_device(); + + // Temporarily disable halo exchange to tune intra-rank. + // Will not produce valid results and will corrupt data. + auto save_halo_exchange = actl_opts->do_halo_exchange; + actl_opts->do_halo_exchange = false; + + // Temporarily ignore step conditions to force eval of conditional + // bundles. NB: may affect perf, e.g., if stages A and B run in + // AAABAAAB sequence, perf may be [very] different if run as + // ABABAB..., esp. w/temporal tiling. TODO: work around this. + check_step_conds = false; + + // Init tuners. + reset_auto_tuner(true, verbose); + + // Reset stats. + clear_timers(); + + // Determine number of steps to run. + // If wave-fronts are enabled, run a max number of these steps. + idx_t step_dir = dims->_step_dir; // +/- 1. + idx_t stride_t = min(max(wf_steps, idx_t(1)), +AutoTuner::max_stride_t) * step_dir; + + // Run time-steps until AT converges. + for (idx_t t = 0; ; t += stride_t) { + + // Run stride_t time-step(s). + run_solution(t, t + stride_t - step_dir); + + // AT done on this rank? + if (!is_auto_tuner_enabled()) + break; + } + + // Wait for all ranks to finish. + #if USE_MPI + DEBUG_MSG("Waiting for auto-tuner to converge on all ranks..."); + env->global_barrier(); + #endif + + // reenable normal operation. + actl_opts->do_halo_exchange = save_halo_exchange; + check_step_conds = true; + + // Report results. 
+ at_timer.stop(); + DEBUG_MSG("Auto-tuner done after " << + make_num_str(at_timer.get_elapsed_secs()) << " secs"); + DEBUG_MSG("Final settings:"); + if (state->_use_stage_tuners) { + for (auto& sp : st_stages) + sp->get_at().print_settings(); + } else + _at.print_settings(); + + // Reset stats. + clear_timers(); + } + } // namespace yask. diff --git a/src/kernel/lib/auto_tuner.hpp b/src/kernel/lib/auto_tuner.hpp index f33b3664..b1366a06 100644 --- a/src/kernel/lib/auto_tuner.hpp +++ b/src/kernel/lib/auto_tuner.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -34,54 +34,80 @@ namespace yask { protected: // Settings to change. May be pointer to solution settings - // or local settings for a pack. + // or local settings for a stage. KernelSettings* _settings = 0; // Name of tuner. std::string _name; - // Null stream to throw away debug info. - yask_output_factory yof; - yask_output_ptr nullop = yof.new_null_output(); + // String to print before each msg. + std::string _prefix; + #define AT_DEBUG_MSG(msg) DEBUG_MSG(_prefix << msg) + #define AT_TRACE_MSG(msg) TRACE_MSG(_prefix << msg) // Whether to print progress. bool verbose = false; // AT parameters. - double warmup_steps = 100; - double warmup_secs = 0.5; // end warmup when either warmup_steps OR warmup_secs is reached. - idx_t min_steps = 100; + double warmup_steps = 1000; + double warmup_secs = 1.0; // end warmup when either warmup_steps OR warmup_secs is reached. + idx_t trial_steps = 500; + double trial_secs = 0.5; // end trial when either trial_steps OR trial_secs is reached. 
double cutoff = 0.8; // can stop eval if current rate < best rate * cutoff; - idx_t max_radius = 8; // starting search radius. - idx_t min_dist = 4; // min distance to move in any direction per eval. - idx_t min_pts = 512; // 8^3; min points in a block. - idx_t min_blks = 4; // num number of blocks; gets set to number of region threads. - - // Results. - std::unordered_map results; // block-size -> perf cache. - int n2big = 0, n2small = 0, n2far = 0; - - // Best so far. - IdxTuple best_sizes; - double best_rate = 0.; - - // Current point in search. - IdxTuple center_sizes; - idx_t target_steps = 0; - idx_t radius = 0; - bool done = false; - idx_t neigh_idx = 0; - bool better_neigh_found = false; - - // Cumulative vars. - double ctime = 0.; - idx_t csteps = 0; - bool in_warmup = true; + idx_t min_dist = 1; // min distance to move in any direction per trial. + idx_t min_pts = 1; // min pts in an area. + idx_t min_blks = 1; // min number of areas. + idx_t max_radius = 8; // starting search radius; should be a power of 2. + + // State of the search of one set of target sizes. + struct AutoTunerState { + + // Results. + std::unordered_map results; // block-size -> perf cache. + int n2big = 0, n2small = 0, n2far = 0; + + // Best so far. + IdxTuple best_sizes; + double best_rate = 0.; + + // Current location of search. + IdxTuple center_sizes; + idx_t radius = 0; + idx_t neigh_idx = 0; + bool better_neigh_found = false; + + // Cumulative data within a trial or warmup. + double ctime = 0.; + idx_t csteps = 0; + bool in_warmup = true; + + void init(AutoTuner* at, bool warmup_needed) { + results.clear(); + n2big = n2small = n2far = 0; + best_sizes.set_vals_same(0); + best_rate = 0.; + center_sizes.set_vals_same(0); + radius = at->max_radius; + neigh_idx = 0; + better_neigh_found = false; + ctime = 0.; + csteps = 0; + in_warmup = warmup_needed; + } + }; + + // Current state of search. 
+ AutoTunerState at_state; + bool done = true; + IdxTuple* outerp = 0; + IdxTuple* targetp = 0; + size_t targeti = 0; bool check_sizes(const IdxTuple& sizes); + bool next_target(); public: - static constexpr idx_t max_stride_t = 4; + static constexpr idx_t max_stride_t = 100; AutoTuner(StencilContext* context, KernelSettings* settings, @@ -94,30 +120,6 @@ namespace yask { // Increment this to track steps. idx_t steps_done = 0; - // Access settings. - bool tune_mini_blks() const; - IdxTuple& target_sizes() { - return tune_mini_blks() ? - _settings->_mini_block_sizes : _settings->_block_sizes; - } - IdxTuple& outer_sizes() { - return tune_mini_blks() ? - _settings->_block_sizes : _settings->_region_sizes; - } - IdxTuple& target_sizes() const { - return tune_mini_blks() ? - _settings->_mini_block_sizes : _settings->_block_sizes; - } - IdxTuple& outer_sizes() const { - return tune_mini_blks() ? - _settings->_block_sizes : _settings->_region_sizes; - } - - // Change settings pointers. - void set_settings(KernelSettings* p) { - _settings = p; - } - // Reset all state to beginning. void clear(bool mark_done, bool verbose = false); @@ -127,8 +129,12 @@ namespace yask { // Print the best settings. void print_settings() const; - // Apply settings. - void apply(); + // Apply best settings if avail. + // Returns true if set. + bool apply_best(); + + // Adjust related kernel settings to prepare for next run. + void adjust_settings(bool do_print); // Done? 
bool is_done() const { return done; } diff --git a/src/kernel/lib/cache_model.hpp b/src/kernel/lib/cache_model.hpp index f6cc0031..f5abac82 100644 --- a/src/kernel/lib/cache_model.hpp +++ b/src/kernel/lib/cache_model.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -26,6 +26,8 @@ IN THE SOFTWARE. // Purpose: implement a simple,infinite-size cache model to check // prefetch and/or eviction coverage. +// This feature is no longer functional. TODO: reinstate. + #pragma once #include diff --git a/src/kernel/lib/context.cpp b/src/kernel/lib/context.cpp index 4a9301b4..4b7c90df 100644 --- a/src/kernel/lib/context.cpp +++ b/src/kernel/lib/context.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -23,8 +23,8 @@ IN THE SOFTWARE. *****************************************************************************/ -// This file contains implementations of StencilContext methods. -// Also see setup.cpp and soln_apis.cpp. +// This file contains implementations of some StencilContext methods. +// Also see setup.cpp, halo.cpp, and soln_apis.cpp. #include "yask_stencil.hpp" using namespace std; @@ -33,12 +33,29 @@ namespace yask { ///// Top-level methods for evaluating reference and optimized stencils. + // Set the core vars that are needed for running kernels.
+ void CommonCoreData::set_core(const StencilContext *cxt) { + STATE_VARS_CONST(cxt); + _global_sizes.set_from_tuple(actl_opts->_global_sizes); + _rank_sizes.set_from_tuple(actl_opts->_rank_sizes); + _rank_domain_offsets = cxt->rank_domain_offsets; + } + // Eval stencil bundle(s) over var(s) using reference scalar code. + // Does NOT offload computations. void StencilContext::run_ref(idx_t first_step_index, idx_t last_step_index) { STATE_VARS(this); run_time.start(); + // Since any APIs may have been called in other ranks, mark all + // neighbor vars as possibly dirty. + set_all_neighbor_vars_dirty(); + + // Disable offload. + bool save_offload = KernelEnv::_use_offload; + KernelEnv::_use_offload = false; + // Determine step dir from order of first/last. idx_t step_dir = (last_step_index >= first_step_index) ? 1 : -1; @@ -63,32 +80,37 @@ namespace yask { // Force sub-sizes to whole rank size so that scratch // vars will be large enough. Turn off any temporal blocking. - opts->_region_sizes.set_vals_same(0); - opts->_block_sizes.set_vals_same(0); - opts->_mini_block_sizes.set_vals_same(0); - opts->_sub_block_sizes.set_vals_same(0); - opts->adjust_settings(); + actl_opts->_mega_block_sizes = actl_opts->_rank_sizes; + actl_opts->_block_sizes = actl_opts->_rank_sizes; + actl_opts->_micro_block_sizes = actl_opts->_rank_sizes; + actl_opts->_nano_block_sizes = actl_opts->_rank_sizes; + actl_opts->_pico_block_sizes = actl_opts->_rank_sizes; + actl_opts->adjust_settings(); // Don't print settings. update_var_info(true); // Copy these settings to stages and realloc scratch vars. for (auto& sp : st_stages) - sp->get_local_settings() = *opts; + sp->get_local_settings() = *actl_opts; alloc_scratch_data(); - // Use only one set of scratch vars. - int scratch_var_idx = 0; - // Indices to loop through. // Init from begin & end tuples. 
- ScanIndices rank_idxs(*dims, false, &rank_domain_offsets); + ScanIndices rank_idxs(false, &rank_domain_offsets); rank_idxs.begin = begin; rank_idxs.end = end; - // Set offsets in scratch vars. - // Requires scratch vars to be allocated for whole - // rank instead of smaller var size. + // Use only one set of scratch vars, i.e., + // we don't have one for each thread. + int scratch_var_idx = 0; + + // Set offsets in scratch vars. For this reference run, scratch + // vars are allocated for the whole rank instead of smaller var + // size. update_scratch_var_info(scratch_var_idx, rank_idxs.begin); + // Doing all parts. + init_mpi_flags(); + // Initial halo exchange. // TODO: get rid of all halo exchanges in this function, // and calculate overall problem in one rank. @@ -113,7 +135,7 @@ namespace yask { rank_idxs.stride[step_posn] = stride_t; // Loop thru bundles. We ignore stages here - // because staging is an optional optimizations. + // because staging is an optional optimization. for (auto* asg : st_bundles) { // Scan through n-D space. @@ -131,7 +153,7 @@ namespace yask { // Find the bundles that need to be processed. // This will be the prerequisite scratch-var - // bundles plus this non-scratch group. + // bundles plus this non-scratch tile. auto sg_list = asg->get_reqd_bundles(); // Loop through all the needed bundles. @@ -142,58 +164,52 @@ namespace yask { ScanIndices misc_idxs = sg->adjust_span(scratch_var_idx, rank_idxs); misc_idxs.stride.set_from_const(1); // ensure unit stride. - // Define misc-loop function. Since stride is always 1, we - // ignore misc_stop. If point is in sub-domain for this - // bundle, then evaluate the reference scalar code. - // TODO: fix domain of scratch vars. -#define MISC_FN(misc_idxs) \ - do { \ - if (sg->is_in_valid_domain(misc_idxs.start)) \ - sg->calc_scalar(scratch_var_idx, misc_idxs.start); \ - } while(0) - // Scan through n-D space. 
TRACE_MSG("run_ref: step " << start_t << " in bundle '" << sg->get_name() << "': [" << misc_idxs.begin.make_val_str() << " ... " << misc_idxs.end.make_val_str() << ")"); -#include "yask_misc_loops.hpp" -#undef misc_fn + sg->calc_in_domain(scratch_var_idx, misc_idxs); + } // needed bundles. - // Mark vars that [may] have been written to. - // Mark vars as dirty even if not actually written by this - // rank. This is needed because neighbors will not know what - // vars are actually dirty, and all ranks must have the same - // information about which vars are possibly dirty. - update_vars(nullptr, start_t, stop_t, true); + // Mark vars that were updated in this rank. + asg->update_var_info(YkVarBase::self, start_t, true, false, false); - } // all bundles. + // Mark vars that *may* have been written to by any rank. + update_var_info(nullptr, start_t, stop_t, true, false); - } // iterations. + } // all bundles. + + } // iterations. steps_done += abs(end_t - begin_t); // Final halo exchange. exchange_halos(); run_time.stop(); + + // Restore offload setting. + KernelEnv::_use_offload = save_offload; + } // run_ref. // Eval stage(s) over var(s) using optimized code. void StencilContext::run_solution(idx_t first_step_index, idx_t last_step_index) { + TRACE_MSG("running steps " << first_step_index << " ... " << last_step_index); STATE_VARS(this); // User-provided code. call_2idx_hooks(_before_run_solution_hooks, first_step_index, last_step_index); - - // Start timer. + // Start main timer. run_time.start(); - // Start vtune collection. - VTUNE_RESUME; + // Since any APIs may have been called in other ranks, mark all + // neighbor vars as possibly dirty. + set_all_neighbor_vars_dirty(); // Determine step dir from order of first/last. idx_t step_dir = (last_step_index >= first_step_index) ? 1 : -1; @@ -201,7 +217,7 @@ namespace yask { // Find begin, stride and end in step-dim. idx_t begin_t = first_step_index; - // Stride-size in step-dim is number of region steps. 
+ // Stride-size in step-dim is number of mega-block steps. // Then, it is multipled by +/-1 to get proper direction. idx_t stride_t = max(wf_steps, idx_t(1)) * step_dir; assert(stride_t); @@ -217,10 +233,10 @@ namespace yask { end.set_vals(ext_bb.bb_end_tuple(domain_dims), false); end[step_posn] = end_t; IdxTuple stride(stencil_dims); - stride.set_vals(opts->_region_sizes, false); // stride by region sizes. + stride.set_vals(actl_opts->_mega_block_sizes, false); // stride by mega-block sizes. stride[step_posn] = stride_t; - TRACE_MSG("run_solution: [" << + TRACE_MSG("running area [" << begin.make_dim_val_str() << " ... " << end.make_dim_val_str() << ") by " << stride.make_dim_val_str()); @@ -228,145 +244,239 @@ namespace yask { THROW_YASK_EXCEPTION("Error: run_solution() called without calling prepare_solution() first"); if (ext_bb.bb_size < 1) { TRACE_MSG("nothing to do in solution"); - return; } + else { -#ifdef MODEL_CACHE - if (env.my_rank != env.msg_rank) - cache_model.disable(); - if (cache_model.is_enabled()) - os << "Modeling cache...\n"; -#endif - - // Adjust end points for overlapping regions due to wavefront angle. - // For each subsequent time step in a region, the spatial location - // of each block evaluation is shifted by the angle for each - // stage. So, the total shift in a region is the angle * num - // stages * num timesteps. This assumes all stages - // are inter-dependent to find maximum extension. Actual required - // size may be less, but this will just result in some calls to - // calc_region() that do nothing. - // - // Conceptually (showing 2 ranks in t and x dims): - // ----------------------------- t = rt ------------------------------ - // \ | \ \ \| \ | . | / | \ \ \| \ | - // \ | \ \ | \ | . | / \| \ \ | \ | - // \ |r0 \ r1 \ r2 |\ r3\ | . 
| /r0 | r1 \ r2 \ r3 |\ r4\ | - // \| \ \ | \ \| |/ |\ \ \ | \ \| - // ------------------------------ t = 0 ------------------------------- - // | rank 0 | | | | rank 1 | | - // x = begin[x] end[x] end[x] begin[x] begin[x] end[x] end[x] - // (rank) (rank) (ext) (ext) (rank) (rank) (adj) - // - // |XXXXXX| |XXXXX| <- redundant calculations. - // XXXXXX| <- areas outside of outer ranks not calculated -> |XXXXXXX - // - if (wf_steps > 0) { - DOMAIN_VAR_LOOP(i, j) { - - // The end should be adjusted only if an extension doesn't - // exist. Extentions exist between ranks, so additional - // adjustments are only needed at the end of the right-most - // rank in each dim. See "(adj)" in diagram above. - if (right_wf_exts[j] == 0) - end[i] += wf_shift_pts[j]; + // Copy vars to device as needed. The vars will be left updated + // on the device but not on the host after this call. Thus, if + // this function is called multiple times without accessing any + // data on the host, this should only trigger copying on the + // first call. + copy_vars_to_device(); + + #ifdef MODEL_CACHE + if (env.my_rank != env.msg_rank) + cache_model.disable(); + if (cache_model.is_enabled()) + os << "Modeling cache...\n"; + #endif + + // Adjust end points for overlapping mega-blocks due to wavefront angle. + // For each subsequent time step in a mega-block, the spatial location + // of each block evaluation is shifted by the angle for each + // stage. So, the total shift in a mega-block is the angle * num + // stages * num timesteps. This assumes all stages + // are inter-dependent to find maximum extension. Actual required + // size may be less, but this will just result in some calls to + // calc_mega_block() that do nothing. + // + // Conceptually (showing 2 ranks in t and x dims): + // ----------------------------- t = rt ------------------------------ + // \ | \ \ \| \ | . | / | \ \ \| \ | + // \ | \ \ | \ | . | / \| \ \ | \ | + // \ |r0 \ r1 \ r2 |\ r3\ | . 
| /r0 | r1 \ r2 \ r3 |\ r4\ | + // \| \ \ | \ \| |/ |\ \ \ | \ \| + // ------------------------------ t = 0 ------------------------------- + // | rank 0 | | | | rank 1 | | + // x = begin[x] end[x] end[x] begin[x] begin[x] end[x] end[x] + // (rank) (rank) (ext) (ext) (rank) (rank) (adj) + // + // |XXXXXX| |XXXXX| <- redundant calculations. + // XXXXXX| <- areas outside of outer ranks not calculated -> |XXXXXXX + // + if (wf_steps > 0) { + DOMAIN_VAR_LOOP_FAST(i, j) { + + // The end should be adjusted only if an extension doesn't + // exist. Extentions exist between ranks, so additional + // adjustments are only needed at the end of the right-most + // rank in each dim. See "(adj)" in diagram above. + if (right_wf_exts[j] == 0) + end[i] += wf_shift_pts[j]; + } } - } - - // If original region covered entire rank in a dim, set - // stride size to ensure only one stride is taken. - DOMAIN_VAR_LOOP(i, j) { - if (opts->_region_sizes[i] >= opts->_rank_sizes[i]) - stride[i] = end[i] - begin[i]; - } - TRACE_MSG("run_solution: after adjustment for " << num_wf_shifts << - " wave-front shift(s): [" << - begin.make_dim_val_str() << " ... " << - end.make_dim_val_str() << ") by " << - stride.make_dim_val_str()); - - // At this point, 'begin' and 'end' should describe the *max* range - // needed in the domain for this rank for the first time step. At - // any subsequent time step, this max may be shifted for temporal - // wavefronts or blocking. Also, for each time step, the *actual* - // range will be adjusted as needed before any actual stencil - // calculations are made. - - // Indices needed for the 'rank' loops. - ScanIndices rank_idxs(*dims, true, &rank_domain_offsets); - rank_idxs.begin = begin; - rank_idxs.end = end; - rank_idxs.stride = stride; - - // Make sure threads are set properly for a region. - set_region_threads(); - // Initial halo exchange. 
- exchange_halos(); + // At this point, 'begin' and 'end' should describe the *max* range + // needed in the domain for this rank for the first time step. At + // any subsequent time step, this max may be shifted for temporal + // wavefronts or blocking. Also, for each time step, the *actual* + // range will be adjusted as needed before any actual stencil + // calculations are made. + + // Indices needed for the 'rank' loops. + ScanIndices rank_idxs(true, &rank_domain_offsets); + rank_idxs.begin = begin; + rank_idxs.end = end; + rank_idxs.stride = stride; + rank_idxs.tile_size = actl_opts->_rank_tile_sizes; + rank_idxs.adjust_from_settings(actl_opts->_rank_sizes, + actl_opts->_rank_tile_sizes, + actl_opts->_mega_block_sizes); + TRACE_MSG("after adjustment for " << num_wf_shifts << + " wave-front shift(s): [" << + rank_idxs.begin.make_val_str() << " ... " << + rank_idxs.end.make_val_str() << ") by " << + rank_idxs.stride.make_val_str()); + + // Make sure threads are set properly for a mega-block. + set_num_outer_threads(); + + // Initial halo exchange. + exchange_halos(); + + // Number of iterations to get from begin_t to end_t-1, + // jumping by stride_t. + const idx_t num_t = CEIL_DIV(abs(end_t - begin_t), abs(stride_t)); + for (idx_t index_t = 0; index_t < num_t; index_t++) + { + // This value of index_t steps from start_t to stop_t-1. + const idx_t start_t = begin_t + (index_t * stride_t); + const idx_t stop_t = (stride_t > 0) ? + min(start_t + stride_t, end_t) : + max(start_t + stride_t, end_t); + idx_t this_num_t = abs(stop_t - start_t); + + // Set indices that will pass through generated code. + rank_idxs.index[step_posn] = index_t; + rank_idxs.start[step_posn] = start_t; + rank_idxs.stop[step_posn] = stop_t; + rank_idxs.stride[step_posn] = stride_t; + + // Start timer for auto-tuner. + _at.timer.start(); + + // If no wave-fronts (default), loop through stages here, and do + // only one stage at a time in calc_mega_block(). 
This is similar to + // loop in calc_rank_ref(), but with stages instead of bundles. + if (wf_steps == 0) { + + // Loop thru stages. + for (auto& bp : st_stages) { - // Number of iterations to get from begin_t to end_t-1, - // jumping by stride_t. - const idx_t num_t = CEIL_DIV(abs(end_t - begin_t), abs(stride_t)); - for (idx_t index_t = 0; index_t < num_t; index_t++) - { - // This value of index_t steps from start_t to stop_t-1. - const idx_t start_t = begin_t + (index_t * stride_t); - const idx_t stop_t = (stride_t > 0) ? - min(start_t + stride_t, end_t) : - max(start_t + stride_t, end_t); - idx_t this_num_t = abs(stop_t - start_t); + // Check step. + if (check_step_conds && !bp->is_in_valid_step(start_t)) { + TRACE_MSG("step " << start_t << + " not valid for stage '" << + bp->get_name() << "'"); + continue; + } - // Set indices that will pass through generated code. - rank_idxs.index[step_posn] = index_t; - rank_idxs.start[step_posn] = start_t; - rank_idxs.stop[step_posn] = stop_t; - rank_idxs.stride[step_posn] = stride_t; + // Do MPI-external parts separately? + if (mpi_interior.bb_valid) { + do_mpi_interior = false; + + // Overlap comms and computation by restricting + // mega-block boundaries. Make an external pass for + // each side of each domain dim, e.g., 'left x', + // 'right x', 'left y', ... + DOMAIN_VAR_LOOP(i, j) { + for (bool is_left : { true, false }) { + + // Skip if no halo to calculate in this + // section. + if (!does_exterior_exist(j, is_left)) + continue; + + // Set the proper flags to indicate what + // section we're working on. + do_mpi_left = is_left; + do_mpi_right = !is_left; + mpi_exterior_dim = j; + + // Include automatically-generated loop + // code to call calc_mega_block() for + // each mega-block. The mega-block will be trimmed + // to the active MPI exterior section. + TRACE_MSG("step " << start_t << + " for stage '" << bp->get_name() << + "' in MPI exterior " << + (is_left ? 
"left-" : "right-") << + domain_dims.get_dim_name(j)); + + // Loop prefix. + #define RANK_LOOP_INDICES rank_idxs + #define RANK_BODY_INDICES mega_block_range + #define RANK_USE_LOOP_PART_0 + #include "yask_rank_loops.hpp" + + // Loop body. + calc_mega_block(bp, mega_block_range); + + // Loop suffix. + #define RANK_USE_LOOP_PART_1 + #include "yask_rank_loops.hpp" + + } // left/right. + } // domain dims. + + // Mark vars that *may* have been written to by + // this stage by any rank. Mark vars as dirty + // even if not actually written by this rank, + // perhaps due to sub-domains or asymmetrical + // stencils. This is needed because neighbors + // will not know what vars are actually dirty, + // and all ranks must have the same information + // about which vars are possibly dirty. + update_var_info(bp, start_t, stop_t, true); + + // Do the appropriate steps for halo exchange of exterior. + do_mpi_left = do_mpi_right = true; + exchange_halos(); + + // Do interior only in next pass. + do_mpi_left = do_mpi_right = false; + do_mpi_interior = true; + + } // Exterior only for overlapping comms. + + // Include automatically-generated loop code to call + // calc_mega_block() for each mega-block. If overlapping + // comms, this will be just the interior. If not, it + // will cover the whole rank. + TRACE_MSG("step " << start_t << + " for stage '" << bp->get_name() << "'"); + + // Loop prefix. + #define RANK_LOOP_INDICES rank_idxs + #define RANK_BODY_INDICES mega_block_range + #define RANK_USE_LOOP_PART_0 + #include "yask_rank_loops.hpp" + + // Loop body. + calc_mega_block(bp, mega_block_range); + + // Loop suffix. + #define RANK_USE_LOOP_PART_1 + #include "yask_rank_loops.hpp" + + // Mark as dirty only if we just did exterior. + bool mark_dirty = do_mpi_left || do_mpi_right; + update_var_info(bp, start_t, stop_t, mark_dirty); + + // Do the appropriate steps for halo exchange depending + // on 'do_mpi_*' flags. + exchange_halos(); - // Start timer for auto-tuner. 
- _at.timer.start(); + // Set the overlap flags back to default. + init_mpi_flags(); - // If no wave-fronts (default), loop through stages here, and do - // only one stage at a time in calc_region(). This is similar to - // loop in calc_rank_ref(), but with stages instead of bundles. - if (wf_steps == 0) { + } // stages. + } // No WF tiling. - // Loop thru stages. - for (auto& bp : st_stages) { + // If doing wave-fronts, must loop through all stages in + // calc_mega_block(). + else { - // Check step. - if (check_step_conds && !bp->is_in_valid_step(start_t)) { - TRACE_MSG("run_solution: step " << start_t << - " not valid for stage '" << - bp->get_name() << "'"); - continue; - } + // Null ptr => Eval all stages each time + // calc_mega_block() is called. + StagePtr bp; - // Do MPI-external passes? + // Do MPI-external parts separately? if (mpi_interior.bb_valid) { do_mpi_interior = false; - // Old overlap method calculates full blocks in exterior - // and then in interior. Only works without WF tiling. - // Also, if blocks are too big, then the interior is - // too small. For now, keeping code for perf comparison. -#ifdef OVERLAP_WITH_BLOCKS - mpi_exterior_dim = -1; // indicate block method. - - // Overlap comms and computation at a block granularity. - // Set both left and right exterior flags. - do_mpi_left = do_mpi_right = true; - - // Include automatically-generated loop code that calls - // calc_region(bp) for each region. - TRACE_MSG("run_solution: step " << start_t << - " for stage '" << bp->get_name() << "' in MPI exterior"); -#include "yask_rank_loops.hpp" - -#else - mpi_exterior_dim = 0; - // Overlap comms and computation by restricting - // region boundaries. Make an external pass for + // mega-block boundaries. Make an external pass for // each side of each domain dim, e.g., 'left x', // 'right x', 'left y', ... 
DOMAIN_VAR_LOOP(i, j) { @@ -384,181 +494,127 @@ namespace yask { mpi_exterior_dim = j; // Include automatically-generated loop - // code that calls calc_region(bp) for - // each region. The region will be trimmed + // code to call calc_mega_block(bp) for + // each mega-block. The mega-block will be trimmed // to the active MPI exterior section. - TRACE_MSG("run_solution: step " << start_t << - " for stage '" << bp->get_name() << - "' in MPI exterior dim " << j << - " on the " << (is_left ? "left" : "right")); -#include "yask_rank_loops.hpp" + TRACE_MSG("WF steps [" << start_t << + " ... " << stop_t << + ") in MPI exterior " << + (is_left ? "left-" : "right-") << + domain_dims.get_dim_name(j)); + + // Loop prefix. + #define RANK_LOOP_INDICES rank_idxs + #define RANK_BODY_INDICES mega_block_range + #define RANK_USE_LOOP_PART_0 + #include "yask_rank_loops.hpp" + + // Loop body. + calc_mega_block(bp, mega_block_range); + + // Loop suffix. + #define RANK_USE_LOOP_PART_1 + #include "yask_rank_loops.hpp" + } // left/right. } // domain dims. -#endif - - // Mark vars that [may] have been written to by - // this stage. Mark vars as dirty even if not - // actually written by this rank, perhaps due to - // sub-domains or asymmetrical stencils. This is - // needed because neighbors will not know what vars - // are actually dirty, and all ranks must have the - // same information about which vars are possibly - // dirty. TODO: make this smarter to save unneeded - // MPI exchanges. - update_vars(bp, start_t, stop_t, true); + + // Mark vars dirty for all stages. + update_var_info(bp, start_t, stop_t, true); // Do the appropriate steps for halo exchange of exterior. - // TODO: exchange halo for each dim as soon as it's done. do_mpi_left = do_mpi_right = true; exchange_halos(); // Do interior only in next pass. do_mpi_left = do_mpi_right = false; do_mpi_interior = true; - } // Overlapping. 
- // Include automatically-generated loop code that calls - // calc_region(bp) for each region. If overlapping + } // Exterior only for overlapping comms. + + // Include automatically-generated loop code to call + // calc_mega_block() for each mega-block. If overlapping // comms, this will be just the interior. If not, it // will cover the whole rank. - TRACE_MSG("run_solution: step " << start_t << - " for stage '" << bp->get_name() << "'"); -#include "yask_rank_loops.hpp" - - // Mark as dirty only if we did exterior. + TRACE_MSG("steps [" << start_t << + " ... " << stop_t << ")"); + + // Loop prefix. + #define RANK_LOOP_INDICES rank_idxs + #define RANK_BODY_INDICES mega_block_range + #define RANK_USE_LOOP_PART_0 + #include "yask_rank_loops.hpp" + + // Loop body. + calc_mega_block(bp, mega_block_range); + + // Loop suffix. + #define RANK_USE_LOOP_PART_1 + #include "yask_rank_loops.hpp" + + // Mark as dirty only if we just did exterior. bool mark_dirty = do_mpi_left || do_mpi_right; - update_vars(bp, start_t, stop_t, mark_dirty); + update_var_info(bp, start_t, stop_t, mark_dirty); // Do the appropriate steps for halo exchange depending // on 'do_mpi_*' flags. exchange_halos(); // Set the overlap flags back to default. - do_mpi_interior = do_mpi_left = do_mpi_right = true; + init_mpi_flags(); - } // stages. - } // No WF tiling. + } // With WF tiling. - // If doing wave-fronts, must loop through all stages in - // calc_region(). - else { + // Overall steps. + steps_done += this_num_t; - // Null ptr => Eval all stages each time - // calc_region() is called. - StagePtr bp; - - // Do MPI-external passes? - if (mpi_interior.bb_valid) { - do_mpi_interior = false; - mpi_exterior_dim = 0; - - // Overlap comms and computation by restricting - // region boundaries. Make an external pass for - // each side of each domain dim, e.g., 'left x', - // 'right x', 'left y', ... 
- DOMAIN_VAR_LOOP(i, j) { - for (bool is_left : { true, false }) { - - // Skip if no halo to calculate in this - // section. - if (!does_exterior_exist(j, is_left)) - continue; - - // Set the proper flags to indicate what - // section we're working on. - do_mpi_left = is_left; - do_mpi_right = !is_left; - mpi_exterior_dim = j; - - // Include automatically-generated loop - // code that calls calc_region(bp) for - // each region. The region will be trimmed - // to the active MPI exterior section. - TRACE_MSG("run_solution: steps [" << start_t << - " ... " << stop_t << - ") in MPI exterior dim " << j << - " on the " << (is_left ? "left" : "right")); -#include "yask_rank_loops.hpp" - } // left/right. - } // domain dims. - - // Mark vars dirty for all stages. - update_vars(bp, start_t, stop_t, true); - - // Do the appropriate steps for halo exchange of exterior. - // TODO: exchange halo for each dim as soon as it's done. - do_mpi_left = do_mpi_right = true; - exchange_halos(); - - // Do interior only in next pass. - do_mpi_left = do_mpi_right = false; - do_mpi_interior = true; - } // Overlapping. - - // Include automatically-generated loop code that calls - // calc_region(bp) for each region. If overlapping - // comms, this will be just the interior. If not, it - // will cover the whole rank. - TRACE_MSG("run_solution: steps [" << start_t << - " ... " << stop_t << ")"); -#include "yask_rank_loops.hpp" - - // Mark as dirty only if we did exterior. - bool mark_dirty = do_mpi_left || do_mpi_right; - update_vars(bp, start_t, stop_t, mark_dirty); - - // Do the appropriate steps for halo exchange depending - // on 'do_mpi_*' flags. - exchange_halos(); - - // Set the overlap flags back to default. - do_mpi_interior = do_mpi_left = do_mpi_right = true; - - } // With WF tiling. - - // Overall steps. - steps_done += this_num_t; - - // Count steps for each stage to properly account for - // step conditions when using temporal tiling. 
- for (auto& bp : st_stages) { - idx_t num_stage_steps = 0; + // Count steps for each stage to properly account for + // step conditions when using temporal tiling. + for (auto& bp : st_stages) { + idx_t num_stage_steps = 0; - if (!check_step_conds) - num_stage_steps = this_num_t; - else { + if (!check_step_conds) + num_stage_steps = this_num_t; + else { - // Loop through each step. - assert(abs(step_dir) == 1); - for (idx_t t = start_t; t != stop_t; t += step_dir) { + // Loop through each step. + assert(abs(step_dir) == 1); + for (idx_t t = start_t; t != stop_t; t += step_dir) { - // Check step cond for this t. - if (bp->is_in_valid_step(t)) - num_stage_steps++; + // Check step cond for this t. + if (bp->is_in_valid_step(t)) + num_stage_steps++; + } } + + // Add to steps done for this stage. + bp->add_steps(num_stage_steps); } - // Count steps for this stage. - bp->add_steps(num_stage_steps); + // Call the auto-tuner to evaluate these steps and change + // settings when enough time has passed. + // FIXME: in-situ AT will not work properly with temporal conditions + // because not all sequences of N steps will do the same amount of work. + auto this_time = _at.timer.stop(); + _at.steps_done += this_num_t; + eval_auto_tuner(); + TRACE_MSG("did " << this_num_t << " step(s) in " << this_time << " secs."); + + } // step loop. + + #ifdef MODEL_CACHE + // Print cache stats, then disable. + // Thus, cache is only modeled for first call. + if (cache_model.is_enabled()) { + os << "Done modeling cache...\n"; + cache_model.dump_stats(); + cache_model.disable(); } + #endif - // Call the auto-tuner to evaluate these steps. - eval_auto_tuner(this_num_t); - - } // step loop. - -#ifdef MODEL_CACHE - // Print cache stats, then disable. - // Thus, cache is only modeled for first call. - if (cache_model.is_enabled()) { - os << "Done modeling cache...\n"; - cache_model.dump_stats(); - cache_model.disable(); - } -#endif - - // Stop vtune collection & timer. 
- VTUNE_PAUSE; + } // Something to do. + + // Stop timer. run_time.stop(); // User-provided code. @@ -567,60 +623,61 @@ namespace yask { } // run_solution(). - // Calculate results within a region. Each region is typically computed + // Calculate results within a mega-block. Each mega-block is typically computed // in a separate OpenMP 'for' region. In this function, we loop over // the time steps and stages and evaluate a stage in each of - // the blocks in the region. If 'sel_bp' is null, eval all stages; else + // the blocks in the mega-block. If 'sel_bp' is null, eval all stages; else // eval only the one pointed to. - void StencilContext::calc_region(StagePtr& sel_bp, + void StencilContext::calc_mega_block(StagePtr& sel_bp, const ScanIndices& rank_idxs) { STATE_VARS(this); - TRACE_MSG("calc_region: region [" << + TRACE_MSG("calc_mega_block: mega-block [" << rank_idxs.start.make_val_str() << " ... " << - rank_idxs.stop.make_val_str() << ") within rank [" << + rank_idxs.stop.make_val_str() << ") within possibly-adjusted rank [" << rank_idxs.begin.make_val_str() << " ... " << - rank_idxs.end.make_val_str() << ")" ); + rank_idxs.end.make_val_str() << ") for " << + make_mpi_section_descr()); - // Track time (use "else" to avoid double-counting). + // Track time separately for MPI exterior and interior. if (!do_mpi_interior && (do_mpi_left || do_mpi_right)) ext_time.start(); else int_time.start(); - // Init region begin & end from rank start & stop indices. - ScanIndices region_idxs(*dims, true, &rank_domain_offsets); - region_idxs.init_from_outer(rank_idxs); + // Init mega-block begin & end from rank start & stop indices. + ScanIndices mega_block_idxs = rank_idxs.create_inner(); // Time range. // When doing WF rank tiling, this loop will stride through - // several time-steps in each region. + // several time-steps in each mega-block. // When also doing TB, it will stride by the block strides. 
- idx_t begin_t = region_idxs.begin[step_posn]; - idx_t end_t = region_idxs.end[step_posn]; + idx_t begin_t = mega_block_idxs.begin[step_posn]; + idx_t end_t = mega_block_idxs.end[step_posn]; idx_t step_dir = (end_t >= begin_t) ? 1 : -1; idx_t stride_t = max(tb_steps, idx_t(1)) * step_dir; assert(stride_t); const idx_t num_t = CEIL_DIV(abs(end_t - begin_t), abs(stride_t)); // Time loop. - idx_t region_shift_num = 0; + idx_t mega_block_shift_num = 0; for (idx_t index_t = 0; index_t < num_t; index_t++) { // This value of index_t steps from start_t to stop_t-1. + // Be sure to handle reverse steps. const idx_t start_t = begin_t + (index_t * stride_t); const idx_t stop_t = (stride_t > 0) ? min(start_t + stride_t, end_t) : max(start_t + stride_t, end_t); // Set step indices that will pass through generated code. - region_idxs.index[step_posn] = index_t; - region_idxs.start[step_posn] = start_t; - region_idxs.stop[step_posn] = stop_t; + mega_block_idxs.index[step_posn] = index_t; + mega_block_idxs.start[step_posn] = start_t; + mega_block_idxs.stop[step_posn] = stop_t; // If no temporal blocking (default), loop through stages here, // and do only one stage at a time in calc_block(). If there is // no WF blocking either, the stage loop body will only execute - // with one active stage, and 'region_shift_num' will never be > 0. + // with one active stage, and 'mega_block_shift_num' will never be > 0. if (tb_steps == 0) { // Stages to evaluate at this time step. @@ -630,60 +687,68 @@ namespace yask { if (sel_bp && sel_bp != bp) continue; - TRACE_MSG("calc_region: no TB; stage '" << + TRACE_MSG("no TB; stage '" << bp->get_name() << "' in step(s) [" << start_t << " ... " << stop_t << ")"); // Check step. if (check_step_conds && !bp->is_in_valid_step(start_t)) { - TRACE_MSG("calc_region: step " << start_t << + TRACE_MSG("step " << start_t << " not valid for stage '" << bp->get_name() << "'"); continue; } - // Strides within a region are based on stage block sizes. 
+ // Strides within a mega-block are based on stage block sizes. + // These may be different when per-stage auto-tuning has been done. auto& settings = bp->get_active_settings(); - region_idxs.stride = settings._block_sizes; - region_idxs.stride[step_posn] = stride_t; - - // Groups in region loops are based on block-group sizes. - region_idxs.group_size = settings._block_group_sizes; - - // Set region_idxs begin & end based on shifted rank - // start & stop (original region begin & end), rank + mega_block_idxs.stride = settings._block_sizes; + mega_block_idxs.stride[step_posn] = stride_t; + + // Tiles in mega-block loops. + mega_block_idxs.tile_size = settings._mega_block_tile_sizes; + + // Set mega_block_idxs begin & end based on shifted rank + // start & stop (original mega-block begin & end), rank // boundaries, and stage BB. This will be the base of the - // region loops. - bool ok = shift_region(rank_idxs.start, rank_idxs.stop, - region_shift_num, bp, - region_idxs); - - DOMAIN_VAR_LOOP(i, j) { - - // If there is only one blk in a region, make sure - // this blk fills this whole region. - if (settings._block_sizes[i] >= settings._region_sizes[i]) - region_idxs.stride[i] = region_idxs.end[i] - region_idxs.begin[i]; - } - - // Only need to loop through the span of the region if it is + // mega-block loops. + bool ok = shift_mega_block(rank_idxs.start, rank_idxs.stop, + mega_block_shift_num, bp, + mega_block_idxs); + mega_block_idxs.adjust_from_settings(settings._mega_block_sizes, + settings._mega_block_tile_sizes, + settings._block_sizes); + + // Only need to loop through the span of the mega-block if it is // at least partly inside the extended BB. For overlapping - // regions, they may start outside the domain but enter the + // mega-blocks, they may start outside the domain but enter the // domain as time progresses and their boundaries shift. So, // we don't want to return if this condition isn't met. if (ok) { idx_t nphases = 1; // Only 1 phase w/o TB. 
idx_t phase = 0; - // Include automatically-generated loop code that - // calls calc_block() for each block in this region. + // Include automatically-generated loop code to + // call calc_block() for each block in this mega-block. // Loops through x from begin_rx to end_rx-1; // similar for y and z. This code typically // contains the outer OpenMP loop(s). -#include "yask_region_loops.hpp" + + // Loop prefix. + #define MEGA_BLOCK_LOOP_INDICES mega_block_idxs + #define MEGA_BLOCK_BODY_INDICES blk_range + #define MEGA_BLOCK_USE_LOOP_PART_0 + #include "yask_mega_block_loops.hpp" + + // Loop body. + calc_block(bp, mega_block_shift_num, nphases, phase, rank_idxs, blk_range); + + // Loop suffix. + #define MEGA_BLOCK_USE_LOOP_PART_1 + #include "yask_mega_block_loops.hpp" } // Need to shift for next stage and/or time. - region_shift_num++; + mega_block_shift_num++; } // stages. } // no temporal blocking. @@ -691,57 +756,66 @@ namespace yask { // If using TB, iterate thru steps in a WF and stages in calc_block(). else { - TRACE_MSG("calc_region: w/TB in step(s) [" << + TRACE_MSG("calc_mega_block: w/TB in step(s) [" << start_t << " ... " << stop_t << ")"); // Null ptr => Eval all stages each time // calc_block() is called. StagePtr bp; - // Strides within a region are based on rank block sizes. - auto& settings = *opts; - region_idxs.stride = settings._block_sizes; - region_idxs.stride[step_posn] = stride_t; - - // Groups in region loops are based on block-group sizes. - region_idxs.group_size = settings._block_group_sizes; - - // Set region_idxs begin & end based on shifted start & stop - // and rank boundaries. This will be the base of the region - // loops. The bounds in region_idxs may be outside the + // Strides within a mega-block are based on rank block sizes. + // Cannot use different strides per stage with TB. 
+ auto& settings = *actl_opts; + mega_block_idxs.stride = settings._block_sizes; + mega_block_idxs.stride[step_posn] = stride_t; + + // Tiles in mega-block loops. + mega_block_idxs.tile_size = settings._mega_block_tile_sizes; + + // Set mega_block_idxs begin & end based on shifted start & stop + // and rank boundaries. This will be the base of the mega-block + // loops. The bounds in mega_block_idxs may be outside the // actual rank because we're starting with the expanded rank. - bool ok = shift_region(rank_idxs.start, rank_idxs.stop, - region_shift_num, bp, - region_idxs); + bool ok = shift_mega_block(rank_idxs.start, rank_idxs.stop, + mega_block_shift_num, bp, + mega_block_idxs); + mega_block_idxs.adjust_from_settings(settings._mega_block_sizes, + settings._mega_block_tile_sizes, + settings._block_sizes); // Should always be valid because we just shifted (no trim). - // Trimming will be done at the mini-block level. + // Trimming will be done at the micro-block level. assert(ok); - DOMAIN_VAR_LOOP(i, j) { - - // If original blk covered entire region, reset stride. - if (settings._block_sizes[i] >= settings._region_sizes[i]) - region_idxs.stride[i] = region_idxs.end[i] - region_idxs.begin[i]; - } - // To tesselate n-D domain space, we use n+1 distinct - // "phases". For example, 1-D TB uses "upward" triangles - // and "downward" triangles. Region threads sync after every - // phase. Thus, the phase loop is here around the generated - // OMP loops. TODO: schedule phases and their shapes via task - // dependencies. + // "phases". For example, 1-D TB uses "upward" trapezoids + // and "downward" trapezoids. Outer OMP threads sync after + // every phase. Thus, the phase loop is here around the + // generated OMP loops. TODO: schedule phases and their + // shapes via task dependencies. idx_t nphases = nddims + 1; for (idx_t phase = 0; phase < nphases; phase++) { // Call calc_block() on every block concurrently. 
Only // the shapes corresponding to the current 'phase' will // be calculated. -#include "yask_region_loops.hpp" + + // Loop prefix. + #define MEGA_BLOCK_LOOP_INDICES mega_block_idxs + #define MEGA_BLOCK_BODY_INDICES blk_range + #define MEGA_BLOCK_USE_LOOP_PART_0 + #include "yask_mega_block_loops.hpp" + + // Loop body. + calc_block(bp, mega_block_shift_num, nphases, phase, rank_idxs, blk_range); + + // Loop suffix. + #define MEGA_BLOCK_USE_LOOP_PART_1 + #include "yask_mega_block_loops.hpp" } // Loop thru stages that were evaluated in - // these 'tb_steps' to increment shift for next region + // these 'tb_steps' to increment shift for next mega-block // "layer", if any. This is needed when there are more WF // steps than TB steps. TODO: consider moving this inside // calc_block(). @@ -753,7 +827,7 @@ namespace yask { continue; // One shift for each stage in each TB step. - region_shift_num++; + mega_block_shift_num++; } } } // with temporal blocking. @@ -761,84 +835,45 @@ namespace yask { if (!do_mpi_interior && (do_mpi_left || do_mpi_right)) { double ext_delta = ext_time.stop(); - TRACE_MSG("secs spent in this region for rank-exterior blocks: " << make_num_str(ext_delta)); + TRACE_MSG("secs spent in this mega-block for rank-exterior blocks: " << make_num_str(ext_delta)); } else { double int_delta = int_time.stop(); - TRACE_MSG("secs spent in this region for rank-interior blocks: " << make_num_str(int_delta)); + TRACE_MSG("secs spent in this mega-block for rank-interior blocks: " << make_num_str(int_delta)); } - } // calc_region. + } // calc_mega_block. // Calculate results within a block. This function calls - // 'calc_mini_block()' for the specified stage or all stages if 'sel_bp' + // 'calc_micro_block()' for the specified stage or all stages if 'sel_bp' // is null. When using TB, only the shape(s) needed for the tesselation // 'phase' are computed. Typically called by a top-level OMP thread - // from calc_region(). + // from calc_mega_block(). 
void StencilContext::calc_block(StagePtr& sel_bp, - idx_t region_shift_num, + idx_t mega_block_shift_num, idx_t nphases, idx_t phase, const ScanIndices& rank_idxs, - const ScanIndices& region_idxs) { + const ScanIndices& mega_block_idxs) { STATE_VARS(this); auto* bp = sel_bp.get(); - int region_thread_idx = omp_get_thread_num(); + int outer_thread_idx = omp_get_thread_num(); TRACE_MSG("calc_block: phase " << phase << ", block [" << - region_idxs.start.make_val_str() << " ... " << - region_idxs.stop.make_val_str() << - ") within region [" << - region_idxs.begin.make_val_str() << " ... " << - region_idxs.end.make_val_str() << - ") by region thread " << region_thread_idx); - -#ifdef OVERLAP_WITH_BLOCKS - // If we are not calculating some of the blocks, determine - // whether this block is *completely* inside the interior. - // A block even partially in the exterior is not considered - // "inside". - if (is_overlap_active()) { - - // Starting point and ending point must be in BB. - bool inside = true; - DOMAIN_VAR_LOOP(i, j) { + mega_block_idxs.start.make_val_str() << " ... " << + mega_block_idxs.stop.make_val_str() << + ") within mega-block [" << + mega_block_idxs.begin.make_val_str() << " ... " << + mega_block_idxs.end.make_val_str() << + ") by mega-block thread " << outer_thread_idx); - // Starting before beginning of interior? - if (region_idxs.start[i] < mpi_interior.bb_begin[j]) - inside = false; - - // Stopping after ending of interior? - if (region_idxs.stop[i] > mpi_interior.bb_end[j]) - inside = false; - } - if (do_mpi_interior) { - if (inside) - TRACE_MSG(" calculating because block is interior"); - else { - TRACE_MSG(" *not* calculating because block is exterior"); - return; - } - } - if (do_mpi_left || do_mpi_right) { - if (!inside) - TRACE_MSG(" calculating because block is exterior"); - else { - TRACE_MSG(" *not* calculating because block is interior"); - return; - } - } - } -#endif - - // Init block begin & end from region start & stop indices. 
- ScanIndices block_idxs(*dims, true); - block_idxs.init_from_outer(region_idxs); + // Init block begin & end from mega-block start & stop indices. + ScanIndices block_idxs = mega_block_idxs.create_inner(); // Time range. // When not doing TB, there is only one step. // When doing TB, we will only do one iteration here // that covers all steps, - // and calc_mini_block() will loop over all steps. + // and calc_micro_block() will loop over all steps. idx_t begin_t = block_idxs.begin[step_posn]; idx_t end_t = block_idxs.end[step_posn]; idx_t step_dir = (end_t >= begin_t) ? 1 : -1; @@ -849,7 +884,7 @@ namespace yask { // If TB is not being used, just process the given stage. // No need for a time loop. // No need to check bounds, because they were checked in - // calc_region() when not using TB. + // calc_mega_block() when not using TB. if (tb_steps == 0) { assert(bp); assert(abs(stride_t) == 1); @@ -861,13 +896,13 @@ namespace yask { block_idxs.start[step_posn] = begin_t; block_idxs.stop[step_posn] = end_t; - // Strides within a block are based on stage mini-block sizes. + // Strides within a block are based on stage micro-block sizes. auto& settings = bp->get_active_settings(); - block_idxs.stride = settings._mini_block_sizes; + block_idxs.stride = settings._micro_block_sizes; block_idxs.stride[step_posn] = stride_t; - // Groups in block loops are based on mini-block-group sizes. - block_idxs.group_size = settings._mini_block_group_sizes; + // Tiles in block loops. + block_idxs.tile_size = settings._block_tile_sizes; // Default settings for no TB. StagePtr bp = sel_bp; @@ -875,12 +910,29 @@ namespace yask { idx_t nshapes = 1; idx_t shape = 0; idx_t shift_num = 0; - BridgeMask bridge_mask; + bit_mask_t bridge_mask = 0; ScanIndices adj_block_idxs = block_idxs; - - // Include automatically-generated loop code that - // calls calc_mini_block() for each mini-block in this block. 
-#include "yask_block_loops.hpp" + adj_block_idxs.adjust_from_settings(settings._block_sizes, + settings._block_tile_sizes, + settings._micro_block_sizes); + + // Include automatically-generated loop code to + // call calc_micro_block() for each micro-block in this block. + + // Loop prefix. + #define BLOCK_LOOP_INDICES adj_block_idxs + #define BLOCK_BODY_INDICES micro_blk_range + #define BLOCK_USE_LOOP_PART_0 + #include "yask_block_loops.hpp" + + // Loop body. + calc_micro_block(outer_thread_idx, bp, mega_block_shift_num, + nphases, phase, nshapes, shape, bridge_mask, + rank_idxs, mega_block_idxs, block_idxs, micro_blk_range); + + // Loop suffix. + #define BLOCK_USE_LOOP_PART_1 + #include "yask_block_loops.hpp" } // no TB. // If TB is active, loop thru each required shape. @@ -893,47 +945,46 @@ namespace yask { // for each combination of domain dims. E.g., need 'x' and // 'y' bridges for 2D problem in phase 1. idx_t nshapes = n_choose_k(nddims, phase); - BridgeMask bridge_mask(nddims, false); + bit_mask_t bridge_mask = 0; // Set temporal indices to full range. block_idxs.index[step_posn] = 0; // only one index. block_idxs.start[step_posn] = begin_t; block_idxs.stop[step_posn] = end_t; - // Strides within a block are based on rank mini-block sizes. - auto& settings = *opts; - block_idxs.stride = settings._mini_block_sizes; + // Strides within a block are based on rank micro-block sizes. + auto& settings = *actl_opts; + block_idxs.stride = settings._micro_block_sizes; block_idxs.stride[step_posn] = step_dir; - // Groups in block loops are based on mini-block-group sizes. - block_idxs.group_size = settings._mini_block_group_sizes; + // Tiles in block loops. + block_idxs.tile_size = settings._block_tile_sizes; // Increase range of block to cover all phases and // shapes. ScanIndices adj_block_idxs = block_idxs; - DOMAIN_VAR_LOOP(i, j) { + DOMAIN_VAR_LOOP_FAST(i, j) { // TB shapes can extend to the right only. 
They can // cover a range as big as this block's base plus the // next block in all dims, so we add the width of the // current block to the end. This makes the adjusted - // blocks overlap, but the size of each mini-block is + // blocks overlap, but the size of each micro-block is // trimmed at each step to the proper active size. // TODO: find a way to make this more efficient to avoid - // calling calc_mini_block() many times with nothing to + // calling calc_micro_block() many times with nothing to // do. - auto width = region_idxs.stop[i] - region_idxs.start[i]; + auto width = mega_block_idxs.stop[i] - mega_block_idxs.start[i]; adj_block_idxs.end[i] += width; - - // If original MB covers a whole block, reset stride. - if (settings._mini_block_sizes[i] >= settings._block_sizes[i]) - adj_block_idxs.stride[i] = adj_block_idxs.end[i] - adj_block_idxs.begin[i]; } + adj_block_idxs.adjust_from_settings(settings._block_sizes, + settings._block_tile_sizes, + settings._micro_block_sizes); TRACE_MSG("calc_block: phase " << phase << ", adjusted block [" << adj_block_idxs.begin.make_val_str() << " ... " << adj_block_idxs.end.make_val_str() << - ") with mini-block stride " << + ") with micro-block stride " << adj_block_idxs.stride.make_val_str()); // Loop thru shapes. @@ -941,76 +992,81 @@ namespace yask { // Get 'shape'th combo of 'phase' things from 'nddims'. // These will be used to create bridge shapes. - auto dims_to_bridge = n_choose_k_set(nddims, phase, shape); - - // Set bits for selected dims. - DOMAIN_VAR_LOOP(i, j) - bridge_mask.at(j) = false; - for (int i = 0; i < phase; i++) { - auto dim = dims_to_bridge[i]; - bridge_mask.at(dim) = true; - } + bridge_mask = n_choose_k_set(nddims, phase, shape); // Can only be one time iteration here when doing TB - // because mini-block temporal size is always same + // because micro-block temporal size is always same // as block temporal size. 
assert(num_t == 1); - // Include automatically-generated loop code that calls - // calc_mini_block() for each mini-block in this block. + // Include automatically-generated loop code to call + // calc_micro_block() for each micro-block in this block. StagePtr bp; // null. -#include "yask_block_loops.hpp" + // Loop prefix. + #define BLOCK_LOOP_INDICES adj_block_idxs + #define BLOCK_BODY_INDICES micro_blk_range + #define BLOCK_USE_LOOP_PART_0 + #include "yask_block_loops.hpp" + + // Loop body. + calc_micro_block(outer_thread_idx, bp, mega_block_shift_num, + nphases, phase, nshapes, shape, bridge_mask, + rank_idxs, mega_block_idxs, block_idxs, micro_blk_range); + + // Loop suffix. + #define BLOCK_USE_LOOP_PART_1 + #include "yask_block_loops.hpp" + } // shape loop. } // TB. } // calc_block(). - // Calculate results within a mini-block. - // This function calls 'StencilBundleBase::calc_mini_block()' + // Calculate results within a micro-block. + // This function calls 'StencilBundleBase::calc_micro_block()' // for each bundle in the specified stage or all stages if 'sel_bp' is // null. When using TB, only the 'shape' needed for the tesselation // 'phase' are computed. The starting 'shift_num' is relative - // to the bottom of the current region and block. - void StencilContext::calc_mini_block(int region_thread_idx, + // to the bottom of the current mega-block and block. 
+ void StencilContext::calc_micro_block(int outer_thread_idx, StagePtr& sel_bp, - idx_t region_shift_num, + idx_t mega_block_shift_num, idx_t nphases, idx_t phase, idx_t nshapes, idx_t shape, - const BridgeMask& bridge_mask, + const bit_mask_t& bridge_mask, const ScanIndices& rank_idxs, - const ScanIndices& base_region_idxs, + const ScanIndices& base_mega_block_idxs, const ScanIndices& base_block_idxs, const ScanIndices& adj_block_idxs) { STATE_VARS(this); - TRACE_MSG("calc_mini_block: phase " << phase << + TRACE_MSG("calc_micro_block: phase " << phase << ", shape " << shape << - ", mini-block [" << + ", micro-block [" << adj_block_idxs.start.make_val_str() << " ... " << adj_block_idxs.stop.make_val_str() << ") within base-block [" << base_block_idxs.begin.make_val_str() << " ... " << - base_block_idxs.end.make_val_str() << ") within base-region [" << - base_region_idxs.begin.make_val_str() << " ... " << - base_region_idxs.end.make_val_str() << - ") by region thread " << region_thread_idx); + base_block_idxs.end.make_val_str() << ") within base-mega-block [" << + base_mega_block_idxs.begin.make_val_str() << " ... " << + base_mega_block_idxs.end.make_val_str() << + ") by mega-block thread " << outer_thread_idx); // Promote forward progress in MPI when calc'ing interior // only. Call from one thread only. // Let all other threads continue. if (is_overlap_active() && do_mpi_interior) { - if (region_thread_idx == 0) - poke_halo_exchange(); + if (outer_thread_idx == 0) + adv_halo_exchange(); } - // Init mini-block begin & end from blk start & stop indices. - ScanIndices mini_block_idxs(*dims, true); - mini_block_idxs.init_from_outer(adj_block_idxs); + // Init micro-block begin & end from blk start & stop indices. + ScanIndices micro_block_idxs = adj_block_idxs.create_inner(); // Time range. - // No more temporal blocks below mini-blocks, so we always stride + // No more temporal blocks below micro-blocks, so we always stride // by +/- 1. 
- idx_t begin_t = mini_block_idxs.begin[step_posn]; - idx_t end_t = mini_block_idxs.end[step_posn]; + idx_t begin_t = micro_block_idxs.begin[step_posn]; + idx_t end_t = micro_block_idxs.end[step_posn]; idx_t step_dir = (end_t >= begin_t) ? 1 : -1; idx_t stride_t = 1 * step_dir; // +/- 1. assert(stride_t); @@ -1025,17 +1081,17 @@ namespace yask { const idx_t stop_t = (stride_t > 0) ? min(start_t + stride_t, end_t) : max(start_t + stride_t, end_t); - TRACE_MSG("calc_mini_block: phase " << phase << + TRACE_MSG("calc_micro_block: phase " << phase << ", shape " << shape << ", in step " << start_t); assert(abs(stop_t - start_t) == 1); // no more TB. // Set step indices that will pass through generated code. - mini_block_idxs.index[step_posn] = index_t; - mini_block_idxs.begin[step_posn] = start_t; - mini_block_idxs.end[step_posn] = stop_t; - mini_block_idxs.start[step_posn] = start_t; - mini_block_idxs.stop[step_posn] = stop_t; + micro_block_idxs.index[step_posn] = index_t; + micro_block_idxs.begin[step_posn] = start_t; + micro_block_idxs.end[step_posn] = stop_t; + micro_block_idxs.start[step_posn] = start_t; + micro_block_idxs.stop[step_posn] = stop_t; // Stages to evaluate at this time step. for (auto& bp : st_stages) { @@ -1046,12 +1102,12 @@ namespace yask { // Check step. if (check_step_conds && !bp->is_in_valid_step(start_t)) { - TRACE_MSG("calc_mini_block: step " << start_t << + TRACE_MSG("calc_micro_block: step " << start_t << " not valid for stage '" << bp->get_name() << "'"); continue; } - TRACE_MSG("calc_mini_block: phase " << phase << + TRACE_MSG("calc_micro_block: phase " << phase << ", shape " << shape << ", step " << start_t << ", stage '" << bp->get_name() << @@ -1059,53 +1115,56 @@ namespace yask { // Start timers for this stage. Tracking only on thread // 0. TODO: track all threads and report cross-thread stats. - if (region_thread_idx == 0) + if (outer_thread_idx == 0) bp->start_timers(); - // Strides within a mini-blk are based on sub-blk sizes. 
+ // Strides within a micro-blk are based on nano-blk sizes. // This will get overridden later if thread binding is enabled. auto& settings = bp->get_active_settings(); - mini_block_idxs.stride = settings._sub_block_sizes; - mini_block_idxs.stride[step_posn] = stride_t; + micro_block_idxs.stride = settings._nano_block_sizes; + micro_block_idxs.stride[step_posn] = stride_t; - // Groups in mini-blk loops are based on sub-block-group sizes. - mini_block_idxs.group_size = settings._sub_block_group_sizes; + // Tiles in micro-blk loops. + micro_block_idxs.tile_size = settings._micro_block_tile_sizes; - // Set mini_block_idxs begin & end based on shifted rank - // start & stop (original region begin & end), rank + // Set micro_block_idxs begin & end based on shifted rank + // start & stop (original mega-block begin & end), rank // boundaries, and stage BB. There may be several TB layers - // within a region WF, so we need to add the region and - // local mini-block shift counts. - bool ok = shift_region(rank_idxs.start, rank_idxs.stop, - region_shift_num + shift_num, bp, - mini_block_idxs); + // within a mega-block WF, so we need to add the mega-block and + // local micro-block shift counts. + bool ok = shift_mega_block(rank_idxs.start, rank_idxs.stop, + mega_block_shift_num + shift_num, bp, + micro_block_idxs); - // Set mini_block_idxs begin & end based on shifted begin & + // Set micro_block_idxs begin & end based on shifted begin & // end of block for given phase & shape. This will be the - // base for the mini-block loops, which have no temporal + // base for the micro-block loops, which have no temporal // tiling. 
if (ok) - ok = shift_mini_block(adj_block_idxs.start, adj_block_idxs.stop, + ok = shift_micro_block(adj_block_idxs.start, adj_block_idxs.stop, adj_block_idxs.begin, adj_block_idxs.end, base_block_idxs.begin, base_block_idxs.end, - base_region_idxs.begin, base_region_idxs.end, + base_mega_block_idxs.begin, base_mega_block_idxs.end, shift_num, nphases, phase, nshapes, shape, bridge_mask, - mini_block_idxs); + micro_block_idxs); if (ok) { + micro_block_idxs.adjust_from_settings(settings._micro_block_sizes, + settings._micro_block_tile_sizes, + settings._nano_block_sizes); // Update offsets of scratch vars based on the current - // mini-block location. + // micro-block location. if (scratch_vecs.size()) - update_scratch_var_info(region_thread_idx, mini_block_idxs.begin); + update_scratch_var_info(outer_thread_idx, micro_block_idxs.begin); - // Call calc_mini_block() for each non-scratch bundle. + // Call calc_micro_block() for each non-scratch bundle. for (auto* sb : *bp) if (sb->get_bb().bb_num_points) - sb->calc_mini_block(region_thread_idx, settings, mini_block_idxs); + sb->calc_micro_block(outer_thread_idx, settings, micro_block_idxs); // Make sure streaming stores are visible for later loads. make_stores_visible(); @@ -1115,23 +1174,24 @@ namespace yask { shift_num++; // Stop timers for this stage. - if (region_thread_idx == 0) + if (outer_thread_idx == 0) bp->stop_timers(); } // stages. } // time-steps. - } // calc_mini_block(). + } // calc_micro_block(). - // Find boundaries within region with 'base_start' to 'base_stop' + // Find boundaries within mega-block with 'base_start' to 'base_stop' // shifted 'shift_num' times, which should start at 0 and increment for // each stage in each time-step. Trim to ext-BB and MPI section if 'bp' if // not null. Write results into 'begin' and 'end' in 'idxs'. Return // 'true' if resulting area is non-empty, 'false' if empty. 
- bool StencilContext::shift_region(const Indices& base_start, const Indices& base_stop, - idx_t shift_num, - StagePtr& bp, - ScanIndices& idxs) { + bool StencilContext::shift_mega_block(const Indices& base_start, + const Indices& base_stop, + idx_t shift_num, + StagePtr& bp, + ScanIndices& idxs) { STATE_VARS(this); // For wavefront adjustments, see conceptual diagram in @@ -1139,7 +1199,7 @@ namespace yask { // may be trimmed based on the BB and WF extensions outside of the // rank-BB. - // Actual region boundaries must stay within [extended] stage BB. + // Actual mega-block boundaries must stay within [extended] stage BB. // We have to calculate the posn in the extended rank at each // value of 'shift_num' because it is being shifted spatially. bool ok = true; @@ -1147,8 +1207,8 @@ namespace yask { auto angle = wf_angles[j]; idx_t shift_amt = angle * shift_num; - // Shift initial spatial region boundaries for this iteration of - // temporal wavefront. Regions only shift left, so region loops + // Shift initial spatial mega-block boundaries for this iteration of + // temporal wavefront. Mega-Blocks only shift left, so mega-block loops // must strictly increment. They may do so in any order. Shift // by pts in one WF step. Always shift left in WFs. idx_t rstart = base_start[i] - shift_amt; @@ -1169,7 +1229,7 @@ namespace yask { idx_t dend = rank_bb.bb_end[j]; // In left ext, add 'angle' points for every shift to get - // region boundary in ext. + // mega-block boundary in ext. if (rstart < dbegin && left_wf_exts[j]) rstart = max(rstart, dbegin - left_wf_exts[j] + shift_amt); @@ -1177,9 +1237,8 @@ namespace yask { if (rstop > dend && right_wf_exts[j]) rstop = min(rstop, dend + right_wf_exts[j] - shift_amt); - // Trim region based on current MPI section if - // using overlapping but not whole-block method. - if (is_overlap_active() && mpi_exterior_dim >= 0) { + // Trim mega-block based on current MPI section if overlapping. 
+ if (is_overlap_active()) { // Interior boundaries. idx_t int_begin = mpi_interior.bb_begin[j]; @@ -1203,7 +1262,7 @@ namespace yask { // Modify interior if there is an external MPI // section on either side. Reduce interior by - // 'wf_shift_pts' to get size at base of region, + // 'wf_shift_pts' to get size at base of mega-block, // then expand by current shift amount to get size // at current shift number. if (does_exterior_exist(j, true)) { // left. @@ -1280,7 +1339,7 @@ namespace yask { } // exterior. } // overlapping. - // Anything to do in the adjusted region? + // Anything to do in the adjusted mega-block? if (rstop <= rstart) { ok = false; break; @@ -1291,9 +1350,11 @@ namespace yask { idxs.begin[i] = rstart; idxs.end[i] = rstop; } - TRACE_MSG("shift_region: updated span: [" << + TRACE_MSG("shift_mega_block: updated span: [" << idxs.begin.make_val_str() << " ... " << - idxs.end.make_val_str() << ") within region base [" << + idxs.end.make_val_str() << ") for " << + make_mpi_section_descr() << + " within mega-block base [" << base_start.make_val_str() << " ... " << base_stop.make_val_str() << ") shifted " << shift_num << " time(s) is " << @@ -1301,27 +1362,27 @@ namespace yask { return ok; } - // For given 'phase' and 'shape', find boundaries within mini-block at + // For given 'phase' and 'shape', find boundaries within micro-block at // 'mb_base_start' to 'mb_base_stop' shifted by 'mb_shift_num', which // should start at 0 and increment for each stage in each time-step. // 'mb_base' is subset of 'adj_block_base'. Also trim to block at // 'block_base_start' to 'block_base_stop' shifted by 'mb_shift_num'. - // Input 'begin' and 'end' of 'idxs' should be trimmed to region. Writes + // Input 'begin' and 'end' of 'idxs' should be trimmed to mega-block. Writes // results back into 'begin' and 'end' of 'idxs'. Returns 'true' if // resulting area is non-empty, 'false' if empty. 
- bool StencilContext::shift_mini_block(const Indices& mb_base_start, - const Indices& mb_base_stop, - const Indices& adj_block_base_start, - const Indices& adj_block_base_stop, - const Indices& block_base_start, - const Indices& block_base_stop, - const Indices& region_base_start, - const Indices& region_base_stop, - idx_t mb_shift_num, - idx_t nphases, idx_t phase, - idx_t nshapes, idx_t shape, - const BridgeMask& bridge_mask, - ScanIndices& idxs) { + bool StencilContext::shift_micro_block(const Indices& mb_base_start, + const Indices& mb_base_stop, + const Indices& adj_block_base_start, + const Indices& adj_block_base_stop, + const Indices& block_base_start, + const Indices& block_base_stop, + const Indices& mega_block_base_start, + const Indices& mega_block_base_stop, + idx_t mb_shift_num, + idx_t nphases, idx_t phase, + idx_t nshapes, idx_t shape, + const bit_mask_t& bridge_mask, + ScanIndices& idxs) { STATE_VARS(this); auto nstages = st_stages.size(); bool ok = true; @@ -1334,11 +1395,11 @@ namespace yask { // the base block and the L side of the next block. auto tb_angle = tb_angles[j]; - // Is this block first and/or last in region? - bool is_first_blk = block_base_start[i] <= region_base_start[i]; - bool is_last_blk = block_base_stop[i] >= region_base_stop[i]; + // Is this block first and/or last in mega-block? + bool is_first_blk = block_base_start[i] <= mega_block_base_start[i]; + bool is_last_blk = block_base_stop[i] >= mega_block_base_stop[i]; - // Is there only one blk in the region in this dim? + // Is there only one blk in the mega-block in this dim? bool is_one_blk = is_first_blk && is_last_blk; // Initial start and stop point of phase-0 block. @@ -1352,28 +1413,27 @@ namespace yask { // Starting point of the *next* block. This is used to create // bridge shapes between blocks. Initially, the beginning of // the next block is the end of this block. - // TODO: split these parts more evenly when not full triangles. 
idx_t next_blk_start = block_base_stop[i]; // Adjust these based on current shift. Adjust by pts in one TB // step, reducing size on R & L sides. But if block is first - // and/or last, clamp to region. TODO: have different R & L + // and/or last, clamp to mega-block. TODO: have different R & L // angles. TODO: have different shifts for each stage. // Shift start to right unless first. First block will be a - // parallelogram or trapezoid clamped to beginning of region. + // parallelogram or trapezoid clamped to beginning of mega-block. blk_start += tb_angle * mb_shift_num; if (is_first_blk) blk_start = idxs.begin[i]; // Shift stop to left. If there will be no bridges, clamp - // last block to end of region. + // last block to end of mega-block. blk_stop -= tb_angle * mb_shift_num; if ((nphases == 1 || is_one_blk) && is_last_blk) blk_stop = idxs.end[i]; // Shift start of next block. Last bridge will be - // clamped to end of region. + // clamped to end of mega-block. next_blk_start += tb_angle * mb_shift_num; if (is_last_blk) next_blk_start = idxs.end[i]; @@ -1389,8 +1449,8 @@ namespace yask { // until all dims are bridged at last phase. // Use list of dims to bridge for this shape // computed earlier. - if (phase > 0 && bridge_mask[j]) { - TRACE_MSG("shift_mini_block: phase " << phase << + if (phase > 0 && is_bit_set(bridge_mask, j)) { + TRACE_MSG("shift_micro_block: phase " << phase << ", shape " << shape << ": bridging dim " << j); @@ -1408,7 +1468,7 @@ namespace yask { ok = false; else { - // Is this mini-block first and/or last in block? + // Is this micro-block first and/or last in block? bool is_first_mb = mb_base_start[i] <= adj_block_base_start[i]; bool is_last_mb = mb_base_stop[i] >= adj_block_base_stop[i]; @@ -1419,7 +1479,7 @@ namespace yask { idx_t mb_start = mb_base_start[i]; idx_t mb_stop = mb_base_stop[i]; - // Shift mini-block by MB angles unless there is only one. + // Shift micro-block by MB angles unless there is only one. 
// MB is a wave-front, so only shift left. if (!is_one_mb) { auto mb_angle = mb_angles[j]; @@ -1433,11 +1493,11 @@ namespace yask { if (is_last_mb) mb_stop = shape_stop; - // Trim mini-block to fit in region. + // Trim micro-block to fit in mega-block. mb_start = max(mb_start, idxs.begin[i]); mb_stop = min(mb_stop, idxs.end[i]); - // Trim mini-block range to fit in shape. + // Trim micro-block range to fit in shape. mb_start = max(mb_start, shape_start); mb_stop = min(mb_stop, shape_stop); @@ -1454,31 +1514,31 @@ namespace yask { } // dims. - TRACE_MSG("shift_mini_block: phase " << phase << "/" << nphases << + TRACE_MSG("shift_micro_block: phase " << phase << "/" << nphases << ", shape " << shape << "/" << nshapes << ", updated span: [" << idxs.begin.make_val_str() << " ... " << - idxs.end.make_val_str() << ") from original mini-block [" << + idxs.end.make_val_str() << ") from original micro-block [" << mb_base_start.make_val_str() << " ... " << mb_base_stop.make_val_str() << ") shifted " << mb_shift_num << " time(s) within adj-block base [" << adj_block_base_start.make_val_str() << " ... " << adj_block_base_stop.make_val_str() << ") and actual block base [" << block_base_start.make_val_str() << " ... " << - block_base_stop.make_val_str() << ") and region base [" << - region_base_start.make_val_str() << " ... " << - region_base_stop.make_val_str() << ") is " << + block_base_stop.make_val_str() << ") and mega-block base [" << + mega_block_base_start.make_val_str() << " ... " << + mega_block_base_stop.make_val_str() << ") is " << (ok ? "not " : "") << "empty"); return ok; } // Adjust offsets of scratch vars based on thread number 'thread_idx' - // and beginning point of mini-block 'idxs'. Each scratch-var is + // and beginning point of micro-block 'idxs'. Each scratch-var is // assigned to a thread, so it must "move around" as the thread is - // assigned to each mini-block. This move is accomplished by changing + // assigned to each micro-block. 
This move is accomplished by changing // the vars' local offsets. void StencilContext::update_scratch_var_info(int thread_idx, - const Indices& idxs) { + const Indices& idxs) { STATE_VARS(this); // Loop thru vecs of scratch vars. @@ -1523,415 +1583,37 @@ namespace yask { } } - // Compare vars in contexts. + // Compare output vars in contexts. // Return number of mis-compares. idx_t StencilContext::compare_data(const StencilContext& ref) const { STATE_VARS_CONST(this); + copy_vars_from_device(); - DEBUG_MSG("Comparing var(s) in '" << name << "' to '" << ref.name << "'..."); - if (var_ptrs.size() != ref.var_ptrs.size()) { - TRACE_MSG("** number of vars not equal"); + DEBUG_MSG("Comparing output var(s) in '" << name << "' to '" << ref.name << "'..."); + if (output_var_ptrs.size() != ref.output_var_ptrs.size()) { + TRACE_MSG("** number of output vars not equal"); return 1; } idx_t errs = 0; - for (size_t gi = 0; gi < var_ptrs.size(); gi++) { - TRACE_MSG("Var '" << ref.var_ptrs[gi]->get_name() << "'..."); - auto& gb = var_ptrs[gi]->gb(); - auto* rgbp = ref.var_ptrs[gi]->gbp(); + for (size_t gi = 0; gi < output_var_ptrs.size(); gi++) { + auto& gb = output_var_ptrs[gi]->gb(); + auto* rgbp = ref.output_var_ptrs[gi]->gbp(); + TRACE_MSG("Var '" << gb.get_name() << "'..."); errs += gb.compare(rgbp); } return errs; } - // Call MPI_Test() on all unfinished requests to promote MPI progress. - // TODO: replace with more direct and less intrusive techniques. - void StencilContext::poke_halo_exchange() { - STATE_VARS(this); - -#ifdef USE_MPI - if (!enable_halo_exchange || env->num_ranks < 2) - return; - - test_time.start(); - TRACE_MSG("poke_halo_exchange"); - - // Loop thru MPI data. 
- int num_tests = 0; - for (auto& mdi : mpi_data) { - auto& gname = mdi.first; - auto& var_mpi_data = mdi.second; - MPI_Request* var_recv_reqs = var_mpi_data.recv_reqs.data(); - MPI_Request* var_send_reqs = var_mpi_data.send_reqs.data(); - - int flag; -#if 1 - int indices[max(var_mpi_data.recv_reqs.size(), var_mpi_data.send_reqs.size())]; - MPI_Testsome(int(var_mpi_data.recv_reqs.size()), var_recv_reqs, &flag, indices, MPI_STATUS_IGNORE); - MPI_Testsome(int(var_mpi_data.send_reqs.size()), var_send_reqs, &flag, indices, MPI_STATUS_IGNORE); -#elif 0 - int index; - MPI_Testany(int(var_mpi_data.recv_reqs.size()), var_recv_reqs, &index, &flag, MPI_STATUS_IGNORE); - MPI_Testany(int(var_mpi_data.send_reqs.size()), var_send_reqs, &index, &flag, MPI_STATUS_IGNORE); -#else - for (size_t i = 0; i < var_mpi_data.recv_reqs.size(); i++) { - auto& r = var_recv_reqs[i]; - if (r != MPI_REQUEST_NULL) { - //TRACE_MSG(gname << " recv test &MPI_Request = " << &r); - MPI_Test(&r, &flag, MPI_STATUS_IGNORE); - num_tests++; - if (flag) - r = MPI_REQUEST_NULL; - } - } - for (size_t i = 0; i < var_mpi_data.send_reqs.size(); i++) { - auto& r = var_send_reqs[i]; - if (r != MPI_REQUEST_NULL) { - //TRACE_MSG(gname << " send test &MPI_Request = " << &r); - MPI_Test(&r, &flag, MPI_STATUS_IGNORE); - num_tests++; - if (flag) - r = MPI_REQUEST_NULL; - } - } -#endif - } - auto ttime = test_time.stop(); - TRACE_MSG("poke_halo_exchange: secs spent in " << num_tests << - " MPI test(s): " << make_num_str(ttime)); -#endif - } - - // Exchange dirty halo data for all vars and all steps. 
- void StencilContext::exchange_halos() { - -#ifdef USE_MPI - STATE_VARS(this); - if (!enable_halo_exchange || env->num_ranks < 2) - return; - - halo_time.start(); - double wait_delta = 0.; - TRACE_MSG("exchange_halos"); - if (is_overlap_active()) { - if (do_mpi_left) - TRACE_MSG(" following calc of MPI left exterior"); - if (do_mpi_right) - TRACE_MSG(" following calc of MPI right exterior"); - if (do_mpi_interior) - TRACE_MSG(" following calc of MPI interior"); - } - - // Vars for list of vars that need to be swapped and their step - // indices. Use an ordered map by *name* to make sure vars are - // swapped in same order on all ranks. (If we order vars by - // pointer, pointer values will not generally be the same on each - // rank.) - VarPtrMap vars_to_swap; - map first_steps_to_swap; - map last_steps_to_swap; - - // Loop thru all vars. - for (auto& gp : var_ptrs) { - auto& gb = gp->gb(); - - // Don't swap scratch vars. - if (gb.is_scratch()) - continue; - - // Only need to swap vars that have any MPI buffers. - auto& gname = gp->get_name(); - if (mpi_data.count(gname) == 0) - continue; - - // Check all allocated step indices. - // Use '0' for vars that don't use the step dim. - idx_t start_t = 0, stop_t = 1; - if (gp->is_dim_used(step_dim)) { - start_t = gp->get_first_valid_step_index(); - stop_t = gp->get_last_valid_step_index() + 1; - } - for (idx_t t = start_t; t < stop_t; t++) { - - // Only need to swap vars whose halos are not up-to-date - // for this step. - if (!gb.is_dirty(t)) - continue; - - // Swap this var. - vars_to_swap[gname] = gp; - - // Update first step. - if (first_steps_to_swap.count(gp) == 0 || t < first_steps_to_swap[gp]) - first_steps_to_swap[gp] = t; - - // Update last step. - if (last_steps_to_swap.count(gp) == 0 || t > last_steps_to_swap[gp]) - last_steps_to_swap[gp] = t; - - } // steps. - } // vars. 
- TRACE_MSG("exchange_halos: need to exchange halos for " << - vars_to_swap.size() << " var(s)"); - assert(vars_to_swap.size() == first_steps_to_swap.size()); - assert(vars_to_swap.size() == last_steps_to_swap.size()); - - // Sequence of things to do for each neighbor. - enum halo_steps { halo_irecv, halo_pack_isend, halo_unpack, halo_final }; - vector steps_to_do; - - // Flags indicate what part of vars were most recently calc'd. - // These determine what exchange steps need to be done now. - if (vars_to_swap.size()) { - if (do_mpi_left || do_mpi_right) { - steps_to_do.push_back(halo_irecv); - steps_to_do.push_back(halo_pack_isend); - } - if (do_mpi_interior) { - steps_to_do.push_back(halo_unpack); - steps_to_do.push_back(halo_final); - } - } - - int num_send_reqs = 0; - int num_recv_reqs = 0; - for (auto halo_step : steps_to_do) { - - if (halo_step == halo_irecv) - TRACE_MSG("exchange_halos: requesting data phase"); - else if (halo_step == halo_pack_isend) - TRACE_MSG("exchange_halos: packing and sending data phase"); - else if (halo_step == halo_unpack) - TRACE_MSG("exchange_halos: waiting for and unpacking data phase"); - else if (halo_step == halo_final) - TRACE_MSG("exchange_halos: waiting for send to finish phase"); - else - THROW_YASK_EXCEPTION("internal error: unknown halo-exchange step"); - - // Loop thru all vars to swap. - // Use 'gi' as an MPI tag. - int gi = 0; - for (auto gtsi : vars_to_swap) { - gi++; - auto& gname = gtsi.first; - auto& gp = gtsi.second; - auto& gb = gp->gb(); - auto& var_mpi_data = mpi_data.at(gname); - MPI_Request* var_recv_reqs = var_mpi_data.recv_reqs.data(); - MPI_Request* var_send_reqs = var_mpi_data.send_reqs.data(); - - // Loop thru all this rank's neighbors. - var_mpi_data.visit_neighbors - ([&](const IdxTuple& offsets, // NeighborOffset. - int neighbor_rank, - int ni, // unique neighbor index. 
- MPIBufs& bufs) { - auto& send_buf = bufs.bufs[MPIBufs::buf_send]; - auto& recv_buf = bufs.bufs[MPIBufs::buf_recv]; - TRACE_MSG("exchange_halos: with rank " << neighbor_rank << " at relative position " << - offsets.sub_elements(1).make_dim_val_offset_str()); - - // Are we using MPI shm w/this neighbor? - bool using_shm = opts->use_shm && mpi_info->shm_ranks.at(ni) != MPI_PROC_NULL; - - // Submit async request to receive data from neighbor. - if (halo_step == halo_irecv) { - auto nbytes = recv_buf.get_bytes(); - if (nbytes) { - if (using_shm) - TRACE_MSG("exchange_halos: no receive req due to shm"); - else { - void* buf = (void*)recv_buf._elems; - TRACE_MSG("exchange_halos: requesting up to " << make_byte_str(nbytes)); - auto& r = var_recv_reqs[ni]; - MPI_Irecv(buf, nbytes, MPI_BYTE, - neighbor_rank, int(gi), - env->comm, &r); - num_recv_reqs++; - } - } - else - TRACE_MSG("exchange_halos: 0B to request"); - } - - // Pack data into send buffer, then send to neighbor. - else if (halo_step == halo_pack_isend) { - auto nbytes = send_buf.get_bytes(); - if (nbytes) { - - // Vec ok? - // Domain sizes must be ok, and buffer size must be ok - // as calculated when buffers were created. - bool send_vec_ok = allow_vec_exchange && send_buf.vec_copy_ok; - - // Get first and last ranges. - IdxTuple first = send_buf.begin_pt; - IdxTuple last = send_buf.last_pt; - - // The code in alloc_mpi_data() pre-calculated the first and - // last points of each buffer, except in the step dim, where - // the max range was set. Update actual range now. - if (gp->is_dim_used(step_dim)) { - first.set_val(step_dim, first_steps_to_swap[gp]); - last.set_val(step_dim, last_steps_to_swap[gp]); - } - - // Wait until buffer is avail. - if (using_shm) { - TRACE_MSG("exchange_halos: waiting to write to shm buffer"); - wait_time.start(); - send_buf.wait_for_ok_to_write(); - wait_delta += wait_time.stop(); - } - - // Copy (pack) data from var to buffer. 
- void* buf = (void*)send_buf._elems; - idx_t nelems = 0; - TRACE_MSG("exchange_halos: packing [" << first.make_dim_val_str() << - " ... " << last.make_dim_val_str() << "] " << - (send_vec_ok ? "with" : "without") << - " vector copy into " << buf); - if (send_vec_ok) - nelems = gp->get_vecs_in_slice(buf, first, last); - else - nelems = gp->get_elements_in_slice(buf, first, last); - idx_t nbytes = nelems * get_element_bytes(); - - if (using_shm) { - TRACE_MSG("exchange_halos: no send req due to shm"); - send_buf.mark_write_done(); - } - else { - - // Send packed buffer to neighbor. - assert(nbytes <= send_buf.get_bytes()); - TRACE_MSG("exchange_halos: sending " << make_byte_str(nbytes)); - auto& r = var_send_reqs[ni]; - MPI_Isend(buf, nbytes, MPI_BYTE, - neighbor_rank, int(gi), env->comm, &r); - num_send_reqs++; - } - } - else - TRACE_MSG(" 0B to send"); - } - - // Wait for data from neighbor, then unpack it. - else if (halo_step == halo_unpack) { - auto nbytes = recv_buf.get_bytes(); - if (nbytes) { - - // Wait until buffer is avail. - if (using_shm) { - TRACE_MSG("exchange_halos: waiting to read from shm buffer"); - wait_time.start(); - recv_buf.wait_for_ok_to_read(); - wait_delta += wait_time.stop(); - } - else { - - // Wait for data from neighbor before unpacking it. - auto& r = var_recv_reqs[ni]; - if (r != MPI_REQUEST_NULL) { - TRACE_MSG(" waiting for receipt of " << make_byte_str(nbytes)); - wait_time.start(); - MPI_Wait(&r, MPI_STATUS_IGNORE); - wait_delta += wait_time.stop(); - } - r = MPI_REQUEST_NULL; - } - - // Vec ok? - bool recv_vec_ok = allow_vec_exchange && recv_buf.vec_copy_ok; - - // Get first and last ranges. - IdxTuple first = recv_buf.begin_pt; - IdxTuple last = recv_buf.last_pt; - - // Set step val as above. - if (gp->is_dim_used(step_dim)) { - first.set_val(step_dim, first_steps_to_swap[gp]); - last.set_val(step_dim, last_steps_to_swap[gp]); - } - - // Copy data from buffer to var. 
- void* buf = (void*)recv_buf._elems; - idx_t nelems = 0; - TRACE_MSG("exchange_halos: got data; unpacking into [" << first.make_dim_val_str() << - " ... " << last.make_dim_val_str() << "] " << - (recv_vec_ok ? "with" : "without") << - " vector copy from " << buf); - if (recv_vec_ok) - nelems = gp->set_vecs_in_slice(buf, first, last); - else - nelems = gp->set_elements_in_slice(buf, first, last); - assert(nelems <= recv_buf.get_size()); - - if (using_shm) - recv_buf.mark_read_done(); - } - else - TRACE_MSG("exchange_halos: 0B to wait for"); - } - - // Final steps. - else if (halo_step == halo_final) { - auto nbytes = send_buf.get_bytes(); - if (nbytes) { - - if (using_shm) - TRACE_MSG("exchange_halos: no send wait due to shm"); - else { - - // Wait for send to finish. - // TODO: consider using MPI_WaitAll. - // TODO: strictly, we don't have to wait on the - // send to finish until we want to reuse this buffer, - // so we could wait on the *previous* send right before - // doing another one. - auto& r = var_send_reqs[ni]; - if (r != MPI_REQUEST_NULL) { - TRACE_MSG(" waiting to finish send of " << make_byte_str(nbytes)); - wait_time.start(); - MPI_Wait(&var_send_reqs[ni], MPI_STATUS_IGNORE); - wait_delta += wait_time.stop(); - } - r = MPI_REQUEST_NULL; - } - } - - // Mark vars as up-to-date when done. - for (idx_t si = first_steps_to_swap[gp]; si <= last_steps_to_swap[gp]; si++) { - if (gb.is_dirty(si)) { - gb.set_dirty(false, si); - TRACE_MSG("exchange_halos: var '" << gname << - "' marked as clean at step-index " << si); - } - } - } - - }); // visit neighbors. - - } // vars. - - } // exchange sequence. 
- - TRACE_MSG("exchange_halos: " << num_recv_reqs << " MPI receive request(s) issued"); - TRACE_MSG("exchange_halos: " << num_send_reqs << " MPI send request(s) issued"); - - auto mpi_call_time = halo_time.stop(); - TRACE_MSG("exchange_halos: secs spent in MPI waits: " << make_num_str(wait_delta)); - TRACE_MSG("exchange_halos: secs spent in this call: " << make_num_str(mpi_call_time)); -#endif - } - - // Update data in vars that have been written to by stage 'sel_bp'. - void StencilContext::update_vars(const StagePtr& sel_bp, - idx_t start, idx_t stop, - bool mark_dirty) { + // Update data in vars that *may* have been written to by stage 'sel_bp' + // in any rank: set the last "valid step" and mark vars as "dirty", + // i.e., indicate that we may need to do a halo exchange. + void StencilContext::update_var_info(const StagePtr& sel_bp, + idx_t start, idx_t stop, + bool mark_dirty, + bool mod_dev_data) { STATE_VARS(this); idx_t stride = (start > stop) ? -1 : 1; - map> vars_done; // Stages. for (auto& bp : st_stages) { @@ -1946,32 +1628,13 @@ namespace yask { // Each bundle in this stage. for (auto* sb : *bp) { - // Get output step for this bundle, if any. - // For many stencils, this will be t+1 or - // t-1 if striding backward. - idx_t t_out = 0; - if (!sb->get_output_step_index(t, t_out)) - continue; + // Output vars for this bundle. + sb->update_var_info(YkVarBase::others, t, mark_dirty, mod_dev_data, true); - // Output vars for this bundle. NB: don't need to mark - // scratch vars as dirty because they are never exchanged. - for (auto gp : sb->output_var_ptrs) { - auto& gb = gp->gb(); - - // Update if not already done. - if (vars_done[gp].count(t_out) == 0) { - gb.update_valid_step(t_out); - if (mark_dirty) - gb.set_dirty(true, t_out); - TRACE_MSG("var '" << gp->get_name() << - "' updated at step " << t_out); - vars_done[gp].insert(t_out); - } - } } // bundles. } // steps. } // stages. - } // update_vars(). + } // update_var_info(). // Reset any locks, etc. 
void StencilContext::reset_locks() { @@ -1983,4 +1646,20 @@ namespace yask { } } + // Copy vars from host to device as needed. + void StencilContext::copy_vars_to_device() const { + for (auto gp : orig_var_ptrs) { + assert(gp); + gp->gb().const_copy_data_to_device(); + } + } + + // Copy vars from device to host as needed. + void StencilContext::copy_vars_from_device() const { + for (auto gp : orig_var_ptrs) { + assert(gp); + gp->gb().const_copy_data_from_device(); + } + } + } // namespace yask. diff --git a/src/kernel/lib/context.hpp b/src/kernel/lib/context.hpp index b48e776c..3101f386 100644 --- a/src/kernel/lib/context.hpp +++ b/src/kernel/lib/context.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -57,6 +57,12 @@ namespace yask { IdxTuple bb_end_tuple(const IdxTuple& ddims) const { return bb_end.make_tuple(ddims); } + IdxTuple bb_last_tuple(const IdxTuple& ddims) const { + auto res = bb_end.make_tuple(ddims); + DOMAIN_VAR_LOOP(i, j) + res[j] = res[j] - 1; + return res; + } IdxTuple bb_len_tuple(const IdxTuple& ddims) const { return bb_len.make_tuple(ddims); } @@ -78,8 +84,8 @@ namespace yask { // Is point in BB? // 'pt' must have same dims as BB. 
bool is_in_bb(const IdxTuple& pt) const { - assert(pt._get_num_dims() == bb_begin._get_num_dims()); - for (int i = 0; i < pt._get_num_dims(); i++) { + assert(pt.get_num_dims() == bb_begin.get_num_dims()); + for (int i = 0; i < pt.get_num_dims(); i++) { if (pt[i] < bb_begin[i]) return false; if (pt[i] >= bb_end[i]) @@ -88,8 +94,8 @@ namespace yask { return true; } bool is_in_bb(const Indices& pt) const { - assert(pt._get_num_dims() == bb_begin._get_num_dims()); - for (int i = 0; i < pt._get_num_dims(); i++) { + assert(pt.get_num_dims() == bb_begin.get_num_dims()); + for (int i = 0; i < pt.get_num_dims(); i++) { if (pt[i] < bb_begin[i]) return false; if (pt[i] >= bb_end[i]) @@ -168,7 +174,23 @@ namespace yask { typedef std::set StencilBundleSet; typedef std::shared_ptr StagePtr; typedef std::vector StageList; - typedef std::vector BridgeMask; + + // Common data needed in the kernel(s). + struct CommonCoreData { + + // Copies of context info. + Indices _global_sizes; + Indices _rank_sizes; + Indices _rank_domain_offsets; + + void set_core(const StencilContext *cxt); + }; + + // Base of core data needed in the kernel(s). + // Other data is added via inheritance by the YASK compiler. + struct StencilCoreBase { + CommonCoreData _common_core; + }; // Data and hierarchical sizes. // This is a pure-virtual class that must be implemented @@ -187,9 +209,7 @@ namespace yask { static constexpr size_t _data_buf_pad = YASK_PAD_BYTES; // Alloc given bytes on each NUMA node. - virtual void _alloc_data(const std::map & nbytes, - const std::map & nvars, - std::map >& _data_buf, + virtual void _alloc_data(AllocMap& alloc_reqs, const std::string& type); // Callbacks. @@ -219,22 +239,51 @@ namespace yask { // include any extensions needed for WF. BoundingBox mpi_interior; - // Flags to calculate the interior and/or exterior. + // Flags to track state of calculating the interior and/or exterior. bool do_mpi_interior = true; bool do_mpi_left = true; // left exterior in given dim. 
bool do_mpi_right = true; // right exterior in given dim. idx_t mpi_exterior_dim = -1; // which domain dim in left/right. - // Is overlap currently enabled? + // Set MPI flag to defaults. + inline void init_mpi_flags() { + do_mpi_interior = do_mpi_left = do_mpi_right = true; + mpi_exterior_dim = -1; + } + + // Is overlapping-comms mode currently enabled? inline bool is_overlap_active() const { + assert(do_mpi_interior || do_mpi_left || do_mpi_right); + if (!do_mpi_interior) + assert(do_mpi_left || do_mpi_right); // one or both. + else { + assert(do_mpi_left == do_mpi_right); // both or neither. + if (do_mpi_left != do_mpi_right) + assert(mpi_exterior_dim >= 0); // must specify dim. + } bool active = !do_mpi_interior || !do_mpi_left || !do_mpi_right; if (active) { - assert(do_mpi_interior || do_mpi_left || do_mpi_right); assert(mpi_interior.bb_valid); } return active; } + // Describe MPI flag setting. + std::string make_mpi_section_descr() { + STATE_VARS(this); + if (is_overlap_active()) + return std::string("MPI ") + + (do_mpi_interior ? "interior" : + (do_mpi_left && do_mpi_right) ? "exterior" : + do_mpi_left ? + ("exterior left-" + + domain_dims.get_dim_name(mpi_exterior_dim)) : + ("exterior right-" + + domain_dims.get_dim_name(mpi_exterior_dim))) + + " section"; + return std::string("all MPI sections"); + } + // Is there a non-zero exterior in the given section? inline bool does_exterior_exist(idx_t ddim, bool is_left) const { return is_left ? @@ -251,14 +300,14 @@ namespace yask { StageList st_stages; // All non-scratch vars, including those created by APIs. - VarPtrs var_ptrs; - VarPtrMap var_map; + VarPtrs all_var_ptrs; + VarPtrMap all_var_map; - // Only vars defined by the YASK compiler. + // Only non-scratch vars defined by the YASK compiler. VarPtrs orig_var_ptrs; VarPtrMap orig_var_map; - // Only vars defined by the YASK compiler that are updated by the stencils. + // Only non-scratch vars defined by the YASK compiler that are updated by the stencils. 
VarPtrs output_var_ptrs; VarPtrMap output_var_map; @@ -275,9 +324,13 @@ namespace yask { YaskTimer run_time; // time in run_solution(), including halo exchange. YaskTimer ext_time; // time in exterior stencil calculation. YaskTimer int_time; // time in interior stencil calculation. - YaskTimer halo_time; // time spent just doing halo exchange, including MPI waits. - YaskTimer wait_time; // time spent just doing MPI waits. - YaskTimer test_time; // time spent just doing MPI tests. + YaskTimer halo_time; // time spent in halo exchange. + YaskTimer halo_pack_time; // time spent on packing in halo exchange. + YaskTimer halo_unpack_time; // time spent on unpacking in halo exchange. + YaskTimer halo_copy_time; // time spent on copying buffers in halo exchange. + YaskTimer halo_wait_time; // time spent on MPI waits in halo exchange. + YaskTimer halo_test_time; // time spent on MPI tests for halo exchange. + YaskTimer halo_lock_wait_time; // time spent on shm lock waits in halo exchange. idx_t steps_done = 0; // number of steps that have been run. // Maximum halos, skewing angles, and work extensions over all vars @@ -290,7 +343,7 @@ namespace yask { IdxTuple left_wf_exts; // WF extension needed on left side of rank for halo exch. IdxTuple right_wf_exts; // WF extension needed on right side of rank. - // Settings for temporal blocking and mini-blocks. + // Settings for temporal blocking and micro-blocks. idx_t tb_steps = 0; // max number of steps in a TB. 0 => no TB. IdxTuple tb_angles; // TB skewing angles for each shift (in points). idx_t num_tb_shifts = 0; // number of TB shifts required in tb_steps. @@ -298,20 +351,7 @@ namespace yask { IdxTuple tb_tops; // top of TB trapezoid. IdxTuple mb_angles; // MB skewing angles for each shift (in points). - // MPI settings. - // TODO: move to settings or MPI info object. -#ifdef NO_VEC_EXCHANGE - bool allow_vec_exchange = false; -#else - bool allow_vec_exchange = true; // allow vectorized halo exchange. 
-#endif -#ifdef NO_HALO_EXCHANGE - bool enable_halo_exchange = false; -#else - bool enable_halo_exchange = true; -#endif - - // Clear this to ignore step conditions. + // Clear this to ignore step conditions during auto-tuning. bool check_step_conds = true; // MPI buffers for each var. @@ -319,8 +359,9 @@ namespace yask { std::map mpi_data; // Constructor. - StencilContext(KernelEnvPtr& env, - KernelSettingsPtr& settings); + StencilContext(KernelEnvPtr& kenv, + KernelSettingsPtr& ksettings, + KernelSettingsPtr& user_settings); // Destructor. virtual ~StencilContext() { @@ -330,6 +371,9 @@ namespace yask { get_stats(); } + // Access core data. + virtual StencilCoreBase* corep() =0; + // Ready? bool is_prepared() const { return rank_bb.bb_valid; @@ -338,12 +382,6 @@ namespace yask { rank_bb.bb_valid = prep; } - // Modify settings in shared state and auto-tuner. - void set_settings(KernelSettingsPtr opts) { - _state->_opts = opts; - _at.set_settings(opts.get()); - } - // Reset elapsed times to zero. void clear_timers(); @@ -391,7 +429,8 @@ namespace yask { virtual void reset_locks(); // Print info about the soln. - virtual void print_temporal_tiling_info() const; + virtual void print_temporal_tiling_info(std::string prefix = "") const; + virtual void print_sizes(std::string prefix = "") const; virtual void print_warnings() const; /// Get statistics associated with preceding calls to run_solution(). @@ -410,15 +449,21 @@ namespace yask { // Adjust offsets of scratch vars based // on thread and scan indices. - virtual void update_scratch_var_info(int region_thread_idx, + virtual void update_scratch_var_info(int outer_thread_idx, const Indices& idxs); + // Copy non-scratch vars to device as needed. + void copy_vars_to_device() const; + + // Copy non-scratch output vars from device as needed. + void copy_vars_from_device() const; + // Get total memory allocation required by vars. // Does not include MPI buffers. // TODO: add MPI buffers. 
virtual size_t get_num_bytes() { size_t sz = 0; - for (auto gp : var_ptrs) { + for (auto gp : all_var_ptrs) { if (gp) sz += gp->get_num_storage_bytes() + _data_buf_pad; } @@ -431,25 +476,22 @@ namespace yask { } // Init all vars & params by calling real_init_fn. - virtual void init_values(std::function real_init_fn); + virtual void init_values(real_t seed0, + std::function real_init_fn); // Init all vars & params to same value within vars, // but different for each var. - virtual void init_same() { - init_values([&](YkVarPtr gp, real_t seed){ gp->set_all_elements_same(seed); }); + virtual void init_same(real_t seed0) { + init_values(seed0, [&](YkVarPtr gp, real_t seed) + { gp->set_all_elements_same(seed); }); } // Init all vars & params to different values within vars, // and different for each var. - virtual void init_diff() { - init_values([&](YkVarPtr gp, real_t seed){ gp->set_all_elements_in_seq(seed); }); - } - - // Init all vars & params. - // By default it uses the init_same initialization routine. - virtual void init_data() { - init_diff(); // Safer than init_same() to avoid NaNs due to div-by-zero. + virtual void init_diff(real_t seed0) { + init_values(seed0, [&](YkVarPtr gp, real_t seed) + { gp->set_all_elements_in_seq(seed); }); } // Compare vars in contexts for validation. @@ -461,73 +503,89 @@ namespace yask { void run_ref(idx_t first_step_index, idx_t last_step_index); - // Calculate results within a region. - void calc_region(StagePtr& sel_bp, + // Calculate results within a mega-block. + void calc_mega_block(StagePtr& sel_bp, const ScanIndices& rank_idxs); // Calculate results within a block. void calc_block(StagePtr& sel_bp, - idx_t region_shift_num, + idx_t mega_block_shift_num, idx_t nphases, idx_t phase, const ScanIndices& rank_idxs, - const ScanIndices& region_idxs); + const ScanIndices& mega_block_idxs); - // Calculate results within a mini-block. - void calc_mini_block(int region_thread_idx, + // Calculate results within a micro-block. 
+ void calc_micro_block(int outer_thread_idx, StagePtr& sel_bp, - idx_t region_shift_num, + idx_t mega_block_shift_num, idx_t nphases, idx_t phase, idx_t nshapes, idx_t shape, - const BridgeMask& bridge_mask, + const bit_mask_t& bridge_mask, const ScanIndices& rank_idxs, - const ScanIndices& base_region_idxs, + const ScanIndices& base_mega_block_idxs, const ScanIndices& base_block_idxs, const ScanIndices& adj_block_idxs); // Exchange all dirty halo data for all stencil bundles. void exchange_halos(); - // Call MPI_Test() on all unfinished requests to promote MPI progress. - void poke_halo_exchange(); + // Call MPI_Test() on all unfinished requests to advance MPI progress. + void adv_halo_exchange(); // Update valid steps in vars that have been written to by stage 'sel_bp'. // If sel_bp==null, use all bundles. // If 'mark_dirty', also mark as needing halo exchange. - void update_vars(const StagePtr& sel_bp, - idx_t start, idx_t stop, - bool mark_dirty); + void update_var_info(const StagePtr& sel_bp, + idx_t start, idx_t stop, + bool mark_dirty, + bool mod_dev_data = true); + + // Mark all exchangable vars as possibly dirty in other ranks. This + // should be called anytime APIs could have been called and before + // running any steps. + void set_all_neighbor_vars_dirty() { + for (auto& gp : orig_var_ptrs) { + gp->gb().set_dirty_all(YkVarBase::others, true); + } + } - // Set various limits in 'idxs' based on current step in region. - bool shift_region(const Indices& base_start, const Indices& base_stop, + // Set various limits in 'idxs' based on current step in mega-block. + bool shift_mega_block(const Indices& base_start, const Indices& base_stop, idx_t shift_num, StagePtr& bp, ScanIndices& idxs); // Set various limits in 'idxs' based on current step in block. 
- bool shift_mini_block(const Indices& mb_base_start, + bool shift_micro_block(const Indices& mb_base_start, const Indices& mb_base_stop, const Indices& adj_block_base_start, const Indices& adj_block_base_stop, const Indices& block_base_start, const Indices& block_base_stop, - const Indices& region_base_start, - const Indices& region_base_stop, + const Indices& mega_block_base_start, + const Indices& mega_block_base_stop, idx_t mb_shift_num, idx_t nphases, idx_t phase, idx_t nshapes, idx_t shape, - const BridgeMask& bridge_mask, + const bit_mask_t& bridge_mask, ScanIndices& idxs); // Set the bounding-box around all stencil bundles. void find_bounding_boxes(); + // Set data needed by the kernels. + // Implemented by the YASK compiler-generated code. + virtual void set_core() =0; + // Make new scratch vars. - virtual void make_scratch_vars (int num_threads) =0; - + // Implemented by the YASK compiler-generated code. + virtual void make_scratch_vars(int num_threads) =0; + // Make a new var iff its dims match any in the stencil. // Returns pointer to the new var or nullptr if no match. - virtual VarBasePtr new_stencil_var (const std::string & name, - const VarDimNames & dims) =0; + // Implemented by the YASK compiler-generated code. + virtual VarBasePtr new_stencil_var(const std::string & name, + const VarDimNames & dims) =0; // Make a new var with 'name' and 'dims'. // Set sizes if 'sizes' is non-null. 
@@ -549,24 +607,34 @@ namespace yask { virtual const std::string& get_description() const { return long_name; } + virtual bool is_offloaded() const { + #if USE_OFFLOAD + return true; + #else + return false; + #endif + } virtual void set_debug_output(yask_output_ptr debug) { KernelStateBase::set_debug_output(debug); } + virtual void disable_debug_output() { + KernelStateBase::disable_debug_output(); + } virtual int get_num_vars() const { - return int(var_ptrs.size()); + return int(all_var_ptrs.size()); } virtual yk_var_ptr get_var(const std::string& name) { - auto i = var_map.find(name); - if (i != var_map.end()) + auto i = all_var_map.find(name); + if (i != all_var_map.end()) return i->second; return nullptr; } virtual std::vector get_vars() { std::vector vars; - for (int i = 0; i < get_num_vars(); i++) - vars.push_back(var_ptrs.at(i)); + for (auto& vp : all_var_ptrs) + vars.push_back(vp); return vars; } virtual yk_var_ptr @@ -589,7 +657,7 @@ namespace yask { virtual yk_var_ptr new_fixed_size_var(const std::string& name, const std::initializer_list& dims, - const std::initializer_list& dim_sizes) { + const idx_t_init_list& dim_sizes) { VarDimNames dims2(dims); VarDimSizes sizes2(dim_sizes); return new_fixed_size_var(name, dims2, sizes2); @@ -601,20 +669,17 @@ namespace yask { } virtual int get_num_domain_dims() const { STATE_VARS_CONST(this); - return dims->_domain_dims._get_num_dims(); + return dims->_domain_dims.get_num_dims(); } - virtual std::vector get_domain_dim_names() const { + virtual string_vec get_domain_dim_names() const { STATE_VARS_CONST(this); return domain_dims.get_dim_names(); } - virtual std::vector get_misc_dim_names() const { + virtual string_vec get_misc_dim_names() const { STATE_VARS_CONST(this); return misc_dims.get_dim_names(); } - virtual idx_t get_first_rank_domain_index(const std::string& dim) const; - virtual idx_t get_last_rank_domain_index(const std::string& dim) const; - virtual void run_solution(idx_t first_step_index, idx_t 
last_step_index); virtual void run_solution(idx_t step_index) { @@ -623,44 +688,51 @@ namespace yask { virtual void fuse_vars(yk_solution_ptr other); // APIs that access settings. - virtual void set_overall_domain_size(const std::string& dim, idx_t size); - virtual void set_rank_domain_size(const std::string& dim, idx_t size); - virtual void set_min_pad_size(const std::string& dim, idx_t size); - virtual void set_block_size(const std::string& dim, idx_t size); - virtual void set_region_size(const std::string& dim, idx_t size); - virtual void set_num_ranks(const std::string& dim, idx_t size); - virtual void set_rank_index(const std::string& dim, idx_t size); - virtual idx_t get_overall_domain_size(const std::string& dim) const; - virtual idx_t get_rank_domain_size(const std::string& dim) const; - virtual idx_t get_min_pad_size(const std::string& dim) const; - virtual idx_t get_block_size(const std::string& dim) const; - virtual idx_t get_region_size(const std::string& dim) const; - virtual idx_t get_num_ranks(const std::string& dim) const; - virtual idx_t get_rank_index(const std::string& dim) const; + #define GET_SOLN_API(api_name) \ + virtual idx_t get_ ## api_name (const std::string& dim) const; \ + virtual idx_t_vec get_ ## api_name ## _vec() const; + #define SET_SOLN_API(api_name) \ + virtual void set_ ## api_name (const std::string& dim, idx_t size); \ + virtual void set_ ## api_name ## _vec(const idx_t_vec& vals); \ + virtual void set_ ## api_name ## _vec(const idx_t_init_list& vals); + #define SOLN_API(api_name) \ + GET_SOLN_API(api_name) \ + SET_SOLN_API(api_name) + SOLN_API(num_ranks) + SOLN_API(rank_index) + SOLN_API(overall_domain_size) + SOLN_API(rank_domain_size) + SOLN_API(block_size) + SOLN_API(min_pad_size) + GET_SOLN_API(first_rank_domain_index) + GET_SOLN_API(last_rank_domain_index) + #undef SOLN_API + #undef SET_SOLN_API + #undef GET_SOLN_API + virtual std::string apply_command_line_options(const std::string& args); virtual std::string 
apply_command_line_options(int argc, char* argv[]); - virtual std::string apply_command_line_options(const std::vector& args); + virtual std::string apply_command_line_options(const string_vec& args); virtual bool get_step_wrap() const { STATE_VARS(this); - return opts->_step_wrap; + return actl_opts->_step_wrap; } virtual void set_step_wrap(bool do_wrap) { STATE_VARS(this); - opts->_step_wrap = do_wrap; + req_opts->_step_wrap = do_wrap; + actl_opts->_step_wrap = do_wrap; } virtual bool set_default_numa_preferred(int numa_node) { STATE_VARS(this); -#ifdef USE_NUMA - opts->_numa_pref = numa_node; + + // TODO: fix this when NUMA APIs are not available. + req_opts->_numa_pref = numa_node; + actl_opts->_numa_pref = numa_node; return true; -#else - opts->_numa_pref = yask_numa_none; - return numa_node == yask_numa_none; -#endif } virtual int get_default_numa_preferred() const { STATE_VARS_CONST(this); - return opts->_numa_pref; + return actl_opts->_numa_pref; } virtual void call_before_prepare_solution(hook_fn_t hook_fn) { @@ -680,7 +752,9 @@ namespace yask { } // Auto-tuner methods. - virtual void eval_auto_tuner(idx_t num_steps); + void visit_auto_tuners(std::function visitor); + void visit_auto_tuners(std::function visitor) const; + virtual void eval_auto_tuner(); // Auto-tuner APIs. virtual void reset_auto_tuner(bool enable, bool verbose = false); diff --git a/src/kernel/lib/factory.cpp b/src/kernel/lib/factory.cpp index d2549757..f70f5dbb 100644 --- a/src/kernel/lib/factory.cpp +++ b/src/kernel/lib/factory.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -29,35 +29,74 @@ using namespace std; // Auto-generated stencil code that extends base types. 
#define DEFINE_CONTEXT #include YSTR2(YK_CODE_FILE) +#undef DEFINE_CONTEXT namespace yask { // APIs. // See yask_kernel_api.hpp. + yk_env_ptr yk_factory::new_env(MPI_Comm comm) const { + auto ep = make_shared(); + assert(ep); + ep->init_env(0, 0, comm); + TRACE_MSG("YASK env object created with MPI communicator " << comm); + return ep; + } + yk_env_ptr yk_factory::new_env() const { + return new_env(MPI_COMM_NULL); + } std::string yk_factory::get_version_string() { return yask_get_version_string(); } + + // Compiling new_solution() triggers compilation of the stencil kernels. yk_solution_ptr yk_factory::new_solution(yk_env_ptr env, const yk_solution_ptr source) const { + TRACE_MSG("creating new YASK solution..."); + + // Make sure JIT compilation has happened. + #ifdef USE_OFFLOAD + { + DEBUG_MSG("Initializing OpenMP offload; there may be a delay for JIT compilation..."); + YaskTimer init_timer; + init_timer.start(); + + // Dummy OMP section to trigger JIT. + // This should be the first "omp target" pragma encountered. + int dummy = 42; + #pragma omp target data device(KernelEnv::_omp_devn) map(dummy) + { } + + init_timer.stop(); + DEBUG_MSG("OpenMP offload initialization done in " << + make_num_str(init_timer.get_elapsed_secs()) << " secs."); + } + #endif + auto ep = dynamic_pointer_cast(env); assert(ep); auto dp = YASK_STENCIL_CONTEXT::new_dims(); // create Dims. assert(dp); - auto op = make_shared(dp, ep); - assert(op); + auto req_opts = make_shared(dp, ep); + assert(req_opts); + auto actl_opts = make_shared(dp, ep); + assert(actl_opts); - // Copy settings from source. + // Copy settings from source, if any. if (source.get()) { auto ssp = dynamic_pointer_cast(source); assert(ssp); - auto sop = ssp->get_settings(); + auto sop = ssp->get_req_opts(); + assert(sop); + *req_opts = *sop; + sop = ssp->get_actl_opts(); assert(sop); - *op = *sop; + *actl_opts = *sop; } // Create problem-specific object defined by stencil compiler. 
// TODO: allow more than one type of solution to be created. - auto sp = make_shared(ep, op); + auto sp = make_shared(ep, actl_opts, req_opts); assert(sp); #ifdef DEF_ARGS diff --git a/src/kernel/lib/generic_var.cpp b/src/kernel/lib/generic_var.cpp index a9a53f89..f02e8a8e 100644 --- a/src/kernel/lib/generic_var.cpp +++ b/src/kernel/lib/generic_var.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -43,17 +43,25 @@ namespace yask { // Make some descriptive info. template string GenericVarTyped::make_info_string(const string& elem_name) const { + const void* _elems = get_storage(); stringstream oss; oss << "'" << _name << "' "; - if (_var_dims._get_num_dims() == 0) + if (_var_dims.get_num_dims() == 0) oss << "scalar"; else - oss << _var_dims._get_num_dims() << "-D var (" << + oss << _var_dims.get_num_dims() << "-D var (" << _var_dims.make_dim_val_str(" * ") << ")"; - if (_elems) - oss << " with storage at " << _elems << " containing "; + if (_elems) { + oss << " with storage at " << _elems; + #ifdef USE_OFFLOAD_NO_USM + if (KernelEnv::_use_offload) + oss << " (" << (void*)get_dev_ptr(_elems, false, false) << + " on device)"; + #endif + oss << " containing "; + } else - oss << " with storage not yet allocated for "; + oss << " with storage not allocated for "; oss << make_byte_str(get_num_bytes()) << " (" << make_num_str(get_num_elems()) << " " << elem_name << " element(s) of " << @@ -63,40 +71,47 @@ namespace yask { // Free any old storage. // Set pointer to storage. - // 'base' should provide get_num_bytes() bytes at offset bytes. + // 'base' should provide get_num_bytes() bytes starting at offset bytes. 
+ // When offloading, the memory should also be mapped to the device. template void GenericVarTyped::set_storage(shared_ptr& base, size_t offset) { STATE_VARS(this); + auto& _elemsp = get_elems(); // Release any old data if last owner. - release_storage(); + release_storage(true); // Share ownership of base. - // This ensures that last var to use a shared allocation - // will trigger dealloc. + // This ensures that the shared-ptr alloc won't trigger + // a free until the last var using it is done w/it. _base = base; // Set plain pointer to new data. - if (base.get()) { - char* p = _base.get() + offset; - _elems = (void*)p; - } else { - _elems = 0; - } + char* p = base.get() ? base.get() + offset : 0; + + // Set ptr and sync offload pointer in core. + _elemsp = (T*)p; + sync_data_ptr(); } // Release storage. template - void GenericVarTyped::release_storage() { + void GenericVarTyped::release_storage(bool reset_ptr) { STATE_VARS(this); + auto& _elemsp = get_elems(); _base.reset(); - _elems = 0; + + // Set ptr and sync offload pointer in core. + if (reset_ptr) { + char* p = 0; + _elemsp.set_and_sync((T*)p); + } } - // Perform default allocation. For other options, programmer should - // call get_num_elems() or get_num_bytes() and then provide allocated - // memory via set_storage(). + // Perform default allocation. For other options, call get_num_elems() + // or get_num_bytes() and then provide allocated memory via + // set_storage(). template void GenericVarTyped::default_alloc() { STATE_VARS(this); @@ -112,13 +127,15 @@ namespace yask { DEBUG_MSG("Allocating " << make_byte_str(sz) << " for var '" << _name << "' " << loc << "..."); auto base = shared_numa_alloc(sz, numa_pref); + TRACE_MSG("got memory at " << static_cast(base.get())); // Set as storage for this var. 
set_storage(base, 0); - } + } template void GenericVarTyped::set_elems_same(T val) { + void* _elems = get_storage(); if (_elems) { yask_parallel_for(0, get_num_elems(), 1, [&](idx_t start, idx_t stop, idx_t thread_num) { @@ -129,8 +146,9 @@ namespace yask { template void GenericVarTyped::set_elems_in_seq(T seed) { + void* _elems = get_storage(); if (_elems) { - const idx_t wrap = 71; // TODO: avoid multiple of any dim size. + const idx_t wrap = 31; // TODO: avoid multiple of any dim size. yask_parallel_for(0, get_num_elems(), 1, [&](idx_t start, idx_t stop, idx_t thread_num) { ((T*)_elems)[start] = seed * T(start % wrap + 1); diff --git a/src/kernel/lib/generic_var.hpp b/src/kernel/lib/generic_var.hpp index 0540e736..0ce866ee 100644 --- a/src/kernel/lib/generic_var.hpp +++ b/src/kernel/lib/generic_var.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -30,15 +30,105 @@ IN THE SOFTWARE. namespace yask { + // Forward decls. + class GenericVarBase; + template + class GenericVarTyped; + template + class GenericVar; + + // Core elements of a generic n-D var of elements of type T. + // This class defines the type and memory layout. + // The LayoutFn class must provide a 1:1 transform between + // n-D and 1-D indices with a constant stride between + // consecutively-indexed elements in each dim. + // A trivially-copyable type for offloading. + template + class GenericVarCore { + friend class GenericVarBase; + friend class GenericVarTyped; + friend class GenericVar; + + protected: + // Sizes and index transform functions. + // Sizes are copies of GenericVarBase::_var_dims. 
+ LayoutFn _layout; + static_assert(std::is_trivially_copyable::value, + "Needed for OpenMP offload"); + + // Start of actual data, which may be offset from GenericVarBase::_base. + synced_ptr _elems = 0; + + public: + + // Get number of dims. + ALWAYS_INLINE idx_t get_num_dims() const { + return _layout.get_num_sizes(); + } + + // Get 1D index using layout. + // Basically a wrapper around _layout.layout(), but with range checking for debug. + ALWAYS_INLINE idx_t get_index(const Indices& idxs, bool check=true) const { + #ifdef CHECK + if (check) { + + // Make sure all indices are in bounds. + for (int i = 0; i < _layout.get_num_sizes(); i++) { + idx_t j = idxs[i]; + host_assert(j >= 0); + host_assert(j < _layout.get_size(i)); + } + } + + // Strictly, _elems doesn't need to be valid when 'get_index()' is called + // because we're not accessing data. But we will make this restriction + // when 'check' is 'true'. + if (check) + host_assert(_elems.get()); + #endif + + idx_t ai = _layout.layout(idxs); + + #ifdef CHECK + if (check) { + + // Make sure final 1D index is in bounds. + host_assert(ai >= 0); + host_assert(ai < _layout.get_num_elements()); + } + #endif + return ai; + } + + // Return pointer to given element. + ALWAYS_INLINE const T* get_ptr(const Indices& pt, bool check=true) const { + idx_t ai = get_index(pt, check); + return &_elems[ai]; + } + ALWAYS_INLINE T* get_ptr(const Indices& pt, bool check=true) { + idx_t ai = get_index(pt, check); + return &_elems[ai]; + } + + // Return ref to given element. + ALWAYS_INLINE const T& operator()(const Indices& pt, bool check=true) const { + idx_t ai = get_index(pt, check); + return _elems[ai]; + } + ALWAYS_INLINE T& operator()(const Indices& pt, bool check=true) { + idx_t ai = get_index(pt, check); + return _elems[ai]; + } + + }; //GenericVarCore. + // A base class for a generic n-D var. // This class does not define a type or memory layout. - // This class hierarchy is not virtual. + // This class is pure virtual. 
class GenericVarBase : public KernelStateBase { protected: - // Start of actual data, which is some offset from _base on host. - void* _elems = 0; // Name for var. std::string _name; @@ -53,7 +143,7 @@ namespace yask { const static int _numa_unset = -999; int _numa_pref = _numa_unset; // use default from _opts. - // Names and lengths of var dimensions. + // Names and sizes of dims in this var. IdxTuple _var_dims; // Ctor. No allocation is done. See notes on default_alloc(). @@ -61,6 +151,8 @@ namespace yask { GenericVarBase(KernelStateBase& state, const std::string& name, const VarDimNames& dim_names); + // Dtor. + virtual ~GenericVarBase() { } public: @@ -72,7 +164,7 @@ namespace yask { int get_numa_pref() const { STATE_VARS_CONST(this); return (_numa_pref != _numa_unset) ? - _numa_pref : opts->_numa_pref; + _numa_pref : actl_opts->_numa_pref; } bool set_numa_pref(int numa_node) { #ifdef USE_NUMA @@ -96,7 +188,7 @@ namespace yask { // Get number of dimensions. int get_num_dims() const { - return _var_dims._get_num_dims(); + return _var_dims.get_num_dims(); } // Get the nth dim name. @@ -114,6 +206,10 @@ namespace yask { return _var_dims.get_val(n); } + // Modify dim sizes. + virtual void set_dim_size(int n, idx_t size) =0; + virtual void set_dim_sizes(const Indices& sizes) =0; + // Return 'true' if dimensions are same names // and sizes, 'false' otherwise. bool are_dims_and_sizes_same(const GenericVarBase& src) const { @@ -121,16 +217,30 @@ namespace yask { } // Direct access to data. - void* get_storage() { - return (void*)_elems; - } - const void* get_storage() const { - return (void*)_elems; - } + virtual const void* get_storage() const =0; + virtual void* get_storage() =0; + + // Free any old storage. + // Set pointer to storage. + // 'base' should provide get_num_bytes() bytes at offset bytes. + virtual void set_storage(std::shared_ptr& base, size_t offset) =0; + + // Release storage. 
+ virtual void release_storage(bool reset_ptr) =0; + + // Perform default allocation. + // For other options, + // programmer should call get_num_elems() or get_num_bytes() and + // then provide allocated memory via set_storage(). + virtual void default_alloc() =0; + + // Get size in bytes. + virtual size_t get_num_bytes() const =0; }; // A base class for a generic n-D var of elements of arithmetic type T. // This class defines the type but does not define the memory layout. + // This class is pure virtual because its base is pure virtual. template class GenericVarTyped : public GenericVarBase { @@ -143,6 +253,9 @@ namespace yask { const VarDimNames& dim_names) : GenericVarBase(state, name, dim_names) { } + // Direct access to storage ptr. + virtual synced_ptr& get_elems() =0; + public: // Get size of one element. @@ -151,23 +264,25 @@ namespace yask { } // Get size in bytes. - size_t get_num_bytes() const { + size_t get_num_bytes() const override { return sizeof(T) * get_num_elems(); } // Free any old storage. // Set pointer to storage. // 'base' should provide get_num_bytes() bytes at offset bytes. - void set_storage(std::shared_ptr& base, size_t offset); + void set_storage(std::shared_ptr& base, size_t offset) override; // Release storage. - void release_storage(); + void release_storage(bool reset_ptr) override; + + // Sync pointer to data. + void sync_data_ptr() { + get_elems().sync(); + } // Perform default allocation. - // For other options, - // programmer should call get_num_elems() or get_num_bytes() and - // then provide allocated memory via set_storage(). - void default_alloc(); + void default_alloc() override; // Print some descriptive info. std::string make_info_string(const std::string& elem_name) const; @@ -180,57 +295,75 @@ namespace yask { }; // A generic n-D var of elements of type T. - // This class defines the type and memory layout. - // The LayoutFn class must provide a 1:1 transform between - // n-D and 1-D indices. 
+ // A pointer to a GenericVarCore obj must be given at construction. + // The GenericVar does NOT own the GenericVarCore obj. template class GenericVar : public GenericVarTyped { protected: + typedef GenericVarCore _core_t; + _core_t* _corep; + static_assert(std::is_trivially_copyable<_core_t>::value, + "Needed for OpenMP offload"); - // Sizes and index transform functions. - LayoutFn _layout; - - // Both _var_dims and _layout hold sizes unless this is a + // Both _var_dims and _core._layout hold sizes unless this is a // scalar. (For a scalar, _var_dims is empty.) // These functions keep them in sync. void _sync_dims_with_layout() { - Indices idxs(_layout.get_sizes()); + Indices idxs(_corep->_layout.get_sizes()); idxs.set_tuple_vals(GenericVarBase::_var_dims); } void _sync_layout_with_dims() { STATE_VARS(this); Indices idxs(GenericVarBase::_var_dims); - _layout.set_sizes(idxs); + _corep->_layout.set_sizes(idxs); + } + + // Direct access to storage ptr. + // Allows modifying the pointer itself. + synced_ptr& get_elems() override { + return _corep->_elems; } public: // Construct an unallocated var. + // Must supply a pointer to an existing _core_t. GenericVar(KernelStateBase& state, + _core_t* corep, std::string name, const VarDimNames& dim_names) : - GenericVarTyped(state, name, dim_names) { + GenericVarTyped(state, name, dim_names), + _corep(corep) { + assert(_corep); - // '_var_dims' was set in GenericVar construction. + // '_var_dims' was set in GenericVarBase construction. // Need to sync '_layout' w/it. _sync_layout_with_dims(); - assert(int(dim_names.size()) == _layout.get_num_sizes()); + assert(int(dim_names.size()) == _corep->_layout.get_num_sizes()); } ~GenericVar() { // Release data. - GenericVarTyped::release_storage(); + GenericVarTyped::release_storage(false); + } + + // Direct access to data. 
+ const void* get_storage() const override { + return (void*)_corep->_elems; + } + void* get_storage() override { + return (void*)_corep->_elems; } // Modify dim sizes. - void set_dim_size(int n, idx_t size) { + void set_dim_size(int n, idx_t size) override { GenericVarBase::_var_dims.set_val(n, size); _sync_layout_with_dims(); } - void set_dim_sizes(const Indices& sizes) { + void set_dim_sizes(const Indices& sizes) override { auto& vd = GenericVarBase::_var_dims; for (int i = 0; size_t(i) < vd.size(); i++) vd.set_val(i, sizes[i]); @@ -239,65 +372,67 @@ namespace yask { // Access all dim sizes. inline const Indices& get_dim_sizes() const { - return _layout.get_sizes(); + return _corep->_layout.get_sizes(); } // Get 1D index using layout. ALWAYS_INLINE idx_t get_index(const Indices& idxs, bool check=true) const { - #ifdef CHECK - if (check) { - for (int i = 0; size_t(i) < this->_var_dims.size(); i++) { - idx_t j = idxs[i]; - assert(j >= 0); - assert(j < this->_var_dims.get_val(i)); - } - } - #endif - - idx_t ai = _layout.layout(idxs); - - #ifdef CHECK - if (check) - assert(ai < this->get_num_elems()); - #endif - return ai; + return _corep->get_index(idxs, check); } ALWAYS_INLINE idx_t get_index(const IdxTuple& pt, bool check=true) const { - assert(GenericVarBase::_var_dims.are_dims_same(pt)); + host_assert(GenericVarBase::_var_dims.are_dims_same(pt)); Indices idxs(pt); return get_index(idxs, check); } // Pointer to given element. ALWAYS_INLINE const T* get_ptr(const Indices& pt, bool check=true) const { - idx_t ai = get_index(pt, check); - return &((T*)GenericVarBase::_elems)[ai]; + return _corep->get_ptr(pt, check); } ALWAYS_INLINE T* get_ptr(const Indices& pt, bool check=true) { - idx_t ai = get_index(pt, check); - return &((T*)GenericVarBase::_elems)[ai]; + return _corep->get_ptr(pt, check); } - // Return const ref to given element. + // Return ref to given element. 
ALWAYS_INLINE const T& operator()(const Indices& pt, bool check=true) const { - idx_t ai = get_index(pt, check); - return ((T*)GenericVarBase::_elems)[ai]; + return (*_corep)(pt, check); } ALWAYS_INLINE const T& operator()(const IdxTuple& pt, bool check=true) const { - idx_t ai = get_index(pt, check); - return ((T*)GenericVarBase::_elems)[ai]; + return _corep->_elems[get_index(pt, check)]; } - - // Non-const access to given element. ALWAYS_INLINE T& operator()(const Indices& pt, bool check=true) { - idx_t ai = get_index(pt, check); - return ((T*)GenericVarBase::_elems)[ai]; + return (*_corep)(pt, check); } ALWAYS_INLINE T& operator()(const IdxTuple& pt, bool check=true) { - idx_t ai = get_index(pt, check); - return ((T*)GenericVarBase::_elems)[ai]; + return _corep->_elems[get_index(pt, check)]; } + // Compute strides. + Indices get_strides() const { + auto nd = _corep->get_num_dims(); + Indices strides(nd); + for (int d = 0; d < nd; d++) { + + // For dim 'd', measure distance from index 0 to 1. + auto idxs = Indices(idx_t(0), nd); + auto i0 = get_index(idxs, false); + idxs[d] = 1; + auto i1 = get_index(idxs, false); + auto sd = i1 - i0; + strides[d] = sd; + assert(sd >= 0); + + // Check that the distance holds for other indices. + #ifdef CHECK + for (idx_t j : { 13, -17 }) { + idxs[d] = j; + auto i = get_index(idxs, false); + assert(i - i0 == sd * j); + } + #endif + } + return strides; + } }; } // namespace yask. 
diff --git a/src/kernel/lib/halo.cpp b/src/kernel/lib/halo.cpp new file mode 100644 index 00000000..2e1dfcb0 --- /dev/null +++ b/src/kernel/lib/halo.cpp @@ -0,0 +1,566 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kit +Copyright (c) 2014-2022, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +// This file contains implementations of halo exchange methods. +// Also see context.cpp, setup.cpp, and soln_apis.cpp. 
+ +#include "yask_stencil.hpp" +using namespace std; + +namespace yask { + + /* + Host halo exchange w/explicit shared memory (shm): + rank I rank J (neighbor of I) + --------------------- ---------------------- + var A ----------------> shared_buf ---------------> var A + pack unpack + + Host halo exchange w/o shm: + rank I rank J (neighbor of I) + --------------------- ---------------------- + var A ----> local_buf ------------> local_buf ----> var A + pack MPI_isend MPI_irecv unpack + + Device halo exchange w/o direct device copy w/o shm: + rank I rank J (neighbor of I) + ----------------------- ----------------------- + var A --> dev_local_buf dev_local_buf --> var A + pack | copy to host ^ unpack + V | copy to dev + host_local_buf ---------> host_local_buf + MPI_isend MPI_irecv + + Device halo exchange w/direct device copy w/o shm: + rank I rank J (neighbor of I) + --------------------------- ----------------------------- + dev var A --> dev_local_buf -----> dev_local_buf ----> dev var A + pack MPI_isend MPI_irecv unpack + (may still be implicitly routed through host.) + + Options not yet implemented: + * Device halo exchange w/o direct device copy w/shm. + * Device halo exchange w/direct device copy w/shm. + */ + + // Exchange dirty halo data for all vars and all steps. + void StencilContext::exchange_halos() { + + #if defined(USE_MPI) + STATE_VARS(this); + if (!actl_opts->do_halo_exchange || env->num_ranks < 2) + return; + auto& use_offload = KernelEnv::_use_offload; + auto use_device_mpi = use_offload ? actl_opts->use_device_mpi : false; + + halo_time.start(); + double wait_delta = 0.; + TRACE_MSG("following calc of " << make_mpi_section_descr()); + + // Vars that need to be swapped and their step indices. + struct SwapInfo { + YkVarPtr gp; + set steps; + }; + vector vars_to_swap; + + // Loop thru all vars in stencil. 
+ for (auto& gp : orig_var_ptrs) { + auto& gb = gp->gb(); + assert(!gb.is_scratch()); + + // Only need to swap data in vars that have any MPI buffers. + auto& gname = gp->get_name(); + if (mpi_data.count(gname) == 0) + continue; + + // Check all allocated step indices. + // Use '0' for vars that don't use the step dim. + idx_t start_t = 0, stop_t = 1; + if (gp->is_dim_used(step_dim)) { + start_t = gp->get_first_valid_step_index(); + stop_t = gp->get_last_valid_step_index() + 1; + } + bool first = true; + for (idx_t t = start_t; t < stop_t; t++) { + + // If my var is dirty, the 'others' flag should always + // be set. Otherwise, the MPI exchanges will get + // out-of-sync because my neighbor might not ask for + // my data. + if (gb.is_dirty(YkVarBase::self, t)) + assert(gb.is_dirty(YkVarBase::others, t)); + + // Only need to swap vars whose halos are not up-to-date + // for this step. + if (!gb.is_dirty(YkVarBase::others, t)) + continue; + + // Swap this var. + if (first) { + SwapInfo si; + si.gp = gp; + vars_to_swap.push_back(si); + first = false; + } + vars_to_swap.back().steps.insert(t); + + } // steps. + } // vars. + TRACE_MSG("need to exchange halos from " << vars_to_swap.size() << + " var(s)"); + + // Sequence of things to do for each neighbor. + enum halo_steps { halo_irecv, halo_pack_isend, halo_unpack, halo_final }; + vector steps_to_do; + + // Flags indicate what part of vars were most recently calc'd. + // These determine what exchange steps need to be done now. 
+ if (vars_to_swap.size()) { + if (do_mpi_left || do_mpi_right) { + steps_to_do.push_back(halo_irecv); + steps_to_do.push_back(halo_pack_isend); + } + if (do_mpi_interior) { + steps_to_do.push_back(halo_unpack); + steps_to_do.push_back(halo_final); + } + } + + int num_send_reqs = 0; + int num_recv_reqs = 0; + for (auto halo_step : steps_to_do) { + + if (halo_step == halo_irecv) + TRACE_MSG("requesting data phase"); + else if (halo_step == halo_pack_isend) + TRACE_MSG("packing and sending data phase"); + else if (halo_step == halo_unpack) + TRACE_MSG("waiting for and unpacking data phase"); + else if (halo_step == halo_final) + TRACE_MSG("waiting for send to finish phase"); + else + THROW_YASK_EXCEPTION("internal error: unknown halo-exchange step"); + + // Loop thru all vars to swap. + // Use 'gi' as an MPI tag. + int gi = 0; + for (auto& si : vars_to_swap) { + gi++; + auto gp = si.gp; + auto& gb = gp->gb(); + auto& gname = gb.get_name(); + TRACE_MSG(" processing var '" << gname << "', " << si.steps.size() << " step(s)"); + auto& var_mpi_data = mpi_data.at(gname); + auto* var_recv_reqs = var_mpi_data.recv_reqs.data(); + auto* var_send_reqs = var_mpi_data.send_reqs.data(); + auto* var_recv_stats = var_mpi_data.recv_stats.data(); + bool finalizing_var = false; + + // Loop thru all this rank's neighbors. + var_mpi_data.visit_neighbors + ([&](const IdxTuple& offsets, // NeighborOffset. + int neighbor_rank, + int ni, // unique neighbor index. + MPIBufs& bufs) { + auto& send_buf = bufs.bufs[MPIBufs::buf_send]; + auto& recv_buf = bufs.bufs[MPIBufs::buf_recv]; + TRACE_MSG("exchange_halos: with rank " << neighbor_rank << + " at relative position " << + offsets.sub_elements(1).make_dim_val_offset_str()); + + // Are we using MPI shm w/this neighbor? + bool using_shm = actl_opts->use_shm && + mpi_info->shm_ranks.at(ni) != MPI_PROC_NULL; + + // Submit async request to receive data from neighbor. 
+ if (halo_step == halo_irecv) { + auto nbbytes = recv_buf.get_bytes(); + if (nbbytes) { + if (using_shm) + TRACE_MSG("exchange_halos: no receive req due to shm"); + else { + void* buf = (void*)recv_buf._elems; + void* rbuf = use_device_mpi ? get_dev_ptr(buf) : buf; + TRACE_MSG("exchange_halos: requesting up to " << + make_byte_str(nbbytes) << " into " << rbuf); + auto& r = var_recv_reqs[ni]; + if (nbbytes != int(nbbytes)) + THROW_YASK_EXCEPTION("error: int overflow in MPI_Irecv()"); + MPI_Irecv(rbuf, int(nbbytes), MPI_BYTE, + neighbor_rank, int(gi), + env->comm, &r); + num_recv_reqs++; + } + } + else + TRACE_MSG("exchange_halos: 0B to request"); + } // recv step. + + // Pack data into send buffer, then send to neighbor. + else if (halo_step == halo_pack_isend) { + auto nbbytes = send_buf.get_bytes(); + if (nbbytes) { + + // Vec ok? + // Domain sizes must be ok, and buffer size must be ok + // as calculated when buffers were created. + bool send_vec_ok = send_buf.vec_copy_ok; + + // Get first and last indices to pack from. + IdxTuple first = send_buf.begin_pt; + IdxTuple last = send_buf.last_pt; + + // Wait until buffer is avail if sharing one. + if (using_shm) { + TRACE_MSG("exchange_halos: waiting to write to shm buffer"); + halo_lock_wait_time.start(); + send_buf.wait_for_ok_to_write(); + wait_delta += halo_lock_wait_time.stop(); + } + + // Check to see if my var is dirty in any step that the + // 'others' may be dirty in. + bool is_mine_dirty = false; + for (auto t : si.steps) { + if (gb.is_dirty(YkVarBase::self, t)) + is_mine_dirty = true; + } + + // Copy (pack) data from var to buffer. + void* buf = (void*)send_buf._elems; + size_t npbytes = 0; + char* bufp = (char*)buf; + + // Pack one step at a time. TODO: develop + // mechanism to allow only dirty steps to + // be packed and sent; this would involve + // sending the dirty step indices and/or a + // list of sizes. Currently, all + // possibly-dirty steps are sent if any is + // dirty. 
+ if (is_mine_dirty) { + halo_pack_time.start(); + for (auto t : si.steps) { + if (gp->is_dim_used(step_dim)) { + first.set_val(step_dim, t); + last.set_val(step_dim, t); + } + TRACE_MSG("exchange_halos: packing [" << first.make_dim_val_str() << + " ... " << last.make_dim_val_str() << "] with " << + (send_vec_ok ? "vector" : "scalar") << + " copy into " << (void*)bufp << + (use_offload ? " on device" : " on host")); + idx_t nelems = 0; + if (send_vec_ok) + nelems = gb.get_vecs_in_slice(bufp, first, last, use_offload); + else + nelems = gb.get_elements_in_slice(bufp, first, last, use_offload); + auto nb = nelems * get_element_bytes(); + bufp += nb; + npbytes += nb; + } + halo_pack_time.stop(); + assert(npbytes <= nbbytes); + + // Copy packed data from device if needed. + if (use_offload && !use_device_mpi) { + TRACE_MSG("exchange_halos: copying buffer from device"); + halo_copy_time.start(); + offload_copy_from_device(buf, npbytes); + + if (npbytes) { + real_t v0 = *((real_t*)buf); + TRACE_MSG("exchange_halos: got " << v0 << " ... from device"); + } + + halo_copy_time.stop(); + assert(!using_shm); + } + } + + // Send data (might be 0 bytes, but still need to send). + if (using_shm) { + TRACE_MSG("exchange_halos: put " << make_byte_str(npbytes) << + " into shm"); + send_buf.set_data(npbytes); // Send size thru lock. + send_buf.mark_write_done(); + } + else { + + // Send packed buffer to neighbor. + auto& r = var_send_reqs[ni]; + void* sbuf = use_device_mpi ? get_dev_ptr(buf) : buf; + TRACE_MSG("exchange_halos: sending " << make_byte_str(npbytes) << + " from " << sbuf); + if (npbytes != int(npbytes)) + THROW_YASK_EXCEPTION("error: int overflow in MPI_Isend()"); + MPI_Isend(sbuf, int(npbytes), MPI_BYTE, + neighbor_rank, int(gi), env->comm, &r); + num_send_reqs++; + } + } + else + TRACE_MSG(" 0B to send"); + } // pack & send step. + + // Wait for data from neighbor, then unpack it. 
+ else if (halo_step == halo_unpack) { + auto nbbytes = recv_buf.get_bytes(); + if (nbbytes) { + int nbytes = 0; + + // Wait until data in buffer is avail. + if (using_shm) { + TRACE_MSG("exchange_halos: waiting for data in shm buffer"); + halo_lock_wait_time.start(); + recv_buf.wait_for_ok_to_read(); + wait_delta += halo_lock_wait_time.stop(); + nbytes = recv_buf.get_data(); // Size was stored in lock. + } + else { + + auto& r = var_recv_reqs[ni]; + auto& s = var_recv_stats[ni]; + + if (r == MPI_REQUEST_NULL) { + // Already got status from an MPI_Test* or MPI_Wait* function. + TRACE_MSG("exchange_halos: already received up to " << + make_byte_str(nbbytes)); + } + + else { + // Wait for data from neighbor before unpacking it. + TRACE_MSG("exchange_halos: waiting for receipt of up to " << + make_byte_str(nbbytes)); + halo_wait_time.start(); + MPI_Wait(&r, &s); + wait_delta += halo_wait_time.stop(); + r = MPI_REQUEST_NULL; + } + MPI_Get_count(&s, MPI_BYTE, &nbytes); + } + TRACE_MSG("exchange_halos: got " << make_byte_str(nbytes)); + assert(nbytes <= nbbytes); + + if (!nbytes) { + TRACE_MSG("exchange_halos: received no data"); + } else { + + // Vec ok? + bool recv_vec_ok = recv_buf.vec_copy_ok; + + // Get first and last ranges. + IdxTuple first = recv_buf.begin_pt; + IdxTuple last = recv_buf.last_pt; + + void* buf = (void*)recv_buf._elems; + if (use_offload && !use_device_mpi) { + TRACE_MSG("exchange_halos: copying buffer to device"); + halo_copy_time.start(); + offload_copy_to_device(buf, nbytes); + + real_t v0 = *((real_t*)buf); + TRACE_MSG("exchange_halos: sent " << v0 << " ... to device"); + + halo_copy_time.stop(); + } + + // Copy data from buffer to var. + size_t npbytes = 0; + char* bufp = (char*)buf; + + // Unpack one step at a time. 
+ halo_unpack_time.start(); + for (auto t : si.steps) { + if (gp->is_dim_used(step_dim)) { + first.set_val(step_dim, t); + last.set_val(step_dim, t); + } + TRACE_MSG("exchange_halos: unpacking into [" << + first.make_dim_val_str() << + " ... " << last.make_dim_val_str() << "] with " << + (recv_vec_ok ? "vector" : "scalar") << + " copy from " << (void*)bufp << + (use_offload ? " on device" : " on host")); + idx_t nelems = 0; + if (recv_vec_ok) + nelems = gp->set_vecs_in_slice(bufp, first, last, use_offload); + else + nelems = gp->set_elements_in_slice(bufp, first, last, use_offload); + auto nb = nelems * get_element_bytes(); + bufp += nb; + npbytes += nb; + } + halo_unpack_time.stop(); + + // Should have unpacked exactly what we got. + assert(npbytes == nbytes); + } + + if (using_shm) + recv_buf.mark_read_done(); + } + else + TRACE_MSG("exchange_halos: 0B to wait for"); + } // unpack step. + + // Final steps. + else if (halo_step == halo_final) { + auto nbbytes = send_buf.get_bytes(); + if (nbbytes) { + + if (using_shm) + TRACE_MSG("exchange_halos: no send wait due to shm"); + else { + + // Wait for send to finish. + // TODO: consider using MPI_WaitAll. Would need to do + // it outside the loops. + // TODO: strictly, we don't have to wait on the + // send to finish until we want to reuse this buffer, + // so we could wait on the *previous* send right before + // doing another one. + auto& r = var_send_reqs[ni]; + if (r != MPI_REQUEST_NULL) { + TRACE_MSG(" waiting to finish send of up to " << make_byte_str(nbbytes)); + halo_wait_time.start(); + MPI_Wait(&var_send_reqs[ni], MPI_STATUS_IGNORE); + wait_delta += halo_wait_time.stop(); + } + r = MPI_REQUEST_NULL; + } + } + finalizing_var = true; + + } // final step. + + }); // visit neighbors. + + // Did we finish w/this var? + if (finalizing_var) { + + // Mark var as up-to-date. 
+ gb.set_dirty_all(YkVarBase::self, false); + gb.set_dirty_all(YkVarBase::others, false); + TRACE_MSG(" var '" << gname << "' marked as clean"); + } + + } // vars to swap. + } // exchange sequence. + + TRACE_MSG(num_recv_reqs << " MPI receive request(s) issued"); + TRACE_MSG(num_send_reqs << " MPI send request(s) issued"); + auto mpi_call_time = halo_time.stop(); + TRACE_MSG("secs spent in MPI waits: " << make_num_str(wait_delta)); + TRACE_MSG("secs spent in this call: " << make_num_str(mpi_call_time)); + #endif + } + + // Call MPI_Test() on all unfinished requests to advance MPI progress. + void StencilContext::adv_halo_exchange() { + + #if defined(USE_MPI) + STATE_VARS(this); + + // Exit if no exchanges. + if (!actl_opts->do_halo_exchange || env->num_ranks < 2) + return; + + // Exit if all exchanges are w/shm. + if (env->num_shm_ranks == env->num_ranks && actl_opts->use_shm) + return; + + halo_test_time.start(); + TRACE_MSG("entering"); + + // Loop thru MPI data. + int num_tests = 0; + for (auto& mdi : mpi_data) { + auto& gname = mdi.first; + auto& var_mpi_data = mdi.second; + auto* var_recv_reqs = var_mpi_data.recv_reqs.data(); + auto* var_send_reqs = var_mpi_data.send_reqs.data(); + auto* var_recv_stats = var_mpi_data.recv_stats.data(); + auto* var_send_stats = var_mpi_data.send_stats.data(); + + #ifndef USE_INDIV_MPI_TESTS + + // Bulk testing. + auto asize = max(var_mpi_data.recv_reqs.size(), var_mpi_data.send_reqs.size()); + int indices[asize]; + MPI_Status stats[asize]; + int n = 0; + MPI_Testsome(int(var_mpi_data.recv_reqs.size()), var_recv_reqs, &n, indices, stats); + num_tests++; + TRACE_MSG("completed " << n << " recv requests"); + for (int i = 0; i < n; i++) { + int loc = indices[i]; // Location of completed recv. + var_recv_stats[loc] = stats[i]; // Update correct stat. + assert(var_recv_reqs[loc] == MPI_REQUEST_NULL); + int nbytes = 0; + MPI_Get_count(&stats[i], MPI_BYTE, &nbytes); + TRACE_MSG((i+1) << ". 
got " << make_byte_str(nbytes) << " from req " << loc); + } + MPI_Testsome(int(var_mpi_data.send_reqs.size()), var_send_reqs, &n, indices, stats); + num_tests++; + for (int i = 0; i < n; i++) { + int loc = indices[i]; // Location of completed send. + var_send_stats[loc] = stats[i]; // Update correct stat. + assert(var_send_reqs[loc] == MPI_REQUEST_NULL); + } + + #else + // Individual testing. + int flag = 0; + for (size_t i = 0; i < var_mpi_data.recv_reqs.size(); i++) { + auto& r = var_recv_reqs[i]; + if (r != MPI_REQUEST_NULL) { + //TRACE_MSG(gname << " recv test &MPI_Request = " << &r); + MPI_Test(&r, &flag, &var_recv_stats[i]); + num_tests++; + if (flag) + r = MPI_REQUEST_NULL; + } + } + for (size_t i = 0; i < var_mpi_data.send_reqs.size(); i++) { + auto& r = var_send_reqs[i]; + if (r != MPI_REQUEST_NULL) { + //TRACE_MSG(gname << " send test &MPI_Request = " << &r); + MPI_Test(&r, &flag, &var_send_stats[i]); + num_tests++; + if (flag) + r = MPI_REQUEST_NULL; + } + } + #endif + } + auto ttime = halo_test_time.stop(); + TRACE_MSG("secs spent in " << num_tests << + " MPI test(s): " << make_num_str(ttime)); + #endif + } + +} // namespace yask. diff --git a/src/kernel/lib/indices.hpp b/src/kernel/lib/indices.hpp index 40b14f3f..0ca733a6 100644 --- a/src/kernel/lib/indices.hpp +++ b/src/kernel/lib/indices.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -27,45 +27,34 @@ IN THE SOFTWARE. namespace yask { - typedef std::vector VarIndices; - typedef std::vector VarDimSizes; - typedef std::vector VarDimNames; + typedef idx_t_vec VarIndices; + typedef idx_t_vec VarDimSizes; + typedef string_vec VarDimNames; + + // Max number of indices that can be held. 
+ // Note use of "+max_idxs" in code below to avoid compiler + // trying to take a reference to it, resulting in an undefined + // symbol (sometimes). + constexpr int max_idxs = MAX_DIMS; + + // Step dim is always in [0] of an Indices type (if it is used). + constexpr int step_posn = 0; // A class to hold up to a given number of sizes or indices efficiently. // Similar to a Tuple, but less overhead and doesn't keep names. // This class is NOT virtual. - // TODO: make this a template with _ndims as a parameter. + // TODO: add a template parameter for max indices. // TODO: ultimately, combine with Tuple w/o loss of efficiency. class Indices { - public: - - // Max number of indices that can be held. - // Note use of "+max_idxs" in code below to avoid compiler - // trying to take a reference to it, resulting in an undefined - // symbol (sometimes). - static constexpr int max_idxs = MAX_DIMS; - - // Step dim is always in [0] of an Indices type (if it is used). - static constexpr int step_posn = 0; - protected: - idx_t _idxs[max_idxs]; - int _ndims; + idx_t _idxs[+max_idxs]; // Index values. + int _ndims; // Number of indices used. public: // Ctors. Indices() : _ndims(0) { } Indices(int ndims) : _ndims(ndims) { } // NB: _idxs remain uninit! - Indices(const IdxTuple& src) { - set_from_tuple(src); - } - Indices(const VarIndices& src) { - set_from_vec(src); - } - Indices(const std::initializer_list& src) { - set_from_init_list(src); - } Indices(const idx_t src[], int ndims) { set_from_array(src, ndims); } @@ -76,63 +65,36 @@ namespace yask { // Default copy ctor, copy operator should be okay. // Access size. - inline int _get_num_dims() const { + ALWAYS_INLINE int get_num_dims() const { return _ndims; } - inline void set_num_dims(int n) { + ALWAYS_INLINE void set_num_dims(int n) { _ndims = n; } // Access indices. 
- inline idx_t& operator[](int i) { - assert(i >= 0); - assert(i < _ndims); + ALWAYS_INLINE idx_t& operator[](int i) { + host_assert(i >= 0); + host_assert(i < _ndims); return _idxs[i]; } - inline const idx_t& operator[](int i) const { - assert(i >= 0); - assert(i < _ndims); + ALWAYS_INLINE const idx_t& operator[](int i) const { + host_assert(i >= 0); + host_assert(i < _ndims); return _idxs[i]; } - // Write to an IdxTuple. - // The 'tgt' must have the same number of dims. - void set_tuple_vals(IdxTuple& tgt) const { - assert(tgt.size() == size_t(_ndims)); - for (int i = 0; i < _ndims; i++) - if (size_t(i) < tgt.size()) - tgt.set_val(i, _idxs[i]); - } - - // Read from an IdxTuple. - void set_from_tuple(const IdxTuple& src) { - assert(src.size() <= +max_idxs); - int n = int(src.size()); - for (int i = 0; i < n; i++) - _idxs[i] = src.get_val(i); - _ndims = n; - } - - // Other inits. - void set_from_vec(const VarIndices& src) { - assert(src.size() <= +max_idxs); - int n = int(src.size()); - for (int i = 0; i < n; i++) - _idxs[i] = src[i]; - _ndims = n; - } - // default n => don't change _ndims. void set_from_array(const idx_t src[], int n = -1) { if (n < 0) n = _ndims; - assert(n <= +max_idxs); + host_assert(n <= +max_idxs); for (int i = 0; i < n; i++) _idxs[i] = src[i]; _ndims = n; } - void set_from_init_list(const std::initializer_list& src) { - assert(src.size() <= +max_idxs); + void set_from_init_list(const idx_t_init_list& src) { + host_assert(src.size() <= +max_idxs); int i = 0; for (auto idx : src) _idxs[i++] = idx; @@ -141,20 +103,25 @@ namespace yask { // default n => don't change _ndims. void set_from_const(idx_t val, int n = -1) { - if (n < 0) - n = _ndims; - assert(n <= +max_idxs); - for (int i = 0; i < n; i++) + if (n >= 0) + _ndims = n; + host_assert(_ndims <= +max_idxs); + + #if EXACT_INDICES + // Use just the used elements. + for (int i = 0; i < _ndims; i++) _idxs[i] = val; - _ndims = n; + #else + // Use all to allow unroll and avoid jumps. 
+ _UNROLL for (int i = 0; i < +max_idxs; i++) + _idxs[i] = val; + #endif } void set_vals_same(idx_t val) { set_from_const(val); } // Some comparisons. - // These assume all the indices are valid or - // initialized to the same value. bool operator==(const Indices& rhs) const { if (_ndims != rhs._ndims) return false; @@ -191,111 +158,340 @@ namespace yask { return false; // equal, so not greater than. } + // In-place math function types, i.e., 'lhs' is modified. + struct AddFunc { + ALWAYS_INLINE static void func(idx_t& lhs, idx_t rhs) { lhs += rhs; } + }; + struct SubFunc { + ALWAYS_INLINE static void func(idx_t& lhs, idx_t rhs) { lhs -= rhs; } + }; + struct MulFunc { + ALWAYS_INLINE static void func(idx_t& lhs, idx_t rhs) { lhs *= rhs; } + }; + struct DivFunc { + ALWAYS_INLINE static void func(idx_t& lhs, idx_t rhs) { lhs /= rhs; } + }; + struct MinFunc { + ALWAYS_INLINE static void func(idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); } + }; + struct MaxFunc { + ALWAYS_INLINE static void func(idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); } + }; + // Generic element-wise operator. + // Instantiated with one of the above structs. // Returns a new object. - inline Indices combine_elements(std::function func, - const Indices& other) const { + template + ALWAYS_INLINE + Indices combine_elements(const Indices& other) const { + host_assert(_ndims == other._ndims); Indices res(*this); -#if EXACT_INDICES + #if EXACT_INDICES // Use just the used elements. for (int i = 0; i < _ndims; i++) -#else + T::func(res._idxs[i], other._idxs[i]); + #else // Use all to allow unroll and avoid jumps. - _UNROLL for (int i = 0; i < max_idxs; i++) -#endif - func(res._idxs[i], other._idxs[i]); + _UNROLL for (int i = 0; i < +max_idxs; i++) + T::func(res._idxs[i], other._idxs[i]); + #endif return res; } // Some element-wise operators. // These all return a new set of Indices rather // than modifying this object. 
- inline Indices add_elements(const Indices& other) const { - return combine_elements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; }, - other); - } - inline Indices sub_elements(const Indices& other) const { - return combine_elements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; }, - other); + ALWAYS_INLINE Indices add_elements(const Indices& other) const { + return combine_elements(other); } - inline Indices mul_elements(const Indices& other) const { - return combine_elements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; }, - other); + ALWAYS_INLINE Indices sub_elements(const Indices& other) const { + return combine_elements(other); } - inline Indices div_elements(const Indices& other) const { - return combine_elements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; }, - other); + ALWAYS_INLINE Indices mul_elements(const Indices& other) const { + return combine_elements(other); } - inline Indices min_elements(const Indices& other) const { - return combine_elements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); }, - other); + ALWAYS_INLINE Indices min_elements(const Indices& other) const { + return combine_elements(other); } - inline Indices max_elements(const Indices& other) const { - return combine_elements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); }, - other); + ALWAYS_INLINE Indices max_elements(const Indices& other) const { + return combine_elements(other); } + // Divide done differently to avoid div-by-zero when EXACT_INDICES + // not defined. + ALWAYS_INLINE Indices div_elements(const Indices& other) const { + host_assert(_ndims == other._ndims); + Indices res(*this); + + for (int i = 0; i < _ndims; i++) + res._idxs[i] = _idxs[i] / other._idxs[i]; + return res; + } + // Generic element-wise operator with RHS const. + // Instantiated with one of the above structs. // Returns a new object. 
- inline Indices map_elements(std::function func, - idx_t crhs) const { + template + ALWAYS_INLINE + Indices map_elements(idx_t crhs) const { Indices res(*this); -#if EXACT_INDICES + #if EXACT_INDICES // Use just the used elements. for (int i = 0; i < _ndims; i++) -#else + T::func(res._idxs[i], crhs); + #else // Use all to allow unroll and avoid jumps. - _UNROLL for (int i = 0; i < max_idxs; i++) -#endif - func(res._idxs[i], crhs); + _UNROLL for (int i = 0; i < +max_idxs; i++) + T::func(res._idxs[i], crhs); + #endif return res; } // Operate on all elements. + ALWAYS_INLINE Indices add_const(idx_t crhs) const { - return map_elements([&](idx_t& lhs, idx_t rhs) { lhs += rhs; }, - crhs); + return map_elements(crhs); } + ALWAYS_INLINE Indices sub_const(idx_t crhs) const { - return map_elements([&](idx_t& lhs, idx_t rhs) { lhs -= rhs; }, - crhs); + return map_elements(crhs); } + ALWAYS_INLINE Indices mul_const(idx_t crhs) const { - return map_elements([&](idx_t& lhs, idx_t rhs) { lhs *= rhs; }, - crhs); + return map_elements(crhs); } + ALWAYS_INLINE Indices div_const(idx_t crhs) const { - return map_elements([&](idx_t& lhs, idx_t rhs) { lhs /= rhs; }, - crhs); + return map_elements(crhs); } + ALWAYS_INLINE Indices min_const(idx_t crhs) const { - return map_elements([&](idx_t& lhs, idx_t rhs) { lhs = std::min(lhs, rhs); }, - crhs); + return map_elements(crhs); } + ALWAYS_INLINE Indices max_const(idx_t crhs) const { - return map_elements([&](idx_t& lhs, idx_t rhs) { lhs = std::max(lhs, rhs); }, - crhs); + return map_elements(crhs); } // Reduce over all elements. - inline idx_t sum() const { + ALWAYS_INLINE + idx_t sum() const { idx_t res = 0; for (int i = 0; i < _ndims; i++) res += _idxs[i]; return res; } - inline idx_t product() const { + ALWAYS_INLINE + idx_t product() const { idx_t res = 1; for (int i = 0; i < _ndims; i++) res *= _idxs[i]; return res; } - // Make a Tuple w/given names. + // Convert 1D 'offset' to N-d offsets using values in 'this' as sizes of N-d space. 
+ // If 'first_inner' is 'true', '(*this)[0]' is innermost dim (fortran-like); + // if 'first_inner' is 'false', '(*this)[_ndims-1]' is innermost dim (C-like). + Indices unlayout(bool first_inner, size_t offset) const { + Indices res(*this); + size_t prev_size = 1; + + // Loop thru dims. + int start_dim = first_inner ? 0 : _ndims-1; + int stop_dim = first_inner ? _ndims : -1; + int step_dim = first_inner ? 1 : -1; + for (int di = start_dim; di != stop_dim; di += step_dim) { + size_t dsize = size_t(_idxs[di]); + host_assert (dsize >= 0); + + // Div offset by product of previous dims. + size_t dofs = offset / prev_size; + + // Wrap within size of this dim. + dofs %= dsize; + + // Save in result. + res[di] = dofs; + + prev_size *= dsize; + host_assert(prev_size <= size_t(product())); + } + return res; + } + + // Advance 'idxs' containing indices in the N-d space defined by + // 'this' to the next logical index. + // Input 'idxs' must contain valid indices, i.e., each value must + // be between 0 and N-1, where N is the value in the corresponding + // dim in 'this'. + // If 'idxs' is at last index, "wraps-around" to all zeros. + // See 'unlayout()' for description of 'first_inner'. + inline void next_index(bool first_inner, Indices& idxs) const { + const int inner_dim = first_inner ? 0 : _ndims-1; + const int dim_step = first_inner ? 1 : -1; + + // Increment inner dim. + idxs[inner_dim]++; + + // Wrap around indices as needed. + // First test is redundant, but keeps us from entering loop most times. + if (idxs[inner_dim] >= _idxs[inner_dim]) { + for (int j = 0, k = inner_dim; j < _ndims; j++, k += dim_step) { + + // If too far in dim 'k', set idx to 0 and increment idx in next dim. + if (idxs[k] >= _idxs[k]) { + idxs[k] = 0; + int nxt_dim = k + dim_step; + if (nxt_dim >= 0 && nxt_dim < _ndims) + idxs[nxt_dim]++; + } + else + break; + } + } + } + + // More ctors. 
+ Indices(const IdxTuple& src) { + set_from_tuple(src); + } + Indices(const VarIndices& src) { + set_from_vec(src); + } + Indices(const idx_t_init_list& src) { + set_from_init_list(src); + } + + // Write to an IdxTuple. + // The 'tgt' must have the same number of dims. + void set_tuple_vals(IdxTuple& tgt) const { + host_assert(tgt.size() == size_t(_ndims)); + for (int i = 0; i < _ndims; i++) + if (size_t(i) < tgt.size()) + tgt.set_val(i, _idxs[i]); + } + + // Read from an IdxTuple. + void set_from_tuple(const IdxTuple& src) { + host_assert(src.size() <= +max_idxs); + _ndims = int(src.size()); + for (int i = 0; i < _ndims; i++) + _idxs[i] = src.get_val(i); + } + + // Other inits. + void set_from_vec(const VarIndices& src) { + host_assert(src.size() <= +max_idxs); + int n = int(src.size()); + for (int i = 0; i < n; i++) + _idxs[i] = src[i]; + _ndims = n; + } + + // Call the 'visitor' lambda function at every point sequentially in + // the N-d space defined by 'this'. At each call, 'idxs' contains + // next point in N-d space, and 'idx' contains sequentially-numbered + // 1-d index. + // Stops and returns 'false' if/when visitor returns 'false'. + // See 'unlayout()' for description of 'first_inner'. + bool visit_all_points(bool first_inner, + std::function visitor) const { + Indices idxs(*this); + idxs.set_vals_same(0); + + // Total number of points to visit. + idx_t ne = product(); + + // 0 points? + if (ne < 1) + return true; + + // 1 point? + else if (ne == 1) + return visitor(idxs, 0); + + // Visit each point in sequential order. + for (idx_t i = 0; i < ne; i++) { + + // Call visitor. + bool ok = visitor(idxs, i); + if (!ok) + return false; + + // Jump to next index. + next_index(first_inner, idxs); + } + return true; + } + + // Same as visit_all_points(), except ranges of points are visited + // concurrently, and return value from 'visitor' is ignored. 
+ void visit_all_points_in_parallel(bool first_inner, + std::function visitor) const { + // Total number of points to visit. + idx_t ne = product(); + + // 0 points? + if (ne < 1) + return; + + // 1 point? + else if (ne == 1) { + Indices idxs(*this); + idxs.set_vals_same(0); + visitor(idxs, 0); + return; + } + + #ifdef _OPENMP + + // Num threads to be started. + idx_t nthr = yask_get_num_threads(); + + // Start visits in parallel. + // (Not guaranteed that each tnum will be unique in every OMP + // impl, so don't rely on it.) + yask_parallel_for + (0, nthr, 1, + [&](idx_t n, idx_t np1, idx_t tnum) { + + // Start and stop indices for this thread. + idx_t start = div_equally_cumu_size_n(ne, nthr, n - 1); + idx_t stop = div_equally_cumu_size_n(ne, nthr, n); + assert(stop >= start); + if (stop <= start) + return; // from lambda. + + // Make Indices for this thread. + Indices idxs(*this); + + // Convert 1st linear index to n-dimensional indices. + idxs = unlayout(first_inner, start); + + // Visit each point in sequential order within this thread. + for (idx_t i = start; i < stop; i++) { + + // Call visitor. + visitor(idxs, i); + + // Jump to next index. + next_index(first_inner, idxs); + } + }); + + #else + // No OMP; use sequential version. + visit_all_points(first_inner, visitor); + #endif + } + + // Make a Tuple w/given names using values in 'this'. IdxTuple make_tuple(const VarDimNames& names) const { - assert((int)names.size() == _ndims); + host_assert((int)names.size() == _ndims); // Make a Tuple from names. IdxTuple tmp; @@ -304,7 +500,7 @@ namespace yask { return tmp; } - // Make a Tuple w/o useful names. + // Make a Tuple w/names 'd0', 'd1', etc. using values in 'this'. IdxTuple make_tuple() const { IdxTuple tmp; for (int i = 0; i < _ndims; i++) @@ -312,7 +508,7 @@ namespace yask { return tmp; } - // Make a Tuple w/names from another Tuple. + // Make a Tuple w/names from another Tuple using values in 'this'. 
IdxTuple make_tuple(const IdxTuple& names) const { auto tmp = names.get_dim_names(); return make_tuple(tmp); @@ -320,40 +516,42 @@ namespace yask { // Make string like "x=4, y=8". std::string make_dim_val_str(const VarDimNames& names, - std::string separator=", ", - std::string infix="=", - std::string prefix="", - std::string suffix="") const { + std::string separator=", ", + std::string infix="=", + std::string prefix="", + std::string suffix="") const { auto tmp = make_tuple(names); return tmp.make_dim_val_str(separator, infix, prefix, suffix); } - std::string make_dim_val_str(const IdxTuple& names, // ignore values. - std::string separator=", ", - std::string infix="=", - std::string prefix="", - std::string suffix="") const { + std::string make_dim_val_str(const IdxTuple& names, // ignore values in 'names'. + std::string separator=", ", + std::string infix="=", + std::string prefix="", + std::string suffix="") const { auto tmp = make_tuple(names); return tmp.make_dim_val_str(separator, infix, prefix, suffix); } // Make string like "4, 3, 2". std::string make_val_str(std::string separator=", ", - std::string prefix="", - std::string suffix="") const { + std::string prefix="", + std::string suffix="") const { // Make a Tuple w/o useful names. auto tmp = make_tuple(); return tmp.make_val_str(separator, prefix, suffix); } }; + static_assert(std::is_trivially_copyable::value, + "Needed for OpenMP offload"); // Define OMP reductions on Indices. 
-#pragma omp declare reduction(min_idxs : Indices : \ - omp_out = omp_out.min_elements(omp_in) ) \ - initializer (omp_priv = omp_orig) -#pragma omp declare reduction(max_idxs : Indices : \ - omp_out = omp_out.max_elements(omp_in) ) \ - initializer (omp_priv = omp_orig) + #pragma omp declare reduction(min_idxs : Indices : \ + omp_out = omp_out.min_elements(omp_in) ) \ + initializer (omp_priv = omp_orig) + #pragma omp declare reduction(max_idxs : Indices : \ + omp_out = omp_out.max_elements(omp_in) ) \ + initializer (omp_priv = omp_orig) // Layout base class. // This class hierarchy is NOT virtual. @@ -369,24 +567,28 @@ namespace yask { _sizes(idx_t(0), nsizes) { } // Access sizes. - inline const Indices& get_sizes() const { return _sizes; } - void set_sizes(const Indices& sizes) { _sizes = sizes; } - inline idx_t get_size(int i) const { - assert(i >= 0); - assert(i < _sizes._get_num_dims()); + ALWAYS_INLINE const Indices& get_sizes() const { return _sizes; } + void set_sizes(const Indices& sizes) { + _sizes = sizes; + } + ALWAYS_INLINE idx_t get_size(int i) const { + host_assert(i >= 0); + host_assert(i < _sizes.get_num_dims()); return _sizes[i]; } void set_size(int i, idx_t size) { - assert(i >= 0); - assert(i < _sizes._get_num_dims()); + host_assert(i >= 0); + host_assert(i < _sizes.get_num_dims()); _sizes[i] = size; } // Product of valid sizes. - inline idx_t get_num_elements() const { + ALWAYS_INLINE idx_t get_num_elements() const { return _sizes.product(); } }; + static_assert(std::is_trivially_copyable::value, + "Needed for OpenMP offload"); // 0-D <-> 1-D layout class. // (Trivial layout.) @@ -394,58 +596,56 @@ namespace yask { public: Layout_0d() : Layout(0) { } Layout_0d(const Indices& sizes) : Layout(0, sizes) { } - inline int get_num_sizes() const { + static constexpr int get_num_sizes() { return 0; } // Return 1-D offset from 0-D 'j' indices. 
- inline idx_t layout(const Indices& j) const { + ALWAYS_INLINE idx_t layout(const Indices& j) const { return 0; } // Return 0 indices based on 1-D 'ai' input. - inline Indices unlayout(idx_t ai) const { + ALWAYS_INLINE Indices unlayout(idx_t ai) const { Indices j(idx_t(0), 0); return j; } }; + static_assert(std::is_trivially_copyable::value, + "Needed for OpenMP offload"); // Auto-generated layout algorithms for >0 dims. -#include "yask_layouts.hpp" + #include "yask_layouts.hpp" // Forward defns. struct Dims; - // A group of Indices needed for generated loops. + // A collection of Indices needed for generated loops. // See the help message from gen_loops.pl for the // documentation of the indices. // This class is NOT virtual. struct ScanIndices { - int ndims = 0; + int ndims = 0; // All indices will be in stencil dims. // Input values; not modified. - Indices begin, end; // first and end (beyond last) range of each index. - Indices stride; // distance between indices within [begin .. end). - Indices align; // alignment of indices after first one. - Indices align_ofs; // adjustment for alignment (see below). - Indices group_size; // proximity grouping within range. + Indices begin, end; // First and end (one beyond last) index (range). + Indices stride; // Distance between successive indices within begin/end range. + Indices align; // Alignment of beginning indices whenever possible (see notes below). + Indices align_ofs; // Adjustment for alignment (see below). + Indices tile_size; // Tiling within begin/end range if enabled during loop generation. - // Alignment: + // Notes: // First 'start' index is always at 'begin'. - // Subsequent indices are at 'begin' + 'stride', 'begin' + 2*'stride', etc. if 'align'==1. - // If 'align'>1, subsequent indices will be aligned such that - // (('start' - 'align_ofs') % 'align') == 0. + // If 'align'==1, subsequent 'start' indices are at 'begin' + 'index'*'stride'. 
+ // If 'align'>1, subsequent 'start' indices will be aligned such that + // ('start' <= 'begin' + 'index'*'stride') && + // (('start' - 'align_ofs') % 'align') == 0. // Last 'start' index is always < 'end'. - // Last 'stop' index always == 'end'. - - // Output values; set once for entire range. - Indices num_indices; // number of indices in each dim. - idx_t linear_indices = 0; // total indices over all dims (product of num_indices). + // Last 'stop' index is always at 'end'. // Output values; set for each index by loop code. Indices start, stop; // first and last+1 for this sub-range. Indices index; // 0-based unique index for each sub-range in each dim. - idx_t linear_index = 0; // 0-based index over all dims. // Example w/3 sub-ranges in overall range: // begin end @@ -454,31 +654,69 @@ namespace yask { // start stop (index = 0) // start stop (index = 1) // start stop (index = 2) - + + // Example 2: begin=5, end=65, align=10, align_ofs=0, stride=20 => + // the following 4 sets of iteration vars: + // 1. index=0, start=5, stop=20 (peel for alignment: adj_begin=0) + // 2. index=1, start=20, stop=40 + // 3. index=2, start=40, stop=60 + // 4. index=3, start=60, stop=65 (rem) + // The calculation of these vars is done so that the 4 iterations + // can be done concurrently, i.e., everything is based on the + // current index, and there are no dependencies between iterations + // or their placement on threads, etc. + // Ctor. - ScanIndices(const Dims& dims, bool use_vec_align); - ScanIndices(const Dims& dims, bool use_vec_align, Indices* ofs) : - ScanIndices(dims, use_vec_align) { + ScanIndices(bool use_vec_align) : + ndims(NUM_STENCIL_DIMS), + begin(idx_t(0), ndims), + end(idx_t(0), ndims), + stride(idx_t(1), ndims), + align(idx_t(1), ndims), + align_ofs(idx_t(0), ndims), + tile_size(idx_t(1), ndims), + start(idx_t(0), ndims), + stop(idx_t(0), ndims), + index(idx_t(0), ndims) { + + // Set alignment to vector lengths. 
+ // i: index for stencil dims, j: index for domain dims. + if (use_vec_align) + DOMAIN_VAR_LOOP_FAST(i, j) + align[i] = fold_pts[j]; + } + ScanIndices(bool use_vec_align, Indices* ofs) : + ScanIndices(use_vec_align) { if (ofs) { - DOMAIN_VAR_LOOP(i, j) { - assert(ofs->_get_num_dims() == ndims - 1); + host_assert(ofs->get_num_dims() == ndims - 1); + DOMAIN_VAR_LOOP_FAST(i, j) align_ofs[i] = (*ofs)[j]; - } } } - ScanIndices(const Dims& dims, bool use_vec_align, IdxTuple* ofs) : - ScanIndices(dims, use_vec_align) { + ScanIndices(bool use_vec_align, IdxTuple* ofs) : + ScanIndices(use_vec_align) { if (ofs) { - DOMAIN_VAR_LOOP(i, j) { - assert(ofs->_get_num_dims() == ndims - 1); + host_assert(ofs->get_num_dims() == ndims - 1); + DOMAIN_VAR_LOOP_FAST(i, j) align_ofs[i] = ofs->get_val(j); - } } } // Default bit-wise copy should be okay. - // Init from outer-loop indices. + // Get ranges. + inline idx_t get_overall_range(int dim_posn) const { + host_assert(dim_posn >= 0); + host_assert(dim_posn < ndims); + return end[dim_posn] - begin[dim_posn]; + } + inline idx_t get_current_range(int dim_posn) const { + host_assert(dim_posn >= 0); + host_assert(dim_posn < ndims); + return stop[dim_posn] - start[dim_posn]; + } + + // Create inner-loop indices from outer-loop indices. // Start..stop from point in outer loop become begin..end // for this loop. // @@ -488,20 +726,58 @@ namespace yask { // |------------------|------------------|------| // start | stop // V - // begin (this) end + // begin (inner) end // |------------------| - // start stop (may be sub-dividied later) - void init_from_outer(const ScanIndices& outer) { + // start stop + // NB: inner loops are initialized with one iteration, + // but typically they are sub-divided later. + inline ScanIndices create_inner() const { + ScanIndices inner(*this); // Begin & end set from start & stop of outer loop. - begin = start = outer.start; - end = stop = outer.stop; - - // Pass some values through. 
- align = outer.align; - align_ofs = outer.align_ofs; - - // Leave others alone. + inner.begin = inner.start = start; + inner.end = inner.stop = stop; + + // Pass alignment unchanged. + inner.align = align; + inner.align_ofs = align_ofs; + + // Init tile size & stride to whole range. + DOMAIN_VAR_LOOP_FAST(i, j) + inner.tile_size[i] = inner.stride[i] = get_overall_range(i); + + // Init first indices. + inner.index.set_from_const(0); + return inner; + } + + // Adjust upper limits of strides and tiles based on settings. This + // is used to ensure only one inner range and/or tile is configured + // if originally requested even if the current area has been + // increased slightly. For example, if the original micro-block-size + // and nano-block-size are 32, then the micro-block-size is increased + // to 34 for temporal tiling, this function will set the micro-block + // stride to 34 per the *intention* of the original setting to have + // only one nano-block. Similar for tiles. + inline void + adjust_from_settings(const IdxTuple& orig_sizes_of_this, + const IdxTuple& orig_tile_sizes_of_this, + const IdxTuple& orig_sizes_of_inner) { + + assert(ndims == orig_sizes_of_this.get_num_dims()); + assert(ndims == orig_tile_sizes_of_this.get_num_dims()); + assert(ndims == orig_sizes_of_inner.get_num_dims()); + DOMAIN_VAR_LOOP(i, j) { + + // If original [or auto-tuned] inner area covers + // this entire area, set stride size to full width. + if (orig_sizes_of_inner[i] >= orig_sizes_of_this[i]) + stride[i] = get_overall_range(i); + + // Similar for tiles. 
+ if (orig_tile_sizes_of_this[i] >= orig_sizes_of_this[i]) + tile_size[i] = get_overall_range(i); + } } }; diff --git a/src/kernel/lib/new_var.cpp b/src/kernel/lib/new_var.cpp index 501fdfa3..c8bf9f0c 100644 --- a/src/kernel/lib/new_var.cpp +++ b/src/kernel/lib/new_var.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -98,8 +98,12 @@ namespace yask { } // Scalar? - if (ndims == 0) - gp = make_shared>(*this, name, gdims); + if (ndims == 0) { + //gp = make_shared>(*this, name, gdims); + typedef YkElemVar scalar_t; + gp = allocate_shared(yask_allocator(), + *this, name, gdims); + } // Include auto-gen code for all other cases. #include "yask_var_code.hpp" @@ -142,8 +146,8 @@ namespace yask { // Pads. // Set via both 'extra' and 'min'; larger result will be used. if (domain_dims.lookup(gdim)) { - ygp->set_extra_pad_size(i, opts->_extra_pad_sizes[gdim]); - ygp->set_min_pad_size(i, opts->_min_pad_sizes[gdim]); + ygp->update_extra_pad_size(i, actl_opts->_extra_pad_sizes[gdim]); + ygp->update_min_pad_size(i, actl_opts->_min_pad_sizes[gdim]); } // Offsets. 
diff --git a/src/kernel/lib/offload.hpp b/src/kernel/lib/offload.hpp new file mode 100644 index 00000000..a889af69 --- /dev/null +++ b/src/kernel/lib/offload.hpp @@ -0,0 +1,532 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kit +Copyright (c) 2014-2022, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. + +*****************************************************************************/ + +// This file contains declarations and code needed for OpenMP offload. +// Various versions of code is provided depending on whether offloading and, +// if offloading, whether USM is used. +// Also see OMP_* macros in yask.hpp. + +#pragma once + +namespace yask { + + // Definitions to use when offloading WITH unified shared memory. + // Not offloading: 0 + // Offloading w/USM: 1 + // Offloading w/o USM: 0 + #ifdef USE_OFFLOAD_USM + + // Allocate space for 'num' 'T' objects on host. 
+ inline void* offload_alloc_host(size_t nbytes) { + auto devn = KernelEnv::_omp_devn; + + #ifdef INTEL_OMP + TRACE_MSG("allocating " << make_byte_str(nbytes) << " shared, specifying OMP dev " << devn); + void* p = omp_target_alloc_shared(nbytes, devn); + #else + TRACE_MSG("allocating " << make_byte_str(nbytes) << " on host"); + void* p = yask_aligned_alloc(nbytes, devn); + #endif + if (!p) + THROW_YASK_EXCEPTION("error: cannot allocate " + make_byte_str(nbytes) + " on host"); + return p; + } + + // Free memory allocated with offload_alloc_host(). + inline void offload_free_host(void* p) { + auto devn = KernelEnv::_omp_devn; + #ifdef INTEL_OMP + omp_target_free(p, devn); // frees after omp_target_alloc_*(). + #else + free(p); + #endif + } + #endif + + // Definitions to use when offloading but NOT using unified shared memory. + // Not offloading: 0 + // Offloading w/USM: 0 + // Offloading w/o USM: 1 + #ifdef USE_OFFLOAD_NO_USM + + // Allocate space for 'num' 'T' objects on host. + inline void* offload_alloc_host(size_t nbytes) { + auto devn = KernelEnv::_omp_devn; + + #ifdef INTEL_OMP + TRACE_MSG("allocating " << make_byte_str(nbytes) << " on host, specifying OMP dev " << devn); + void* p = omp_target_alloc_host(nbytes, devn); + #else + TRACE_MSG("allocating " << make_byte_str(nbytes) << " on host"); + void* p = yask_aligned_alloc(nbytes, devn); + #endif + if (!p) + THROW_YASK_EXCEPTION("error: cannot allocate " + make_byte_str(nbytes) + " on host"); + return p; + } + + // Free memory allocated with offload_alloc_host(). + inline void offload_free_host(void* p) { + auto devn = KernelEnv::_omp_devn; + #ifdef INTEL_OMP + omp_target_free(p, devn); // frees after omp_target_alloc_*(). + #else + free(p); + #endif + } + + // Get device addr from any mapped host addr 'hostp'. + // If 'must_be_mapped' is true, assertion will fail if not mapped. + // If 'enable_trace' is true, host and device addrs will be printed when + // tracing. 
Make sure 'enable_trace' is false if called from 'TRACE_MSG' + // to avoid deadlock. + template + T* get_dev_ptr(const T* hostp, + bool must_be_mapped = true, + bool enable_trace = true) { + if (!hostp) + return NULL; + auto devn = KernelEnv::_omp_devn; + bool is_present = omp_target_is_present((void*)hostp, devn); + if (!is_present && !must_be_mapped) + return NULL; + assert(is_present); + + void* mp = omp_get_mapped_ptr(hostp, devn); + T* dp = (T*)mp; + + if (enable_trace) { + TRACE_MSG("host addr == " << (void*)hostp << + "; dev addr == " << (void*)dp); + } + return dp; + } + + // Allocate space for 'num' 'T' objects on offload device. + // Map 'hostp' to allocated mem. + // Return device ptr. + template + void* offload_map_alloc(T* hostp, size_t num) { + if (KernelEnv::_use_offload) { + assert(hostp); + auto nb = sizeof(T) * num; + auto devn = KernelEnv::_omp_devn; + + TRACE_MSG("allocating " << make_byte_str(nb) << " on OMP dev " << devn); + #ifdef INTEL_OMP + void* devp = omp_target_alloc_device(nb, devn); + #else + void* devp = omp_target_alloc(nb, devn); + #endif + if (!devp) + THROW_YASK_EXCEPTION("error: cannot allocate " + make_byte_str(nb) + " on OMP device"); + + TRACE_MSG("mapping " << (void*)hostp << " to " << devp << " on OMP dev " << devn); + auto res = omp_target_associate_ptr(hostp, devp, nb, 0, devn); + if (res) + THROW_YASK_EXCEPTION("error: cannot map OMP device ptr"); + assert(omp_target_is_present(hostp, devn)); + assert(get_dev_ptr(hostp) == devp); + + TRACE_MSG("done allocating and mapping"); + return devp; + } + return hostp; + } + + // Unmap 'hostp' from 'devp' on offload device. + // Free space for 'num' 'T' objects on offload device. + // Not checking KernelEnv::_use_offload because this is often + // called from destructor which may be after _use_offload has + // been changed since offload_map_alloc() was called. 
+ template + void _offload_map_free(void* devp, T* hostp, size_t num) { + if (!devp || !hostp) + return; + auto nb = sizeof(T) * num; + auto devn = KernelEnv::_omp_devn; + if (devp == (void*)hostp) + TRACE_MSG("not unmapping " << devp << " because host and OMP dev ptr are same"); + else { + TRACE_MSG("unmapping " << (void*)hostp << " from " << devp << " on OMP dev " << devn); + assert(omp_target_is_present(hostp, devn)); + assert(get_dev_ptr(hostp) == devp); + auto res = omp_target_disassociate_ptr(hostp, devn); + if (res) + THROW_YASK_EXCEPTION("error: cannot unmap OMP device ptr"); + TRACE_MSG("freeing " << make_byte_str(nb) << " on OMP dev " << devn); + omp_target_free(devp, devn); + TRACE_MSG("done unmapping and freeing"); + } + } + + // Unmap 'hostp' on offload device. + // Automatically looks up dev ptr. + // Free space for 'num' 'T' objects on offload device. + template + void offload_map_free(T* hostp, size_t num) { + void* devp = get_dev_ptr(hostp, false); + _offload_map_free(devp, hostp, num); + } + + // Copy data to device. + template + void _offload_copy_to_device(void* devp, T* hostp, size_t num) { + if (KernelEnv::_use_offload) { + assert(hostp); + assert(devp); + auto nb = sizeof(T) * num; + auto devn = KernelEnv::_omp_devn; + TRACE_MSG("copying " << make_byte_str(nb) << " to OMP dev " << devn); + assert(omp_target_is_present(hostp, devn)); + auto res = omp_target_memcpy(devp, hostp, // dst, src. + nb, 0, 0, + devn, KernelEnv::_omp_hostn); + TRACE_MSG("done copying to OMP dev"); + } + } + template + void offload_copy_to_device(T* hostp, size_t num) { + if (KernelEnv::_use_offload) { + void* devp = get_dev_ptr(hostp); + _offload_copy_to_device(devp, hostp, num); + } + } + + // Copy data from device. 
+ template + void _offload_copy_from_device(void* devp, T* hostp, size_t num) { + if (KernelEnv::_use_offload) { + assert(hostp); + assert(devp); + auto nb = sizeof(T) * num; + auto devn = KernelEnv::_omp_devn; + TRACE_MSG("copying " << make_byte_str(nb) << " from OMP dev " << devn); + assert(omp_target_is_present(hostp, devn)); + auto res = omp_target_memcpy(hostp, devp, // dst, src. + nb, 0, 0, + KernelEnv::_omp_hostn, devn); + TRACE_MSG("done copying from OMP dev"); + } + } + template + void offload_copy_from_device(T* hostp, size_t num) { + if (KernelEnv::_use_offload) { + void* devp = get_dev_ptr(hostp); + _offload_copy_from_device(devp, hostp, num); + } + } + #endif + + // Definitions to use when offloading with unified shared memory OR not offloading. + // Not offloading: 1 + // Offloading w/USM: 1 + // Offloading w/o USM: 0 + #ifndef USE_OFFLOAD_NO_USM + template + T* get_dev_ptr(T* hostp, + bool must_be_mapped = true, + bool enable_trace = true) { return hostp; } + template + void* offload_map_alloc(T* hostp, size_t num) { return hostp; } + template + void _offload_map_free(void* devp, T* hostp, size_t num) { } + template + void offload_map_free(T* hostp, size_t num) { } + template + void _offload_copy_to_device(void* devp, T* hostp, size_t num) { } + template + void offload_copy_to_device(T* hostp, size_t num) { } + template + void _offload_copy_from_device(void* devp, T* hostp, size_t num) { } + template + void offload_copy_from_device(T* hostp, size_t num) { } + #endif + + // Definitions to use when not offloading. + // Not offloading: 1 + // Offloading w/USM: 0 + // Offloading w/o USM: 0 + #ifndef USE_OFFLOAD + + inline void* offload_alloc_host(size_t nbytes) { + return malloc(nbytes); + } + inline void offload_free_host(void* p) { + if (p) + free(p); + } + + #endif + + // Non-typed versions. 
+ inline void* get_dev_ptr(const void* hostp, + bool must_be_mapped = true, + bool enable_trace = true) { + return (void*)get_dev_ptr((char*) hostp, must_be_mapped, enable_trace); + } + inline void _offload_copy_to_device(void* devp, void* hostp, size_t nbytes) { + _offload_copy_to_device(devp, (char*)hostp, nbytes); + } + inline void offload_copy_to_device(void* hostp, size_t nbytes) { + offload_copy_to_device((char*)hostp, nbytes); + } + inline void _offload_copy_from_device(void* devp, void* hostp, size_t nbytes) { + _offload_copy_from_device(devp, (char*)hostp, nbytes); + } + inline void offload_copy_from_device(void* hostp, size_t nbytes) { + offload_copy_from_device((char*)hostp, nbytes); + } + inline void* offload_map_alloc(void* hostp, size_t nbytes) { + return offload_map_alloc((char*)hostp, nbytes); + } + + // Type to track and sync pointers on target device. + // A synced pointer has these characteristics: + // - Pointer exists on host & dev. + // - Object containing pointer is mapped (associated) on dev. + // - On host copy of ptr: + // - Addr pointed to (value of '_p') is mapped on dev (value of '_dp'). + // - On dev copy of ptr: + // - Addr pointed to is mapped dev addr. + template + class synced_ptr { + private: + T* _p = 0; // ptr to data; used on host and device. + + // Additional data when offloading without unified addresses. + #ifdef USE_OFFLOAD_NO_USM + T* _dp = 0; // val of ptr on device. + #endif + + protected: + // Sync this pointer. + // To properly sync pointer on device, '*this' and '*_p' must + // already be mapped to device mem. + void _sync() { + #ifdef USE_OFFLOAD_NO_USM + if (KernelEnv::_use_offload) { + auto devn = KernelEnv::_omp_devn; + TRACE_MSG("omp: sync'ing ptr to " << _p << " on host..."); + + // Value of ptr on dev. + _dp = yask::get_dev_ptr(_p); + + // Addr of ptr on host & dev. + T** pp = &_p; + T** dpp = yask::get_dev_ptr(pp); + + // Set pointer on device to val of ptr on dev. 
+ _offload_copy_to_device(dpp, &_dp, 1); + + TRACE_MSG("omp: sync'd ptr to " << _p << " on host at " << (void*)pp << + " -> " << _dp << " on device " << devn << " at " << (void*)dpp << + ((dpp == 0) ? " *******" : "")); + } + else + _dp = _p; + #endif + } + + public: + synced_ptr(T* p) : _p(p) { } + synced_ptr() : synced_ptr(0) { } + + // Accessors. + T* get() { return _p; } + const T* get() const { return _p; } + operator T*() { return _p; } + operator const T*() { return _p; } + T* operator->() { return _p; } + const T* operator->() const { return _p; } + T& operator*() { return *_p; } + const T& operator*() const { return *_p; } + T& operator[](size_t i) { return _p[i]; } + const T& operator[](size_t i) const { return _p[i]; } + T& operator[](long i) { return _p[i]; } + const T& operator[](long i) const { return _p[i]; } + T& operator[](int i) { return _p[i]; } + const T& operator[](int i) const { return _p[i]; } + + // Get pointer on device or NULL if not yet resolved. + #ifdef USE_OFFLOAD_NO_USM + const T* get_dev_ptr() const { return _dp; } + + // Get pointer on host if not offloading or with USM. + #else + const T* get_dev_ptr() const { return _p; } + #endif + + // Set pointer value. + void operator=(T* p) { + _p = p; + #ifdef USE_OFFLOAD_NO_USM + _dp = 0; + #endif + } + + // Sync pointer on device. + inline void sync() { + _sync(); + } + + // Set to given value and sync. + inline void set_and_sync(T* p) { + operator=(p); + _sync(); + } + + }; + + // Host/device coherency state machine. + class Coherency { + public: + + // Coherency states. + enum coh_state { host_mod, // Host copy modified; device copy out-of-sync. + dev_mod, // Device copy modified; host copy out-of-sync. + in_sync, // Host and device have same data. + num_states }; + + // Current state. + inline coh_state get_state() const { + return _state; + } + + #ifdef USE_OFFLOAD_NO_USM + + // Set state directly (not recommended). 
+ void _force_state(coh_state state) { + TRACE_MSG("coherency state forced to " << state); + _state = state; + } + + protected: + coh_state _state = host_mod; + + public: + + // Boolean queries. + inline bool need_to_update_host() const { + return _state == dev_mod; + } + inline bool need_to_update_dev() const { + return _state == host_mod; + } + + // State-transition events. + // Functions return new state. + + // Call when host copy is modified, but dev copy is not. + coh_state mod_host() { + if (_state == dev_mod) + THROW_YASK_EXCEPTION("internal error: " + "host copy modified, but device copy was newer"); + _state = host_mod; + return _state; + } + + // Call when dev copy is modified, but host copy is not. + coh_state mod_dev() { + if (_state == host_mod) + THROW_YASK_EXCEPTION("internal error: " + "device copy modified, but host copy was newer"); + _state = dev_mod; + return _state; + } + + // Call when both dev and host copies are modified w/the same changes. + coh_state mod_both() { + if (_state == dev_mod) + THROW_YASK_EXCEPTION("internal error: " + "host copy modified, but device copy was newer"); + if (_state == host_mod) + THROW_YASK_EXCEPTION("internal error: " + "device copy modified, but host copy was newer"); + assert(_state == in_sync); + return _state; + } + + // Call when host data is copied to dev. + coh_state host_copied_to_dev() { + if (_state == dev_mod) + THROW_YASK_EXCEPTION("internal error: " + "host data copied to dev, but device copy was newer"); + _state = in_sync; + return _state; + } + + // Call when dev data is copied to host. + coh_state dev_copied_to_host() { + if (_state == host_mod) + THROW_YASK_EXCEPTION("internal error: " + "device data copied to host, but host copy was newer"); + _state = in_sync; + return _state; + } + + #else + // Stubs for no offload or USM. + + // Set state directly (not recommended). 
+ void _force_state(coh_state state) { + TRACE_MSG("attempt to force coherency state to " << state << + " ignored because of offload build state"); + } + + protected: + coh_state _state = in_sync; + + public: + + // Boolean queries. + inline bool need_to_update_host() const { + return false; + } + inline bool need_to_update_dev() const { + return false; + } + + // State-transition event stubs. + coh_state mod_host() { + return _state; + } + coh_state mod_dev() { + return _state; + } + coh_state mod_both() { + return _state; + } + coh_state host_copied_to_dev() { + return _state; + } + coh_state dev_copied_to_host() { + return _state; + } + + #endif + }; + +} // namespace yask. diff --git a/src/kernel/lib/realv.hpp b/src/kernel/lib/realv.hpp index 7bba0582..4e7d2a4f 100644 --- a/src/kernel/lib/realv.hpp +++ b/src/kernel/lib/realv.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -24,6 +24,7 @@ IN THE SOFTWARE. *****************************************************************************/ // This file defines a union to use for optionally-folded vectors of floats or doubles. +// It uses a macro scheme to generate intrinsic calls for various SIMD lengths and precisions. #pragma once @@ -116,31 +117,38 @@ namespace yask { // Emulate instrinsics for unsupported VLEN. // Only 256 and 512-bit vectors supported. // VLEN == 1 also supported as scalar. + #ifndef NO_INTRINSICS #if VLEN == 1 #define NO_INTRINSICS // note: no warning here because intrinsics aren't wanted in this case. 
#elif !defined(INAME) - #warning "Emulating intrinsics because HW vector length not defined; check setting of USE_INTRIN256, USE_INTRIN512LO or USE_INTRIN512 in kernel Makefile" + #warning "Emulating intrinsics because HW vector length not defined; set NO_INTRINSICS to avoid this warning" #define NO_INTRINSICS #elif VLEN != VEC_ELEMS - #warning "Emulating intrinsics because VLEN != HW vector length" + #warning "Emulating intrinsics because VLEN != HW vector length; set NO_INTRINSICS to avoid this warning" #define NO_INTRINSICS #endif + #endif // Macro for looping through an aligned real_vec_t. - #if defined(CHECK) || (VLEN==1) + #if VLEN==1 + #define REAL_VEC_LOOP(i) \ + constexpr int i=0; + #define REAL_VEC_LOOP_UNALIGNED(i) \ + constexpr int i=0; + #elif defined(CHECK) #define REAL_VEC_LOOP(i) \ for (int i=0; i::value); + static_assert(std::is_aggregate::value); - - // Type for a vector block. + // Type for a SIMD vector w/operator overloading. struct real_vec_t { // union of data types. @@ -210,11 +219,6 @@ namespace yask { operator=(val); } - // get length. - ALWAYS_INLINE int get_num_elems() const { - return VLEN; - } - // copy whole vector. ALWAYS_INLINE real_vec_t& operator=(const real_vec_t& rhs) { #ifdef NO_INTRINSICS @@ -263,16 +267,20 @@ namespace yask { operator=(real_t(val)); } + // get length. + ALWAYS_INLINE int get_num_elems() const { + return VLEN; + } // access a real_t linearly. ALWAYS_INLINE real_t& operator[](idx_t l) { - assert(l >= 0); - assert(l < VLEN); + host_assert(l >= 0); + host_assert(l < VLEN); return u.r[l]; } ALWAYS_INLINE const real_t& operator[](idx_t l) const { - assert(l >= 0); - assert(l < VLEN); + host_assert(l >= 0); + host_assert(l < VLEN); return u.r[l]; } @@ -288,7 +296,7 @@ namespace yask { return res; } - // unary plus. + // unary plus (no-op). 
ALWAYS_INLINE real_vec_t operator+() const { return *this; } @@ -405,7 +413,7 @@ namespace yask { // masked copy: copy only the selected elements of 'from' // into 'this', keeping the existing ones. ALWAYS_INLINE void copy_from_masked(const real_vec_t& from, - uidx_t k1) { + uidx_t k1) { #if defined(NO_INTRINSICS) || !defined(USE_AVX512) REAL_VEC_LOOP(i) if ((k1 >> i) & 1) u.r[i] = from[i]; #else @@ -422,7 +430,7 @@ namespace yask { #endif } ALWAYS_INLINE void load_from_masked(const real_vec_t* __restrict from, - uidx_t k1) { + uidx_t k1) { #if defined(NO_INTRINSICS) || defined(NO_LOAD_INTRINSICS) || !defined(USE_AVX512) REAL_VEC_LOOP(i) if ((k1 >> i) & 1) u.r[i] = (*from)[i]; #else @@ -457,8 +465,6 @@ namespace yask { REAL_VEC_LOOP(i) (*to)[i] = u.r[i]; #elif !defined(USE_STREAMING_STORE) INAME(store)((imem_t*)to, u.mr); - #elif defined(ARCH_KNC) - INAME(storenrngo)((imem_t*)to, u.mr); #else INAME(stream)((imem_t*)to, u.mr); #endif @@ -705,8 +711,8 @@ namespace yask { std::cout << " b: "; b.print_reals(std::cout); #endif - assert(count >= 0); - assert(count <= VLEN); + host_assert(count >= 0); + host_assert(count <= VLEN); if (count == 0) res.u = b.u; else if (count == VLEN) @@ -759,10 +765,6 @@ namespace yask { real_vec_t* p = (real_vec_t*)(&r2[count]); // not usually aligned. res.u.mr = INAME(loadu)((imem_t const*)p); - // For DP on KNC, use 32-bit op w/2x count. - #elif REAL_BYTES == 8 && defined(ARCH_KNC) && defined(USE_INTRIN512) - res.u.mi = _mm512_alignr_epi32(a.u.mi, b.u.mi, count*2); - // Everything else. #else res.u.mi = INAMEI(alignr)(a.u.mi, b.u.mi, count); @@ -825,7 +827,7 @@ namespace yask { #if defined(NO_INTRINSICS) || !defined(USE_AVX512) // must make a temp copy in case &res == &a. 
real_vec_t tmp = a; - for (int i = 0; i < VLEN; i++) + REAL_VEC_LOOP_UNALIGNED(i) res.u.r[i] = tmp.u.r[ctrl.u.ci[i]]; #else res.u.mi = INAMEI(permutexvar)(ctrl.u.mi, a.u.mi); @@ -855,7 +857,7 @@ namespace yask { #if defined(NO_INTRINSICS) || !defined(USE_AVX512) // must make a temp copy in case &res == &a. real_vec_t tmp = a; - for (int i = 0; i < VLEN; i++) { + REAL_VEC_LOOP_UNALIGNED(i) { if ((k1 >> i) & 1) res.u.r[i] = tmp.u.r[ctrl.u.ci[i]]; } @@ -887,19 +889,12 @@ namespace yask { #if defined(NO_INTRINSICS) || !defined(USE_AVX512) // must make temp copies in case &res == &a or &b. real_vec_t tmpa = a, tmpb = b; - for (int i = 0; i < VLEN; i++) { + REAL_VEC_LOOP_UNALIGNED(i) { int sel = ctrl.u.ci[i] & ctrl_sel_bit; // 0 => a, 1 => b. int idx = ctrl.u.ci[i] & ctrl_idx_mask; // index. res.u.r[i] = sel ? tmpb.u.r[idx] : tmpa.u.r[idx]; } - #elif defined(ARCH_KNC) - yask_exception e; - std::stringstream err; - err << "error: 2-input permute not supported on KNC" << std::endl; - e.add_message(err.str()); - throw e; - #else res.u.mi = INAMEI(permutex2var)(a.u.mi, ctrl.u.mi, b.u.mi); #endif @@ -946,11 +941,11 @@ namespace yask { // Compare two real_vec_t's. 
inline bool within_tolerance(const real_vec_t& val, const real_vec_t& ref, - const real_vec_t& epsilon) { + const real_t epsilon) { if (val == ref) return true; for (int j = 0; j < VLEN; j++) { - if (!within_tolerance(val.u.r[j], ref.u.r[j], epsilon.u.r[j])) + if (!within_tolerance(val.u.r[j], ref.u.r[j], epsilon)) return false; } return true; diff --git a/src/kernel/lib/settings.cpp b/src/kernel/lib/settings.cpp index c7283231..3d7218ae 100644 --- a/src/kernel/lib/settings.cpp +++ b/src/kernel/lib/settings.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -29,11 +29,12 @@ using namespace std; namespace yask { // Check whether dim is of allowed type. + // Throw exception if not. void Dims::check_dim_type(const std::string& dim, - const std::string& fn_name, - bool step_ok, - bool domain_ok, - bool misc_ok) const { + const std::string& fn_name, + bool step_ok, + bool domain_ok, + bool misc_ok) const { if (step_ok && domain_ok && misc_ok) return; if (dim == _step_dim) { @@ -54,111 +55,61 @@ namespace yask { } } - // APIs. - // See yask_kernel_api.hpp. - yk_env_ptr yk_factory::new_env(MPI_Comm comm) const { - auto ep = make_shared(); - assert(ep); - ep->init_env(0, 0, comm); - return ep; - } - yk_env_ptr yk_factory::new_env() const { - return new_env(MPI_COMM_NULL); - } - - // KernelEnv global lock objects. + // Debug & trace. omp_lock_t KernelEnv::_debug_lock; bool KernelEnv::_debug_lock_init_done = false; - - // Init MPI, OMP. - void KernelEnv::init_env(int* argc, char*** argv, MPI_Comm existing_comm) - { - // MPI init. 
- my_rank = 0; - num_ranks = 1; - -#ifdef USE_MPI - int is_init = false; - MPI_Initialized(&is_init); - - // No MPI communicator provided. - if (existing_comm == MPI_COMM_NULL || - existing_comm == MPI_COMM_WORLD) { - if (!is_init) { - int provided = 0; - MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided); - if (provided < MPI_THREAD_SERIALIZED) { - THROW_YASK_EXCEPTION("error: MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE not provided"); - } - is_init = true; - } - comm = MPI_COMM_WORLD; - } - - // MPI communicator provided. - else { - if (!is_init) - THROW_YASK_EXCEPTION("error: YASK environment created with" - " an existing MPI communicator, but MPI is not initialized"); - comm = existing_comm; + yask_output_ptr KernelEnv::_debug; + bool KernelEnv::_trace = false; + + // OMP offload devices. + #ifdef USE_OFFLOAD + bool KernelEnv::_use_offload = true; + int KernelEnv::_omp_hostn = 0; + int KernelEnv::_omp_devn = 0; + #else + bool KernelEnv::_use_offload = false; + #endif + + // Debug APIs. + yask_output_ptr yk_env::get_debug_output() { + if (!KernelEnv::_debug.get()) { + yask_output_factory ofac; + auto so = ofac.new_stdout_output(); + set_debug_output(so); } - - // Get some info on this communicator. - MPI_Comm_rank(comm, &my_rank); - MPI_Comm_group(comm, &group); - MPI_Comm_size(comm, &num_ranks); - if (num_ranks < 1) - THROW_YASK_EXCEPTION("error: MPI_Comm_size() returns less than one rank"); - - // Create a shm communicator. - MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); - MPI_Comm_rank(shm_comm, &my_shm_rank); - MPI_Comm_group(shm_comm, &shm_group); - MPI_Comm_size(shm_comm, &num_shm_ranks); - -#else - comm = MPI_COMM_NULL; -#endif - - // Turn off denormals unless the USE_DENORMALS macro is set. -#ifndef USE_DENORMALS - // Enable FTZ - _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); - - //Enable DAZ - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); -#endif - - // Set env vars needed by OMP. 
- // TODO: make this visible to the user. - int ret = setenv("OMP_PLACES", "cores", 0); // default placement for outer loop. - assert(ret == 0); - ret = setenv("KMP_HOT_TEAMS_MODE", "1", 0); // more efficient nesting. - assert(ret == 0); - ret = setenv("KMP_HOT_TEAMS_MAX_LEVEL", "2", 0); // 2-level nesting. - - // Save initial value of OMP max threads. - // Side effect: causes OMP to dump debug info if env var set. - if (!max_threads) - max_threads = omp_get_max_threads(); + assert(KernelEnv::_debug.get()); + return KernelEnv::_debug; } - + void yk_env::set_debug_output(yask_output_ptr debug) { + KernelEnv::_debug = debug; + } + void yk_env::disable_debug_output() { + yask_output_factory yof; + KernelEnv::_debug = yof.new_null_output(); + } + void yk_env::set_trace_enabled(bool enable) { + KernelEnv::_trace = enable; + } + bool yk_env::is_trace_enabled() { + return KernelEnv::_trace; + } + // Apply a function to each neighbor rank. // Does NOT visit self. void MPIInfo::visit_neighbors(std::function visitor) { - - // TODO: convert to use visit_all_points(). - for (int i = 0; i < neighborhood_size; i++) { - auto neigh_offsets = neighborhood_sizes.unlayout(i); - int neigh_rank = my_neighbors.at(i); - assert(i == get_neighbor_index(neigh_offsets)); - - if (i != my_neighbor_index) - visitor(neigh_offsets, neigh_rank, i); - } + (const IdxTuple& neigh_offsets, // NeighborOffset vals. + int neigh_rank, // MPI rank. + int neigh_index)> visitor) { + + neighborhood_sizes.visit_all_points + ([&](const IdxTuple& neigh_offsets, idx_t i) { + int neigh_rank = my_neighbors.at(i); + assert(i == get_neighbor_index(neigh_offsets)); + + if (i != my_neighbor_index) + visitor(neigh_offsets, neigh_rank, i); + return true; // from lambda; + }); } // Set pointer to storage. @@ -200,10 +151,10 @@ namespace yask { // Apply a function to each neighbor rank. // Does NOT visit self or non-existent neighbors. 
void MPIData::visit_neighbors(std::function visitor) { + (const IdxTuple& neigh_offsets, // NeighborOffset. + int neigh_rank, + int neigh_index, + MPIBufs& bufs)> visitor) { _mpi_info->visit_neighbors ([&](const IdxTuple& neigh_offsets, int neigh_rank, int i) { @@ -221,20 +172,24 @@ namespace yask { return bufs[i].bufs[bd]; } + // Settings static vars. + const string KernelSettings::_mega_block_str = "Mb"; + const string KernelSettings::_block_str = "b"; + const string KernelSettings::_micro_block_str = "mb"; + const string KernelSettings::_nano_block_str = "nb"; + const string KernelSettings::_pico_block_str = "pb"; + // Settings ctor. KernelSettings::KernelSettings(DimsPtr dims, KernelEnvPtr env) : _dims(dims), max_threads(env->max_threads) { auto& step_dim = dims->_step_dim; // Target-dependent defaults. - int def_blk_size = 32; // TODO: calculate based on actual cache size and stencil. - num_block_threads = 2; + def_blk_size = 32; // TODO: calculate based on actual cache size and stencil. + num_inner_threads = 1; if (string(YASK_TARGET) == "knl") { def_blk_size = 64; // larger L2. - num_block_threads = 8; // 4 threads per core * 2 cores per tile. - } - else if (string(YASK_TARGET) == "knc") { - num_block_threads = 4; // 4 threads per core. + num_inner_threads = 8; // 4 threads per core * 2 cores per tile. } // Use both step and domain dims for all size tuples. @@ -243,28 +198,31 @@ namespace yask { _rank_sizes = dims->_stencil_dims; _rank_sizes.set_vals_same(0); // 0 => calc from global. - - _region_sizes = dims->_stencil_dims; - _region_sizes.set_vals_same(0); // 0 => rank size. - - _block_group_sizes = dims->_stencil_dims; - _block_group_sizes.set_vals_same(0); // 0 => min size. - + _rank_tile_sizes = dims->_stencil_dims; + _rank_tile_sizes.set_vals_same(0); // 0 => rank size. + + _mega_block_sizes = dims->_stencil_dims; + _mega_block_sizes.set_vals_same(0); // 0 => rank size. 
+ _mega_block_tile_sizes = dims->_stencil_dims; + _mega_block_tile_sizes.set_vals_same(0); // 0 => mega-block size. + _block_sizes = dims->_stencil_dims; - _block_sizes.set_vals_same(def_blk_size); - _block_sizes.set_val(step_dim, 0); // 0 => default. + _block_sizes.set_vals_same(0); // 0 => mega-block size. + _block_tile_sizes = dims->_stencil_dims; + _block_tile_sizes.set_vals_same(0); // 0 => block size. - _mini_block_group_sizes = dims->_stencil_dims; - _mini_block_group_sizes.set_vals_same(0); // 0 => min size. + _micro_block_sizes = dims->_stencil_dims; + _micro_block_sizes.set_vals_same(0); // 0 => block size. + _micro_block_tile_sizes = dims->_stencil_dims; + _micro_block_tile_sizes.set_vals_same(0); // 0 => micro-block size. - _mini_block_sizes = dims->_stencil_dims; - _mini_block_sizes.set_vals_same(0); // 0 => calc from block. + _nano_block_sizes = dims->_stencil_dims; + _nano_block_sizes.set_vals_same(0); // 0 => micro-block size. + _nano_block_tile_sizes = dims->_stencil_dims; + _nano_block_tile_sizes.set_vals_same(0); // 0 => nano-block size. - _sub_block_group_sizes = dims->_stencil_dims; - _sub_block_group_sizes.set_vals_same(0); // 0 => min size. - - _sub_block_sizes = dims->_stencil_dims; - _sub_block_sizes.set_vals_same(0); // 0 => calc from mini-block. + _pico_block_sizes = dims->_stencil_dims; + _pico_block_sizes.set_vals_same(0); // 0 => cluster size. _min_pad_sizes = dims->_stencil_dims; _min_pad_sizes.set_vals_same(0); @@ -278,6 +236,13 @@ namespace yask { _rank_indices = dims->_domain_dims; _rank_indices.set_vals_same(0); + + // Things to tune. + #ifdef USE_OFFLOAD + _tuner_targets.push_back(_pico_block_str); + #else + _tuner_targets.push_back(_block_str); + #endif } // Add options to set one domain var to a cmd-line parser. @@ -297,7 +262,7 @@ namespace yask { idx_t* dp = var.lookup(dname); // use lookup() to get non-const ptr. // Option for individual dim. 
- parser.add_option(new CommandLineParser::IdxOption + parser.add_option(make_shared (prefix + dname, descrip + " in '" + dname + "' dimension.", *dp)); @@ -313,7 +278,7 @@ namespace yask { auto shortcut = prefix; if (shortcut.back() == '_') shortcut.pop_back(); - parser.add_option(new CommandLineParser::MultiIdxOption + parser.add_option(make_shared (shortcut, "Shortcut for" + multi_help, multi_vars)); @@ -323,7 +288,7 @@ namespace yask { void KernelSettings::add_options(CommandLineParser& parser) { // Following options are in the 'yask' namespace, i.e., no object. - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("print_suffixes", "Format output with suffixes for human readibility, e.g., 6.15K, 12.3GiB, 7.45m." " If disabled, prints without suffixes for computer parsing, e.g., 6150, 1.23e+10, 7.45e-3.", @@ -332,275 +297,398 @@ namespace yask { // Following options are in 'this' object. _add_domain_option(parser, "g", "Global-domain (overall-problem) size", _global_sizes); _add_domain_option(parser, "l", "Local-domain (rank) size", _rank_sizes); - _add_domain_option(parser, "d", "Deprecated alias for local-domain size", _rank_sizes); - _add_domain_option(parser, "r", "Region size", _region_sizes, true); - _add_domain_option(parser, "b", "Block size", _block_sizes, true); - _add_domain_option(parser, "mb", "Mini-block size", _mini_block_sizes); - _add_domain_option(parser, "sb", "Sub-block size", _sub_block_sizes); -#ifdef SHOW_GROUPS - _add_domain_option(parser, "bg", "Block-group size", _block_group_sizes); - _add_domain_option(parser, "mbg", "Mini-block-group size", _mini_block_group_sizes); - _add_domain_option(parser, "sbg", "Sub-block-group size", _sub_block_group_sizes); -#endif - _add_domain_option(parser, "mp", "Minimum var-padding size (including halo)", _min_pad_sizes); - _add_domain_option(parser, "ep", "Extra var-padding size (beyond halo)", _extra_pad_sizes); - parser.add_option(new CommandLineParser::BoolOption + 
_add_domain_option(parser, _mega_block_str, "Mega-block size", _mega_block_sizes, true); + _add_domain_option(parser, _block_str, "Block size", _block_sizes, true); + _add_domain_option(parser, _micro_block_str, "Micro-block size", _micro_block_sizes); + _add_domain_option(parser, _nano_block_str, "Nano-block size", _nano_block_sizes); + _add_domain_option(parser, _pico_block_str, "Pico-block size", _pico_block_sizes); + _add_domain_option(parser, "d", "[Deprecated] Use local-domain size options", _rank_sizes); + #ifdef USE_TILING + _add_domain_option(parser, "l_tile", "[Advanced] Local-domain-tile size", _rank_tile_sizes); + _add_domain_option(parser, "Mb_tile", "[Advanced] Mega-Block-tile size", _mega_block_tile_sizes); + _add_domain_option(parser, "b_tile", "[Advanced] Block-tile size", _block_tile_sizes); + _add_domain_option(parser, "mb_tile", "[Advanced] Micro-block-tile size", _micro_block_tile_sizes); + _add_domain_option(parser, "nb_tile", "[Advanced] Nano-block-tile size", _nano_block_tile_sizes); + #endif + _add_domain_option(parser, "mp", "[Advanced] Minimum padding size (including halo)" + " applied to all YASK vars", _min_pad_sizes); + _add_domain_option(parser, "ep", "[Advanced] Extra padding size (beyond halo)" + " applied to all YASK vars", _extra_pad_sizes); + parser.add_option(make_shared ("allow_addl_padding", - "Allow automatic extension of padding beyond what is needed for" - " vector alignment for additional performance reasons", + "[Advanced] Allow automatic extension of padding for" + " additional performance on any or all YASK vars.", _allow_addl_pad)); -#ifdef USE_MPI + #ifdef USE_MPI _add_domain_option(parser, "nr", "Num ranks", _num_ranks); _add_domain_option(parser, "ri", "This rank's logical index (0-based)", _rank_indices); - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("overlap_comms", "Overlap MPI communication with calculation of interior elements whenever possible.", overlap_comms)); - 
parser.add_option(new CommandLineParser::BoolOption - ("use_shm", - "Use shared memory for MPI halo-exchange buffers between ranks on the same node when possible.", - use_shm)); - parser.add_option(new CommandLineParser::IdxOption + parser.add_option(make_shared ("min_exterior", - "Minimum width of MPI exterior section to compute before starting MPI communication.", + "[Advanced] Minimum width of exterior section to" + " compute before starting MPI communication. " + "Applicable only when overlap_comms is enabled.", _min_exterior)); -#endif - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared + ("exchange_halos", + "[Debug] Perform halo packs/unpacks/sends/receives. " + "Will not give correct results if disabled.", + do_halo_exchange)); + #ifdef USE_OFFLOAD + parser.add_option(make_shared + ("use_device_mpi", + "Enable device-to-device MPI transfers using device addresses. " + "Must be supported by MPI library and hardware.", + use_device_mpi)); + #else + parser.add_option(make_shared + ("use_shm", + "Directly use shared memory for halo-exchange buffers " + "between ranks on the same node when possible. " + "Otherwise, use the same non-blocking MPI send and receive calls " + "that are used between nodes.", + use_shm)); + parser.add_option(make_shared + ("force_scalar_exchange", + "[Debug] Do not allow vectorized halo exchanges.", + force_scalar_exchange)); + #endif + #endif + parser.add_option(make_shared ("force_scalar", - "Evaluate every var point with scalar stencil operations (for debug).", + "[Debug] Evaluate every var point with scalar stencil operations " + "and exchange halos using only scalar packing and unpacking.", force_scalar)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("max_threads", - "Max OpenMP threads to use. 
Overrides default number of OpenMP threads " - "or the value set by OMP_NUM_THREADS.", + "Maximum number of OpenMP CPU threads to use for both outer and inner threads. " + "If zero (0), the default value from the OpenMP library is used.", max_threads)); - parser.add_option(new CommandLineParser::IntOption - ("thread_divisor", - "Divide the number of OpenMP threads by the argument value. " - "If -max_threads is also used, divide the argument to that option by the " - "argument to this one. If -max_threads is not used, " - "divide the default number of OpenMP threads. " - "In either case, use the resulting truncated value as the " - "maximum number of OpenMP threads to use.", - thread_divisor)); - parser.add_option(new CommandLineParser::IntOption - ("block_threads", - "Number of threads to use in a nested OpenMP region for each block. " + parser.add_option(make_shared + ("outer_threads", + "Number of CPU threads to use in the outer OpenMP region. " + "Specifies how many blocks may be executed concurrently within each mega-block. " "Will be restricted to a value less than or equal to " "the maximum number of OpenMP threads specified by -max_threads " - "and/or -thread_divisor. " - "Each thread is used to execute stencils within a sub-block, and " - "sub-blocks are executed in parallel within mini-blocks.", - num_block_threads)); - parser.add_option(new CommandLineParser::BoolOption - ("bind_block_threads", - "Divide mini-blocks into sub-blocks of slabs along the first valid dimension " - "(usually the outer-domain dimension), ignoring other sub-block sizes. " - "Assign each slab to a block thread based on its global index in that dimension. " - "This setting may increase cache locality when using multiple " - "block-threads when scratch vars are used and/or " + "divided by the number specified by -inner_threads. 
" + "If zero (0), set to the value specified by -max_threads " + "divided by the number specified by -inner_threads.", + num_outer_threads)); + parser.add_option(make_shared + ("inner_threads", + "Number of CPU threads to use in each inner (nested) OpenMP region. " + "Specifies how many nano-blocks may be executed concurrently within each micro-block. " + "Will be restricted to a value less than or equal to " + "the maximum number of OpenMP threads specified by -max_threads. " + "If zero (0), set to one (1).", + num_inner_threads)); + parser.add_option(make_shared + ("block_threads", + "[Deprecated] Use 'inner_threads' option.", + num_inner_threads)); + #ifdef USE_OFFLOAD + parser.add_option(make_shared + ("device_thread_limit", + "Set the maximum number of OpenMP device threads used within a team.", + thread_limit)); + #endif + #ifndef USE_OFFLOAD + parser.add_option(make_shared + ("bind_inner_threads", + "[Advanced] Divide micro-blocks into nano-blocks of slabs along the first valid dimension " + "(usually the outer-domain dimension), ignoring other nano-block sizes. " + "Assign each slab to an inner thread based on its global index in that dimension. " + "This setting may increase cache locality when using more than one " + "inner thread, especially when scratch vars are used and/or " "when temporal blocking is active. " - "This option is ignored if there are fewer than two block threads.", - bind_block_threads)); -#ifdef USE_NUMA - stringstream msg; - msg << "Preferred NUMA node on which to allocate data for " - "vars and MPI buffers. 
Alternatively, use special values " << - yask_numa_local << " for explicit local-node allocation, " << - yask_numa_interleave << " for interleaving pages across all nodes, or " << - yask_numa_none << " for no NUMA policy."; - parser.add_option(new CommandLineParser::IntOption - ("numa_pref", msg.str(), + "This option is ignored if there are fewer than two inner threads.", + bind_inner_threads)); + #endif + parser.add_option(make_shared + ("bundle_allocs", + "[Advanced] Allocate memory for multiple YASK vars in " + "a single large chunk when possible. " + "If 'false', allocate each YASK var separately.", + _bundle_allocs)); + parser.add_option(make_shared + ("numa_pref", + string("[Advanced] Specify allocation policy for vars and MPI buffers. ") + + #ifdef USE_NUMA + " Use values >= 0 to specify the preferred NUMA node. " + " Use " + to_string(yask_numa_local) + " for local NUMA-node allocation. " + + " Use " + to_string(yask_numa_interleave) + " for interleaving pages across NUMA nodes. " + + #endif + #ifdef USE_OFFLOAD + " Use " + to_string(yask_numa_offload) + " for allocation optimized for offloading. " + + #endif + " Use " + to_string(yask_numa_none) + " for default allocator.", _numa_pref)); -#endif -#ifdef USE_PMEM - parser.add_option(new CommandLineParser::IntOption - ("numa_pref_max", - "Maximum GiB to allocate on preferred NUMA node before allocating on pmem device.", - _numa_pref_max)); -#endif - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("auto_tune", - "Adjust block sizes *during* normal operation to tune for performance. " - "May cause varying performance between steps.", + "Adjust specified block and tile sizes *during* normal operation " + "to tune for performance, i.e., 'online' or 'in-situ' tuning. " + "Successive steps will use different sizes until an optimal size is found. " + "Will likely cause varying performance between steps, " + "so this is not recommended for benchmarking. 
" + "However, this can be a useful tool for deployment of YASK stencils " + "onto unknown and/or varied systems where 'offline' tuning is not practical.", _do_auto_tune)); - parser.add_option(new CommandLineParser::DoubleOption - ("auto_tune_min_secs", - "Minimum seconds to run trial during auto-tuning for trial settings to be " - "considered better than the existing best.", - _tuner_min_secs)); - parser.add_option(new CommandLineParser::BoolOption - ("auto_tune_mini_blocks", - "Apply the auto-tuner to mini-block sizes instead of block sizes. " - "Particularly useful when using temporal block tiling.", - _tune_mini_blks)); - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared + ("auto_tune_trial_secs", + "[Advanced] Seconds to run new trial during auto-tuning " + "for new trial to be considered better than the existing best.", + _tuner_trial_secs)); + parser.add_option(make_shared + ("auto_tune_radius", + "[Advanced] Starting search radius for tuning block sizes. " + "A power of 2 is recommended.", + _tuner_radius)); + #ifdef ALLOW_STAGE_TUNERS + parser.add_option(make_shared ("auto_tune_each_stage", - "Apply the auto-tuner separately to each stage when " - "those stages are applied in separate passes across the entire var, " + "[Advanced] Apply the auto-tuner separately to each stage. " + "Will only be used if stages are applied in separate " + "passes across the entire grid, " "i.e., when no temporal tiling is used.", _allow_stage_tuners)); + #endif + + // Make set of allowed auto-tune targets. + set allowed_targets; + allowed_targets.insert(_mega_block_str); + allowed_targets.insert(_block_str); + allowed_targets.insert(_micro_block_str); + allowed_targets.insert(_nano_block_str); + allowed_targets.insert(_pico_block_str); + parser.add_option(make_shared + ("auto_tune_targets", + "[Advanced] Apply the auto-tuner to adjust the sizes of the listed targets. 
" + "Allowed targets are " + "'" + _mega_block_str + "' for mega-block sizes, " + "'" + _block_str + "' for block sizes, " + "'" + _micro_block_str + "' for micro-block sizes, " + "'" + _nano_block_str + "' for nano-block sizes, and " + "'" + _pico_block_str + "' for pico-block sizes. " + "Targets must be separated by a single comma (','). " + "Targets will be tuned in the order given and may be repeated.", + allowed_targets, _tuner_targets)); } // Print usage message. - void KernelSettings::print_usage(ostream& os, - CommandLineParser& app_parser, - const string& pgm_name, - const string& app_notes, - const vector& app_examples) + void KernelSettings::print_usage(ostream& os) { - os << "Usage: " << pgm_name << " [options]\n" - "Options:\n"; - app_parser.print_help(os); CommandLineParser soln_parser; add_options(soln_parser); soln_parser.print_help(os); - os << "\nTerms for the various levels of tiling from smallest to largest:\n" - " A 'point' is a single floating-point (FP) element.\n" - " This binary uses " << REAL_BYTES << "-byte FP elements.\n" - " A 'vector' is composed of points.\n" + os << + "\nTerms for the various work-sizes from largest to smallest:\n" + " The 'global-domain' or 'overall-problem' is the work done across all MPI ranks.\n" + " The global-domain is composed of one or more local-domains.\n" + #ifndef USE_MPI + " This binary has NOT been compiled with MPI support,\n" + " so the global-domain is equivalent to the single local-domain.\n" + #endif + " A 'local-domain' or 'rank-domain' is the work done in one MPI rank.\n" + " Ranks may be evaluated in parallel in separate MPI processes.\n" + " The purpose of local-domains is to control the amount of work done in one\n" + " entire MPI rank.\n" + " Each local-domain is composed of one or more mega-blocks.\n" + " A 'mega-block' is a sub-division of work within a local-domain.\n" + " Mega-blocks are evaluated sequentially within ranks.\n" + " The purpose of mega-blocks is to control the amount of 
work done across an\n" + " entire MPI rank while sharing a large cache.\n" + " If using temporal wave-front rank tiling (see mega-block-size guidelines),\n" + " then this is the work done in each wave-front rank tile;\n" + " else, there is typically only one mega-block the size of the local-domain.\n" + " Each mega-block is composed of one or more blocks.\n" + " A 'block' is a sub-division of work within a mega-block.\n" + " Blocks may be evaluated in parallel within mega-blocks.\n" + " The purpose of blocking is to provide control over the amount of\n" + " work done by each outer OpenMP thread.\n" + " This is the most commonly-tuned work-size for many stencils, especially\n" + " when not using any sort of temporal tiling.\n" + " Each block is composed of one or more micro-blocks.\n" + " A 'micro-block' is a sub-division of work within a block.\n" + " Micro-blocks are evaluated sequentially within blocks.\n" + " The purpose of micro-blocking is to allow distinction between the amount\n" + " of work done by an outer thread (via blocking) and the amount of work done\n" + " for cache locality (via micro-blocking).\n" + " If using temporal wave-front block tiling (see micro-block-size guidelines),\n" + " then this is the work done for each wave-front block tile,\n" + " and the number temporal steps in the micro-block is always equal\n" + " to the number temporal steps a block;\n" + " else, there is typically only one micro-block the size of a block.\n" + " If micro-block sizes are not specified, a micro-block is the same size\n" + " as a block, and the amount of work done by a thread and the amount of\n" + " work done for cache locality is the same.\n" + " Each micro-block is composed of one or more nano-blocks.\n" + " A 'nano-block' is a sub-division of work within a micro-block.\n" + " Nano-blocks may be evaluated in parallel within micro-blocks.\n" + " The purpose of nano-blocking is to allow multiple inner OpenMP threads\n" + " to cooperatively work on a 
micro-block, sharing cached values--\n" + " this is particularly useful when using hyper-threads on a CPU.\n" + " If the number of inner OpenMP threads is greater than one,\n" + " then this is the unit of work for each nested thread,\n" + " and nano-blocks are evaluated concurrently within each micro-block;\n" + " else, nano-blocks are evaluated sequentially within each micro-block.\n" + #ifdef USE_OFFLOAD + " When offloading computation to a device, a nano-block is the unit of work\n" + " done in each offloaded kernel invocation.\n" + #endif + " There is no temporal tiling at the nano-block level.\n" + " Each nano-block is composed of one or more pico-blocks.\n" + " A 'pico-block' is a sub-division of work within a nano-block.\n" + #ifdef USE_OFFLOAD + " Pico-blocks may be evaluated in parallel within nano-blocks on the device.\n" + #else + " Pico-blocks are evaluated sequentially within nano-blocks.\n" + #endif + " The purpose of a pico-block is to allow additional cache-locality\n" + " at this low level.\n" + #ifdef USE_OFFLOAD + " When offloading computation to a device, a pico-block allows\n" + " cache-locality within a kernel work item.\n" + #endif + " There is no temporal tiling at the pico-block level.\n" + " Each pico-block is composed of one or more clusters.\n" + " A 'cluster' is the work done in each inner-most pico-loop iteration.\n" + " Clusters are evaluated sequentially within pico-blocks.\n" + " The purpose of clustering is to allow more than one vector of\n" + " work to be done in each loop iteration, useful for very simple stencils.\n" + " Each cluster is composed of one or more vectors.\n" + " A 'vector' is typically the work done by a SIMD instruction.\n" + " Vectors are evaluated sequentially within clusters.\n" " A 'folded vector' contains points in more than one dimension.\n" " The size of a vector is typically that of a SIMD register.\n" - " A 'vector-cluster' is composed of vectors.\n" - " This is the unit of work done in each inner-most 
loop iteration.\n" - " A 'sub-block' is composed of vector-clusters.\n" - " If the number of block-threads is greater than one,\n" - " then this is the unit of work for one nested OpenMP thread;\n" - " else, sub-blocks are evaluated sequentially within each mini-block.\n" - " A 'mini-block' is composed of sub-blocks.\n" - " If using temporal wave-front block tiling (see mini-block-size guidelines),\n" - " then this is the unit of work for each wave-front block tile,\n" - " and the number temporal steps in the mini-block is always equal\n" - " to the number temporal steps a temporal block;\n" - " else, there is typically only one mini-block the size of a block.\n" - " Mini-blocks are evaluated sequentially within blocks.\n" - " A 'block' is composed of mini-blocks.\n" - " If the number of threads is greater than one (typical),\n" - " then this is the unit of work for one OpenMP thread;\n" - " else, blocks are evaluated sequentially within each region.\n" - " A 'region' is composed of blocks.\n" - " If using temporal wave-front rank tiling (see region-size guidelines),\n" - " then this is the unit of work for each wave-front rank tile;\n" - " else, there is typically only one region the size of the rank-domain.\n" - " Regions are evaluated sequentially within ranks.\n" - " A 'local-domain' or 'rank-domain' is composed of regions.\n" - " This is the unit of work for one MPI rank.\n" - " Ranks are evaluated in parallel in separate MPI processes.\n" - " The 'global-domain' or 'overall-problem' is composed of local-domains.\n" - " This is the unit of work across all MPI ranks.\n" << -#ifndef USE_MPI - " This binary has NOT been compiled with MPI support,\n" - " so the global-domain is equivalent to the single local-domain.\n" << -#endif - "\nGuidelines for setting tiling sizes:\n" - " The vector and vector-cluster sizes are set at compile-time, so\n" - " there are no run-time options to set them.\n" - " Set sub-block sizes to specify a unit of work done by each nested 
OpenMP thread.\n" - " Multiple sub-blocks are intended to allow sharing of caches\n" - " among multiple hyper-threads in a core when there is more than\n" - " one block-thread. It can also be used to share data between caches\n" - " among multiple cores.\n" - " A sub-block size of 0 in a given domain dimension =>\n" - " sub-block size is set to mini-block size in that dimension;\n" - " when there is more than one block-thread, the first dimension\n" - " will instead be set to the vector length to create \"slab\" shapes.\n" - " Set mini-block sizes to control temporal wave-front tile sizes within a block.\n" - " Multiple mini-blocks are intended to increase locality in level-2 caches\n" - " when blocks are larger than L2 capacity.\n" - " A mini-block size of 0 in a given domain dimension =>\n" - " mini-block size is set to block size in that dimension.\n" - " The size of a mini-block in the step dimension is always implicitly\n" - " the same as that of a block.\n" - " Set block sizes to specify a unit of work done by each top-level OpenMP thread.\n" + " Each vector is composed of one or more points.\n" + " A 'point' is a single floating-point (FP) element in a grid.\n" + " Points may be evaluated in parallel within vectors.\n" + " This binary uses " << REAL_BYTES << "-byte FP elements.\n" + "\n" + "Guidelines for setting work-sizes and their defaults:\n" + " The global-domain sizes specify the work done across all MPI ranks.\n" + " A global-domain size of 0 in a given domain dimension =>\n" + " global-domain size is the sum of local-domain sizes in that dimension.\n" + " The local-domain sizes specify the work done on each MPI rank.\n" + " A local-domain size of 0 in a given domain dimension =>\n" + " local-domain size is determined by the global-domain size in that dimension.\n" + " This and the number of vars affect the amount of memory used per rank.\n" + " Either the global-domain size or the local-domain size must be specified.\n" + " The mega-block sizes are 
used to configure temporal wave-front rank tiling.\n" + " Temporal wave-front rank tiling may increase locality in large shared caches\n" + " when a local-domain is larger than the capacity of those caches.\n" + " A mega-block size >1 in the step dimension (e.g., '-rt') enables wave-front rank tiling.\n" + " A mega-block size of 0 in the step dimension => the temporal wave-front\n" + " rank tiling will have the same number of steps as the temporal block tiling.\n" + " The mega-block size in the step dimension affects how often MPI halo-exchanges occur:\n" + " A mega-block size of 0 in the step dimension => exchange after every stage.\n" + " A mega-block size >0 in the step dimension => exchange after that many steps.\n" + " The block sizes specify the work done by each top-level OpenMP thread.\n" " A block size of 0 in a given domain dimension =>\n" - " block size is set to region size in that dimension.\n" + " block size is set to mega-block size in that dimension.\n" " A block size of 0 in the step dimension (e.g., '-bt') disables any temporal blocking.\n" " A block size of 1 in the step dimension enables temporal blocking, but only between\n" " stages in the same step.\n" - " A block size >1 in the step dimension enables temporal blocking across multiple steps.\n" + " A block size >1 in the step dimension enables temporal mega-block tiling across multiple steps.\n" " The temporal block size may be automatically reduced if needed based on the\n" - " domain block sizes and the stencil halos.\n" - " Set region sizes to control temporal wave-front tile sizes within a rank.\n" - " Multiple regions are intended to increase locality in level-3 caches\n" - " when ranks are larger than L3 capacity.\n" - " A region size of 0 in the step dimension (e.g., '-rt') => region size is\n" - " set to block size in the step dimension.\n" - " A region size >1 in the step dimension enables wave-front rank tiling.\n" - " The region size in the step dimension affects how often MPI 
halo-exchanges occur:\n" - " A region size of 0 in the step dimension => exchange after every stage.\n" - " A region size >0 in the step dimension => exchange after that many steps.\n" - " Set local-domain sizes to specify the work done on this MPI rank.\n" - " A local-domain size of 0 in a given domain dimension =>\n" - " local-domain size is determined by the global-domain size in that dimension.\n" - " This and the number of vars affect the amount of memory used.\n" - " Set global-domain sizes to specify the work done across all MPI ranks.\n" - " A global-domain size of 0 in a given domain dimension =>\n" - " global-domain size is the sum of local-domain sizes in that dimension.\n" -#ifdef SHOW_GROUPS - " Setting 'group' sizes controls only the order of tiles.\n" - " These are advanced settings that are not commonly used.\n" -#endif - "\nControlling OpenMP threading:\n" - " Using '-max_threads 0' =>\n" - " max_threads is set to OpenMP's default number of threads.\n" - " The -thread_divisor option is a convenience to control the number of\n" - " hyper-threads used without having to know the number of cores,\n" - " e.g., using '-thread_divisor 2' will halve the number of OpenMP threads.\n" - " For stencil evaluation, threads are allocated using nested OpenMP:\n" - " Num threads per region = max_threads / thread_divisor / block_threads.\n" - " Num threads per block = block_threads.\n" - " Num threads per sub-block = 1.\n" - " Num threads used for halo exchange is same as num per region.\n" << -#ifdef USE_MPI + " domain block sizes, the stencil halos, and the step size of the mega-blocks.\n" + " The micro-block sizes are used to configure temporal wave-front block tiling.\n" + " Temporal wave-front block tiling may increase locality in core-local caches\n" + " (e.g., L2) when blocks are larger than that the capacity of those caches.\n" + " A micro-block size of 0 in a given domain dimension =>\n" + " micro-block size is set to block size in that dimension.\n" + " 
The size of a micro-block in the step dimension is always implicitly\n" + " the same as that of a block.\n" + " The nano-block sizes specify the work done by each nested OpenMP thread.\n" + " Multiple nano-blocks may enable more effective sharing of caches\n" + " among multiple hyper-threads in a core when there is more than\n" + " one block-thread. It can also be used to share data between caches\n" + " among multiple cores.\n" + " A nano-block size of 0 in a given domain dimension =>\n" + " nano-block size is set to micro-block size in that dimension;\n" + " when there is more than one block-thread, the first dimension\n" + " will instead be set to the vector length to create \"slab\" shapes.\n" + " A pico-block size of 0 in a given domain dimension =>\n" + " pico-block size is set to cluster size in that dimension;\n" + " The vector and cluster sizes are set at compile-time, so\n" + " there are no run-time options to set them.\n" + #ifdef USE_TILING + " Set 'tile' sizes to provide finer control over the order of evaluation\n" + " within the given area. For example, nano-block-tiles create smaller areas\n" + " within nano-blocks; points with the first nano-block-tile will be scheduled\n" + " before those the second nano-block-tile, etc. (There is no additional level\n" + " of temporal tiling or sychronization added with this tiling.)\n" + " A tile size of 0 in a given domain dimension => tile size is set to the size\n" + " of its enclosing area in that dimension, i.e., there will only be one tile\n" + " in that dimension.\n" + #endif + #ifdef USE_MPI "\nControlling MPI scaling:\n" - " To 'strong-scale' a given overall-problem size, use multiple MPI ranks\n" - " and keep the global-domain sizes constant.\n" - " To 'weak-scale' to a larger overall-problem size, use multiple MPI ranks\n" - " and keep the local-domain sizes constant.\n" << -#endif - app_notes; - - // Make example knobs. 
- string ex1, ex2; - DOMAIN_VAR_LOOP(i, j) { - auto& dname = _dims->_domain_dims.get_dim_name(j); - ex1 += " -g" + dname + " " + to_string(i * 128); - ex2 += " -nr" + dname + " " + to_string(i + 1); - } - os << - "\nExamples:\n" - " " << pgm_name << " -g 768 # global-domain size in all dims.\n" - " " << pgm_name << ex1 << " # global-domain size in each dim.\n" - " " << pgm_name << " -l 2048 -r 512 -rt 10 # local-domain size and temporal rank tiling.\n" - " " << pgm_name << " -g 512" << ex2 << " # number of ranks in each dim.\n"; - for (auto ae : app_examples) - os << " " << pgm_name << " " << ae << endl; - os << flush; + " To 'strong-scale' a given overall-problem size, use multiple MPI ranks\n" + " and keep the global-domain sizes constant.\n" + " To 'weak-scale' to a larger overall-problem size, use multiple MPI ranks\n" + " and keep the local-domain sizes constant.\n" + #endif + "\nControlling OpenMP CPU threading:\n" + " For stencil evaluation, threads are allocated using nested OpenMP:\n" + " Num outer_threads = max_threads / inner_threads if not specified.\n" + " Num CPU threads per rank and mega-block = outer_threads.\n" + " Num CPU threads per block = inner_threads.\n" + " Num CPU threads per micro-block, nano-block, and pico-block = 1.\n"; } - - // For each one of 'inner_sizes' that is zero, - // make it equal to corresponding one in 'outer_sizes'. - // Round up each of 'inner_sizes' to be a multiple of corresponding one in 'mults'. + void KernelSettings::print_values(ostream& os) + { + CommandLineParser soln_parser; + add_options(soln_parser); + soln_parser.print_values(os); + } + + // For each one of 'inner_sizes' dim that is zero, + // make it equal to same dim in 'outer_sizes'. + // Round up each of 'inner_sizes' dim to be a multiple of same dim in 'mults'. + // Limit size to 'outer_sizes'. // Output info to 'os' using '*_name' and dim names. // Does not process 'step_dim'. // Return product of number of inner subsets. 
- idx_t KernelSettings::find_num_subsets(ostream& os, - IdxTuple& inner_sizes, const string& inner_name, - const IdxTuple& outer_sizes, const string& outer_name, - const IdxTuple& mults, const std::string& step_dim) { + static idx_t find_num_subsets(ostream& os, + IdxTuple& inner_sizes, const string& inner_name, + const IdxTuple& outer_sizes, const string& outer_name, + const IdxTuple& mults, const string& mult_name, + const std::string& step_dim) { idx_t prod = 1; + bool rounded = false; + bool trimmed = false; for (auto& dim : inner_sizes) { auto& dname = dim._get_name(); if (dname == step_dim) continue; idx_t* dptr = inner_sizes.lookup(dname); // use lookup() to get non-const ptr. + // Set default to outer size. idx_t outer_size = outer_sizes[dname]; if (*dptr <= 0) *dptr = outer_size; // 0 => use full size as default. - if (mults.lookup(dname) && mults[dname] > 1) - *dptr = ROUND_UP(*dptr, mults[dname]); + + // Round up. + if (mults.lookup(dname) && mults[dname] > 1) { + idx_t rsz = ROUND_UP(*dptr, mults[dname]); + if (rsz != *dptr) { + *dptr = rsz; + rounded = true; + } + } + + // Limit. + if (*dptr > outer_size) { + *dptr = outer_size; + trimmed = true; + } + + // Calc stats. idx_t inner_size = *dptr; idx_t ninner = (inner_size <= 0) ? 0 : (outer_size + inner_size - 1) / inner_size; // full or partial. @@ -609,16 +697,25 @@ namespace yask { idx_t nfull = rem ? (ninner - 1) : ninner; // full only. if (outer_size > 0) { - os << " In '" << dname << "' dimension, " << + os << " In '" << dname << "' dim, " << outer_name << " of size " << - outer_size << " contains " << nfull << " " << - inner_name << "(s) of size " << inner_size; + outer_size << " contains " << ninner << " " << + inner_name << "(s)"; if (rem) - os << " plus 1 remainder " << inner_name << " of size " << rem; - os << "." 
<< endl; + os << ": " << nfull << " of full-size " << inner_size << + " plus 1 of remainder-size " << rem; + else + os << " of size " << inner_size; + os << ".\n"; } prod *= ninner; } + if (rounded) + os << " The " << inner_name << " sizes have been rounded up to multiples of " << + mult_name << " sizes.\n"; + if (trimmed) + os << " The " << inner_name << " sizes have been limited to " << + outer_name << " sizes.\n"; return prod; } @@ -626,74 +723,94 @@ namespace yask { // other vars before allocating memory. // Called from prepare_solution(), during auto-tuning, etc. void KernelSettings::adjust_settings(KernelStateBase* ksb) { + + // Null stream to throw away debug info if 'ksb' is null. + yask_output_factory yof; + auto nullop = yof.new_null_output(); yask_output_ptr op = ksb ? ksb->get_debug_output() : nullop; ostream& os = op->get_ostream(); auto& step_dim = _dims->_step_dim; - auto& inner_dim = _dims->_inner_dim; - auto& rt = _region_sizes[step_dim]; + auto& inner_layout_dim = _dims->_inner_layout_dim; + auto& inner_loop_dim = _dims->_inner_loop_dim; + auto& rt = _mega_block_sizes[step_dim]; auto& bt = _block_sizes[step_dim]; - auto& mbt = _mini_block_sizes[step_dim]; + auto& mbt = _micro_block_sizes[step_dim]; auto& cluster_pts = _dims->_cluster_pts; - int nddims = _dims->_domain_dims._get_num_dims(); + int nddims = _dims->_domain_dims.get_num_dims(); // Fix up step-dim sizes. rt = max(rt, idx_t(0)); bt = max(bt, idx_t(0)); mbt = max(mbt, idx_t(0)); if (!rt) - rt = bt; // Default region steps == block steps. + rt = bt; // Default mega-block steps == block steps. if (!mbt) - mbt = bt; // Default mini-blk steps == block steps. + mbt = bt; // Default micro-blk steps == block steps. - // Determine num regions. - // Also fix up region sizes as needed. - // Temporal region size will be increase to + // Adjust defaults for blocks on CPU or pico-blocks on GPU. 
+ DOMAIN_VAR_LOOP(i, j) { + #ifdef USE_OFFLOAD + if (!_pico_block_sizes[i]) + _pico_block_sizes[i] = def_blk_size; + #else + if (!_block_sizes[i]) + _block_sizes[i] = def_blk_size; + #endif + } + + // Determine num mega-blocks. + // Also fix up mega-block sizes as needed. + // Temporal mega-block size will be increased to // current temporal block size if needed. - // Default region size (if 0) will be size of rank-domain. - os << "\nRegions:" << endl; - auto nr = find_num_subsets(os, _region_sizes, "region", - _rank_sizes, "local-domain", - cluster_pts, step_dim); - os << " num-regions-per-local-domain-per-step: " << nr << endl; - os << " Since the region size in the '" << step_dim << - "' dim is " << rt << ", temporal wave-front rank tiling is "; + // Default mega-block size (if 0) will be size of rank-domain. + os << "\nMega-Blocks:" << endl; + auto nr = find_num_subsets(os, + _mega_block_sizes, "mega-block", + _rank_sizes, "local-domain", + cluster_pts, "cluster", + step_dim); + os << " num-mega-blocks-per-local-domain-per-step: " << nr << endl; + os << " Since the mega-block size in the '" << step_dim << + "' dim is " << rt << ", temporal wave-front tiling of each local-domain is "; if (!rt) os << "NOT "; os << "enabled.\n"; // Determine num blocks. // Also fix up block sizes as needed. - // Default block size (if 0) will be size of region. 
os << "\nBlocks:" << endl; - auto nb = find_num_subsets(os, _block_sizes, "block", - _region_sizes, "region", - cluster_pts, step_dim); - os << " num-blocks-per-region-per-step: " << nb << endl; + auto nb = find_num_subsets(os, + _block_sizes, "block", + _mega_block_sizes, "mega-block", + cluster_pts, "cluster", + step_dim); + os << " num-blocks-per-mega-block-per-step: " << nb << endl; os << " num-blocks-per-local-domain-per-step: " << (nb * nr) << endl; os << " Since the block size in the '" << step_dim << - "' dim is " << bt << ", temporal blocking is "; + "' dim is " << bt << ", temporal concurrent tiling of each mega-block is "; if (!bt) os << "NOT "; os << "enabled.\n"; - // Determine num mini-blocks. - // Also fix up mini-block sizes as needed. - os << "\nMini-blocks:" << endl; - auto nmb = find_num_subsets(os, _mini_block_sizes, "mini-block", - _block_sizes, "block", - cluster_pts, step_dim); - os << " num-mini-blocks-per-block-per-step: " << nmb << endl; - os << " num-mini-blocks-per-region-per-step: " << (nmb * nb) << endl; - os << " num-mini-blocks-per-local-domain-per-step: " << (nmb * nb * nr) << endl; - os << " Since the mini-block size in the '" << step_dim << - "' dim is " << mbt << ", temporal wave-front block tiling is "; + // Determine num micro-blocks. + // Also fix up micro-block sizes as needed. 
+ os << "\nMicro-blocks:" << endl; + auto nmb = find_num_subsets(os, + _micro_block_sizes, "micro-block", + _block_sizes, "block", + cluster_pts, "cluster", + step_dim); + os << " num-micro-blocks-per-block-per-step: " << nmb << endl; + os << " num-micro-blocks-per-mega-block-per-step: " << (nmb * nb) << endl; + os << " num-micro-blocks-per-local-domain-per-step: " << (nmb * nb * nr) << endl; + os << " Since the micro-block size in the '" << step_dim << + "' dim is " << mbt << ", temporal wave-front tiling of each block is "; if (!mbt) os << "NOT "; os << "enabled.\n"; - // Adjust defaults for sub-blocks to be slab if - // we are using more than one block thread. - // Otherwise, find_num_subsets() would set default - // to entire block. - if (num_block_threads > 1 && _sub_block_sizes.sum() == 0) { + // Adjust defaults for nano-blocks to be slab if we are using more + // than one block thread. Otherwise, find_num_subsets() would set + // default to entire block, and we wouldn't use multiple threads. + if (num_inner_threads > 1 && _nano_block_sizes.sum() == 0) { // Default dim is outer one. _bind_posn = 1; @@ -704,7 +821,7 @@ namespace yask { // Don't pick inner dim. auto& dname = _dims->_domain_dims.get_dim_name(j); - if (dname == inner_dim) + if (dname == inner_loop_dim) continue; auto bsz = _block_sizes[i]; @@ -713,7 +830,7 @@ namespace yask { // Subdivide this dim if there are enough clusters in // the block for each thread. - if (clus_per_blk >= num_block_threads) { + if (clus_per_blk >= num_inner_threads) { _bind_posn = i; // Stop when first dim picked. @@ -728,135 +845,154 @@ namespace yask { // Use narrow slabs if at least 2D. // TODO: consider a better heuristic. if (nddims >= 2) - _sub_block_sizes[_bind_posn] = cpts; + _nano_block_sizes[_bind_posn] = cpts; // Divide block equally. 
else - _sub_block_sizes[_bind_posn] = ROUND_UP(bsz / num_block_threads, cpts); + _nano_block_sizes[_bind_posn] = ROUND_UP(bsz / num_inner_threads, cpts); } - // Determine num sub-blocks. - // Also fix up sub-block sizes as needed. - os << "\nSub-blocks:" << endl; - auto nsb = find_num_subsets(os, _sub_block_sizes, "sub-block", - _mini_block_sizes, "mini-block", - cluster_pts, step_dim); - os << " num-sub-blocks-per-mini-block-per-step: " << nsb << endl; - os << " num-sub-blocks-per-block-per-step: " << (nsb * nmb) << endl; - os << " num-sub-blocks-per-region-per-step: " << (nsb * nmb * nb) << endl; - os << " num-sub-blocks-per-rank-per-step: " << (nsb * nmb * nb * nr) << endl; + // Determine num nano-blocks. + // Also fix up nano-block sizes as needed. + os << "\nNano-blocks:" << endl; + auto nsb = find_num_subsets(os, + _nano_block_sizes, "nano-block", + _micro_block_sizes, "micro-block", + cluster_pts, "cluster", + step_dim); + os << " num-nano-blocks-per-micro-block-per-step: " << nsb << endl; + os << " num-nano-blocks-per-block-per-step: " << (nsb * nmb) << endl; + os << " num-nano-blocks-per-mega-block-per-step: " << (nsb * nmb * nb) << endl; + os << " num-nano-blocks-per-rank-per-step: " << (nsb * nmb * nb * nr) << endl; + os << " Temporal tiling of micro-blocks is never enabled.\n"; + + // Determine num pico-blocks. + // Also fix up pico-block sizes as needed. 
+ os << "\nPico-blocks:" << endl; + auto npb = find_num_subsets(os, + _pico_block_sizes, "pico-block", + _nano_block_sizes, "nano-block", + cluster_pts, "cluster", + step_dim); + os << " num-pico-blocks-per-nano-block-per-step: " << npb << endl; + os << " num-pico-blocks-per-micro-block-per-step: " << (npb * nsb) << endl; + os << " num-pico-blocks-per-block-per-step: " << (npb * nsb * nmb) << endl; + os << " num-pico-blocks-per-mega-block-per-step: " << (npb * nsb * nmb * nb) << endl; + os << " num-pico-blocks-per-rank-per-step: " << (npb * nsb * nmb * nb * nr) << endl; + os << " Temporal tiling of nano-blocks is never enabled.\n"; // Determine binding dimension. Do this again if it was done above // by default because it may have changed during adjustment. - if (bind_block_threads && num_block_threads > 1) { + if (bind_inner_threads && num_inner_threads > 1) { DOMAIN_VAR_LOOP(i, j) { // Don't pick inner dim. auto& dname = _dims->_domain_dims.get_dim_name(j); - if (dname == inner_dim) + if (dname == inner_loop_dim) continue; auto bsz = _block_sizes[i]; - auto sbsz = _sub_block_sizes[i]; + auto sbsz = _nano_block_sizes[i]; auto sb_per_b = CEIL_DIV(bsz, sbsz); - // Choose first dim with enough sub-blocks + // Choose first dim with enough nano-blocks // per block. - if (sb_per_b >= num_block_threads) { + if (sb_per_b >= num_inner_threads) { _bind_posn = i; break; } } - os << " Note: only the sub-block size in the '" << + os << " Note: only the nano-block size in the '" << _dims->_stencil_dims.get_dim_name(_bind_posn) << "' dimension may be used at run-time\n" - " because block-thread binding is enabled on " << num_block_threads << " block threads.\n"; + " because block-thread binding is enabled on " << num_inner_threads << " block threads.\n"; } - // Now, we adjust groups. These are done after all the above sizes - // because group sizes are more like 'guidelines' and don't have + #ifdef USE_TILING + // Now, we adjust tiles. 
These are done after all the above sizes + // because tile sizes are more like 'guidelines' and don't have // their own loops. - // Adjust defaults for groups to be min size. - // Otherwise, find_num_block_groups_in_region() would set default - // to entire region. - DOMAIN_VAR_LOOP(i, j) { - if (_block_group_sizes[i] == 0) - _block_group_sizes[i] = 1; // will be rounded up to min size. - if (_mini_block_group_sizes[i] == 0) - _mini_block_group_sizes[i] = 1; // will be rounded up to min size. - if (_sub_block_group_sizes[i] == 0) - _sub_block_group_sizes[i] = 1; // will be rounded up to min size. - } - -#ifdef SHOW_GROUPS - os << "\nGroups (only affect ordering):" << endl; - - // Show num block-groups. - // TODO: only print this if block-grouping is enabled. - auto nbg = find_num_subsets(os, _block_group_sizes, "block-group", - _region_sizes, "region", - _block_sizes, step_dim); - os << " num-block-groups-per-region-per-step: " << nbg << endl; - auto nb_g = find_num_subsets(os, _block_sizes, "block", - _block_group_sizes, "block-group", - cluster_pts, step_dim); - os << " num-blocks-per-block-group-per-step: " << nb_g << endl; - - // Show num mini-block-groups. - // TODO: only print this if mini-block-grouping is enabled. - auto nmbg = find_num_subsets(os, _mini_block_group_sizes, "mini-block-group", - _block_sizes, "block", - _mini_block_sizes, step_dim); - os << " num-mini-block-groups-per-block-per-step: " << nmbg << endl; - auto nmb_g = find_num_subsets(os, _mini_block_sizes, "mini-block", - _mini_block_group_sizes, "mini-block-group", - cluster_pts, step_dim); - os << " num-mini-blocks-per-block-group-per-step: " << nmb_g << endl; - - // Show num sub-block-groups. - // TODO: only print this if sub-block-grouping is enabled. 
- auto nsbg = find_num_subsets(os, _sub_block_group_sizes, "sub-block-group", - _mini_block_sizes, "mini-block", - _sub_block_sizes, step_dim); - os << " num-sub-block-groups-per-mini-block-per-step: " << nsbg << endl; - auto nsb_g = find_num_subsets(os, _sub_block_sizes, "sub-block", - _sub_block_group_sizes, "sub-block-group", - _dims->_cluster_pts, step_dim); - os << " num-sub-blocks-per-sub-block-group-per-step: " << nsb_g << endl; -#endif + // Show num rank-tiles. + // TODO: only print this if rank-tiling is enabled. + os << "\nLocal-domain tiles:\n"; + auto nlg = find_num_subsets(os, + _rank_tile_sizes, "local-domain-tile", + _rank_sizes, "local-domain", + _mega_block_sizes, "mega-block", + step_dim); + os << " num-local-domain-tiles-per-local-domain-per-step: " << nlg << endl; + + // Show num mega-block-tiles. + // TODO: only print this if mega-block-tiling is enabled. + os << "\nMega-Block tiles:\n"; + auto nrg = find_num_subsets(os, + _mega_block_tile_sizes, "mega-block-tile", + _mega_block_sizes, "mega-block", + _block_sizes, "block", + step_dim); + os << " num-mega-block-tiles-per-mega-block-per-step: " << nrg << endl; + + // Show num block-tiles. + // TODO: only print this if block-tiling is enabled. + os << "\nBlock tiles:\n"; + auto nbg = find_num_subsets(os, + _block_tile_sizes, "block-tile", + _block_sizes, "block", + _micro_block_sizes, "micro-block", + step_dim); + os << " num-block-tiles-per-block-per-step: " << nbg << endl; + + // Show num micro-block-tiles. + // TODO: only print this if micro-block-tiling is enabled. + os << "\nMicro-block tiles:\n"; + auto nmbt = find_num_subsets(os, + _micro_block_tile_sizes, "micro-block-tile", + _micro_block_sizes, "micro-block", + _nano_block_sizes, "nano-block", + step_dim); + os << " num-micro-block-tiles-per-micro-block-per-step: " << nmbt << endl; + + // Show num nano-block-tiles. + // TODO: only print this if nano-block-tiling is enabled. 
+ os << "\nNano-block tiles:\n"; + auto nsbt = find_num_subsets(os, + _nano_block_tile_sizes, "nano-block-tile", + _nano_block_sizes, "nano-block", + _pico_block_sizes, "pico-block", + step_dim); + os << " num-nano-block-tiles-per-nano-block-per-step: " << nsbt << endl; + + // NB: there are no pico-block tiles. + #endif + os << endl; } // Ctor. KernelStateBase::KernelStateBase(KernelEnvPtr& kenv, - KernelSettingsPtr& ksettings) + KernelSettingsPtr& kactl_opts, + KernelSettingsPtr& kreq_opts) { + host_assert(kenv); + host_assert(kactl_opts); + host_assert(kreq_opts); + host_assert(kactl_opts->_dims); + // Create state. All other objects that need to share // this state should use a shared ptr to it. _state = make_shared(); - + // Share passed ptrs. - assert(kenv); _state->_env = kenv; - assert(ksettings); - _state->_opts = ksettings; - assert(ksettings->_dims); - _state->_dims = ksettings->_dims; + _state->_actl_opts = kactl_opts; + _state->_req_opts = kreq_opts; + _state->_dims = kactl_opts->_dims; - // Create MPI Info object. - _state->_mpi_info = make_shared(ksettings->_dims); + // Create MPI Info object. + _state->_mpi_info = make_shared(_state->_dims); // Set vars after above inits. STATE_VARS(this); - // Find index posns in stencil dims. - DOMAIN_VAR_LOOP(i, j) { - auto& dname = stencil_dims.get_dim_name(i); - if (state->_outer_posn < 0) - state->_outer_posn = i; - if (dname == dims->_inner_dim) - state->_inner_posn = i; - } - assert(outer_posn == state->_outer_posn); } // Set number of threads w/o using thread-divisor. @@ -866,98 +1002,83 @@ namespace yask { STATE_VARS(this); // Get max number of threads. - int mt = max(opts->max_threads, 1); + int mt = max(actl_opts->max_threads, 1); - // Reset number of OMP threads to max allowed and disable nesting. + // Set num threads to use for inner and outer loops. + yask_num_threads[0] = mt; + yask_num_threads[1] = 0; + + // Reset number of OMP threads to max allowed. 
omp_set_num_threads(mt); - omp_set_nested(0); - omp_set_max_active_levels(1); return mt; } // Get total number of computation threads to use. - int KernelStateBase::get_num_comp_threads(int& region_threads, int& blk_threads) const { + int KernelStateBase::get_num_comp_threads(int& outer_threads, int& inner_threads) const { STATE_VARS(this); - // Max threads / divisor. - int mt = max(opts->max_threads, 1); - int td = max(opts->thread_divisor, 1); - int at = mt / td; - at = max(at, 1); - - // Blk threads per region thread. - int bt = max(opts->num_block_threads, 1); - bt = min(bt, at); // Cannot be > 'at'. - blk_threads = bt; - assert(bt >= 1); - - // Region threads. - int rt = at / bt; - rt = max(rt, 1); - region_threads = rt; - assert(rt >= 1); - - // Total number of block threads. + int mt = max(actl_opts->max_threads, 1); + + int it = max(actl_opts->num_inner_threads, 1); + it = min(it, mt); + inner_threads = it; + + int max_ot = max(mt / it, 1); + int ot = actl_opts->num_outer_threads; + if (ot <= 0) + ot = max_ot; + ot = min(ot, max_ot); + outer_threads = ot; + + // Total number of inner threads. // Might be less than max threads due to truncation. - int ct = bt * rt; + int ct = it * ot; assert(ct <= mt); return ct; } - // Set number of threads to use for a region. - // Enable nested OMP if there are >1 block threads, - // disable otherwise. + // Set number of threads to use for a mega-block. + // Enable nested OMP. // Return number of threads. // Do nothing and return 0 if not properly initialized. - int KernelStateBase::set_region_threads() { + int KernelStateBase::set_num_outer_threads() { int rt=0, bt=0; int at = get_num_comp_threads(rt, bt); - // Must call before entering top parallel region. + // Must call before entering top parallel mega-block. int ol = omp_get_level(); assert(ol == 0); - // Limit outer nesting to allow num_block_threads per nested - // block loop. + // Enable nested OMP. 
+ omp_set_nested(1); + omp_set_max_active_levels(yask_max_levels + 1); // Add 1 for offload. + + // Set num threads to use for inner and outer loops. yask_num_threads[0] = rt; + yask_num_threads[1] = bt; - if (bt > 1) { - omp_set_nested(1); - omp_set_max_active_levels(2); - int mal = omp_get_max_active_levels(); - assert (mal == 2); - yask_num_threads[1] = bt; - } - else { - assert(bt == 1); - omp_set_nested(0); - omp_set_max_active_levels(1); - int mal = omp_get_max_active_levels(); - assert (mal == 1); - yask_num_threads[1] = 0; - } - + // Set num threads for a mega-block. omp_set_num_threads(rt); return rt; } // Set number of threads for a block. - // Must be called from within a top-level OMP parallel region. + // Must be called from within a top-level OMP parallel mega-block. // Return number of threads. // Do nothing and return 0 if not properly initialized. - int KernelStateBase::set_block_threads() { + int KernelStateBase::set_num_inner_threads() { int rt=0, bt=0; int at = get_num_comp_threads(rt, bt); // Must call within top parallel region. + #ifdef _OPENMP int ol = omp_get_level(); assert(ol == 1); + int mal = omp_get_max_active_levels(); + assert (mal >= 2); + #endif - if (bt > 1) { - int mal = omp_get_max_active_levels(); - assert (mal == 2); - omp_set_num_threads(bt); - } + omp_set_num_threads(bt); return bt; } diff --git a/src/kernel/lib/settings.hpp b/src/kernel/lib/settings.hpp index e4741c1f..498e3b1f 100644 --- a/src/kernel/lib/settings.hpp +++ b/src/kernel/lib/settings.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -41,6 +41,8 @@ namespace yask { typedef std::vector ScratchVecs; // Environmental settings. 
+ // Several member vars are static because they are considered global + // and set/get can be called w/o an obj. class KernelEnv : public virtual yk_env { @@ -50,11 +52,18 @@ namespace yask { public: - // Output stream for messages. - yask_output_ptr _debug; + // Output stream for debug & trace messages. + static yask_output_ptr _debug; // Is tracing enabled? - bool _trace = false; + static bool _trace; + + // OMP offload devices. + static bool _use_offload; + #ifdef USE_OFFLOAD + static int _omp_hostn; + static int _omp_devn; + #endif // MPI vars. MPI_Comm comm = MPI_COMM_NULL; // global communicator. @@ -71,10 +80,7 @@ namespace yask { // OMP vars. int max_threads=0; // initial value from OMP. - KernelEnv() { - yask_output_factory yof; - set_debug_output(yof.new_stdout_output()); - } + KernelEnv() { } virtual ~KernelEnv() { } // Init MPI, OMP, etc. @@ -104,34 +110,26 @@ namespace yask { virtual void global_barrier() const { MPI_Barrier(comm); } - virtual yask_output_ptr get_debug_output() const { - return _debug; - } - virtual void set_debug_output(yask_output_ptr debug) { - _debug = debug; - } - virtual bool is_trace_enabled() const { - return _trace; - } - virtual void set_trace_enabled(bool enable) { - _trace = enable; - } }; typedef std::shared_ptr KernelEnvPtr; // Dimensions for a solution. // Similar to the Dimensions class in the YASK compiler // from which these values are set. + // These are not changed after initialization. struct Dims { // Algorithm for vec dims in fold layout. VEC_FOLD_LAYOUT_CLASS _vec_fold_layout; - // Dimensions with 0 values. + // Special dims. std::string _step_dim; // usually time, 't'. - std::string _inner_dim; // the domain dim used in the inner loop. - IdxTuple _domain_dims; - IdxTuple _stencil_dims; // step & domain dims. + std::string _inner_layout_dim; // innermost index in layout. + std::string _inner_loop_dim; // innermost index in pico loops. + + // Dimensions with 0 values. + IdxTuple _domain_dims; // e.g., 'x', 'y'. 
+ IdxTuple _stencil_dims; // union of step & domain dims. IdxTuple _misc_dims; // Dimensions and sizes. @@ -140,6 +138,9 @@ namespace yask { IdxTuple _cluster_pts; // all domain dims. IdxTuple _cluster_mults; // all domain dims. + // Just sizes. + Indices _fold_sizes; // all domain dims. + // Direction of step. // This is a heuristic value used only for stepping the // perf-measuring utility and the auto-tuner. @@ -156,7 +157,7 @@ namespace yask { // Get linear index into a vector given 'fold_ofs', which are // element offsets that must be *exactly* those in _vec_fold_pts. idx_t get_elem_index_in_vec(const Indices& fold_ofs) const { - assert(fold_ofs._get_num_dims() == NUM_VEC_FOLD_DIMS); + host_assert(fold_ofs.get_num_dims() == NUM_VEC_FOLD_DIMS); // Use compiler-generated fold macro. idx_t i = VEC_FOLD_LAYOUT(fold_ofs); @@ -164,43 +165,19 @@ namespace yask { #ifdef DEBUG_LAYOUT // Use compiler-generated fold layout class. idx_t j = _vec_fold_layout.layout(fold_ofs); - assert(i == j); + host_assert(i == j); #endif return i; } - - // Get linear index into a vector given 'elem_ofs', which are - // element offsets that may include other dimensions. - idx_t get_elem_index_in_vec(const IdxTuple& elem_ofs) const { - assert(_vec_fold_pts._get_num_dims() == NUM_VEC_FOLD_DIMS); - if (NUM_VEC_FOLD_DIMS == 0) - return 0; - - // Get required offsets into an Indices obj. - IdxTuple fold_ofs(_vec_fold_pts); - fold_ofs.set_vals_same(0); - fold_ofs.set_vals(elem_ofs, false); // copy only fold offsets. - Indices fofs(fold_ofs); - - // Call version that requires vec-fold offsets only. - idx_t i = get_elem_index_in_vec(fofs); - - // Use fold layout to find element index. -#ifdef DEBUG_LAYOUT - idx_t i2 = _vec_fold_pts.layout(fold_ofs, false); - assert(i == i2); -#endif - return i; - } }; typedef std::shared_ptr DimsPtr; - // Utility to determine number of points in a "sizes" var. + // Utility to determine number of spatial points in a "sizes" var. 
inline idx_t get_num_domain_points(const IdxTuple& sizes) { - assert(sizes._get_num_dims() == NUM_STENCIL_DIMS); + host_assert(sizes.get_num_dims() == NUM_STENCIL_DIMS); idx_t pts = 1; - DOMAIN_VAR_LOOP(i, j) + DOMAIN_VAR_LOOP_FAST(i, j) pts *= sizes[i]; return pts; } @@ -209,65 +186,90 @@ namespace yask { // of these vars can be set via cmd-line options and/or APIs. class KernelSettings { - // Null stream to throw away debug info. - yask_output_factory yof; - yask_output_ptr nullop = yof.new_null_output(); + // Default block size on CPU. + int def_blk_size = 0; public: + // Abbreviations for sizes. + static const std::string _mega_block_str; + static const std::string _block_str; + static const std::string _micro_block_str; + static const std::string _nano_block_str; + static const std::string _pico_block_str; + // Ptr to problem dimensions (NOT sizes), folding, etc. // This is solution info from the YASK compiler. DimsPtr _dims; // Sizes in elements (points). - // All these tuples contain stencil dims, even the ones that - // don't strictly need them. + // All these tuples contain step dims, even the ones that + // don't use them, for consistency. IdxTuple _global_sizes; // Overall problem domain sizes. - IdxTuple _rank_sizes; // This rank's domain sizes. - IdxTuple _region_sizes; // region size (used for wave-front tiling). - IdxTuple _block_group_sizes; // block-group size (only used for 'grouped' region loops). + IdxTuple _rank_sizes; // This rank's domain (local) sizes. + IdxTuple _rank_tile_sizes; // rank-tile size (only used for 'tiled' rank loops). + IdxTuple _mega_block_sizes; // mega-block size (used for wave-front tiling). + IdxTuple _mega_block_tile_sizes; // mega-block-tile size (only used for 'tiled' mega-block loops). IdxTuple _block_sizes; // block size (used for each outer thread). - IdxTuple _mini_block_group_sizes; // mini-block-group size (only used for 'grouped' block loops). 
- IdxTuple _mini_block_sizes; // mini-block size (used for wave-fronts in blocks). - IdxTuple _sub_block_group_sizes; // sub-block-group size (only used for 'grouped' mini-block loops). - IdxTuple _sub_block_sizes; // sub-block size (used for each nested thread). + IdxTuple _block_tile_sizes; // block-tile size (only used for 'tiled' block loops). + IdxTuple _micro_block_sizes; // micro-block size (used for wave-fronts in blocks). + IdxTuple _micro_block_tile_sizes; // micro-block-tile size (only used for 'tiled' micro-block loops). + IdxTuple _nano_block_sizes; // nano-block size (used for each nested thread). + IdxTuple _nano_block_tile_sizes; // nano-block-tile size (only used for 'tiled' nano-block loops). + IdxTuple _pico_block_sizes; // pico-block size (used within nano-blocks, no pico-tiling). + + // Global padding applied to all vars by default. + // These tuples contain all stencil dims, even though the step dim isn't used. IdxTuple _min_pad_sizes; // minimum spatial padding (including halos). IdxTuple _extra_pad_sizes; // extra spatial padding (outside of halos). // MPI settings. - IdxTuple _num_ranks; // number of ranks in each dim. - IdxTuple _rank_indices; // my rank index in each dim. + // These tuples contain only domain dims. + IdxTuple _num_ranks; // number of ranks in each domain dim. + IdxTuple _rank_indices; // my rank index in each domain dim. bool find_loc = true; // whether my rank index needs to be calculated. bool overlap_comms = true; // overlap comms with computation. - bool use_shm = true; // use shared memory if possible. - idx_t _min_exterior = 0; // minimum size of MPI exterior to calculate. + idx_t _min_exterior = 32; // minimum size of MPI exterior to calculate. + #ifdef USE_OFFLOAD + bool use_device_mpi = true; // transfer data directly between devices. + bool use_shm = false; // transfer data using shared memory (w/o MPI calls) on same node. + #else + bool use_device_mpi = false; + bool use_shm = true; + #endif // OpenMP settings. 
- int max_threads = 0; // Initial number of threads to use overall; 0=>OMP default. - int thread_divisor = 1; // Reduce number of threads by this amount. - int num_block_threads = 1; // Number of threads to use for a block. - bool bind_block_threads = false; // Bind block threads to indices. - - // Var behavior. + int max_threads = 0; // Initial number of host threads to use overall; 0=>OMP default. + int num_outer_threads = 0; // Number of threads to use for blocks. + int num_inner_threads = 1; // Number of threads to use within a block. + bool bind_inner_threads = false; // Bind inner threads to global indices. + #ifdef USE_OFFLOAD + int thread_limit = 32; // Offload threads per team. + #else + int thread_limit = 1; + #endif + + // Var behavior, including allocation. bool _step_wrap = false; // Allow invalid step indices to alias to valid ones (set via APIs only). bool _allow_addl_pad = true; // Allow extending padding beyond what's needed for alignment. + bool _bundle_allocs = !KernelEnv::_use_offload; // Group allocations together. + int _numa_pref = NUMA_PREF; // Stencil-dim posn in which to apply block-thread binding. // TODO: make this a cmd-line parameter. int _bind_posn = 1; // Tuning. - bool _do_auto_tune = false; // whether to do auto-tuning. - bool _tune_mini_blks = false; // auto-tune mini-blks instead of blks. + bool _do_auto_tune = false; // whether to do "online" auto-tuning. bool _allow_stage_tuners = false; // allow per-stage tuners when possible. - double _tuner_min_secs = 0.25; // min time to run tuner for new better setting. + double _tuner_trial_secs = 0.5; // time to run tuner for new better setting. + int _tuner_radius = 16; + string_vec _tuner_targets; // things to tune from following. // Debug. bool force_scalar = false; // Do only scalar ops. - - // NUMA settings. - int _numa_pref = NUMA_PREF; - int _numa_pref_max = 128; // GiB to alloc before using PMEM. + bool do_halo_exchange = true; // False => skip halo exchanges. 
+ bool force_scalar_exchange = false; // Don't allow vec exchanges. // Ctor/dtor. KernelSettings(DimsPtr dims, KernelEnvPtr env); @@ -281,26 +283,17 @@ namespace yask { IdxTuple& var, bool allow_step = false); - idx_t find_num_subsets(std::ostream& os, - IdxTuple& inner_sizes, const std::string& inner_name, - const IdxTuple& outer_sizes, const std::string& outer_name, - const IdxTuple& mults, const std::string& step_dim); - public: // Add options to a cmd-line parser to set the settings. virtual void add_options(CommandLineParser& parser); - // Print usage message. - void print_usage(std::ostream& os, - CommandLineParser& parser, - const std::string& pgm_name, - const std::string& app_notes, - const std::vector& app_examples); - + // Print informational messages. + void print_usage(std::ostream& os); + void print_values(std::ostream& os); + // Make sure all user-provided settings are valid by rounding-up - // values as needed. - // Called from prepare_solution(), so it doesn't normally need to be called from user code. - // Prints informational info to 'os'. + // values as needed. Called from prepare_solution(). + // Prints informational info to debug output in *ksb. virtual void adjust_settings(KernelStateBase* ksb = 0); // Determine if this is the first or last rank in given dim. @@ -349,7 +342,7 @@ namespace yask { // What get_neighbor_index() returns for myself. // Example: trunc(3^3 / 2) = 13 for 3D problem. - int my_neighbor_index; + idx_t my_neighbor_index; // MPI rank of each neighbor. // MPI_PROC_NULL => no neighbor. @@ -382,7 +375,7 @@ namespace yask { // Max neighbors. neighborhood_sizes = dims->_domain_dims; // copy dims from domain. neighborhood_sizes.set_vals_same(num_offsets); // set sizes in each domain dim. - neighborhood_size = neighborhood_sizes.product(); // neighbors in all dims. + neighborhood_size = neighborhood_sizes.product(); // num neighbors in all dims. // Myself. 
IdxTuple noffsets(neighborhood_sizes); @@ -403,8 +396,8 @@ namespace yask { // Input 'offsets': tuple of NeighborOffset vals. virtual idx_t get_neighbor_index(const IdxTuple& offsets) const { idx_t i = neighborhood_sizes.layout(offsets); // 1D index. - assert(i >= 0); - assert(i < neighborhood_size); + host_assert(i >= 0); + host_assert(i < neighborhood_size); return i; } @@ -475,6 +468,15 @@ namespace yask { if (_shm_lock) _shm_lock->mark_write_done(); } + idx_t get_data() const { + if (_shm_lock) + return _shm_lock->get_data(); + return 0; + } + void set_data(idx_t v) { + if (_shm_lock) + _shm_lock->set_data(v); + } // Number of points overall. idx_t get_size() const { @@ -482,7 +484,7 @@ namespace yask { return 0; return num_pts.product(); } - idx_t get_bytes() const { + size_t get_bytes() const { return get_size() * sizeof(real_t); } @@ -545,6 +547,8 @@ namespace yask { // These are used for async comms. std::vector recv_reqs; std::vector send_reqs; + std::vector recv_stats; + std::vector send_stats; MPIData(MPIInfoPtr mpi_info) : _mpi_info(mpi_info) { @@ -557,6 +561,10 @@ namespace yask { // Init handles. recv_reqs.resize(n, MPI_REQUEST_NULL); send_reqs.resize(n, MPI_REQUEST_NULL); + MPI_Status nullst; + memset(&nullst, 0, sizeof(nullst)); + recv_stats.resize(n, nullst); + send_stats.resize(n, nullst); } void reset_locks() { @@ -584,22 +592,13 @@ namespace yask { KernelEnvPtr _env; // User settings. - KernelSettingsPtr _opts; + KernelSettingsPtr _actl_opts; // Actual settings to use. + KernelSettingsPtr _req_opts; // Settings specified by user and/or tuner. bool _use_stage_tuners = false; // Problem dims. DimsPtr _dims; - // Position of inner domain dim in stencil-dims tuple. - // Misc dims will follow this if/when using interleaving. - // TODO: move to Dims. - int _inner_posn = -1; // -1 => not set. - - // Position of outer domain dim in stencil-dims tuple. - // For 1D stencils, _outer_posn == _inner_posn. - // TODO: move to Dims. 
- int _outer_posn = -1; // -1 => not set. - // MPI neighbor info. MPIInfoPtr _mpi_info; }; @@ -614,30 +613,31 @@ namespace yask { // functions with side-effects. #define STATE_VARS0(_ksbp, pfx) \ pfx auto* ksbp = _ksbp; \ - assert(ksbp); \ + host_assert(ksbp); \ pfx auto* state = ksbp->get_state().get(); \ - assert(state); \ + host_assert(state); \ pfx auto* env = state->_env.get(); \ - assert(env); \ - pfx auto* opts = state->_opts.get(); \ - assert(opts); \ + host_assert(env); \ + pfx auto* actl_opts = state->_actl_opts.get(); \ + host_assert(actl_opts); \ + pfx auto* req_opts = state->_req_opts.get(); \ + host_assert(req_opts); \ pfx auto* dims = state->_dims.get(); \ - assert(dims); \ - pfx auto* mpi_info = state->_mpi_info.get(); \ - assert(mpi_info); \ + host_assert(dims); \ + pfx auto* mpi_info = state->_mpi_info.get(); \ + host_assert(mpi_info); \ const auto& step_dim = dims->_step_dim; \ - const auto& inner_dim = dims->_inner_dim; \ + const auto& inner_layout_dim = dims->_inner_layout_dim; \ + const auto& inner_loop_dim = dims->_inner_loop_dim; \ const auto& domain_dims = dims->_domain_dims; \ constexpr int nddims = NUM_DOMAIN_DIMS; \ - assert(nddims == domain_dims.size()); \ + host_assert(nddims == domain_dims.size()); \ const auto& stencil_dims = dims->_stencil_dims; \ constexpr int nsdims = NUM_STENCIL_DIMS; \ - assert(nsdims == stencil_dims.size()); \ + host_assert(nsdims == stencil_dims.size()); \ auto& misc_dims = dims->_misc_dims; \ constexpr int step_posn = 0; \ - assert(step_posn == +Indices::step_posn); \ - constexpr int outer_posn = 1; \ - const int inner_posn = state->_inner_posn + host_assert(step_posn == +step_posn); #define STATE_VARS(_ksbp) STATE_VARS0(_ksbp,) #define STATE_VARS_CONST(_ksbp) STATE_VARS0(_ksbp, const) @@ -655,22 +655,25 @@ namespace yask { public: KernelStateBase(KernelStatePtr& state) : _state(state) {} - KernelStateBase(KernelEnvPtr& env, - KernelSettingsPtr& settings); + KernelStateBase(KernelEnvPtr& kenv, + 
KernelSettingsPtr& kactl_opts, + KernelSettingsPtr& kreq_opts); KernelStateBase(KernelStateBase* p) : _state(p->_state) { } // Access to state. ALWAYS_INLINE KernelStatePtr& get_state() { - assert(_state); + host_assert(_state); return _state; } ALWAYS_INLINE const KernelStatePtr& get_state() const { - assert(_state); + host_assert(_state); return _state; } - KernelSettingsPtr& get_settings() { return _state->_opts; } - const KernelSettingsPtr& get_settings() const { return _state->_opts; } + KernelSettingsPtr& get_actl_opts() { return _state->_actl_opts; } + const KernelSettingsPtr& get_actl_opts() const { return _state->_actl_opts; } + KernelSettingsPtr& get_req_opts() { return _state->_req_opts; } + const KernelSettingsPtr& get_req_opts() const { return _state->_req_opts; } KernelEnvPtr& get_env() { return _state->_env; } const KernelEnvPtr& get_env() const { return _state->_env; } DimsPtr& get_dims() { return _state->_dims; } @@ -684,6 +687,9 @@ namespace yask { void set_debug_output(yask_output_ptr debug) { _state->_env->set_debug_output(debug); } + void disable_debug_output() { + _state->_env->disable_debug_output(); + } // Set number of threads w/o using thread-divisor. // Return number of threads. @@ -691,21 +697,20 @@ namespace yask { int set_max_threads(); // Get total number of computation threads to use. - int get_num_comp_threads(int& region_threads, int& blk_threads) const; + int get_num_comp_threads(int& outer_threads, int& inner_threads) const; - // Set number of threads to use for a region. + // Set number of threads to use for a mega-block. // Enable nested OMP if there are >1 block threads, // disable otherwise. // Return number of threads. // Do nothing and return 0 if not properly initialized. - int set_region_threads(); + int set_num_outer_threads(); // Set number of threads for a block. - // Must be called from within a top-level OMP parallel region. + // Must be called from within a top-level OMP parallel region.
// Return number of threads. // Do nothing and return 0 if not properly initialized. - int set_block_threads(); - + int set_num_inner_threads(); }; // An object that is created from a context, shares ownership of the diff --git a/src/kernel/lib/setup.cpp b/src/kernel/lib/setup.cpp index f7638575..5ed9e1c2 100644 --- a/src/kernel/lib/setup.cpp +++ b/src/kernel/lib/setup.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -23,47 +23,122 @@ IN THE SOFTWARE. *****************************************************************************/ -// This file contains implementations of StencilContext and -// StencilBundleBase methods specific to the preparation steps. +// This file contains implementations of configuration-related methods +// from several classes. #include "yask_stencil.hpp" using namespace std; namespace yask { - // Stop collecting VTune data when a factory is defined. - // Even better to use -start-paused option. - yk_factory::yk_factory() { - VTUNE_PAUSE; - } - - // ScanIndices ctor. - ScanIndices::ScanIndices(const Dims& dims, bool use_vec_align) : - ndims(NUM_STENCIL_DIMS), - begin(idx_t(0), ndims), - end(idx_t(0), ndims), - stride(idx_t(1), ndims), - align(idx_t(1), ndims), - align_ofs(idx_t(0), ndims), - group_size(idx_t(1), ndims), - num_indices(idx_t(1), ndims), - start(idx_t(0), ndims), - stop(idx_t(0), ndims), - index(idx_t(0), ndims) { - - // i: index for stencil dims, j: index for domain dims. - DOMAIN_VAR_LOOP(i, j) { + // Init MPI, OMP. + // This function can have some calls to TRACE_MSG(), but it shouldn't + // call DEBUG_MSG(), because the user might not want debug messages + // before setting up the env. 
+ void KernelEnv::init_env(int* argc, char*** argv, MPI_Comm existing_comm) + { + TRACE_MSG("Initializing YASK environment..."); + YaskTimer init_timer; + init_timer.start(); + + // MPI init. + my_rank = 0; + num_ranks = 1; + + #ifdef USE_MPI + TRACE_MSG("Initializing MPI..."); + int is_init = false; + MPI_Initialized(&is_init); + + // No MPI communicator provided. + if (existing_comm == MPI_COMM_NULL || + existing_comm == MPI_COMM_WORLD) { + if (!is_init) { + + int provided = 0; + MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided); + if (provided < MPI_THREAD_SERIALIZED) { + THROW_YASK_EXCEPTION("error: MPI_THREAD_SERIALIZED or MPI_THREAD_MULTIPLE not provided"); + } + is_init = true; + } + comm = MPI_COMM_WORLD; + } - // Set alignment to vector lengths. - if (use_vec_align) - align[i] = fold_pts[j]; + // MPI communicator provided. + else { + if (!is_init) + THROW_YASK_EXCEPTION("error: YASK environment created with" + " an existing MPI communicator, but MPI is not initialized"); + comm = existing_comm; } + + // Get some info on this communicator. + MPI_Comm_rank(comm, &my_rank); + MPI_Comm_group(comm, &group); + MPI_Comm_size(comm, &num_ranks); + if (num_ranks < 1) + THROW_YASK_EXCEPTION("error: MPI_Comm_size() returns less than one rank"); + + // Create a shm communicator. + MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &shm_comm); + MPI_Comm_rank(shm_comm, &my_shm_rank); + MPI_Comm_group(shm_comm, &shm_group); + MPI_Comm_size(shm_comm, &num_shm_ranks); + + #else + comm = MPI_COMM_NULL; + #endif + + // Turn off denormals unless the USE_DENORMALS macro is set. + #ifndef USE_DENORMALS + // Enable FTZ + _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); + + //Enable DAZ + _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); + #endif + + #ifdef _OPENMP + TRACE_MSG("Initializing OpenMP..."); + + // Set env vars needed by OMP. + // TODO: make this visible to the user. 
+ int ret = setenv("OMP_PLACES", "cores", 0); // default placement for outer loop. + assert(ret == 0); + ret = setenv("KMP_HOT_TEAMS_MODE", "1", 0); // more efficient nesting. + assert(ret == 0); + ret = setenv("KMP_HOT_TEAMS_MAX_LEVEL", "3", 0); // nesting. + + // Check initial value of OMP max threads. + // Side effect: causes OMP to dump debug info if env var set. + int mt = omp_get_max_threads(); + if (!max_threads) + max_threads = mt; + + #ifdef USE_OFFLOAD + _omp_hostn = omp_get_initial_device(); + _omp_devn = omp_get_default_device(); + #endif + + #else + max_threads = 1; + #endif + + init_timer.stop(); + TRACE_MSG("Environment initialization done in " << + make_num_str(init_timer.get_elapsed_secs()) << " secs."); + } + + // Bootstrap factory ctor. + yk_factory::yk_factory() { } // Context ctor. StencilContext::StencilContext(KernelEnvPtr& kenv, - KernelSettingsPtr& ksettings) : - KernelStateBase(kenv, ksettings), + KernelSettingsPtr& ksettings, + KernelSettingsPtr& user_settings) : + KernelStateBase(kenv, ksettings, user_settings), _at(this, ksettings.get()) { STATE_VARS(this); @@ -93,17 +168,17 @@ namespace yask { // All ranks should have the same settings for certain options. 
assert_equality_over_ranks(nr, env->comm, "total number of MPI ranks"); - assert_equality_over_ranks(idx_t(opts->use_shm), env->comm, "use_shm setting"); - assert_equality_over_ranks(idx_t(opts->find_loc), env->comm, "defined rank indices"); + assert_equality_over_ranks(idx_t(actl_opts->use_shm), env->comm, "use_shm setting"); + assert_equality_over_ranks(idx_t(actl_opts->find_loc), env->comm, "defined rank indices"); DOMAIN_VAR_LOOP(i, j) { auto& dname = domain_dims.get_dim_name(j); - assert_equality_over_ranks(opts->_global_sizes[i], env->comm, - "global-domain size in '" + dname + "' dimension"); - assert_equality_over_ranks(opts->_num_ranks[j], env->comm, - "number of ranks in '" + dname + "' dimension"); + assert_equality_over_ranks(actl_opts->_global_sizes[i], env->comm, + "global-domain size in '" + dname + "' dimension"); + assert_equality_over_ranks(actl_opts->_num_ranks[j], env->comm, + "number of ranks in '" + dname + "' dimension"); // Check that either local or global size is set. - if (!opts->_global_sizes[i] && !opts->_rank_sizes[i]) + if (!actl_opts->_global_sizes[i] && !actl_opts->_rank_sizes[i]) THROW_YASK_EXCEPTION("Error: both local-domain size and " "global-domain size are zero in '" + dname + "' dimension on rank " + @@ -111,11 +186,11 @@ namespace yask { "and the other will be calculated"); } -#ifndef USE_MPI + #ifndef USE_MPI // Simple settings. - opts->_num_ranks.set_vals_same(1); - opts->_rank_indices.set_vals_same(0); + actl_opts->_num_ranks.set_vals_same(1); + actl_opts->_rank_indices.set_vals_same(0); rank_domain_offsets.set_vals_same(0); assert(nr == 1); @@ -123,51 +198,51 @@ namespace yask { DOMAIN_VAR_LOOP(i, j) { // Need to set local size. - if (!opts->_rank_sizes[i]) - opts->_rank_sizes[i] = opts->_global_sizes[i]; + if (!actl_opts->_rank_sizes[i]) + actl_opts->_rank_sizes[i] = actl_opts->_global_sizes[i]; // Need to set global size. 
- else if (!opts->_global_sizes[i]) - opts->_global_sizes[i] = opts->_rank_sizes[i]; + else if (!actl_opts->_global_sizes[i]) + actl_opts->_global_sizes[i] = actl_opts->_rank_sizes[i]; // Check that settings are equal. - else if (opts->_global_sizes[i] != opts->_rank_sizes[i]) { + else if (actl_opts->_global_sizes[i] != actl_opts->_rank_sizes[i]) { auto& dname = domain_dims.get_dim_name(j); FORMAT_AND_THROW_YASK_EXCEPTION("Error: specified local-domain size of " << - opts->_rank_sizes[i] << + actl_opts->_rank_sizes[i] << " does not equal specified global-domain size of " << - opts->_global_sizes[i] << " in '" << dname << + actl_opts->_global_sizes[i] << " in '" << dname << "' dimension"); } } -#else + #else // Set number of ranks in each dim if any is unset (zero). - TRACE_MSG("rank layout " << opts->_num_ranks.make_dim_val_str(" * ") << " requested"); - opts->_num_ranks = opts->_num_ranks.get_compact_factors(nr); - TRACE_MSG("rank layout " << opts->_num_ranks.make_dim_val_str(" * ") << " selected"); + TRACE_MSG("rank layout " << actl_opts->_num_ranks.make_dim_val_str(" * ") << " requested"); + actl_opts->_num_ranks = actl_opts->_num_ranks.get_compact_factors(nr); + TRACE_MSG("rank layout " << actl_opts->_num_ranks.make_dim_val_str(" * ") << " selected"); // Check ranks. - idx_t req_ranks = opts->_num_ranks.product(); + idx_t req_ranks = actl_opts->_num_ranks.product(); if (req_ranks != nr) FORMAT_AND_THROW_YASK_EXCEPTION("error: " << req_ranks << " rank(s) requested (" + - opts->_num_ranks.make_dim_val_str(" * ") + "), but " << + actl_opts->_num_ranks.make_dim_val_str(" * ") + "), but " << nr << " rank(s) are active"); // Determine my coordinates if not provided already. // TODO: do this more intelligently based on proximity. - if (opts->find_loc) - opts->_rank_indices = opts->_num_ranks.unlayout(me); + if (actl_opts->find_loc) + actl_opts->_rank_indices = actl_opts->_num_ranks.unlayout(me); // Check rank indices. 
DOMAIN_VAR_LOOP(i, j) { auto& dname = domain_dims.get_dim_name(j); - if (opts->_rank_indices[j] < 0 || - opts->_rank_indices[j] >= opts->_num_ranks[j]) + if (actl_opts->_rank_indices[j] < 0 || + actl_opts->_rank_indices[j] >= actl_opts->_num_ranks[j]) THROW_YASK_EXCEPTION("Error: rank index of " + - to_string(opts->_rank_indices[j]) + + to_string(actl_opts->_rank_indices[j]) + " is not within allowed range [0 ... " + - to_string(opts->_num_ranks[j] - 1) + + to_string(actl_opts->_num_ranks[j] - 1) + "] in '" + dname + "' dimension on rank " + to_string(me)); } @@ -190,8 +265,8 @@ namespace yask { // Init tables for this rank. DOMAIN_VAR_LOOP(i, j) { - coords[me][j] = opts->_rank_indices[j]; - rsizes[me][j] = opts->_rank_sizes[i]; + coords[me][j] = actl_opts->_rank_indices[j]; + rsizes[me][j] = actl_opts->_rank_sizes[i]; } // Exchange coord and size info between all ranks. @@ -330,7 +405,7 @@ namespace yask { // Determine whether neighbor is in my shm group. // If so, record rank number in shmcomm. - if (opts->use_shm && env->shm_comm != MPI_COMM_NULL) { + if (actl_opts->use_shm && env->shm_comm != MPI_COMM_NULL) { int g_rank = rn; int s_rank = MPI_PROC_NULL; MPI_Group_translate_ranks(env->group, 1, &g_rank, @@ -349,8 +424,8 @@ namespace yask { bool vlen_mults = true; DOMAIN_VAR_LOOP(i, j) { auto& dname = domain_dims.get_dim_name(j); - auto nranks = opts->_num_ranks[j]; - bool is_last = (opts->_rank_indices[j] == nranks - 1); + auto nranks = actl_opts->_num_ranks[j]; + bool is_last = (actl_opts->_rank_indices[j] == nranks - 1); // Does rn have all VLEN-multiple sizes? // TODO: allow last rank in each dim to be non-conformant. 
@@ -375,12 +450,12 @@ namespace yask { if (pass == 0) { DOMAIN_VAR_LOOP(i, j) { auto& dname = domain_dims.get_dim_name(j); - auto nranks = opts->_num_ranks[j]; - auto gsz = opts->_global_sizes[i]; - bool is_last = (opts->_rank_indices[j] == nranks - 1); + auto nranks = actl_opts->_num_ranks[j]; + auto gsz = actl_opts->_global_sizes[i]; + bool is_last = (actl_opts->_rank_indices[j] == nranks - 1); // Need to determine my rank size. - if (!opts->_rank_sizes[i]) { + if (!actl_opts->_rank_sizes[i]) { if (rank_domain_sums[j] != 0) FORMAT_AND_THROW_YASK_EXCEPTION ("Error: local-domain size is not specified in the '" << @@ -405,14 +480,14 @@ namespace yask { rsz = rem; // Set rank size depending on whether it is last one. - opts->_rank_sizes[i] = rsz; + actl_opts->_rank_sizes[i] = rsz; TRACE_MSG("local-domain-size[" << dname << "] = " << rem); } // Need to determine global size. // Set it to sum of rank sizes. - else if (!opts->_global_sizes[i]) - opts->_global_sizes[i] = rank_domain_sums[j]; + else if (!actl_opts->_global_sizes[i]) + actl_opts->_global_sizes[i] = rank_domain_sums[j]; } } @@ -420,22 +495,160 @@ namespace yask { else { DOMAIN_VAR_LOOP(i, j) { auto& dname = domain_dims.get_dim_name(j); - if (opts->_global_sizes[i] != rank_domain_sums[j]) { + if (actl_opts->_global_sizes[i] != rank_domain_sums[j]) { FORMAT_AND_THROW_YASK_EXCEPTION("Error: sum of local-domain sizes across " << nr << " ranks is " << rank_domain_sums[j] << ", which does not equal global-domain size of " << - opts->_global_sizes[i] << " in '" << dname << + actl_opts->_global_sizes[i] << " in '" << dname << "' dimension"); } } } } // passes. -#endif + #endif } // setup_rank(). 
+ void StencilContext::print_warnings() const { + STATE_VARS(this); +#ifdef CHECK + DEBUG_MSG("*** WARNING: YASK compiled with CHECK; ignore performance."); +#endif +#if defined(NO_INTRINSICS) && (VLEN > 1) + DEBUG_MSG("*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance."); +#endif +#ifdef MODEL_CACHE + DEBUG_MSG("*** WARNING: YASK compiled with MODEL_CACHE; ignore performance."); +#endif +#ifdef TRACE_MEM + DEBUG_MSG("*** WARNING: YASK compiled with TRACE_MEM; ignore performance."); +#endif +#ifdef TRACE_INTRINSICS + DEBUG_MSG("*** WARNING: YASK compiled with TRACE_INTRINSICS; ignore performance."); +#endif + TRACE_MSG("*** WARNING: YASK run with -trace; ignore performance"); + if (!actl_opts->do_halo_exchange) + DEBUG_MSG("*** WARNING: YASK run without halo exchanges; ignore performance; invalid results."); + } + + void StencilContext::print_temporal_tiling_info(string prefix) const { + STATE_VARS(this); + + DEBUG_MSG(prefix << "num-wave-front-steps: " << wf_steps << endl << + prefix << "num-temporal-block-steps: " << tb_steps); + + // Print detailed info only if temporal tiling enabled. 
+ if (wf_steps > 0 || tb_steps > 0) { + DEBUG_MSG(prefix << "wave-front-angles: " << wf_angles.make_dim_val_str() << endl << + prefix << "num-wave-front-shifts: " << num_wf_shifts << endl << + prefix << "wave-front-shift-amounts: " << wf_shift_pts.make_dim_val_str() << endl << + prefix << "left-wave-front-exts: " << left_wf_exts.make_dim_val_str() << endl << + prefix << "right-wave-front-exts: " << right_wf_exts.make_dim_val_str() << endl << + prefix << "ext-local-domain: " << ext_bb.make_range_string(domain_dims) << endl << + prefix << "temporal-block-angles: " << tb_angles.make_dim_val_str() << endl << + prefix << "num-temporal-block-shifts: " << num_tb_shifts << endl << + prefix << "temporal-block-long-base: " << tb_widths.make_dim_val_str(" * ") << endl << + prefix << "temporal-block-short-base: " << tb_tops.make_dim_val_str(" * ") << endl << + prefix << "micro-block-angles: " << mb_angles.make_dim_val_str()); + } + } + + void StencilContext::print_sizes(string prefix) const { + STATE_VARS(this); +#ifdef USE_TILING + DEBUG_MSG(prefix << "local-domain-tile-size: " << + actl_opts->_rank_tile_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); +#endif + DEBUG_MSG(prefix << "mega-block-size: " << + actl_opts->_mega_block_sizes.make_dim_val_str(" * ")); +#ifdef USE_TILING + DEBUG_MSG(prefix << "mega-block-tile-size: " << + actl_opts->_mega_block_tile_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); +#endif + DEBUG_MSG(prefix << "block-size: " << + actl_opts->_block_sizes.make_dim_val_str(" * ")); +#ifdef USE_TILING + DEBUG_MSG(prefix << "block-tile-size: " << + actl_opts->_block_tile_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); +#endif + DEBUG_MSG(prefix << "micro-block-size: " << + actl_opts->_micro_block_sizes.make_dim_val_str(" * ")); +#ifdef USE_TILING + DEBUG_MSG(prefix << "micro-block-tile-size: " << + actl_opts->_micro_block_tile_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); +#endif + DEBUG_MSG(prefix << "nano-block-size: " << + 
actl_opts->_nano_block_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); +#ifdef USE_TILING + DEBUG_MSG(prefix << "nano-block-tile-size: " << + actl_opts->_nano_block_tile_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); +#endif + DEBUG_MSG(prefix << "pico-block-size: " << + actl_opts->_pico_block_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); + } + + void StencilContext::init_stats() { + STATE_VARS(this); + + // Calc and report total allocation and domain sizes. + rank_nbytes = get_num_bytes(); + tot_nbytes = sum_over_ranks(rank_nbytes, env->comm); + rank_domain_pts = rank_bb.bb_num_points; + tot_domain_pts = sum_over_ranks(rank_domain_pts, env->comm); + DEBUG_MSG("\nDomain size in this rank (points): " << make_num_str(rank_domain_pts) << + "\nTotal allocation in this rank: " << make_byte_str(rank_nbytes) << + "\nOverall problem size in " << env->num_ranks << " rank(s) (points): " << + make_num_str(tot_domain_pts) << + "\nTotal overall allocation in " << env->num_ranks << " rank(s): " << + make_byte_str(tot_nbytes)); + + // Report some sizes and settings. 
+ DEBUG_MSG("\nWork-unit sizes in grid points (from largest to smallest):"); + DEBUG_MSG(" global-domain-size: " << + actl_opts->_global_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); + DEBUG_MSG(" local-domain-size: " << + actl_opts->_rank_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); + print_sizes(" "); + DEBUG_MSG(" cluster-size: " << dims->_cluster_pts.make_dim_val_str(" * ")); + DEBUG_MSG(" vector-size: " << dims->_fold_pts.make_dim_val_str(" * ")); + DEBUG_MSG("\nOther settings:\n" + " yask-version: " << yask_get_version_string() << endl << + " target: " << get_target() << endl << + " stencil-name: " << get_name() << endl << + " stencil-description: " << get_description() << endl << + " element-size: " << make_byte_str(get_element_bytes()) << endl << + " inner-layout dim: " << dims->_inner_layout_dim << endl << + " inner-loop dim: " << dims->_inner_loop_dim); +#ifdef USE_MPI + DEBUG_MSG(" num-ranks: " << actl_opts->_num_ranks.make_dim_val_str(" * ") << endl << + " rank-indices: " << actl_opts->_rank_indices.make_dim_val_str() << endl << + " local-domain-offsets: " << rank_domain_offsets.make_dim_val_str(dims->_domain_dims)); + if (actl_opts->overlap_comms) + DEBUG_MSG(" mpi-interior: " << mpi_interior.make_range_string(domain_dims)); +#endif + DEBUG_MSG(" vector-len: " << VLEN << endl << + " extra-padding: " << actl_opts->_extra_pad_sizes.remove_dim(step_posn).make_dim_val_str() << endl << + " min-padding: " << actl_opts->_min_pad_sizes.remove_dim(step_posn).make_dim_val_str() << endl << + " allow-addl-padding: " << actl_opts->_allow_addl_pad << endl << + " L1-prefetch-distance: " << PFD_L1 << endl << + " L2-prefetch-distance: " << PFD_L2 << endl << + " max-halos: " << max_halos.make_dim_val_str()); + print_temporal_tiling_info(" "); + + // Info about eqs, stages and bundles. 
+ DEBUG_MSG("\n" + "Num stages: " << st_stages.size() << endl << + "Num stencil bundles: " << st_bundles.size() << endl << + "Num stencil equations: " << NUM_STENCIL_EQS); + + // Info on work in stages. + DEBUG_MSG("\nBreakdown of work stats in this rank:"); + for (auto& sp : st_stages) + sp->init_work_stats(); + } + // Set non-scratch var sizes and offsets based on settings. // Set wave-front settings. // This should be called anytime a setting or rank offset is changed. @@ -456,7 +669,7 @@ namespace yask { auto& dname = dim._get_name(); // Each non-scratch var. - for (auto gp : var_ptrs) { + for (auto gp : all_var_ptrs) { assert(gp); if (!gp->is_dim_used(dname)) continue; @@ -468,12 +681,12 @@ namespace yask { (!gb.is_user_var() && force)) { // Rank domains. - gp->_set_domain_size(dname, opts->_rank_sizes[dname]); + gp->_set_domain_size(dname, actl_opts->_rank_sizes[dname]); // Pads. // Set via both 'extra' and 'min'; larger result will be used. - gp->set_extra_pad_size(dname, opts->_extra_pad_sizes[dname]); - gp->set_min_pad_size(dname, opts->_min_pad_sizes[dname]); + gp->update_extra_pad_size(dname, actl_opts->_extra_pad_sizes[dname]); + gp->update_min_pad_size(dname, actl_opts->_min_pad_sizes[dname]); // Offsets. auto dp = dims->_domain_dims.lookup_posn(dname); @@ -492,9 +705,9 @@ namespace yask { // Calculate wave-front shifts. // See the wavefront diagram in run_solution() for description // of angles and extensions. - idx_t tb_steps = opts->_block_sizes[step_dim]; // use requested size; actual may be less. + idx_t tb_steps = actl_opts->_block_sizes[step_dim]; // use requested size; actual may be less. assert(tb_steps >= 0); - wf_steps = opts->_region_sizes[step_dim]; + wf_steps = actl_opts->_mega_block_sizes[step_dim]; wf_steps = max(wf_steps, tb_steps); // round up WF steps if less than TB steps. assert(wf_steps >= 0); num_wf_shifts = 0; @@ -511,22 +724,23 @@ namespace yask { assert(num_wf_shifts >= 0); // Determine whether separate tuners can be used. 
- state->_use_stage_tuners = opts->_allow_stage_tuners && (tb_steps == 0) && (st_stages.size() > 1); + // Only allowed when no TB and >1 stage. + state->_use_stage_tuners = actl_opts->_allow_stage_tuners && (tb_steps == 0) && (st_stages.size() > 1); // Calculate angles and related settings. for (auto& dim : domain_dims) { auto& dname = dim._get_name(); - auto rnsize = opts->_region_sizes[dname]; - auto rksize = opts->_rank_sizes[dname]; - auto nranks = opts->_num_ranks[dname]; + auto rnsize = actl_opts->_mega_block_sizes[dname]; + auto rksize = actl_opts->_rank_sizes[dname]; + auto nranks = actl_opts->_num_ranks[dname]; // Req'd shift in this dim based on max halos. idx_t angle = ROUND_UP(max_halos[dname], dims->_fold_pts[dname]); // Determine the spatial skewing angles for WF tiling. We - // only need non-zero angles if the region size is less than the + // only need non-zero angles if the mega-block size is less than the // rank size or there are other ranks in this dim, i.e., if - // the region covers the *global* domain in a given dim, no + // the mega-block covers the *global* domain in a given dim, no // wave-front shifting is needed in that dim. idx_t wf_angle = 0; if (rnsize < rksize || nranks > 1) @@ -542,7 +756,7 @@ namespace yask { // Is domain size at least as large as halo + wf_ext in direction // when there are multiple ranks? auto min_size = max_halos[dname] + shifts; - if (opts->_num_ranks[dname] > 1 && rksize < min_size) { + if (actl_opts->_num_ranks[dname] > 1 && rksize < min_size) { FORMAT_AND_THROW_YASK_EXCEPTION ("Error: local-domain size of " << rksize << " in '" << dname << "' dim is less than minimum size of " << min_size << @@ -551,11 +765,11 @@ namespace yask { // If there is another rank to the left, set wave-front // extension on the left. - left_wf_exts[dname] = opts->is_first_rank(dname) ? 0 : shifts; + left_wf_exts[dname] = actl_opts->is_first_rank(dname) ? 
0 : shifts; // If there is another rank to the right, set wave-front // extension on the right. - right_wf_exts[dname] = opts->is_last_rank(dname) ? 0 : shifts; + right_wf_exts[dname] = actl_opts->is_last_rank(dname) ? 0 : shifts; } // Now that wave-front settings are known, we can push this info @@ -592,7 +806,7 @@ namespace yask { TRACE_MSG("update_tb_info()..."); // Get requested size. - tb_steps = opts->_block_sizes[step_dim]; + tb_steps = actl_opts->_block_sizes[step_dim]; // Reset all TB and MB vars. num_tb_shifts = 0; @@ -617,13 +831,13 @@ namespace yask { DOMAIN_VAR_LOOP(i, j) { auto& dim = domain_dims.get_dim(j); auto& dname = dim._get_name(); - auto rnsize = opts->_region_sizes[i]; + auto rnsize = actl_opts->_mega_block_sizes[i]; // There must be only one block size when using TB, so get // sizes from context settings instead of stages. assert(state->_use_stage_tuners == false); - auto blksize = opts->_block_sizes[i]; - auto mblksize = opts->_mini_block_sizes[i]; + auto blksize = actl_opts->_block_sizes[i]; + auto mblksize = actl_opts->_micro_block_sizes[i]; // Req'd shift in this dim based on max halos. // Can't use separate L & R shift because of possible data reuse in vars. @@ -640,7 +854,7 @@ namespace yask { mb_angles[j] = mb_angle; // Determine the max spatial skewing angles for TB. - // If blk covers whole region, no shifting is needed in that dim. + // If blk covers whole mega-block, no shifting is needed in that dim. idx_t tb_angle = 0; if (blksize < rnsize) tb_angle = angle; @@ -721,7 +935,7 @@ namespace yask { // TODO: use actual number of shifts dynamically instead of this // max. DOMAIN_VAR_LOOP(i, j) { - auto blk_sz = opts->_block_sizes[i]; + auto blk_sz = actl_opts->_block_sizes[i]; auto tb_angle = tb_angles[j]; tb_widths[j] = blk_sz; tb_tops[j] = blk_sz; @@ -747,17 +961,19 @@ namespace yask { } // update_tb_info(). // Init all vars & params by calling init_fn. 
- void StencilContext::init_values(function real_init_fn) { + void StencilContext::init_values(real_t seed0, + function real_init_fn) { STATE_VARS(this); - real_t seed = 0.1; - DEBUG_MSG("Initializing vars..."); + real_t seed = seed0; + DEBUG_MSG("Initializing stencil vars..."); YaskTimer itimer; itimer.start(); - for (auto gp : var_ptrs) { + for (auto gp : orig_var_ptrs) { real_init_fn(gp, seed); - seed += 0.01; + gp->gb().get_coh().mod_host(); + seed += seed0; } itimer.stop(); DEBUG_MSG("Var initialization done in " << @@ -775,7 +991,7 @@ namespace yask { // Rank BB is based only on rank offsets and rank domain sizes. rank_bb.bb_begin = rank_domain_offsets; - rank_bb.bb_end = rank_bb.bb_begin_tuple(domain_dims).add_elements(opts->_rank_sizes, false); + rank_bb.bb_end = rank_bb.bb_begin_tuple(domain_dims).add_elements(actl_opts->_rank_sizes, false); rank_bb.update_bb("rank", this, true, true); // BB may be extended for wave-fronts. @@ -831,7 +1047,7 @@ namespace yask { void StencilBundleBase::copy_bounding_box(const StencilBundleBase* src) { STATE_VARS(this); TRACE_MSG("copy_bounding_box for '" << get_name() << "' from '" << - src->get_name() << "'..."); + src->get_name() << "'..."); _bundle_bb = src->_bundle_bb; assert(_bundle_bb.bb_valid); @@ -870,21 +1086,20 @@ namespace yask { // Divide the overall BB into a slice for each thread // across the outer dim. - const int odim = 0; // Use 0 instead of outer_posn because BB lens are in domain dims. + const int odim = 0; // Split across first domain dim; TODO: pick smarter. idx_t outer_len = _bundle_bb.bb_len[odim]; idx_t nthreads = yask_get_num_threads(); idx_t len_per_thr = CEIL_DIV(outer_len, nthreads); TRACE_MSG("find_bounding_box: running " << nthreads << " thread(s) over " << - outer_len << " point(s) in outer dim"); + outer_len << " point(s) in outer dim"); // Struct w/padding to avoid false sharing. 
struct BBL_t { BBList bbl; - char pad[CACHELINE_BYTES]; + char pad[CACHELINE_BYTES - sizeof(BBList)]; }; // List of full BBs for each thread. - // TODO: remove false sharing. vector bb_lists(nthreads); // Run rect-finding code on each thread. @@ -893,139 +1108,140 @@ namespace yask { yask_parallel_for (0, nthreads, 1, [&](idx_t start, idx_t stop, idx_t thread_num) { - auto& cur_bb_list = bb_lists[start].bbl; - - // Begin and end of this slice. - // These Indices contain domain dims. - Indices islice_begin(_bundle_bb.bb_begin); - islice_begin[odim] += start * len_per_thr; - Indices islice_end(_bundle_bb.bb_end); - islice_end[odim] = min(islice_end[odim], islice_begin[odim] + len_per_thr); - if (islice_end[odim] <= islice_begin[odim]) - return; // from lambda. - - // Construct len of slice in all dims. - Indices islice_len = islice_end.sub_elements(islice_begin); - auto slice_len = islice_len.make_tuple(domain_dims); - - // Visit all points in slice, looking for a new - // valid beginning point, 'ib*pt'. - Indices ibspt(stencil_dims); // in stencil dims. - ibspt[step_posn] = 0; - Indices ibdpt(domain_dims); // in domain dims. - slice_len.visit_all_points - ([&](const IdxTuple& ofs, size_t idx) { - - // Find global point from 'ofs' in domain - // and stencil dims. - Indices iofs(ofs); - ibdpt = islice_begin.add_elements(iofs); // domain indices. - DOMAIN_VAR_LOOP(i, j) - ibspt[i] = ibdpt[j]; // stencil indices. - - // Valid point must be in sub-domain and - // not seen before in this slice. - bool is_valid = is_in_valid_domain(ibspt); - if (is_valid) { - for (auto& bb : cur_bb_list) { - if (bb.is_in_bb(ibdpt)) { - is_valid = false; - break; - } - } - } - - // Process this new rect starting at 'ib*pt'. - if (is_valid) { - - // Scan from 'ib*pt' to end of this slice - // looking for end of rect. - auto iscan_len = islice_end.sub_elements(ibdpt); - auto scan_len = iscan_len.make_tuple(domain_dims); - - // End point to be found, 'ie*pt'. 
- Indices iespt(stencil_dims); // stencil dims. - iespt[step_posn] = 0; - Indices iedpt(domain_dims); // domain dims. - - // Repeat scan until no adjustment is made. - bool do_scan = true; - while (do_scan) { - do_scan = false; - - TRACE_MSG("scanning " << scan_len.make_dim_val_str(" * ") << - " starting at " << ibdpt.make_dim_val_str(domain_dims)); - scan_len.visit_all_points - ([&](const IdxTuple& eofs, size_t eidx) { - - // Make sure scan_len range is observed. - DOMAIN_VAR_LOOP(i, j) - assert(eofs[j] < scan_len[j]); - - // Find global point from 'eofs'. - Indices ieofs(eofs); - iedpt = ibdpt.add_elements(ieofs); // domain tuple. - DOMAIN_VAR_LOOP(i, j) - iespt[i] = iedpt[j]; // stencil tuple. - - // Valid point must be in sub-domain and - // not seen before in this slice. - bool is_evalid = is_in_valid_domain(iespt); - if (is_evalid) { - for (auto& bb : cur_bb_list) { - if (bb.is_in_bb(iedpt)) { - is_evalid = false; - break; - } - } - } - - // If this is an invalid point, adjust - // scan range appropriately. - if (!is_evalid) { - - // Adjust 1st dim that is beyond its starting pt. - // This will reduce the range of the scan. - DOMAIN_VAR_LOOP(i, j) { - - // Beyond starting point in this dim? - if (iedpt[j] > ibdpt[j]) { - scan_len[j] = iedpt[j] - ibdpt[j]; - - // restart scan for - // remaining dims. - // TODO: be smarter - // about where to - // restart scan. - if (j < nddims - 1) - do_scan = true; - - return false; // stop this scan. - } - } - } - - return true; // keep looking for invalid point. - }); // Looking for invalid point. - } // while scan is adjusted. - TRACE_MSG("found BB " << scan_len.make_dim_val_str(" * ") << - " starting at " << ibdpt.make_dim_val_str(domain_dims)); - iscan_len.set_from_tuple(scan_len); - - // 'scan_len' now contains sizes of the new BB. 
- BoundingBox new_bb; - new_bb.bb_begin = ibdpt; - new_bb.bb_end = ibdpt.add_elements(iscan_len); - new_bb.update_bb("sub-bb", _context, true); - cur_bb_list.push_back(new_bb); - - } // new rect found. - - return true; // from labmda; keep looking. - }); // Looking for new rects. - }); // threads/slices. + auto& cur_bb_list = bb_lists[start].bbl; + + // Begin and end of this slice. + // These Indices contain domain dims. + Indices islice_begin(_bundle_bb.bb_begin); + islice_begin[odim] += start * len_per_thr; + Indices islice_end(_bundle_bb.bb_end); + islice_end[odim] = min(islice_end[odim], islice_begin[odim] + len_per_thr); + if (islice_end[odim] <= islice_begin[odim]) + return; // from lambda. + + // Construct len of slice in all dims. + Indices islice_len = islice_end.sub_elements(islice_begin); + + // Visit all points in slice, looking for a new + // valid beginning point, 'ib*pt'. + Indices ibspt(stencil_dims); // in stencil dims. + ibspt[step_posn] = 0; + Indices ibdpt(domain_dims); // in domain dims. + islice_len.visit_all_points + (true, + [&](const Indices& iofs, size_t idx) { + + // Find global point from 'iofs' in domain + // and stencil dims. + ibdpt = islice_begin.add_elements(iofs); // domain indices. + DOMAIN_VAR_LOOP(i, j) + ibspt[i] = ibdpt[j]; // stencil indices. + + // Valid point must be in sub-domain and + // not seen before in this slice. + bool is_valid = is_in_valid_domain(ibspt); + if (is_valid) { + for (auto& bb : cur_bb_list) { + if (bb.is_in_bb(ibdpt)) { + is_valid = false; + break; + } + } + } + + // Process this new rect starting at 'ib*pt'. + if (is_valid) { + + // Scan from 'ib*pt' to end of this slice + // looking for end of rect. + auto iscan_len = islice_end.sub_elements(ibdpt); + + // End point to be found, 'ie*pt'. + Indices iespt(stencil_dims); // stencil dims. + iespt[step_posn] = 0; + Indices iedpt(domain_dims); // domain dims. + + // Repeat scan until no adjustment is made. 
+ bool do_scan = true; + while (do_scan) { + do_scan = false; + + TRACE_MSG("scanning " << + iscan_len.make_dim_val_str(domain_dims, " * ") << + " starting at " << + ibdpt.make_dim_val_str(domain_dims) << + " in thread " << thread_num); + iscan_len.visit_all_points + (true, + [&](const Indices& ieofs, size_t eidx) { + + // Make sure iscan_len range is observed. + DOMAIN_VAR_LOOP(i, j) + assert(ieofs[j] < iscan_len[j]); + + // Find global point from 'ieofs'. + iedpt = ibdpt.add_elements(ieofs); // domain tuple. + DOMAIN_VAR_LOOP(i, j) + iespt[i] = iedpt[j]; // stencil tuple. + + // Valid point must be in sub-domain and + // not seen before in this slice. + bool is_evalid = is_in_valid_domain(iespt); + if (is_evalid) { + for (auto& bb : cur_bb_list) { + if (bb.is_in_bb(iedpt)) { + is_evalid = false; + break; + } + } + } + + // If this is an invalid point, adjust + // scan range appropriately. + if (!is_evalid) { + + // Adjust 1st dim that is beyond its starting pt. + // This will reduce the range of the scan. + DOMAIN_VAR_LOOP(i, j) { + + // Beyond starting point in this dim? + if (iedpt[j] > ibdpt[j]) { + iscan_len[j] = iedpt[j] - ibdpt[j]; + + // restart scan for + // remaining dims. + // TODO: be smarter + // about where to + // restart scan. + if (j < nddims - 1) + do_scan = true; + + return false; // stop this scan. + } + } + } + + return true; // keep looking for invalid point. + }); // Looking for invalid point. + } // while scan is adjusted. + TRACE_MSG("found BB " << iscan_len.make_dim_val_str(domain_dims, " * ") << + " starting at " << ibdpt.make_dim_val_str(domain_dims) << + " in thread " << thread_num); + + // 'iscan_len' now contains sizes of the new BB. + BoundingBox new_bb; + new_bb.bb_begin = ibdpt; + new_bb.bb_end = ibdpt.add_elements(iscan_len); + new_bb.update_bb("sub-bb", _context, true); + cur_bb_list.push_back(new_bb); + + } // new rect found. + + return true; // from labmda; keep looking. + }); // Looking for new rects. + }); // threads/slices. 
TRACE_MSG("sub-bbs found in " << - bbtimer.get_secs_since_start() << " secs."); + bbtimer.get_secs_since_start() << " secs."); // At this point, we have a set of full BBs. // Reset overall BB. @@ -1036,8 +1252,8 @@ namespace yask { for (int n = 0; n < nthreads; n++) { auto& cur_bb_list = bb_lists[n].bbl; TRACE_MSG("processing " << cur_bb_list.size() << - " sub-BB(s) in bundle '" << get_name() << - "' from thread " << n); + " sub-BB(s) in bundle '" << get_name() << + "' from thread " << n); // BBs in slice 'n'. for (auto& bbn : cur_bb_list) { @@ -1100,7 +1316,7 @@ namespace yask { _bundle_bb.update_bb(get_name(), _context, false); bbtimer.stop(); TRACE_MSG("find-bounding-box: done in " << - bbtimer.get_elapsed_secs() << " secs."); + bbtimer.get_elapsed_secs() << " secs."); } // Compute convenience values for a bounding-box. @@ -1136,7 +1352,7 @@ namespace yask { if (print_info) DEBUG_MSG("Note: '" << name << "' domain" " has one or more starting edges not on vector boundaries;" - " masked calculations will be used in peel and remainder sub-blocks."); + " masked calculations will be used in peel and remainder nano-blocks."); bb_is_aligned = false; break; } @@ -1152,7 +1368,7 @@ namespace yask { if (print_info && bb_is_aligned) DEBUG_MSG("Note: '" << name << "' domain" " has one or more sizes that are not vector-cluster multiples;" - " masked calculations will be used in peel and remainder sub-blocks."); + " masked calculations will be used in peel and remainder nano-blocks."); bb_is_cluster_mult = false; break; } @@ -1162,4 +1378,28 @@ namespace yask { bb_valid = true; } + // Add a new non-scratch var to the containers. + void StencilContext::add_var(YkVarPtr gp, bool is_orig, bool is_output) { + STATE_VARS(this); + assert(gp); + auto& gname = gp->get_name(); + if (all_var_map.count(gname)) + THROW_YASK_EXCEPTION("Error: var '" + gname + "' already exists"); + + // Add to list and map. 
+ all_var_ptrs.push_back(gp); + all_var_map[gname] = gp; + + // Add to orig list and map if 'is_orig'. + if (is_orig) { + orig_var_ptrs.push_back(gp); + orig_var_map[gname] = gp; + } + + // Add to output list and map if 'is_output'. + if (is_output) { + output_var_ptrs.push_back(gp); + output_var_map[gname] = gp; + } + } } // namespace yask. diff --git a/src/kernel/lib/soln_apis.cpp b/src/kernel/lib/soln_apis.cpp index 409c52f0..a5a8aa13 100644 --- a/src/kernel/lib/soln_apis.cpp +++ b/src/kernel/lib/soln_apis.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -32,50 +32,82 @@ using namespace std; namespace yask { // APIs. - // See yask_kernel_api.hpp. - -#define GET_SOLN_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ - idx_t StencilContext::api_name(const string& dim) const { \ - STATE_VARS(this); \ - if (prep_req && !is_prepared()) \ - THROW_YASK_EXCEPTION("Error: '" #api_name \ - "()' called before calling 'prepare_solution()'"); \ - dims->check_dim_type(dim, #api_name, step_ok, domain_ok, misc_ok); \ - return expr; \ - } - GET_SOLN_API(get_num_ranks, opts->_num_ranks[dim], false, true, false, false) - GET_SOLN_API(get_overall_domain_size, opts->_global_sizes[dim], false, true, false, true) - GET_SOLN_API(get_rank_domain_size, opts->_rank_sizes[dim], false, true, false, false) - GET_SOLN_API(get_region_size, opts->_region_sizes[dim], true, true, false, false) - GET_SOLN_API(get_block_size, opts->_block_sizes[dim], true, true, false, false) - GET_SOLN_API(get_first_rank_domain_index, rank_bb.bb_begin_tuple(domain_dims)[dim], false, true, false, true) - GET_SOLN_API(get_last_rank_domain_index, rank_bb.bb_end_tuple(domain_dims)[dim] - 1, 
false, true, false, true) - GET_SOLN_API(get_min_pad_size, opts->_min_pad_sizes[dim], false, true, false, false) - GET_SOLN_API(get_rank_index, opts->_rank_indices[dim], false, true, false, true) -#undef GET_SOLN_API - - // The var sizes are updated any time these settings are changed. -#define SET_SOLN_API(api_name, expr, step_ok, domain_ok, misc_ok, reset_prep) \ - void StencilContext::api_name(const string& dim, idx_t n) { \ - STATE_VARS(this); \ - TRACE_MSG("solution '" << get_name() << "'." \ - #api_name "('" << dim << "', " << n << ")"); \ - dims->check_dim_type(dim, #api_name, step_ok, domain_ok, misc_ok); \ - expr; \ - update_var_info(false); \ - if (reset_prep) set_prepared(false); \ - } - SET_SOLN_API(set_rank_index, opts->_rank_indices[dim] = n; - opts->find_loc = false, false, true, false, true) - SET_SOLN_API(set_num_ranks, opts->_num_ranks[dim] = n, false, true, false, true) - SET_SOLN_API(set_overall_domain_size, opts->_global_sizes[dim] = n; - if (n) opts->_rank_sizes[dim] = 0, false, true, false, true) - SET_SOLN_API(set_rank_domain_size, opts->_rank_sizes[dim] = n; - if (n) opts->_global_sizes[dim] = 0, false, true, false, true) - SET_SOLN_API(set_region_size, opts->_region_sizes[dim] = n, true, true, false, true) - SET_SOLN_API(set_block_size, opts->_block_sizes[dim] = n, true, true, false, true) - SET_SOLN_API(set_min_pad_size, opts->_min_pad_sizes[dim] = n, false, true, false, false) -#undef SET_SOLN_API + // See yask_kernel_api.hpp and context.hpp. 
+ + #define GET_SOLN_API(api_name, expr, start_i, step_ok, domain_ok, misc_ok) \ + idx_t StencilContext::get_ ## api_name(const string& dim) const { \ + STATE_VARS(this); \ + dims->check_dim_type(dim, "get_" #api_name, step_ok, domain_ok, misc_ok); \ + return expr[dim]; \ + } \ + idx_t_vec StencilContext::get_ ## api_name ## _vec() const { \ + STATE_VARS(this); \ + return expr.get_vals(start_i, NUM_DOMAIN_DIMS); \ + } + #define SET_SOLN_API(api_name, expr, start_i, step_ok, domain_ok, misc_ok, reset_prep) \ + void StencilContext::set_ ## api_name(const string& dim, idx_t n) { \ + STATE_VARS(this); \ + TRACE_MSG("solution '" << get_name() << "'.set_" \ + #api_name "('" << dim << "', " << n << ")"); \ + dims->check_dim_type(dim, "set_" #api_name, step_ok, domain_ok, misc_ok); \ + expr[dim] = n; \ + update_var_info(false); \ + if (reset_prep) set_prepared(false); \ + } \ + void StencilContext::set_ ## api_name ## _vec(const idx_t_vec& vals) { \ + STATE_VARS(this); \ + TRACE_MSG("solution '" << get_name() << "'.set_" \ + #api_name "_vec(...)"); \ + if (vals.size() != NUM_DOMAIN_DIMS) \ + THROW_YASK_EXCEPTION("Error: set_'" #api_name \ + "_vec()' called without the proper number of domain dims"); \ + expr.set_vals(start_i, vals); \ + update_var_info(false); \ + if (reset_prep) set_prepared(false); \ + } \ + void StencilContext::set_ ## api_name ## _vec(const idx_t_init_list& vals) { \ + STATE_VARS(this); \ + TRACE_MSG("solution '" << get_name() << "'.set_" \ + #api_name "_vec(...)"); \ + if (vals.size() != NUM_DOMAIN_DIMS) \ + THROW_YASK_EXCEPTION("Error: set_'" #api_name \ + "_vec()' called without the proper number of domain dims"); \ + expr.set_vals(start_i, vals); \ + update_var_info(false); \ + if (reset_prep) set_prepared(false); \ + } + #define SOLN_API(api_name, expr, start_i, step_ok, domain_ok, misc_ok, reset_prep) \ + GET_SOLN_API(api_name, expr, start_i, step_ok, domain_ok, misc_ok) \ + SET_SOLN_API(api_name, expr, start_i, step_ok, domain_ok, misc_ok, 
reset_prep) + + SOLN_API(num_ranks, + actl_opts->_num_ranks, 0, + false, true, false, true) + SOLN_API(rank_index, + actl_opts->_rank_indices, 0, + false, true, false, true) + SOLN_API(overall_domain_size, + actl_opts->_global_sizes, 1, + false, true, false, true) + SOLN_API(rank_domain_size, + actl_opts->_rank_sizes, 1, + false, true, false, true) + SOLN_API(block_size, + actl_opts->_block_sizes, 1, + true, true, false, true) + SOLN_API(min_pad_size, + actl_opts->_min_pad_sizes, 1, + false, true, false, false) + + GET_SOLN_API(first_rank_domain_index, + rank_bb.bb_begin_tuple(domain_dims), 0, + false, true, false) + GET_SOLN_API(last_rank_domain_index, + rank_bb.bb_last_tuple(domain_dims), 0, + false, true, false) + #undef SOLN_API + #undef SET_SOLN_API + #undef GET_SOLN_API // Callbacks. void StencilContext::call_hooks(hook_fn_vec& hook_fns) { @@ -116,33 +148,45 @@ namespace yask { reset_auto_tuner(true, false); // Report ranks. - DEBUG_MSG("\nNum MPI ranks: " << env->get_num_ranks() << - "\nThis MPI rank index: " << env->get_rank_index()); + DEBUG_MSG("\nNum MPI ranks: " << env->get_num_ranks() << + "\nThis MPI rank index: " << env->get_rank_index()); // report threads. 
{ - DEBUG_MSG("Num OpenMP procs: " << omp_get_num_procs()); + DEBUG_MSG("Num OpenMP procs: " << omp_get_num_procs()); int rt, bt; int at = get_num_comp_threads(rt, bt); - DEBUG_MSG("Num OpenMP threads avail: " << opts->max_threads << - "\nNum OpenMP threads used: " << at << - "\n Num threads per region: " << rt << - "\n Num threads per block: " << bt); + DEBUG_MSG("Num OpenMP threads avail: " << actl_opts->max_threads << + "\nNum OpenMP threads used: " << at << + "\n Num outer threads: " << rt << + "\n Num inner threads: " << bt); + #ifdef USE_OFFLOAD + DEBUG_MSG("Num OpenMP devices: " << omp_get_num_devices() << + "\nOpenMP host device: " << KernelEnv::_omp_hostn << + "\nOpenMP offload device: " << KernelEnv::_omp_devn << + "\nDevice thread limit: " << actl_opts->thread_limit); + #ifdef USE_OFFLOAD_USM + DEBUG_MSG("Using unified shared mem: true"); + #else + DEBUG_MSG("Using unified shared mem: false"); + #endif + #endif } // Set the number of threads for a region. The number of threads // used in top-level OpenMP parallel sections should not change // during execution. - int rthreads = set_region_threads(); + int rthreads = set_num_outer_threads(); // Run a dummy nested OMP loop to make sure nested threading is // initialized. - yask_parallel_for(0, rthreads * 100, 1, + yask_parallel_for(0, rthreads * 10, 1, [&](idx_t start, idx_t stop, idx_t thread_num) { }); // Some var stats. - DEBUG_MSG("\nNum vars: " << var_ptrs.size() << - "\nNum vars to be updated: " << output_var_ptrs.size()); + DEBUG_MSG("\nNum vars: " << all_var_ptrs.size() << + "\nNum vars to be updated: " << output_var_ptrs.size() << + "\nNum vars created via APIs: " << (all_var_ptrs.size() - orig_var_ptrs.size())); // Set up data based on MPI rank, including local or global sizes, // var positions. @@ -150,34 +194,37 @@ namespace yask { // Adjust all settings before setting MPI buffers or sizing vars. // Prints adjusted settings. - // TODO: print settings again after auto-tuning. 
- opts->adjust_settings(this); + actl_opts->adjust_settings(this); // Set offsets in vars and find WF extensions // based on the vars' halos. Force setting // the size of all solution vars. update_var_info(true); - // Determine bounding-boxes for all bundles. - // This must be done after finding WF extensions. + // Set core data needed in kernels. + set_core(); + + // Determine bounding-boxes for all bundles. This must be done + // after finding WF extensions. And, this must be done after + // set_core() because is_in_valid_domain() needs the core data. find_bounding_boxes(); // Copy current settings to stages. Needed here because settings may // have been changed via APIs or from call to setup_rank() since last - // call to prepare_solution(). This will wipe out any previous - // auto-tuning. + // call to prepare_solution(). FIXME: This will wipe out any previous + // stage-specific auto-tuning. for (auto& sp : st_stages) - sp->get_local_settings() = *opts; + sp->get_local_settings() = *actl_opts; - // Alloc vars, scratch vars, MPI bufs. + // Free the scratch and MPI data first to give vars preference. + // Alloc vars (if needed), scratch vars, MPI bufs. // This is the order in which preferred NUMA nodes (e.g., HBW mem) // will be used. - // We free the scratch and MPI data first to give vars preference. YaskTimer alloc_timer; alloc_timer.start(); free_scratch_data(); free_mpi_data(); - alloc_var_data(); + alloc_var_data(); // Does nothing if already done. alloc_scratch_data(); alloc_mpi_data(); alloc_timer.stop(); @@ -191,143 +238,34 @@ namespace yask { } // prepare_solution(). 
- void StencilContext::print_warnings() const { - STATE_VARS(this); -#ifdef CHECK - DEBUG_MSG("*** WARNING: YASK compiled with CHECK; ignore performance results."); -#endif -#if defined(NO_INTRINSICS) && (VLEN > 1) - DEBUG_MSG("*** WARNING: YASK compiled with NO_INTRINSICS; ignore performance results."); -#endif -#ifdef MODEL_CACHE - DEBUG_MSG("*** WARNING: YASK compiled with MODEL_CACHE; ignore performance results."); -#endif -#ifdef TRACE_MEM - DEBUG_MSG("*** WARNING: YASK compiled with TRACE_MEM; ignore performance results."); -#endif -#ifdef TRACE_INTRINSICS - DEBUG_MSG("*** WARNING: YASK compiled with TRACE_INTRINSICS; ignore performance results."); -#endif - TRACE_MSG("*** WARNING: YASK run with -trace; ignore performance results"); - } - - void StencilContext::print_temporal_tiling_info() const { - STATE_VARS(this); - - DEBUG_MSG(" num-wave-front-steps: " << wf_steps << - "\n num-temporal-block-steps: " << tb_steps); - - // Print detailed info only if temporal tiling enabled. - if (wf_steps > 0 || tb_steps > 0) { - DEBUG_MSG(" wave-front-angles: " << wf_angles.make_dim_val_str() << endl << - " num-wave-front-shifts: " << num_wf_shifts << endl << - " wave-front-shift-amounts: " << wf_shift_pts.make_dim_val_str() << endl << - " left-wave-front-exts: " << left_wf_exts.make_dim_val_str() << endl << - " right-wave-front-exts: " << right_wf_exts.make_dim_val_str() << endl << - " ext-local-domain: " << ext_bb.make_range_string(domain_dims) << endl << - " temporal-block-angles: " << tb_angles.make_dim_val_str() << endl << - " num-temporal-block-shifts: " << num_tb_shifts << endl << - " temporal-block-long-base: " << tb_widths.make_dim_val_str(" * ") << endl << - " temporal-block-short-base: " << tb_tops.make_dim_val_str(" * ") << endl << - " mini-block-angles: " << mb_angles.make_dim_val_str()); - } - } - - void StencilContext::init_stats() { - STATE_VARS(this); - - // Calc and report total allocation and domain sizes. 
- rank_nbytes = get_num_bytes(); - tot_nbytes = sum_over_ranks(rank_nbytes, env->comm); - rank_domain_pts = rank_bb.bb_num_points; - tot_domain_pts = sum_over_ranks(rank_domain_pts, env->comm); - DEBUG_MSG("\nDomain size in this rank (points): " << make_num_str(rank_domain_pts) << - "\nTotal allocation in this rank: " << make_byte_str(rank_nbytes) << - "\nOverall problem size in " << env->num_ranks << " rank(s) (points): " << - make_num_str(tot_domain_pts) << - "\nTotal overall allocation in " << env->num_ranks << " rank(s): " << - make_byte_str(tot_nbytes)); - - // Report some sizes and settings. - DEBUG_MSG("\nWork-unit sizes in points (from smallest to largest):\n" - " vector-size: " << dims->_fold_pts.make_dim_val_str(" * ") << endl << - " cluster-size: " << dims->_cluster_pts.make_dim_val_str(" * ") << endl << - " sub-block-size: " << opts->_sub_block_sizes.remove_dim(step_posn).make_dim_val_str(" * ") << endl << - " mini-block-size: " << opts->_mini_block_sizes.make_dim_val_str(" * ") << endl << - " block-size: " << opts->_block_sizes.make_dim_val_str(" * ") << endl << - " region-size: " << opts->_region_sizes.make_dim_val_str(" * ") << endl << - " local-domain-size: " << opts->_rank_sizes.remove_dim(step_posn).make_dim_val_str(" * ") << endl << - " global-domain-size: " << opts->_global_sizes.remove_dim(step_posn).make_dim_val_str(" * ")); -#ifdef SHOW_GROUPS - DEBUG_MSG(" sub-block-group-size: " << opts->_sub_block_group_sizes.make_dim_val_str(" * ") << endl << - " block-group-size: " << opts->_block_group_sizes.make_dim_val_str(" * ")); -#endif - DEBUG_MSG("\nOther settings:\n" - " yask-version: " << yask_get_version_string() << endl << - " target: " << get_target() << endl << - " stencil-name: " << get_name() << endl << - " stencil-description: " << get_description() << endl << - " element-size: " << make_byte_str(get_element_bytes()) << endl << - " local-domain: " << rank_bb.make_range_string(domain_dims)); -#ifdef USE_MPI - DEBUG_MSG(" num-ranks: " << 
opts->_num_ranks.make_dim_val_str(" * ") << endl << - " rank-indices: " << opts->_rank_indices.make_dim_val_str() << endl << - " local-domain-offsets: " << rank_domain_offsets.make_dim_val_str(dims->_domain_dims)); - if (opts->overlap_comms) - DEBUG_MSG(" mpi-interior: " << mpi_interior.make_range_string(domain_dims)); -#endif - DEBUG_MSG( " vector-len: " << VLEN << endl << - " extra-padding: " << opts->_extra_pad_sizes.remove_dim(step_posn).make_dim_val_str() << endl << - " minimum-padding: " << opts->_min_pad_sizes.remove_dim(step_posn).make_dim_val_str() << endl << - " allow-addl-padding: " << opts->_allow_addl_pad << endl << - " L1-prefetch-distance: " << PFD_L1 << endl << - " L2-prefetch-distance: " << PFD_L2 << endl << - " max-halos: " << max_halos.make_dim_val_str()); - print_temporal_tiling_info(); - - // Info about eqs, stages and bundles. - DEBUG_MSG("\n" - "Num stages: " << st_stages.size() << endl << - "Num stencil bundles: " << st_bundles.size() << endl << - "Num stencil equations: " << NUM_STENCIL_EQS); - - // Info on work in stages. - DEBUG_MSG("\nBreakdown of work stats in this rank:"); - for (auto& sp : st_stages) - sp->init_work_stats(); - } - // Dealloc vars, etc. void StencilContext::end_solution() { STATE_VARS(this); TRACE_MSG("end_solution()..."); - // Final halo exchange (usually not needed). - exchange_halos(); - // Release any MPI data. env->global_barrier(); mpi_data.clear(); // Release var data. - for (auto gp : var_ptrs) { + for (auto gp : all_var_ptrs) { if (!gp) continue; gp->release_storage(); } - // Reset threads to original value. - set_max_threads(); + // Reset threads to original value. 
+ set_max_threads(); } void StencilContext::fuse_vars(yk_solution_ptr source) { auto sp = dynamic_pointer_cast(source); assert(sp); - for (auto gp : var_ptrs) { + for (auto gp : all_var_ptrs) { auto gname = gp->get_name(); - auto si = sp->var_map.find(gname); - if (si != sp->var_map.end()) { + auto si = sp->all_var_map.find(gname); + if (si != sp->all_var_map.end()) { auto sgp = si->second; gp->fuse_vars(sgp); } @@ -340,7 +278,7 @@ namespace yask { } string StencilContext::apply_command_line_options(int argc, char* argv[]) { - std::vector args; + string_vec args; for (int i = 1; i < argc; i++) args.push_back(argv[i]); return apply_command_line_options(args); @@ -348,38 +286,20 @@ namespace yask { string StencilContext::apply_command_line_options(const vector& args) { STATE_VARS(this); + string rem; - // Create a parser and add base options to it. - CommandLineParser parser; - opts->add_options(parser); + // Apply settings to actual and requested options. + for (auto& cur_opts : { actl_opts, req_opts }) { + + // Create a parser and add base options to it. + CommandLineParser parser; + cur_opts->add_options(parser); - // Parse cmd-line options, which sets values in settings. - return parser.parse_args("YASK", args); - } - - // Add a new var to the containers. - void StencilContext::add_var(YkVarPtr gp, bool is_orig, bool is_output) { - STATE_VARS(this); - assert(gp); - auto& gname = gp->get_name(); - if (var_map.count(gname)) - THROW_YASK_EXCEPTION("Error: var '" + gname + "' already exists"); - - // Add to list and map. - var_ptrs.push_back(gp); - var_map[gname] = gp; - - // Add to orig list and map if 'is_orig'. - if (is_orig) { - orig_var_ptrs.push_back(gp); - orig_var_map[gname] = gp; + // Parse cmd-line options, which sets values in opts. + rem = parser.parse_args("YASK", args); } - // Add to output list and map if 'is_output'. 
- if (is_output) { - output_var_ptrs.push_back(gp); - output_var_map[gname] = gp; - } + return rem; } static string print_pct(double ntime, double dtime) { @@ -407,17 +327,24 @@ namespace yask { // Measured outside parallel region. double hetime = min(halo_time.get_elapsed_secs(), rtime); - // 'wait_time' is part of 'halo_time'. - double wtime = min(wait_time.get_elapsed_secs(), hetime); + // These are part of 'halo_time', so 'min' calls are used to ensure + // constituent times do not exceed overall halo time. + double hwtime = min(halo_wait_time.get_elapsed_secs(), hetime); + double hltime = min(halo_lock_wait_time.get_elapsed_secs(), hetime - hwtime); + double hptime = min(halo_pack_time.get_elapsed_secs(), hetime - hwtime - hltime); + double hutime = min(halo_unpack_time.get_elapsed_secs(), hetime - hwtime - hltime - hptime); + double hctime = min(halo_copy_time.get_elapsed_secs(), hetime - hwtime - hltime - hptime - hutime); // Exterior and interior parts. Measured outside parallel region. // Does not include 'halo_time'. double etime = min(ext_time.get_elapsed_secs(), rtime - hetime); double itime = int_time.get_elapsed_secs(); - // 'test_time' is part of 'int_time', but only on region thread 0. - // It's not part of 'halo_time'. - double ttime = test_time.get_elapsed_secs() / rthr; // ave. + // 'test_time' is part of measured 'int_time', but only on outer thread 0. + // It's not part of 'halo_time', since it's done outside of 'halo_exchange'. + // We calculate an average to distribute this time across threads, + // although it's just an estimate. + double ttime = halo_test_time.get_elapsed_secs() / rthr; // ave. // Remove average test time from interior time. itime -= ttime; @@ -426,7 +353,7 @@ namespace yask { // Compute time. double ctime = etime + itime; - // All halo time. + // All halo-related time. double htime = hetime + ttime; // Other. 
@@ -541,21 +468,33 @@ namespace yask { DEBUG_MSG(" other (sec): " << make_num_str(optime) << print_pct(optime, ctime)); } -#ifdef USE_MPI - double ohtime = max(htime - wtime - ttime, 0.); - DEBUG_MSG(" Compute-time breakdown by halo area:\n" - " rank-exterior compute (sec): " << make_num_str(etime) << - print_pct(etime, ctime) << endl << - " rank-interior compute (sec): " << make_num_str(itime) << - print_pct(itime, ctime) << endl << - " Halo-time breakdown:\n" - " MPI waits (sec): " << make_num_str(wtime) << - print_pct(wtime, htime) << endl << - " MPI tests (sec): " << make_num_str(ttime) << + + #ifdef USE_MPI + if (etime > 0.0) + DEBUG_MSG(" Compute-time breakdown by halo area:\n" + " rank-exterior compute (sec): " << make_num_str(etime) << + print_pct(etime, ctime) << endl << + " rank-interior compute (sec): " << make_num_str(itime) << + print_pct(itime, ctime)); + double hotime = max(htime - hltime - hwtime - ttime - hptime - hutime - hctime, 0.); + DEBUG_MSG(" Halo-time breakdown:\n" + " shm-lock waits (sec): " << make_num_str(hltime) << + print_pct(hltime, htime) << endl << + " MPI waits (sec): " << make_num_str(hwtime) << + print_pct(hwtime, htime) << endl << + " MPI tests (sec): " << make_num_str(ttime) << print_pct(ttime, htime) << endl << - " packing, unpacking, etc. (sec): " << make_num_str(ohtime) << - print_pct(ohtime, htime)); -#endif + " buffer packing (sec): " << make_num_str(hptime) << + print_pct(hptime, htime) << endl << + " buffer unpacking (sec): " << make_num_str(hutime) << + print_pct(hutime, htime) << endl << + #ifdef USE_OFFLOAD + " explicit copying to/from device (sec): " << make_num_str(hctime) << + print_pct(hctime, htime) << endl << + #endif + " other halo time (sec): " << make_num_str(hotime) << + print_pct(hotime, htime)); + #endif // Note that rates are reported with base-10 suffixes per common convention, not base-2. // See https://www.speedguide.net/articles/bits-bytes-and-bandwidth-reference-guide-115. 
@@ -590,8 +529,12 @@ namespace yask { ext_time.clear(); int_time.clear(); halo_time.clear(); - wait_time.clear(); - test_time.clear(); + halo_pack_time.clear(); + halo_unpack_time.clear(); + halo_copy_time.clear(); + halo_lock_wait_time.clear(); + halo_wait_time.clear(); + halo_test_time.clear(); steps_done = 0; for (auto& sp : st_stages) { sp->timer.clear(); diff --git a/src/kernel/lib/stencil_calc.cpp b/src/kernel/lib/stencil_calc.cpp index 72329680..64c724b5 100644 --- a/src/kernel/lib/stencil_calc.cpp +++ b/src/kernel/lib/stencil_calc.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -31,52 +31,51 @@ using namespace std; namespace yask { - // Calculate results within a mini-block defined by 'mini_block_idxs'. - // This is called by StencilContext::calc_mini_block() for each bundle. + // Calculate results within a micro-block defined by 'micro_block_idxs'. + // This is called by StencilContext::calc_micro_block() for each bundle. // It is here that any required scratch-var stencils are evaluated // first and then the non-scratch stencils in the stencil bundle. // It is also here that the boundaries of the bounding-box(es) of the bundle // are respected. There must not be any temporal blocking at this point. - void StencilBundleBase::calc_mini_block(int region_thread_idx, + void StencilBundleBase::calc_micro_block(int outer_thread_idx, KernelSettings& settings, - const ScanIndices& mini_block_idxs) { + const ScanIndices& micro_block_idxs) { STATE_VARS(this); - TRACE_MSG("calc_mini_block('" << get_name() << "'): [" << - mini_block_idxs.begin.make_val_str() << " ... 
" << - mini_block_idxs.end.make_val_str() << ") by " << - mini_block_idxs.stride.make_val_str() << - " by region thread " << region_thread_idx); + TRACE_MSG("calc_micro_block('" << get_name() << "'): [" << + micro_block_idxs.begin.make_val_str() << " ... " << + micro_block_idxs.end.make_val_str() << ") by " << + micro_block_idxs.stride.make_val_str() << + " by outer thread " << outer_thread_idx); assert(!is_scratch()); - // No TB allowed here. -#ifdef CHECK - idx_t begin_t = mini_block_idxs.begin[step_posn]; - idx_t end_t = mini_block_idxs.end[step_posn]; - assert(abs(end_t - begin_t) == 1); -#endif + // No temporal blocking allowed here. + assert(abs(micro_block_idxs.get_overall_range(step_posn)) == 1); + auto t = micro_block_idxs.begin[step_posn]; + assert(abs(micro_block_idxs.end[step_posn] - t) == 1); // Nothing to do if outer BB is empty. if (_bundle_bb.bb_num_points == 0) { - TRACE_MSG("calc_mini_block: empty BB"); + TRACE_MSG("calc_micro_block: empty BB"); return; } // TODO: if >1 BB, check limits of outer one first to save time. // Set number of threads in this block. - int nbt = _context->set_block_threads(); + // This will be the number of nano-blocks done in parallel. + int nbt = _context->set_num_inner_threads(); // Thread-binding info. // We only bind threads if there is more than one block thread // and binding is enabled. - bool bind_threads = nbt > 1 && settings.bind_block_threads; + bool bind_threads = nbt > 1 && settings.bind_inner_threads; int bind_posn = settings._bind_posn; - idx_t bind_slab_pts = settings._sub_block_sizes[bind_posn]; + idx_t bind_slab_pts = settings._nano_block_sizes[bind_posn]; // Other sizes not used. // Loop through each solid BB for this bundle. - // For each BB, calc intersection between it and 'mini_block_idxs'. - // If this is non-empty, apply the bundle to all its required sub-blocks. 
- TRACE_MSG("calc_mini_block('" << get_name() << "'): checking " << + // For each BB, calc intersection between it and 'micro_block_idxs'. + // If this is non-empty, apply the bundle to all its required nano-blocks. + TRACE_MSG("calc_micro_block('" << get_name() << "'): checking " << _bb_list.size() << " BB(s)"); int bbn = 0; for (auto& bb : _bb_list) { @@ -85,17 +84,17 @@ namespace yask { if (bb.bb_num_points == 0) bb_ok = false; - // Trim the mini-block indices based on the bounding box(es) + // Trim the micro-block indices based on the bounding box(es) // for this bundle. - ScanIndices mb_idxs(mini_block_idxs); - DOMAIN_VAR_LOOP(i, j) { + ScanIndices mb_idxs(micro_block_idxs); + DOMAIN_VAR_LOOP_FAST(i, j) { // Begin point. - auto bbegin = max(mini_block_idxs.begin[i], bb.bb_begin[j]); + auto bbegin = max(micro_block_idxs.begin[i], bb.bb_begin[j]); mb_idxs.begin[i] = bbegin; // End point. - auto bend = min(mini_block_idxs.end[i], bb.bb_end[j]); + auto bend = min(micro_block_idxs.end[i], bb.bb_end[j]); mb_idxs.end[i] = bend; // Anything to do? @@ -107,577 +106,185 @@ namespace yask { // nothing to do? if (!bb_ok) { - TRACE_MSG("calc_mini_block for bundle '" << get_name() << + TRACE_MSG("calc_micro_block for bundle '" << get_name() << "': no overlap between bundle " << bbn << " and current block"); continue; // to next BB. } - TRACE_MSG("calc_mini_block('" << get_name() << + TRACE_MSG("calc_micro_block('" << get_name() << "'): after trimming for BB " << bbn << ": [" << mb_idxs.begin.make_val_str() << " ... " << mb_idxs.end.make_val_str() << ")"); // Get the bundles that need to be processed in // this block. This will be any prerequisite scratch-var - // bundles plus this non-scratch bundle. + // bundles plus the current non-scratch bundle. auto sg_list = get_reqd_bundles(); // Loop through all the needed bundles. for (auto* sg : sg_list) { - // If binding threads to data, start threads within a block. 
- // Each of these threads will eventually work on a separate - // sub-block. This is nested within an OMP region thread. - // If there is only one block per thread, nested OMP is - // disabled, and this OMP pragma does nothing. - _Pragma("omp parallel proc_bind(spread)") { - int block_thread_idx = 0; - if (nbt > 1) { - assert(omp_get_level() == 2); - assert(omp_get_num_threads() == nbt); - block_thread_idx = omp_get_thread_num(); - } + // Indices needed for the generated loops. Will normally be a + // copy of 'mb_idxs' except when updating scratch-vars. + ScanIndices adj_mb_idxs = sg->adjust_span(outer_thread_idx, mb_idxs); - // Indices needed for the generated loops. Will normally be a - // copy of 'mb_idxs' except when updating scratch-vars. - ScanIndices adj_mb_idxs = sg->adjust_span(region_thread_idx, mb_idxs); + // Tweak settings for adjusted indices. + adj_mb_idxs.adjust_from_settings(settings._micro_block_sizes, + settings._micro_block_tile_sizes, + settings._nano_block_sizes); - // Tweak settings for adjusted indices. - DOMAIN_VAR_LOOP(i, j) { + // If binding threads to data. + if (bind_threads) { - // If binding threads to sub-blocks and this is the - // binding dim, set stride size and alignment - // granularity to the slab width. Setting the - // alignment keeps slabs aligned between stages. - if (bind_threads && i == bind_posn) { + // Tweak settings for adjusted indices. This sets + // up the nano-blocks as multiple slabs perpendicular + // to the binding dim within the micro-block. + DOMAIN_VAR_LOOP_FAST(i, j) { + + // If this is the binding dim, set stride size + // and alignment granularity to the slab + // width. Setting the alignment keeps slabs + // aligned between stages and/or steps. + if (i == bind_posn) { adj_mb_idxs.stride[i] = bind_slab_pts; adj_mb_idxs.align[i] = bind_slab_pts; } - // If original [or auto-tuned] sub-block covers - // entire mini-block, set stride size to full width. 
- // Also do this when binding and this is not the - // binding dim. - else if ((settings._sub_block_sizes[i] >= settings._mini_block_sizes[i]) || - bind_threads) - adj_mb_idxs.stride[i] = adj_mb_idxs.end[i] - adj_mb_idxs.begin[i]; + // If this is not the binding dim, set stride + // size to full width. For now, this is the + // only option for micro-block shapes when + // binding. TODO: consider other options. + else + adj_mb_idxs.stride[i] = adj_mb_idxs.get_overall_range(i); } - TRACE_MSG("calc_mini_block('" << get_name() << "'): " << - " for reqd bundle '" << sg->get_name() << "': [" << - adj_mb_idxs.begin.make_val_str() << " ... " << - adj_mb_idxs.end.make_val_str() << ") by " << - adj_mb_idxs.stride.make_val_str() << - " by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - - // If binding threads to data, run the mini-block - // loops on all block threads and call calc_sub_block() - // only by the designated thread for the given slab - // index in the binding dim. - if (bind_threads) { + TRACE_MSG("calc_micro_block('" << get_name() << "'): " << + " for reqd bundle '" << sg->get_name() << "': [" << + adj_mb_idxs.begin.make_val_str() << " ... " << + adj_mb_idxs.end.make_val_str() << ") by " << + adj_mb_idxs.stride.make_val_str() << + " by outer thread " << outer_thread_idx << + " with " << nbt << " block thread(s) bound to data"); + + // Start threads within a block. Each of these threads + // will eventually work on a separate nano-block. This + // is nested within an OMP outer thread. + _Pragma("omp parallel proc_bind(spread)") { + assert(omp_get_level() == 2); + assert(omp_get_num_threads() == nbt); + int inner_thread_idx = omp_get_thread_num(); + + // Run the micro-block loops on all block threads and + // call calc_nano_block() only by the designated + // thread for the given slab index in the binding + // dim. This is an explicit replacement for "normal" + // OpenMP scheduling. 
+ + // Disable the OpenMP construct in the micro-block loop + // because we're already in the parallel section. + #define MICRO_BLOCK_OMP_PRAGMA + + // Loop prefix. + #define MICRO_BLOCK_LOOP_INDICES adj_mb_idxs + #define MICRO_BLOCK_BODY_INDICES nano_blk_range + #define MICRO_BLOCK_USE_LOOP_PART_0 + #include "yask_micro_block_loops.hpp" + + // Loop body. const idx_t idx_ofs = 0x1000; // to help keep pattern when idx is neg. - - // Disable the OpenMP construct in the mini-block loop. -#define OMP_PRAGMA -#define CALC_SUB_BLOCK(mb_idxs) \ - auto bind_elem_idx = mb_idxs.start[bind_posn]; \ - auto bind_slab_idx = idiv_flr(bind_elem_idx + idx_ofs, bind_slab_pts); \ - auto bind_thr = imod_flr(bind_slab_idx, nbt); \ - if (block_thread_idx == bind_thr) \ - sg->calc_sub_block(region_thread_idx, block_thread_idx, settings, mb_idxs) -#include "yask_mini_block_loops.hpp" -#undef CALC_SUB_BLOCK -#undef OMP_PRAGMA - } - - // If not binding threads to data, call calc_sub_block() - // with a different thread for each sub-block using - // standard OpenMP scheduling. - else { -#define CALC_SUB_BLOCK(mb_idxs) \ - sg->calc_sub_block(region_thread_idx, block_thread_idx, settings, mb_idxs) -#include "yask_mini_block_loops.hpp" -#undef CALC_SUB_BLOCK - } + auto bind_elem_idx = nano_blk_range.start[bind_posn]; + auto bind_slab_idx = idiv_flr(bind_elem_idx + idx_ofs, bind_slab_pts); + auto bind_thr = imod_flr(bind_slab_idx, nbt); + if (inner_thread_idx == bind_thr) + sg->calc_nano_block(outer_thread_idx, inner_thread_idx, + settings, nano_blk_range); + + // Loop suffix. + #define MICRO_BLOCK_USE_LOOP_PART_1 + #include "yask_micro_block_loops.hpp" + + } // Parallel region. + } // Binding threads to data. + + // If not binding or there is only one block per thread. + // (This is the more common case.) + else { + + TRACE_MSG("calc_micro_block('" << get_name() << "'): " << + " for reqd bundle '" << sg->get_name() << "': [" << + adj_mb_idxs.begin.make_val_str() << " ... 
" << + adj_mb_idxs.end.make_val_str() << ") by " << + adj_mb_idxs.stride.make_val_str() << + " by outer thread " << outer_thread_idx << + " with " << nbt << " block thread(s) NOT bound to data"); + + // Call calc_nano_block() with a different thread for + // each nano-block using standard OpenMP scheduling. + + // Loop prefix. + #define MICRO_BLOCK_LOOP_INDICES adj_mb_idxs + #define MICRO_BLOCK_BODY_INDICES nano_blk_range + #define MICRO_BLOCK_USE_LOOP_PART_0 + #include "yask_micro_block_loops.hpp" + + // Loop body. + int inner_thread_idx = omp_get_thread_num(); + sg->calc_nano_block(outer_thread_idx, inner_thread_idx, + settings, nano_blk_range); + + // Loop suffix. + #define MICRO_BLOCK_USE_LOOP_PART_1 + #include "yask_micro_block_loops.hpp" } // OMP parallel when binding threads to data. } // bundles. - } // BB list. - } - - // Calculate results for one sub-block using pure scalar code. - // This is for debug. - void StencilBundleBase::calc_sub_block_scalar(int region_thread_idx, - int block_thread_idx, - KernelSettings& settings, - const ScanIndices& mini_block_idxs) { - STATE_VARS(this); - TRACE_MSG("calc_sub_block_scalar for bundle '" << get_name() << "': [" << - mini_block_idxs.start.make_val_str() << - " ... " << mini_block_idxs.stop.make_val_str() << - ") by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - - // Init sub-block begin & end from block start & stop indices. - // Use the 'misc' loops. Indices for these loops will be scalar and - // global rather than normalized as in the cluster and vector loops. - ScanIndices misc_idxs(*dims, true); - misc_idxs.init_from_outer(mini_block_idxs); - - // Stride sizes and alignment are one element. - misc_idxs.stride.set_from_const(1); - misc_idxs.align.set_from_const(1); - - // Define misc-loop function. - // Since stride is always 1, we ignore misc_idxs.stop. 
-#define MISC_FN(pt_idxs) do { \ - calc_scalar(region_thread_idx, pt_idxs.start); \ - } while(0) - - // Scan through n-D space. - // The OMP in the misc loops will be ignored if we're already in - // the max allowed nested OMP region. -#include "yask_misc_loops.hpp" -#undef MISC_FN - } - // Calculate results for one sub-block. - // The index ranges in 'mini_block_idxs' are sub-divided - // into full vector-clusters, full vectors, and sub-vectors - // and finally evaluated by the YASK-compiler-generated loops. - void StencilBundleBase::calc_sub_block_vec(int region_thread_idx, - int block_thread_idx, - KernelSettings& settings, - const ScanIndices& mini_block_idxs) { + // Mark exterior dirty for halo exchange if exterior was done. + bool mark_dirty = _context->do_mpi_left || _context->do_mpi_right; + update_var_info(YkVarBase::self, t, mark_dirty, true, false); + + } // BB list. + } // calc_micro_block(). + + // Mark vars dirty that are updated by this bundle and/or + // update last valid step. + void StencilBundleBase::update_var_info(YkVarBase::dirty_idx whose, + idx_t t, + bool mark_extern_dirty, + bool mod_dev_data, + bool update_valid_step) { STATE_VARS(this); - TRACE_MSG("calc_sub_block_vec for bundle '" << get_name() << "': [" << - mini_block_idxs.start.make_val_str() << - " ... " << mini_block_idxs.stop.make_val_str() << - ") by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - - /* - Indices in each domain dim: - - sub_block_eidxs.begin rem_masks used here - | peel_masks used here | sub_block_eidxs.end - | | | | - v v v v - |---+-------+---------------------------+---+---| "+" => vec boundaries. - ^ ^ ^ ^ ^ ^ - | | | | | | - | | sub_block_fcidxs.begin | | sub_block_vidxs.end - | sub_block_fvidxs.begin | sub_block_fvidxs.end - sub_block_vidxs.begin sub_block_fcidxs.end - */ - - // Init sub-block begin & end from block start & stop indices. - // These indices are in element units and global (NOT rank-relative). 
- ScanIndices sub_block_idxs(*dims, true); - sub_block_idxs.init_from_outer(mini_block_idxs); - - // Sub block indices in element units and rank-relative. - ScanIndices sub_block_eidxs(sub_block_idxs); - - // Subset of sub-block that is full clusters. - // These indices are in element units and rank-relative. - ScanIndices sub_block_fcidxs(sub_block_idxs); - - // Subset of sub-block that is full vectors. - // These indices are in element units and rank-relative. - ScanIndices sub_block_fvidxs(sub_block_idxs); - - // Superset of sub-block that is full or partial (masked) vectors. - // These indices are in element units and rank-relative. - ScanIndices sub_block_vidxs(sub_block_idxs); - - // These will be set to rank-relative, so set ofs to zero. - sub_block_eidxs.align_ofs.set_from_const(0); - sub_block_fcidxs.align_ofs.set_from_const(0); - sub_block_fvidxs.align_ofs.set_from_const(0); - sub_block_vidxs.align_ofs.set_from_const(0); - - // Masks for computing partial vectors in each dim. - // Init to all-ones (no masking). - Indices peel_masks(nsdims), rem_masks(nsdims); - peel_masks.set_from_const(-1); - rem_masks.set_from_const(-1); - - // Flags that indicate what type of processing needs to be done. - bool do_clusters = true; // any clusters to do? - bool do_vectors = false; // any vectors to do? - bool do_scalars = false; // any scalars to do? - - // Adjust indices to be rank-relative. - // Determine the subset of this sub-block that is - // clusters, vectors, and partial vectors. - _DOMAIN_VAR_LOOP(i, j) { - - // Rank offset. - auto rofs = _context->rank_domain_offsets[j]; - - // Begin/end of rank-relative scalar elements in this dim. - auto ebgn = sub_block_idxs.begin[i] - rofs; - auto eend = sub_block_idxs.end[i] - rofs; - sub_block_eidxs.begin[i] = ebgn; - sub_block_eidxs.end[i] = eend; - - // Find range of full clusters. - // Note that fcend <= eend because we round - // down to get whole clusters only. - // Similarly, fcbgn >= ebgn. 
- auto cpts = dims->_cluster_pts[j]; - auto fcbgn = round_up_flr(ebgn, cpts); - auto fcend = round_down_flr(eend, cpts); - sub_block_fcidxs.begin[i] = fcbgn; - sub_block_fcidxs.end[i] = fcend; - - // Any clusters to do? - if (fcend <= fcbgn) - do_clusters = false; - - // If anything before or after clusters, continue with - // setting vector indices and peel/rem masks. - if (fcbgn > ebgn || fcend < eend) { - - // Find range of full and/or partial vectors. - // Note that fvend <= eend because we round - // down to get whole vectors only. - // Note that vend >= eend because we round - // up to include partial vectors. - // Similar but opposite for begin vars. - // We make a vector mask to pick the - // right elements. - auto vpts = fold_pts[j]; - auto fvbgn = round_up_flr(ebgn, vpts); - auto fvend = round_down_flr(eend, vpts); - auto vbgn = round_down_flr(ebgn, vpts); - auto vend = round_up_flr(eend, vpts); - if (i == inner_posn) { - - // Don't do any full and/or partial vectors in plane of - // inner domain dim. We'll do these with scalars. This - // should be unusual because vector folding is normally - // done in a plane perpendicular to the inner dim for >= - // 2D domains. - fvbgn = vbgn = fcbgn; - fvend = vend = fcend; - } - sub_block_fvidxs.begin[i] = fvbgn; - sub_block_fvidxs.end[i] = fvend; - sub_block_vidxs.begin[i] = vbgn; - sub_block_vidxs.end[i] = vend; - - // Any vectors to do (full and/or partial)? - if (vbgn < fcbgn || vend > fcend) - do_vectors = true; - - // Calculate masks in this dim for partial vectors. - // All such masks will be ANDed together to form the - // final masks over all domain dims. - // Example: assume folding is x=4*y=4. - // Possible 'x' peel mask to exclude 1st 2 cols: - // 0 0 1 1 - // 0 0 1 1 - // 0 0 1 1 - // 0 0 1 1 - // Possible 'y' peel mask to exclude 1st row: - // 0 0 0 0 - // 1 1 1 1 - // 1 1 1 1 - // 1 1 1 1 - // Along 'x' face, the 'x' peel mask is used. - // Along 'y' face, the 'y' peel mask is used. 
- // Along an 'x-y' edge, they are ANDed to make this mask: - // 0 0 0 0 - // 0 0 1 1 - // 0 0 1 1 - // 0 0 1 1 - // so that the 6 corner elements are updated. - - if (vbgn < fvbgn || vend > fvend) { - idx_t pmask = 0, rmask = 0; - - // Need to set upper bit. - idx_t mbit = 0x1 << (dims->_fold_pts.product() - 1); - - // Visit points in a vec-fold to set bits for this dim's - // masks per the diagram above. TODO: make this more - // efficient. - dims->_fold_pts.visit_all_points - ([&](const IdxTuple& pt, size_t idx) { - - // Shift masks to next posn. - pmask >>= 1; - rmask >>= 1; - - // If the peel point is within the sub-block, - // set the next bit in the mask. - idx_t pi = vbgn + pt[j]; - if (pi >= ebgn) - pmask |= mbit; - - // If the rem point is within the sub-block, - // put a 1 in the mask. - pi = fvend + pt[j]; - if (pi < eend) - rmask |= mbit; - - // Keep visiting. - return true; - }); - - // Save masks in this dim. - peel_masks[i] = pmask; - rem_masks[i] = rmask; - } - - // Anything not covered? - // This will only be needed in inner dim because we - // will do partial vectors in other dims. - if (i == inner_posn && (ebgn < vbgn || eend > vend)) - do_scalars = true; - } - // If no peel or rem, just set vec indices to same as - // full cluster. - else { - sub_block_fvidxs.begin[i] = fcbgn; - sub_block_fvidxs.end[i] = fcend; - sub_block_vidxs.begin[i] = fcbgn; - sub_block_vidxs.end[i] = fcend; - } - } - - // Normalized indices needed for sub-block loop. - ScanIndices norm_sub_block_idxs(sub_block_eidxs); - - // Normalize the cluster indices. - // These will be the bounds of the sub-block loops. - // Set both begin/end and start/stop to ensure start/stop - // vars get passed through to calc_loop_of_clusters() - // for the inner loop. 
- normalize_indices(sub_block_fcidxs.begin, norm_sub_block_idxs.begin); - norm_sub_block_idxs.start = norm_sub_block_idxs.begin; - normalize_indices(sub_block_fcidxs.end, norm_sub_block_idxs.end); - norm_sub_block_idxs.stop = norm_sub_block_idxs.end; - norm_sub_block_idxs.align.set_from_const(1); // one vector. - - // Full rectilinear polytope of aligned clusters: use optimized code. - if (do_clusters) { - TRACE_MSG("calc_sub_block_vec: using cluster code for [" << - sub_block_fcidxs.begin.make_val_str() << - " ... " << sub_block_fcidxs.end.make_val_str() << - ") by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - - // Stride sizes are based on cluster lengths (in vector units). - // The stride in the inner loop is hard-coded in the generated code. - DOMAIN_VAR_LOOP(i, j) { - norm_sub_block_idxs.stride[i] = dims->_cluster_mults[j]; // N vecs. - } - - // Define the function called from the generated loops to simply - // call the loop-of-clusters functions. -#define CALC_INNER_LOOP(loop_idxs) \ - calc_loop_of_clusters(region_thread_idx, block_thread_idx, loop_idxs) - - // Include automatically-generated loop code that calls - // calc_inner_loop(). -#include "yask_sub_block_loops.hpp" -#undef CALC_INNER_LOOP - - } // whole clusters. - - // Full and partial peel/remainder vectors in all dims except - // the inner one. - if (do_vectors) { - TRACE_MSG("calc_sub_block_vec: using vector code for [" << - sub_block_vidxs.begin.make_val_str() << - " ... " << sub_block_vidxs.end.make_val_str() << - ") *not* within full vector-clusters at [" << - sub_block_fcidxs.begin.make_val_str() << - " ... " << sub_block_fcidxs.end.make_val_str() << - ") by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - - // Keep a copy of the normalized cluster indices - // that were calculated above. - // The full clusters were already done above, so - // we only need to do vectors before or after the - // clusters in each dim. 
- // We'll exclude them below. - ScanIndices norm_sub_block_fcidxs(norm_sub_block_idxs); - - // Normalize the vector indices. - // These will be the bounds of the sub-block loops. - // Set both begin/end and start/stop to ensure start/stop - // vars get passed through to calc_loop_of_clusters() - // for the inner loop. - normalize_indices(sub_block_vidxs.begin, norm_sub_block_idxs.begin); - norm_sub_block_idxs.start = norm_sub_block_idxs.begin; - normalize_indices(sub_block_vidxs.end, norm_sub_block_idxs.end); - norm_sub_block_idxs.stop = norm_sub_block_idxs.end; - - // Stride sizes are one vector. - // The stride in the inner loop is hard-coded in the generated code. - norm_sub_block_idxs.stride.set_from_const(1); - - // Also normalize the *full* vector indices to determine if - // we need a mask at each vector index. - // We just need begin and end indices for this. - ScanIndices norm_sub_block_fvidxs(sub_block_eidxs); - normalize_indices(sub_block_fvidxs.begin, norm_sub_block_fvidxs.begin); - normalize_indices(sub_block_fvidxs.end, norm_sub_block_fvidxs.end); - norm_sub_block_fvidxs.align.set_from_const(1); // one vector. - - // Define the function called from the generated loops to - // determine whether a loop of vectors is within the peel range - // (before the cluster) and/or remainder range (after the - // clusters)--setting the 'ok' flag. In other words, the vectors - // should be used only around the outside of the inner block of - // clusters. Then, call the loop-of-vectors function - // w/appropriate mask. See the mask diagrams above that show - // how the masks are ANDed together. Since stride is always 1, we - // ignore loop_idxs.stop. 
-#define CALC_INNER_LOOP(loop_idxs) \ - bool ok = false; \ - idx_t mask = idx_t(-1); \ - DOMAIN_VAR_LOOP(i, j) { \ - if (i != inner_posn && \ - (loop_idxs.start[i] < norm_sub_block_fcidxs.begin[i] || \ - loop_idxs.start[i] >= norm_sub_block_fcidxs.end[i])) { \ - ok = true; \ - if (loop_idxs.start[i] < norm_sub_block_fvidxs.begin[i]) \ - mask &= peel_masks[i]; \ - if (loop_idxs.start[i] >= norm_sub_block_fvidxs.end[i]) \ - mask &= rem_masks[i]; \ - } \ - } \ - if (ok) calc_loop_of_vectors(region_thread_idx, block_thread_idx, loop_idxs, mask); - - // Include automatically-generated loop code that calls - // calc_inner_loop(). -#include "yask_sub_block_loops.hpp" -#undef CALC_INNER_LOOP - } - - // Use scalar code for anything not done above. This should only be - // called if vectorizing on the inner loop and sub-block size in - // that dim is not a multiple of the inner-dim vector len, so that - // situation should be avoided. - if (do_scalars) { - - // Use the 'misc' loops. Indices for these loops will be scalar and - // global rather than normalized as in the cluster and vector loops. - ScanIndices misc_idxs(sub_block_idxs); - - // Stride sizes and alignment are one element. - misc_idxs.stride.set_from_const(1); - misc_idxs.align.set_from_const(1); - - TRACE_MSG("calc_sub_block_vec: using scalar code for [" << - misc_idxs.begin.make_val_str() << " ... " << - misc_idxs.end.make_val_str() << - ") *not* within vectors at [" << - sub_block_vidxs.begin.make_val_str() << " ... " << - sub_block_vidxs.end.make_val_str() << - ") by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - - // Define misc-loop function. This is called at each point in - // the sub-block. Since stride is always 1, we ignore - // misc_idxs.stop. TODO: handle more efficiently: do one slab - // for inner-peel and one for outer-peel, calculate masks, and - // call vector code. 
-#define MISC_FN(pt_idxs) do { \ - bool ok = false; \ - DOMAIN_VAR_LOOP(i, j) { \ - auto rofs = _context->rank_domain_offsets[j]; \ - if (pt_idxs.start[i] < rofs + sub_block_vidxs.begin[i] || \ - pt_idxs.start[i] >= rofs + sub_block_vidxs.end[i]) { \ - ok = true; break; } \ - } \ - if (ok) { \ - calc_scalar(region_thread_idx, pt_idxs.start); \ - } \ - } while(0) - - // Scan through n-D space. - // The OMP in the misc loops will be ignored if we're already in - // the max allowed nested OMP region. -#include "yask_misc_loops.hpp" -#undef MISC_FN - } - - } // calc_sub_block_vec. - - // Calculate a series of cluster results within an inner loop. - // The 'loop_idxs' must specify a range only in the inner dim. - // Indices must be rank-relative. - // Indices must be normalized, i.e., already divided by VLEN_*. - void StencilBundleBase::calc_loop_of_clusters(int region_thread_idx, - int block_thread_idx, - const ScanIndices& loop_idxs) { - STATE_VARS(this); - TRACE_MSG("calc_loop_of_clusters: local vector-indices [" << - loop_idxs.start.make_val_str() << - " ... " << loop_idxs.stop.make_val_str() << - ") by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - -#ifdef CHECK - // Check that only the inner dim has a range greater than one cluster. - DOMAIN_VAR_LOOP(i, j) { - if (i != inner_posn) - assert(loop_idxs.start[i] + dims->_cluster_mults[j] >= - loop_idxs.stop[i]); + // Get output step for this bundle, if any. For most stencils, this + // will be t+1 or t-1 if striding backward. + idx_t t_out = 0; + if (!get_output_step_index(t, t_out)) { + TRACE_MSG("not updating because output step is not available"); + return; } -#endif - // Need all starting indices. - const Indices& start_idxs = loop_idxs.start; + // Output vars for this bundle. NB: don't need to mark + // scratch vars as dirty because they are never exchanged. + for (auto gp : output_var_ptrs) { + auto& gb = gp->gb(); - // Need stop for inner loop only. 
- idx_t stop_inner = loop_idxs.stop[inner_posn]; + // Mark given dirty flag. + // This flag will be false if we're only updating the interior, + // i.e., we don't need to trigger a halo exchange. + if (mark_extern_dirty) { + gb.set_dirty(whose, true, t_out); + TRACE_MSG(gb.get_name() << " marked dirty"); + } - // Call code from stencil compiler. - calc_loop_of_clusters(region_thread_idx, block_thread_idx, start_idxs, stop_inner); - } + // Mark the entire var as dirty on the device, regardless + // of whether this is the interior or exterior. + if (mod_dev_data) + gb.get_coh().mod_dev(); - // Calculate a series of vector results within an inner loop. - // The 'loop_idxs' must specify a range only in the inner dim. - // Indices must be rank-relative. - // Indices must be normalized, i.e., already divided by VLEN_*. - void StencilBundleBase::calc_loop_of_vectors(int region_thread_idx, - int block_thread_idx, - const ScanIndices& loop_idxs, - idx_t write_mask) { - STATE_VARS(this); - TRACE_MSG("calc_loop_of_vectors: local vector-indices [" << - loop_idxs.start.make_val_str() << - " ... " << loop_idxs.stop.make_val_str() << - ") w/write-mask = 0x" << hex << write_mask << dec << - " by region thread " << region_thread_idx << - " and block thread " << block_thread_idx); - -#ifdef CHECK - // Check that only the inner dim has a range greater than one vector. - for (int i = 0; i < nsdims; i++) { - if (i != step_posn && i != inner_posn) - assert(loop_idxs.start[i] + 1 >= loop_idxs.stop[i]); + // Update last valid step. + if (update_valid_step) + gb.update_valid_step(t_out); } -#endif - - // Need all starting indices. - const Indices& start_idxs = loop_idxs.start; - - // Need stop for inner loop only. - idx_t stop_inner = loop_idxs.stop[inner_posn]; - - // Call code from stencil compiler. 
- calc_loop_of_vectors(region_thread_idx, block_thread_idx, start_idxs, stop_inner, write_mask); } // If this bundle is updating scratch var(s), @@ -691,7 +298,7 @@ namespace yask { // its halo sizes are still used to specify how much to // add to 'idxs'. // Returns adjusted indices. - ScanIndices StencilBundleBase::adjust_span(int region_thread_idx, + ScanIndices StencilBundleBase::adjust_span(int outer_thread_idx, const ScanIndices& idxs) const { STATE_VARS(this); ScanIndices adj_idxs(idxs); @@ -701,13 +308,13 @@ namespace yask { assert(sv); // Get the one for this thread. - auto& gp = sv->at(region_thread_idx); + auto& gp = sv->at(outer_thread_idx); assert(gp); auto& gb = gp->gb(); assert(gb.is_scratch()); // i: index for stencil dims, j: index for domain dims. - DOMAIN_VAR_LOOP(i, j) { + DOMAIN_VAR_LOOP_FAST(i, j) { auto& dim = dims->_stencil_dims.get_dim(i); auto& dname = dim._get_name(); @@ -720,7 +327,9 @@ namespace yask { idx_t lh = gp->get_left_halo_size(posn); idx_t rh = gp->get_right_halo_size(posn); - // Round up halos to fold sizes. + // Round up halos to vector sizes. + // TODO: consider cluster sizes, but need to make changes + // elsewhere in code. lh = ROUND_UP(lh, fold_pts[j]); rh = ROUND_UP(rh, fold_pts[j]); @@ -729,16 +338,18 @@ namespace yask { adj_idxs.end[i] = idxs.end[i] + rh; // Make sure var covers index bounds. - TRACE_MSG("adjust_span: mini-blk [" << + TRACE_MSG("adjust_span: micro-blk [" << idxs.begin[i] << "..." << idxs.end[i] << ") adjusted to [" << adj_idxs.begin[i] << "..." << adj_idxs.end[i] << ") within scratch-var '" << - gp->get_name() << "' allocated [" << - gp->get_first_rank_alloc_index(posn) << "..." 
<< - gp->get_last_rank_alloc_index(posn) << "] in dim '" << dname << "'"); - assert(adj_idxs.begin[i] >= gp->get_first_rank_alloc_index(posn)); - assert(adj_idxs.end[i] <= gp->get_last_rank_alloc_index(posn) + 1); + gp->get_name() << "' with halos " << + gp->get_left_halo_size(posn) << " and " << + gp->get_right_halo_size(posn) << " allocated [" << + gp->get_first_local_index(posn) << "..." << + gp->get_last_local_index(posn) << "] in dim '" << dname << "'"); + assert(adj_idxs.begin[i] >= gp->get_first_local_index(posn)); + assert(adj_idxs.end[i] <= gp->get_last_local_index(posn) + 1); // If existing stride is >= whole tile, adjust it also. idx_t width = idxs.end[i] - idxs.begin[i]; @@ -760,13 +371,13 @@ namespace yask { // Start and stop stage timers for final stats and auto-tuners. void Stage::start_timers() { auto ts = YaskTimer::get_timespec(); - timer.start(&ts); - get_at().timer.start(&ts); + timer.start(ts); + get_at().timer.start(ts); } void Stage::stop_timers() { auto ts = YaskTimer::get_timespec(); - timer.stop(&ts); - get_at().timer.stop(&ts); + timer.stop(ts); + get_at().timer.stop(ts); } void Stage::add_steps(idx_t num_steps) { steps_done += num_steps; @@ -810,10 +421,9 @@ namespace yask { // Stats for this bundle for 1 pt. idx_t writes1 = 0, reads1 = 0, fpops1 = 0; - // Loop through all the needed bundles to - // count stats for scratch bundles. - // Does not count extra ops needed in scratch halos - // since this varies depending on block size. + // Loop through all the needed bundles to count stats for + // scratch bundles. Does not count extra ops needed in scratch + // halos since this varies depending on block size. 
auto sg_list = sg->get_reqd_bundles(); for (auto* rsg : sg_list) { reads1 += rsg->get_scalar_points_read(); diff --git a/src/kernel/lib/stencil_calc.hpp b/src/kernel/lib/stencil_calc.hpp index 20975e85..13ff0638 100644 --- a/src/kernel/lib/stencil_calc.hpp +++ b/src/kernel/lib/stencil_calc.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -29,17 +29,13 @@ namespace yask { // Classes that support evaluation of one stencil bundle // and a stage of bundles. - // A stencil context contains one or more stages. + // A stencil solution contains one or more stages. // A pure-virtual class base for a stencil bundle. class StencilBundleBase : public ContextLinker { protected: - std::string _name; - int _scalar_fp_ops = 0; - int _scalar_points_read = 0; - int _scalar_points_written = 0; // Other bundles that this one depends on. StencilBundleSet _depends_on; @@ -48,9 +44,6 @@ namespace yask { // before this bundle. Listed in eval order first-to-last. StencilBundleList _scratch_children; - // Whether this updates scratch var(s); - bool _is_scratch = false; - // Overall bounding box for the bundle. // This may or may not be solid, i.e., it // may contain some invalid points. @@ -62,25 +55,39 @@ namespace yask { // any invalid points. These will all be inside '_bundle_bb'. BBList _bb_list; - // Normalize the indices, i.e., divide by vector len in each dim. + // Normalize the 'orig' indices, i.e., divide by vector len in each dim. // Ranks offsets must already be subtracted. // Each dim in 'orig' must be a multiple of corresponding vec len. 
- void normalize_indices(const Indices& orig, Indices& norm) const { + inline Indices + normalize_indices(const Indices& orig) const { STATE_VARS(this); - assert(orig._get_num_dims() == nsdims); - assert(norm._get_num_dims() == nsdims); + assert(orig.get_num_dims() == nsdims); + Indices norm(orig); // i: index for stencil dims, j: index for domain dims. - DOMAIN_VAR_LOOP(i, j) { + DOMAIN_VAR_LOOP_FAST(i, j) { // Divide indices by fold lengths as needed by // read/write_vec_norm(). Use idiv_flr() instead of '/' - // because begin/end vars may be negative (if in halo). + // because begin/end vars may be negative (e.g., if in halo). norm[i] = idiv_flr(orig[i], fold_pts[j]); // Check for no remainder. assert(imod_flr(orig[i], fold_pts[j]) == 0); } + return norm; + } + inline ScanIndices + normalize_indices(const ScanIndices& orig) { + ScanIndices norm(orig); + norm.begin = normalize_indices(orig.begin); + norm.start = norm.begin; + norm.end = normalize_indices(orig.end); + norm.stop = norm.end; + norm.tile_size = normalize_indices(orig.tile_size); + norm.align = normalize_indices(orig.align); + norm.stride = normalize_indices(orig.stride); + return norm; } public: @@ -101,36 +108,22 @@ namespace yask { ContextLinker(context) { } virtual ~StencilBundleBase() { } - // Get name of this bundle. - const std::string& get_name() const { return _name; } - - // Get estimated number of FP ops done for one scalar eval. - virtual int get_scalar_fp_ops() const { return _scalar_fp_ops; } - - // Get number of points read and written for one scalar eval. - virtual int get_scalar_points_read() const { return _scalar_points_read; } - virtual int get_scalar_points_written() const { return _scalar_points_written; } - - // Scratch accessors. - bool is_scratch() const { return _is_scratch; } - void set_scratch(bool is_scratch) { _is_scratch = is_scratch; } - // Access to BBs. BoundingBox& get_bb() { return _bundle_bb; } BBList& get_bbs() { return _bb_list; } // Add dependency. 
- virtual void add_dep(StencilBundleBase* eg) { + void add_dep(StencilBundleBase* eg) { _depends_on.insert(eg); } // Get dependencies. - virtual const StencilBundleSet& get_deps() const { + const StencilBundleSet& get_deps() const { return _depends_on; } // Add needed scratch-bundle. - virtual void add_scratch_child(StencilBundleBase* eg) { + void add_scratch_child(StencilBundleBase* eg) { _scratch_children.push_back(eg); } @@ -158,15 +151,63 @@ namespace yask { // Copy BB vars from another. void copy_bounding_box(const StencilBundleBase* src); + // Calculate results for an arbitrary tile for points in the valid domain. + // Scratch vars, if any, are indexed via 'scratch_var_idx'. + virtual void + calc_in_domain(int scratch_var_idx, const ScanIndices& misc_idxs) =0; + + // Calculate results within a micro-block. + void + calc_micro_block(int outer_thread_idx, + KernelSettings& settings, + const ScanIndices& micro_block_idxs); + + // Mark vars dirty that are updated by this bundle and/or + // update last valid step. + void + update_var_info(YkVarBase::dirty_idx whose, + idx_t step, + bool mark_extern_dirty, + bool mod_dev_data, + bool update_valid_step); + + // Calculate results within a nano-block. + virtual void + calc_nano_block(int outer_thread_idx, + int inner_thread_idx, + KernelSettings& settings, + const ScanIndices& micro_block_idxs) =0; + + // Functions below are stubs for the code generated + // by the stencil compiler. + + // Get name of this bundle. + virtual const std::string + get_name() const =0; + + // Get estimated number of FP ops done for one scalar eval. + virtual int + get_scalar_fp_ops() const =0; + + // Get number of points read and written for one scalar eval. + virtual int + get_scalar_points_read() const =0; + virtual int + get_scalar_points_written() const =0; + + // Whether this bundle updates scratch var(s)? + virtual bool + is_scratch() const =0; + // Determine whether indices are in [sub-]domain. 
virtual bool is_in_valid_domain(const Indices& idxs) const =0; - // Return true if there is a non-default conditions. + // Return true if there are any non-default conditions. virtual bool - is_sub_domain_expr() const { return false; } + is_sub_domain_expr() const =0; virtual bool - is_step_cond_expr() const { return false; } + is_step_cond_expr() const =0; // Return human-readable description of conditions. virtual std::string @@ -187,88 +228,735 @@ namespace yask { get_output_step_index(idx_t input_step_index, idx_t& output_step_index) const =0; - // Calculate one scalar result. - virtual void - calc_scalar(int thread_idx, const Indices& idxs) =0; + }; // StencilBundleBase. - // Calculate results within a mini-block. - void - calc_mini_block(int region_thread_idx, - KernelSettings& settings, - const ScanIndices& mini_block_idxs); + // A template that is instantiated with the stencil-compiler + // output class. + template + class StencilBundleTempl: + public StencilBundleBase { + + protected: + StencilBundleImplT _bundle; + + // Access core data. + // TODO: use dynamic_cast in CHECK mode. + inline StencilCoreDataT* _corep() { + return static_cast(_context->corep()); + } + inline const StencilCoreDataT* _corep() const { + return static_cast(_context->corep()); + } + + public: + + // Ctor. + StencilBundleTempl(StencilContext* context): + StencilBundleBase(context) { } + + // Dtor. + virtual ~StencilBundleTempl() { } + + // Get name of this bundle. + const std::string get_name() const override { + return _bundle._name; + } + + // Get estimated number of FP ops done for one scalar eval. + int get_scalar_fp_ops() const override { + return _bundle._scalar_fp_ops; + } - // Calculate results within a sub-block. + // Get number of points read and written for one scalar eval. 
+ int get_scalar_points_read() const override { + return _bundle._scalar_points_read; + } + int get_scalar_points_written() const override { + return _bundle._scalar_points_written; + } + + // Whether this bundle updates scratch var(s)? + bool is_scratch() const override { + return _bundle._is_scratch; + } + + // Determine whether indices are in [sub-]domain. + bool is_in_valid_domain(const Indices& idxs) const override { + return _bundle.is_in_valid_domain(_corep(), idxs); + } + + // Return true if there are any non-default conditions. + bool is_sub_domain_expr() const override { + return _bundle.is_sub_domain_expr(); + } + bool is_step_cond_expr() const override { + return _bundle.is_step_cond_expr(); + } + + // Return human-readable description of conditions. + std::string get_domain_description() const override { + return _bundle.get_domain_description(); + } + std::string get_step_cond_description() const override { + return _bundle.get_step_cond_description(); + } + + // Determine whether step index is enabled. + bool is_in_valid_step(idx_t input_step_index) const override { + return _bundle.is_in_valid_step(_corep(), input_step_index); + } + + // If bundle updates var(s) with the step index, + // set 'output_step_index' to the step that an update + // occurs when calling one of the calc_*() methods with + // 'input_step_index' and return 'true'. + // Else, return 'false'; + bool get_output_step_index(idx_t input_step_index, + idx_t& output_step_index) const override { + return _bundle.get_output_step_index(input_step_index, + output_step_index); + } + + // Calculate results for an arbitrary tile for points in the valid domain. + // Scratch vars, if any are used, are indexed via 'scratch_var_idx'. + // This is very slow and used for reference calculations. 
void - calc_sub_block_vec(int region_thread_idx, - int block_thread_idx, - KernelSettings& settings, - const ScanIndices& mini_block_idxs); + calc_in_domain(int scratch_var_idx, const ScanIndices& misc_idxs) override { + auto* cp = _corep(); + + // Loop prefix. + #define MISC_LOOP_INDICES misc_idxs + #define MISC_BODY_INDICES misc_range + #define MISC_USE_LOOP_PART_0 + #include "yask_misc_loops.hpp" + + // Loop body. Since stride is always 1, we ignore + // stop indices. If point is in sub-domain for this bundle, + // then execute the reference scalar code. TODO: fix domain of + // scratch vars. + if (_bundle.is_in_valid_domain(cp, misc_range.start)) + _bundle.calc_scalar(cp, scratch_var_idx, misc_range.start); + + // Loop suffix. + #define MISC_USE_LOOP_PART_1 + #include "yask_misc_loops.hpp" + } + + // Calculate results within a nano-block. + // Essentially just a chooser between the debug and optimized versions. void - calc_sub_block_scalar(int region_thread_idx, - int block_thread_idx, - KernelSettings& settings, - const ScanIndices& mini_block_idxs); - inline void - calc_sub_block(int region_thread_idx, - int block_thread_idx, + calc_nano_block(int outer_thread_idx, + int inner_thread_idx, KernelSettings& settings, - const ScanIndices& mini_block_idxs) { - if (block_thread_idx < 0) - block_thread_idx = omp_get_thread_num(); + const ScanIndices& micro_block_idxs) override { + + // Choose between scalar debug and optimized impls. if (settings.force_scalar) - calc_sub_block_scalar(region_thread_idx, block_thread_idx, - settings, mini_block_idxs); + calc_nano_block_dbg(outer_thread_idx, inner_thread_idx, + settings, micro_block_idxs); else - calc_sub_block_vec(region_thread_idx, block_thread_idx, - settings, mini_block_idxs); + calc_nano_block_opt(outer_thread_idx, inner_thread_idx, + settings, micro_block_idxs); } - // Calculate a series of cluster results within an inner loop. - // All indices start at 'start_idxs'. 
Inner loop iterates to - // 'stop_inner' by 'stride_inner'. - // Indices must be rank-relative. - // Indices must be normalized, i.e., already divided by VLEN_*. - virtual void - calc_loop_of_clusters(int region_thread_idx, - int block_thread_idx, - const Indices& start_idxs, - idx_t stop_inner) =0; - - // Calculate a series of cluster results within an inner loop. - // The 'loop_idxs' must specify a range only in the inner dim. - // Indices must be rank-relative. - // Indices must be normalized, i.e., already divided by VLEN_*. + // Calculate results for one nano-block using pure scalar code. + // This is very slow and used for debug. void - calc_loop_of_clusters(int region_thread_idx, - int block_thread_idx, - const ScanIndices& loop_idxs); - - // Calculate a series of vector results within an inner loop. - // All indices start at 'start_idxs'. Inner loop iterates to - // 'stop_inner' by 'stride_inner'. - // Indices must be rank-relative. - // Indices must be normalized, i.e., already divided by VLEN_*. - // Each vector write is masked by 'write_mask'. - virtual void - calc_loop_of_vectors(int region_thread_idx, - int block_thread_idx, - const Indices& start_idxs, - idx_t stop_inner, - idx_t write_mask) =0; - - // Calculate a series of vector results within an inner loop. - // The 'loop_idxs' must specify a range only in the inner dim. - // Indices must be rank-relative. - // Indices must be normalized, i.e., already divided by VLEN_*. - // Each vector write is masked by 'write_mask'. + calc_nano_block_dbg(int outer_thread_idx, + int inner_thread_idx, + KernelSettings& settings, + const ScanIndices& micro_block_idxs) { + STATE_VARS(this); + TRACE_MSG("calc_nano_block_dbg for bundle '" << get_name() << "': [" << + micro_block_idxs.start.make_val_str() << + " ... 
" << micro_block_idxs.stop.make_val_str() << + ") by outer thread " << outer_thread_idx << + " and inner thread " << inner_thread_idx); + + auto* cp = _corep(); + + // Init nano-block begin & end from block start & stop indices. + // Use the 'misc' loops. Indices for these loops will be scalar and + // global rather than normalized as in the cluster and vector loops. + ScanIndices sb_idxs = micro_block_idxs.create_inner(); + + // Stride and alignment to 1 element. + sb_idxs.stride.set_from_const(1); + sb_idxs.align.set_from_const(1); + + calc_nano_block_dbg2(cp, outer_thread_idx, sb_idxs); + } + + // Scalar calc loop. + // Static to make sure offload doesn't need 'this'. + static void + calc_nano_block_dbg2(StencilCoreDataT* cp, + int outer_thread_idx, + const ScanIndices& misc_idxs) { + + // Scan through n-D space. + // Set OMP loop to offload; disable OMP on host. + #ifdef USE_OFFLOAD + #define MISC_OMP_PRAGMA \ + _Pragma("omp target parallel for device(KernelEnv::_omp_devn) schedule(static,1)") + #else + #define MISC_OMP_PRAGMA + #endif + + // Loop prefix. + #define MISC_LOOP_INDICES misc_idxs + #define MISC_BODY_INDICES misc_range + #define MISC_USE_LOOP_PART_0 + #include "yask_misc_loops.hpp" + + // Loop body. + // Since stride is always 1, we only need start indices. + StencilBundleImplT::calc_scalar(cp, outer_thread_idx, misc_range.start); + + // Loop suffix. + #define MISC_USE_LOOP_PART_1 + #include "yask_misc_loops.hpp" + } + + // Calculate results for one nano-block. + // The index ranges in 'micro_block_idxs' are sub-divided + // into full vector-clusters, full vectors, and partial vectors. + // The resulting areas are evaluated by the YASK-compiler-generated code. 
void - calc_loop_of_vectors(int region_thread_idx, - int block_thread_idx, - const ScanIndices& loop_idxs, - idx_t write_mask); + calc_nano_block_opt(int outer_thread_idx, + int inner_thread_idx, + KernelSettings& settings, + const ScanIndices& micro_block_idxs) { + STATE_VARS(this); + TRACE_MSG("calc_nano_block_opt for bundle '" << get_name() << "': [" << + micro_block_idxs.start.make_val_str() << + " ... " << micro_block_idxs.stop.make_val_str() << + ") by outer thread " << outer_thread_idx << + " and inner thread " << inner_thread_idx); + auto* cp = _corep(); + + /* + 2D example: + +--------------------+ + | | + | +--------------+ | + | | | | + | | +------+ | | + | | | <------------ full clusters (multiple vectors) + | | | | | | + | | +------+ <------ full (unmasked, single) vectors + | | | | + | +--------------+ <--- partial (masked, single) vectors (peel/rem) + | | + +--------------------+ + + Indices and areas in each domain dim: + + eidxs.begin + | peel <--------- partial vecs here -------> remainder + | | left <------ full vecs here ----> right | + | | | full clusters here | | eidxs.end + | | | | | | | + v v v v v v v + +--+-------+---------------------------+-----+--+ "+" => compute boundaries. + | | | | + +---+-------+---------------------------+-----+---+ "+" => vec-aligned boundaries. + ^ ^ ^ ^ ^ ^ + | | | | | | + | | fcidxs.begin (rounded up) | | ovidxs.end (rounded up) + | fvidxs.begin (rounded up) | fvidxs.end (rounded down) + ovidxs.begin (rounded down) fcidxs.end (rounded down) + ('end' indices are one past last) + + Also need to handle all sorts of cases where some of these + sections are empty, the case where the peel and remainder + overlap, and the case where the left and right full vecs + overlap. + */ + + // Init nano-block begin & end from block start & stop indices. + // These indices are in element units and global (NOT rank-relative). 
+ // All other indices below are contructed from 'sb_idxs' to ensure + // step indices are copied properly. + ScanIndices sb_idxs = micro_block_idxs.create_inner(); + + // Strides within a nano-blk are based on pico-blk sizes. + sb_idxs.stride = settings._pico_block_sizes; + sb_idxs.stride[step_posn] = idx_t(1); + + // Tiles in nano-blocks. + sb_idxs.tile_size = settings._nano_block_tile_sizes; + + // Sub block indices in element units and rank-relative. + ScanIndices sb_eidxs(sb_idxs); + + // Subset of nano-block that is full clusters. + // These indices are in element units and rank-relative. + ScanIndices sb_fcidxs(sb_idxs); + + // Subset of nano-block that is full vectors. + // These indices are in element units and rank-relative. + ScanIndices sb_fvidxs(sb_idxs); + + // Superset of nano-block rounded to vector outer-boundaries as shown above. + // These indices are in element units and rank-relative. + ScanIndices sb_ovidxs(sb_idxs); + + // These will be set to rank-relative, so set ofs to zero. + sb_eidxs.align_ofs.set_from_const(0); + sb_fcidxs.align_ofs.set_from_const(0); + sb_fvidxs.align_ofs.set_from_const(0); + sb_ovidxs.align_ofs.set_from_const(0); + + // Flag for full clusters. + bool do_clusters = true; + bool do_outside_clusters = false; + + // Bit-field flags for full and partial vecs on left and right + // in each dim. + bit_mask_t do_left_fvecs = 0, do_right_fvecs = 0; + bit_mask_t do_left_pvecs = 0, do_right_pvecs = 0; + + // Bit-masks for computing partial vectors in each dim. + // Init to zeros (nothing needed). + Indices peel_masks(idx_t(0), nddims), rem_masks(idx_t(0), nddims); + + // For each domain dim: + // - Adjust indices to be rank-relative. + // - Determine the subset of this nano-block that is + // clusters, vectors, and partial vectors. + DOMAIN_VAR_LOOP(i, j) { - }; // StencilBundleBase. + // Rank offset. + auto rofs = _context->rank_domain_offsets[j]; + + // Begin/end of rank-relative scalar elements in this dim. 
+ auto ebgn = sb_idxs.begin[i] - rofs; + auto eend = sb_idxs.end[i] - rofs; + + // Find range of full clusters. + // These are also the inner-boundaries of the + // full vectors. + // NB: fcbgn will be > fcend if the nano-block + // is within a cluster. + auto cpts = dims->_cluster_pts[j]; + auto fcbgn = round_up_flr(ebgn, cpts); + auto fcend = round_down_flr(eend, cpts); + + // Find range of full vectors. + // These are also the inner-boundaries of the peel + // and rem sections. + // NB: fvbgn will be > fvend if the nano-block + // is within a vector. + auto vpts = fold_pts[j]; + auto fvbgn = round_up_flr(ebgn, vpts); + auto fvend = round_down_flr(eend, vpts); + + // Outer vector-aligned boundaries. Note that rounding + // direction is opposite of full vectors, i.e., rounding + // toward outside of nano-block. These will be used as + // boundaries for partial vectors if needed. + auto ovbgn = round_down_flr(ebgn, vpts); + auto ovend = round_up_flr(eend, vpts); + assert(ovend >= ovbgn); + assert(ovbgn <= fvbgn); + assert(ovend >= fvend); + + // Any full vectors to do on left or right? These should + // always be false when cluster size is 1. + bool do_left_fvec = fvbgn < fcbgn; + bool do_right_fvec = fvend > fcend; + + // Any partial vectors to do on left or right? + bool do_left_pvec = ebgn < fvbgn; + bool do_right_pvec = eend > fvend; + + // Create masks. + idx_t pmask = 0, rmask = 0; + if (do_left_pvec || do_right_pvec) { + + // Calculate masks in this dim for partial vectors. + // 2D example: assume folding is x=4*y=4. + // Possible 'x' peel mask to exclude 1st 2 cols: + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // Along 'x' edge, this mask is used to update 8 elems per vec. + // Possible 'y' peel mask to exclude 1st row: + // 0 0 0 0 + // 1 1 1 1 + // 1 1 1 1 + // 1 1 1 1 + // Along 'y' edge, this mask is used to update 12 elems per vec. 
+ // In an 'x-y' corner, they are ANDed to make this mask: + // 0 0 0 0 + // 0 0 1 1 + // 0 0 1 1 + // 0 0 1 1 + // so that the 6 corner elements are updated per vec. + + // Need to set upper bit. + idx_t mbit = idx_t(1) << (dims->_fold_pts.product() - 1); + + // Visit points in a vec-fold to set bits for this dim's + // masks per the diagram above. + bool first_inner = dims->_fold_pts.is_first_inner(); + dims->_fold_sizes.visit_all_points + (first_inner, + [&](const Indices& pt, size_t idx) { + + // Shift masks to next posn. + pmask >>= 1; + rmask >>= 1; + + // If the peel point is within the nano-block, + // set the next bit in the mask. + // Index is outer begin point plus this offset. + idx_t pi = ovbgn + pt[j]; + if (pi >= ebgn) + pmask |= mbit; + + // If the rem point is within the nano-block, + // put a 1 in the mask. + // Index is full-vector end point plus this offset. + pi = fvend + pt[j]; + if (pi < eend) + rmask |= mbit; + return true; // from lambda. + }); + if (do_left_pvec) + assert(pmask != 0); + if (do_right_pvec) + assert(rmask != 0); + } + + // Special cases: boundaries and flags that need fixing due + // to overlaps... + + // Overlapping peel and rem, i.e., ebgn and eend are in the + // same vector. AND peel and rem masks into one mask and do + // peel only. + if (do_left_pvec && do_right_pvec && ovbgn == fvend) { + assert(fvbgn == ovend); + pmask &= rmask; + rmask = 0; + do_left_pvec = true; + do_right_pvec = false; + do_left_fvec = false; + do_right_fvec = false; + do_clusters = false; + } + + // No clusters. + else if (fcend <= fcbgn) { + + // Move both cluster boundaries to end + // of full-vec range. + fcbgn = fcend = fvend; + do_clusters = false; + + // Any full vecs? Do left only due to fc-range + // adjustment above. + if (do_left_fvec || do_right_fvec) { + do_left_fvec = true; + do_right_fvec = false; + } + } + + // Any outside parts at all? 
+ if (do_left_fvec || do_right_fvec || + do_left_pvec || do_right_pvec) + do_outside_clusters = true; + + // Save loop-local (current dim) vars. + // ScanIndices vars. + sb_eidxs.begin[i] = ebgn; + sb_eidxs.end[i] = eend; + sb_fcidxs.begin[i] = fcbgn; + sb_fcidxs.end[i] = fcend; + sb_fvidxs.begin[i] = fvbgn; + sb_fvidxs.end[i] = fvend; + sb_ovidxs.begin[i] = ovbgn; + sb_ovidxs.end[i] = ovend; + + // Domain-dim mask vars. + peel_masks[j] = pmask; + rem_masks[j] = rmask; + if (do_left_fvec) + set_bit(do_left_fvecs, j); + if (do_right_fvec) + set_bit(do_right_fvecs, j); + if (do_left_pvec) + set_bit(do_left_pvecs, j); + if (do_right_pvec) + set_bit(do_right_pvecs, j); + + } // domain dims. + + int thread_limit = actl_opts->thread_limit; + + // Normalized cluster indices. + auto norm_fcidxs = normalize_indices(sb_fcidxs); + + if (!do_clusters) + TRACE_MSG("no full clusters to calculate"); + + // Full rectilinear polytope of aligned clusters: use optimized + // code for full clusters w/o masking. + else { + TRACE_MSG("calculating clusters within " + "normalized local indices [" << + norm_fcidxs.begin.make_val_str() << + " ... " << norm_fcidxs.end.make_val_str() << + ") with stride " << norm_fcidxs.stride.make_val_str() << + " by outer thread " << outer_thread_idx << + " and inner thread " << inner_thread_idx); + + // Perform the calculations in this block. + calc_clusters_opt2(cp, outer_thread_idx, inner_thread_idx, + thread_limit, norm_fcidxs); + + } // whole clusters. + + if (!do_outside_clusters) + TRACE_MSG("no full or partial vectors to calculate"); + else { + TRACE_MSG("processing full and/or partial vectors " + "within local indices [" << + sb_eidxs.begin.make_val_str() << + " ... " << sb_eidxs.end.make_val_str() << + ") bordering full clusters at [" << + sb_fcidxs.begin.make_val_str() << + " ... 
" << sb_fcidxs.end.make_val_str() << + ") by outer thread " << outer_thread_idx << + " and inner thread " << inner_thread_idx); + #if CPTS == 1 + THROW_YASK_EXCEPTION("Internal error: vector border-code not expected with cluster-size==1"); + #else + + // Normalized vector indices. + auto norm_fvidxs = normalize_indices(sb_fvidxs); + auto norm_ovidxs = normalize_indices(sb_ovidxs); + + // Need to find range in each border part. + // 2D example w/4 edges and 4 corners: + // +---+------+---+ + // | lx| |rx | + // | ly| ly |ly | + // +---+------+---+ + // | | | | + // | lx| |rx | + // | | | | + // +---+------+---+ + // | lx| |rx | + // | ry| ry |ry | + // +---+------+---+ + // l=left or peel. + // r=right or remainder. + // Same idea for full or partial vectors, but + // different start and stop indices. + // Strictly, full vectors could be done with fewer parts since + // masking isn't needed, but full vectors are only needed when + // using clustering, and clustering is usually done at most + // along one dim, so this optimization wouldn't help much in practice. + int partn = 0; + + // Loop through progressively more intersections of domain dims, e.g., + // for 2D, do edges (1 dim), then corners (2-dim intersections); + // for 3D, do faces (1 dim), then edges (2-dim intersections), + // then corners (3-dim intersections). + for (int k = 1; k <= nddims; k++) { + + // Num of combos of 'k' dims, e.g., + // for 2D: + // k=1, edges: x, y (2); + // k=2, corners: x-y (1); + // for 3D: + // k=1, faces: x, y, z (3); + // k=2, edges: x-y, x-z, y-z (3); + // k=3, corners: x-y-z (1), + auto ncombos = n_choose_k(nddims, k); + + // Num of left-right sequences of length 'k' = 2^k, e.g., + // for 2D: + // k=1, edges: l, r (2); + // k=2, corners: l-l, l-r, r-l, r-r (4) + // for 3D: + // k=1, faces: l, r (2); + // k=2, edges: l-l, l-r, r-l, r-r (4); + // k=3, corners: l-l-l, l-l-r, l-r-l, l-r-r, + // r-l-l, r-l-r, r-r-l, r-r-r (8). 
+ auto nseqs = bit_mask_t(1) << k; + + // Process each seq of each combo, e.g., + // for 2D, 8 parts: + // k=1, edges: 2 seqs * 2 combos => 4 edges; + // k=2, corners: 4 seqs * 1 combo = 4 corners; + // for 3D, 26 parts: + // k=1, faces: 2 seqs * 3 combos => 6 faces; + // k=2, edges: 4 seqs * 3 combos => 12 edges; + // k=3, corners: 8 seqs * 1 combo => 8 corners. + + // Each combo. + for (int r = 0; r < ncombos; r++) { + + // Dims selected in this combo: 'nndims'-length + // bitset w/'k' bits set. + auto cdims = n_choose_k_set(nddims, k, r); + + // L-R seqs: 'k'-length bitset. + for (bit_mask_t lr = 0; lr < nseqs; lr++) { + partn++; + + // Normalized ranges for this part. Initialize + // each to range for non-selected dims. Strides + // are actually overridded by the STRIDE + // macros generated by the YASK compiler, so + // these settings are not needed. + auto fv_part(norm_fcidxs); + ///fv_part.stride.set_from_const(1); // 1-vector stride. + auto pv_part(norm_fvidxs); + + bool fv_needed = true; + bool pv_needed = true; + bit_mask_t pv_mask = bit_mask_t(-1); + + // Loop through each domain dim to set range for + // this combo and l-r seq. + #ifdef TRACE + std::string descr = std::string("part ") + std::to_string(partn) + ": '"; + #endif + int nsel = 0; + DOMAIN_VAR_LOOP(i, j) { + + // Is this dim selected in the current combo? + // If selected, is it left or right? + bool is_sel = is_bit_set(cdims, j); + if (is_sel) { + bool is_left = !is_bit_set(lr, nsel); + nsel++; + + // Set left-right ranges. + // See indices diagram at beginning of this function. 
+ if (is_left) { + fv_part.begin[i] = norm_fvidxs.begin[i]; + fv_part.end[i] = norm_fcidxs.begin[i]; + if (!is_bit_set(do_left_fvecs, j)) + fv_needed = false; + pv_part.begin[i] = norm_ovidxs.begin[i]; + pv_part.end[i] = norm_fvidxs.begin[i]; + pv_mask &= peel_masks[j]; + if (!is_bit_set(do_left_pvecs, j)) + pv_needed = false; + } else { + fv_part.begin[i] = norm_fcidxs.end[i]; + fv_part.end[i] = norm_fvidxs.end[i]; + if (!is_bit_set(do_right_fvecs, j)) + fv_needed = false; + pv_part.begin[i] = norm_fvidxs.end[i]; + pv_part.end[i] = norm_ovidxs.end[i]; + pv_mask &= rem_masks[j]; + if (!is_bit_set(do_right_pvecs, j)) + pv_needed = false; + } + #ifdef TRACE + if (descr.length()) + descr += " & "; + descr += std::string(is_left ? "left" : "right") + "-" + + domain_dims.get_dim_name(j); + #endif + } + } + #ifdef TRACE + descr += "'"; + #endif + + // Calc this full-vector part. + if (fv_needed) { + TRACE_MSG("calculating full vectors for " << descr << + " within normalized local indices [" << + fv_part.begin.make_val_str() << + " ... " << fv_part.end.make_val_str() << + ") by outer thread " << outer_thread_idx << + " and inner thread " << inner_thread_idx); + + calc_vectors_opt2(cp, + outer_thread_idx, inner_thread_idx, + thread_limit, fv_part, bit_mask_t(-1)); + } + //else TRACE_MSG("full vectors not needed for " << descr); + + // Calc this partial-vector part. + if (pv_needed) { + TRACE_MSG("calculating partial vectors with mask 0x" << + std::hex << pv_mask << std::dec << " for " << descr << + " within normalized local indices [" << + pv_part.begin.make_val_str() << + " ... " << pv_part.end.make_val_str() << + ") by outer thread " << outer_thread_idx << + " and inner thread " << inner_thread_idx); + + calc_vectors_opt2(cp, + outer_thread_idx, inner_thread_idx, + thread_limit, pv_part, pv_mask); + } + //else TRACE_MSG("partial vectors not needed for " << descr); + + } // L-R seqs. + } // dim combos. + } + #endif + } + + } // calc_nano_block_opt. 
+ + // Calculate a tile of clusters. + // This should be the hottest function for most stencils. + // All functions called from this one should be inlined. + // Indices must be vec-len-normalized and rank-relative. + // Static to make sure offload doesn't need 'this'. + static void + calc_clusters_opt2(StencilCoreDataT* corep, + int outer_thread_idx, + int inner_thread_idx, + int thread_limit, + ScanIndices& norm_idxs) { + + // Call code from stencil compiler. + ssc_start(); + StencilBundleImplT::calc_clusters(corep, + outer_thread_idx, inner_thread_idx, + thread_limit, norm_idxs); + ssc_stop(); + } + + // Calculate a tile of vectors using the given mask. + // All functions called from this one should be inlined. + // Indices must be vec-len-normalized and rank-relative. + // Static to make sure offload doesn't need 'this'. + static void + calc_vectors_opt2(StencilCoreDataT* corep, + int outer_thread_idx, + int inner_thread_idx, + int thread_limit, + ScanIndices& norm_idxs, + bit_mask_t mask) { + + #if CPTS == 1 + THROW_YASK_EXCEPTION("Internal error: masked-vector code not expected with cluster-size==1"); + #else + + // Call code from stencil compiler. + StencilBundleImplT::calc_vectors(corep, + outer_thread_idx, inner_thread_idx, + thread_limit, norm_idxs, mask); + #endif + } + }; // StencilBundleBase. + // A collection of independent stencil bundles. // "Independent" implies that they may be evaluated // in any order. @@ -310,7 +998,7 @@ namespace yask { const std::string& name) : ContextLinker(context), _name(name), - _stage_opts(*context->get_state()->_opts), // init w/a copy of the base settings. + _stage_opts(*context->get_state()->_actl_opts), // init w/a copy of the base settings. _at(context, &_stage_opts, name) { } virtual ~Stage() { } @@ -342,7 +1030,7 @@ namespace yask { // Otherwise, return one in context. KernelSettings& get_active_settings() { STATE_VARS(this); - return use_stage_tuners() ? _stage_opts : *opts; + return use_stage_tuners() ? 
_stage_opts : *actl_opts; } // Perf-tracking methods. diff --git a/src/kernel/lib/utils.cpp b/src/kernel/lib/utils.cpp index 1a41dabf..481b3139 100644 --- a/src/kernel/lib/utils.cpp +++ b/src/kernel/lib/utils.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -35,35 +35,26 @@ Cache cache_model(MODEL_CACHE); namespace yask { // Timer. - void YaskTimer::start(TimeSpec* ts) { + void YaskTimer::start(const TimeSpec& ts) { // Make sure timer was stopped. assert(_begin.tv_sec == 0); assert(_begin.tv_nsec == 0); - if (ts) - _begin = *ts; - else { - auto cts = get_timespec(); - _begin = cts; - } + _begin = ts; } - double YaskTimer::stop(TimeSpec* ts) { - TimeSpec end, delta; - if (ts) - end = *ts; - else { - auto cts = get_timespec(); - end = cts; - } + double YaskTimer::stop(const TimeSpec& ts) { // Make sure timer was started. assert(_begin.tv_sec != 0); + TimeSpec end = ts; + // Make sure time is going forward. assert(end.tv_sec >= _begin.tv_sec); // Elapsed time is just end - begin times. + TimeSpec delta; delta.tv_sec = end.tv_sec - _begin.tv_sec; _elapsed.tv_sec += delta.tv_sec; @@ -93,270 +84,8 @@ namespace yask { return double(delta.tv_sec) + double(delta.tv_nsec) * 1e-9; } - // Aligned allocation. - char* yask_aligned_alloc(std::size_t nbytes) { - - // Alignment to use based on size. - const size_t _def_alignment = CACHELINE_BYTES; - const size_t _def_big_alignment = YASK_HUGE_ALIGNMENT; - size_t align = (nbytes >= _def_big_alignment) ? - _def_big_alignment : _def_alignment; - void *p = 0; - - // Some envs have posix_memalign(), some have aligned_alloc(). 
-#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 - int ret = posix_memalign(&p, align, nbytes); - if (ret) p = 0; -#else - p = aligned_alloc(align, nbytes); -#endif - - if (!p) - THROW_YASK_EXCEPTION("error: cannot allocate " + make_byte_str(nbytes) + - " aligned to " + make_byte_str(align)); - return static_cast(p); - } - -#ifdef USE_PMEM - static int pmem_tmpfile(const char *dir, size_t size, int *fd, void **addr) - { - static char tmpl[] = "/appdirect_mem_xxxxxx"; - int err = 0; - - char fullname[strlen(dir) + sizeof (tmpl)]; - (void) strcpy(fullname, dir); - (void) strcat(fullname, tmpl); - - if ((*fd = mkstemp(fullname)) < 0) { - perror("mkstemp()"); - err = MEMKIND_ERROR_RUNTIME; - THROW_YASK_EXCEPTION("Error: MEMKIND_ERROR_RUNTIME - mkstemp()\n"); - } - - (void) unlink(dir); - - if (ftruncate(*fd, size) != 0) { - err = MEMKIND_ERROR_RUNTIME; - THROW_YASK_EXCEPTION("Error: MEMKIND_ERROR_RUNTIME - ftruncate()\n"); - } - - *addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, *fd, 0); - if (*addr == MAP_FAILED) { - err = MEMKIND_ERROR_RUNTIME; - THROW_YASK_EXCEPTION("Error: MEMKIND_ERROR_RUNTIME - mmap()\n"); - } - - return err; - } -#endif - - // NUMA allocation. - // 'numa_pref' == yask_numa_none: use default aligned alloc. - // 'numa_pref' >= 0: preferred NUMA node. - // 'numa_pref' < 0: use NUMA policy corresponding to value. - // TODO: get rid of magic-number scheme. - char* numa_alloc(std::size_t nbytes, int numa_pref) { - - void *p = 0; - - if (numa_pref == yask_numa_none) - return yask_aligned_alloc(nbytes); - -#ifdef USE_NUMA - - // Should we use the numa policy library? -#ifdef USE_NUMA_POLICY_LIB -#pragma omp single - else if (numa_available() != -1) { - numa_set_bind_policy(0); - if (numa_pref >= 0 && numa_pref <= numa_max_node()) - numa_alloc_onnode(nbytes, numa_pref); - else - numa_alloc_local(nbytes); - // Interleaved not available. 
- } - else - THROW_YASK_EXCEPTION("Error: explicit NUMA policy allocation is not available"); - - // Use mmap/mbind explicitly. -#else - else if (get_mempolicy(NULL, NULL, 0, 0, 0) == 0) { - - // Set mmap flags. - int mmprot = PROT_READ | PROT_WRITE; - int mmflags = MAP_PRIVATE | MAP_ANONYMOUS; - - // Get an anonymous R/W memory map. - p = mmap(0, nbytes, mmprot, mmflags, -1, 0); - - // If successful, apply the desired binding. - if (p && p != MAP_FAILED) { - if (numa_pref >= 0) { - - // Prefer given node. - unsigned long nodemask = 0x1UL << numa_pref; - mbind(p, nbytes, MPOL_PREFERRED, &nodemask, sizeof(nodemask) * 8, 0); - } - else if (numa_pref == yask_numa_interleave) { - - // Use all nodes. - unsigned long nodemask = (unsigned long)-1; - mbind(p, nbytes, MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8, 0); - } - - else{ - - // Use local node. - // MPOL_LOCAL was defined in Linux 3.8, so use - // MPOL_DEFAULT as backup on old systems. -#ifdef MPOL_LOCAL - mbind(p, nbytes, MPOL_LOCAL, 0, 0, 0); -#else - mbind(p, nbytes, MPOL_DEFAULT, 0, 0, 0); -#endif - } - } - else - THROW_YASK_EXCEPTION("Error: anonymous mmap of " + make_byte_str(nbytes) + - " failed"); - } - else - THROW_YASK_EXCEPTION("Error: explicit NUMA policy allocation is not available"); - -#endif // not USE_NUMA_POLICY_LIB. - -#else - THROW_YASK_EXCEPTION("Error: NUMA allocation is not enabled; build with numa=1"); -#endif // USE_NUMA. - - // Should not get here w/null p; throw exception. - if (!p) - THROW_YASK_EXCEPTION("Error: cannot allocate " + make_byte_str(nbytes) + - " using numa-node (or policy) " + to_string(numa_pref)); - - // Check alignment. - if ((size_t(p) & (CACHELINE_BYTES - 1)) != 0) - FORMAT_AND_THROW_YASK_EXCEPTION("Error: NUMA-allocated " << p << " is not " << - CACHELINE_BYTES << "-byte aligned"); - - // Return as a char* as required for shared_ptr ctor. - return static_cast(p); - } - - // Reverse numa_alloc(). 
- void NumaDeleter::operator()(char* p) { - - if (p && _numa_pref == yask_numa_none) { - free(p); - p = NULL; - } - -#ifdef USE_NUMA -#ifdef USE_NUMA_POLICY_LIB - if (p && numa_available() != -1) { - numa_free(p, _nbytes); - p = NULL; - } -#else - if (p && get_mempolicy(NULL, NULL, 0, 0, 0) == 0) { - munmap(p, _nbytes); - p = NULL; - } -#endif -#endif - if (p) { - free(p); - p = NULL; - } - } - - // PMEM allocation. - char* pmem_alloc(std::size_t nbytes, int dev_num) { - - void *p = 0; - - // Allocate into pmem. -#ifdef USE_PMEM - int err = 0; - int fd; - // 'X' of pmem_x should be matched with the NUMA node. - string pmem_name("/mnt/pmem"); - pmem_name += to_string(dev_num); - err = pmem_tmpfile(pmem_name.c_str(), nbytes, &fd, &p); - if (err) - THROW_YASK_EXCEPTION("Error: Unable to create temporary file for PMEM"); -#else - THROW_YASK_EXCEPTION("Error: PMEM allocation is not enabled; build with pmem=1"); -#endif - - if (!p) - THROW_YASK_EXCEPTION("Error: cannot allocate " + make_byte_str(nbytes) + - " on pmem dev " + to_string(dev_num)); - - // Check alignment. - if ((size_t(p) & (CACHELINE_BYTES - 1)) != 0) - FORMAT_AND_THROW_YASK_EXCEPTION("Error: PMEM-allocated " << p << " is not " << - CACHELINE_BYTES << "-byte aligned"); - - // Return as a char* as required for shared_ptr ctor. - return static_cast(p); - } - - // Reverse pmem_alloc(). - void PmemDeleter::operator()(char* p) { - if (p) { - munmap(p, _nbytes); - p = NULL; - } - } - - // MPI shm allocation. - char* shm_alloc(std::size_t nbytes, - const MPI_Comm* shm_comm, MPI_Win* shm_win) { - - void *p = 0; - - // Allocate using MPI shm. 
-#ifdef USE_MPI - assert(shm_comm); - assert(shm_win); - MPI_Info win_info; - MPI_Info_create(&win_info); - MPI_Info_set(win_info, "alloc_shared_noncontig", "true"); - MPI_Win_allocate_shared(nbytes, 1, win_info, *shm_comm, &p, shm_win); - MPI_Info_free(&win_info); - MPI_Win_lock_all(0, *shm_win); -#else - THROW_YASK_EXCEPTION("Error: MPI shm allocation is not enabled; build with mpi=1"); -#endif - - if (!p) - THROW_YASK_EXCEPTION("Error: cannot allocate " + make_byte_str(nbytes) + - " using MPI shm"); - - // Check alignment. - if ((size_t(p) & (CACHELINE_BYTES - 1)) != 0) - FORMAT_AND_THROW_YASK_EXCEPTION("Error: MPI shm-allocated " << p << " is not " << - CACHELINE_BYTES << "-byte aligned"); - - // Return as a char* as required for shared_ptr ctor. - return static_cast(p); - } - - // Reverse shm_alloc(). - void ShmDeleter::operator()(char* p) { - -#ifdef USE_MPI - assert(_shm_comm); - assert(_shm_win); - MPI_Win_unlock_all(*_shm_win); - MPI_Win_free(_shm_win); - p = NULL; -#else - THROW_YASK_EXCEPTION("Error: MPI shm deallocation is not enabled; build with mpi=1"); -#endif - } - + ////// MPI utils ////// + // Find sum of rank_vals over all ranks. idx_t sum_over_ranks(idx_t rank_val, MPI_Comm comm) { idx_t sum_val = rank_val; @@ -422,11 +151,15 @@ namespace yask { pos += words[i].length(); } os << endl; + + // Print current value. + os << _help_leader << _current_value_str; + print_value(os) << ".\n"; } // Check for matching option to "-"str at args[argi]. // Return true and increment argi if match. 
- bool CommandLineParser::OptionBase::_check_arg(const std::vector& args, + bool CommandLineParser::OptionBase::_is_opt(const string_vec& args, int& argi, const std::string& str) const { @@ -451,7 +184,7 @@ namespace yask { const char* nptr = args[argi].c_str(); char* endptr = 0; double val = strtod(nptr, &endptr); - if (val == HUGE_VAL || val == -HUGE_VAL || *endptr != '\0') { + if (!isfinite(val) || *endptr != '\0') { THROW_YASK_EXCEPTION("Error: argument for option '" + args[argi - 1] + "' is not a valid floating-point number"); } @@ -481,15 +214,29 @@ namespace yask { return idx_t(val); } + // Get one string value from args[argi]. + // On failure, print msg using string from args[argi-1] and exit. + // On success, increment argi and return value. + string CommandLineParser::OptionBase::_string_val(const vector& args, + int& argi) + { + if (size_t(argi) >= args.size()) + THROW_YASK_EXCEPTION("Error: no argument for option '" + args[argi - 1] + "'"); + + auto v = args[argi]; + argi++; + return v; + } + // Check for a boolean option. - bool CommandLineParser::BoolOption::check_arg(const std::vector& args, + bool CommandLineParser::BoolOption::check_arg(const string_vec& args, int& argi) { - if (_check_arg(args, argi, _name)) { + if (_is_opt(args, argi, _name)) { _val = true; return true; } string false_name = string("no-") + _name; - if (_check_arg(args, argi, false_name)) { + if (_is_opt(args, argi, false_name)) { _val = false; return true; } @@ -500,14 +247,12 @@ namespace yask { void CommandLineParser::BoolOption::print_help(ostream& os, int width) const { _print_help(os, string("[no-]" + _name), width); - os << _help_leader << _current_value_str << - (_val ? "true" : "false") << "." << endl; } // Check for a double option. 
- bool CommandLineParser::DoubleOption::check_arg(const std::vector& args, + bool CommandLineParser::DoubleOption::check_arg(const string_vec& args, int& argi) { - if (_check_arg(args, argi, _name)) { + if (_is_opt(args, argi, _name)) { _val = _double_val(args, argi); return true; } @@ -518,14 +263,12 @@ namespace yask { void CommandLineParser::DoubleOption::print_help(ostream& os, int width) const { _print_help(os, _name + " ", width); - os << _help_leader << _current_value_str << - _val << "." << endl; } // Check for an int option. - bool CommandLineParser::IntOption::check_arg(const std::vector& args, + bool CommandLineParser::IntOption::check_arg(const string_vec& args, int& argi) { - if (_check_arg(args, argi, _name)) { + if (_is_opt(args, argi, _name)) { _val = (int)_idx_val(args, argi); // TODO: check for over/underflow. return true; } @@ -536,14 +279,12 @@ namespace yask { void CommandLineParser::IntOption::print_help(ostream& os, int width) const { _print_help(os, _name + " ", width); - os << _help_leader << _current_value_str << - _val << "." << endl; } // Check for an idx_t option. - bool CommandLineParser::IdxOption::check_arg(const std::vector& args, + bool CommandLineParser::IdxOption::check_arg(const string_vec& args, int& argi) { - if (_check_arg(args, argi, _name)) { + if (_is_opt(args, argi, _name)) { _val = _idx_val(args, argi); return true; } @@ -554,27 +295,18 @@ namespace yask { void CommandLineParser::IdxOption::print_help(ostream& os, int width) const { _print_help(os, _name + " ", width); - os << _help_leader << _current_value_str << - _val << "." << endl; } // Print help on an multi-idx_t option. void CommandLineParser::MultiIdxOption::print_help(ostream& os, int width) const { _print_help(os, _name + " ", width); - os << _help_leader << _current_value_str; - for (size_t i = 0; i < _vals.size(); i++) { - if (i > 0) - os << ", "; - os << *_vals[i]; - } - os << "." << endl; } // Check for an multi-idx_t option. 
- bool CommandLineParser::MultiIdxOption::check_arg(const std::vector& args, + bool CommandLineParser::MultiIdxOption::check_arg(const string_vec& args, int& argi) { - if (_check_arg(args, argi, _name)) { + if (_is_opt(args, argi, _name)) { idx_t val = _idx_val(args, argi); for (size_t i = 0; i < _vals.size(); i++) *_vals[i] = val; @@ -583,19 +315,75 @@ namespace yask { return false; } + // Check for a string option. + bool CommandLineParser::StringOption::check_arg(const string_vec& args, + int& argi) { + if (_is_opt(args, argi, _name)) { + _val = _string_val(args, argi); + return true; + } + return false; + } + + // Print help on a string option. + void CommandLineParser::StringOption::print_help(ostream& os, + int width) const { + _print_help(os, _name + " ", width); + } + + // Check for a string-list option. + bool CommandLineParser::StringListOption::check_arg(const string_vec& args, + int& argi) { + if (_is_opt(args, argi, _name)) { + _val.clear(); + string strs = _string_val(args, argi); + stringstream ss(strs); + string str; + while (getline(ss, str, ',')) { + if (_allowed_strs.size() && _allowed_strs.count(str) == 0) { + THROW_YASK_EXCEPTION("Error: illegal argument '" + str + "' to option '" + + args[argi - 2] + "'"); + } + _val.push_back(str); + } + return true; + } + return false; + } + + // Print help on a string-list option. + void CommandLineParser::StringListOption::print_help(ostream& os, + int width) const { + _print_help(os, _name + " ", width); + } + // Print help on all options. void CommandLineParser::print_help(ostream& os) const { for (auto oi : _opts) { - const auto* opt = oi.second; + const auto opt = oi.second; opt->print_help(os, _width); } } + // Print settings of all options. 
+ void CommandLineParser::print_values(ostream& os) const { + const size_t name_wid = 22; + for (auto oi : _opts) { + const auto& name = oi.first; + const auto& opt = oi.second; + os << " " << name << ": "; + if (name.length() < name_wid) + for (size_t i = 0; i < name_wid - name.length(); i++) + os << " "; + opt->print_value(os) << endl; + } + } + // Parse options from the command-line and set corresponding vars. // Recognized strings from args are consumed, and unused ones // are returned. string CommandLineParser::parse_args(const std::string& pgm_name, - const std::vector& args) { + const string_vec& args) { vector non_args; // Loop through strings in args. @@ -604,7 +392,7 @@ namespace yask { // Compare against all registered options. bool matched = false; for (auto oi : _opts) { - auto* opt = oi.second; + auto opt = oi.second; // If a match is found, argi will be incremeted // as needed beyond option and/or its arg. @@ -635,33 +423,36 @@ namespace yask { // Tokenize args from a string. vector CommandLineParser::set_args(const string& arg_string) { - string tmp; - bool in_quotes = false; + string tmp; // current arg. + char in_quote = '\0'; // current string delimiter or null if none. vector args; for (char c : arg_string) { - // If WS, start a new string unless in quotes. - if (isspace(c)) { - if (in_quotes) - tmp += c; - else { - if (tmp.length()) - args.push_back(tmp); - tmp.clear(); - } - } + // If in quotes, add to string or handle end. + if (in_quote != '\0') { - // If quote, start or end double-quotes. - // TODO: handle single-quotes. - else if (c == '"') { - if (in_quotes) { - if (tmp.length()) - args.push_back(tmp); + // End of quoted string, i.e., this char + // matches opening quote. + if (in_quote == c) { + args.push_back(tmp); // may be empty string. tmp.clear(); - in_quotes = false; + in_quote = '\0'; } + else - in_quotes = true; + tmp += c; + } + + // If WS, save old string and start a new string. 
+ else if (isspace(c)) { + if (tmp.length()) + args.push_back(tmp); + tmp.clear(); + } + + // If quote, remember delimiter. + else if (c == '"' || c == '\'') { + in_quote = c; } // Otherwise, just add to tmp. @@ -669,6 +460,10 @@ namespace yask { tmp += c; } + if (in_quote != '\0') + THROW_YASK_EXCEPTION("Error: unterminated quote in '" + + arg_string + "'"); + // Last string. if (tmp.length()) args.push_back(tmp); diff --git a/src/kernel/lib/utils.hpp b/src/kernel/lib/utils.hpp index c2301d18..4c8699ab 100644 --- a/src/kernel/lib/utils.hpp +++ b/src/kernel/lib/utils.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -25,42 +25,13 @@ IN THE SOFTWARE. #pragma once -// Provide the needed definitions for NUMA support. -// This is fairly convoluted because of the inconsistency of -// support on various OS releases. -// The USE_NUMA* vars are set in the Makefile. -#ifdef USE_NUMA - -// Use numa policy library? -#ifdef USE_NUMA_POLICY_LIB -#include - -// Use if available. -#elif defined(USE_NUMAIF_H) -#include - -// This is a hack, but some systems are missing . -#elif !defined(NUMAIF_H) -extern "C" { - extern long get_mempolicy(int *policy, const unsigned long *nmask, - unsigned long maxnode, void *addr, int flags); - extern long mbind(void *start, unsigned long len, int mode, - const unsigned long *nmask, unsigned long maxnode, unsigned flags); -} - -// Conservatively don't define MPOL_LOCAL. -#define MPOL_DEFAULT 0 -#define MPOL_PREFERRED 1 -#define MPOL_BIND 2 -#define MPOL_INTERLEAVE 3 - -#endif -#endif +// Misc utilities. namespace yask { // Fatal error. 
- inline void exit_yask(int code) { + [[noreturn]] inline + void exit_yask(int code) { #ifdef USE_MPI int flag; @@ -75,108 +46,21 @@ namespace yask { exit(code); } + // Get an int from an env var. + inline int get_env_int(const std::string& name, int def) { + int res = def; + char* s = getenv(name.c_str()); + if (s) + res = atoi(s); + return res; + } + // Find sum of rank_vals over all ranks. extern idx_t sum_over_ranks(idx_t rank_val, MPI_Comm comm); // Make sure rank_val is same over all ranks. extern void assert_equality_over_ranks(idx_t rank_val, MPI_Comm comm, - const std::string& descr); - - // Helpers for aligned malloc and free. - extern char* yask_aligned_alloc(std::size_t nbytes); - class AlignedDeleter { - public: - void operator()(char* p) { - if (p) { - std::free(p); - p = NULL; - } - } - }; - - // Alloc aligned data as a shared ptr. - template - std::shared_ptr shared_aligned_alloc(size_t sz) { - auto _base = std::shared_ptr(yask_aligned_alloc(sz), AlignedDeleter()); - return _base; - } - - // Helpers for NUMA malloc and free. - extern char* numa_alloc(std::size_t nbytes, int numa_pref); - struct NumaDeleter { - std::size_t _nbytes; - int _numa_pref; - - // Ctor saves data needed for freeing. - NumaDeleter(std::size_t nbytes, int numa_pref) : - _nbytes(nbytes), - _numa_pref(numa_pref) - { } - - // Free p. - void operator()(char* p); - }; - - // Allocate NUMA memory from preferred node. - template - std::shared_ptr shared_numa_alloc(size_t sz, int numa_pref) { - auto _base = std::shared_ptr(numa_alloc(sz, numa_pref), - NumaDeleter(sz, numa_pref)); - return _base; - } - - // Helpers for PMEM malloc and free. - extern char* pmem_alloc(std::size_t nbytes, int dev_num); - struct PmemDeleter { - std::size_t _nbytes; - int _dev_num; - - // Ctor saves data needed for freeing. - PmemDeleter(std::size_t nbytes, int dev_num) : - _nbytes(nbytes), - _dev_num(dev_num) - { } - - // Free p. 
- void operator()(char* p); - }; - - // Allocate PMEM memory from given device. - template - std::shared_ptr shared_pmem_alloc(size_t sz, int dev_num) { - auto _base = std::shared_ptr(pmem_alloc(sz, dev_num), - PmemDeleter(sz, dev_num)); - return _base; - } - - // Helpers for MPI shm malloc and free. - extern char* shm_alloc(std::size_t nbytes, - const MPI_Comm* shm_comm, MPI_Win* shm_win); - struct ShmDeleter { - std::size_t _nbytes; - const MPI_Comm* _shm_comm; - MPI_Win* _shm_win; - - // Ctor saves data needed for freeing. - ShmDeleter(std::size_t nbytes, - const MPI_Comm* shm_comm, MPI_Win* shm_win): - _nbytes(nbytes), - _shm_comm(shm_comm), - _shm_win(shm_win) - { } - - // Free p. - void operator()(char* p); - }; - - // Allocate MPI shm memory. - template - std::shared_ptr shared_shm_alloc(size_t sz, - const MPI_Comm* shm_comm, MPI_Win* shm_win) { - auto _base = std::shared_ptr(shm_alloc(sz, shm_comm, shm_win), - ShmDeleter(sz, shm_comm, shm_win)); - return _base; - } + const std::string& descr); // A class for a simple producer-consumer memory lock on one item. class SimpleLock { @@ -192,6 +76,7 @@ namespace yask { }; LockVal _write_count, _read_count; + LockVal _data; // Optional simple data field. static constexpr idx_t _ival = 1000; @@ -264,9 +149,26 @@ namespace yask { _write_count.val++; _check("mark_write_done"); } + + // Access data value. + // Of course, other data can be gated w/this lock. + idx_t get_data() const { + return _data.val; + } + void set_data(idx_t v) { + _data.val = v; + } }; // A class for maintaining elapsed time. + // NOT a virtual class. + // Example: + // time ---> + // start() ... stop() ... start() ... stop() ... get_elapsed_time() + // | A secs | | B secs | + // 1st call to stop() returns A. + // 2nd call to stop() returns B. + // Call to get_elapsed_time() returns A + B. 
class YaskTimer { /* struct timespec { @@ -281,7 +183,7 @@ namespace yask { typedef struct timespec TimeSpec; YaskTimer() { clear(); } - virtual ~YaskTimer() { } + ~YaskTimer() { } // Reset elapsed time to zero. void clear() { @@ -289,7 +191,8 @@ namespace yask { _begin.tv_nsec = _elapsed.tv_nsec = 0; } - // Make a timespec that can be used for mutiple calls. + // Make a current timespec to be provided to start() or stop(). + // This allows multiple timers to use the same timespec. static TimeSpec get_timespec() { TimeSpec ts; clock_gettime(CLOCK_REALTIME, &ts); @@ -300,15 +203,24 @@ namespace yask { // start() and stop() can be called multiple times in // pairs before calling get_elapsed_secs(), which // will return the cumulative time over all timed regions. - void start(TimeSpec* ts = NULL); + void start(const TimeSpec& ts); + void start() { + auto ts = get_timespec(); + start(ts); + } // End a timed region. // Return time since previous call to start(); this is *not* // generally the same as the value returned by get_elapsed_secs(). - double stop(TimeSpec* ts = NULL); + double stop(const TimeSpec& ts); + double stop() { + auto ts = get_timespec(); + return stop(ts); + } - // Get elapsed time between preceding start/stop pairs. - // Does not reset value, so it may be used for cumulative time. + // Get elapsed time between all preceding start/stop pairs since + // object creation or previous call to clear(). Does not reset + // value, so it may be used for querying cumulative time. double get_elapsed_secs() const { // Make sure timer was stopped. @@ -317,7 +229,7 @@ namespace yask { return double(_elapsed.tv_sec) + double(_elapsed.tv_nsec) * 1e-9; } - // Get elapsed time since last start. + // Get elapsed time since previous start. // Used to check time w/o stopping timer. double get_secs_since_start() const; }; @@ -327,7 +239,7 @@ namespace yask { public: - // Base class for an allowed option. + // Base class for a command-line option. 
class OptionBase { protected: std::string _name; @@ -342,16 +254,20 @@ namespace yask { // Check for matching option to str at args[argi]. // Return true and increment argi if match. - virtual bool _check_arg(const std::vector& args, int& argi, - const std::string& str) const; + virtual bool _is_opt(const string_vec& args, int& argi, + const std::string& str) const; // Get one double value from args[argi++]. // Exit on failure. - virtual double _double_val(const std::vector& args, int& argi); + virtual double _double_val(const string_vec& args, int& argi); // Get one idx_t value from args[argi++]. // Exit on failure. - virtual idx_t _idx_val(const std::vector& args, int& argi); + virtual idx_t _idx_val(const string_vec& args, int& argi); + + // Get one string value from args[argi++]. + // Exit on failure. + virtual std::string _string_val(const string_vec& args, int& argi); public: OptionBase(const std::string& name, @@ -376,12 +292,16 @@ namespace yask { _print_help(os, _name, width); } + // Print current value of this option. + virtual std::ostream& print_value(std::ostream& os) const =0; + // Check for matching option and any needed args at args[argi]. // Return true, set val, and increment argi if match. - virtual bool check_arg(const std::vector& args, int& argi) =0; + virtual bool check_arg(const string_vec& args, int& argi) =0; }; + typedef std::shared_ptr OptionPtr; - // An allowed boolean option. + // A boolean option. class BoolOption : public OptionBase { bool& _val; @@ -392,11 +312,15 @@ namespace yask { OptionBase(name, help_msg), _val(val) { } virtual void print_help(std::ostream& os, - int width) const; - virtual bool check_arg(const std::vector& args, int& argi); + int width) const override; + virtual std::ostream& print_value(std::ostream& os) const override { + os << (_val ? "true" : "false"); + return os; + } + virtual bool check_arg(const string_vec& args, int& argi) override; }; - // An allowed int option. + // An int option. 
class IntOption : public OptionBase { int& _val; @@ -407,11 +331,15 @@ namespace yask { OptionBase(name, help_msg), _val(val) { } virtual void print_help(std::ostream& os, - int width) const; - virtual bool check_arg(const std::vector& args, int& argi); + int width) const override; + virtual std::ostream& print_value(std::ostream& os) const override { + os << _val; + return os; + } + virtual bool check_arg(const string_vec& args, int& argi) override; }; - // An allowed double option. + // A double option. class DoubleOption : public OptionBase { double& _val; @@ -422,11 +350,15 @@ namespace yask { OptionBase(name, help_msg), _val(val) { } virtual void print_help(std::ostream& os, - int width) const; - virtual bool check_arg(const std::vector& args, int& argi); + int width) const override; + virtual std::ostream& print_value(std::ostream& os) const override { + os << _val; + return os; + } + virtual bool check_arg(const string_vec& args, int& argi) override; }; - // An allowed idx_t option. + // An idx_t option. class IdxOption : public OptionBase { idx_t& _val; @@ -437,11 +369,15 @@ namespace yask { OptionBase(name, help_msg), _val(val) { } virtual void print_help(std::ostream& os, - int width) const; - virtual bool check_arg(const std::vector& args, int& argi); + int width) const override; + virtual std::ostream& print_value(std::ostream& os) const override { + os << _val; + return os; + } + virtual bool check_arg(const string_vec& args, int& argi) override; }; - // An allowed idx_t option that sets multiple vars. + // An idx_t option that sets multiple vars. 
class MultiIdxOption : public OptionBase { std::vector _vals; @@ -454,31 +390,80 @@ namespace yask { } virtual void print_help(std::ostream& os, - int width) const; - virtual bool check_arg(const std::vector& args, - int& argi); + int width) const override; + virtual std::ostream& print_value(std::ostream& os) const override { + for (size_t i = 0; i < _vals.size(); i++) { + if (i > 0) + os << ", "; + os << *_vals[i]; + } + return os; + } + virtual bool check_arg(const string_vec& args, + int& argi) override; + }; + + // A string option. + class StringOption : public OptionBase { + std::string& _val; + + public: + StringOption(const std::string& name, + const std::string& help_msg, + std::string& val) : + OptionBase(name, help_msg), _val(val) { } + + virtual void print_help(std::ostream& os, + int width) const override; + virtual std::ostream& print_value(std::ostream& os) const override { + os << _val; + return os; + } + virtual bool check_arg(const string_vec& args, int& argi) override; + }; + + // A list-of-strings option. + class StringListOption : public OptionBase { + std::set _allowed_strs; // empty to allow any strings. + string_vec& _val; + + public: + StringListOption(const std::string& name, + const std::string& help_msg, + std::set allowed_strs, + string_vec& val) : + OptionBase(name, help_msg), + _allowed_strs(allowed_strs), _val(val) { } + + virtual void print_help(std::ostream& os, + int width) const override; + virtual std::ostream& print_value(std::ostream& os) const override { + int n = 0; + for (auto& v : _val) { + if (n) + os << ","; + os << v; + n++; + } + return os; + } + virtual bool check_arg(const string_vec& args, int& argi) override; }; protected: - std::map _opts; - int _width; + std::map _opts; + int _width = 78; public: // Ctor. - CommandLineParser() : _width(78) { } + CommandLineParser() { } // Dtor. - ~CommandLineParser() { - - // Delete options. 
- for (auto i : _opts) { - delete i.second; - } - } + virtual ~CommandLineParser() { } // Tokenize args from a string. - static std::vector set_args(const std::string& arg_string); + static string_vec set_args(const std::string& arg_string); // Set help width. virtual void set_width(int width) { @@ -486,19 +471,21 @@ namespace yask { } // Add an allowed option. - // Options will be deleted upon destruction. - virtual void add_option(OptionBase* opt) { + virtual void add_option(OptionPtr opt) { _opts[opt->get_name()] = opt; } // Print help info on all options. virtual void print_help(std::ostream& os) const; + // Print current settings of all options. + virtual void print_values(std::ostream& os) const; + // Parse options from 'args' and set corresponding vars. // Recognized strings from args are consumed, and unused ones // remain for further processing by the application. virtual std::string parse_args(const std::string& pgm_name, - const std::vector& args); + const string_vec& args); // Same as above, but splits 'arg_string' into tokens. virtual std::string parse_args(const std::string& pgm_name, @@ -511,7 +498,7 @@ namespace yask { // and rest of argv is parsed. 
virtual std::string parse_args(int argc, char** argv) { std::string pgm_name = argv[0]; - std::vector args; + string_vec args; for (int i = 1; i < argc; i++) args.push_back(argv[i]); return parse_args(pgm_name, args); diff --git a/src/kernel/lib/yask.hpp b/src/kernel/lib/yask.hpp index 4a3642e5..b61a69b1 100644 --- a/src/kernel/lib/yask.hpp +++ b/src/kernel/lib/yask.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -44,6 +44,7 @@ typedef int MPI_Comm; typedef int MPI_Win; typedef int MPI_Group; typedef int MPI_Request; +typedef int MPI_Status; #define MPI_PROC_NULL (-1) #define MPI_COMM_NULL ((MPI_Comm)0x04000000) #define MPI_REQUEST_NULL ((MPI_Request)0x2c000000) @@ -66,39 +67,29 @@ typedef int MPI_Request; #include #include #include -#include -#include +#include #include #include #include #include -#include #include +#include +#include +#include #include #include #include -#include -#include -#include #include #include #include #include -#ifdef USE_PMEM -#include -#include -#endif - -// Conditional inlining -#if defined(USE_ALWAYS_INLINE) && !defined(CHECK) -#define ALWAYS_INLINE __attribute__((always_inline)) inline -#else -#define ALWAYS_INLINE inline -#endif -// Additional type for unsigned indices. +// Type for unsigned indices. typedef std::uint64_t uidx_t; +// Type for bitmasks. +typedef std::uint64_t bit_mask_t; + // Common utilities. #include "common_utils.hpp" @@ -110,27 +101,78 @@ typedef std::uint64_t uidx_t; // Simple macros and stubs. -#ifndef NO_VEC +// OMP offload (cannot be in offload.hpp because it's needed earlier). 
+#ifdef USE_OFFLOAD +#ifndef _OPENMP +#error Offload enabled without OpenMP enabled +#endif + +#ifdef USE_OFFLOAD_USM +#pragma omp requires unified_shared_memory +#else +#define USE_OFFLOAD_NO_USM +#endif + +#define OMP_DECL_TARGET _Pragma("omp declare target") +#define OMP_END_DECL_TARGET _Pragma("omp end declare target") + +#else +#define OMP_DECL_TARGET +#define OMP_END_DECL_TARGET + +#endif + +// Conditional inlining +#if !defined(NO_INLINE) && !defined(CHECK) +#ifndef ALWAYS_INLINE +#define ALWAYS_INLINE __attribute__((always_inline)) inline +#endif +#ifndef FORCE_INLINE +#define FORCE_INLINE _Pragma("forceinline") +#endif +#ifndef FORCE_INLINE_RECURSIVE +#define FORCE_INLINE_RECURSIVE _Pragma("forceinline recursive") +#endif + +#else +#ifndef ALWAYS_INLINE +#define ALWAYS_INLINE inline +#endif +#ifndef FORCE_INLINE +#define FORCE_INLINE +#endif +#ifndef FORCE_INLINE_RECURSIVE +#define FORCE_INLINE_RECURSIVE +#endif +#endif + +// Vector pragmas supported by classic and LLVM-based Intel compilers. +#ifndef NO_PRAGMA_VEC1 #define _NO_VECTOR _Pragma("novector") -#define _VEC_ALIGNED _Pragma("vector aligned") -#define _VEC_UNALIGNED _Pragma("vector unaligned") #define _VEC_ALWAYS _Pragma("vector always") +#define _VEC_ALIGNED _Pragma("vector aligned") #define _VEC_STREAMING _Pragma("vector nontemporal") #else #define _NO_VECTOR -#define _VEC_ALIGNED -#define _VEC_UNALIGNED #define _VEC_ALWAYS +#define _VEC_ALIGNED #define _VEC_STREAMING #endif -#ifndef NO_SIMD +// Vector pragmas supported by classic but not LLVM-based Intel compiler. +#ifndef NO_PRAGMA_VEC2 +#define _VEC_UNALIGNED _Pragma("vector unaligned") +#else +#define _VEC_UNALIGNED +#endif + +#ifndef NO_PRAGMA_SIMD #define _SIMD _Pragma("omp simd") #else #define _SIMD #endif -#ifndef NO_UNROLL +#ifndef NO_PRAGMA_UNROLL #define _UNROLL _Pragma("unroll") #else #define _UNROLL @@ -142,26 +184,28 @@ typedef std::uint64_t uidx_t; #endif // VTune or stubs. 
+// https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/top/api-support/instrumentation-and-tracing-technology-apis/instrumentation-tracing-technology-api-reference/collection-control-api.html #ifdef USE_VTUNE #include "ittnotify.h" #define VTUNE_PAUSE __itt_pause() #define VTUNE_RESUME __itt_resume() +#define VTUNE_DETACH __itt_detach() #else #define VTUNE_PAUSE ((void)0) #define VTUNE_RESUME ((void)0) +#define VTUNE_DETACH ((void)0) #endif -// Stringizing hacks for the C preprocessor. -#define YSTR1(s) #s -#define YSTR2(s) YSTR1(s) - // Default alloc settings. #define CACHELINE_BYTES (64) #define YASK_PAD (3) // cache-lines between data buffers. #define YASK_PAD_BYTES (CACHELINE_BYTES * YASK_PAD) #define YASK_HUGE_ALIGNMENT (2 * 1024 * 1024) // 2MiB-page for large allocs. #define CACHE_ALIGNED __attribute__ ((aligned (CACHELINE_BYTES))) -#ifndef USE_NUMA +#ifdef USE_OFFLOAD +#undef NUMA_PREF +#define NUMA_PREF yask_numa_offload +#elif !defined USE_NUMA #undef NUMA_PREF #define NUMA_PREF yask_numa_none #elif !defined NUMA_PREF @@ -178,50 +222,69 @@ typedef std::uint64_t uidx_t; KernelEnv::unset_debug_lock(); \ } while(0) -// 'state' is a pointer to a KernelState. -#define DEBUG_MSG1(state, msg) do { \ - if (state->_env->_debug.get()) { \ - auto& os = state->_env->_debug.get()->get_ostream(); \ - DEBUG_MSG0(os, msg); \ - } } while(0) - -// Macro for debug message when 'state' is defined. -#define DEBUG_MSG(msg) DEBUG_MSG1(state, msg) +#define DEBUG_MSG(msg) do { \ + auto dbg = yk_env::get_debug_output(); \ + auto& os = dbg.get()->get_ostream(); \ + DEBUG_MSG0(os, msg); \ + } while(0) // Macro for trace message. // Enabled only if compiled with TRACE macro and run with -trace option. #ifdef TRACE - -// 'os is an ostream. -#define TRACE_MSG0(os, msg) do { \ - if (state->_env->_trace) { \ - DEBUG_MSG0(os, "YASK: " << msg); \ - } } while(0) - -// 'state' is a pointer to a KernelState. 
-#define TRACE_MSG1(state, msg) do { \ - if (state->_env->_trace) { \ - DEBUG_MSG1(state, "YASK: " << msg); \ +#ifdef TRACE_FULL_FN +# if defined __cplusplus ? __GNUC_PREREQ (2, 6) : __GNUC_PREREQ (2, 4) +# define __TRACE_FN __PRETTY_FUNCTION__ +# else +# define __TRACE_FN " unknown function" +# endif +#else +# define __TRACE_FN __func__ +#endif +#define TRACE_MSG(msg) do { \ + if (KernelEnv::_trace) { \ + std::string fname(__FILE__); \ + const auto last_slash_idx = fname.find_last_of("/"); \ + if (std::string::npos != last_slash_idx) \ + fname.erase(0, last_slash_idx + 1); \ + DEBUG_MSG("YASK: " << __TRACE_FN << ": " << msg << \ + " at " << fname << ":" << __LINE__); \ } } while(0) #else -#define TRACE_MSG0(os, msg) ((void)0) -#define TRACE_MSG1(state, msg) ((void)0) +#define TRACE_MSG(msg) ((void)0) #endif -// Macro for trace message when 'state' is defined. -#define TRACE_MSG(msg) TRACE_MSG1(state, msg) - -// Macro for mem-trace when 'state' is defined. +// Macro for mem-trace. // Enabled only if compiled with TRACE_MEM macro and run with -trace option. #ifdef TRACE_MEM -#define TRACE_MEM_MSG(msg) TRACE_MSG1(state, msg) +#define TRACE_MEM_MSG(msg) TRACE_MSG(msg) #else #define TRACE_MEM_MSG(msg) ((void)0) #endif -// breakpoint. +// Debug breakpoint. #define INT3 asm volatile("int $3") +// SSC marks for emulator instrumentation. 
+#define TRACING_SSC_MARK( MARK_ID ) \ + __asm__ __volatile__ ( \ + "\n\t movl $"#MARK_ID", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : : : "%ebx" ); +namespace yask { + inline void ssc_start() + { + asm volatile ("push %rbx"); + TRACING_SSC_MARK(111); + asm volatile ("pop %rbx"); + } + inline void ssc_stop() + { + asm volatile ("push %rbx"); + TRACING_SSC_MARK(222); + asm volatile ("pop %rbx"); + } +}; + // L1 and L2 hints #define L1_HINT _MM_HINT_T0 #define L2_HINT _MM_HINT_T1 @@ -243,4 +306,3 @@ extern yask::Cache cache_model; #include "utils.hpp" #include "tuple.hpp" - diff --git a/src/kernel/lib/yask_stencil.hpp b/src/kernel/lib/yask_stencil.hpp index e2863569..693ab320 100644 --- a/src/kernel/lib/yask_stencil.hpp +++ b/src/kernel/lib/yask_stencil.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -40,12 +40,12 @@ IN THE SOFTWARE. // Macro to loop thru domain dims w/stencil index 'i' and domain index 'j'. // Step index must be at index zero. -#define _DOMAIN_VAR_LOOP(i, j) \ +#define DOMAIN_VAR_LOOP(i, j) \ for (int i = 1, j = 0; j < NUM_DOMAIN_DIMS; i++, j++) -#if (defined CHECK) || (defined TRACE) -#define DOMAIN_VAR_LOOP(i, j) _DOMAIN_VAR_LOOP(i, j) +#ifdef CHECK +#define DOMAIN_VAR_LOOP_FAST(i, j) DOMAIN_VAR_LOOP(i, j) #else -#define DOMAIN_VAR_LOOP(i, j) _UNROLL _DOMAIN_VAR_LOOP(i, j) +#define DOMAIN_VAR_LOOP_FAST(i, j) _UNROLL DOMAIN_VAR_LOOP(i, j) #endif // Max number of dims allowed in Indices. @@ -62,7 +62,7 @@ IN THE SOFTWARE. // First/last index macros. // These are relative to global problem, not rank. 
#define FIRST_INDEX(dim) (0) -#define LAST_INDEX(dim) (_context->get_settings().get()->_global_sizes[STENCIL_DIM_IDX_ ## dim] - 1) +#define LAST_INDEX(dim) (core_data->_common_core._global_sizes[STENCIL_DIM_IDX_ ## dim] - 1) // Macros for 1D<->n_d transforms. #include "yask_layout_macros.hpp" @@ -73,6 +73,8 @@ IN THE SOFTWARE. // Base types for stencil context, etc. #include "indices.hpp" #include "settings.hpp" +#include "offload.hpp" +#include "alloc.hpp" #include "generic_var.hpp" #include "yk_var.hpp" #include "auto_tuner.hpp" diff --git a/src/kernel/lib/yk_var.cpp b/src/kernel/lib/yk_var.cpp index 17857d3a..ae0240f0 100644 --- a/src/kernel/lib/yk_var.cpp +++ b/src/kernel/lib/yk_var.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -32,13 +32,10 @@ namespace yask { // Ctor. // Important: _data is NOT yet constructed. - YkVarBase::YkVarBase(KernelStateBase& stateb, - const VarDimNames& dim_names) : - KernelStateBase(stateb) { - STATE_VARS(this); + YkVarBaseCore::YkVarBaseCore(int ndims) { // Init indices. - int n = int(dim_names.size()); + int n = ndims; _domains.set_from_const(0, n); _req_left_pads.set_from_const(0, n); _req_right_pads.set_from_const(0, n); @@ -58,19 +55,44 @@ namespace yask { _vec_left_pads.set_from_const(0, n); _vec_allocs.set_from_const(0, n); _vec_local_offsets.set_from_const(0, n); + _vec_strides.set_from_const(0, n); + } - // Set masks. + // Ctor. + // Important: *corep exists but is NOT yet constructed. + YkVarBase::YkVarBase(KernelStateBase& stateb, + YkVarBaseCore* corep, + const VarDimNames& dim_names) : + KernelStateBase(stateb), _corep(corep) { + STATE_VARS(&stateb); + + // Set masks & counts in core. 
+ _step_dim_mask = 0; + _domain_dim_mask = 0; + _misc_dim_mask = 0; + _num_step_dims = 0; + _num_domain_dims = 0; + _num_misc_dims = 0; for (size_t i = 0; i < dim_names.size(); i++) { idx_t mbit = 1LL << i; auto& dname = dim_names[i]; - if (dname == step_dim) + if (dname == step_dim) { _step_dim_mask |= mbit; - else if (domain_dims.lookup(dname)) + _num_step_dims++; + } + else if (domain_dims.lookup(dname)) { _domain_dim_mask |= mbit; - else + _num_domain_dims++; + } + else { _misc_dim_mask |= mbit; + _num_misc_dims++; + } } + assert(dim_names.size() == + _num_step_dims + _num_domain_dims + _num_misc_dims); } + // Convenience function to format indices like // "x=5, y=3". @@ -79,9 +101,8 @@ namespace yask { std::string infix, std::string prefix, std::string suffix) const { - IdxTuple tmp = get_allocs(); // get dims. - idxs.set_tuple_vals(tmp); // set vals from idxs. - return tmp.make_dim_val_str(separator, infix, prefix, suffix); + auto& tmp = get_dim_tuple(); + return idxs.make_dim_val_str(tmp, separator, infix, prefix, suffix); } // Does this var cover the N-D domain? @@ -98,17 +119,17 @@ namespace yask { } // Halo-exchange flag accessors. 
- bool YkVarBase::is_dirty(idx_t step_idx) const { - if (_dirty_steps.size() == 0) + bool YkVarBase::is_dirty(dirty_idx whose, idx_t step_idx) const { + if (_dirty_steps[whose].size() == 0) const_cast(this)->resize(); if (_has_step_dim) step_idx = _wrap_step(step_idx); else step_idx = 0; - return _dirty_steps[step_idx]; + return _dirty_steps[whose][step_idx]; } - void YkVarBase::set_dirty(bool dirty, idx_t step_idx) { - if (_dirty_steps.size() == 0) + void YkVarBase::set_dirty(dirty_idx whose, bool dirty, idx_t step_idx) { + if (_dirty_steps[whose].size() == 0) resize(); if (_has_step_dim) { @@ -121,12 +142,12 @@ namespace yask { } else step_idx = 0; - set_dirty_using_alloc_index(dirty, step_idx); + set_dirty_using_alloc_index(whose, dirty, step_idx); } - void YkVarBase::set_dirty_all(bool dirty) { - if (_dirty_steps.size() == 0) + void YkVarBase::set_dirty_all(dirty_idx whose, bool dirty) { + if (_dirty_steps[whose].size() == 0) resize(); - for (auto i : _dirty_steps) + for (auto i : _dirty_steps[whose]) i = dirty; } @@ -144,27 +165,37 @@ namespace yask { return posn; } + #define IDX_STR(v) make_index_string(_corep->v) + #define IDX_STR2(v, sep) make_index_string(_corep->v, sep) + string YkVarBase::make_info_string(bool long_info) const { std::stringstream oss; if (is_scratch()) oss << "scratch "; if (is_user_var()) oss << "user-defined "; if (_fixed_size) oss << "fixed-size "; - oss << _make_info_string() << " and meta-data at " << - (void*)this; + oss << _make_info_string() << + ", meta-data at " << (void*)this << + ", and core-data at " << (void*)_corep; + #ifdef USE_OFFLOAD + if (KernelEnv::_use_offload) + oss << " (" << (void*)get_dev_ptr(_corep, false, false) << + " on device)"; + #endif if (long_info) { - oss << " for "; - if (_domains._get_num_dims()) - oss << make_index_string(_allocs, " * ") << " FP elem(s)" - " at rank-offsets " << make_index_string(_rank_offsets) << - ", local-offsets " << make_index_string(_local_offsets) << - ", left-halos " << 
make_index_string(_left_halos) << - ", right-halos " << make_index_string(_right_halos) << - ", left-pads " << make_index_string(_actl_left_pads) << - ", right-pads " << make_index_string(_actl_right_pads) << - ", left-wf-exts " << make_index_string(_left_wf_exts) << - ", right-wf-exts " << make_index_string(_right_wf_exts) << - ", and "; - oss << _dirty_steps.size() << " dirty flag(s)"; + if (_corep->_domains.get_num_dims()) + oss << + ", allocs = " << IDX_STR2(_allocs, " * ") << + ", domains = " << IDX_STR2(_domains, " * ") << + ", rank-offsets = " << IDX_STR(_rank_offsets) << + ", local-offsets = " << IDX_STR(_local_offsets) << + ", left-halos = " << IDX_STR(_left_halos) << + ", right-halos = " << IDX_STR(_right_halos) << + ", left-pads = " << IDX_STR(_actl_left_pads) << + ", right-pads = " << IDX_STR(_actl_right_pads) << + ", left-wf-exts = " << IDX_STR(_left_wf_exts) << + ", right-wf-exts = " << IDX_STR(_right_wf_exts) << + ", vec-strides = " << IDX_STR(_vec_strides); + oss << ", " << _dirty_steps[self].size() << " dirty flag(s)"; } return oss.str(); } @@ -190,13 +221,13 @@ namespace yask { // where this var is not vectorized. for (int i = 0; i < get_num_dims(); i++) { if (mp[i]) - mp[i] += _soln_vec_lens[i] - 1; + mp[i] += _corep->_soln_vec_lens[i] - 1; } return mp; } // Resizes the underlying generic var. - // Modifies _pads and _allocs. + // Updates dependent core info. // Fails if mem different and already alloc'd. void YkVarBase::resize() { STATE_VARS(this); @@ -213,89 +244,84 @@ namespace yask { // Check settings. 
for (int i = 0; i < get_num_dims(); i++) { - if (_left_halos[i] < 0) + if (_corep->_left_halos[i] < 0) THROW_YASK_EXCEPTION("Error: negative left halo in var '" + get_name() + "'"); - if (_right_halos[i] < 0) + if (_corep->_right_halos[i] < 0) THROW_YASK_EXCEPTION("Error: negative right halo in var '" + get_name() + "'"); - if (_left_wf_exts[i] < 0) + if (_corep->_left_wf_exts[i] < 0) THROW_YASK_EXCEPTION("Error: negative left wave-front ext in var '" + get_name() + "'"); - if (_right_wf_exts[i] < 0) + if (_corep->_right_wf_exts[i] < 0) THROW_YASK_EXCEPTION("Error: negative right wave-front ext in var '" + get_name() + "'"); - if (_req_left_pads[i] < 0) + if (_corep->_req_left_pads[i] < 0) THROW_YASK_EXCEPTION("Error: negative left padding in var '" + get_name() + "'"); - if (_req_right_pads[i] < 0) + if (_corep->_req_right_pads[i] < 0) THROW_YASK_EXCEPTION("Error: negative right padding in var '" + get_name() + "'"); - if (_req_left_epads[i] < 0) + if (_corep->_req_left_epads[i] < 0) THROW_YASK_EXCEPTION("Error: negative left extra padding in var '" + get_name() + "'"); - if (_req_right_epads[i] < 0) + if (_corep->_req_right_epads[i] < 0) THROW_YASK_EXCEPTION("Error: negative right extra padding in var '" + get_name() + "'"); } // Increase padding as needed and calculate new allocs. - Indices new_left_pads = get_reqd_pad(_left_halos, _left_wf_exts); - Indices new_right_pads = get_reqd_pad(_right_halos, _right_wf_exts); + Indices new_left_pads = get_reqd_pad(_corep->_left_halos, _corep->_left_wf_exts); + Indices new_right_pads = get_reqd_pad(_corep->_right_halos, _corep->_right_wf_exts); IdxTuple new_allocs(old_allocs); for (int i = 0; i < get_num_dims(); i++) { idx_t mbit = 1LL << i; // New allocation in each dim. - new_allocs[i] = _domains[i]; + new_allocs[i] = _corep->_domains[i]; // Adjust padding only for domain dims. if (_domain_dim_mask & mbit) { - // Get max of existing pad & new required pad if allocated. 
- // This will avoid throwing an exception due to decreasing - // requested padding after allocation. - if (p) { - new_left_pads[i] = max(new_left_pads[i], _actl_left_pads[i]); - new_right_pads[i] = max(new_right_pads[i], _actl_right_pads[i]); - } + // Add more padding. + new_left_pads[i] += _corep->_req_left_epads[i]; + new_right_pads[i] += _corep->_req_right_epads[i]; + new_left_pads[i] = max(new_left_pads[i], _corep->_req_left_pads[i]); + new_right_pads[i] = max(new_right_pads[i], _corep->_req_right_pads[i]); - // If storage not yet allocated, increase to requested pad. - // Final pad is max of requested pad and halo + requested extra pad. - // Requested padding is a hint, so ignoring it when allocated - // will avoid throwing an exception due to increasing - // requested padding after allocation. - if (!p) { - new_left_pads[i] = max(new_left_pads[i], _left_halos[i] + _req_left_epads[i]); - new_right_pads[i] = max(new_right_pads[i], _right_halos[i] + _req_right_epads[i]); - new_left_pads[i] = max(new_left_pads[i], _req_left_pads[i]); - new_right_pads[i] = max(new_right_pads[i], _req_right_pads[i]); + // If storage is allocated, get max of existing pad & new + // pad. This will avoid throwing an exception due to + // decreasing requested padding after allocation. + if (p) { + new_left_pads[i] = max(new_left_pads[i], _corep->_actl_left_pads[i]); + new_right_pads[i] = max(new_right_pads[i], _corep->_actl_right_pads[i]); } // Round left pad up to vec len. - new_left_pads[i] = ROUND_UP(new_left_pads[i], _var_vec_lens[i]); + new_left_pads[i] = ROUND_UP(new_left_pads[i], _corep->_var_vec_lens[i]); // Round domain + right pad up to soln vec len by extending right pad. // Using soln vec len to allow reading a non-vec var in this dim // while calculating a vec var. (The var vec-len is always 1 or the same // as the soln vec-len in a given dim.) 
- idx_t dprp = ROUND_UP(_domains[i] + new_right_pads[i], _soln_vec_lens[i]); - new_right_pads[i] = dprp - _domains[i]; + idx_t dprp = ROUND_UP(_corep->_domains[i] + new_right_pads[i], + _corep->_soln_vec_lens[i]); + new_right_pads[i] = dprp - _corep->_domains[i]; // Make inner dim an odd number of vecs. // This reportedly helps avoid some uarch aliasing. if (!p && - opts->_allow_addl_pad && - get_dim_name(i) == inner_dim && - (new_allocs[i] / _var_vec_lens[i]) % 2 == 0) { - new_right_pads[i] += _var_vec_lens[i]; + actl_opts->_allow_addl_pad && + get_dim_name(i) == inner_layout_dim && + (new_allocs[i] / _corep->_var_vec_lens[i]) % 2 == 0) { + new_right_pads[i] += _corep->_var_vec_lens[i]; } // New allocation in each dim. new_allocs[i] += new_left_pads[i] + new_right_pads[i]; - assert(new_allocs[i] == new_left_pads[i] + _domains[i] + new_right_pads[i]); + assert(new_allocs[i] == new_left_pads[i] + _corep->_domains[i] + new_right_pads[i]); // Since the left pad and domain + right pad were rounded up, // the sum should also be a vec mult. - assert(new_allocs[i] % _var_vec_lens[i] == 0); + assert(new_allocs[i] % _corep->_var_vec_lens[i] == 0); } // Non-domain dims. else { - assert(new_allocs[i] == _domains[i]); - assert(_var_vec_lens[i] == 1); + assert(new_allocs[i] == _corep->_domains[i]); + assert(_corep->_var_vec_lens[i] == 1); } } @@ -309,31 +335,35 @@ namespace yask { } // Do the resize and calculate number of dirty bits needed. - _allocs = new_allocs; - _actl_left_pads = new_left_pads; - _actl_right_pads = new_right_pads; + _corep->_allocs = new_allocs; + _corep->_actl_left_pads = new_left_pads; + _corep->_actl_right_pads = new_right_pads; size_t new_dirty = 1; // default if no step dim. for (int i = 0; i < get_num_dims(); i++) { idx_t mbit = 1LL << i; // Calc vec-len values. 
- _vec_left_pads[i] = new_left_pads[i] / _var_vec_lens[i]; - _vec_allocs[i] = _allocs[i] / _var_vec_lens[i]; + _corep->_vec_left_pads[i] = new_left_pads[i] / _corep->_var_vec_lens[i]; + _corep->_vec_allocs[i] = _corep->_allocs[i] / _corep->_var_vec_lens[i]; // Actual resize of underlying var. - set_dim_size(i, _vec_allocs[i]); + set_dim_size(i, _corep->_vec_allocs[i]); // Number of dirty bits is number of steps. if (_step_dim_mask & mbit) - new_dirty = _allocs[i]; + new_dirty = _corep->_allocs[i]; } + + // Calc new strides. + _corep->_vec_strides = get_vec_strides(); // Resize dirty flags, too. - size_t old_dirty = _dirty_steps.size(); + size_t old_dirty = _dirty_steps[self].size(); if (old_dirty != new_dirty) { // Resize & set all as dirty. - _dirty_steps.assign(new_dirty, true); + _dirty_steps[self].assign(new_dirty, true); + _dirty_steps[others].assign(new_dirty, true); // Init range. init_valid_steps(); @@ -347,6 +377,10 @@ namespace yask { TRACE_MSG("resize: FROM " << old_info << " TO " << new_info); } #endif + + // Copy changes to device. + // TODO: do this only when needed. + sync_core(); } // Check whether dim is used and of allowed type. @@ -370,44 +404,45 @@ namespace yask { STATE_VARS(this); if (!ref) { DEBUG_MSG("** mismatch: no reference var."); - return _allocs.product(); // total number of elements. + return _corep->_allocs.product(); // total number of elements. } // Dims & sizes same? if (!are_dims_and_sizes_same(*ref)) { DEBUG_MSG("** mismatch due to incompatible vars: " << make_info_string() << " and " << ref->make_info_string()); - return _allocs.product(); // total number of elements. + return _corep->_allocs.product(); // total number of elements. } // Compare each element. idx_t errs = 0; auto allocs = get_allocs(); + set err_msgs; // This will loop over the entire allocation. // We use this as a handy way to get offsets, - // but not all will be used. + // but not all points will be used. 
allocs.visit_all_points_in_parallel ([&](const IdxTuple& pt, size_t idx) { // Adjust alloc indices to overall indices. IdxTuple opt(pt); bool ok = true; - for (int i = 0; ok && i < pt._get_num_dims(); i++) { + for (int i = 0; ok && i < pt.get_num_dims(); i++) { auto val = pt.get_val(i); idx_t mbit = 1LL << i; // Convert to API index. opt[i] = val; if (!(_step_dim_mask & mbit)) - opt[i] += _rank_offsets[i] + _local_offsets[i]; + opt[i] += _corep->_rank_offsets[i] + _corep->_local_offsets[i]; // Don't compare points outside the domain. // TODO: check points in outermost halo. auto& dname = pt.get_dim_name(i); if (domain_dims.lookup(dname)) { - auto first_ok = _rank_offsets[i]; - auto last_ok = first_ok + _domains[i] - 1; + auto first_ok = _corep->_rank_offsets[i]; + auto last_ok = first_ok + _corep->_domains[i] - 1; if (opt[i] < first_ok || opt[i] > last_ok) ok = false; } @@ -419,42 +454,47 @@ namespace yask { auto te = read_elem(opt, asi, __LINE__); auto re = ref->read_elem(opt, asi, __LINE__); if (!within_tolerance(te, re, epsilon)) { -#pragma omp critical + #pragma omp critical { errs++; - if (errs <= max_print) { - if (errs < max_print) - DEBUG_MSG("** mismatch at " << get_name() << - "(" << opt.make_dim_val_str() << "): " << - te << " != " << re); - else - DEBUG_MSG("** Additional errors not printed for var '" << - get_name() << "'"); + if (errs < max_print) { + err_msgs.insert(get_name() + + "(" + opt.make_dim_val_str() + + "): got " + to_string(te) + + "; expected " + to_string(re)); } } } return true; // keep visiting. }); + + for (auto& msg : err_msgs) + DEBUG_MSG("** mismatch at " << msg); + if (errs > max_print) + DEBUG_MSG("** Additional errors not printed for var '" << get_name() << "'"); TRACE_MSG("detailed compare returned " << errs); return errs; } // Make sure indices are in range. 
// Side-effect: If clipped_indices is not NULL, + // 0) copy indices to *clipped_indices, and // 1) set them to in-range if out-of-range, and - // 2) normalize them if 'normalize' is 'true'. + // 2) convert to rank-local and normalize them if 'normalize' is 'true'. bool YkVarBase::check_indices(const Indices& indices, const string& fn, // name for error msg. bool strict_indices, // die if out-of-range. bool check_step, // check step index. bool normalize, // div by vec lens. Indices* clipped_indices) const { + if (normalize) + assert(clipped_indices != 0); STATE_VARS(this); bool all_ok = true; auto n = get_num_dims(); - if (indices._get_num_dims() != n) { + if (indices.get_num_dims() != n) { FORMAT_AND_THROW_YASK_EXCEPTION("Error: '" << fn << "' called with " << - indices._get_num_dims() << + indices.get_num_dims() << " indices instead of " << n); } if (clipped_indices) @@ -468,7 +508,7 @@ namespace yask { // If this is the step dim and we're not checking // it, then anything is ok. - if (is_step_dim && (!check_step || opts->_step_wrap)) + if (is_step_dim && (!check_step || actl_opts->_step_wrap)) ok = true; // Otherwise, check range. @@ -503,8 +543,8 @@ namespace yask { // Normalize? if (clipped_indices && normalize) { if (_domain_dim_mask & mbit) { - (*clipped_indices)[i] -= _rank_offsets[i]; // rank-local. - (*clipped_indices)[i] = idiv_flr((*clipped_indices)[i], _var_vec_lens[i]); + (*clipped_indices)[i] -= _corep->_rank_offsets[i]; // rank-local. + (*clipped_indices)[i] = idiv_flr((*clipped_indices)[i], _corep->_var_vec_lens[i]); } } } // var dims. @@ -518,11 +558,11 @@ namespace yask { // If 't' is before first step, pull offset back. if (t < get_first_local_index(step_posn)) - _local_offsets[step_posn] = t; + _corep->_local_offsets[step_posn] = t; // If 't' is after last step, push offset out. 
else if (t > get_last_local_index(step_posn)) - _local_offsets[step_posn] = t - _domains[step_posn] + 1; + _corep->_local_offsets[step_posn] = t - _corep->_domains[step_posn] + 1; TRACE_MSG("update_valid_step(" << t << "): valid step(s) in '" << get_name() << "' are now [" << get_first_local_index(step_posn) << @@ -535,23 +575,11 @@ namespace yask { void YkVarBase::set_dirty_in_slice(const Indices& first_indices, const Indices& last_indices) { if (_has_step_dim) { - for (idx_t i = first_indices[+Indices::step_posn]; - i <= last_indices[+Indices::step_posn]; i++) - set_dirty(true, i); + for (idx_t i = first_indices[+step_posn]; + i <= last_indices[+step_posn]; i++) + set_dirty(self, true, i); } else - set_dirty_using_alloc_index(true, 0); - } - - // Make tuple needed for slicing. - IdxTuple YkVarBase::get_slice_range(const Indices& first_indices, - const Indices& last_indices) const { - // Find ranges. - Indices num_elems = last_indices.add_const(1).sub_elements(first_indices); - IdxTuple num_elems_tuple = get_allocs(); - num_elems.set_tuple_vals(num_elems_tuple); - num_elems_tuple.set_first_inner(_is_col_major); - - return num_elems_tuple; + set_dirty_using_alloc_index(self, true, 0); } // Print one element like @@ -571,39 +599,71 @@ namespace yask { TRACE_MEM_MSG(str); } - // Print one vector. + // Print each elem in one vector. // Indices must be normalized and rank-relative. void YkVarBase::print_vec_norm(const std::string& msg, - const Indices& idxs, - const real_vec_t& val, - int line) const { + const Indices& idxs, + const real_vec_t& val, + int line) const { STATE_VARS_CONST(this); // Convert to elem indices. - Indices eidxs = idxs.mul_elements(_var_vec_lens); + Indices eidxs = idxs.mul_elements(_corep->_var_vec_lens); // Add offsets, i.e., convert to overall indices. - eidxs = eidxs.add_elements(_rank_offsets); + eidxs = eidxs.add_elements(_corep->_rank_offsets); - IdxTuple idxs2 = get_allocs(); // get dims. 
+ IdxTuple idxs2 = get_dim_tuple(); eidxs.set_tuple_vals(idxs2); // set vals from eidxs. + // TODO: is above correct for vars that aren't domain dims? + Indices idxs3(idxs2); // Visit every point in fold. - IdxTuple folds = dims->_fold_pts; - folds.visit_all_points([&](const IdxTuple& fofs, - size_t idx) { + auto& folds = dims->_fold_sizes; + bool first_inner = dims->_fold_pts.is_first_inner(); + folds.visit_all_points + (first_inner, + [&](const Indices& fofs, size_t idx) { - // Get element from vec val. - real_t ev = val[idx]; + // Get element from vec val. + real_t ev = val[idx]; - // Add fold offsets to elem indices for printing. - IdxTuple pt2 = idxs2.add_elements(fofs, false); - Indices pt3(pt2); + // Add fold offsets to elem indices for printing. + auto pt3 = idxs3.add_elements(fofs); - print_elem(msg, pt3, ev, line); - return true; // keep visiting. - }); + print_elem(msg, pt3, ev, line); + return true; // keep visiting. + }); + } + + // Copy data to/from device. + void YkVarBase::copy_data_to_device() { + STATE_VARS(this); + if (_coh.need_to_update_dev()) { + void* vp = get_storage(); + char* cp = static_cast(vp); + auto nb = get_num_bytes(); + if (vp && nb) { + TRACE_MSG("'" << get_name() << "' data copied to device"); + offload_copy_to_device(cp, nb); + _coh.host_copied_to_dev(); + } + } + } + void YkVarBase::copy_data_from_device() { + STATE_VARS(this); + if (_coh.need_to_update_host()) { + void* vp = get_storage(); + char* cp = static_cast(vp); + auto nb = get_num_bytes(); + if (vp && nb) { + TRACE_MSG("'" << get_name() << "' data copied from device"); + offload_copy_from_device(cp, nb); + _coh.dev_copied_to_host(); + } + } } + } // namespace. 
diff --git a/src/kernel/lib/yk_var.hpp b/src/kernel/lib/yk_var.hpp index 09b0f130..ca5a2fc9 100644 --- a/src/kernel/lib/yk_var.hpp +++ b/src/kernel/lib/yk_var.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -27,69 +27,556 @@ IN THE SOFTWARE. namespace yask { - // Underlying storage using GenericVars. - typedef GenericVarTyped RealElemVar; - typedef GenericVarTyped RealVecVar; - - // Base class implementing all yk_var functionality. Used for - // vars that contain either individual elements or vectors. - class YkVarBase : - public KernelStateBase { - friend class YkVarImpl; - - // Rank and local offsets in domain dim: - - // | ... | +------+ | - // | global ofs | | | - // |<------------>| var | | - // | | loc |domain| | - // |rank | ofs | | | - // | ofs |<------>| | | - // |<--->| +------+ | - // ^ ^ ^ ^ - // | | | last rank-domain index - // | | 0 index in underlying storage. - // | first rank-domain index - // first overall-domain index - - // Rank offset is not necessarily a vector multiple. - // Local offset must be a vector multiple. - - protected: - - // The following masks have one bit for each dim in the var. - idx_t _step_dim_mask = 0; - idx_t _domain_dim_mask = 0; - idx_t _misc_dim_mask = 0; + /* + Rank and local offsets in domain dim: + + | ... | +------+ | + | global ofs | | | + |<------------>| var | | + | | loc |domain| | + |rank | ofs | | | + | ofs |<------>| | | + |<--->| +------+ | + ^ ^ ^ ^ + | | | last rank-domain index + | | 0 index in underlying storage. + | first rank-domain index + first overall-domain index + + Rank offset is not necessarily a vector multiple. + Local offset must be a vector multiple. 
+ + OOD: yk_var (API) + ^ + | + YkVarBaseCore <---ptr------- YkVarBase <--sh_ptr-- YkVarImpl + ^ ^ ^ ^ | + | | | | +---------------+ + YkElemVarCore YkVecVarCore YkElemVar YkVecVar | + | ^ ^ | | | | + | | | | | | | + | | +---------------|--has-a-+ | + | +--------------------has-a--+ | + | | | + has-a has-a | + | | | + v v | + +-> GenericVarCore GenericVarCore <-+ | + | _elems ptr _elems ptr | | + | | | + | | | + | GenericVarBase <-------------ptr------------+ + ptr ^ ^ _base ptr | + | | | ptr + | GenericVarTyped GenericVarTyped | + | ^ ^ | + | | | | + +- GenericVar GenericVar --------+ + + "Core" types are non-virtual and can be trivially copied, e.g., + to an offload device; others are virtual and cannot. + "LF" is a layout-function type. + */ + + ///// Yk*Var*Core types ///// + + // Core data that is needed for computations using a var. + // A trivially-copyable type for offloading. + struct YkVarBaseCore { // The following indices have one value for each dim in the var. - // All values are in units of reals, not underlying elements, if different. + // All values are in units of reals, not underlying SIMD vectors, if different. // See diagram above for '_rank_offsets' and '_local_offsets'. // Comments show settings for domain dims | non-domain dims. - Indices _domains; // size of "interior" of var | alloc size. + Indices _domains; // size of "interior" of var (i.e., not pads) | alloc size. Indices _req_left_epads, _req_right_epads; // requested extra space around halos | zero. - Indices _req_left_pads, _req_right_pads; // requested extra space around domains | zero. - Indices _actl_left_pads, _actl_right_pads; // actual extra space around domains | zero. + Indices _req_left_pads, _req_right_pads; // requested space around domains | zero. + Indices _actl_left_pads, _actl_right_pads; // actual space around domains | zero. Indices _left_halos, _right_halos; // space within pads for halo exchange | zero. 
Indices _left_wf_exts, _right_wf_exts; // additional halos for wave-fronts | zero. - Indices _rank_offsets; // offsets of this var domain in overall problem | zero. - Indices _local_offsets; // offsets of this var domain in this rank | first index for step or misc. - Indices _allocs; // actual var alloc in reals | same. + Indices _rank_offsets; // offsets of this rank in global space | zero. + Indices _local_offsets; // offsets of this var's domain in this rank | first index. + Indices _allocs; // actual var alloc | same. - // Each entry in _soln_vec_lens is same as dims->_fold_pts. + // Each entry in _soln_vec_lens is same as the corresponding dim in dims->_fold_pts. Indices _soln_vec_lens; // num reals in each elem in soln fold | one. // Each entry in _var_vec_lens may be same as dims->_fold_pts or one, depending // on whether var is fully vectorized. Indices _var_vec_lens; // num reals in each elem in this var | one. - // Sizes in vectors for sizes that are always vec lens (to avoid division). - Indices _vec_left_pads; // _actl_left_pads / _var_vec_lens. - Indices _vec_allocs; // _allocs / _var_vec_lens. - Indices _vec_local_offsets; // _local_offsets / _var_vec_lens. + // Sizes in vectors for sizes that are always vec lens. + // These are pre-calculated to avoid division later. + Indices _vec_left_pads; // _actl_left_pads / _var_vec_lens | zero. + Indices _vec_allocs; // _allocs / _var_vec_lens | _allocs. + Indices _vec_local_offsets; // _local_offsets / _var_vec_lens | first index. + Indices _vec_strides; // num vecs between consecutive indices | one. + + // Ctor. + YkVarBaseCore(int ndims); + + // Index math. 
+ ALWAYS_INLINE idx_t get_first_local_index(idx_t posn) const { + return _rank_offsets[posn] + _local_offsets[posn] - _actl_left_pads[posn]; + } + ALWAYS_INLINE idx_t get_last_local_index(idx_t posn) const { + return _rank_offsets[posn] + _local_offsets[posn] + _domains[posn] + _actl_right_pads[posn] - 1; + } + + // Adjust logical time index to 0-based index + // using temporal allocation size. + ALWAYS_INLINE idx_t _wrap_step(idx_t t) const { + + // Index wraps in tdim. + // Examples based on tdim == 2: + // t => return value. + // --- ------------- + // -2 => 0. + // -1 => 1. + // 0 => 0. + // 1 => 1. + // 2 => 0. + + // Avoid discontinuity in dividing negative numbers + // by using floored-mod. + idx_t res = imod_flr(t, _domains[+step_posn]); + return res; + } + + }; // YkVarBaseCore. + + static_assert(std::is_trivially_copyable::value, + "Needed for OpenMP offload"); + + // Core data for YASK var of real elements. + template + struct YkElemVarCore final : public YkVarBaseCore { + + // Core for generic storage is owned here by composition. + // We do this to reduce the number of structs that need to be + // copied to the offload device. + typedef GenericVarCore _data_t; + static_assert(std::is_trivially_copyable<_data_t>::value, + "Needed for OpenMP offload"); + _data_t _data; + + // Ctor. + YkElemVarCore(int ndims) : + YkVarBaseCore(ndims) { } + + protected: + + // Calc one adjusted index and recurse to i-1. + template + void _get_adj_idx(Indices& adj_idxs, + const Indices& idxs, + idx_t alloc_step_idx) const { + if constexpr (i < 0) + return; + + // Special handling for step index. + constexpr auto sp = +step_posn; + if constexpr (_use_step_idx && i == sp) { + host_assert(alloc_step_idx == _wrap_step(idxs[sp])); + adj_idxs[i] = alloc_step_idx; + } + + // All other indices. + else { + + // Adjust for offsets and padding. + // This gives a positive 0-based local element index. 
+ idx_t ai = idxs[i] + _actl_left_pads[i] - _local_offsets[i]; + + // Also adjust for rank offsets if using global indices. + if constexpr (is_global) + ai -= _rank_offsets[i]; + + host_assert(ai >= 0); + adj_idxs[i] = uidx_t(ai); + } + + // Recurse (during compilation) until done. + if constexpr (i > 0) + _get_adj_idx(adj_idxs, idxs, alloc_step_idx); + } + + // Get a pointer to given element. + // 'alloc_step_idx' must be within allocation bounds and consistent + // with 'idxs[step_posn]'. + template + const real_t* _get_elem_ptr(const Indices& idxs, + idx_t alloc_step_idx, + bool check_bounds) const { + constexpr auto n = LayoutFn::get_num_sizes(); + Indices adj_idxs(n); + _get_adj_idx(adj_idxs, idxs, alloc_step_idx); + + // Get pointer via layout in _data. + return _data.get_ptr(adj_idxs, check_bounds); + } + + public: + ALWAYS_INLINE + const real_t* get_elem_ptr(const Indices& global_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) const { + return _get_elem_ptr(global_idxs, alloc_step_idx, check_bounds); + } + ALWAYS_INLINE + const real_t* get_elem_ptr_local(const Indices& local_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) const { + return _get_elem_ptr(local_idxs, alloc_step_idx, check_bounds); + } + + // Non-const versions. + // Implemented via casting. + ALWAYS_INLINE + real_t* get_elem_ptr(const Indices& global_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) { + const real_t* p = + const_cast(this)-> + get_elem_ptr(global_idxs, alloc_step_idx, check_bounds); + return const_cast(p); + } + ALWAYS_INLINE + real_t* get_elem_ptr_local(const Indices& local_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) { + const real_t* p = + const_cast(this)-> + get_elem_ptr_local(local_idxs, alloc_step_idx, check_bounds); + return const_cast(p); + } + + // Read one element. + // Indices are global, i.e., relative to overall problem domain. 
+ ALWAYS_INLINE + real_t read_elem(const Indices& idxs, + idx_t alloc_step_idx) const { + const real_t* ep = get_elem_ptr(idxs, alloc_step_idx); + return *ep; + } + + // Write one element. + // Indices are global, i.e., relative to overall problem domain. + ALWAYS_INLINE + void write_elem(real_t val, + const Indices& idxs, + idx_t alloc_step_idx) { + real_t* ep = get_elem_ptr(idxs, alloc_step_idx); + *ep = val; + } + + + // Read one element. + // Indices are local. + ALWAYS_INLINE + real_t read_elem_local(const Indices& idxs, + idx_t alloc_step_idx) const { + const real_t* ep = get_elem_ptr_local(idxs, alloc_step_idx); + return *ep; + } + + // Write one element. + // Indices are local. + ALWAYS_INLINE + void write_elem_local(real_t val, + const Indices& idxs, + idx_t alloc_step_idx) { + real_t* ep = get_elem_ptr_local(idxs, alloc_step_idx); + *ep = val; + } + + }; // YkElemVarCore. + + // Core data for YASK var of real vectors. + template + struct YkVecVarCore final : public YkVarBaseCore { + + // Positions of var dims in vector fold dims. + Indices _vec_fold_posns; + + // Storage core is owned here by composition. + typedef GenericVarCore _data_t; + static_assert(std::is_trivially_copyable<_data_t>::value, + "Needed for OpenMP offload"); + _data_t _data; + + // Ctor. + YkVecVarCore(int ndims) : + YkVarBaseCore(ndims), + _vec_fold_posns(idx_t(0), ndims) { } + + protected: + // Calc one adjusted vec idx and offset and recurse to i-1. + template + void _get_adj_idx(Indices& vec_idxs, + Indices& elem_ofs, + const Indices& idxs, + idx_t alloc_step_idx) const { + if constexpr (i < 0) + return; + + constexpr int nvls = sizeof...(_templ_vec_lens); + constexpr uidx_t vls[nvls] { _templ_vec_lens... }; + + // Special handling for step index. + constexpr auto sp = +step_posn; + if constexpr (_use_step_idx && i == sp) { + host_assert(alloc_step_idx == _wrap_step(idxs[sp])); + vec_idxs[sp] = alloc_step_idx; + elem_ofs[sp] = 0; + } + + // All other indices. 
+ else { + + // Adjust for offset and padding. + // This gives a positive 0-based local element index. + idx_t ai = idxs[i] + _actl_left_pads[i] - + (_rank_offsets[i] + _local_offsets[i]); + host_assert(ai >= 0); + uidx_t adj_idx = uidx_t(ai); + + // Get vector index and offset. + // Use unsigned DIV and MOD to avoid compiler having to + // emit code for preserving sign when using shifts. + vec_idxs[i] = idx_t(adj_idx / vls[i]); + elem_ofs[i] = idx_t(adj_idx % vls[i]); + host_assert(vec_idxs[i] == idx_t(adj_idx / _var_vec_lens[i])); + host_assert(elem_ofs[i] == idx_t(adj_idx % _var_vec_lens[i])); + } + + // Recurse (during compilation) until done. + if constexpr (i > 0) + _get_adj_idx(vec_idxs, elem_ofs, idxs, alloc_step_idx); + } + + // Calc one fold offset and recurse to i-1. + template + void _get_fold_ofs(Indices& fold_ofs, + const Indices& elem_ofs) const { + if constexpr (i < 0) + return; + + int j = _vec_fold_posns[i]; + fold_ofs[i] = elem_ofs[j]; + + // Recurse (during compilation) until done. + if constexpr (i > 0) + _get_fold_ofs(fold_ofs, elem_ofs); + } + + public: + // Get a pointer to given element. + const real_t* get_elem_ptr(const Indices& idxs, + idx_t alloc_step_idx, + bool check_bounds=true) const { + + // Use template vec lengths instead of run-time values for + // efficiency. + constexpr int nvls = sizeof...(_templ_vec_lens); + Indices vec_idxs(nvls), elem_ofs(nvls); + #ifdef DEBUG_LAYOUT + constexpr auto ns = LayoutFn::get_num_sizes(); + host_assert(ns == nvls); + #endif + _get_adj_idx(vec_idxs, elem_ofs, idxs, alloc_step_idx); + + // Get only the vectorized fold offsets, i.e., those + // with vec-lengths > 1. + // And, they need to be in the original folding order, + // which might be different than the var-dim order. + Indices fold_ofs(NUM_VEC_FOLD_DIMS); + _get_fold_ofs(fold_ofs, elem_ofs); + + // Get 1D element index into vector. 
+ //auto i = dims->get_elem_index_in_vec(fold_ofs); + idx_t i = VEC_FOLD_LAYOUT(fold_ofs); + + // Get pointer to vector. + const real_vec_t* vp = _data.get_ptr(vec_idxs, check_bounds); + + // Get pointer to element. + const real_t* ep = &(*vp)[i]; + return ep; + } + + // Non-const version. + // Implemented with casting. + ALWAYS_INLINE + real_t* get_elem_ptr(const Indices& idxs, + idx_t alloc_step_idx, + bool check_bounds=true) { + const real_t* p = + const_cast(this)->get_elem_ptr(idxs, alloc_step_idx, + check_bounds); + return const_cast(p); + } + + // Read one element. + // Indices are relative to overall problem domain. + ALWAYS_INLINE + real_t read_elem(const Indices& idxs, + idx_t alloc_step_idx) const { + const real_t* ep = get_elem_ptr(idxs, alloc_step_idx); + return *ep; + } + + // Write one element. + // Indices are relative to overall problem domain. + ALWAYS_INLINE + void write_elem(real_t val, + const Indices& idxs, + idx_t alloc_step_idx) { + real_t* ep = get_elem_ptr(idxs, alloc_step_idx); + *ep = val; + } + + protected: + + // Get one adjusted index and recurse to i-1. + template + void _get_adj_idx(Indices& adj_idxs, + const Indices& vec_idxs, + idx_t alloc_step_idx) const { + if constexpr (i < 0) + return; + + // Special handling for step index. + constexpr auto sp = +step_posn; + if constexpr (_use_step_idx && i == sp) { + host_assert(alloc_step_idx == _wrap_step(vec_idxs[sp])); + adj_idxs[i] = alloc_step_idx; + } + + // Other indices. + else { + + // Adjust for padding. + // Since the indices are rank-relative, subtract only + // the local offsets. + // This gives a 0-based local *vector* index. + adj_idxs[i] = vec_idxs[i] + _vec_left_pads[i] - _vec_local_offsets[i]; + } + + // Recurse (during compilation) until done. + if constexpr (i > 0) + _get_adj_idx(adj_idxs, vec_idxs, alloc_step_idx); + + } + + public: + // Get a pointer to given vector. + // Indices must be normalized and rank-relative. 
+ // It's important that this function be efficient, since + // it's used in the stencil kernel. + ALWAYS_INLINE + const real_vec_t* get_vec_ptr_norm(const Indices& vec_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) const { + + constexpr int nvls = sizeof...(_templ_vec_lens); + Indices adj_idxs(nvls); + _get_adj_idx(adj_idxs, vec_idxs, alloc_step_idx); + + // Get ptr via layout in _data. + return _data.get_ptr(adj_idxs, check_bounds); + } + + // Non-const version. + ALWAYS_INLINE + real_vec_t* get_vec_ptr_norm(const Indices& vec_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) { + const real_vec_t* p = + const_cast(this)-> + get_vec_ptr_norm(vec_idxs, alloc_step_idx, check_bounds); + return const_cast(p); + } + + // Read one vector. + // Indices must be normalized and rank-relative. + // 'alloc_step_idx' is pre-calculated or 0 if not used. + ALWAYS_INLINE + real_vec_t read_vec_norm(const Indices& vec_idxs, + idx_t alloc_step_idx) const { + const real_vec_t* vp = get_vec_ptr_norm(vec_idxs, alloc_step_idx); + real_vec_t res; + res.load_from(vp); + return res; + } + + // Write one vector. + // Indices must be normalized and rank-relative. + // 'alloc_step_idx' is pre-calculated or 0 if not used. + ALWAYS_INLINE + void write_vec_norm(real_vec_t val, + const Indices& vec_idxs, + idx_t alloc_step_idx) { + real_vec_t* vp = get_vec_ptr_norm(vec_idxs, alloc_step_idx); + val.store_to(vp); + } + ALWAYS_INLINE + void write_vec_norm_masked(real_vec_t val, + const Indices& vec_idxs, + idx_t alloc_step_idx, + uidx_t mask) { + real_vec_t* vp = get_vec_ptr_norm(vec_idxs, alloc_step_idx); + val.store_to_masked(vp, mask); + } + + // Prefetch one vector. + // Indices must be normalized and rank-relative. + // 'alloc_step_idx' is pre-calculated or 0 if not used. 
+ template + ALWAYS_INLINE + void prefetch_vec_norm(const Indices& vec_idxs, + idx_t alloc_step_idx, + int line) const { + auto p = get_vec_ptr_norm(vec_idxs, alloc_step_idx, false); + prefetch(p); + #ifdef MODEL_CACHE + cache_model.prefetch(p, level, line); + #endif + } + + }; // YkVecVarCore. + + ///// Yk*Var* types ///// + + // Base class implementing all yk_var functionality. Used for + // vars that contain either individual elements or vectors. + // This class is pure virtual. + class YkVarBase : + public KernelStateBase { + friend class YkVarImpl; + + public: + + // Index for distinguishing my var from neighbors' vars. + enum dirty_idx { self, others }; - // Whether step dim is used. - // If true, will always be in Indices::step_posn. + protected: + + // Ptr to the core data. + YkVarBaseCore* _corep = 0; + + // The following masks have one bit for each dim in the var. + idx_t _step_dim_mask; + idx_t _domain_dim_mask; + idx_t _misc_dim_mask; + + // Counts of each dim type. + idx_t _num_step_dims; + idx_t _num_domain_dims; + idx_t _num_misc_dims; + + // Whether step dim is used. + // If true, will always be in step_posn. bool _has_step_dim = false; // Whether certain dims can be changed. @@ -99,14 +586,6 @@ namespace yask { // Max L1 dist of halo accesses to this var. int _l1_dist = 0; - // Data that needs to be copied to neighbor's halos if using MPI. - // If this var has the step dim, there is one bit per alloc'd step. - // Otherwise, only bit 0 is used. - std::vector _dirty_steps; - - // Data layout for slice APIs. - bool _is_col_major = false; - // Whether to resize this var based on solution parameters. bool _fixed_size = false; @@ -116,13 +595,24 @@ namespace yask { // Whether this was created via an API. bool _is_user_var = false; + // Tracking flags for data modified since last halo exchange. + // [self]: Data needs to be copied to neighbors' halos of this var. 
+ // [others]: Data *may* need to be copied from one or neighbors into + // the halo of this var. + // vector contents: If this var has the step dim, there is one flag + // per alloc'd step. Otherwise, only [0] is used. + std::vector _dirty_steps[2]; + + // Coherency of device data. + Coherency _coh; + // Convenience function to format indices like // "x=5, y=3". virtual std::string make_index_string(const Indices& idxs, - std::string separator=", ", - std::string infix="=", - std::string prefix="", - std::string suffix="") const; + std::string separator=", ", + std::string infix="=", + std::string prefix="", + std::string suffix="") const; // Determine required padding from halos. // Does not include user-specified min padding or @@ -131,66 +621,123 @@ namespace yask { // Check whether dim exists and is of allowed type. virtual void check_dim_type(const std::string& dim, - const std::string& fn_name, - bool step_ok, - bool domain_ok, - bool misc_ok) const; + const std::string& fn_name, + bool step_ok, + bool domain_ok, + bool misc_ok) const; // Index math. inline idx_t get_first_local_index(idx_t posn) const { - return _rank_offsets[posn] + _local_offsets[posn] - _actl_left_pads[posn]; + return _corep->get_first_local_index(posn); } inline idx_t get_last_local_index(idx_t posn) const { - return _rank_offsets[posn] + _local_offsets[posn] + _domains[posn] + _actl_right_pads[posn] - 1; + return _corep->get_last_local_index(posn); } // Make sure indices are in range. // Optionally fix them to be in range and return in 'fixed_indices'. // If 'normalize', make rank-relative, divide by vlen and return in 'fixed_indices'. - virtual bool check_indices(const Indices& indices, - const std::string& fn, // name for error msg. - bool strict_indices, // die if out-of-range. - bool check_step, // check step index. - bool normalize, // div by vec lens. 
- Indices* fixed_indices = NULL) const; + bool check_indices(const Indices& indices, + const std::string& fn, // name for error msg. + bool strict_indices, // die if out-of-range. + bool check_step, // check step index. + bool normalize, // div by vec lens. + Indices* fixed_indices = NULL) const; // Resize or fail if already allocated. - virtual void resize(); + void resize(); - // Set dirty flags in range. + // Set my dirty flags in range. void set_dirty_in_slice(const Indices& first_indices, const Indices& last_indices); - // Make tuple needed for slicing. - IdxTuple get_slice_range(const Indices& first_indices, - const Indices& last_indices) const; + // Find sizes needed for slicing. + inline Indices get_slice_range(const Indices& first_indices, + const Indices& last_indices) const { + return last_indices.add_const(1).sub_elements(first_indices); + } - public: - YkVarBase(KernelStateBase& state, + // Accessors to GenericVar. + virtual GenericVarBase* get_gvbp() =0; + virtual const GenericVarBase* get_gvbp() const =0; + + // Sync core on device. + // Does NOT sync underlying var data; see + // copy_data_{to,from}_device(). + virtual void sync_core() =0; + + // Ctor. + // Important: *corep exists but is NOT yet constructed. + YkVarBase(KernelStateBase& stateb, + YkVarBaseCore* corep, const VarDimNames& dim_names); + + // Dtor. virtual ~YkVarBase() { } + public: + + // Accessors to core. + YkVarBaseCore* get_corep() { + return _corep; + } + const YkVarBaseCore* get_corep() const { + return _corep; + } + // Wrappers to GenericVar. 
- virtual GenericVarBase* get_gvbp() =0; - virtual const GenericVarBase* get_gvbp() const =0; - virtual const IdxTuple& get_dim_tuple() const =0; - virtual const std::string& get_name() const =0; - virtual bool is_dim_used(const std::string& dim) const =0; - virtual const std::string& get_dim_name(int n) const =0; - virtual idx_t get_dim_size(int n) const =0; - virtual void set_dim_size(int n, idx_t size) =0; - virtual int get_numa_pref() const =0; - virtual bool set_numa_pref(int numa_node) =0; - virtual void default_alloc() =0; - virtual void release_storage() =0; - virtual void* get_storage() =0; - virtual const void* get_storage() const =0; - virtual size_t get_num_bytes() const =0; - virtual void set_storage(std::shared_ptr& base, size_t offset) =0; - - // Num dims. + const IdxTuple& get_dim_tuple() const { + return get_gvbp()->get_dim_tuple(); + } + const std::string& get_name() const { + return get_gvbp()->get_name(); + } + bool is_dim_used(const std::string& dim) const { + return get_gvbp()->is_dim_used(dim); + } + const std::string& get_dim_name(int n) const { + return get_gvbp()->get_dim_name(n); + } + idx_t get_dim_size(int n) const { + return get_gvbp()->get_dim_size(n); + } + void set_dim_size(int n, idx_t size) { + get_gvbp()->set_dim_size(n, size); + } + int get_numa_pref() const { + return get_gvbp()->get_numa_pref(); + }; + bool set_numa_pref(int numa_node) { + return get_gvbp()->set_numa_pref(numa_node); + }; + void default_alloc() { + get_gvbp()->default_alloc(); + }; + void release_storage() { + get_gvbp()->release_storage(true); + }; + void* get_storage() { + return get_gvbp()->get_storage(); + }; + const void* get_storage() const { + return get_gvbp()->get_storage(); + }; + size_t get_num_bytes() const { + return get_gvbp()->get_num_bytes(); + }; + void set_storage(std::shared_ptr& base, size_t offset) { + return get_gvbp()->set_storage(base, offset); + }; + + // Num dims in this var. + // Not necessarily same as stencil problem. 
inline int get_num_dims() const { - return _domains._get_num_dims(); + return _corep->_domains.get_num_dims(); + } + + // Num domain dims in this var. + inline int get_num_domain_dims() const { + return _num_domain_dims; } // Dims same? @@ -203,26 +750,30 @@ namespace yask { void update_valid_step(idx_t t); inline void update_valid_step(const Indices& indices) { if (_has_step_dim) - update_valid_step(indices[+Indices::step_posn]); + update_valid_step(indices[+step_posn]); } inline void init_valid_steps() { if (_has_step_dim) - _local_offsets[+Indices::step_posn] = 0; + _corep->_local_offsets[+step_posn] = 0; } // Halo-exchange flag accessors. - virtual bool is_dirty(idx_t step_idx) const; - virtual void set_dirty(bool dirty, idx_t step_idx); - virtual void set_dirty_all(bool dirty); - inline void set_dirty_using_alloc_index(bool dirty, idx_t alloc_idx) { - _dirty_steps[alloc_idx] = dirty; + virtual bool is_dirty(dirty_idx whose, idx_t step_idx) const; + virtual void set_dirty(dirty_idx whose, bool dirty, idx_t step_idx); + inline void set_dirty_using_alloc_index(dirty_idx whose, bool dirty, idx_t alloc_idx) { + _dirty_steps[whose][alloc_idx] = dirty; } + virtual void set_dirty_all(dirty_idx whose, bool dirty); + + // Coherency. + const Coherency& get_coh() const { return _coh; } + Coherency& get_coh() { return _coh; } // Resize flag accessors. virtual void set_fixed_size(bool is_fixed) { _fixed_size = is_fixed; if (is_fixed) { - _rank_offsets.set_from_const(0); + _corep->_rank_offsets.set_from_const(0); _is_dynamic_step_alloc = true; _is_dynamic_misc_alloc = true; } @@ -234,7 +785,7 @@ namespace yask { _is_dynamic_misc_alloc = is_dynamic; } - // Does this var cover the N-D domain? + // Does this var cover the n-D domain? virtual bool is_domain_var() const; // Scratch accessors. 
@@ -244,7 +795,7 @@ namespace yask { virtual void set_scratch(bool is_scratch) { _is_scratch = is_scratch; if (is_scratch) - _rank_offsets.set_from_const(0); + _corep->_rank_offsets.set_from_const(0); } // New-var accessors. @@ -268,33 +819,19 @@ namespace yask { // Adjust logical time index to 0-based index // using temporal allocation size. inline idx_t _wrap_step(idx_t t) const { - - // Index wraps in tdim. - // Examples based on tdim == 2: - // t => return value. - // --- ------------- - // -2 => 0. - // -1 => 1. - // 0 => 0. - // 1 => 1. - // 2 => 0. - - // Avoid discontinuity caused by dividing negative numbers - // using floored-mod. - idx_t res = imod_flr(t, _domains[+Indices::step_posn]); - return res; + return _corep->_wrap_step(t); } // Convert logical step index to index in allocated range. // If this var doesn't use the step dim, returns 0. inline idx_t get_alloc_step_index(const Indices& indices) const { - return _has_step_dim ? _wrap_step(indices[+Indices::step_posn]) : 0; + return _has_step_dim ? _wrap_step(indices[+step_posn]) : 0; } // Get var dims with allocations in number of reals. virtual IdxTuple get_allocs() const { IdxTuple allocs = get_dim_tuple(); // make a copy. - _allocs.set_tuple_vals(allocs); + _corep->_allocs.set_tuple_vals(allocs); return allocs; } @@ -302,35 +839,69 @@ namespace yask { virtual std::string _make_info_string() const =0; virtual std::string make_info_string(bool long_info = false) const; + // Print one element. + virtual void print_elem(const std::string& msg, + const Indices& idxs, + real_t e, + int line) const; + + // Print one vector. + // Indices must be normalized and rank-relative. + virtual void print_vec_norm(const std::string& msg, + const Indices& idxs, + const real_vec_t& val, + int line) const; + // Check for equality. // Return number of mismatches greater than epsilon. virtual idx_t compare(const YkVarBase* ref, real_t epsilon = EPSILON, int max_print = 20) const; + // Copy data to/from device. 
+ void copy_data_to_device(); + void copy_data_from_device(); + + // Versions that lie about being 'const' so we can copy data to/from + // the device without changing the API user's view that it has not changed. + inline void const_copy_data_to_device() const { + const_cast(this)->copy_data_to_device(); + } + inline void const_copy_data_from_device() const { + const_cast(this)->copy_data_from_device(); + } + // Set elements. virtual void set_all_elements_in_seq(double seed) =0; virtual void set_all_elements_same(double seed) =0; // Set/get_elements_in_slice(). - virtual idx_t set_elements_in_slice_same(double val, - const Indices& first_indices, - const Indices& last_indices, - bool strict_indices); - virtual idx_t set_elements_in_slice(const void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices); - virtual idx_t get_elements_in_slice(void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) const; + idx_t set_elements_in_slice_same(double val, + const Indices& first_indices, + const Indices& last_indices, + bool strict_indices, + bool on_device); + idx_t set_elements_in_slice(const void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices, + bool on_device); + idx_t get_elements_in_slice(void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices, + bool on_device) const; // Possibly vectorized version of set/get_elements_in_slice(). virtual idx_t set_vecs_in_slice(const void* buffer_ptr, const Indices& first_indices, - const Indices& last_indices) =0; + const Indices& last_indices, + bool on_device) =0; virtual idx_t get_vecs_in_slice(void* buffer_ptr, const Indices& first_indices, - const Indices& last_indices) const =0; + const Indices& last_indices, + bool on_device) const =0; + + // Get strides in underlying storage. + virtual Indices get_vec_strides() const =0; // Get a pointer to one element. // Indices are relative to overall problem domain. 
@@ -338,59 +909,158 @@ namespace yask { // to be within the allocated space. This avoids lots of 'idiv' instrs. // Methods are implemented in concrete classes for efficiency. virtual const real_t* get_elem_ptr(const Indices& idxs, - idx_t alloc_step_idx, - bool check_bounds=true) const =0; + idx_t alloc_step_idx, + bool check_bounds=true) const =0; virtual real_t* get_elem_ptr(const Indices& idxs, - idx_t alloc_step_idx, - bool check_bounds=true) =0; + idx_t alloc_step_idx, + bool check_bounds=true) =0; // Read one element. // Indices are relative to overall problem domain. virtual real_t read_elem(const Indices& idxs, - idx_t alloc_step_idx, - int line) const =0; + idx_t alloc_step_idx, + int line) const =0; // Write one element. // Indices are relative to overall problem domain. - inline void write_elem(real_t val, - const Indices& idxs, - idx_t alloc_step_idx, - int line) { - real_t* ep = get_elem_ptr(idxs, alloc_step_idx); - *ep = val; -#ifdef TRACE_MEM - print_elem("write_elem", idxs, val, line); -#endif - } + virtual void write_elem(real_t val, + const Indices& idxs, + idx_t alloc_step_idx, + int line) =0; - // Update one element. + // Update one element atomically. // Indices are relative to overall problem domain. inline void add_to_elem(real_t val, - const Indices& idxs, - idx_t alloc_step_idx, - int line) { + const Indices& idxs, + idx_t alloc_step_idx, + int line) { real_t* ep = get_elem_ptr(idxs, alloc_step_idx); -#pragma omp atomic update + #pragma omp atomic update *ep += val; -#ifdef TRACE_MEM + #ifdef TRACE_MEM print_elem("add_to_elem", idxs, *ep, line); -#endif + #endif } - // Print one element. - virtual void print_elem(const std::string& msg, - const Indices& idxs, - real_t e, - int line) const; + protected: + // Templated method to visit points in a slice. + // Visitor should implement the following: + // static const char* fname(); // name of calling function. + // static void visit(YkVarBase* varp, // 'this' ptr. 
+ // real_t* p, // copy of 'buffer_ptr'. + // idx_t pofs, // offset into buffer. + // const Indices& pt, // point in 'this' var. + // idx_t ti); // precomputed step index. + template + idx_t _visit_elements_in_slice(bool strict_indices, + void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices, + bool on_device) { + STATE_VARS(this); + if (get_storage() == 0) { + if (strict_indices) + THROW_YASK_EXCEPTION(std::string("Error: call to '") + + Visitor::fname() + + "' with no storage allocated for var '" + + get_name() + "'"); + return 0; + } + + TRACE_MSG(Visitor::fname() << ": " << make_info_string() << " [" << + make_index_string(first_indices) << " ... " << + make_index_string(last_indices) << "] with buffer at " << + buffer_ptr << " on " << (on_device ? "OMP device" : "host")); + check_indices(first_indices, Visitor::fname(), strict_indices, true, false); + check_indices(last_indices, Visitor::fname(), strict_indices, true, false); + + // Find range. + auto range = get_slice_range(first_indices, last_indices); + auto ne = range.product(); + TRACE_MSG(Visitor::fname() << ": " << ne << " element(s) in shape " << + make_index_string(range)); + if (ne <= 0) + return 0; + + // Iterate through step index in outer loop. + // This avoids calling _wrap_step(t) at every point. + const auto sp = +step_posn; + idx_t first_t = 0, last_t = 0; + if (_has_step_dim) { + first_t = first_indices[sp]; + last_t = last_indices[sp]; + range[sp] = 1; // Do one step per iter. + } + + // Amount to advance pointer each step. + idx_t tsz = range.product(); + idx_t tofs = 0; + TRACE_MSG(Visitor::fname() << ": " << tsz << " element(s) in shape " << + make_index_string(range) << " for each step"); + + // Iterate through inner index in inner loop. + // This enables more optimization. + const auto ip = get_num_dims() - 1; + idx_t ni = range[ip]; + range[ip] = 1; // Do whole range in each iter. 
+ TRACE_MSG(Visitor::fname() << ": " << ni << + " element(s) for each starting-point in shape " << + make_index_string(range) << " for each inner loop"); + + // Make copy of first_indices to use as starting point + // of each step. + auto start_indices(first_indices); + + // Outer loop through each step. + for (idx_t t = first_t; t <= last_t; t++) { + + // Do only this one step in this iteration. + idx_t ti = 0; + if (_has_step_dim) { + ti = _wrap_step(t); + start_indices[sp] = t; + } + + // Visit points in slice on host in parallel. + if (!on_device) { + range.visit_all_points_in_parallel + (false, + [&](const Indices& ofs, size_t idx) { + auto pt = start_indices.add_elements(ofs); + + // Inner loop. + for (idx_t i = 0; i < ni; i++) { + idx_t bofs = tofs + idx * ni + i; + #if 0 + TRACE_MSG(Visitor::fname() << ": visting pt " << + make_index_string(pt) << " w/buf ofs " << bofs); + #endif + + // Call visitor. + Visitor::visit(this, (real_t*)buffer_ptr, bofs, pt, ti); + pt[ip]++; + } + + return true; // keep going. + }); + } + + // Visit points in slice on device. + else { + THROW_YASK_EXCEPTION(std::string("Internal error: '") + + Visitor::fname() + "' for var '" + + get_name() + "' not implemented for offload device"); + } - // Print one vector. - // Indices must be normalized and rank-relative. - virtual void print_vec_norm(const std::string& msg, - const Indices& idxs, - const real_vec_t& val, - int line) const; + // Skip to next step in buffer. + tofs += tsz; + } // steps. + TRACE_MSG(Visitor::fname() << " returns " << ne); + return ne; + } + }; typedef std::shared_ptr VarBasePtr; @@ -400,17 +1070,50 @@ namespace yask { template class YkElemVar final : public YkVarBase { + public: + // Type for core data. + typedef YkElemVarCore core_t; + static_assert(std::is_trivially_copyable::value, + "Needed for OpenMP offload"); + protected: - typedef GenericVar _var_type; - _var_type _data; + // Core data. + // This also contains the storage core: _core._data. 
+ core_t _core; + + // Storage meta-data. + // Owned here via composition. + // This contains a pointer to _core._data. + GenericVar _data; + + // Accessors to GenericVar. + virtual GenericVarBase* get_gvbp() override final { + return &_data; + } + virtual const GenericVarBase* get_gvbp() const override final { + return &_data; + } + + // Sync core meta-data on device. + // Does NOT sync underlying var data; see + // copy_data_{to,from}_device(). + void sync_core() override { + STATE_VARS(this); + auto* var_cp = &_core; + offload_copy_to_device(var_cp, 1); + _data.sync_data_ptr(); + } + public: YkElemVar(KernelStateBase& stateb, - std::string name, - const VarDimNames& dim_names) : - YkVarBase(stateb, dim_names), - _data(stateb, name, dim_names) { + std::string name, + const VarDimNames& dim_names) : + YkVarBase(stateb, &_core, dim_names), + _core(int(dim_names.size())), + _data(stateb, &_core._data, name, dim_names) { STATE_VARS(this); + TRACE_MSG("creating element-var '" + get_name() + "'"); _has_step_dim = _use_step_idx; // Init vec sizes. @@ -421,151 +1124,102 @@ namespace yask { auto& dname = dim_names.at(i); auto* p = dims->_vec_fold_pts.lookup(dname); idx_t dval = p ? *p : 1; - _soln_vec_lens[i] = dval; + _core._soln_vec_lens[i] = dval; + _core._var_vec_lens[i] = 1; } + // Create core on offload device. + auto* var_cp = &_core; + offload_map_alloc(var_cp, 1); + resize(); } - // Wrappers to GenericVar. 
- virtual GenericVarBase* get_gvbp() override final { - return &_data; - } - virtual const GenericVarBase* get_gvbp() const override final { - return &_data; - } - virtual const IdxTuple& get_dim_tuple() const override final { - return _data.get_dim_tuple(); - } - virtual const std::string& get_name() const override final { - return _data.get_name(); - } - virtual bool is_dim_used(const std::string& dim) const override final { - return _data.is_dim_used(dim); - } - virtual const std::string& get_dim_name(int n) const override final { - return _data.get_dim_name(n); - }; - virtual idx_t get_dim_size(int n) const override final { - return _data.get_dim_size(n); - } - virtual void set_dim_size(int n, idx_t size) override final { - _data.set_dim_size(n, size); - } - virtual int get_numa_pref() const override final { - return _data.get_numa_pref(); - }; - virtual bool set_numa_pref(int numa_node) override final { - return _data.set_numa_pref(numa_node); - }; - virtual void default_alloc() override final { - _data.default_alloc(); - }; - virtual void release_storage() override final { - _data.release_storage(); - }; - virtual void* get_storage() override final { - return _data.get_storage(); - }; - virtual const void* get_storage() const override final { - return _data.get_storage(); - }; - virtual size_t get_num_bytes() const override final { - return _data.get_num_bytes(); - }; - virtual void set_storage(std::shared_ptr& base, size_t offset) override final { - return _data.set_storage(base, offset); - }; + // Dtor. + virtual ~YkElemVar() { + STATE_VARS(this); + // Release core from device. + auto* var_cp = &_core; + offload_map_free(var_cp, 1); + } + // Make a human-readable description. virtual std::string _make_info_string() const override final { return _data.make_info_string("FP"); } // Init data. 
- virtual void set_all_elements_same(double seed) override final { + void set_all_elements_same(double seed) override final { _data.set_elems_same(seed); - set_dirty_all(true); + set_dirty_all(self, true); } - virtual void set_all_elements_in_seq(double seed) override final { + void set_all_elements_in_seq(double seed) override final { _data.set_elems_in_seq(seed); - set_dirty_all(true); + set_dirty_all(self, true); } // Get a pointer to given element. - virtual const real_t* get_elem_ptr(const Indices& idxs, - idx_t alloc_step_idx, - bool check_bounds=true) const final { - STATE_VARS_CONST(this); - TRACE_MEM_MSG(_data.get_name() << "." << "YkElemVar::get_elem_ptr(" << - idxs.make_val_str(get_num_dims()) << ")"); - const auto n = get_num_dims(); - Indices adj_idxs(n); - - // Special handling for step index. - auto sp = +Indices::step_posn; - if (_use_step_idx) { - assert(alloc_step_idx == _wrap_step(idxs[sp])); - adj_idxs[sp] = alloc_step_idx; - } - - // All other indices. - _UNROLL for (int i = 0; i < n; i++) { - if (!(_use_step_idx && i == sp)) { - - // Adjust for offsets and padding. - // This gives a positive 0-based local element index. - idx_t ai = idxs[i] + _actl_left_pads[i] - - (_rank_offsets[i] + _local_offsets[i]); - assert(ai >= 0); - adj_idxs[i] = uidx_t(ai); - } - } - -#ifdef TRACE_MEM - if (check_bounds) - TRACE_MEM_MSG(" => " << _data.get_index(adj_idxs)); -#endif - - // Get pointer via layout in _data. - return _data.get_ptr(adj_idxs, check_bounds); + const real_t* get_elem_ptr(const Indices& idxs, + idx_t alloc_step_idx, + bool check_bounds=true) const override final { + return _core.get_elem_ptr(idxs, alloc_step_idx, check_bounds); } // Non-const version. 
- virtual real_t* get_elem_ptr(const Indices& idxs, - idx_t alloc_step_idx, - bool check_bounds=true) final { - - const real_t* p = - const_cast(this)->get_elem_ptr(idxs, alloc_step_idx, check_bounds); - return const_cast(p); + real_t* get_elem_ptr(const Indices& idxs, + idx_t alloc_step_idx, + bool check_bounds=true) override final { + return _core.get_elem_ptr(idxs, alloc_step_idx, check_bounds); } // Read one element. // Indices are relative to overall problem domain. - virtual real_t read_elem(const Indices& idxs, - idx_t alloc_step_idx, - int line) const final { - const real_t* ep = YkElemVar::get_elem_ptr(idxs, alloc_step_idx); - real_t e = *ep; -#ifdef TRACE_MEM + real_t read_elem(const Indices& idxs, + idx_t alloc_step_idx, + int line) const override final { + real_t e = _core.read_elem(idxs, alloc_step_idx); + #ifdef TRACE_MEM print_elem("read_elem", idxs, e, line); -#endif + #endif return e; } + // Write one element. + // Indices are relative to overall problem domain. + void write_elem(real_t val, + const Indices& idxs, + idx_t alloc_step_idx, + int line) override final { + _core.write_elem(val, idxs, alloc_step_idx); + #ifdef TRACE_MEM + print_elem("write_elem", idxs, val, line); + #endif + } + // Non-vectorized fall-back versions. 
virtual idx_t set_vecs_in_slice(const void* buffer_ptr, const Indices& first_indices, - const Indices& last_indices) override { - return set_elements_in_slice(buffer_ptr, first_indices, last_indices); + const Indices& last_indices, + bool on_device) override { + return set_elements_in_slice(buffer_ptr, + first_indices, last_indices, on_device); } virtual idx_t get_vecs_in_slice(void* buffer_ptr, const Indices& first_indices, - const Indices& last_indices) const override { - return get_elements_in_slice(buffer_ptr, first_indices, last_indices); + const Indices& last_indices, + bool on_device) const override { + return get_elements_in_slice(buffer_ptr, + first_indices, last_indices, on_device); + } + + // Get strides in underlying storage. + // This will be element strides in this class. + virtual Indices get_vec_strides() const override { + return _data.get_strides(); } - }; // YkElemVar. + + }; // YkElemVar. // YASK var of real vectors. // Used for vars that contain all the folded dims. @@ -575,22 +1229,51 @@ namespace yask { template class YkVecVar final : public YkVarBase { + public: + // Type for core data. + typedef YkVecVarCore core_t; + static_assert(std::is_trivially_copyable::value, + "Needed for OpenMP offload"); + protected: - typedef GenericVar _var_type; - _var_type _data; - // Positions of var dims in vector fold dims. - Indices _vec_fold_posns; + // Core data. + // This also contains the storage core. + core_t _core; - public: + // Storage meta-data. + // Owned here via composition. + // This contains a pointer to _core._data. + GenericVar _data; + + // Accessors to GenericVar. + virtual GenericVarBase* get_gvbp() override final { + return &_data; + } + virtual const GenericVarBase* get_gvbp() const override final { + return &_data; + } + + // Sync core on device. + // Does NOT sync underlying var data; see + // copy_data_{to,from}_device(). 
+ void sync_core() override { + STATE_VARS(this); + auto* var_cp = &_core; + offload_copy_to_device(var_cp, 1); + _data.sync_data_ptr(); + } + + public: YkVecVar(KernelStateBase& stateb, - const std::string& name, - const VarDimNames& dim_names) : - YkVarBase(stateb, dim_names), - _data(stateb, name, dim_names), - _vec_fold_posns(idx_t(0), int(dim_names.size())) { + const std::string& name, + const VarDimNames& dim_names) : + YkVarBase(stateb, &_core, dim_names), + _core(int(dim_names.size())), + _data(stateb, &_core._data, name, dim_names) { STATE_VARS(this); - _has_step_dim = _use_step_idx; + TRACE_MSG("creating vector-var '" + get_name() + "'"); + _has_step_dim = _use_step_idx; // Template vec lengths. const int nvls = sizeof...(_templ_vec_lens); @@ -606,88 +1289,52 @@ namespace yask { auto& dname = dim_names.at(i); auto* p = dims->_vec_fold_pts.lookup(dname); idx_t dval = p ? *p : 1; - _soln_vec_lens[i] = dval; - _var_vec_lens[i] = dval; + _corep->_soln_vec_lens[i] = dval; + _corep->_var_vec_lens[i] = dval; // Must be same as that in template parameter pack. assert(dval == vls[i]); } // Init var-dim positions of fold dims. - assert(dims->_vec_fold_pts._get_num_dims() == NUM_VEC_FOLD_DIMS); + // TODO: figure out how to do this statically. + assert(dims->_vec_fold_pts.get_num_dims() == NUM_VEC_FOLD_DIMS); for (int i = 0; i < NUM_VEC_FOLD_DIMS; i++) { auto& fdim = dims->_vec_fold_pts.get_dim_name(i); int j = get_dim_posn(fdim, true, "internal error: folded var missing folded dim"); assert(j >= 0); - _vec_fold_posns[i] = j; + _core._vec_fold_posns[i] = j; } + // Create core on offload device. + auto* var_cp = &_core; + offload_map_alloc(var_cp, 1); + resize(); } - // Wrappers to GenericVar. 
- virtual GenericVarBase* get_gvbp() override final { - return &_data; - } - virtual const GenericVarBase* get_gvbp() const override final { - return &_data; - } - virtual const IdxTuple& get_dim_tuple() const override final { - return _data.get_dim_tuple(); - } - virtual const std::string& get_name() const override final { - return _data.get_name(); - } - virtual bool is_dim_used(const std::string& dim) const override final { - return _data.is_dim_used(dim); - } - virtual const std::string& get_dim_name(int n) const override final { - return _data.get_dim_name(n); - }; - virtual idx_t get_dim_size(int n) const override final { - return _data.get_dim_size(n); - } - virtual void set_dim_size(int n, idx_t size) override final { - _data.set_dim_size(n, size); - } - virtual int get_numa_pref() const override final { - return _data.get_numa_pref(); - }; - virtual bool set_numa_pref(int numa_node) override final { - return _data.set_numa_pref(numa_node); - }; - virtual void default_alloc() override final { - _data.default_alloc(); - }; - virtual void release_storage() override final { - _data.release_storage(); - }; - virtual void* get_storage() override final { - return _data.get_storage(); - }; - virtual const void* get_storage() const override final { - return _data.get_storage(); - }; - virtual size_t get_num_bytes() const override final { - return _data.get_num_bytes(); - }; - virtual void set_storage(std::shared_ptr& base, size_t offset) override final { - return _data.set_storage(base, offset); - }; + // Dtor. + virtual ~YkVecVar() { + STATE_VARS(this); + // Release core from device. + auto* var_cp = &_core; + offload_map_free(var_cp, 1); + } + // Make a human-readable description. - virtual std::string _make_info_string() const override final { + std::string _make_info_string() const override final { return _data.make_info_string("SIMD FP"); } // Init data. 
- virtual void set_all_elements_same(double seed) override final { + void set_all_elements_same(double seed) override final { real_vec_t seedv = seed; // bcast. _data.set_elems_same(seedv); - set_dirty_all(true); + set_dirty_all(self, true); } - virtual void set_all_elements_in_seq(double seed) override final { + void set_all_elements_in_seq(double seed) override final { real_vec_t seedv; auto n = seedv.get_num_elems(); @@ -697,328 +1344,332 @@ namespace yask { for (int i = 0; i < n; i++) seedv[i] = seed * (1.0 + double(i) / n); _data.set_elems_in_seq(seedv); - set_dirty_all(true); + set_dirty_all(self, true); } // Get a pointer to given element. - virtual const real_t* get_elem_ptr(const Indices& idxs, - idx_t alloc_step_idx, - bool check_bounds=true) const final { - STATE_VARS_CONST(this); - TRACE_MEM_MSG(_data.get_name() << "." << "YkVecVar::get_elem_ptr(" << - idxs.make_val_str(get_num_dims()) << ")"); - - // Use template vec lengths instead of run-time values for - // efficiency. - static constexpr int nvls = sizeof...(_templ_vec_lens); - static constexpr uidx_t vls[nvls] { _templ_vec_lens... }; - Indices vec_idxs(nvls), elem_ofs(nvls); -#ifdef DEBUG_LAYOUT - const auto nd = get_num_dims(); - assert(nd == nvls); -#endif - - // Special handling for step index. - auto sp = +Indices::step_posn; - if (_use_step_idx) { - assert(alloc_step_idx == _wrap_step(idxs[sp])); - vec_idxs[sp] = alloc_step_idx; - elem_ofs[sp] = 0; - } - - // Try to force compiler to use shifts instead of DIV and MOD - // when the vec-lengths are 2^n. - // All other indices. - _UNROLL _NO_VECTOR - for (int i = 0; i < nvls; i++) { - if (!(_use_step_idx && i == sp)) { - - // Adjust for offset and padding. - // This gives a positive 0-based local element index. - idx_t ai = idxs[i] + _actl_left_pads[i] - - (_rank_offsets[i] + _local_offsets[i]); - assert(ai >= 0); - uidx_t adj_idx = uidx_t(ai); - - // Get vector index and offset. 
- // Use unsigned DIV and MOD to avoid compiler having to - // emit code for preserving sign when using shifts. - vec_idxs[i] = idx_t(adj_idx / vls[i]); - elem_ofs[i] = idx_t(adj_idx % vls[i]); - assert(vec_idxs[i] == idx_t(adj_idx / _var_vec_lens[i])); - assert(elem_ofs[i] == idx_t(adj_idx % _var_vec_lens[i])); - } - } - - // Get only the vectorized fold offsets, i.e., those - // with vec-lengths > 1. - // And, they need to be in the original folding order, - // which might be different than the var-dim order. - Indices fold_ofs(NUM_VEC_FOLD_DIMS); - _UNROLL for (int i = 0; i < NUM_VEC_FOLD_DIMS; i++) { - int j = _vec_fold_posns[i]; - fold_ofs[i] = elem_ofs[j]; - } - - // Get 1D element index into vector. - auto i = dims->get_elem_index_in_vec(fold_ofs); - -#ifdef DEBUG_LAYOUT - // Compare to more explicit offset extraction. - IdxTuple eofs = get_allocs(); // get dims for this var. - elem_ofs.set_tuple_vals(eofs); // set vals from elem_ofs. - auto i2 = dims->get_elem_index_in_vec(eofs); - assert(i == i2); -#endif - - if (check_bounds) - TRACE_MEM_MSG(" => " << _data.get_index(vec_idxs) << - "[" << i << "]"); - - // Get pointer to vector. - const real_vec_t* vp = _data.get_ptr(vec_idxs, check_bounds); - - // Get pointer to element. - const real_t* ep = &(*vp)[i]; - return ep; + const real_t* get_elem_ptr(const Indices& idxs, + idx_t alloc_step_idx, + bool check_bounds=true) const override final { + return _core.get_elem_ptr(idxs, alloc_step_idx, check_bounds); } // Non-const version. - virtual real_t* get_elem_ptr(const Indices& idxs, - idx_t alloc_step_idx, - bool check_bounds=true) override final { - - const real_t* p = - const_cast(this)->get_elem_ptr(idxs, alloc_step_idx, - check_bounds); - return const_cast(p); + real_t* get_elem_ptr(const Indices& idxs, + idx_t alloc_step_idx, + bool check_bounds=true) override final { + return _core.get_elem_ptr(idxs, alloc_step_idx, check_bounds); } // Read one element. // Indices are relative to overall problem domain. 
- virtual real_t read_elem(const Indices& idxs, - idx_t alloc_step_idx, - int line) const override final { - const real_t* ep = YkVecVar::get_elem_ptr(idxs, alloc_step_idx); - real_t e = *ep; -#ifdef TRACE_MEM - print_elem("read_elem", idxs, e, line); -#endif - return e; + real_t read_elem(const Indices& idxs, + idx_t alloc_step_idx, + int line) const override final { + auto val = _core.read_elem(idxs, alloc_step_idx); + #ifdef TRACE_MEM + print_elem("read_elem", idxs, val, line); + #endif + return val; + } + + // Write one element. + // Indices are relative to overall problem domain. + void write_elem(real_t val, + const Indices& idxs, + idx_t alloc_step_idx, + int line) override final { + _core.write_elem(val, idxs, alloc_step_idx); + #ifdef TRACE_MEM + print_elem("write_elem", idxs, val, line); + #endif } // Get a pointer to given vector. // Indices must be normalized and rank-relative. // It's important that this function be efficient, since // it's indiectly used from the stencil kernel. - ALWAYS_INLINE const real_vec_t* get_vec_ptr_norm(const Indices& vec_idxs, - idx_t alloc_step_idx, - bool check_bounds=true) const { - STATE_VARS_CONST(this); - TRACE_MEM_MSG(_data.get_name() << "." << "YkVecVar::get_vec_ptr_norm(" << - vec_idxs.make_val_str(get_num_dims()) << ")"); - - static constexpr int nvls = sizeof...(_templ_vec_lens); -#ifdef DEBUG_LAYOUT - const auto nd = get_num_dims(); - assert(nd == nvls); -#endif - Indices adj_idxs(nvls); - - // Special handling for step index. - auto sp = +Indices::step_posn; - if (_use_step_idx) { - assert(alloc_step_idx == _wrap_step(vec_idxs[sp])); - adj_idxs[sp] = alloc_step_idx; - } - - // Domain indices. - _UNROLL for (int i = 0; i < nvls; i++) { - if (!(_use_step_idx && i == sp)) { - - // Adjust for padding. - // Since the indices are rank-relative, subtract only - // the local offsets. (Compare to get_elem_ptr().) - // This gives a 0-based local *vector* index. 
- adj_idxs[i] = vec_idxs[i] + _vec_left_pads[i] - _vec_local_offsets[i]; - } - } - TRACE_MEM_MSG(" => " << _data.get_index(adj_idxs, check_bounds)); - - // Get ptr via layout in _data. - return _data.get_ptr(adj_idxs, check_bounds); + const real_vec_t* get_vec_ptr_norm(const Indices& vec_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) const { + return _core.get_vec_ptr_norm(vec_idxs, alloc_step_idx, check_bounds); } // Non-const version. - ALWAYS_INLINE real_vec_t* get_vec_ptr_norm(const Indices& vec_idxs, - idx_t alloc_step_idx, - bool check_bounds=true) { - - const real_vec_t* p = - const_cast(this)->get_vec_ptr_norm(vec_idxs, - alloc_step_idx, check_bounds); - return const_cast(p); + real_vec_t* get_vec_ptr_norm(const Indices& vec_idxs, + idx_t alloc_step_idx, + bool check_bounds=true) { + return _core.get_vec_ptr_norm(vec_idxs, alloc_step_idx, check_bounds); } // Read one vector. // Indices must be normalized and rank-relative. // 'alloc_step_idx' is pre-calculated or 0 if not used. - inline real_vec_t read_vec_norm(const Indices& vec_idxs, - idx_t alloc_step_idx, - int line) const { - const real_vec_t* vp = get_vec_ptr_norm(vec_idxs, alloc_step_idx); - real_vec_t v = *vp; -#ifdef TRACE_MEM + real_vec_t read_vec_norm(const Indices& vec_idxs, + idx_t alloc_step_idx, + int line) const { + auto v = _core.read_vec_norm(vec_idxs, alloc_step_idx); + #ifdef TRACE_MEM print_vec_norm("read_vec_norm", vec_idxs, v, line); -#endif + #endif return v; } // Write one vector. // Indices must be normalized and rank-relative. // 'alloc_step_idx' is pre-calculated or 0 if not used. 
- inline void write_vec_norm(real_vec_t val, - const Indices& vec_idxs, - idx_t alloc_step_idx, - int line) { - real_vec_t* vp = get_vec_ptr_norm(vec_idxs, alloc_step_idx); - *vp = val; -#ifdef TRACE_MEM + void write_vec_norm(real_vec_t val, + const Indices& vec_idxs, + idx_t alloc_step_idx, + int line) { + _core.write_vec_norm(val, vec_idxs, alloc_step_idx); + #ifdef TRACE_MEM print_vec_norm("write_vec_norm", vec_idxs, val, line); -#endif - } - - // Prefetch one vector. - // Indices must be normalized and rank-relative. - // 'alloc_step_idx' is pre-calculated or 0 if not used. - template - ALWAYS_INLINE - void prefetch_vec_norm(const Indices& vec_idxs, - idx_t alloc_step_idx, - int line) const { - STATE_VARS_CONST(this); - TRACE_MEM_MSG("prefetch_vec_norm<" << level << ">(" << - make_index_string(vec_idxs.mul_elements(_var_vec_lens)) << ")"); - - auto p = get_vec_ptr_norm(vec_idxs, alloc_step_idx, false); - prefetch(p); -#ifdef MODEL_CACHE - cache_model.prefetch(p, level, line); -#endif - } - - // Vectorized version of set/get_elements_in_slice(). - // Indices must be vec-normalized and rank-relative. - virtual idx_t set_vecs_in_slice(const void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) override { + #endif + } + + private: + // Template for get/set_vecs_in_slice. + // Input indices are global and element granularity (not rank-local or normalized). + // This is similar to but simpler than _visit_elements_in_slice(). + template + idx_t _copy_vecs_in_slice(void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices, + bool on_device) { STATE_VARS(this); - if (_data.get_storage() == 0) - return 0; + assert(_data.get_storage() != 0); + + // Use the core for efficiency and to allow offload. + core_t* core_p = &_core; + + #ifdef USE_OFFLOAD_NO_USM + if (on_device) { + auto devn = KernelEnv::_omp_devn; + + // 'buffer_ptr' and 'core_p' should exist on device. 
+ assert(omp_target_is_present(buffer_ptr, devn)); + assert(omp_target_is_present(core_p, devn)); + } + #endif + Indices firstv, lastv; - check_indices(first_indices, "set_vecs_in_slice", true, false, true, &firstv); - check_indices(last_indices, "set_vecs_in_slice", true, false, true, &lastv); + check_indices(first_indices, "copy_vecs_in_slice", true, false, true, &firstv); + check_indices(last_indices, "copy_vecs_in_slice", true, false, true, &lastv); // Find range. - IdxTuple num_vecs_tuple = get_slice_range(firstv, lastv); - TRACE_MSG("set_vecs_in_slice: setting " << - num_vecs_tuple.make_dim_val_str(" * ") << " vecs at [" << - make_index_string(firstv) << " ... " << - make_index_string(lastv) << "]"); - - // Do step loop explicitly. - auto sp = +Indices::step_posn; + auto vec_range = get_slice_range(firstv, lastv); + auto nv = vec_range.product(); + auto ne = nv * VLEN; + TRACE_MSG("copying " << nv << " vec(s) in " << + make_info_string() << " [" << + make_index_string(firstv) << " ... " << + make_index_string(lastv) << "] with buffer at " << + buffer_ptr << " on " << (on_device ? "OMP device" : "host")); + if (nv < 1) + return 0; + + // Iterate through step index in outer loop. + // This avoids calling _wrap_step(t) at every point. + auto sp = +step_posn; idx_t first_t = 0, last_t = 0; if (_has_step_dim) { first_t = firstv[sp]; last_t = lastv[sp]; - num_vecs_tuple[sp] = 1; // Do one at a time. + vec_range[sp] = 1; // Do one step per iter. + } + + // Amount to advance pointer each step. + idx_t tsz = vec_range.product(); + idx_t tofs = 0; + + // Determine inner-loop dim. + // Use last dim by default. + auto ip = get_num_dims() - 1; + + // Use first non-step dim by default if inner loop dim doesn't match layout. + // Remember that this var may not have either. + if (dims->_inner_loop_dim != dims->_inner_layout_dim) + ip = _has_step_dim ? 1 : 0; + + // Look for specified dim. + // TODO: determine actual first or last layout dim. 
+ for (int i = 0; i < get_num_dims(); i++) { + if (get_dim_name(i) == dims->_inner_loop_dim) { + ip = i; + break; + } } - idx_t iofs = 0; + + // Extract inner-loop range and re-init it in 'vec_range'. + idx_t ni = vec_range[ip]; + vec_range[ip] = 1; // Do whole range in each iter. + + // Inner-loop stride. + idx_t si = core_p->_vec_strides[ip]; + + // Outer loop through each step. for (idx_t t = first_t; t <= last_t; t++) { - // Do only this one step in this iteration. + // Do only step 't' in this iteration. idx_t ti = 0; if (_has_step_dim) { ti = _wrap_step(t); firstv[sp] = t; - lastv[sp] = t; } - // Visit points in slice. - num_vecs_tuple.visit_all_points_in_parallel - ([&](const IdxTuple& ofs, - size_t idx) { + if (on_device) { + #ifdef USE_OFFLOAD + auto devn = KernelEnv::_omp_devn; + auto nj = vec_range.product(); + + // Run outer loop on device in parallel. + _Pragma("omp target teams distribute parallel for device(devn)") + for (idx_t j = 0; j < nj; j++) { + + // Init vars for first point. + Indices ofs = vec_range.unlayout(false, j); Indices pt = firstv.add_elements(ofs); - real_vec_t val = ((real_vec_t*)buffer_ptr)[idx + iofs]; + auto* vp = core_p->get_vec_ptr_norm(pt, ti); + idx_t bofs = tofs + j * ni; + + // Inner loop. + for (idx_t i = 0; i < ni; i++) { + + // Do the copy operation specified in visitor. + Visitor::do_copy(((real_vec_t*)buffer_ptr), bofs, vp); + + // Next point in buffer and var. + vp += si; + bofs++; + } + } + #else + THROW_YASK_EXCEPTION("internal error: call to _copy_vecs_in_slice on device" + " in non-offload build"); + #endif + } - write_vec_norm(val, pt, ti, __LINE__); - return true; // keep going. - }); - iofs += num_vecs_tuple.product(); - } + // Visit starting points in range on host in parallel. + else { + vec_range.visit_all_points_in_parallel + (false, + [&](const Indices& ofs, size_t idx) { + + // Init vars for first point. 
+ auto pt = firstv.add_elements(ofs); + auto* vp = core_p->get_vec_ptr_norm(pt, ti); + idx_t bofs = tofs + idx * ni; + + // Inner loop. + for (idx_t i = 0; i < ni; i++) { + + // Do the copy operation specified in visitor. + Visitor::do_copy(((real_vec_t*)buffer_ptr), bofs, vp); + + // Next point in buffer and var. + vp += si; + bofs++; + } + return true; // keep going. + }); + } - // Set appropriate dirty flag(s). - set_dirty_in_slice(first_indices, last_indices); + // Skip to next step in buffer. + tofs += tsz; + + } // time steps. - return num_vecs_tuple.product() * VLEN; + assert(tofs == nv); + return ne; } - virtual idx_t get_vecs_in_slice(void* buffer_ptr, + public: + // Vectorized version of set_elements_in_slice(). + // Input indices are global and element granularity (not rank-local or normalized). + virtual idx_t set_vecs_in_slice(const void* buffer_ptr, const Indices& first_indices, - const Indices& last_indices) const override final { - STATE_VARS(this); - if (_data.get_storage() == 0) - FORMAT_AND_THROW_YASK_EXCEPTION("Error: call to 'get_vecs_in_slice' with no storage allocated for var '" << - _data.get_name()); - Indices firstv, lastv; - check_indices(first_indices, "get_vecs_in_slice", true, true, true, &firstv); - check_indices(last_indices, "get_vecs_in_slice", true, true, true, &lastv); - - // Find range. - IdxTuple num_vecs_tuple = get_slice_range(firstv, lastv); - TRACE_MSG("get_vecs_in_slice: getting " << - num_vecs_tuple.make_dim_val_str(" * ") << " vecs at " << - make_index_string(firstv) << " ... " << - make_index_string(lastv)); - auto n = num_vecs_tuple.product() * VLEN; - - // Do step loop explicitly. - auto sp = +Indices::step_posn; - idx_t first_t = 0, last_t = 0; - if (_has_step_dim) { - first_t = firstv[sp]; - last_t = lastv[sp]; - num_vecs_tuple[sp] = 1; // Do one at a time. - } - idx_t iofs = 0; - for (idx_t t = first_t; t <= last_t; t++) { - - // Do only this one step in this iteration. 
- idx_t ti = 0; - if (_has_step_dim) { - ti = _wrap_step(t); - firstv[sp] = t; - lastv[sp] = t; + const Indices& last_indices, + bool on_device = false) override { + + // Specialize do_copy() to copy from buffer to var. + // Could have used a lambda, but this avoids possible conversion to std::function. + struct SetVec { + ALWAYS_INLINE + static void do_copy(real_vec_t* p, idx_t pofs, + real_vec_t* vp) { + + // Read vec from buffer. + real_vec_t val = p[pofs]; + + // Write to var. + val.store_to(vp); } + }; + + if (on_device) + const_copy_data_to_device(); + else + const_copy_data_from_device(); + + // Call the generic vec copier. + auto nset = _copy_vecs_in_slice((void*)buffer_ptr, + first_indices, last_indices, + on_device); - // Visit points in slice. - num_vecs_tuple.visit_all_points_in_parallel - ([&](const IdxTuple& ofs, - size_t idx) { - Indices pt = firstv.add_elements(ofs); + // Set appropriate dirty flag(s). + if (on_device) + _coh.mod_dev(); + else + _coh.mod_host(); + set_dirty_in_slice(first_indices, last_indices); - real_vec_t val = read_vec_norm(pt, ti, __LINE__); - ((real_vec_t*)buffer_ptr)[idx + iofs] = val; - return true; // keep going. - }); - iofs += num_vecs_tuple.product(); - } - assert(iofs * VLEN == n); + return nset; + } + // Vectorized version of get_elements_in_slice(). + // Input indices are global and element granularity (not rank-local or normalized). + virtual idx_t get_vecs_in_slice(void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices, + bool on_device) const override final { + + // Specialize do_copy() to copy to buffer from var. + // Could have used a lambda, but this avoids possible conversion to std::function. + struct GetVec { + ALWAYS_INLINE + static void do_copy(real_vec_t* p, idx_t pofs, + real_vec_t* vp) { + + // Read vec from var. + real_vec_t res; + res.load_from(vp); + + // Write to buffer at proper index. 
+ p[pofs] = res; + } + }; + + if (on_device) + const_copy_data_to_device(); + else + const_copy_data_from_device(); + + // Call the generic vec copier. + auto n = const_cast(this)-> + _copy_vecs_in_slice((void*)buffer_ptr, + first_indices, last_indices, on_device); + // Return number of writes. return n; } + + // Get strides in underlying storage. + // This will be vector strides in this class. + virtual Indices get_vec_strides() const override { + return _data.get_strides(); + } + }; // YkVecVar. // Implementation of yk_var interface. Class contains no real data, @@ -1040,34 +1691,47 @@ namespace yask { assert(_gbp.get()); return *(_gbp.get()); } - inline YkVarBase& gb() const { + inline const YkVarBase& gb() const { assert(_gbp.get()); return *(_gbp.get()); } inline YkVarBase* gbp() { return _gbp.get(); } - inline YkVarBase* gbp() const { + inline const YkVarBase* gbp() const { return _gbp.get(); } + inline YkVarBaseCore* corep() { + return gb().get_corep(); + } + inline const YkVarBaseCore* corep() const { + return gb().get_corep(); + } // Pass-thru methods to base. void set_all_elements_in_seq(double seed) { gb().set_all_elements_in_seq(seed); } idx_t set_vecs_in_slice(const void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) { - return gb().set_vecs_in_slice(buffer_ptr, first_indices, last_indices); + const Indices& first_indices, + const Indices& last_indices, + bool on_device) { + return gb().set_vecs_in_slice(buffer_ptr, + first_indices, last_indices, on_device); } idx_t get_vecs_in_slice(void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) const { - return gb().get_vecs_in_slice(buffer_ptr, first_indices, last_indices); + const Indices& first_indices, + const Indices& last_indices, + bool on_device) const { + return gb().get_vecs_in_slice(buffer_ptr, + first_indices, last_indices, on_device); } void resize() { gb().resize(); } + void sync_core() { + gb().sync_core(); + } // APIs. 
// See yask_kernel_api.hpp. @@ -1077,6 +1741,9 @@ namespace yask { virtual int get_num_dims() const { return gb().get_num_dims(); } + virtual int get_num_domain_dims() const { + return gb().get_num_domain_dims(); + } virtual bool is_dim_used(const std::string& dim) const { return gb().is_dim_used(dim); } @@ -1086,7 +1753,7 @@ namespace yask { return gb().get_dim_name(n); } virtual VarDimNames get_dim_names() const { - std::vector dims(get_num_dims()); + string_vec dims(get_num_dims()); for (int i = 0; i < get_num_dims(); i++) dims.at(i) = get_dim_name(i); return dims; @@ -1111,14 +1778,14 @@ namespace yask { if (!gb()._has_step_dim) THROW_YASK_EXCEPTION("Error: 'get_first_valid_step_index()' called on var '" + get_name() + "' that does not use the step dimension"); - return gb()._local_offsets[+Indices::step_posn]; + return corep()->_local_offsets[+step_posn]; } virtual idx_t get_last_valid_step_index() const { if (!gb()._has_step_dim) THROW_YASK_EXCEPTION("Error: 'get_last_valid_step_index()' called on var '" + get_name() + "' that does not use the step dimension"); - return gb()._local_offsets[+Indices::step_posn] + - gb()._domains[+Indices::step_posn] - 1; + return corep()->_local_offsets[+step_posn] + + corep()->_domains[+step_posn] - 1; } virtual int get_halo_exchange_l1_norm() const { @@ -1129,12 +1796,20 @@ namespace yask { gb()._l1_dist = norm; } -#define GET_VAR_API(api_name) \ - virtual idx_t api_name(const std::string& dim) const; \ - virtual idx_t api_name(int posn) const; -#define SET_VAR_API(api_name) \ - virtual void api_name(const std::string& dim, idx_t n); \ - virtual void api_name(int posn, idx_t n); + // See yk_var_apis.cpp for corresponding definition macros. 
+ #define GET_VAR_API(api_name) \ + virtual idx_t api_name(const std::string& dim) const; \ + virtual idx_t api_name(int posn) const; + #define GET_VAR_API2(api_name) \ + GET_VAR_API(api_name) \ + virtual idx_t_vec api_name ## _vec() const; + #define SET_VAR_API(api_name) \ + virtual void api_name(const std::string& dim, idx_t n); \ + virtual void api_name(int posn, idx_t n); + #define SET_VAR_API2(api_name) \ + SET_VAR_API(api_name) \ + virtual void api_name ## _vec(idx_t_vec); \ + virtual void api_name ## _vec(idx_t_init_list); // Settings that should never be exposed as APIs because // they can break the usage model. @@ -1155,27 +1830,32 @@ namespace yask { SET_VAR_API(_set_rank_offset) SET_VAR_API(_set_right_pad_size) SET_VAR_API(_set_right_wf_ext) + SET_VAR_API(update_left_min_pad_size) + SET_VAR_API(update_right_min_pad_size) + SET_VAR_API(update_min_pad_size) + SET_VAR_API(update_left_extra_pad_size) + SET_VAR_API(update_right_extra_pad_size) + SET_VAR_API(update_extra_pad_size) // Exposed APIs. 
- GET_VAR_API(get_first_local_index) - GET_VAR_API(get_last_local_index) - GET_VAR_API(get_rank_domain_size) - GET_VAR_API(get_first_rank_domain_index) - GET_VAR_API(get_last_rank_domain_index) - GET_VAR_API(get_left_halo_size) - GET_VAR_API(get_right_halo_size) - GET_VAR_API(get_first_rank_halo_index) - GET_VAR_API(get_last_rank_halo_index) - GET_VAR_API(get_left_extra_pad_size) - GET_VAR_API(get_right_extra_pad_size) - GET_VAR_API(get_left_pad_size) - GET_VAR_API(get_right_pad_size) - GET_VAR_API(get_alloc_size) - GET_VAR_API(get_first_rank_alloc_index) - GET_VAR_API(get_last_rank_alloc_index) GET_VAR_API(get_first_misc_index) GET_VAR_API(get_last_misc_index) + GET_VAR_API2(get_first_local_index) + GET_VAR_API2(get_last_local_index) + GET_VAR_API2(get_rank_domain_size) + GET_VAR_API2(get_first_rank_domain_index) + GET_VAR_API2(get_last_rank_domain_index) + GET_VAR_API2(get_left_halo_size) + GET_VAR_API2(get_right_halo_size) + GET_VAR_API2(get_first_rank_halo_index) + GET_VAR_API2(get_last_rank_halo_index) + GET_VAR_API2(get_left_extra_pad_size) + GET_VAR_API2(get_right_extra_pad_size) + GET_VAR_API2(get_left_pad_size) + GET_VAR_API2(get_right_pad_size) + GET_VAR_API2(get_alloc_size) + SET_VAR_API(set_first_misc_index) SET_VAR_API(set_left_halo_size) SET_VAR_API(set_right_halo_size) SET_VAR_API(set_halo_size) @@ -1186,10 +1866,11 @@ namespace yask { SET_VAR_API(set_right_extra_pad_size) SET_VAR_API(set_extra_pad_size) SET_VAR_API(set_alloc_size) - SET_VAR_API(set_first_misc_index) -#undef GET_VAR_API -#undef SET_VAR_API + #undef GET_VAR_API + #undef GET_VAR_API2 + #undef SET_VAR_API + #undef SET_VAR_API2 virtual std::string format_indices(const Indices& indices) const { std::string str = get_name() + "(" + gb().make_index_string(indices) + ")"; @@ -1199,7 +1880,7 @@ namespace yask { const Indices indices2(indices); return format_indices(indices2); } - virtual std::string format_indices(const std::initializer_list& indices) const { + virtual std::string 
format_indices(const idx_t_init_list& indices) const { const Indices indices2(indices); return format_indices(indices2); } @@ -1209,7 +1890,7 @@ namespace yask { const Indices indices2(indices); return are_indices_local(indices2); } - virtual bool are_indices_local(const std::initializer_list& indices) const { + virtual bool are_indices_local(const idx_t_init_list& indices) const { const Indices indices2(indices); return are_indices_local(indices2); } @@ -1219,21 +1900,24 @@ namespace yask { const Indices indices2(indices); return get_element(indices2); } - virtual double get_element(const std::initializer_list& indices) const { + virtual double get_element(const idx_t_init_list& indices) const { const Indices indices2(indices); return get_element(indices2); } virtual idx_t get_elements_in_slice(void* buffer_ptr, const Indices& first_indices, - const Indices& last_indices) const { - return gb().get_elements_in_slice(buffer_ptr, first_indices, last_indices); + const Indices& last_indices, + bool on_device) const { + return gb().get_elements_in_slice(buffer_ptr, + first_indices, last_indices, on_device); } virtual idx_t get_elements_in_slice(void* buffer_ptr, const VarIndices& first_indices, const VarIndices& last_indices) const { const Indices first(first_indices); const Indices last(last_indices); - return get_elements_in_slice(buffer_ptr, first, last); + return get_elements_in_slice(buffer_ptr, + first, last, false); } virtual idx_t set_element(double val, const Indices& indices, @@ -1245,23 +1929,23 @@ namespace yask { return set_element(val, indices2, strict_indices); } virtual idx_t set_element(double val, - const std::initializer_list& indices, + const idx_t_init_list& indices, bool strict_indices = false) { const Indices indices2(indices); return set_element(val, indices2, strict_indices); } virtual idx_t add_to_element(double val, - const Indices& indices, - bool strict_indices = false); + const Indices& indices, + bool strict_indices = false); virtual idx_t 
add_to_element(double val, - const VarIndices& indices, - bool strict_indices = false) { + const VarIndices& indices, + bool strict_indices = false) { const Indices indices2(indices); return add_to_element(val, indices2, strict_indices); } virtual idx_t add_to_element(double val, - const std::initializer_list& indices, - bool strict_indices = false) { + const idx_t_init_list& indices, + bool strict_indices = false) { const Indices indices2(indices); return add_to_element(val, indices2, strict_indices); } @@ -1272,8 +1956,11 @@ namespace yask { virtual idx_t set_elements_in_slice_same(double val, const Indices& first_indices, const Indices& last_indices, - bool strict_indices) { - return gb().set_elements_in_slice_same(val, first_indices, last_indices, strict_indices); + bool strict_indices, + bool on_device) { + return gb().set_elements_in_slice_same(val, + first_indices, last_indices, + strict_indices, on_device); } virtual idx_t set_elements_in_slice_same(double val, const VarIndices& first_indices, @@ -1281,20 +1968,22 @@ namespace yask { bool strict_indices) { const Indices first(first_indices); const Indices last(last_indices); - return set_elements_in_slice_same(val, first, last, strict_indices); + return set_elements_in_slice_same(val, first, last, strict_indices, false); } virtual idx_t set_elements_in_slice(const void* buffer_ptr, const Indices& first_indices, - const Indices& last_indices) { - return gb().set_elements_in_slice(buffer_ptr, first_indices, last_indices); + const Indices& last_indices, + bool on_device) { + return gb().set_elements_in_slice(buffer_ptr, + first_indices, last_indices, on_device); } virtual idx_t set_elements_in_slice(const void* buffer_ptr, const VarIndices& first_indices, const VarIndices& last_indices) { const Indices first(first_indices); const Indices last(last_indices); - return set_elements_in_slice(buffer_ptr, first, last); + return set_elements_in_slice(buffer_ptr, first, last, false); } virtual void alloc_storage() { 
@@ -1315,7 +2004,7 @@ namespace yask { return idx_t(gb().get_num_bytes()); } virtual idx_t get_num_storage_elements() const { - return gb()._allocs.product(); + return corep()->_allocs.product(); } virtual bool is_storage_layout_identical(const YkVarImpl* other, bool check_sizes) const; diff --git a/src/kernel/lib/yk_var_apis.cpp b/src/kernel/lib/yk_var_apis.cpp index 7316c4dc..4b730004 100644 --- a/src/kernel/lib/yk_var_apis.cpp +++ b/src/kernel/lib/yk_var_apis.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -30,113 +30,279 @@ using namespace std; namespace yask { -#define DEPRECATED(api_name) \ - cerr << "\n*** WARNING: call to deprecated YASK API '" \ - #api_name "' that will be removed in a future release ***\n" - // APIs to get info from vars: one with name of dim with a lot - // of checking, and one with index of dim with no checking. -#define GET_VAR_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ - idx_t YkVarImpl::api_name(const string& dim) const { \ - STATE_VARS(gbp()); \ - dims->check_dim_type(dim, #api_name, step_ok, domain_ok, misc_ok); \ - int posn = gb().get_dim_posn(dim, true, #api_name); \ - idx_t mbit = 1LL << posn; \ - if (prep_req && gb()._rank_offsets[posn] < 0) \ - THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on var '" + \ - get_name() + "' before calling 'prepare_solution()'"); \ - auto rtn = expr; \ - return rtn; \ - } \ - idx_t YkVarImpl::api_name(int posn) const { \ - STATE_VARS(gbp()); \ - idx_t mbit = 1LL << posn; \ - auto rtn = expr; \ - return rtn; \ - } + // of checking, one with index of dim with no checking. 
+ #define GET_VAR_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ + idx_t YkVarImpl::api_name(const string& dim) const { \ + STATE_VARS(gbp()); \ + dims->check_dim_type(dim, #api_name, step_ok, domain_ok, misc_ok); \ + int posn = gb().get_dim_posn(dim, true, #api_name); \ + if (prep_req && corep()->_rank_offsets[posn] < 0) \ + THROW_YASK_EXCEPTION("Error: '" #api_name "()' called on var '" + \ + get_name() + "' before calling 'prepare_solution()'"); \ + auto cp = corep(); \ + auto rtn = expr; \ + return rtn; \ + } \ + idx_t YkVarImpl::api_name(int posn) const { \ + STATE_VARS(gbp()); \ + auto cp = corep(); \ + auto rtn = expr; \ + return rtn; \ + } + // Add vector version that returns only allowed results. + #define GET_VAR_API2(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ + GET_VAR_API(api_name, expr, step_ok, domain_ok, misc_ok, prep_req) \ + idx_t_vec YkVarImpl::api_name ## _vec() const { \ + STATE_VARS(gbp()); \ + if (prep_req && corep()->_rank_offsets[0] < 0) \ + THROW_YASK_EXCEPTION("Error: '" #api_name "_vec()' called on var '" + \ + get_name() + "' before calling 'prepare_solution()'"); \ + auto cp = corep(); \ + auto nvdims = get_num_dims(); \ + auto nadims = 0; \ + if (step_ok) nadims += gb()._num_step_dims; \ + if (domain_ok) nadims += gb()._num_domain_dims; \ + if (misc_ok) nadims += gb()._num_misc_dims; \ + idx_t_vec res(nadims, 0); \ + int i = 0; \ + for (int posn = 0; posn < nvdims; posn++) { \ + idx_t mbit = idx_t(1) << posn; \ + if ((step_ok && (mbit & gb()._step_dim_mask) != 0) || \ + (domain_ok && (mbit & gb()._domain_dim_mask) != 0) || \ + (misc_ok && (mbit & gb()._misc_dim_mask) != 0)) { \ + res.at(i++) = expr; \ + } \ + } \ + assert(i == nadims); \ + return res; \ + } + + // APIs to set vars. + #define SET_VAR_API(api_name, expr, step_ok, domain_ok, misc_ok, need_resize) \ + void YkVarImpl::api_name(const string& dim, idx_t n) { \ + STATE_VARS(gbp()); \ + TRACE_MSG("var '" << get_name() << "'." 
\ + #api_name "('" << dim << "', " << n << ")"); \ + dims->check_dim_type(dim, #api_name, step_ok, domain_ok, misc_ok); \ + int posn = gb().get_dim_posn(dim, true, #api_name); \ + auto cp = corep(); \ + expr; \ + if (need_resize) resize(); \ + else sync_core(); \ + } \ + void YkVarImpl::api_name(int posn, idx_t n) { \ + STATE_VARS(gbp()); \ + TRACE_MSG("var '" << get_name() << "'." \ + #api_name "('" << posn << "', " << n << ")"); \ + auto cp = corep(); \ + expr; \ + if (need_resize) resize(); \ + else sync_core(); \ + } + + // Add vector versions that take one val per dim allowed by the *_ok flags. + #define SET_VAR_API2(api_name, expr, step_ok, domain_ok, misc_ok, need_resize) \ + SET_VAR_API(api_name, expr, step_ok, domain_ok, misc_ok, need_resize) \ + void YkVarImpl::api_name ## _vec(idx_t_vec vals) { \ + STATE_VARS(gbp()); \ + TRACE_MSG("var '" << get_name() << "'." \ + #api_name "_vec(...)"); \ + auto cp = corep(); \ + auto nvdims = get_num_dims(); \ + auto nadims = 0; \ + if (step_ok) nadims += gb()._num_step_dims; \ + if (domain_ok) nadims += gb()._num_domain_dims; \ + if (misc_ok) nadims += gb()._num_misc_dims; \ + if (vals.size() != nadims) \ + THROW_YASK_EXCEPTION("Error: '" #api_name \ + "_vec()' called on var '" + \ + get_name() + "' without the proper number of values"); \ + int i = 0; \ + for (int posn = 0; posn < nvdims; posn++) { \ + idx_t mbit = idx_t(1) << posn; \ + if ((step_ok && (mbit & gb()._step_dim_mask) != 0) || \ + (domain_ok && (mbit & gb()._domain_dim_mask) != 0) || \ + (misc_ok && (mbit & gb()._misc_dim_mask) != 0)) { \ + auto n = vals.at(i++); \ + expr; \ + } \ + } \ + if (need_resize) resize(); \ + else sync_core(); \ + } \ + void YkVarImpl::api_name ## _vec(idx_t_init_list vals) { \ + idx_t_vec vec(vals); \ + api_name ## _vec(vec); \ + } + // Internal APIs. 
- GET_VAR_API(_get_left_wf_ext, gb()._left_wf_exts[posn], true, true, true, false) - GET_VAR_API(_get_right_wf_ext, gb()._right_wf_exts[posn], true, true, true, false) - GET_VAR_API(_get_soln_vec_len, gb()._soln_vec_lens[posn], true, true, true, true) - GET_VAR_API(_get_var_vec_len, gb()._var_vec_lens[posn], true, true, true, true) - GET_VAR_API(_get_rank_offset, gb()._rank_offsets[posn], true, true, true, true) - GET_VAR_API(_get_local_offset, gb()._local_offsets[posn], true, true, true, false) + GET_VAR_API(_get_left_wf_ext, + cp->_left_wf_exts[posn], + true, true, true, false) + GET_VAR_API(_get_right_wf_ext, + cp->_right_wf_exts[posn], + true, true, true, false) + GET_VAR_API(_get_soln_vec_len, + cp->_soln_vec_lens[posn], + true, true, true, true) + GET_VAR_API(_get_var_vec_len, + cp->_var_vec_lens[posn], + true, true, true, true) + GET_VAR_API(_get_rank_offset, + cp->_rank_offsets[posn], + true, true, true, true) + GET_VAR_API(_get_local_offset, + cp->_local_offsets[posn], + true, true, true, false) // Exposed APIs. 
- GET_VAR_API(get_first_local_index, gb().get_first_local_index(posn), true, true, true, true) - GET_VAR_API(get_last_local_index, gb().get_last_local_index(posn), true, true, true, true) - GET_VAR_API(get_first_misc_index, gb()._local_offsets[posn], false, false, true, false) - GET_VAR_API(get_last_misc_index, gb()._local_offsets[posn] + gb()._domains[posn] - 1, false, false, true, false) - GET_VAR_API(get_rank_domain_size, gb()._domains[posn], false, true, false, false) - GET_VAR_API(get_left_pad_size, gb()._actl_left_pads[posn], false, true, false, false) - GET_VAR_API(get_right_pad_size, gb()._actl_right_pads[posn], false, true, false, false) - GET_VAR_API(get_left_halo_size, gb()._left_halos[posn], false, true, false, false) - GET_VAR_API(get_right_halo_size, gb()._right_halos[posn], false, true, false, false) - GET_VAR_API(get_left_extra_pad_size, gb()._actl_left_pads[posn] - gb()._left_halos[posn], false, true, false, false) - GET_VAR_API(get_right_extra_pad_size, gb()._actl_right_pads[posn] - gb()._right_halos[posn], false, true, false, false) - GET_VAR_API(get_alloc_size, gb()._allocs[posn], true, true, true, false) - GET_VAR_API(get_first_rank_domain_index, gb()._rank_offsets[posn], false, true, false, true) - GET_VAR_API(get_last_rank_domain_index, gb()._rank_offsets[posn] + gb()._domains[posn] - 1, false, true, false, true) - GET_VAR_API(get_first_rank_halo_index, gb()._rank_offsets[posn] - gb()._left_halos[posn], false, true, false, true) - GET_VAR_API(get_last_rank_halo_index, gb()._rank_offsets[posn] + gb()._domains[posn] + - gb()._right_halos[posn] - 1, false, true, false, true) - GET_VAR_API(get_first_rank_alloc_index, gb().get_first_local_index(posn), false, true, false, true) - GET_VAR_API(get_last_rank_alloc_index, gb().get_last_local_index(posn), false, true, false, true) -#undef GET_VAR_API - - // APIs to set vars. 
-#define COMMA , -#define SET_VAR_API(api_name, expr, step_ok, domain_ok, misc_ok) \ - void YkVarImpl::api_name(const string& dim, idx_t n) { \ - STATE_VARS(gbp()); \ - TRACE_MSG("var '" << get_name() << "'." \ - #api_name "('" << dim << "', " << n << ")"); \ - dims->check_dim_type(dim, #api_name, step_ok, domain_ok, misc_ok); \ - int posn = gb().get_dim_posn(dim, true, #api_name); \ - idx_t mbit = 1LL << posn; \ - expr; \ - } \ - void YkVarImpl::api_name(int posn, idx_t n) { \ - STATE_VARS(gbp()); \ - idx_t mbit = 1LL << posn; \ - int dim = posn; \ - expr; \ - } + GET_VAR_API(get_first_misc_index, + cp->_local_offsets[posn], + false, false, true, false) + GET_VAR_API(get_last_misc_index, + cp->_local_offsets[posn] + cp->_domains[posn] - 1, + false, false, true, false) + + GET_VAR_API2(get_alloc_size, + cp->_allocs[posn], + true, true, true, false) + GET_VAR_API2(get_first_local_index, + cp->get_first_local_index(posn), + true, true, true, true) + GET_VAR_API2(get_last_local_index, + cp->get_last_local_index(posn), + true, true, true, true) + + GET_VAR_API2(get_left_pad_size, + cp->_actl_left_pads[posn], + false, true, false, false) + GET_VAR_API2(get_right_pad_size, + cp->_actl_right_pads[posn], + false, true, false, false) + GET_VAR_API2(get_left_halo_size, + cp->_left_halos[posn], + false, true, false, false) + GET_VAR_API2(get_right_halo_size, + cp->_right_halos[posn], + false, true, false, false) + GET_VAR_API2(get_left_extra_pad_size, + cp->_actl_left_pads[posn] - cp->_left_halos[posn], + false, true, false, false) + GET_VAR_API2(get_right_extra_pad_size, + cp->_actl_right_pads[posn] - cp->_right_halos[posn], + false, true, false, false) + + GET_VAR_API2(get_rank_domain_size, + cp->_domains[posn], + false, true, false, !gb()._fixed_size) + GET_VAR_API2(get_first_rank_domain_index, + cp->_rank_offsets[posn], + false, true, false, true) + GET_VAR_API2(get_last_rank_domain_index, + cp->_rank_offsets[posn] + cp->_domains[posn] - 1, + false, true, false, true) + 
GET_VAR_API2(get_first_rank_halo_index, + cp->_rank_offsets[posn] - cp->_left_halos[posn], + false, true, false, true) + GET_VAR_API2(get_last_rank_halo_index, + cp->_rank_offsets[posn] + cp->_domains[posn] + cp->_right_halos[posn] - 1, + false, true, false, true) // These are the internal, unchecked access functions that allow // changes prohibited thru the APIs. - SET_VAR_API(_set_rank_offset, gb()._rank_offsets[posn] = n, true, true, true) - SET_VAR_API(_set_local_offset, gb()._local_offsets[posn] = n; - assert(imod_flr(n, gb()._var_vec_lens[posn]) == 0); - gb()._vec_local_offsets[posn] = n / gb()._var_vec_lens[posn], true, true, true) - SET_VAR_API(_set_domain_size, gb()._domains[posn] = n; resize(), true, true, true) - SET_VAR_API(_set_left_pad_size, gb()._actl_left_pads[posn] = n; resize(), true, true, true) - SET_VAR_API(_set_right_pad_size, gb()._actl_right_pads[posn] = n; resize(), true, true, true) - SET_VAR_API(_set_left_wf_ext, gb()._left_wf_exts[posn] = n; resize(), true, true, true) - SET_VAR_API(_set_right_wf_ext, gb()._right_wf_exts[posn] = n; resize(), true, true, true) - SET_VAR_API(_set_alloc_size, gb()._domains[posn] = n; resize(), true, true, true) + SET_VAR_API(_set_rank_offset, + cp->_rank_offsets[posn] = n, + true, true, true, false) + SET_VAR_API(_set_local_offset, + cp->_local_offsets[posn] = n; + assert(imod_flr(n, cp->_var_vec_lens[posn]) == 0); + cp->_vec_local_offsets[posn] = n / cp->_var_vec_lens[posn], + true, true, true, false) + SET_VAR_API(_set_domain_size, + cp->_domains[posn] = n, + true, true, true, true) + SET_VAR_API(_set_left_pad_size, + cp->_actl_left_pads[posn] = n, + true, true, true, true) + SET_VAR_API(_set_right_pad_size, + cp->_actl_right_pads[posn] = n, + true, true, true, true) + SET_VAR_API(_set_left_wf_ext, + cp->_left_wf_exts[posn] = n, + true, true, true, true) + SET_VAR_API(_set_right_wf_ext, + cp->_right_wf_exts[posn] = n, + true, true, true, true) + SET_VAR_API(_set_alloc_size, + cp->_domains[posn] = n, + 
true, true, true, true) + SET_VAR_API(update_left_min_pad_size, + cp->_req_left_pads[posn] = max(n, cp->_req_left_pads[posn]), + false, true, false, true) + SET_VAR_API(update_right_min_pad_size, + cp->_req_right_pads[posn] = max(n, cp->_req_right_pads[posn]), + false, true, false, true) + SET_VAR_API(update_min_pad_size, + cp->_req_left_pads[posn] = max(n, cp->_req_left_pads[posn]); + cp->_req_right_pads[posn] = max(n, cp->_req_right_pads[posn]), + false, true, false, true) + SET_VAR_API(update_left_extra_pad_size, + cp->_req_left_epads[posn] = max(n, cp->_req_left_epads[posn]), + false, true, false, true) + SET_VAR_API(update_right_extra_pad_size, + cp->_req_right_epads[posn] = max (n, cp->_req_right_epads[posn]), + false, true, false, true) + SET_VAR_API(update_extra_pad_size, + cp->_req_left_epads[posn] = max(n, cp->_req_left_epads[posn]); + cp->_req_right_epads[posn] = max (n, cp->_req_right_epads[posn]), + false, true, false, true) // These are the safer ones used in the APIs. - SET_VAR_API(set_left_halo_size, gb()._left_halos[posn] = n; resize(), false, true, false) - SET_VAR_API(set_right_halo_size, gb()._right_halos[posn] = n; resize(), false, true, false) - SET_VAR_API(set_halo_size, gb()._left_halos[posn] = gb()._right_halos[posn] = n; resize(), false, true, false) - SET_VAR_API(set_alloc_size, gb()._domains[posn] = n; resize(), - gb()._is_dynamic_step_alloc, gb()._fixed_size, gb()._is_dynamic_misc_alloc) - SET_VAR_API(set_left_min_pad_size, gb()._req_left_pads[posn] = n; resize(), false, true, false) - SET_VAR_API(set_right_min_pad_size, gb()._req_right_pads[posn] = n; resize(), false, true, false) - SET_VAR_API(set_min_pad_size, gb()._req_left_pads[posn] = gb()._req_right_pads[posn] = n; resize(), - false, true, false) - SET_VAR_API(set_left_extra_pad_size, gb()._req_left_epads[posn] = n; resize(), false, true, false) - SET_VAR_API(set_right_extra_pad_size, gb()._req_right_epads[posn] = n; resize(), false, true, false) - SET_VAR_API(set_extra_pad_size, 
gb()._req_left_epads[posn] = gb()._req_right_epads[posn] = n; resize(), - false, true, false) - SET_VAR_API(set_first_misc_index, gb()._local_offsets[posn] = n, false, false, gb()._is_user_var) -#undef COMMA -#undef SET_VAR_API + SET_VAR_API(set_first_misc_index, + cp->_local_offsets[posn] = n, + false, false, gb()._is_user_var, false) + SET_VAR_API(set_alloc_size, + cp->_domains[posn] = n, + gb()._is_dynamic_step_alloc, gb()._fixed_size, gb()._is_dynamic_misc_alloc, true) + + SET_VAR_API(set_left_halo_size, + cp->_left_halos[posn] = n, + false, true, false, true) + SET_VAR_API(set_right_halo_size, + cp->_right_halos[posn] = n, + false, true, false, true) + SET_VAR_API(set_halo_size, + cp->_left_halos[posn] = cp->_right_halos[posn] = n, + false, true, false, true) + SET_VAR_API(set_left_min_pad_size, + cp->_req_left_pads[posn] = n, + false, true, false, true) + SET_VAR_API(set_right_min_pad_size, + cp->_req_right_pads[posn] = n, + false, true, false, true) + SET_VAR_API(set_min_pad_size, + cp->_req_left_pads[posn] = n; + cp->_req_right_pads[posn] = n, + false, true, false, true) + SET_VAR_API(set_left_extra_pad_size, + cp->_req_left_epads[posn] = n, + false, true, false, true) + SET_VAR_API(set_right_extra_pad_size, + cp->_req_right_epads[posn] = n, + false, true, false, true) + SET_VAR_API(set_extra_pad_size, + cp->_req_left_epads[posn] = n; + cp->_req_right_epads[posn] = n, + false, true, false, true) + #undef SET_VAR_API + #undef SET_VAR_API2 + #undef GET_VAR_API + #undef GET_VAR_API2 bool YkVarImpl::is_storage_layout_identical(const YkVarImpl* op, - bool check_sizes) const { + bool check_sizes) const { // Same size? if (check_sizes && get_num_storage_bytes() != op->get_num_storage_bytes()) @@ -153,16 +319,16 @@ namespace yask { return false; // Same folding? - if (gb()._var_vec_lens[i] != op->gb()._var_vec_lens[i]) + if (corep()->_var_vec_lens[i] != op->corep()->_var_vec_lens[i]) return false; // Same dim sizes? 
if (check_sizes) { - if (gb()._domains[i] != op->gb()._domains[i]) + if (corep()->_domains[i] != op->corep()->_domains[i]) return false; - if (gb()._actl_left_pads[i] != op->gb()._actl_left_pads[i]) + if (corep()->_actl_left_pads[i] != op->corep()->_actl_left_pads[i]) return false; - if (gb()._actl_right_pads[i] != op->gb()._actl_right_pads[i]) + if (corep()->_actl_right_pads[i] != op->corep()->_actl_right_pads[i]) return false; } } @@ -219,14 +385,17 @@ namespace yask { get_name() + "'"); gb().check_indices(indices, "get_element", true, true, false); idx_t asi = gb().get_alloc_step_index(indices); + + gb().const_copy_data_from_device(); // TODO: make more efficient. real_t val = gb().read_elem(indices, asi, __LINE__); + TRACE_MSG("get_element({" << gb().make_index_string(indices) << "}) on '" << get_name() + "' returns " << val); return double(val); } idx_t YkVarImpl::set_element(double val, - const Indices& indices, - bool strict_indices) { + const Indices& indices, + bool strict_indices) { STATE_VARS(gbp()); TRACE_MSG("set_element(" << val << ", {" << gb().make_index_string(indices) << "}, " << @@ -242,11 +411,14 @@ namespace yask { // that updates the step index. gb().check_indices(indices, "set_element", strict_indices, false, false)) { idx_t asi = gb().get_alloc_step_index(indices); + + gb().copy_data_from_device(); // TODO: make more efficient. gb().write_elem(real_t(val), indices, asi, __LINE__); nup++; - // Set appropriate dirty flag. - gb().set_dirty_using_alloc_index(true, asi); + // Set appropriate dirty flags. 
+ gb()._coh.mod_host(); + gb().set_dirty_using_alloc_index(YkVarBase::self, true, asi); } TRACE_MSG("set_element(" << val << ", {" << gb().make_index_string(indices) << "}, " << @@ -255,8 +427,8 @@ namespace yask { return nup; } idx_t YkVarImpl::add_to_element(double val, - const Indices& indices, - bool strict_indices) { + const Indices& indices, + bool strict_indices) { STATE_VARS(gbp()); TRACE_MSG("add_to_element(" << val << ", {" << gb().make_index_string(indices) << "}, " << @@ -271,11 +443,20 @@ namespace yask { // Check step index because this API must read before writing. gb().check_indices(indices, "add_to_element", strict_indices, true, false)) { idx_t asi = gb().get_alloc_step_index(indices); + + #ifdef USE_OFFLOAD_NO_USM + if (gb()._coh.need_to_update_host()) { + #pragma omp critical + gb().copy_data_from_device(); // TODO: make more efficient. + } + #endif + gb().add_to_elem(real_t(val), indices, asi, __LINE__); nup++; - // Set appropriate dirty flag. - gb().set_dirty_using_alloc_index(true, asi); + // Set appropriate dirty flags. + gb()._coh.mod_host(); + gb().set_dirty_using_alloc_index(YkVarBase::self, true, asi); } TRACE_MSG("add_to_element(" << val << ", {" << gb().make_index_string(indices) << "}, " << @@ -284,134 +465,142 @@ namespace yask { return nup; } + // Read into buffer from *this. idx_t YkVarBase::get_elements_in_slice(void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) const { - STATE_VARS(this); - TRACE_MSG("get_elements_in_slice(" << buffer_ptr << ", {" << - make_index_string(first_indices) << "}, {" << - make_index_string(last_indices) << "}) on " << - make_info_string()); - if (get_storage() == 0) - THROW_YASK_EXCEPTION("Error: call to 'get_elements_in_slice' with no storage allocated for var '" + - get_name() + "'"); - check_indices(first_indices, "get_elements_in_slice", true, true, false); - check_indices(last_indices, "get_elements_in_slice", true, true, false); - - // Find range. 
- IdxTuple num_elems_tuple = get_slice_range(first_indices, last_indices); - - // Visit points in slice. - num_elems_tuple.visit_all_points_in_parallel - ([&](const IdxTuple& ofs, size_t idx) { - Indices pt = first_indices.add_elements(ofs); - - // TODO: move this outside of loop for const step index. - idx_t asi = get_alloc_step_index(pt); - - real_t val = read_elem(pt, asi, __LINE__); - ((real_t*)buffer_ptr)[idx] = val; - return true; // keep going. - }); - auto nup = num_elems_tuple.product(); - TRACE_MSG("get_elements_in_slice(" << buffer_ptr << ", {" << - make_index_string(first_indices) << "}, {" << - make_index_string(last_indices) << "}) on '" << - get_name() + "' returns " << nup); - return nup; - } - idx_t YkVarBase::set_elements_in_slice_same(double val, - const Indices& first_indices, - const Indices& last_indices, - bool strict_indices) { - STATE_VARS(this); - TRACE_MSG("set_elements_in_slice_same(" << val << ", {" << - make_index_string(first_indices) << "}, {" << - make_index_string(last_indices) << "}, " << - strict_indices << ") on " << - make_info_string()); - if (get_storage() == 0) { - if (strict_indices) - THROW_YASK_EXCEPTION("Error: call to 'set_elements_in_slice_same' with no storage allocated for var '" + - get_name() + "'"); - return 0; - } + const Indices& first_indices, + const Indices& last_indices, + bool on_device) const { + // A specialized visitor. + struct GetElem { + static const char* fname() { + return "get_elements_in_slice"; + } - // 'Fixed' copy of indices. - Indices first, last; - check_indices(first_indices, "set_elements_in_slice_same", - strict_indices, false, false, &first); - check_indices(last_indices, "set_elements_in_slice_same", - strict_indices, false, false, &last); + // Copy from the var to the buffer. + ALWAYS_INLINE + static void visit(YkVarBase* varp, + real_t* p, idx_t pofs, + const Indices& pt, idx_t ti) { - // Find range. - IdxTuple num_elems_tuple = get_slice_range(first, last); + // Read from var. 
+ real_t val = varp->read_elem(pt, ti, __LINE__); - // Visit points in slice. - // TODO: optimize by setting vectors when possible. - num_elems_tuple.visit_all_points_in_parallel([&](const IdxTuple& ofs, - size_t idx) { - Indices pt = first.add_elements(ofs); + // Write to buffer at proper index. + p[pofs] = val; + } + }; + + if (on_device) + const_copy_data_to_device(); + else + const_copy_data_from_device(); + + // Call the generic visit. + auto n = const_cast(this)-> + _visit_elements_in_slice(true, (void*)buffer_ptr, + first_indices, last_indices, on_device); + + // Return number of writes. + return n; + } - // TODO: move this outside of loop for const step index. - idx_t asi = get_alloc_step_index(pt); + // Write to *this from buffer. + idx_t YkVarBase::set_elements_in_slice(const void* buffer_ptr, + const Indices& first_indices, + const Indices& last_indices, + bool on_device) { + // A specialized visitor. + struct SetElem { + static const char* fname() { + return "set_elements_in_slice"; + } - write_elem(real_t(val), pt, asi, __LINE__); - return true; // keep going. - }); + // Copy from the buffer to the var. + ALWAYS_INLINE + static void visit(YkVarBase* varp, + real_t* p, idx_t pofs, + const Indices& pt, idx_t ti) { - // Set appropriate dirty flag(s). - set_dirty_in_slice(first, last); + // Read from buffer. 
+ real_t val = p[pofs]; - auto nup = num_elems_tuple.product(); - TRACE_MSG("set_elements_in_slice_same(" << val << ", {" << - make_index_string(first_indices) << "}, {" << - make_index_string(last_indices) << "}, " << - strict_indices << ") on '" << - get_name() + "' returns " << nup); - return nup; - } - idx_t YkVarBase::set_elements_in_slice(const void* buffer_ptr, - const Indices& first_indices, - const Indices& last_indices) { - STATE_VARS(this); - TRACE_MSG("set_elements_in_slice(" << buffer_ptr << ", {" << - make_index_string(first_indices) << "}, {" << - make_index_string(last_indices) << "}) on " << - make_info_string()); - if (get_storage() == 0) - THROW_YASK_EXCEPTION("Error: call to 'set_elements_in_slice' with no storage allocated for var '" + - get_name() + "'"); - check_indices(first_indices, "set_elements_in_slice", true, false, false); - check_indices(last_indices, "set_elements_in_slice", true, false, false); + // Write to var + varp->write_elem(val, pt, ti, __LINE__); + } + }; + + if (on_device) + const_copy_data_to_device(); + else + const_copy_data_from_device(); + + // Call the generic visit. + auto n = + _visit_elements_in_slice(true, (void*)buffer_ptr, + first_indices, last_indices, on_device); + + // Set appropriate dirty flags. + if (on_device) + _coh.mod_dev(); + else + _coh.mod_host(); + set_dirty_in_slice(first_indices, last_indices); - // Find range. - IdxTuple num_elems_tuple = get_slice_range(first_indices, last_indices); + // Return number of writes. + return n; + } - // Visit points in slice. - num_elems_tuple.visit_all_points_in_parallel - ([&](const IdxTuple& ofs, - size_t idx) { - Indices pt = first_indices.add_elements(ofs); + // Write to *this from 'val'. + idx_t YkVarBase::set_elements_in_slice_same(double val, + const Indices& first_indices, + const Indices& last_indices, + bool strict_indices, + bool on_device) { + // A specialized visitor. 
+ struct SetElem { + static const char* fname() { + return "set_elements_in_slice_same"; + } - // TODO: move this outside of loop for const step index. - idx_t asi = get_alloc_step_index(pt); + // Set the var. + ALWAYS_INLINE + static void visit(YkVarBase* varp, + real_t* p, idx_t pofs, + const Indices& pt, idx_t ti) { - real_t val = ((real_t*)buffer_ptr)[idx]; - write_elem(val, pt, asi, __LINE__); - return true; // keep going. - }); + // Get const value, ignoring offset. + real_t val = *p; - // Set appropriate dirty flag(s). + // Write to var + varp->write_elem(val, pt, ti, __LINE__); + } + }; + + if (on_device) + const_copy_data_to_device(); + else + const_copy_data_from_device(); + + // Set up pointer to val for visitor. + // Requires casting if real_t is a float. + real_t v = real_t(val); + auto* buffer_ptr = &v; + + // Call the generic visit. + auto n = + _visit_elements_in_slice(strict_indices, (void*)buffer_ptr, + first_indices, last_indices, on_device); + + // Set appropriate dirty flags. + if (on_device) + _coh.mod_dev(); + else + _coh.mod_host(); set_dirty_in_slice(first_indices, last_indices); - auto nup = num_elems_tuple.product(); - TRACE_MSG("set_elements_in_slice(" << buffer_ptr << ", {" << - make_index_string(first_indices) << "}, {" << - make_index_string(last_indices) << "}) on '" << - get_name() + "' returns " << nup); - return nup; + // Return number of writes. + return n; } - + } // namespace. 
diff --git a/src/kernel/swig/yask_kernel_api.i b/src/kernel/swig/yask_kernel_api.i index ad21c3d0..8beb7c18 100644 --- a/src/kernel/swig/yask_kernel_api.i +++ b/src/kernel/swig/yask_kernel_api.i @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/kernel/tests/openmp_test.cpp b/src/kernel/tests/openmp_test.cpp new file mode 100644 index 00000000..05a44748 --- /dev/null +++ b/src/kernel/tests/openmp_test.cpp @@ -0,0 +1,432 @@ +/***************************************************************************** + +YASK: Yet Another Stencil Kit +Copyright (c) 2014-2022, Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. 
+ +*****************************************************************************/ + +// Test some OpenMP constructs with a hand-coded 2-stage, 1-D stencil. +// Test hierarchical offloading if enabled. + +#include +#include +#include +#include +#include +#include + +using namespace std; + +// Macro defaults +#ifndef FN +#define FN 3 +#endif +#ifndef USE_HOST_OMP +#define USE_HOST_OMP 1 +#endif +#ifndef USE_NESTED_HOST_OMP +#define USE_NESTED_HOST_OMP 1 +#endif + +// Rounding macros for integer types. +#define CEIL_DIV(numer, denom) (((numer) + (denom) - 1) / (denom)) +#define ROUND_UP(n, mult) (CEIL_DIV(n, mult) * (mult)) +#define ROUND_DOWN(n, mult) (((n) / (mult)) * (mult)) + +// Determine whether val is "close enough" to ref. +template +bool within_tolerance(T val, T ref, T epsilon) { + if (val == ref) + return true; + bool ok; + double adiff = fabs(val - ref); + if (fabs(ref) > 1.0) + epsilon = fabs(ref * epsilon); + ok = adiff < epsilon; + return ok; +} + +// Divide 'num' equally into 'nparts'. +// Returns the *cumulative* sizes of the 0-'n'th parts, +// if 0 <= 'n' < 'nparts' and 0 if n < 0. +// The <0 case is handy for calculating the initial +// starting point when passing 'n'-1 and 'n'==0. +template +inline T div_equally_cumu_size_n(T num, T nparts, T n) { + if (n < 0) + return 0; + T p = (num / nparts) * (n + 1); + T rem = num % nparts; + p += (n < rem) ? (n + 1) : rem; + return p; +} + +// Various functions to implement. +// FN_A and FN_B are 1-element computations from A and B, resp. +// FN_C is a stencil from 2 points in A and 1 point in C. +// Enable one. + +// Using simple non-library math. +#if FN==1 +#define FN_A(a) (3. * a) +#define FN_B(b) (b * 2. + 1.) +#define FN_C(am, ap, c) ((2. * (am) + (ap)) + 4. * (c)) + +// Using math lib fns. +#elif FN==2 +#define FN_A(a) (cos(a) - 2. 
* sin(a))
+#define FN_B(b) (pow(b, 1./2.5))
+#define FN_C(am, ap, c) (atan((am) + (ap)) + cbrt(c))
+
+// Same as FN=2, but using sincos() instead of separate sin() and cos().
+#elif FN==3
+#define FN_A(a) (sincos(a, &sa, &ca), ca - 2. * sa)
+#define CHK_A(a) (cos(a) - 2. * sin(a))
+#define FN_B(b) (pow(b, 1./2.5))
+#define FN_C(am, ap, c) (atan((am) + (ap)) + cbrt(c))
+#else
+#error "FN macro value not valid"
+#endif
+
+// Checking fns are same as perf fns if not specifically defined.
+#ifndef CHK_A
+#define CHK_A(a) FN_A(a)
+#endif
+#ifndef CHK_B
+#define CHK_B(b) FN_B(b)
+#endif
+#ifndef CHK_C
+#define CHK_C(am, ap, c) FN_C(am, ap, c)
+#endif
+
+// Initial value.
+#define INIT_FN(ai, i) (5. + (2. * (ai)) + (i))
+
+int main(int argc, char* argv[]) {
+    // Optional positional args: [num host threads] [problem size] [num regions] [num blocks per region] [num runs]. Returns 0 if every run verifies; 1 on bad usage, no offload device, or any mismatch.
+    cout << "Outer host OpenMP parallel region is "
+    #if USE_HOST_OMP==0
+        "NOT "
+    #endif
+        "enabled.\n";
+    cout << "Inner host OpenMP parallel region is "
+    #if USE_NESTED_HOST_OMP==0
+        "NOT "
+    #endif
+        "enabled.\n";
+
+    // Cmd-line settings.
+    long pn = 1024; // Problem size.
+    int nreg = 2; // Number of regions.
+    int nthr = 4; // Number of host threads.
+    int nruns = 2; // Number of runs.
+    bool help = false;
+    #if USE_HOST_OMP==0
+    nthr = 1;
+    #endif
+
+    int ai = 0;
+    ai++;
+    if (argc > ai) {
+        if (string(argv[ai]) == "-h")
+            help = true;
+        else
+            nthr = atoi(argv[ai]);
+    }
+    ai++;
+    if (argc > ai)
+        pn = atol(argv[ai]);
+    ai++;
+    if (argc > ai)
+        nreg = atoi(argv[ai]);
+    int nblks = nthr * 2 - nthr / 2; // Blocks per region.
+    ai++;
+    if (argc > ai)
+        nblks = atoi(argv[ai]);
+    ai++;
+    if (argc > ai)
+        nruns = atoi(argv[ai]);
+
+    cout << "Current settings: "
+        " num-host-threads=" << nthr <<
+        ", prob-size=" << pn <<
+        ", num-regions=" << nreg <<
+        ", num-blocks=" << nblks <<
+        ", num-runs=" << nruns <<
+        endl;
+    if (help || nthr < 1 || pn < 1 || nreg < 1 || nblks < 1 || nruns < 1) {
+        cerr << "Usage: " << argv[0] << " [num host threads] [problem size]"
+            " [num regions] [num blocks per region] [num runs]\n";
+        return 1;
+    }
+
+    #ifdef USE_OFFLOAD
+    int ndev = omp_get_num_devices();
+    int hostn = omp_get_initial_device();
+    int devn = omp_get_default_device();
+    cout << ndev << " OMP device(s)\n"
+        " host: " << hostn << "\n"
+        " dev: " << devn << endl << flush;
+    if (ndev == 0) {
+        cerr << "No OMP devices available.\n";
+        exit(1);
+    }
+
+    // Dummy OMP offload section to trigger JIT.
+    int MOLUE = 42;
+    #pragma omp target data device(devn) map(MOLUE)
+    { }
+
+    #else
+    cout << "NOT testing on offload device.\n";
+    #endif
+
+    constexpr long halo_sz = 2;
+    long an = pn + 2 * halo_sz; // array size.
+    size_t bsz = an * sizeof(double); // array size in bytes.
+    constexpr int na = 3; // num of arrays.
+    double* p[na]; // host ptrs.
+    #ifdef USE_OFFLOAD
+    void* devp[na]; // dev ptrs.
+    #endif
+    for (int k = 0; k < na; k++) {
+        p[k] = new double[an];
+        assert(p[k]);
+    }
+
+    for (int r = 0; r < nruns; r++) {
+        long pbegin = halo_sz;
+        long pend = pbegin + pn;
+        cout << "Run " << (r+1) << endl <<
+            " Working range = [" << pbegin << "..." <<
+            pend << ")\n" << flush;
+
+        // Init data.
+        for (int k = 0; k < na; k++) {
+            for (long i = 0; i < an; i++)
+                p[k][i] = INIT_FN(k, i);
+
+            // alloc buffers on target and copy to it.
+            #ifdef USE_OFFLOAD
+            devp[k] = omp_target_alloc(bsz, devn);
+            assert(devp[k]);
+            int res = omp_target_associate_ptr(p[k], devp[k], bsz, 0, devn);
+            assert(res == 0);
+            assert(omp_target_is_present(p[k], devn));
+            res = omp_target_memcpy(devp[k], p[k], bsz, 0, 0, devn, hostn);
+            assert(res == 0);
+
+            // invalidate local copy to catch data transfer problems.
+            memset(p[k], 0x55, bsz);
+            #endif
+        }
+        omp_set_max_active_levels(2);
+
+        // Divide pn into regions.
+        long n_per_reg = CEIL_DIV(pn, nreg);
+
+        // Divide region into blocks.
+        long n_per_blk = CEIL_DIV(n_per_reg, nblks);
+        cout << " Using " << nreg << " region(s) of " << n_per_reg <<
+            " point(s) each comprising " << nblks << " block(s) of " <<
+            n_per_blk << " point(s) each.\n";
+
+        // Calc regions sequentially.
+        for (int reg = 0; reg < nreg; reg++) {
+            long rbegin = reg * n_per_reg + halo_sz;
+            long rend = (reg+1) * n_per_reg + halo_sz; // Don't need to trim yet.
+            cout << " Region " << reg << " on [" <<
+                rbegin << "..." << rend << ")\n";
+
+            // Calc both stages in this region.
+            for (int stage = 0; stage < 2; stage++) {
+                long sbegin = rbegin;
+                long send = rend;
+
+                // Shift after 1st stage to handle dependencies.
+                long shift = halo_sz * stage;
+                sbegin -= shift;
+                if (reg < nreg-1)
+                    send -= shift;
+                else
+                    send = an - halo_sz;
+
+                // Trim to array size w/o halos.
+                sbegin = max(sbegin, pbegin);
+                send = min(send, pend);
+                cout << " Stage " << stage << " on [" <<
+                    sbegin << "..." << send << ")\n" <<
+                    " Scheduling " << nthr << " host thread(s) on " <<
+                    nblks << " blk(s) of data...\n" << flush;
+
+                // OMP on host.
+                #if USE_HOST_OMP==1
+                omp_set_num_threads(nthr);
+                #pragma omp parallel for schedule(dynamic,1)
+                #endif
+                for ( long i = 0; i < nblks; i++ )
+                {
+                    long begin = sbegin + i * n_per_blk;
+                    long end = min(begin + n_per_blk, send);
+                    if (i == nblks-1)
+                        end = send;
+                    int tn0 = omp_get_thread_num();
+                    #if USE_HOST_OMP==1
+                    #pragma omp critical
+                    #endif
+                    {
+                        cout << " Running thread " << tn0 << " on blk [" <<
+                            begin << "..." << end << ")\n" << flush;
+                    }
+
+                    double* A = p[0];
+                    double* B = p[1];
+                    double* C = p[2];
+
+                    // Calc current stage in current block.
+                    // Use OMP on target if enabled.
+                    if (stage == 0) {
+                        #ifdef USE_OFFLOAD
+                        #pragma omp target parallel for device(devn) schedule(static,1)
+                        #endif
+                        for (long j = begin; j < end; j++)
+                        {
+                            double sa, ca;
+                            A[j] = FN_A(A[j]);
+                            B[j] = FN_B(B[j]);
+                        }
+                    }
+
+                    // 2nd stage, 1st run.
+                    else if (r == 0) {
+                        #ifdef USE_OFFLOAD
+                        #pragma omp target teams distribute parallel for device(devn)
+                        #elif USE_NESTED_HOST_OMP==1
+                        #pragma omp parallel for num_threads(nthr)
+                        #endif
+                        for (long j = begin; j < end; j++)
+                        {
+                            if (j == begin) {
+                                int ntm1 = omp_get_num_teams();
+                                int nt1 = omp_get_num_threads();
+                                int nthr = ntm1 * nt1;
+                                printf(" Running thread %i w/%i teams and %i threads/team: %i threads\n",
+                                       tn0, ntm1, nt1, nthr);
+                                #ifndef USE_OFFLOAD
+                                fflush(stdout);
+                                #endif
+                            }
+                            C[j] = FN_C(A[j - halo_sz], A[j + halo_sz], C[j]);
+                        }
+                    }
+
+                    // 2nd stage, >1st run.
+                    else {
+
+                        // Use a manually-generated parallel loop after the 1st run.
+                        #ifdef USE_OFFLOAD
+                        #pragma omp critical
+                        {
+                            cout << " Launching kernel from host thread " << tn0 <<
+                                "\n" << flush;
+                        }
+                        #pragma omp target teams device(devn)
+                        #pragma omp parallel
+                        #elif USE_NESTED_HOST_OMP==1
+                        #pragma omp parallel num_threads(nthr)
+                        #endif
+                        {
+                            int ntm1 = omp_get_num_teams();
+                            int nt1 = omp_get_num_threads();
+                            int tmn1 = omp_get_team_num();
+                            int tn1 = omp_get_thread_num();
+                            long nthr = ntm1 * nt1;
+                            long tn = (tmn1 * nt1 + tn1);
+                            long niters = end - begin;
+
+                            // Calculate begin and end points for this thread.
+                            long tbegin = div_equally_cumu_size_n(niters, nthr, tn - 1) + begin;
+                            long tend = div_equally_cumu_size_n(niters, nthr, tn) + begin;
+                            #ifdef SHOW_THREADS
+                            #pragma omp critical
+                            {
+                                printf(" Running thread %i w/team %i/%i & thread %i/%i (%li/%li) on [%li...%li)\n",
+                                       tn0, tmn1, ntm1, tn1, nt1, tn, nthr, tbegin, tend);
+                                #ifndef USE_OFFLOAD
+                                fflush(stdout);
+                                #endif
+                            }
+                            #endif
+
+                            for (long j = tbegin; j < tend; j++)
+                            {
+                                C[j] = FN_C(A[j - halo_sz], A[j + halo_sz], C[j]);
+                            }
+                        }
+                    }
+                }
+            } // stages.
+        } // regions.
+
+        // Copy results back and free mem on dev.
+        #ifdef USE_OFFLOAD
+        for (int k = 0; k < na; k++) {
+            omp_target_memcpy(p[k], devp[k], bsz, 0, 0, hostn, devn);
+            omp_target_disassociate_ptr(p[k], devn);
+            omp_target_free(devp[k], devn);
+        }
+        #endif
+
+        // Check.
+
+        long cbegin = halo_sz;
+        long cend = cbegin + pn;
+        cout << "Checking [" << cbegin << "..." << cend << ")\n" << flush;
+        long nbad = 0;
+        for (long i = cbegin; i < cend; i++) {
+            double sa, ca;
+            double expected = CHK_A(INIT_FN(0, i));
+            if (!within_tolerance(p[0][i], expected, 1e-6)) {
+                cout << "A[" << i << "] = " << p[0][i] << "; expecting " << expected << endl;
+                nbad++;
+            }
+            expected = CHK_B(INIT_FN(1, i));
+            if (!within_tolerance(p[1][i], expected, 1e-6)) {
+                cout << "B[" << i << "] = " << p[1][i] << "; expecting " << expected << endl;
+                nbad++;
+            }
+            expected = CHK_C(p[0][i-halo_sz], p[0][i+halo_sz], INIT_FN(2, i));
+            if (!within_tolerance(p[2][i], expected, 1e-6)) {
+                cout << "C[" << i << "] = " << p[2][i] << "; expecting " << expected << endl;
+                nbad++;
+            }
+        }
+
+        cout << "Run " << (r+1) << ": num errors: " << nbad << endl;
+        if (nbad)
+            return 1; // Not 'nbad': only the low 8 bits of the status reach the parent, so e.g. 3072 errors would read as success.
+
+    } // Runs.
+    for (int k = 0; k < na; k++)
+        delete[] p[k];
+
+    return 0;
+}
diff --git a/src/kernel/tests/var_test.cpp b/src/kernel/tests/var_test.cpp
index f90f7094..300d8c5c 100644
--- a/src/kernel/tests/var_test.cpp
+++ b/src/kernel/tests/var_test.cpp
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 YASK: Yet Another Stencil Kit
-Copyright (c) 2014-2021, Intel Corporation
+Copyright (c) 2014-2022, Intel Corporation
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to
@@ -24,10 +24,10 @@ IN THE SOFTWARE.
 *****************************************************************************/
 
 // Test the YASK vars.
-// This must be compiled with a kernel lib containing 'x', 'y', and 'z' dims.
+// This must be compiled with a kernel containing 'x', 'y', and 'z' dims.
 
-// enable assert().
-#define DEBUG
+// Enable extra checking.
+#define DEBUG_LAYOUT #include "yask_stencil.hpp" using namespace std; @@ -37,21 +37,26 @@ using namespace yask; #define DEFINE_CONTEXT #include "yask_stencil_code.hpp" -int main(int argc, char** argv) { +void run_tests(int argc, char* argv[]) { - // Bootstrap factory from kernel API. + // Bootstrap objects from kernel API. yk_factory kfac; + yask_output_factory yof; // Set up the environment (e.g., OpenMP & MPI). + yk_env::set_debug_output(yof.new_stdout_output()); + yk_env::set_trace_enabled(true); auto kenv = kfac.new_env(); // Object containing data and parameters for stencil eval. // TODO: do everything through API without cast to StencilContext. auto ksoln = kfac.new_solution(kenv); + ksoln->apply_command_line_options(argc, argv); + auto ebytes = ksoln->get_element_bytes(); auto context = dynamic_pointer_cast(ksoln); assert(context.get()); ostream& os = kenv->get_debug_output()->get_ostream(); - auto settings = context->get_settings(); + auto settings = context->get_actl_opts(); // Problem dimensions. auto dims = YASK_STENCIL_CONTEXT::new_dims(); @@ -61,19 +66,21 @@ int main(int argc, char** argv) { // Set domain size. int i = 0; for (auto dname : ksoln->get_domain_dim_names()) { - ksoln->set_rank_domain_size(dname, 9 + i++); + ksoln->set_rank_domain_size(dname, 3 + i * 2); + i++; } // 0D test. { os << "0-D test...\n"; VarDimNames gdims; - string name = "test var"; - auto gb0 = make_shared>(*context, name, gdims); + + // Make two scalar vars. + auto gb0 = make_shared>(*context, "var1", gdims); YkVarPtr g0 = make_shared(gb0); g0->alloc_storage(); os << gb0->make_info_string() << endl; - auto gb1 = make_shared>(*context, name, gdims); + auto gb1 = make_shared>(*context, "var2", gdims); YkVarPtr g1 = make_shared(gb1); g1->alloc_storage(); os << gb1->make_info_string() << endl; @@ -85,19 +92,25 @@ int main(int argc, char** argv) { auto v0 = g0->get_element({}); auto v1 = g1->get_element({}); assert(v0 == v1); + os << "Exiting 0-D test\n"; } // 3D test. 
{ os << "3-D test...\n"; VarDimNames gdims = {"x", "y", "z"}; - string name = "test var"; - auto gb3 = make_shared>(*context, name, gdims); + + // Make two 3D vars. + // An element-storage var. + auto gb3 = make_shared>(*context, "var3", gdims); YkVarPtr g3 = make_shared(gb3); - auto gb3f = make_shared>(*context, name, gdims); + + // A vec-storage var (folded). + auto gb3f = make_shared>(*context, "var4", gdims); YkVarPtr g3f = make_shared(gb3f); + int i = 0; - int min_pad = 3; + int min_pad = 1; for (auto dname : gdims) { g3->_set_domain_size(dname, ksoln->get_rank_domain_size(dname)); g3->set_min_pad_size(dname, min_pad + i); @@ -106,38 +119,154 @@ int main(int argc, char** argv) { i++; } g3->alloc_storage(); - os << gb3->make_info_string() << endl; g3f->alloc_storage(); - os << gb3f->make_info_string() << endl; + auto sizes = gb3->get_allocs(); + auto sizesf = gb3f->get_allocs(); + + // gf3 may be larger because of folding. + assert(sizes.product() <= sizesf.product()); + if (VLEN_X * VLEN_Y * VLEN_Z == 1) + assert(sizes.product() == sizesf.product()); - os << "Copying seq of vals\n"; + os << "Setting vals in " << gb3->get_name() << endl; gb3->set_all_elements_in_seq(1.0); - auto sizes = gb3->get_allocs(); - sizes.visit_all_points_in_parallel([&](const IdxTuple& pt, - size_t idx) { - IdxTuple pt2 = pt; - for (auto dname : gdims) - pt2[dname] += g3->get_first_rank_alloc_index(dname); - Indices ipt(pt2); - auto val = gb3->read_elem(ipt, 0, __LINE__); - gb3f->write_elem(val, ipt, 0, __LINE__); - return true; - }); - os << "Checking seq of vals\n"; - sizes.visit_all_points([&](const IdxTuple& pt, - size_t idx) { - IdxTuple pt2 = pt; - for (auto dname : gdims) - pt2[dname] += g3->get_first_rank_alloc_index(dname); - Indices ipt(pt2); - ipt.add_const(-min_pad); - auto val = gb3->read_elem(ipt, 0, __LINE__); - auto valf = gb3f->read_elem(ipt, 0, __LINE__); - assert(val == valf); - return true; - }); + + IdxTuple first, last; + for (auto dname : gdims) { + 
first.add_dim_back(dname, g3->get_first_rank_alloc_index(dname)); + last.add_dim_back(dname, g3->get_last_rank_alloc_index(dname)); + } + Indices firsti(first), lasti(last); + + size_t sz = g3f->get_num_storage_bytes(); + char* buf = new char[sz]; + offload_map_alloc(buf, sz); + + bool done = false; + for (int testn = 0; !done; testn++) { + + // Fill w/bad values. + gb3f->set_all_elements_same(-1.0); + gb3f->copy_data_to_device(); + + os << testn << ". copying seq of vals to " << gb3f->get_name() << endl; + switch (testn) { + + case 0: { + os << " element-by-element in parallel on host...\n"; + sizes.visit_all_points_in_parallel + ( [&](const IdxTuple& pt, + size_t idx) { + IdxTuple pt2 = pt; + for (auto dname : gdims) + pt2[dname] += first[dname]; + Indices ipt(pt2); + auto val = gb3->read_elem(ipt, 0, __LINE__); + gb3f->write_elem(val, ipt, 0, __LINE__); + return true; + }); + break; + } + + case 1: { + os << " by slice on host...\n"; + auto n = gb3->get_elements_in_slice(buf, firsti, lasti, false); + assert(n); + gb3f->set_elements_in_slice(buf, firsti, lasti, false); + break; + } + + #ifdef USE_OFFLOAD + case 2: { + os << " by slice then copy to/from device...\n"; + + // Same as test 1. + gb3->get_elements_in_slice(buf, firsti, lasti, false); + gb3f->set_elements_in_slice(buf, firsti, lasti, false); + + // Copy data to device; invalidate host data; copy data back. + gb3f->copy_data_to_device(); + + #ifndef USE_OFFLOAD_USM + gb3f->set_all_elements_same(-2.0); + gb3f->get_coh()._force_state(Coherency::dev_mod); + #endif + + gb3f->copy_data_from_device(); + + break; + } + #if 1 + // OMP sections across 2 modules not currently supported. + case 3: { + os << " by slice on device...\n"; + assert(VLEN_X * VLEN_Y * VLEN_Z == 1); + + // Copy from var to buffer on host. 
+                gb3->get_elements_in_slice(buf, firsti, lasti, false);
+                gb3f->set_elements_in_slice(buf, firsti, lasti, false);
+                gb3f->get_vecs_in_slice(buf, firsti, lasti, false);
+
+                #ifndef USE_OFFLOAD_USM
+                gb3f->set_all_elements_same(-2.0);
+                gb3f->get_coh()._force_state(Coherency::dev_mod);
+                #endif
+
+                // Copy buffer to dev.
+                offload_copy_to_device(buf, sz);
+
+                // Copy from buffer to var on dev.
+                gb3f->set_vecs_in_slice(buf, firsti, lasti, true);
+
+                // Copy var back to host.
+                gb3f->copy_data_from_device();
+                break;
+            }
+            #endif
+
+            #endif
+
+            default:
+                done = true;
+            }
+
+            if (!done) {
+                os << "Checking vals...\n";
+                idx_t nbad = 0;
+                idx_t max_bad = 50;
+                sizes.visit_all_points
+                    ([&](const IdxTuple& pt,
+                         size_t idx) {
+                        IdxTuple pt2 = pt;
+                        for (auto dname : gdims)
+                            pt2[dname] += first[dname];
+                        Indices ipt(pt2);
+                        ipt.add_const(-min_pad);
+                        auto val = gb3->read_elem(ipt, 0, __LINE__);
+                        auto valf = gb3f->read_elem(ipt, 0, __LINE__);
+                        if (val != valf) {
+                            if (nbad < max_bad)
+                                os << "*** error: value at " << ipt.make_val_str() <<
+                                    " is " << valf << "; expected " << val << endl;
+                            else if (nbad == max_bad)
+                                os << "Additional errors not printed.\n";
+                            nbad++;
+                        }
+                        return true;
+                    });
+                os << " done checking\n";
+                if (nbad)
+                    exit(1);
+            }
+        }
+        offload_map_free(buf, sz); // Unmap while 'buf' is still a live allocation: reverse of the new[] + offload_map_alloc() setup.
+        delete[] buf;
+        os << "Exiting 3-D test\n";
     }
+}
 
-    os << "End of YASK var test.\n";
+int main(int argc, char* argv[]) {
+    run_tests(argc, argv);
+    cout << "End of YASK var test.\n";
     return 0;
 }
diff --git a/src/kernel/tests/yask_kernel_api_exception_test.cpp b/src/kernel/tests/yask_kernel_api_exception_test.cpp
index 40c37c7b..dbea63a3 100644
--- a/src/kernel/tests/yask_kernel_api_exception_test.cpp
+++ b/src/kernel/tests/yask_kernel_api_exception_test.cpp
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 YASK: Yet Another Stencil Kit
-Copyright (c) 2014-2021, Intel Corporation
+Copyright (c) 2014-2022, Intel Corporation
 
 Permission is hereby
granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -35,8 +35,8 @@ using namespace yask; int main() { - // Counter for exception test - int num_exception = 0; + // Counter for exception test + int num_exception = 0; // The factory from which all other kernel objects are made. yk_factory kfac; @@ -71,7 +71,7 @@ int main() { } catch (yask_exception& e) { cout << "YASK throws an exception.\n"; cout << e.get_message(); - cout << "Exception Test: Catch exception correctly.\n"; + cout << "Exception Test: Caught exception correctly.\n"; num_exception++; } @@ -82,7 +82,7 @@ int main() { } catch (yask_exception& e) { cout << "YASK throws an exception.\n"; cout << e.get_message(); - cout << "Exception Test: Catch exception correctly.\n"; + cout << "Exception Test: Caught exception correctly.\n"; num_exception++; } @@ -91,100 +91,16 @@ int main() { // Set other data structures needed for stencil application. soln->prepare_solution(); - // Print some info about the solution. - auto name = soln->get_name(); - cout << "Stencil-solution '" << name << "':\n"; - cout << " Step dimension: '" << soln->get_step_dim_name() << "'\n"; - cout << " Domain dimensions:"; - set domain_dim_set; - for (auto dname : soln->get_domain_dim_names()) { - cout << " '" << dname << "'"; - domain_dim_set.insert(dname); - } - cout << endl; - - // Print out some info about the vars and init their data. - for (auto var : soln->get_vars()) { - cout << " " << var->get_name() << "("; - for (auto dname : var->get_dim_names()) - cout << " '" << dname << "'"; - cout << " )\n"; - for (auto dname : var->get_dim_names()) { - if (domain_dim_set.count(dname)) { - cout << " '" << dname << "' domain index range on this rank: " << - var->get_first_rank_domain_index(dname) << " ... 
" << - var->get_last_rank_domain_index(dname) << endl; - cout << " '" << dname << "' allowed index range on this rank: " << - var->get_first_rank_alloc_index(dname) << " ... " << - var->get_last_rank_alloc_index(dname) << endl; - } - } - - // First, just init all the elements to the same value. - var->set_all_elements_same(0.1); - - // Done with fixed-size vars. - if (var->is_fixed_size()) - continue; - - // Create indices describing a subset of the overall domain. - vector first_indices, last_indices; - for (auto dname : var->get_dim_names()) { - - // Is this a domain dim? - if (domain_dim_set.count(dname)) { - - // Set indices to creaete a small cube (assuming 3D) - // in center of overall problem. - idx_t psize = soln->get_overall_domain_size(dname); - idx_t first_idx = psize/2 - 10; - idx_t last_idx = psize/2 + 10; - first_indices.push_back(first_idx); - last_indices.push_back(last_idx); - } - - // Step dim? - else if (dname == soln->get_step_dim_name()) { - - // Add indices for timestep zero (0) only. - first_indices.push_back(0); - last_indices.push_back(0); - - } - - // Misc dim? - else { - - // Add indices to set all allowed values. - // (This isn't really meaningful; it's just illustrative.) - first_indices.push_back(var->get_first_misc_index(dname)); - last_indices.push_back(var->get_last_misc_index(dname)); - } - } - - // Init the values using the indices created above. - idx_t nset = var->set_elements_in_slice_same(0.9, first_indices, last_indices); - cout << " " << nset << " element(s) set.\n"; - - // Raw access to this var. 
- auto raw_p = var->get_raw_storage_buffer(); - auto num_elems = var->get_num_storage_elements(); - cout << " " << var->get_num_storage_bytes() << - " bytes of raw data at " << raw_p << ": "; - if (soln->get_element_bytes() == 4) - cout << ((float*)raw_p)[0] << ", ..., " << ((float*)raw_p)[num_elems-1] << "\n"; - else - cout << ((double*)raw_p)[0] << ", ..., " << ((double*)raw_p)[num_elems-1] << "\n"; - } - // Apply the stencil solution to the data. env->global_barrier(); cout << "Running the solution for 1 step...\n"; soln->run_solution(0); cout << "Running the solution for 10 more steps...\n"; soln->run_solution(1, 10); + soln->end_solution(); + soln->get_stats(); - // TODO: better to have exception test for the methods below + // TODO: add exception tests for the methods below: // StencilContext::calc_region // StencilContext::add_var // StencilContext::setup_rank @@ -206,7 +122,6 @@ int main() { // assert_equality_over_ranks // CommandLineParser::OptionBase::_idx_val - // Check whether program handles exceptions or not. if (num_exception != 2) { cout << "There is a problem in exception test.\n"; diff --git a/src/kernel/tests/yask_kernel_api_exception_test.py b/src/kernel/tests/yask_kernel_api_exception_test.py index 3ba937d6..639c3fee 100755 --- a/src/kernel/tests/yask_kernel_api_exception_test.py +++ b/src/kernel/tests/yask_kernel_api_exception_test.py @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -46,7 +46,7 @@ # Create solution. 
soln = kfac.new_solution(env) debug_output = ofac.new_string_output() - soln.set_debug_output(debug_output) + env.set_debug_output(debug_output) name = soln.get_name() # NB: At this point, the vars' meta-data exists, but the vars have no @@ -74,7 +74,7 @@ except RuntimeError as e: print ("YASK throws an exception.") print (format(e)) - print ("Exception Test: Catch exception correctly.") + print ("Exception Test: Caught exception correctly.") num_exception = num_exception + 1 # Exception test @@ -84,30 +84,13 @@ except RuntimeError as e: print ("YASK throws an exception.") print (format(e)) - print ("Exception Test: Catch exception correctly.") + print ("Exception Test: Caught exception correctly.") num_exception = num_exception + 1 # Allocate memory for any vars that do not have storage set. # Set other data structures needed for stencil application. soln.prepare_solution() - # Print some info about the solution. - print("Stencil-solution '" + name + "':") - print(" Step dimension: " + repr(soln.get_step_dim_name())) - print(" Domain dimensions: " + repr(soln.get_domain_dim_names())) - print(" Vars:") - for var in soln.get_vars() : - print(" " + var.get_name() + repr(var.get_dim_names())) - for dname in var.get_dim_names() : - if dname in soln.get_domain_dim_names() : - print(" '" + dname + "' allowed index range in this rank: " + - repr(var.get_first_rank_alloc_index(dname)) + " ... " + - repr(var.get_last_rank_alloc_index(dname))) - elif dname in soln.get_misc_dim_names() : - print(" '" + dname + "' allowed index range: " + - repr(var.get_first_misc_index(dname)) + " ... " + - repr(var.get_last_misc_index(dname))) - # Init the vars. 
for var in soln.get_vars() : @@ -119,7 +102,9 @@ print("Running the solution for 1 step...") soln.run_solution(0) - print("Debug output captured:\n" + debug_output.get_string()) + soln.end_solution() + soln.get_stats() + #print("Debug output captured:\n", debug_output.get_string()) if num_exception != 2: print("There is a problem in exception test.") diff --git a/src/kernel/tests/yask_kernel_api_test.cpp b/src/kernel/tests/yask_kernel_api_test.cpp index 342e4a33..47d2b742 100644 --- a/src/kernel/tests/yask_kernel_api_test.cpp +++ b/src/kernel/tests/yask_kernel_api_test.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -33,7 +33,6 @@ IN THE SOFTWARE. #include #include - using namespace std; using namespace yask; @@ -53,35 +52,60 @@ int main() { // Show output only from last rank. // This is an example of using the rank APIs, // the yask_output_factory, and set_debug_output(). - ostream* osp = &cout; int rank_num = env->get_rank_index(); if (rank_num < env->get_num_ranks() - 1) { - yask_output_factory ofac; - auto null_out = ofac.new_null_output(); - soln->set_debug_output(null_out); - osp = &null_out->get_ostream(); + yk_env::disable_debug_output(); cout << "Suppressing output on rank " << rank_num << ".\n"; } else cout << "Following information from rank " << rank_num << ".\n"; - ostream& os = *osp; + ostream& os = yk_env::get_debug_output()->get_ostream(); // Init solution settings. auto soln_dims = soln->get_domain_dim_names(); + int i = 0; for (auto dim_name : soln_dims) { + os << "Setting solution dim '" << dim_name << "'...\n"; // Set domain size in each dim. 
- soln->set_overall_domain_size(dim_name, 128); + idx_t dsize = 128 + i * 32; + soln->set_overall_domain_size(dim_name, dsize); + + // Check that vec API returns same. + auto dsizes = soln->get_overall_domain_size_vec(); + os << "global domain sizes:"; + for (auto ds : dsizes) + os << " " << ds; + os << "\n"; + assert(dsizes[i] == dsize); + + // Set with vec and check again. + soln->set_overall_domain_size_vec(dsizes); + auto ds = soln->get_overall_domain_size(dim_name); + assert(ds == dsize); // Ensure some minimal padding on all vars. soln->set_min_pad_size(dim_name, 1); - // Set block size to 64 in z dim and 32 in other dims. + // Set block size to 64 in last dim and 32 in other dims. // NB: just illustrative. - if (dim_name == "z") - soln->set_block_size(dim_name, 64); - else - soln->set_block_size(dim_name, 32); + idx_t bsize = (i == soln_dims.size() - 1) ? 64 : 32; + soln->set_block_size(dim_name, bsize); + + // Check that vec API returns same. + auto bsizes = soln->get_block_size_vec(); + os << "block sizes:"; + for (auto bs : bsizes) + os << " " << bs; + os << "\n"; + assert(bsizes[i] == bsize); + + // Set with vec and check again. + soln->set_block_size_vec(bsizes); + auto bs = soln->get_block_size(dim_name); + assert(bs == bsize); + + i++; } // Make a test fixed-size var. @@ -111,10 +135,18 @@ int main() { // Print out some info about the vars and init their data. for (auto var : soln->get_vars()) { os << " var '" << var->get_name() << ":\n"; + int dimi = 0; for (auto dname : var->get_dim_names()) { os << " '" << dname << "' dim:\n"; os << " alloc-size on this rank: " << var->get_alloc_size(dname) << endl; + os << " allowed index range on this rank: " << + var->get_first_local_index(dname) << " ... 
" << + var->get_last_local_index(dname) << endl; + assert(var->get_alloc_size(dname) == + var->get_last_local_index(dname) - var->get_first_local_index(dname) + 1); + auto asizes = var->get_alloc_size_vec(); + assert(asizes.at(dimi) == var->get_alloc_size(dname)); // Is this a domain dim? if (domain_dim_set.count(dname)) { @@ -124,9 +156,6 @@ int main() { os << " domain+halo index range on this rank: " << var->get_first_rank_halo_index(dname) << " ... " << var->get_last_rank_halo_index(dname) << endl; - os << " allowed index range on this rank: " << - var->get_first_rank_alloc_index(dname) << " ... " << - var->get_last_rank_alloc_index(dname) << endl; } // Step dim? @@ -142,6 +171,8 @@ int main() { var->get_first_misc_index(dname) << " ... " << var->get_last_misc_index(dname) << endl; } + + dimi++; } // First, just init all the elements to the same value. @@ -246,7 +277,7 @@ int main() { soln->run_solution(1, 10); soln->end_solution(); - + soln->get_stats(); os << "End of YASK kernel API test.\n"; return 0; } diff --git a/src/kernel/tests/yask_kernel_api_test.py b/src/kernel/tests/yask_kernel_api_test.py index 3dba26e8..32f796ee 100755 --- a/src/kernel/tests/yask_kernel_api_test.py +++ b/src/kernel/tests/yask_kernel_api_test.py @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -72,8 +72,8 @@ def make_ndarray(var, timestep) : point += (0, ) # Create a NumPy ndarray to hold the extracted data. 
- print("Creating a NumPy ndarray with shape " + repr(shape) + " and " + - repr(nelems) + " element(s)...") + print("Creating a NumPy ndarray with shape ", shape, " and ", + nelems, " element(s)...") ndarray = np.zeros(shape, dtype, 'C'); return ndarray, first_indices, last_indices, point @@ -83,12 +83,12 @@ def read_var(var, timestep) : # Ignore with fixed-sized vars. if var.is_fixed_size(): return - print("Testing reading var '" + var.get_name() + "' at time " + repr(timestep) + "...") + print("Testing reading var '", var.get_name(), "' at time ", timestep, "...") ndarray, first_indices, last_indices, point = make_ndarray(var, timestep) print("Reading 1 element...") val1 = var.get_element(first_indices) - print("Read value " + repr(val1)) + print("Read value ", val1) print("Reading all element(s) in ndarray...") nread = var.get_elements_in_slice(ndarray.data, first_indices, last_indices) @@ -102,11 +102,11 @@ def read_var(var, timestep) : raw_ptr = var.get_raw_storage_buffer() fp_ptr = ct.cast(int(raw_ptr), ptype) num_elems = var.get_num_storage_elements() - print("Raw data: " + repr(fp_ptr[0]) + ", ..., " + repr(fp_ptr[num_elems-1])) + print("Raw data: ", fp_ptr[0], ", ..., ", fp_ptr[num_elems-1]) # Init var using NumPy ndarray. def init_var(var, timestep) : - print("Initializing var '" + var.get_name() + "' at time " + repr(timestep) + "...") + print("Initializing var '", var.get_name(), "' at time ", timestep, "...") ndarray, first_indices, last_indices, point = make_ndarray(var, timestep) # Set one point to a non-zero value. @@ -116,7 +116,7 @@ def init_var(var, timestep) : print("Setting var from all element(s) in ndarray...") nset = var.set_elements_in_slice(ndarray.data, first_indices, last_indices) - print("Set " + repr(nset) + " element(s) in rank " + repr(env.get_rank_index())) + print("Set ", nset, " element(s) in rank ", env.get_rank_index()) # Check that set worked. 
print("Reading those element(s)...") @@ -135,7 +135,7 @@ def init_var(var, timestep) : assert ndarray2.sum() == val1 # One point is val1; others are zero. # Test element set. - print("Testing setting 1 point at " + repr(last_indices) + "...") + print("Testing setting 1 point at ", last_indices, "...") val1 += 1.0 nset = var.set_element(val1, last_indices); assert nset == 1 @@ -144,7 +144,7 @@ def init_var(var, timestep) : # Test add. val3 = 2.0 - print("Testing adding to 1 point at " + repr(last_indices) + "...") + print("Testing adding to 1 point at ", last_indices, "...") nset = var.add_to_element(val3, last_indices); assert nset == 1 val2 = var.get_element(last_indices) @@ -162,8 +162,8 @@ def init_var(var, timestep) : # Create solution. soln = kfac.new_solution(env) - debug_output = ofac.new_string_output() - soln.set_debug_output(debug_output) + #debug_output = ofac.new_string_output() + #env.set_debug_output(debug_output) name = soln.get_name() # NB: At this point, the vars' meta-data exists, but the vars have no @@ -207,25 +207,25 @@ def init_var(var, timestep) : soln.prepare_solution() # Print some info about the solution. - print("Stencil-solution '" + name + "':") - print(" Step dimension: " + repr(soln.get_step_dim_name())) - print(" Domain dimensions: " + repr(soln.get_domain_dim_names())) + print("Stencil-solution '", name, "':") + print(" Step dimension: ", soln.get_step_dim_name()) + print(" Domain dimensions: ", soln.get_domain_dim_names()) print(" Vars:") for var in soln.get_vars() : - print(" " + var.get_name() + repr(var.get_dim_names())) + print(" ", var.get_name(), var.get_dim_names()) for dname in var.get_dim_names() : if dname in soln.get_domain_dim_names() : - print(" '" + dname + "' allowed domain index range in this rank: " + - repr(var.get_first_rank_alloc_index(dname)) + " ... 
" + - repr(var.get_last_rank_alloc_index(dname))) + print(" '", dname, "' allowed domain index range in this rank: ", + var.get_first_rank_alloc_index(dname), " ... ", + var.get_last_rank_alloc_index(dname)) elif dname == soln.get_step_dim_name() : - print(" '" + dname + "' allowed step index range: " + - repr(var.get_first_valid_step_index()) + " ... " + - repr(var.get_last_valid_step_index())) + print(" '", dname, "' allowed step index range: ", + var.get_first_valid_step_index(), " ... ", + var.get_last_valid_step_index()) else : - print(" '" + dname + "' allowed misc index range: " + - repr(var.get_first_misc_index(dname)) + " ... " + - repr(var.get_last_misc_index(dname))) + print(" '", dname, "' allowed misc index range: ", + var.get_first_misc_index(dname), " ... ", + var.get_last_misc_index(dname)) # Init the vars. for var in soln.get_vars() : @@ -293,11 +293,11 @@ def init_var(var, timestep) : # Init value at one point. nset = var.set_element(15.0, one_indices) - print("Set " + repr(nset) + " element(s) in rank " + repr(env.get_rank_index())) + print("Set ", nset, " element(s) in rank ", env.get_rank_index()) # Init the values within the small cube. nset = var.set_elements_in_slice_same(0.5, first_indices, last_indices, False) - print("Set " + repr(nset) + " element(s) in rank " + repr(env.get_rank_index())) + print("Set ", nset, " element(s) in rank ", env.get_rank_index()) # Print the initial contents of the var. 
read_var(var, 0) @@ -318,5 +318,8 @@ def init_var(var, timestep) : for var in soln.get_vars() : read_var(var, 11) - print("Debug output captured:\n" + debug_output.get_string()) - print("End of YASK kernel API test.") + soln.end_solution() + soln.get_stats() + + #print("Debug output captured:\n", debug_output.get_string()) + print("End of YASK Python kernel API test.") diff --git a/src/kernel/yask.sh b/src/kernel/yask.sh index a7d50891..554b4967 100755 --- a/src/kernel/yask.sh +++ b/src/kernel/yask.sh @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -26,66 +26,76 @@ # Purpose: run stencil kernel in specified environment. # Create invocation string w/proper quoting. -invo="Invocation: $0" -whitespace="[[:space:]]" -for i in "$@" -do - if [[ $i =~ $whitespace ]]; then +invo="Script invocation: $0" +for i in "$@"; do + if [[ $i =~ [[:space:]] ]]; then i=\'$i\' fi invo="$invo $i" done -# Default env vars to print debug info. -envs="OMP_DISPLAY_ENV=VERBOSE" -envs="$envs KMP_VERSION=1" -envs="$envs I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5" - -# On Cygwin, need to put lib dir in path to load .dll's. -if [[ `uname -o` == "Cygwin" ]]; then - envs="$envs PATH='$PATH':"`dirname $0`/../lib -fi +# Default env vars to print debug info and set CPU and mem-binding. 
+# https://software.intel.com/content/www/us/en/develop/documentation/mpi-developer-reference-linux/top/environment-variable-reference/environment-variables-for-memory-policy-control.html +envs="OMP_DISPLAY_ENV=VERBOSE KMP_VERSION=1" +envs+=" OMP_PLACES=cores KMP_HOT_TEAMS_MODE=1 KMP_HOT_TEAMS_MAX_LEVEL=3" +envs+=" I_MPI_PRINT_VERSION=1 I_MPI_DEBUG=5" +#envs+=" I_MPI_HBW_POLICY=hbw_preferred,hbw_preferred" # Default arch. cpu_flags=`grep -m1 '^flags' /proc/cpuinfo` if [[ $cpu_flags =~ avx512dq ]]; then - arch=avx512 + def_arch=avx512 elif [[ $cpu_flags =~ avx512pf ]]; then - arch=knl + def_arch=knl elif [[ $cpu_flags =~ avx2 ]]; then - arch=avx2 + def_arch=avx2 elif [[ $cpu_flags =~ avx ]]; then - arch=avx + def_arch=avx else - arch=intel64 + def_arch=intel64 fi +arch=$def_arch # Default ranks. +# Try numactl then lscpu. +# For either, the goal is to count only NUMA nodes with CPUs. +# (Systems with HBM may have NUMA nodes without CPUs.) nranks=1 if command -v numactl >/dev/null; then - cpubind=`numactl -s | grep -m1 '^cpubind:'` - if [[ -n "$cpubind" ]]; then - cbwords=( $cpubind ) - nranks=$(( ${#cbwords[@]} - 1 )) + ncpubinds=`numactl -s | awk '/^cpubind:/ { print NF-1 }'` + if [[ -n "$ncpubinds" ]]; then + nranks=$ncpubinds + fi +elif command -v lscpu >/dev/null; then + nnumas=`lscpu | grep -c '^NUMA node.*CPU'` + if [[ -n "$nnumas" ]]; then + nranks=$nnumas fi fi # Other defaults. -pre_cmd=true +pre_cmd=":" post_cmd="" helping=0 opts="" +bindir=`dirname $0` +logdir="./logs" +tmplog="/tmp/yask-p$$" + +# Validation shortcut (-v) vars. +doval=0 +val="-validate -no-pre_auto_tune -no-auto_tune -no-warmup -num_trials 1 -trial_steps 2 -l 80 -Mb 72 -b 64 -mb 56 -nb 48 -pb 20" # Display stencils in this dir and exit. 
-bindir=`dirname $0` function show_stencils { echo "Available stencil.arch combos in '$bindir' directory:" find $bindir -name 'yask_kernel.*.*.exe' | sed -e 's/.*yask_kernel\./ -stencil /' -e 's/\./ -arch /' -e 's/.exe//' - echo "The default -arch argument for this host is '$arch'." + echo "The default -arch argument for this host is '$def_arch'." exit 1 } # Loop thru cmd-line args. +using_opt_outer_threads=0 while true; do if [[ ! -n ${1+set} ]]; then @@ -94,35 +104,41 @@ while true; do elif [[ "$1" == "-h" ]]; then shift echo "$0 is a wrapper around the YASK executable to set up the proper environment." - echo "Usage: $0 -stencil [options]" - echo " -stencil " - echo " Specify the solution-name part of the kernel executable." - echo " Should correspond to stencil= used during compilation." + echo "Usage: $0 -stencil [options]" + echo " -stencil " + echo " Specify the solution-name part of the YASK executable." + echo " Should correspond to stencil= used during compilation," + echo " or YK_STENCIL= if that was used to override the default." echo " Run this script without any options to see the available stencils." echo " " - echo "Some options are generic (parsed by the driver script and applied to any stencil)," - echo " and some options are parsed by the stencil executable determined by the -stencil." - echo " and -arch parameters." - echo " " - echo "Generic (script) options:" + echo "Script options:" echo " -h" echo " Print this help." - echo " To see YASK stencil-specific options, run '$0 -stencil [-arch ] -help'." - echo " -arch " + echo " To see more options from the YASK executable, run the following command:" + echo " $0 -stencil [-arch ] -help" + echo " This will run the executable with the '-help' option." + echo " -arch " echo " Specify the architecture-name part of the kernel executable." echo " Overrides the default architecture determined from /proc/cpuinfo flags." - echo " The default for this host is '$arch'." 
- echo " Should correspond to arch= used during compilation." - echo " -host |-mic " + echo " The default arch for this host is '$def_arch'." + echo " Should correspond to arch= used during compilation" + echo " with '.offload-' appended when built with 'offload=1'," + echo " or YK_ARCH= if that was used to override the default." + echo " In any case, the '-stencil' and '-arch' args required to launch" + echo " any executable are printed at the end of a successful compilation." + echo " -host " echo " Specify host to run executable on." - echo " 'ssh ' will be pre-pended to the sh_prefix command." - echo " If -mic is given, it implies the following (which can be overridden):" - echo " -arch 'knc'" - echo " -host "`hostname`"-mic" + echo " Run sub-shell under 'ssh '." echo " -sh_prefix " echo " Run sub-shell under , e.g., a custom ssh command." + echo " If -host and -sh_prefix are both specified, run sub-shell under" + echo " 'ssh '." + echo " -exe " + echo " Specify as YASK executable instead of one in the same directory as" + echo " this script with a name based on '-stencil' and '-arch'." + echo " /../lib will also be prepended to the LD_LIBRARY_PATH env var." echo " -exe_prefix " - echo " Run YASK executable under , e.g., 'numactl -N 0'." + echo " Run YASK executable as an argument to , e.g., 'numactl -N 0'." echo " -pre_cmd " echo " One or more commands to run before YASK executable." echo " -post_cmd " @@ -134,18 +150,28 @@ while true; do echo " Shortcut for the following options if > 1:" echo " -mpi_cmd 'mpirun -np '" echo " If a different MPI command is needed, use -mpi_cmd explicitly." - if [[ -n "$nranks" ]]; then - echo " The default for this host is '$nranks'." - fi - echo " -log " - echo " Write copy of output to ." - echo " Default is based on stencil, arch, host-name, and time-stamp." - echo " Use '/dev/null' to avoid making a log." + echo " The default for this host is '$nranks'." + echo " -log " + echo " Write copy of output to ." 
+ echo " Default is based on stencil, arch, hostname, time-stamp, and process ID." + echo " Set to empty string ('') to avoid making a log." + echo " -log_dir " + echo " Directory name to prepend to log ." + echo " Default is '$logdir'." + echo " -v" + echo " Shortcut for the following options:" + echo " $val" + echo " If you want to override any of these values, place them after '-v'." echo " -show_arch" - echo " Print the architecture string and exit." - echo " " + echo " Print the default architecture string and exit." + echo " =" echo " Set environment variable to ." echo " Repeat as necessary to set multiple vars." + echo "" + echo " All script args not listed above will be passed to the executable." + echo "" + echo " Canonical command issued based on above options:" + echo " ssh <-host option> <-sh_prefix option> sh -c -x '; <-pre_cmd option>; env <-mpi_cmd option> <-exe_prefix option> <-exe option> ; <-post_cmd option>'" exit 0 elif [[ "$1" == "-help" ]]; then @@ -158,7 +184,7 @@ while true; do shift elif [[ "$1" == "-show_arch" ]]; then - echo $arch + echo $def_arch exit 0 elif [[ "$1" == "-stencil" && -n ${2+set} ]]; then @@ -171,6 +197,11 @@ while true; do shift shift + elif [[ "$1" == "-host" && -n ${2+set} ]]; then + host=$2 + shift + shift + elif [[ "$1" == "-sh_prefix" && -n ${2+set} ]]; then sh_prefix=$2 shift @@ -191,6 +222,12 @@ while true; do shift shift + elif [[ "$1" == "-exe" && -n ${2+set} ]]; then + exe=$2 + bindir=`dirname $exe` + shift + shift + elif [[ "$1" == "-exe_prefix" && -n ${2+set} ]]; then exe_prefix=$2 shift @@ -198,17 +235,14 @@ while true; do elif [[ "$1" == "-log" && -n ${2+set} ]]; then logfile=$2 + if [[ -z "$logfile" ]]; then + logfile=$tmplog + fi shift shift - elif [[ "$1" == "-host" && -n ${2+set} ]]; then - host=$2 - shift - shift - - elif [[ "$1" == "-mic" && -n ${2+set} ]]; then - arch="knc" - host=`hostname`-mic$2 + elif [[ "$1" == "-log_dir" && -n ${2+set} ]]; then + logdir=$2 shift shift @@ -217,8 +251,14 @@ while 
true; do shift shift + elif [[ "$1" == "-v" ]]; then + doval=1 + shift + elif [[ "$1" =~ ^[A-Za-z0-9_]+= ]]; then - envs="$envs $1" + + # Something like FOO=bar sets an env var. + envs+=" $1" shift elif [[ "$1" == "--" ]]; then @@ -228,8 +268,20 @@ while true; do opts+=" $@" break + elif [[ "$1" == "-thread_divisor" ]]; then + echo "Option '$1' is no longer supported." + echo "Use '-max_threads', '-outer_threads', and/or '-inner_threads'." + exit 1 + else - # Pass this unknown option to executable. + # Check for existence of some binary options, but don't consume them. + if [[ "$1" == "-outer_threads" ]]; then + using_opt_outer_threads=1 + elif [[ "$1" == "-trace" ]]; then + envs+=" KMP_AFFINITY=verbose" + fi + + # Pass this option to executable. opts+=" $1" shift @@ -240,13 +292,13 @@ echo $invo # Check required opt (yes, it's an oxymoron). if [[ -z ${stencil:+ok} ]]; then - echo "error: missing required option: -stencil " + echo "error: missing required option: -stencil " show_stencils fi -# Simplified MPI in x-dim only. -if [[ -n "$nranks" && $nranks > 1 ]]; then - true ${mpi_cmd="mpirun -np $nranks"} +# Simple MPI for one node. +if [[ $nranks > 1 ]]; then + : ${mpi_cmd="mpirun -np $nranks"} fi # Bail on errors past this point, but only errors @@ -260,7 +312,10 @@ exe_host=${host:-`hostname`} dump="head -v -n -0" # Init log file. -true ${logfile=logs/yask.$stencil.$arch.$exe_host.`date +%Y-%m-%d_%H-%M`_p$$.log} +: ${logfile:=yask.$stencil.$arch.$exe_host.`date +%Y-%m-%d_%H-%M-%S`_p$$.log} +if [[ -n "$logdir" ]]; then + logfile="$logdir/$logfile" +fi echo "Writing log to '$logfile'." mkdir -p `dirname $logfile` echo $invo > $logfile @@ -269,10 +324,16 @@ echo $invo > $logfile # If the executable is built by overriding YK_TAG, YK_EXT_BASE, and/or # YK_EXEC, this will fail.
tag=$stencil.$arch -exe="$bindir/yask_kernel.$tag.exe" +: ${exe:="$bindir/yask_kernel.$tag.exe"} make_report="$bindir/../build/yask_kernel.$tag.make-report.txt" yc_report="$bindir/../build/yask_kernel.$tag.yask_compiler-report.txt" +# Heuristic to determine if this is an offload kernel. +is_offload=0 +if [[ $arch =~ "offload" ]]; then + is_offload=1 +fi + # Double-check that exe exists. if [[ ! -x $exe ]]; then echo "error: '$exe' not found or not executable." | tee -a $logfile @@ -287,23 +348,14 @@ if [[ -e $make_report ]]; then fi fi -# Additional setup for KNC. -if [[ $arch == "knc" && -n "$host" ]]; then - dir=/tmp/$USER - icc=`which icc` - iccdir=`dirname $icc`/../.. - libpath=":$iccdir/compiler/lib/mic" - ssh $host "rm -rf $dir; mkdir -p $dir/bin" - scp $exe $host:$dir/bin -else - dir=`pwd` - libpath=":$HOME/lib" -fi +dir=`pwd` +libpath=":$HOME/lib" -# Setup to run on specified host. +# Setup paths to run on specified host. +envs2="LD_LIBRARY_PATH=$bindir/../lib:$LD_LIBRARY_PATH$libpath" if [[ -n "$host" ]]; then sh_prefix="ssh $host $sh_prefix" - envs="$envs PATH=$PATH LD_LIBRARY_PATH=./lib:$LD_LIBRARY_PATH$libpath" + envs2+=" PATH=$PATH" nm=1 while true; do @@ -312,30 +364,56 @@ if [[ -n "$host" ]]; then echo "Waiting $nm min before trying again..." sleep $(( nm++ * 60 )) done -else - envs="$envs LD_LIBRARY_PATH=./lib:$LD_LIBRARY_PATH$libpath" +fi + +# Set OMP threads to number of cores per rank if not already specified and not special. +# TODO: extend to work on multiple nodes. +if [[ ( $using_opt_outer_threads == 0 ) && ( $arch != "knl" ) && ( $is_offload == 0) ]]; then + if command -v lscpu >/dev/null; then + nsocks=`lscpu | awk -F: '/Socket.s.:/ { print $2 }'` + ncores=`lscpu | awk -F: '/Core.s. per socket:/ { print $2 }'` + if [[ -n "$nsocks" && -n "$ncores" ]]; then + mthrs=$(( $nsocks * $ncores / $nranks )) + opts="-outer_threads $mthrs $opts" + fi + fi +fi + +# Add validation opts to beginning. 
+if [[ $doval == 1 ]]; then + opts="$val $opts" fi # Commands to capture some important system status and config info for benchmark documentation. -config_cmds="sleep 1; uptime; lscpu; cpuinfo -A; sed '/^$/q' /proc/cpuinfo; cpupower frequency-info; uname -a; $dump /etc/system-release; $dump /proc/cmdline; $dump /proc/meminfo; free -gt; numactl -H; ulimit -a" +config_cmds="sleep 1; uptime; lscpu; cpuinfo -A; sed '/^$/q' /proc/cpuinfo; cpupower frequency-info; uname -a; $dump /etc/system-release; $dump /proc/cmdline; $dump /proc/meminfo; free -gt; numactl -H; ulimit -a; ipcs -l; env | awk '/YASK/ { print \"env:\", \$1 }'" + +# Add settings for offload kernel. +if [[ $is_offload == 1 ]]; then + config_cmds+="; clinfo -l"; + if [[ $nranks > 1 ]]; then + envs+=" I_MPI_OFFLOAD_TOPOLIB=level_zero I_MPI_OFFLOAD=2" + else + envs+=" EnableImplicitScaling=1" + fi +fi # Command sequence to be run in a shell. -# Captures -cmds="cd $dir; ulimit -s unlimited; $config_cmds; ldd $exe; date; $pre_cmd; env $envs $mpi_cmd $exe_prefix $exe $opts" +exe_str="$mpi_cmd $exe_prefix $exe $opts" +cmds="cd $dir; ulimit -s unlimited; $config_cmds; ldd $exe; date; $pre_cmd; env $envs $envs2 $exe_str" if [[ -n "$post_cmd" ]]; then cmds+="; $post_cmd" fi +cmds+="; date" echo "===================" | tee -a $logfile -# Finally, invoke the binary. +# Finally, invoke the binary in a shell. if [[ -z "$sh_prefix" ]]; then sh -c -x "$cmds" 2>&1 | tee -a $logfile else echo "Running shell under '$sh_prefix'..." $sh_prefix "sh -c -x '$cmds'" 2>&1 | tee -a $logfile fi -date echo "===================" | tee -a $logfile # Exit if just getting help. @@ -343,24 +421,40 @@ if [[ $helping == 1 ]]; then exit 0 fi -echo $invo -echo "Log saved in '$logfile'." -echo "Run 'utils/bin/yask_log_to_csv.pl $logfile' to output in CSV format." +function finish { + if [[ "$logfile" == $tmplog ]]; then + rm $tmplog + else + echo "Log saved in '$logfile'." 
+ echo "Run './utils/bin/yask_log_to_csv.pl $logfile' to output in CSV format." + fi + exit $1 +} -# A summary of the command to print. -exe_str="'$mpi_cmd $exe_prefix $exe $opts'" +# Print invocation again. +echo $invo +binvo="Binary invocation: $envs $exe_str" +echo $binvo | tee -a $logfile # Return a non-zero exit condition if test failed. if [[ `grep -c 'TEST FAILED' $logfile` > 0 ]]; then - echo $exe_str did not pass internal validation test. | tee -a $logfile - exit 1 + echo YASK did not pass internal validation test. | tee -a $logfile + finish 1 fi # Return a non-zero exit condition if executable didn't exit cleanly. if [[ `grep -c 'YASK DONE' $logfile` == 0 ]]; then - echo $exe_str did not exit cleanly. | tee -a $logfile - exit 1 + echo YASK did not exit cleanly. | tee -a $logfile + finish 1 fi -echo $exe_str ran successfully. | tee -a $logfile -exit 0 +# Print a message if test passed on at least one rank. +# (Script would have exited above if any rank failed.) +if [[ `grep -c 'TEST PASSED' $logfile` == $nranks ]]; then + echo YASK passed internal validation test. | tee -a $logfile +fi + +# Print a final message, which will print if not validated or passed validation. +echo YASK ran successfully. | tee -a $logfile +finish 0 + diff --git a/src/kernel/yask_main.cpp b/src/kernel/yask_main.cpp index 2c8288ea..b48b0585 100644 --- a/src/kernel/yask_main.cpp +++ b/src/kernel/yask_main.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -29,13 +29,11 @@ IN THE SOFTWARE. using namespace std; using namespace yask; -// Auto-generated stencil code that extends base types. 
-#define DEFINE_CONTEXT -#include YSTR2(YK_CODE_FILE) - // Add some command-line options for this application in addition to the // default ones provided by YASK. struct MySettings { + static constexpr double def_init_val = -99.; + bool help = false; // help requested. bool do_warmup = true; // whether to do warmup run. bool do_pre_auto_tune = true; // whether to do pre-auto-tuning. @@ -48,6 +46,7 @@ struct MySettings { int debug_sleep = 0; // sec to sleep for debug attach. bool do_trace = false; // tracing. int msg_rank = 0; // rank to print debug msgs. + double init_val = def_init_val; // value to init all points. // Ptr to the soln. yk_solution_ptr _ksoln; @@ -55,123 +54,85 @@ struct MySettings { MySettings(yk_solution_ptr ksoln) : _ksoln(ksoln) { } - // A custom option-handler for '-v'. - class ValOption : public CommandLineParser::OptionBase { - MySettings& _as; - static constexpr idx_t _lsz=63, _bsz=24; - - public: - - ValOption(MySettings& as) : - OptionBase("v", - "Minimal validation: shortcut for '-validate -no-pre_auto_tune -no-auto_tune" - " -no-warmup -t 1 -trial_steps 1 -l " + to_string(_lsz) + - " -b " + to_string(_bsz) + "'."), - _as(as) { } - - // Set multiple vars. - virtual bool check_arg(const std::vector& args, - int& argi) { - if (_check_arg(args, argi, _name)) { - _as.validate = true; - _as.do_pre_auto_tune = false; - _as.do_warmup = false; - _as.num_trials = 1; - _as.trial_steps = 1; - - // Create soln options and parse them if there is a soln. - if (_as._ksoln) { - for (auto& dname : _as._ksoln->get_domain_dim_names()) { - - // Local domain size, e.g., "-lx 63". - string arg = "-l" + dname + " " + to_string(_lsz); - - // Block size, e.g., "-bx 24". - arg += " -b" + dname + " " + to_string(_bsz); - - // Parse 'arg'. - auto rem = _as._ksoln->apply_command_line_options(arg); - assert(rem.length() == 0); - } - } - return true; - } - return false; - } - }; - // Parse options from the command-line and set corresponding vars. 
// Exit with message on error or request for help. - void parse(int argc, char** argv) { + // Return settings. + string parse(int argc, char** argv) { + string values; // Create a parser and add options to it. CommandLineParser parser; - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("help", "Print help message.", help)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("msg_rank", "Index of MPI rank that will print informational messages.", msg_rank)); - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("trace", "Print internal debug messages if compiled with" " general and/or memory-access tracing enabled.", do_trace)); - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("pre_auto_tune", "Run iteration(s) *before* performance trial(s) to find good-performing " "values for block sizes. " "Uses default values or command-line-provided values as a starting point.", do_pre_auto_tune)); - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("warmup", "Run warmup iteration(s) before performance " "trial(s) and after auto-tuning iterations, if enabled.", do_warmup)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("step_alloc", "Number of steps to allocate in relevant vars, " "overriding default value from YASK compiler.", step_alloc)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("num_trials", "Number of performance trials.", num_trials)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("t", - "Alias for '-num_trials'; for backward-compatibility.", + "[Deprecated] Use '-num_trials'.", num_trials)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("trial_steps", "Number of steps to run each performance trial. 
" - "If zero, the 'trial_time' value is used.", + "If zero, the 'trial_time' value is used to determine the number of steps to run.", trial_steps)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("dt", - "Alias for '-trial_steps'; for backward-compatibility.", + "[Deprecated] Use '-trial_steps'.", trial_steps)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared + ("init_val", + string("Initialize all points in all stencil vars to given value. ") + + "If value is " + to_string(MySettings::def_init_val) + + ", points are set to varying values.", + init_val)); + parser.add_option(make_shared ("trial_time", "Approximate number of seconds to run each performance trial. " "When the 'trial_steps' value is zero, the number of steps is " "based on the rate measured in the warmup phase. " "(Thus, warmup cannot be disabled when the number of steps is zero.)", trial_time)); - parser.add_option(new CommandLineParser::IntOption + parser.add_option(make_shared ("sleep", "Number of seconds to sleep before each performance trial.", pre_trial_sleep_time)); - parser.add_option(new CommandLineParser::IntOption - ("debug_sleep", - "Number of seconds to sleep for debug attach.", + parser.add_option(make_shared + ("debug_delay", + "[Debug] Number of seconds to sleep for debug attach.", debug_sleep)); - parser.add_option(new CommandLineParser::BoolOption + parser.add_option(make_shared ("validate", "Run validation iteration(s) after performance trial(s).", validate)); - parser.add_option(new ValOption(*this)); // Parse 'args' and 'argv' cmd-line options, which sets values. // Any remaining strings will be returned. @@ -179,33 +140,63 @@ struct MySettings { // Handle additional knobs and help if there is a soln. if (_ksoln) { + + // TODO: make an API for this. 
+ auto context = dynamic_pointer_cast(_ksoln); + assert(context.get()); + auto& req_opts = context->get_req_opts(); // Parse standard args not handled by this parser. rem_args = _ksoln->apply_command_line_options(rem_args); if (help) { - string app_notes = + string pgm_name(argv[0]); + cout << "Usage: " << pgm_name << " [options]\n" + "Options from the binary:\n"; + parser.print_help(cout); + + cout << "Options from the YASK library:\n"; + req_opts->print_usage(cout); + cout << "\nValidation is very slow and uses 2x memory,\n" " so run with very small sizes and number of time-steps.\n" " If validation fails, it may be due to rounding error;\n" " try building with 8-byte reals.\n"; - vector app_examples; - app_examples.push_back("-g 768 -num_trials 2"); - app_examples.push_back("-v"); - - // TODO: make an API for this. - auto context = dynamic_pointer_cast(_ksoln); - assert(context.get()); - auto& opts = context->get_settings(); - opts->print_usage(cout, parser, argv[0], app_notes, app_examples); + + // Make example knobs across dims. + string exg, exnr, exb; + int i = 1; + for (auto& dname : _ksoln->get_domain_dim_names()) { + exg += " -g" + dname + " " + to_string(i * 128); + exb += " -b" + dname + " " + to_string(i * 16); + exnr += " -nr" + dname + " " + to_string(i + 1); + i++; + } + cout << + "\nExamples:\n" + " " << pgm_name << " -g 768 # global-domain size in all dims same.\n" + " " << pgm_name << exg << " # global-domain size in each dim separately.\n" + " " << pgm_name << " -l 128 # local-domain (per-rank) size.\n" + " " << pgm_name << " -g 512" << exnr << " # number of ranks in each dim.\n" << + " " << pgm_name << " -g 512" << exb << " -no-pre_auto_tune # manual block size.\n" << + flush; exit_yask(1); } + // Add settings. 
+ ostringstream oss; + oss << "Options from the binary:\n"; + parser.print_values(oss); + oss << "Options from the YASK library:\n"; + req_opts->print_values(oss); + values = oss.str(); + if (rem_args.length()) THROW_YASK_EXCEPTION("Error: extraneous parameter(s): '" + rem_args + "'; run with '-help' option for usage"); } + return values; } // Print splash banner and invocation string. @@ -217,28 +208,20 @@ struct MySettings { " \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n" " \u2502 Y.A.S.K. \u2500\u2500 Yet Another Stencil Kit \u2502\n" " \u2502 https://github.com/intel/yask \u2502\n" - " \u2502 Copyright (c) 2014-2021, Intel Corporation \u2502\n" + " \u2502 Copyright (c) 2014-2022, Intel Corporation \u2502\n" " \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n" "\n" "Version: " << yask_get_version_string() << endl << "Stencil name: " YASK_STENCIL_NAME << endl; // Echo invocation parameters for record-keeping. -#ifdef DEF_ARGS + #ifdef DEF_ARGS os << "Default arguments: " DEF_ARGS << endl; -#endif + #endif os << "Binary invocation:"; for (int argi = 0; argi < argc; argi++) os << " " << argv[argi]; os << endl; - - os << "PID: " << getpid() << endl; - if (debug_sleep) { - os << "Sleeping " << debug_sleep << - " second(s) to allow debug attach...\n"; - sleep(debug_sleep); - os << "Resuming...\n"; - } } }; // AppSettings. @@ -258,9 +241,23 @@ void alloc_steps(yk_solution_ptr soln, const MySettings& opts) { } } +// Init values in vars. 
+void init_vars(MySettings& opts, std::shared_ptr context) { + if (opts.init_val != MySettings::def_init_val) + context->init_same(opts.init_val); + else { + double seed = opts.validate ? 1.0 : 0.1; + context->init_diff(seed); + } +} + // Parse command-line args, run kernel, run validation if requested. int main(int argc, char** argv) { + // Stop collecting VTune data. + // Even better to use -start-paused option. + VTUNE_PAUSE; + // just a line. string div_line; for (int i = 0; i < 70; i++) @@ -268,49 +265,54 @@ int main(int argc, char** argv) div_line += "\n"; try { - // Bootstrap factories from kernel API. - yk_factory kfac; - yask_output_factory yof; - - // Parse custom options once just to get vars needed for env. + // Parse only custom options just to get vars needed to set up env. MySettings opts1(nullptr); opts1.parse(argc, argv); - - // Set up the environment (mostly MPI). + yk_env::set_trace_enabled(opts1.do_trace); + + // Bootstrap factory from kernel API. + yk_factory kfac; + + // Set up the environment. auto kenv = kfac.new_env(); - kenv->set_trace_enabled(opts1.do_trace); - if (opts1.msg_rank == kenv->get_rank_index()) - kenv->set_debug_output(yof.new_stdout_output()); - else - kenv->set_debug_output(yof.new_null_output()); auto ep = dynamic_pointer_cast(kenv); auto num_ranks = kenv->get_num_ranks(); - auto& os = kenv->get_debug_output()->get_ostream(); + // Enable debug only on requested rank. + if (opts1.msg_rank != kenv->get_rank_index()) + yk_env::disable_debug_output(); + auto& os = kenv->get_debug_output()->get_ostream(); + // Make solution object containing data and parameters for stencil eval. // TODO: do everything through API without cast to StencilContext. auto ksoln = kfac.new_solution(kenv); auto context = dynamic_pointer_cast(ksoln); assert(context.get()); - auto& copts = context->get_settings(); + auto& copts = context->get_actl_opts(); assert(copts); // Parse custom and library-provided cmd-line options and // exit on -help or error. 
// TODO: do this through APIs. MySettings opts(ksoln); - opts.parse(argc, argv); - - // Make sure warmup is on if needed. - if (opts.trial_steps <= 0 && opts.trial_time > 0) - opts.do_warmup = true; + auto opts_str = opts.parse(argc, argv); // Make sure any MPI/OMP debug data is dumped from all ranks before continuing. kenv->global_barrier(); // Print splash banner and related info. opts.splash(os, argc, argv); + os << "\n" << opts_str; + // Print PID and sleep for debug if needed. + os << "\nPID: " << getpid() << endl; + if (opts.debug_sleep) { + os << "Sleeping " << opts.debug_sleep << + " second(s) to allow debug attach...\n"; + sleep(opts.debug_sleep); + os << "Resuming...\n"; + } + // Override alloc if requested. alloc_steps(ksoln, opts); @@ -321,9 +323,12 @@ int main(int argc, char** argv) if (context->rank_bb.bb_num_points < 1) THROW_YASK_EXCEPTION("Exiting because there are no points in the domain"); - // init data in vars and params. - if (opts.do_warmup || !opts.validate) - context->init_data(); + // Init data in vars and params. + init_vars(opts, context); + + // Copy vars now instead of waiting for run_solution() to do it + // automatically. This will remove overhead from first call. + context->copy_vars_to_device(); // Invoke auto-tuner. if (opts.do_pre_auto_tune) @@ -332,47 +337,58 @@ int main(int argc, char** argv) // Enable/disable further auto-tuning. ksoln->reset_auto_tuner(copts->_do_auto_tune); + // Make sure warmup is on if needed. + if (opts.trial_steps <= 0 && opts.trial_time > 0) + opts.do_warmup = true; + // Warmup caches, threading, etc. // Measure time to change number of steps. if (opts.do_warmup) { // Turn off debug. auto dbg_out = context->get_debug_output(); - context->set_debug_output(yof.new_null_output()); + context->disable_debug_output(); os << endl << div_line; // Warmup and calibration phases. double rate = 1.0; - idx_t warmup_steps = 1; - idx_t max_wsteps = 10; - for (int n = 0; n < 3; n++) { - - // Run steps. 
- // Always run warmup forward, even for reverse stencils. - // (The result will be meaningless, but that doesn't matter.) - os << "Running " << warmup_steps << " step(s) for " << - (n ? "calibration" : "warm-up") << "...\n" << flush; - kenv->global_barrier(); - ksoln->run_solution(0, warmup_steps-1); - kenv->global_barrier(); - auto stats = context->get_stats(); - auto wtime = stats->get_elapsed_secs(); - os << " Done in " << make_num_str(wtime) << " secs.\n"; - rate = (wtime > 0.) ? double(warmup_steps) / wtime : 0; - - // Done if time est. isn't needed. - if (opts.trial_steps > 0) - break; - - // Use time to set number of steps for next trial. - double warmup_time = 0.5 * (n + 1); - warmup_steps = ceil(rate * warmup_time); - warmup_steps = min(warmup_steps, max_wsteps); - max_wsteps *= max_wsteps; - - // Average across all ranks because it is critical that - // all ranks use the same number of steps to avoid deadlock. - warmup_steps = CEIL_DIV(sum_over_ranks(warmup_steps, ep->comm), num_ranks); + { + idx_t warmup_steps = 1; + idx_t max_wsteps = 10; + int nruns = 3; + for (int n = 0; n < nruns; n++) { + + // Run steps. + // Always run warmup forward, even for reverse stencils. + // (The result will be meaningless, but that doesn't matter.) + os << "Running " << warmup_steps << " step(s) for " << + (n ? "calibration" : "warm-up") << "...\n" << flush; + kenv->global_barrier(); + ksoln->run_solution(0, warmup_steps-1); + kenv->global_barrier(); + auto stats = context->get_stats(); + auto wtime = stats->get_elapsed_secs(); + os << " Done in " << make_num_str(wtime) << " secs.\n"; + rate = (wtime > 0.) ? double(warmup_steps) / wtime : 0; + + // Done if time est. isn't needed. + if (opts.trial_steps > 0) + break; + + // Use time to set number of steps for next trial. 
+ double warmup_time = 0.5 * (n + 1); + warmup_steps = ceil(rate * warmup_time); + warmup_steps = min(warmup_steps, max_wsteps); + max_wsteps *= max_wsteps; + + // Average across all ranks because it is critical that + // all ranks use the same number of steps to avoid deadlock. + warmup_steps = CEIL_DIV(sum_over_ranks(warmup_steps, ep->comm), num_ranks); + + // Done if only 1 step to do. + if (warmup_steps <= 1) + break; + } } // Set final number of steps. @@ -382,7 +398,7 @@ int main(int argc, char** argv) // Round up to multiple of temporal tiling if not too big. auto step_dim = ksoln->get_step_dim_name(); - auto rt = copts->_region_sizes[step_dim]; + auto rt = copts->_mega_block_sizes[step_dim]; auto bt = copts->_block_sizes[step_dim]; auto tt = max(rt, bt); const idx_t max_mult = 5; @@ -424,28 +440,36 @@ int main(int argc, char** argv) opts.trial_steps << " step(s) each...\n" << flush; for (idx_t tr = 0; tr < opts.num_trials; tr++) { os << div_line << - "Trial number: " << (tr + 1) << endl << flush; + "Trial number: " << (tr + 1) << endl << flush; - // init data before each trial for comparison if validating. - if (opts.validate) - context->init_data(); + // re-init data before each trial for comparison if validating. + if (opts.validate) { + init_vars(opts, context); + context->copy_vars_to_device(); + } // Warn if tuning. if (ksoln->is_auto_tuner_enabled()) os << "auto-tuner is active during this trial, so results may not be representative.\n"; - // Stabilize. + // Pause before trial. if (opts.pre_trial_sleep_time > 0) { os << flush; sleep(opts.pre_trial_sleep_time); } kenv->global_barrier(); + // Start vtune collection. + VTUNE_RESUME; + // Actual work. context->clear_timers(); ksoln->run_solution(first_t, last_t); kenv->global_barrier(); + // Stop vtune collection. + VTUNE_PAUSE; + // Calc and report perf. 
auto tstats = context->get_stats(); auto stats = dynamic_pointer_cast(tstats); @@ -454,6 +478,9 @@ int main(int argc, char** argv) trial_stats.push_back(stats); } + // Done with vtune. + VTUNE_DETACH; + // Report stats. if (trial_stats.size()) { @@ -467,8 +494,34 @@ int main(int argc, char** argv) auto& best_trial = trial_stats.front(); auto r50 = trial_stats.size() / 2; auto& mid_trial = trial_stats.at(r50); + + // Additional stats. + double sum_pps = 0., sum2_pps = 0., max_pps = 0., min_pps = 0.; + for (auto ts : trial_stats) { + auto pps = ts->pts_ps; + sum_pps += pps; + sum2_pps += pps * pps; + if (max_pps == 0. || pps > max_pps) + max_pps = pps; + if (min_pps == 0. || pps < min_pps) + min_pps = pps; + } + auto n = trial_stats.size(); + double ave_pps = sum_pps / n; + double var_pps = (sum2_pps - (sum_pps * sum_pps) / n) / (n - 1.); + double sd_pps = sqrt(var_pps); os << div_line << + "Throughput stats across trials:\n" + " num-trials: " << n << endl << + " min-throughput (num-points/sec): " << make_num_str(min_pps) << endl << + " max-throughput (num-points/sec): " << make_num_str(max_pps) << endl << + " ave-throughput (num-points/sec): " << make_num_str(ave_pps) << endl; + if (n > 2) + os << + " std-dev-throughput (num-points/sec): " << make_num_str(sd_pps) << endl; + os << + div_line << "Performance stats of best trial:\n" " best-num-steps-done: " << best_trial->nsteps << endl << " best-elapsed-time (sec): " << make_num_str(best_trial->run_time) << endl << @@ -483,19 +536,29 @@ int main(int argc, char** argv) " mid-throughput (num-reads/sec): " << make_num_str(mid_trial->reads_ps) << endl << " mid-throughput (num-writes/sec): " << make_num_str(mid_trial->writes_ps) << endl << " mid-throughput (est-FLOPS): " << make_num_str(mid_trial->flops) << endl << - " mid-throughput (num-points/sec): " << make_num_str(mid_trial->pts_ps) << endl << - div_line << - "Notes:\n" - " The 50th-percentile trial is the same as the median trial\n" - " when there is an odd number of 
trials. When there is an even\n" - " number of trials, the nearest-rank method is used. An odd\n" - " number of trials is recommended.\n" + " mid-throughput (num-points/sec): " << make_num_str(mid_trial->pts_ps) << endl; + os << div_line << + "Notes:\n"; + if (n == 1) + os << " Since there was only one trial, the best trial and the\n" + " 50th-percentile trial are the one and only trial.\n"; + else { + if (n % 2 == 1) + os << " Since there was an odd number of trials, the\n" + " 50th-percentile trial is the trial with the median performance:\n"; + else + os << " Since there was an even number of trials, the\n" + " 50th-percentile trial is chosen with the nearest-rank method:\n"; + os << " the trial with performance ranked " << (r50+1) << " out of " << + trial_stats.size() << ".\n"; + } + os << " Num-reads/sec, num-writes/sec, and FLOPS are metrics based on\n" " stencil specifications and can vary due to differences in\n" " implementations and optimizations.\n" - " Num-points/sec is based on overall problem size and is\n" - " a more reliable performance metric, esp. when comparing\n" - " across implementations.\n"; + " Num-points/sec is based on the number of results computed and\n" + " is a more reliable performance metric, esp. when comparing\n" + " across architectures and/or implementations.\n"; context->print_warnings(); } @@ -511,7 +574,6 @@ int main(int argc, char** argv) auto ref_soln = kfac.new_solution(kenv, ksoln); auto ref_context = dynamic_pointer_cast(ref_soln); assert(ref_context.get()); - auto& ref_opts = ref_context->get_settings(); // Reapply cmd-line options to override default settings. MySettings my_ref_opts(ref_soln); @@ -519,38 +581,27 @@ int main(int argc, char** argv) // Change some settings. ref_context->name += "-reference"; - ref_context->allow_vec_exchange = false; // exchange scalars in halos. 
+ auto& ref_opts = ref_context->get_actl_opts(); + ref_opts->force_scalar_exchange = true; + ref_opts->do_halo_exchange = true; ref_opts->overlap_comms = false; ref_opts->use_shm = false; ref_opts->_numa_pref = yask_numa_none; - // TODO: re-enable the region and block settings below; - // requires allowing consistent init of different-sized vars - // in kernel code. -#if 0 - auto sdim = ref_soln->get_step_dim_name(); - ref_soln->set_region_size(sdim, 0); - ref_soln->set_block_size(sdim, 0); - for (auto ddim : ref_soln->get_domain_dim_names()) { - ref_soln->set_region_size(ddim, 0); - ref_soln->set_block_size(ddim, 0); - } -#endif - // Override allocations and prep solution as with ref soln. alloc_steps(ref_soln, my_ref_opts); ref_soln->prepare_solution(); // init to same value used in context. - ref_context->init_data(); + init_vars(opts, ref_context); -#ifdef CHECK_INIT + #ifdef CHECK_INIT // Debug code to determine if data compares immediately after init matches. os << endl << div_line << "Reinitializing data for minimal validation...\n" << flush; - context->init_data(); -#else + init_vars(opts, context); + #else // Ref trial. // Do same number as last perf run. @@ -560,11 +611,11 @@ int main(int argc, char** argv) // Discard perf report. auto dbg_out = ref_context->get_debug_output(); - ref_context->set_debug_output(yof.new_null_output()); + ref_context->disable_debug_output(); auto rstats = ref_context->get_stats(); ref_context->set_debug_output(dbg_out); os << " Done in " << make_num_str(rstats->get_elapsed_secs()) << " secs.\n" << flush; -#endif + #endif // check for equality. os << "\nChecking results...\n"; idx_t errs = context->compare_data(*ref_context); @@ -576,12 +627,12 @@ int main(int argc, char** argv) for (int r = 0; r < kenv->get_num_ranks(); r++) { kenv->global_barrier(); if (r == ri) { + + // Use 'cerr' to print on all ranks in case rank printing to 'os' + // passed and other(s) failed. 
if( errs == 0 ) - os << "TEST PASSED on rank " << ri << ".\n" << flush; + cerr << "TEST PASSED on rank " << ri << ".\n" << flush; else { - - // Use 'cerr' to print on all ranks in case rank printing to 'os' - // passed and other(s) failed. cerr << "TEST FAILED on rank " << ri << ": " << errs << " mismatch(es).\n"; if (REAL_BYTES < 8) cerr << " Small differences are not uncommon for low-precision FP; " @@ -597,7 +648,7 @@ int main(int argc, char** argv) os << "\nResults NOT VERIFIED.\n"; ksoln->end_solution(); - os << "Stencil '" << ksoln->get_name() << "'.\n"; + os << "Stencil '" << context->get_description() << "'.\n"; if (!ok) exit_yask(1); diff --git a/src/stencils/AwpStencil.cpp b/src/stencils/AwpStencil.cpp index 73bae7f3..a6e85e24 100644 --- a/src/stencils/AwpStencil.cpp +++ b/src/stencils/AwpStencil.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -45,6 +45,8 @@ using namespace yask; #define DO_ABOVE_SURFACE // Set the following macro to use intermediate scratch vars. +// This is a compute/memory tradeoff: using scratch vars reduces +// compute and increases memory accesses. //#define USE_SCRATCH_VARS // For the surface stress conditions, we need to write into 2 points @@ -521,11 +523,7 @@ namespace { soln->set_prefetch_dist(1, 1); soln->set_prefetch_dist(2, 0); } - else if (target == "avx512") { - soln->set_prefetch_dist(1, 1); - soln->set_prefetch_dist(2, 0); - } - else { + else if (target == "avx2") { soln->set_prefetch_dist(1, 1); soln->set_prefetch_dist(2, 2); } @@ -535,29 +533,28 @@ namespace { // This code is run immediately after 'kernel_soln' is created. soln->CALL_AFTER_NEW_SOLUTION ( - // Add extra padding in all dimensions. 
- kernel_soln.apply_command_line_options("-ep 1"); - - // Check target at kernel run-time. - auto isa = kernel_soln.get_target(); - if (isa == "knl") { - - // Use half the threads: 2 threads on 2 cores per block. - kernel_soln.apply_command_line_options("-thread_divisor 2 -block_threads 4"); + // Check CPU target at kernel run-time. + if (!kernel_soln.is_offloaded()) { + auto isa = kernel_soln.get_target(); + if (isa == "knl") { + + // Use half the threads: 2 threads on 2 cores per block. + kernel_soln.apply_command_line_options("-thread_divisor 2 -block_threads 4"); - kernel_soln.set_block_size("x", 48); - kernel_soln.set_block_size("y", 48); - kernel_soln.set_block_size("z", 112); - } - else if (isa == "avx512") { - kernel_soln.set_block_size("x", 64); - kernel_soln.set_block_size("y", 8); - kernel_soln.set_block_size("z", 108); - } - else { - kernel_soln.set_block_size("x", 64); - kernel_soln.set_block_size("y", 8); - kernel_soln.set_block_size("z", 64); + kernel_soln.set_block_size("x", 48); + kernel_soln.set_block_size("y", 48); + kernel_soln.set_block_size("z", 112); + } + else if (isa == "avx2") { + kernel_soln.set_block_size("x", 64); + kernel_soln.set_block_size("y", 8); + kernel_soln.set_block_size("z", 64); + } + else { + kernel_soln.set_block_size("x", 116); + kernel_soln.set_block_size("y", 8); + kernel_soln.set_block_size("z", 128); + } } ); } diff --git a/src/stencils/ElasticStencil/Elastic2Stencil.hpp b/src/stencils/ElasticStencil/Elastic2Stencil.hpp index 6c5a0781..95e39764 100644 --- a/src/stencils/ElasticStencil/Elastic2Stencil.hpp +++ b/src/stencils/ElasticStencil/Elastic2Stencil.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff 
--git a/src/stencils/ElasticStencil/ElasticStencil.hpp b/src/stencils/ElasticStencil/ElasticStencil.hpp index 9832fa51..200b7dbc 100644 --- a/src/stencils/ElasticStencil/ElasticStencil.hpp +++ b/src/stencils/ElasticStencil/ElasticStencil.hpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/stencils/FSGElastic2Stencil.cpp b/src/stencils/FSGElastic2Stencil.cpp index d13cff7c..a011fd19 100644 --- a/src/stencils/FSGElastic2Stencil.cpp +++ b/src/stencils/FSGElastic2Stencil.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/stencils/FSGElasticStencil.cpp b/src/stencils/FSGElasticStencil.cpp index b25bdd76..6e8f219e 100644 --- a/src/stencils/FSGElasticStencil.cpp +++ b/src/stencils/FSGElasticStencil.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -299,25 +299,27 @@ namespace fsg { // This code is run immediately after 'kernel_soln' is created. soln->CALL_AFTER_NEW_SOLUTION ( - // Check target at kernel run-time. - auto isa = kernel_soln.get_target(); - if (isa == "knl") { - // Use only 1 thread per core. 
- kernel_soln.apply_command_line_options("-thread_divisor 4 -block_threads 2"); - - kernel_soln.set_block_size("x", 16); - kernel_soln.set_block_size("y", 16); - kernel_soln.set_block_size("z", 16); - } - else if (isa == "avx512") { - kernel_soln.set_block_size("x", 188); - kernel_soln.set_block_size("y", 12); - kernel_soln.set_block_size("z", 24); - } - else { - kernel_soln.set_block_size("x", 48); - kernel_soln.set_block_size("y", 4); - kernel_soln.set_block_size("z", 128); + // Check CPU target at kernel run-time. + if (!kernel_soln.is_offloaded()) { + auto isa = kernel_soln.get_target(); + if (isa == "knl") { + // Use only 1 thread per core. + kernel_soln.apply_command_line_options("-thread_divisor 4 -block_threads 2"); + + kernel_soln.set_block_size("x", 16); + kernel_soln.set_block_size("y", 16); + kernel_soln.set_block_size("z", 16); + } + else if (isa == "avx2") { + kernel_soln.set_block_size("x", 48); + kernel_soln.set_block_size("y", 4); + kernel_soln.set_block_size("z", 128); + } + else { + kernel_soln.set_block_size("x", 188); + kernel_soln.set_block_size("y", 12); + kernel_soln.set_block_size("z", 24); + } } ); } diff --git a/src/stencils/Iso3dfdStencil.cpp b/src/stencils/Iso3dfdStencil.cpp index 5e34b539..3c745273 100644 --- a/src/stencils/Iso3dfdStencil.cpp +++ b/src/stencils/Iso3dfdStencil.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -64,20 +64,25 @@ namespace { // Define RHS expression for p at t+1 based on values from v and p at t. virtual yc_number_node_ptr get_next_p() { - // yc_var_proxy spacing. + // Grid spacing. // In this implementation, it's a constant. - // Could make this a YASK variable to allow setting at run-time. 
+ // Could make this a scalar YASK variable to allow setting at run-time. double delta_xyz = 50.0; double d2 = delta_xyz * delta_xyz; // Spatial FD coefficients for 2nd derivative. - auto coeff = get_center_fd_coefficients(2, get_radius()); - size_t c0i = get_radius(); // index of center sample. + vector coeff; + size_t rad = get_radius(); + if (rad > 0) + coeff = get_center_fd_coefficients(2, get_radius()); + else + coeff.push_back(1.0); // Dummy value for zero radius (not FD). for (size_t i = 0; i < coeff.size(); i++) { // Need 3 copies of center sample for x, y, and z FDs. - if (i == c0i) + // Center coeff is at index 'rad'. + if (i == rad) coeff[i] *= 3.0; // Divide each by delta_xyz^2. @@ -86,7 +91,7 @@ namespace { // Calculate FDx + FDy + FDz. // Start with center value multiplied by coeff 0. - auto fd_sum = p(t, x, y, z) * coeff[c0i]; + auto fd_sum = p(t, x, y, z) * coeff[rad]; // Add values from x, y, and z axes multiplied by the // coeff for the given radius. @@ -106,7 +111,7 @@ namespace { p(t, x, y, z-r) + p(t, x, y, z+r) - ) * coeff[c0i + r]; // R & L coeffs are identical. + ) * coeff[rad + r]; // R & L coeffs are identical. } // Wave equation is: @@ -167,11 +172,7 @@ namespace { soln->set_prefetch_dist(1, 1); soln->set_prefetch_dist(2, 0); } - else if (target == "avx512") { - soln->set_prefetch_dist(1, 0); - soln->set_prefetch_dist(2, 2); - } - else { + else if (target == "avx2") { soln->set_prefetch_dist(1, 0); soln->set_prefetch_dist(2, 0); } @@ -181,25 +182,24 @@ namespace { // This code is run immediately after 'kernel_soln' is created. soln->CALL_AFTER_NEW_SOLUTION ( - // Add extra padding in all dimensions. - kernel_soln.apply_command_line_options("-ep 1"); - - // Check target at kernel run-time. 
- auto isa = kernel_soln.get_target(); - if (isa == "knl") { - kernel_soln.set_block_size("x", 160); - kernel_soln.set_block_size("y", 256); - kernel_soln.set_block_size("z", 96); - } - else if (isa == "avx512") { - kernel_soln.set_block_size("x", 108); - kernel_soln.set_block_size("y", 28); - kernel_soln.set_block_size("z", 132); - } - else { - kernel_soln.set_block_size("x", 48); - kernel_soln.set_block_size("y", 64); - kernel_soln.set_block_size("z", 112); + // Check CPU target at kernel run-time. + if (!kernel_soln.is_offloaded()) { + auto isa = kernel_soln.get_target(); + if (isa == "knl") { + kernel_soln.set_block_size("x", 160); + kernel_soln.set_block_size("y", 256); + kernel_soln.set_block_size("z", 96); + } + else if (isa == "avx2") { + kernel_soln.set_block_size("x", 48); + kernel_soln.set_block_size("y", 64); + kernel_soln.set_block_size("z", 112); + } + else { + kernel_soln.set_block_size("x", 96); + kernel_soln.set_block_size("y", 28); + kernel_soln.set_block_size("z", 96); + } } ); } diff --git a/src/stencils/SSGElastic2Stencil.cpp b/src/stencils/SSGElastic2Stencil.cpp index 8fdd753f..0144b654 100644 --- a/src/stencils/SSGElastic2Stencil.cpp +++ b/src/stencils/SSGElastic2Stencil.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/stencils/SSGElasticStencil.cpp b/src/stencils/SSGElasticStencil.cpp index 4c64d729..668f17cf 100644 --- a/src/stencils/SSGElasticStencil.cpp +++ b/src/stencils/SSGElasticStencil.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -148,22 +148,24 @@ class SSGElasticStencil : public ElasticStencilBase { // This code is run immediately after 'kernel_soln' is created. soln->CALL_AFTER_NEW_SOLUTION ( - // Check target at kernel run-time. - auto isa = kernel_soln.get_target(); - if (isa == "knl") { - kernel_soln.set_block_size("x", 16); - kernel_soln.set_block_size("y", 16); - kernel_soln.set_block_size("z", 32); - } - else if (isa == "avx512") { - kernel_soln.set_block_size("x", 96); - kernel_soln.set_block_size("y", 16); - kernel_soln.set_block_size("z", 80); - } - else { - kernel_soln.set_block_size("x", 64); - kernel_soln.set_block_size("y", 16); - kernel_soln.set_block_size("z", 96); + // Check CPU target at kernel run-time. + if (!kernel_soln.is_offloaded()) { + auto isa = kernel_soln.get_target(); + if (isa == "knl") { + kernel_soln.set_block_size("x", 16); + kernel_soln.set_block_size("y", 16); + kernel_soln.set_block_size("z", 32); + } + else if (isa == "avx2") { + kernel_soln.set_block_size("x", 64); + kernel_soln.set_block_size("y", 16); + kernel_soln.set_block_size("z", 96); + } + else { + kernel_soln.set_block_size("x", 96); + kernel_soln.set_block_size("y", 16); + kernel_soln.set_block_size("z", 80); + } } ); } diff --git a/src/stencils/SimpleStencils.cpp b/src/stencils/SimpleStencils.cpp index 0978e81f..e06b40d8 100644 --- a/src/stencils/SimpleStencils.cpp +++ b/src/stencils/SimpleStencils.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to diff --git a/src/stencils/TTIStencil.cpp b/src/stencils/TTIStencil.cpp index 
e599e552..1a58a73c 100644 --- a/src/stencils/TTIStencil.cpp +++ b/src/stencils/TTIStencil.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -24,7 +24,6 @@ IN THE SOFTWARE. *****************************************************************************/ // Implement TTI stencil using formulation from Devito project. -// TODO: provide a more readable TTI formulation. // YASK stencil solution(s) in this file will be integrated into the YASK compiler utility. #include "yask_compiler_api.hpp" @@ -83,514 +82,536 @@ namespace { } - // The following statements were auto-generated by the Devito symbolic finite-difference package. - // Define equation for 'u' and 'v' at t+1 based on values at t // for spatial order = 4. virtual void define_so4 () { - // Set BKC (best-known configs) found by automated and/or manual - // tuning. - // Only have BKCs for SP FP (4B). - auto soln = get_soln(); // pointer to compile-time soln. - if (soln->get_element_bytes() == 4) { + // Set BKC (best-known configs) found by automated and/or manual + // tuning. + // Only have BKCs for SP FP (4B). + auto soln = get_soln (); // pointer to compile-time soln. + if (soln->get_element_bytes () == 4) + { + + // Kernel run-time defaults, e.g., block-sizes. + // This code is run immediately after 'kernel_soln' is created. + soln->CALL_AFTER_NEW_SOLUTION + (kernel_soln.set_block_size ("x", 80); + kernel_soln.set_block_size ("y", 16); + kernel_soln.set_block_size ("z", 40);); + } + + // The following statements were auto-generated by the Devito symbolic + // finite-difference package. - // Kernel run-time defaults, e.g., block-sizes. 
- // This code is run immediately after 'kernel_soln' is created. - soln->CALL_AFTER_NEW_SOLUTION - ( - // Check target at kernel run-time. - auto isa = kernel_soln.get_target(); - if (isa == "avx512") { - kernel_soln.set_block_size("x", 80); - kernel_soln.set_block_size("y", 16); - kernel_soln.set_block_size("z", 40); - } - ); - } - auto temp161 = 2.5e-2 * (-u (t, x - 1, y, z) + u (t, x + 1, y, z)); auto temp163 = -7.5e-2 * u (t, x, y, z) + 1.0e-1 * u (t, x, y + 1, - z) - 2.5e-2 * u (t, x, y + 2, - z); + z) - 2.5e-2 * u (t, x, y + 2, + z); auto temp192 = 1.0e-1 * u (t, x, y, z) - 7.5e-2 * u (t, x, y - 1, - z) - 2.5e-2 * u (t, x, y + 1, - z); + z) - 2.5e-2 * u (t, x, y + 1, + z); auto temp183 = -2.5e-2 * u (t, x, y, z) - 7.5e-2 * u (t, x - 2, y, - z) + 1.0e-1 * u (t, x - 1, y, - z); - auto temp168 = -7.5e-2 * u (t, x, y + 1, z) + 1.0e-1 * u (t, x + 1, y + 1, - z) - 2.5e-2 * u (t, x + 2, - y + 1, - z); - auto temp197 = 3.75e-2 * (temp161 * ti1 (x, y, z) * ti2 (x, y, z) + temp163 * ti2 (x, y, z) * ti3 (x, y, - z) - (-7.5e-2 * u (t, x, y, - z) + 1.0e-1 * u (t, x, y, - z + 1) - 2.5e-2 * u (t, x, y, - z + 2)) * ti0 (x, y, z)); - auto temp171 = -7.5e-2 * u (t, x + 1, y, z) + 1.0e-1 * u (t, x + 1, y + 1, - z) - 2.5e-2 * u (t, x + 1, - y + 2, - z); + z) + 1.0e-1 * u (t, x - 1, y, + z); + auto temp168 = + -7.5e-2 * u (t, x, y + 1, z) + 1.0e-1 * u (t, x + 1, y + 1, + z) - 2.5e-2 * u (t, x + 2, + y + 1, + z); + auto temp197 = + 3.75e-2 * (temp161 * ti1 (x, y, z) * ti2 (x, y, z) + temp163 * ti2 (x, + y, z) * ti3 (x, y, + z) - (-7.5e-2 * u (t, x, y, + z) + 1.0e-1 * u (t, x, y, + z + 1) - 2.5e-2 * u (t, x, y, + z + 2)) * ti0 (x, y, z)); + auto temp171 = + -7.5e-2 * u (t, x + 1, y, z) + 1.0e-1 * u (t, x + 1, y + 1, + z) - 2.5e-2 * u (t, x + 1, + y + 2, + z); auto temp178 = 2.5e-2 * (u (t, x, y, z) - u (t, x - 2, y, z)); - auto temp182 = 2.5e-2 * (-u (t, x - 2, y - 1, z) + u (t, x - 2, y + 1, z)); + auto temp182 = + 2.5e-2 * (-u (t, x - 2, y - 1, z) + u (t, x - 2, y + 1, z)); auto 
temp190 = 1.0e-1 * u (t, x, y, z) - 7.5e-2 * u (t, x - 1, y, - z) - 2.5e-2 * u (t, x + 1, y, - z); + z) - 2.5e-2 * u (t, x + 1, y, + z); auto temp173 = 2.5e-2 * (u (t, x, y, z) - u (t, x, y - 2, z)); - auto temp188 = 2.5e-2 * (-u (t, x - 1, y - 1, z) + u (t, x - 1, y + 1, z)); + auto temp188 = + 2.5e-2 * (-u (t, x - 1, y - 1, z) + u (t, x - 1, y + 1, z)); auto temp154 = 2.5e-2 * (-u (t, x, y - 1, z) + u (t, x, y + 1, z)); auto temp186 = -2.5e-2 * u (t, x, y, z) - 7.5e-2 * u (t, x, y - 2, - z) + 1.0e-1 * u (t, x, y - 1, - z); + z) + 1.0e-1 * u (t, x, y - 1, + z); auto temp158 = -7.5e-2 * u (t, x, y, z) + 1.0e-1 * u (t, x + 1, y, - z) - 2.5e-2 * u (t, x + 2, y, - z); - auto temp191 = 2.5e-2 * (-u (t, x - 1, y - 1, z) + u (t, x + 1, y - 1, z)); - auto temp184 = 2.5e-2 * (-u (t, x - 1, y - 2, z) + u (t, x + 1, y - 2, z)); - auto temp181 = -7.5e-2 * u (t, x - 1, y, z) + 1.0e-1 * u (t, x - 1, y + 1, - z) - 2.5e-2 * u (t, x - 1, - y + 2, - z); - auto temp176 = -7.5e-2 * u (t, x, y - 1, z) + 1.0e-1 * u (t, x + 1, y - 1, - z) - 2.5e-2 * u (t, x + 2, - y - 1, - z); - auto temp123 = (2.5e-2 * ((v (t, x, y, z) - v (t, x, y - 2, z)) * ti0 (x, y - 1, z) * ti3 (x, - y - 1, - z) + (-v (t, x, y - 1, z - 1) + v (t, x, y - 1, z + 1)) * ti2 (x, y - 1, - z)) + - (-7.5e-2 * v (t, x, y - 1, z) + 1.0e-1 * v (t, x + 1, y - 1, z) - 2.5e-2 * v (t, x + 2, y - 1, z)) * ti0 (x, y - 1, z) * ti1 (x, y - 1, - z)) * ti0 (x, y - 1, z) * ti3 (x, y - 1, z); - auto temp102 = (2.5e-2 * ((-v (t, x - 1, y, z - 1) + v (t, x - 1, y, z + 1)) * ti2 (x - 1, y, - z) + (-v (t, - x - 1, - y - 1, - z) + v (t, - x - 1, - y + 1, - z)) * ti0 (x - 1, y, z) * ti3 (x - 1, y, z)) + (1.0e-1 * v (t, x, y, - z) - 7.5e-2 * v (t, x - 1, y, - z) - 2.5e-2 * v (t, x + 1, y, - z)) * ti0 (x - 1, - y, - z) * ti1 (x - 1, y, z)) * ti0 (x - 1, y, - z) * - ti1 (x - 1, y, z); - auto temp131 = (2.5e-2 * (-v (t, x - 1, y, z - 2) + v (t, x + 1, y, z - 2)) * ti0 (x, y, z - 2) * ti1 (x, y, - z - 2) + (-2.5e-2 * v (t, x, y, - z) - 7.5e-2 * v 
(t, - x, - y, - z - 2) + 1.0e-1 * v (t, x, y, - z - 1)) * ti2 (x, y, - z - 2) + - (-7.5e-2 * v (t, x, y, z - 2) + 1.0e-1 * v (t, x, y + 1, z - 2) - 2.5e-2 * v (t, x, y + 2, z - 2)) * ti0 (x, y, z - 2) * ti3 (x, y, - z - 2)) * ti2 (x, y, z - 2); - auto temp87 = (2.5e-2 * ((-v (t, x - 2, y, z - 1) + v (t, x - 2, y, z + 1)) * ti2 (x - 2, y, - z) + (-v (t, - x - 2, - y - 1, - z) + v (t, - x - 2, - y + 1, - z)) * ti0 (x - 2, y, z) * ti3 (x - 2, y, z)) + (-2.5e-2 * v (t, x, y, - z) - 7.5e-2 * v (t, x - 2, y, - z) + 1.0e-1 * v (t, x - 1, y, - z)) * ti0 (x - 2, - y, - z) * ti1 (x - 2, y, z)) * ti0 (x - 2, y, - z) * - ti1 (x - 2, y, z); - auto temp47 = 2.5e-2 * (-v (t, x - 1, y, z) + v (t, x + 1, y, z)) * ti0 (x, y, - z) * ti1 (x, - y, - z) + (-7.5e-2 * v (t, x, y, z) + 1.0e-1 * v (t, x, y, z + 1) - 2.5e-2 * v (t, x, y, z + 2)) * ti2 (x, y, z) + (-7.5e-2 * v (t, x, - y, - z) + 1.0e-1 * v (t, x, - y + 1, - z) - 2.5e-2 * v (t, x, - y + 2, - z)) * - ti0 (x, y, z) * ti3 (x, y, z); - auto temp110 = (2.5e-2 * ((-v (t, x, y, z) + v (t, x, y + 2, z)) * ti0 (x, y + 1, z) * ti3 (x, - y + 1, - z) + (-v (t, x, y + 1, z - 1) + v (t, x, y + 1, z + 1)) * ti2 (x, y + 1, - z)) + - (-7.5e-2 * v (t, x, y + 1, z) + 1.0e-1 * v (t, x + 1, y + 1, z) - 2.5e-2 * v (t, x + 2, y + 1, z)) * ti0 (x, y + 1, z) * ti1 (x, y + 1, - z)) * ti0 (x, y + 1, z) * ti3 (x, y + 1, z); - auto temp140 = (2.5e-2 * (-v (t, x, y, z) + v (t, x + 2, y, z)) * ti0 (x + 1, y, z) * ti1 (x + 1, y, - z) + (-7.5e-2 * v (t, x + 1, y, z) + 1.0e-1 * v (t, x + 1, y, - z + 1) - 2.5e-2 * v (t, x + 1, y, - z + 2)) * ti2 (x + 1, y, - z) + - (-7.5e-2 * v (t, x + 1, y, z) + 1.0e-1 * v (t, x + 1, y + 1, z) - 2.5e-2 * v (t, x + 1, y + 2, z)) * ti0 (x + 1, y, z) * ti3 (x + 1, y, - z)) * ti0 (x + 1, y, - z) * ti1 (x + 1, y, - z); - auto temp149 = (2.5e-2 * (-v (t, x - 1, y - 2, z) + v (t, x + 1, y - 2, z)) * ti0 (x, y - 2, z) * ti1 (x, y - 2, - z) + (-2.5e-2 * v (t, x, y, - z) - 7.5e-2 * v (t, x, - y - 2, - z) + 1.0e-1 * v (t, x, y - 1, z)) * 
ti0 (x, - y - 2, - z) * ti3 (x, y - 2, - z) + - (-7.5e-2 * v (t, x, y - 2, z) + 1.0e-1 * v (t, x, y - 2, - z + 1) - 2.5e-2 * v (t, x, y - 2, z + 2)) * ti2 (x, y - 2, - z)) * ti0 (x, - y - 2, - z) * ti3 (x, y - 2, z); - auto temp39 = (2.5e-2 * ((-v (t, x, y, z - 1) + v (t, x, y, z + 1)) * ti2 (x, y, - z) + (-v (t, x, - y - 1, - z) + v (t, - x, - y + 1, - z)) * ti0 (x, y, z) * ti3 (x, y, z)) + (-7.5e-2 * v (t, x, y, - z) + 1.0e-1 * v (t, - x + 1, - y, - z) - 2.5e-2 * v (t, x + 2, - y, - z)) * ti0 (x, y, - z) * ti1 (x, y, z)) * ti0 (x, y, z) * ti1 (x, y, z); - auto temp132 = (2.5e-2 * (-v (t, x - 1, y, z - 1) + v (t, x + 1, y, z - 1)) * ti0 (x, y, z - 1) * ti1 (x, y, - z - 1) + (1.0e-1 * v (t, x, y, - z) - 7.5e-2 * v (t, x, - y, - z - 1) - 2.5e-2 * v (t, x, y, - z + 1)) * ti2 (x, y, - z - 1) + - (-7.5e-2 * v (t, x, y, z - 1) + 1.0e-1 * v (t, x, y + 1, z - 1) - 2.5e-2 * v (t, x, y + 2, z - 1)) * ti0 (x, y, z - 1) * ti3 (x, y, - z - 1)) * ti2 (x, y, z - 1); - auto temp6 = 1.0 / (8.85879567828298e-1 * damp (x, y, z) + 2.0 * m (x, y, z)); - auto temp141 = (2.5e-2 * (v (t, x, y, z) - v (t, x - 2, y, z)) * ti0 (x - 1, y, z) * ti1 (x - 1, y, - z) + (-7.5e-2 * v (t, x - 1, y, z) + 1.0e-1 * v (t, x - 1, y, - z + 1) - 2.5e-2 * v (t, x - 1, y, - z + 2)) * ti2 (x - 1, - y, - z) - + (-7.5e-2 * v (t, x - 1, y, z) + 1.0e-1 * v (t, x - 1, y + 1, z) - 2.5e-2 * v (t, x - 1, y + 2, z)) * ti0 (x - 1, y, z) * ti3 (x - 1, y, - z)) * ti0 (x - 1, y, - z) * ti1 (x - 1, y, - z); + z) - 2.5e-2 * u (t, x + 2, y, + z); + auto temp191 = + 2.5e-2 * (-u (t, x - 1, y - 1, z) + u (t, x + 1, y - 1, z)); + auto temp184 = + 2.5e-2 * (-u (t, x - 1, y - 2, z) + u (t, x + 1, y - 2, z)); + auto temp181 = + -7.5e-2 * u (t, x - 1, y, z) + 1.0e-1 * u (t, x - 1, y + 1, + z) - 2.5e-2 * u (t, x - 1, + y + 2, + z); + auto temp176 = + -7.5e-2 * u (t, x, y - 1, z) + 1.0e-1 * u (t, x + 1, y - 1, + z) - 2.5e-2 * u (t, x + 2, + y - 1, + z); + auto temp123 = + (2.5e-2 * ((v (t, x, y, z) - v (t, x, y - 2, z)) * ti0 (x, y 
- 1, + z) * ti3 (x, + y - 1, + z) + (-v (t, x, y - 1, z - 1) + v (t, x, y - 1, z + 1)) * ti2 (x, + y - 1, + z)) + (-7.5e-2 * v (t, x, y - 1, z) + 1.0e-1 * v (t, x + 1, y - 1, + z) - 2.5e-2 * v (t, x + 2, y - 1, z)) * ti0 (x, y - 1, + z) * ti1 (x, y - 1, + z)) * ti0 (x, y - 1, z) * ti3 (x, y - 1, z); + auto temp102 = + (2.5e-2 * ((-v (t, x - 1, y, z - 1) + v (t, x - 1, y, + z + 1)) * ti2 (x - 1, y, + z) + (-v (t, + x - 1, + y - 1, + z) + v (t, + x - 1, + y + 1, + z)) * ti0 (x - 1, y, z) * ti3 (x - 1, y, z)) + (1.0e-1 * v (t, + x, y, + z) - 7.5e-2 * v (t, x - 1, y, + z) - 2.5e-2 * v (t, x + 1, y, + z)) * ti0 (x - 1, + y, + z) * ti1 (x - 1, y, z)) * ti0 (x - 1, y, + z) * ti1 (x - 1, y, z); + auto temp131 = + (2.5e-2 * (-v (t, x - 1, y, z - 2) + v (t, x + 1, y, z - 2)) * ti0 (x, + y, z - 2) * ti1 (x, y, + z - 2) + (-2.5e-2 * v (t, x, y, + z) - 7.5e-2 * v (t, + x, + y, + z - 2) + 1.0e-1 * v (t, x, y, + z - 1)) * ti2 (x, y, + z - 2) + + (-7.5e-2 * v (t, x, y, z - 2) + 1.0e-1 * v (t, x, y + 1, + z - 2) - 2.5e-2 * v (t, x, y + 2, z - 2)) * ti0 (x, y, + z - 2) * ti3 (x, y, + z - 2)) * ti2 (x, y, z - 2); + auto temp87 = + (2.5e-2 * ((-v (t, x - 2, y, z - 1) + v (t, x - 2, y, + z + 1)) * ti2 (x - 2, y, + z) + (-v (t, + x - 2, + y - 1, + z) + v (t, + x - 2, + y + 1, + z)) * ti0 (x - 2, y, z) * ti3 (x - 2, y, z)) + (-2.5e-2 * v (t, + x, y, + z) - 7.5e-2 * v (t, x - 2, y, + z) + 1.0e-1 * v (t, x - 1, y, + z)) * ti0 (x - 2, + y, + z) * ti1 (x - 2, y, z)) * ti0 (x - 2, y, + z) * ti1 (x - 2, y, z); + auto temp47 = + 2.5e-2 * (-v (t, x - 1, y, z) + v (t, x + 1, y, z)) * ti0 (x, y, + z) * ti1 (x, + y, + z) + (-7.5e-2 * v (t, x, y, z) + 1.0e-1 * v (t, x, y, + z + 1) - 2.5e-2 * v (t, x, y, z + 2)) * ti2 (x, y, + z) + (-7.5e-2 * v (t, x, + y, + z) + 1.0e-1 * v (t, x, + y + 1, + z) - 2.5e-2 * v (t, x, + y + 2, + z)) * ti0 (x, y, z) * ti3 (x, y, z); + auto temp110 = + (2.5e-2 * ((-v (t, x, y, z) + v (t, x, y + 2, z)) * ti0 (x, y + 1, + z) * ti3 (x, + y + 1, + z) + (-v (t, x, y + 1, z - 1) 
+ v (t, x, y + 1, z + 1)) * ti2 (x, + y + 1, + z)) + (-7.5e-2 * v (t, x, y + 1, z) + 1.0e-1 * v (t, x + 1, y + 1, + z) - 2.5e-2 * v (t, x + 2, y + 1, z)) * ti0 (x, y + 1, + z) * ti1 (x, y + 1, + z)) * ti0 (x, y + 1, z) * ti3 (x, y + 1, z); + auto temp140 = + (2.5e-2 * (-v (t, x, y, z) + v (t, x + 2, y, z)) * ti0 (x + 1, y, + z) * ti1 (x + 1, y, + z) + (-7.5e-2 * v (t, x + 1, y, z) + 1.0e-1 * v (t, x + 1, y, + z + 1) - 2.5e-2 * v (t, x + 1, y, + z + 2)) * ti2 (x + 1, y, + z) + + (-7.5e-2 * v (t, x + 1, y, z) + 1.0e-1 * v (t, x + 1, y + 1, + z) - 2.5e-2 * v (t, x + 1, y + 2, z)) * ti0 (x + 1, y, + z) * ti3 (x + 1, y, + z)) * ti0 (x + 1, y, + z) * ti1 (x + 1, y, + z); + auto temp149 = + (2.5e-2 * (-v (t, x - 1, y - 2, z) + v (t, x + 1, y - 2, z)) * ti0 (x, + y - 2, z) * ti1 (x, y - 2, + z) + (-2.5e-2 * v (t, x, y, + z) - 7.5e-2 * v (t, x, + y - 2, + z) + 1.0e-1 * v (t, x, y - 1, z)) * ti0 (x, + y - 2, + z) * ti3 (x, y - 2, + z) + (-7.5e-2 * v (t, x, y - 2, z) + 1.0e-1 * v (t, x, y - 2, + z + 1) - 2.5e-2 * v (t, x, y - 2, z + 2)) * ti2 (x, y - 2, + z)) * ti0 (x, + y - 2, + z) * ti3 (x, y - 2, z); + auto temp39 = + (2.5e-2 * ((-v (t, x, y, z - 1) + v (t, x, y, z + 1)) * ti2 (x, y, + z) + (-v (t, x, + y - 1, + z) + v (t, + x, + y + 1, + z)) * ti0 (x, y, z) * ti3 (x, y, z)) + (-7.5e-2 * v (t, x, y, + z) + 1.0e-1 * v (t, + x + 1, + y, + z) - 2.5e-2 * v (t, x + 2, + y, + z)) * ti0 (x, y, + z) * ti1 (x, y, z)) * ti0 (x, y, z) * ti1 (x, y, z); + auto temp132 = + (2.5e-2 * (-v (t, x - 1, y, z - 1) + v (t, x + 1, y, z - 1)) * ti0 (x, + y, z - 1) * ti1 (x, y, + z - 1) + (1.0e-1 * v (t, x, y, + z) - 7.5e-2 * v (t, x, + y, + z - 1) - 2.5e-2 * v (t, x, y, + z + 1)) * ti2 (x, y, + z - 1) + + (-7.5e-2 * v (t, x, y, z - 1) + 1.0e-1 * v (t, x, y + 1, + z - 1) - 2.5e-2 * v (t, x, y + 2, z - 1)) * ti0 (x, y, + z - 1) * ti3 (x, y, + z - 1)) * ti2 (x, y, z - 1); + auto temp6 = + 1.0 / (8.85879567828298e-1 * damp (x, y, z) + 2.0 * m (x, y, z)); + auto temp141 = + (2.5e-2 * (v (t, x, y, z) - 
v (t, x - 2, y, z)) * ti0 (x - 1, y, + z) * ti1 (x - 1, y, + z) + (-7.5e-2 * v (t, x - 1, y, z) + 1.0e-1 * v (t, x - 1, y, + z + 1) - 2.5e-2 * v (t, x - 1, y, + z + 2)) * ti2 (x - 1, + y, + z) + + (-7.5e-2 * v (t, x - 1, y, z) + 1.0e-1 * v (t, x - 1, y + 1, + z) - 2.5e-2 * v (t, x - 1, y + 2, z)) * ti0 (x - 1, y, + z) * ti3 (x - 1, y, + z)) * ti0 (x - 1, y, + z) * ti1 (x - 1, y, + z); auto temp10 = 8.85879567828298e-1 * damp (x, y, z) - 2.0 * m (x, y, z); - auto temp215 = 3.75e-2 * (-temp154 * ti1 (x, y, z) + temp158 * ti3 (x, y, z)) * ti3 (x, - y, - z) + (temp197 * ti3 (x, y, - z) + (3.75e-2 * (temp154 * ti3 (x, y, - z) + temp158 * ti1 (x, - y, - z)) * ti2 (x, y, - z) - 9.375e-4 * (-u (t, x, y, z - 1) + u (t, x, y, - z + 1)) * ti0 (x, - y, - z)) * ti1 (x, y, - z)) * ti2 (x, - y, - z) - + (1.25e-2 * (temp182 * ti3 (x - 2, y, z) + temp183 * ti1 (x - 2, y, z)) * ti2 (x - 2, - y, - z) - 3.125e-4 * (-u (t, x - 2, y, z - 1) + u (t, x - 2, y, z + 1)) * ti0 (x - 2, - y, - z)) * ti1 (x - 2, - y, - z) * ti2 (x - 2, y, - z) + - 5.0e-2 * ((temp191 * ti3 (x, y - 1, z) - temp192 * ti1 (x, y - 1, z)) * ti1 (x, y - 1, - z) + (2.5e-2 * (-u (t, x - 1, y, z - 1) + u (t, x + 1, y, z - 1)) * ti1 (x, y, - z - 1) * ti2 (x, - y, - z - 1) - (1.0e-1 * u (t, x, y, z) - 7.5e-2 * u (t, x, y, z - 1) - 2.5e-2 * u (t, x, - y, - z + - 1)) * - ti0 (x, y, - z - 1) + (-7.5e-2 * u (t, x, - y, - z - 1) + 1.0e-1 * u (t, x, - y + 1, - z - 1) - 2.5e-2 * u (t, x, - y + 2, - z - 1)) * ti2 (x, y, - z - 1) * ti3 (x, y, z - 1)) * ti0 (x, y, - z - 1)) + - (1.25e-2 * temp168 * ti1 (x, y + 1, z) * ti2 (x, y + 1, z) + 3.125e-4 * ((-u (t, x, y, z) + u (t, x, y + 2, z)) * ti2 (x, y + 1, z) * ti3 (x, y + 1, - z) - (-u (t, x, y + 1, - z - 1) + u (t, x, - y + 1, - z + 1)) * ti0 (x, y + 1, z))) * ti2 (x, y + 1, z) * ti3 (x, y + 1, - z) + - (1.25e-2 * (temp171 * ti2 (x + 1, y, z) * ti3 (x + 1, y, z) - (-7.5e-2 * u (t, x + 1, y, z) + 1.0e-1 * u (t, x + 1, y, z + 1) - 2.5e-2 * u (t, x + 1, y, z + 2)) * ti0 (x + 1, y, - 
z)) + 3.125e-4 * (-u (t, x, - y, - z) + u (t, - x + 2, - y, - z)) * ti1 (x + 1, y, z) * ti2 (x + 1, y, - z)) * ti1 (x + 1, - y, - z) * - ti2 (x + 1, y, - z) + 1.25e-2 * ((-temp171 * ti1 (x + 1, y, z) + 2.5e-2 * (-u (t, x, y, z) + u (t, x + 2, y, z)) * ti3 (x + 1, y, - z)) * ti3 (x + 1, - y, - z) + (-temp173 * ti1 (x, y - 1, z) + temp176 * ti3 (x, y - 1, z)) * ti1 (x, - y - 1, - z) + (-temp182 * ti1 (x - 2, y, - z) + temp183 * ti3 (x - 2, y, - z)) * ti3 (x - - 2, - y, - z) - + ((temp184 * ti1 (x, y - 2, z) + temp186 * ti3 (x, y - 2, z)) * ti2 (x, - y - 2, - z) - (-7.5e-2 * u (t, - x, - y - 2, - z) + 1.0e-1 * u (t, - x, - y - 2, - z + 1) - 2.5e-2 * u (t, - x, - y - 2, - z + 2)) * ti0 (x, y - 2, z)) * ti2 (x, y - 2, z) * ti3 (x, y - 2, - z) + - (2.5e-2 * (-(u (t, x, y, z) - u (t, x, y, z - 2)) * ti0 (x, y, z - 1) + (-u (t, x, y - 1, z - 1) + u (t, x, y + 1, z - 1)) * ti2 (x, y, - z - 1) * ti3 (x, y, z - 1)) + (-7.5e-2 * u (t, x, y, z - 1) + 1.0e-1 * u (t, - x + 1, - y, - z - - - 1) - - 2.5e-2 * u (t, x + 2, y, z - 1)) * ti1 (x, - y, - z - - 1) * - ti2 (x, y, z - 1)) * ti0 (x, y, z - 1)) - (temp197 * ti0 (x, y, - z) + 3.75e-2 * (temp161 * ti3 (x, y, - z) - temp163 * ti1 (x, - y, - z)) * ti1 (x, y, - z) + 1.25e-2 * (temp168 * ti3 (x, y + 1, - z) - 2.5e-2 * (-u (t, x, y, - z) + u (t, - x, - y + 2, - z)) * ti1 (x, y + 1, - z)) * ti1 (x, y + 1, - z) + - 1.25e-2 * (temp178 * ti3 (x - 1, y, - z) - temp181 * ti1 (x - 1, y, - z)) * ti3 (x - 1, y, - z) + 1.25e-2 * (temp184 * ti3 (x, y - 2, - z) - temp186 * ti1 (x, - y - 2, - z)) * ti1 (x, y - 2, - z) + 5.0e-2 * (-temp188 * ti1 (x - 1, y, - z) + temp190 * ti3 (x - 1, - y, - z)) * - ti3 (x - 1, y, - z) + (1.25e-2 * (temp173 * ti3 (x, y - 1, - z) + temp176 * ti1 (x, y - 1, - z)) * ti2 (x, y - 1, - z) - 3.125e-4 * (-u (t, x, y - 1, z - 1) + u (t, x, y - 1, - z + 1)) * ti0 (x, - y - 1, - z)) * ti2 (x, y - 1, z) * ti3 (x, - y - 1, - z) + - 1.25e-2 * ((temp178 * ti1 (x - 1, y, - z) + temp181 * ti3 (x - 1, y, - z)) * ti2 (x - 1, y, - 
z) - (-7.5e-2 * u (t, x - 1, y, - z) + 1.0e-1 * u (t, - x - 1, - y, - z + 1) - 2.5e-2 * u (t, x - 1, y, - z + 2)) * ti0 (x - 1, - y, - z)) * ti1 (x - 1, - y, - z) * ti2 (x - 1, y, - z) + - (5.0e-2 * (temp188 * ti3 (x - 1, y, - z) + temp190 * ti1 (x - 1, y, - z)) * ti2 (x - 1, y, - z) - 1.25e-3 * (-u (t, x - 1, y, z - 1) + u (t, x - 1, y, - z + 1)) * ti0 (x - 1, - y, - z)) * ti1 (x - 1, y, - z) * ti2 (x - 1, y, - z) + 5.0e-2 * ((temp191 * ti1 (x, y - 1, - z) + - temp192 * ti3 (x, y - 1, - z)) * ti2 (x, - y - - 1, - z) - - (-7.5e-2 * u (t, x, y - 1, - z) + - 1.0e-1 * u (t, - x, - y - 1, - z + - 1) - - 2.5e-2 * u (t, x, y - 1, - z + - 2)) * ti0 (x, - y - - 1, - z)) - * ti2 (x, y - 1, - z) * ti3 (x, y - 1, - z) + 1.25e-2 * (2.5e-2 * (-(-u (t, x, y, - z) + u (t, - x, - y, - z + 2)) * ti0 (x, y, - z + 1) + (-u (t, x, y - 1, - z + 1) + u (t, - x, - y + 1, - z + 1)) * ti2 (x, y, - z + 1) * ti3 (x, y, - z + 1)) + (-7.5e-2 * u (t, x, y, - z + 1) + 1.0e-1 * u (t, - x + 1, - y, - z + - 1) - - 2.5e-2 * u (t, - x + 2, - y, - z + 1)) * ti1 (x, y, - z + - 1) * ti2 (x, y, - z + - 1)) * - ti0 (x, y, - z + 1) + 1.25e-2 * (2.5e-2 * (-u (t, x - 1, y, - z - 2) + u (t, x + 1, - y, - z - 2)) * ti1 (x, - y, - z - 2) * ti2 (x, y, - z - 2) - (-2.5e-2 * u (t, x, y, - z) - 7.5e-2 * u (t, - x, - y, - z - 2) + 1.0e-1 * u (t, - x, - y, - z - 1)) * ti0 (x, y, - z - 2) + (-7.5e-2 * u (t, - x, - y, - z - - 2) + - 1.0e-1 * u (t, - x, - y + - 1, - z - - 2) - - 2.5e-2 * u (t, - x, - y + - 2, - z - - 2)) * - ti2 (x, y, - z - 2) * ti3 (x, y, - z - 2)) * ti0 (x, y, z - 2)); - auto temp77 = (2.5e-2 * ((v (t, x, y, z) - v (t, x, y, z - 2)) * ti2 (x, y, - z - 1) + (-v (t, - x, - y - 1, - z - 1) + v (t, - x, - y + 1, - z - 1)) * ti0 (x, y, z - 1) * ti3 (x, y, z - 1)) + (-7.5e-2 * v (t, - x, - y, - z - 1) + 1.0e-1 * v (t, - x + 1, y, - z - 1) - 2.5e-2 * v (t, - x + 2, - y, - z - 1)) * ti0 (x, - y, - z - 1) * ti1 (x, y, - z - 1)) * ti2 (x, - y, - z - - 1); - auto temp61 = (2.5e-2 * ((-v (t, x, y, z) + v (t, x, 
y, z + 2)) * ti2 (x, y, - z + 1) + (-v (t, - x, - y - 1, - z + 1) + v (t, - x, - y + 1, - z + 1)) * ti0 (x, y, z + 1) * ti3 (x, y, z + 1)) + (-7.5e-2 * v (t, - x, - y, - z + 1) + 1.0e-1 * v (t, - x + 1, y, - z + 1) - 2.5e-2 * v (t, - x + 2, - y, - z + 1)) * ti0 (x, - y, - z + 1) * ti1 (x, y, - z + 1)) * ti2 (x, - y, - z - + - 1); - auto temp150 = (2.5e-2 * (-v (t, x - 1, y - 1, z) + v (t, x + 1, y - 1, z)) * ti0 (x, y - 1, z) * ti1 (x, y - 1, - z) + (1.0e-1 * v (t, x, y, - z) - 7.5e-2 * v (t, x, - y - 1, - z) - 2.5e-2 * v (t, x, y + 1, z)) * ti0 (x, - y - 1, - z) * ti3 (x, y - 1, - z) + (-7.5e-2 * v (t, x, y - 1, z) + 1.0e-1 * v (t, x, y - 1, - z + 1) - - 2.5e-2 * v (t, x, y - 1, z + 2)) * ti2 (x, y - 1, - z)) * ti0 (x, - y - - 1, - z) * - ti3 (x, y - 1, z); + auto temp215 = + 3.75e-2 * (-temp154 * ti1 (x, y, z) + temp158 * ti3 (x, y, + z)) * ti3 (x, + y, + z) + (temp197 * ti3 (x, y, + z) + (3.75e-2 * (temp154 * ti3 (x, y, + z) + temp158 * ti1 (x, + y, + z)) * ti2 (x, y, + z) - 9.375e-4 * (-u (t, x, y, z - 1) + u (t, x, y, + z + 1)) * ti0 (x, + y, + z)) * ti1 (x, y, + z)) * ti2 (x, + y, + z) + + (1.25e-2 * (temp182 * ti3 (x - 2, y, z) + temp183 * ti1 (x - 2, y, + z)) * ti2 (x - 2, + y, + z) - 3.125e-4 * (-u (t, x - 2, y, z - 1) + u (t, x - 2, y, + z + 1)) * ti0 (x - 2, + y, + z)) * ti1 (x - 2, + y, + z) * ti2 (x - 2, y, + z) + + 5.0e-2 * ((temp191 * ti3 (x, y - 1, z) - temp192 * ti1 (x, y - 1, + z)) * ti1 (x, y - 1, + z) + (2.5e-2 * (-u (t, x - 1, y, z - 1) + u (t, x + 1, y, + z - 1)) * ti1 (x, y, + z - 1) * ti2 (x, + y, + z - 1) - (1.0e-1 * u (t, x, y, z) - 7.5e-2 * u (t, x, y, + z - 1) - 2.5e-2 * u (t, x, + y, + z + 1)) * ti0 (x, y, + z - 1) + (-7.5e-2 * u (t, x, + y, + z - 1) + 1.0e-1 * u (t, x, + y + 1, + z - 1) - 2.5e-2 * u (t, x, + y + 2, + z - 1)) * ti2 (x, y, + z - 1) * ti3 (x, y, z - 1)) * ti0 (x, y, + z - 1)) + + (1.25e-2 * temp168 * ti1 (x, y + 1, z) * ti2 (x, y + 1, + z) + 3.125e-4 * ((-u (t, x, y, z) + u (t, x, y + 2, z)) * ti2 (x, + y + 1, z) * ti3 
(x, y + 1, + z) - (-u (t, x, y + 1, + z - 1) + u (t, x, + y + 1, + z + 1)) * ti0 (x, y + 1, z))) * ti2 (x, y + 1, z) * ti3 (x, + y + 1, + z) + (1.25e-2 * (temp171 * ti2 (x + 1, y, z) * ti3 (x + 1, y, + z) - (-7.5e-2 * u (t, x + 1, y, z) + 1.0e-1 * u (t, x + 1, y, + z + 1) - 2.5e-2 * u (t, x + 1, y, z + 2)) * ti0 (x + 1, y, + z)) + 3.125e-4 * (-u (t, x, + y, + z) + u (t, + x + 2, + y, + z)) * ti1 (x + 1, y, z) * ti2 (x + 1, y, + z)) * ti1 (x + 1, + y, + z) * ti2 (x + 1, y, + z) + 1.25e-2 * ((-temp171 * ti1 (x + 1, y, z) + 2.5e-2 * (-u (t, x, y, + z) + u (t, x + 2, y, z)) * ti3 (x + 1, y, + z)) * ti3 (x + 1, + y, + z) + (-temp173 * ti1 (x, y - 1, z) + temp176 * ti3 (x, y - 1, + z)) * ti1 (x, + y - 1, + z) + (-temp182 * ti1 (x - 2, y, + z) + temp183 * ti3 (x - 2, y, + z)) * ti3 (x - 2, + y, + z) + + ((temp184 * ti1 (x, y - 2, z) + temp186 * ti3 (x, y - 2, + z)) * ti2 (x, + y - 2, + z) - (-7.5e-2 * u (t, + x, + y - 2, + z) + 1.0e-1 * u (t, + x, + y - 2, + z + 1) - 2.5e-2 * u (t, + x, + y - 2, + z + 2)) * ti0 (x, y - 2, z)) * ti2 (x, y - 2, z) * ti3 (x, + y - 2, + z) + (2.5e-2 * (-(u (t, x, y, z) - u (t, x, y, z - 2)) * ti0 (x, y, + z - 1) + (-u (t, x, y - 1, z - 1) + u (t, x, y + 1, + z - 1)) * ti2 (x, y, + z - 1) * ti3 (x, y, z - 1)) + (-7.5e-2 * u (t, x, y, + z - 1) + 1.0e-1 * u (t, + x + 1, + y, + z - 1) - 2.5e-2 * u (t, x + 2, y, z - 1)) * ti1 (x, + y, + z - + 1) * + ti2 (x, y, z - 1)) * ti0 (x, y, z - 1)) - (temp197 * ti0 (x, y, + z) + 3.75e-2 * (temp161 * ti3 (x, y, + z) - temp163 * ti1 (x, + y, + z)) * ti1 (x, y, + z) + 1.25e-2 * (temp168 * ti3 (x, y + 1, + z) - 2.5e-2 * (-u (t, x, y, + z) + u (t, + x, + y + 2, + z)) * ti1 (x, y + 1, + z)) * ti1 (x, y + 1, + z) + 1.25e-2 * (temp178 * ti3 (x - 1, y, + z) - temp181 * ti1 (x - 1, y, + z)) * ti3 (x - 1, y, + z) + 1.25e-2 * (temp184 * ti3 (x, y - 2, + z) - temp186 * ti1 (x, + y - 2, + z)) * ti1 (x, y - 2, + z) + 5.0e-2 * (-temp188 * ti1 (x - 1, y, + z) + temp190 * ti3 (x - 1, + y, + z)) * ti3 (x - 1, y, + z) + 
(1.25e-2 * (temp173 * ti3 (x, y - 1, + z) + temp176 * ti1 (x, y - 1, + z)) * ti2 (x, y - 1, + z) - 3.125e-4 * (-u (t, x, y - 1, z - 1) + u (t, x, y - 1, + z + 1)) * ti0 (x, + y - 1, + z)) * ti2 (x, y - 1, z) * ti3 (x, + y - 1, + z) + 1.25e-2 * ((temp178 * ti1 (x - 1, y, + z) + temp181 * ti3 (x - 1, y, + z)) * ti2 (x - 1, y, + z) - (-7.5e-2 * u (t, x - 1, y, + z) + 1.0e-1 * u (t, + x - 1, + y, + z + 1) - 2.5e-2 * u (t, x - 1, y, + z + 2)) * ti0 (x - 1, + y, + z)) * ti1 (x - 1, + y, + z) * ti2 (x - 1, y, + z) + (5.0e-2 * (temp188 * ti3 (x - 1, y, + z) + temp190 * ti1 (x - 1, y, + z)) * ti2 (x - 1, y, + z) - 1.25e-3 * (-u (t, x - 1, y, z - 1) + u (t, x - 1, y, + z + 1)) * ti0 (x - 1, + y, + z)) * ti1 (x - 1, y, + z) * ti2 (x - 1, y, + z) + 5.0e-2 * ((temp191 * ti1 (x, y - 1, + z) + temp192 * ti3 (x, y - 1, + z)) * ti2 (x, + y - 1, + z) - (-7.5e-2 * u (t, x, y - 1, + z) + 1.0e-1 * u (t, + x, + y - 1, + z + 1) - 2.5e-2 * u (t, x, y - 1, + z + 2)) * ti0 (x, + y - 1, + z)) * ti2 (x, y - 1, + z) * ti3 (x, y - 1, + z) + 1.25e-2 * (2.5e-2 * (-(-u (t, x, y, + z) + u (t, + x, + y, + z + 2)) * ti0 (x, y, + z + 1) + (-u (t, x, y - 1, + z + 1) + u (t, + x, + y + 1, + z + 1)) * ti2 (x, y, + z + 1) * ti3 (x, y, + z + 1)) + (-7.5e-2 * u (t, x, y, + z + 1) + 1.0e-1 * u (t, + x + 1, + y, + z + 1) - 2.5e-2 * u (t, + x + 2, + y, + z + 1)) * ti1 (x, y, + z + 1) * ti2 (x, y, + z + 1)) * ti0 (x, y, + z + 1) + 1.25e-2 * (2.5e-2 * (-u (t, x - 1, y, + z - 2) + u (t, x + 1, + y, + z - 2)) * ti1 (x, + y, + z - 2) * ti2 (x, y, + z - 2) - (-2.5e-2 * u (t, x, y, + z) - 7.5e-2 * u (t, + x, + y, + z - 2) + 1.0e-1 * u (t, + x, + y, + z - 1)) * ti0 (x, y, + z - 2) + (-7.5e-2 * u (t, + x, + y, + z - 2) + 1.0e-1 * u (t, + x, + y + 1, + z - 2) - 2.5e-2 * u (t, + x, + y + 2, + z - 2)) * ti2 (x, y, + z - 2) * ti3 (x, y, + z - 2)) * ti0 (x, y, z - 2)); + auto temp77 = + (2.5e-2 * ((v (t, x, y, z) - v (t, x, y, z - 2)) * ti2 (x, y, + z - 1) + (-v (t, + x, + y - 1, + z - 1) + v (t, + x, + y + 1, + z - 1)) * 
ti0 (x, y, z - 1) * ti3 (x, y, + z - 1)) + (-7.5e-2 * v (t, + x, + y, + z - 1) + 1.0e-1 * v (t, + x + 1, y, + z - 1) - 2.5e-2 * v (t, + x + 2, + y, + z - 1)) * ti0 (x, + y, + z - 1) * ti1 (x, y, + z - 1)) * ti2 (x, + y, + z - 1); + auto temp61 = + (2.5e-2 * ((-v (t, x, y, z) + v (t, x, y, z + 2)) * ti2 (x, y, + z + 1) + (-v (t, + x, + y - 1, + z + 1) + v (t, + x, + y + 1, + z + 1)) * ti0 (x, y, z + 1) * ti3 (x, y, + z + 1)) + (-7.5e-2 * v (t, + x, + y, + z + 1) + 1.0e-1 * v (t, + x + 1, y, + z + 1) - 2.5e-2 * v (t, + x + 2, + y, + z + 1)) * ti0 (x, + y, + z + 1) * ti1 (x, y, + z + 1)) * ti2 (x, + y, + z + 1); + auto temp150 = + (2.5e-2 * (-v (t, x - 1, y - 1, z) + v (t, x + 1, y - 1, z)) * ti0 (x, + y - 1, z) * ti1 (x, y - 1, + z) + (1.0e-1 * v (t, x, y, + z) - 7.5e-2 * v (t, x, + y - 1, + z) - 2.5e-2 * v (t, x, y + 1, z)) * ti0 (x, + y - 1, + z) * ti3 (x, y - 1, + z) + (-7.5e-2 * v (t, x, y - 1, z) + 1.0e-1 * v (t, x, y - 1, + z + 1) - 2.5e-2 * v (t, x, y - 1, z + 2)) * ti2 (x, y - 1, + z)) * ti0 (x, + y - 1, + z) * ti3 (x, y - 1, z); // Next time-step values. 
u (t + 1, x, y, - z) EQUALS temp6 *(temp10 * u (t - 1, x, y, - z) + - 1.5695652173913 * (temp215 * epsilon (x, y, z) + - (3.75e-2 * - (temp39 + - temp47 * (ti0 (x, y, z) * - ti3 (x, y, - z) + ti2 (x, y, - z))) - - 5.0e-2 * (temp102 + temp132 + temp150) + 1.25e-2 * (temp110 - temp123 + temp131 + temp140 - temp141 + temp149 + temp61 - temp77 + temp87)) * delta (x, y, z)) + 4.0 * m (x, y, z) * u (t, x, y, z)); - v (t + 1, x, y, - z) EQUALS temp6 *(temp10 * v (t - 1, x, y, - z) + 1.5695652173913 * temp215 * delta (x, - y, - z) - + 5.88586956521739e-2 * (temp39 + - temp47 * (ti0 (x, y, z) * - ti3 (x, y, - z) + ti2 (x, - y, - z))) - - 7.84782608695652e-2 * (temp102 + temp132 + temp150) + 1.96195652173913e-2 * (temp110 - temp123 + temp131 + temp140 - temp141 + temp149 + temp61 - temp77 + temp87) + 4.0 * m (x, y, z) * v (t, x, y, z)); + z) EQUALS temp6 *(temp10 * u (t - 1, x, y, + z) + + 1.5695652173913 * (temp215 * epsilon (x, y, z) + + (3.75e-2 * + (temp39 + + temp47 * (ti0 (x, y, z) * + ti3 (x, y, + z) + ti2 (x, y, + z))) - + 5.0e-2 * (temp102 + temp132 + temp150) + 1.25e-2 * (temp110 - + temp123 + temp131 + temp140 - temp141 + temp149 + temp61 - + temp77 + temp87)) * delta (x, y, z)) + 4.0 * m (x, y, z) * u (t, + x, y, z)); + v (t + 1, x, y, z) EQUALS temp6 *(temp10 * v (t - 1, x, y, + z) + 1.5695652173913 * temp215 * delta (x, y, + z) + 5.88586956521739e-2 * (temp39 + temp47 * (ti0 (x, y, + z) * ti3 (x, y, z) + ti2 (x, y, + z))) - 7.84782608695652e-2 * (temp102 + temp132 + temp150) + + 1.96195652173913e-2 * (temp110 - temp123 + temp131 + temp140 - + temp141 + temp149 + temp61 - temp77 + temp87) + 4.0 * m (x, y, + z) * v (t, x, y, z)); } @@ -599,1369 +620,1309 @@ namespace virtual void define_so8 () { - auto temp330 = 4.16666666666667e-3 * (u (t, x - 2, y, z) - u (t, x + 2, y, z)) + 3.33333333333333e-2 * (-u (t, x - 1, y, z) + u (t, x + 1, y, z)); - auto temp333 = -4.16666666666667e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x, y - 1, - z) + 7.5e-2 * u (t, x, y + 1, z) - 
2.5e-2 * u (t, x, y + 2, - z) + 4.16666666666667e-3 * u (t, x, y + 3, z); - auto temp336 = 3.33333333333333e-2 * (-u (t, x, y, z) + u (t, x, y + 2, z)) + 4.16666666666667e-3 * (u (t, x, y - 1, z) - u (t, x, y + 3, z)); - auto temp395 = 4.16666666666667e-3 * (u (t, x - 2, y - 2, z) - u (t, x + 2, y - 2, - z)) + 3.33333333333333e-2 * (-u (t, x - 1, - y - 2, - z) + u (t, - x + 1, - y - 2, - z)); - auto temp365 = -4.16666666666667e-2 * u (t, x - 2, y, z) - 1.25e-2 * u (t, x - 2, - y - 1, - z) + 7.5e-2 * u (t, x - 2, y + 1, z) - 2.5e-2 * u (t, x - 2, y + 2, - z) + 4.16666666666667e-3 * u (t, x - 2, y + 3, z); - auto temp375 = -4.16666666666667e-2 * u (t, x + 2, y, z) - 1.25e-2 * u (t, x + 2, - y - 1, - z) + 7.5e-2 * u (t, x + 2, y + 1, z) - 2.5e-2 * u (t, x + 2, y + 2, - z) + 4.16666666666667e-3 * u (t, x + 2, y + 3, z); + auto temp330 = + 4.16666666666667e-3 * (u (t, x - 2, y, z) - u (t, x + 2, y, + z)) + 3.33333333333333e-2 * (-u (t, x - 1, y, z) + u (t, x + 1, y, + z)); + auto temp333 = + -4.16666666666667e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x, y - 1, + z) + 7.5e-2 * u (t, x, y + 1, z) - 2.5e-2 * u (t, x, y + 2, + z) + 4.16666666666667e-3 * u (t, x, y + 3, z); + auto temp336 = + 3.33333333333333e-2 * (-u (t, x, y, z) + u (t, x, y + 2, + z)) + 4.16666666666667e-3 * (u (t, x, y - 1, z) - u (t, x, y + 3, + z)); + auto temp395 = + 4.16666666666667e-3 * (u (t, x - 2, y - 2, z) - u (t, x + 2, y - 2, + z)) + 3.33333333333333e-2 * (-u (t, x - 1, + y - 2, + z) + u (t, + x + 1, + y - 2, + z)); + auto temp365 = + -4.16666666666667e-2 * u (t, x - 2, y, z) - 1.25e-2 * u (t, x - 2, + y - 1, + z) + 7.5e-2 * u (t, x - 2, y + 1, z) - 2.5e-2 * u (t, x - 2, y + 2, + z) + 4.16666666666667e-3 * u (t, x - 2, y + 3, z); + auto temp375 = + -4.16666666666667e-2 * u (t, x + 2, y, z) - 1.25e-2 * u (t, x + 2, + y - 1, + z) + 7.5e-2 * u (t, x + 2, y + 1, z) - 2.5e-2 * u (t, x + 2, y + 2, + z) + 4.16666666666667e-3 * u (t, x + 2, y + 3, z); auto temp394 = -2.5e-2 * u (t, x, y, z) - 1.25e-2 * u 
(t, x - 3, y, - z) - 4.16666666666667e-2 * u (t, x - 2, y, z) + 7.5e-2 * u (t, x - 1, y, - z) + 4.16666666666667e-3 * u (t, x + 1, y, z); - auto temp407 = 4.16666666666667e-3 * (u (t, x - 1, y - 2, z) - u (t, x - 1, y + 2, - z)) + 3.33333333333333e-2 * (-u (t, x - 1, - y - 1, - z) + u (t, - x - 1, - y + 1, - z)); - auto temp352 = 4.16666666666667e-3 * (-u (t, x, y, z) + u (t, x, y - 4, z)) + 3.33333333333333e-2 * (-u (t, x, y - 3, z) + u (t, x, y - 1, z)); - auto temp347 = -4.16666666666667e-2 * u (t, x + 1, y, z) - 1.25e-2 * u (t, x + 1, - y - 1, - z) + 7.5e-2 * u (t, x + 1, y + 1, z) - 2.5e-2 * u (t, x + 1, y + 2, - z) + 4.16666666666667e-3 * u (t, x + 1, y + 3, z); - auto temp409 = 4.16666666666667e-3 * (u (t, x + 1, y - 2, z) - u (t, x + 1, y + 2, - z)) + 3.33333333333333e-2 * (-u (t, x + 1, - y - 1, - z) + u (t, - x + 1, - y + 1, - z)); - auto temp318 = 4.16666666666667e-3 * (u (t, x, y - 2, z) - u (t, x, y + 2, z)) + 3.33333333333333e-2 * (-u (t, x, y - 1, z) + u (t, x, y + 1, z)); - auto temp381 = -4.16666666666667e-2 * u (t, x, y - 1, z) - 1.25e-2 * u (t, x - 1, - y - 1, - z) + 7.5e-2 * u (t, x + 1, y - 1, z) - 2.5e-2 * u (t, x + 2, y - 1, - z) + 4.16666666666667e-3 * u (t, x + 3, y - 1, z); + z) - 4.16666666666667e-2 * u (t, x - 2, y, z) + 7.5e-2 * u (t, x - 1, + y, + z) + 4.16666666666667e-3 * u (t, x + 1, y, z); + auto temp407 = + 4.16666666666667e-3 * (u (t, x - 1, y - 2, z) - u (t, x - 1, y + 2, + z)) + 3.33333333333333e-2 * (-u (t, x - 1, + y - 1, + z) + u (t, + x - 1, + y + 1, + z)); + auto temp352 = + 4.16666666666667e-3 * (-u (t, x, y, z) + u (t, x, y - 4, + z)) + 3.33333333333333e-2 * (-u (t, x, y - 3, z) + u (t, x, y - 1, + z)); + auto temp347 = + -4.16666666666667e-2 * u (t, x + 1, y, z) - 1.25e-2 * u (t, x + 1, + y - 1, + z) + 7.5e-2 * u (t, x + 1, y + 1, z) - 2.5e-2 * u (t, x + 1, y + 2, + z) + 4.16666666666667e-3 * u (t, x + 1, y + 3, z); + auto temp409 = + 4.16666666666667e-3 * (u (t, x + 1, y - 2, z) - u (t, x + 1, y + 2, + z)) + 
3.33333333333333e-2 * (-u (t, x + 1, + y - 1, + z) + u (t, + x + 1, + y + 1, + z)); + auto temp318 = + 4.16666666666667e-3 * (u (t, x, y - 2, z) - u (t, x, y + 2, + z)) + 3.33333333333333e-2 * (-u (t, x, y - 1, z) + u (t, x, y + 1, + z)); + auto temp381 = + -4.16666666666667e-2 * u (t, x, y - 1, z) - 1.25e-2 * u (t, x - 1, + y - 1, + z) + 7.5e-2 * u (t, x + 1, y - 1, z) - 2.5e-2 * u (t, x + 2, y - 1, + z) + 4.16666666666667e-3 * u (t, x + 3, y - 1, z); auto temp408 = 7.5e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x - 2, y, - z) - 4.16666666666667e-2 * u (t, x - 1, y, z) - 2.5e-2 * u (t, x + 1, y, - z) + 4.16666666666667e-3 * u (t, x + 2, y, z); - auto temp390 = 4.16666666666667e-3 * u (t, x, y, z) - 1.25e-2 * u (t, x, y - 4, - z) - 4.16666666666667e-2 * u (t, x, y - 3, z) + 7.5e-2 * u (t, x, y - 2, - z) - 2.5e-2 * u (t, x, y - 1, z); - auto temp360 = 4.16666666666667e-3 * (-u (t, x, y, z) + u (t, x - 4, y, z)) + 3.33333333333333e-2 * (-u (t, x - 3, y, z) + u (t, x - 1, y, z)); - auto temp370 = -4.16666666666667e-2 * u (t, x, y + 2, z) - 1.25e-2 * u (t, x - 1, - y + 2, - z) + 7.5e-2 * u (t, x + 1, y + 2, z) - 2.5e-2 * u (t, x + 2, y + 2, - z) + 4.16666666666667e-3 * u (t, x + 3, y + 2, z); - auto temp357 = -4.16666666666667e-2 * u (t, x, y - 2, z) - 1.25e-2 * u (t, x - 1, - y - 2, - z) + 7.5e-2 * u (t, x + 1, y - 2, z) - 2.5e-2 * u (t, x + 2, y - 2, - z) + 4.16666666666667e-3 * u (t, x + 3, y - 2, z); - auto temp385 = -4.16666666666667e-2 * u (t, x - 1, y, z) - 1.25e-2 * u (t, x - 1, - y - 1, - z) + 7.5e-2 * u (t, x - 1, y + 1, z) - 2.5e-2 * u (t, x - 1, y + 2, - z) + 4.16666666666667e-3 * u (t, x - 1, y + 3, z); - auto temp377 = 3.33333333333333e-2 * (u (t, x, y, z) - u (t, x, y - 2, z)) + 4.16666666666667e-3 * (u (t, x, y - 3, z) - u (t, x, y + 1, z)); - auto temp343 = 3.33333333333333e-2 * (-u (t, x, y, z) + u (t, x + 2, y, z)) + 4.16666666666667e-3 * (u (t, x - 1, y, z) - u (t, x + 3, y, z)); - auto temp388 = 4.16666666666667e-3 * (u (t, x - 2, y - 3, z) - u (t, x + 
2, y - 3, - z)) + 3.33333333333333e-2 * (-u (t, x - 1, - y - 3, - z) + u (t, - x + 1, - y - 3, - z)); - auto temp406 = -1.25e-2 * u (t, x, y, z) - 4.16666666666667e-2 * u (t, x, y + 1, - z) + 7.5e-2 * u (t, x, y + 2, z) - 2.5e-2 * u (t, x, y + 3, - z) + 4.16666666666667e-3 * u (t, x, y + 4, z); - auto temp404 = 4.16666666666667e-3 * (u (t, x - 2, y + 1, z) - u (t, x + 2, y + 1, - z)) + 3.33333333333333e-2 * (-u (t, x - 1, - y + 1, - z) + u (t, - x + 1, - y + 1, - z)); - auto temp342 = -4.16666666666667e-2 * u (t, x, y + 1, z) - 1.25e-2 * u (t, x - 1, - y + 1, - z) + 7.5e-2 * u (t, x + 1, y + 1, z) - 2.5e-2 * u (t, x + 2, y + 1, - z) + 4.16666666666667e-3 * u (t, x + 3, y + 1, z); - auto temp410 = -1.25e-2 * u (t, x, y, z) - 4.16666666666667e-2 * u (t, x + 1, y, - z) + 7.5e-2 * u (t, x + 2, y, z) - 2.5e-2 * u (t, x + 3, y, - z) + 4.16666666666667e-3 * u (t, x + 4, y, z); - auto temp367 = 4.16666666666667e-3 * (u (t, x, y, z) - u (t, x, y + 4, z)) + 3.33333333333333e-2 * (-u (t, x, y + 1, z) + u (t, x, y + 3, z)); - auto temp399 = 4.16666666666667e-3 * (u (t, x - 2, y - 1, z) - u (t, x + 2, y - 1, - z)) + 3.33333333333333e-2 * (-u (t, x - 1, - y - 1, - z) + u (t, - x + 1, - y - 1, - z)); - auto temp325 = -4.16666666666667e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x - 1, y, - z) + 7.5e-2 * u (t, x + 1, y, z) - 2.5e-2 * u (t, x + 2, y, - z) + 4.16666666666667e-3 * u (t, x + 3, y, z); - auto temp386 = 4.16666666666667e-3 * (u (t, x - 3, y - 2, z) - u (t, x - 3, y + 2, - z)) + 3.33333333333333e-2 * (-u (t, x - 3, - y - 1, - z) + u (t, - x - 3, - y + 1, - z)); + z) - 4.16666666666667e-2 * u (t, x - 1, y, z) - 2.5e-2 * u (t, x + 1, + y, + z) + 4.16666666666667e-3 * u (t, x + 2, y, z); + auto temp390 = + 4.16666666666667e-3 * u (t, x, y, z) - 1.25e-2 * u (t, x, y - 4, + z) - 4.16666666666667e-2 * u (t, x, y - 3, z) + 7.5e-2 * u (t, x, + y - 2, + z) - 2.5e-2 * u (t, x, y - 1, z); + auto temp360 = + 4.16666666666667e-3 * (-u (t, x, y, z) + u (t, x - 4, y, + z)) + 
3.33333333333333e-2 * (-u (t, x - 3, y, z) + u (t, x - 1, y, + z)); + auto temp370 = + -4.16666666666667e-2 * u (t, x, y + 2, z) - 1.25e-2 * u (t, x - 1, + y + 2, + z) + 7.5e-2 * u (t, x + 1, y + 2, z) - 2.5e-2 * u (t, x + 2, y + 2, + z) + 4.16666666666667e-3 * u (t, x + 3, y + 2, z); + auto temp357 = + -4.16666666666667e-2 * u (t, x, y - 2, z) - 1.25e-2 * u (t, x - 1, + y - 2, + z) + 7.5e-2 * u (t, x + 1, y - 2, z) - 2.5e-2 * u (t, x + 2, y - 2, + z) + 4.16666666666667e-3 * u (t, x + 3, y - 2, z); + auto temp385 = + -4.16666666666667e-2 * u (t, x - 1, y, z) - 1.25e-2 * u (t, x - 1, + y - 1, + z) + 7.5e-2 * u (t, x - 1, y + 1, z) - 2.5e-2 * u (t, x - 1, y + 2, + z) + 4.16666666666667e-3 * u (t, x - 1, y + 3, z); + auto temp377 = + 3.33333333333333e-2 * (u (t, x, y, z) - u (t, x, y - 2, + z)) + 4.16666666666667e-3 * (u (t, x, y - 3, z) - u (t, x, y + 1, + z)); + auto temp343 = + 3.33333333333333e-2 * (-u (t, x, y, z) + u (t, x + 2, y, + z)) + 4.16666666666667e-3 * (u (t, x - 1, y, z) - u (t, x + 3, y, + z)); + auto temp388 = + 4.16666666666667e-3 * (u (t, x - 2, y - 3, z) - u (t, x + 2, y - 3, + z)) + 3.33333333333333e-2 * (-u (t, x - 1, + y - 3, + z) + u (t, + x + 1, + y - 3, + z)); + auto temp406 = + -1.25e-2 * u (t, x, y, z) - 4.16666666666667e-2 * u (t, x, y + 1, + z) + 7.5e-2 * u (t, x, y + 2, z) - 2.5e-2 * u (t, x, y + 3, + z) + 4.16666666666667e-3 * u (t, x, y + 4, z); + auto temp404 = + 4.16666666666667e-3 * (u (t, x - 2, y + 1, z) - u (t, x + 2, y + 1, + z)) + 3.33333333333333e-2 * (-u (t, x - 1, + y + 1, + z) + u (t, + x + 1, + y + 1, + z)); + auto temp342 = + -4.16666666666667e-2 * u (t, x, y + 1, z) - 1.25e-2 * u (t, x - 1, + y + 1, + z) + 7.5e-2 * u (t, x + 1, y + 1, z) - 2.5e-2 * u (t, x + 2, y + 1, + z) + 4.16666666666667e-3 * u (t, x + 3, y + 1, z); + auto temp410 = + -1.25e-2 * u (t, x, y, z) - 4.16666666666667e-2 * u (t, x + 1, y, + z) + 7.5e-2 * u (t, x + 2, y, z) - 2.5e-2 * u (t, x + 3, y, + z) + 4.16666666666667e-3 * u (t, x + 4, y, z); + auto 
temp367 = + 4.16666666666667e-3 * (u (t, x, y, z) - u (t, x, y + 4, + z)) + 3.33333333333333e-2 * (-u (t, x, y + 1, z) + u (t, x, y + 3, + z)); + auto temp399 = + 4.16666666666667e-3 * (u (t, x - 2, y - 1, z) - u (t, x + 2, y - 1, + z)) + 3.33333333333333e-2 * (-u (t, x - 1, + y - 1, + z) + u (t, + x + 1, + y - 1, + z)); + auto temp325 = + -4.16666666666667e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x - 1, y, + z) + 7.5e-2 * u (t, x + 1, y, z) - 2.5e-2 * u (t, x + 2, y, + z) + 4.16666666666667e-3 * u (t, x + 3, y, z); + auto temp386 = + 4.16666666666667e-3 * (u (t, x - 3, y - 2, z) - u (t, x - 3, y + 2, + z)) + 3.33333333333333e-2 * (-u (t, x - 3, + y - 1, + z) + u (t, + x - 3, + y + 1, + z)); auto temp396 = -2.5e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x, y - 3, - z) - 4.16666666666667e-2 * u (t, x, y - 2, z) + 7.5e-2 * u (t, x, y - 1, - z) + 4.16666666666667e-3 * u (t, x, y + 1, z); + z) - 4.16666666666667e-2 * u (t, x, y - 2, z) + 7.5e-2 * u (t, x, + y - 1, + z) + 4.16666666666667e-3 * u (t, x, y + 1, z); auto temp401 = 7.5e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x, y - 2, - z) - 4.16666666666667e-2 * u (t, x, y - 1, z) - 2.5e-2 * u (t, x, y + 1, - z) + 4.16666666666667e-3 * u (t, x, y + 2, z); - auto temp372 = 4.16666666666667e-3 * (u (t, x, y, z) - u (t, x + 4, y, z)) + 3.33333333333333e-2 * (-u (t, x + 1, y, z) + u (t, x + 3, y, z)); - auto temp420 = 2.08333333333333e-2 * ((temp330 * ti1 (x, y, z) + temp333 * ti3 (x, y, z)) * ti2 (x, y, - z) - (-4.16666666666667e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x, y, z - 1) + 7.5e-2 * u (t, x, y, - z + 1) - 2.5e-2 * u (t, - x, - y, - z + - 2) - + 4.16666666666667e-3 * u (t, x, y, z + 3)) * ti0 (x, y, z)); - auto temp387 = 4.16666666666667e-3 * u (t, x, y, z) - 1.25e-2 * u (t, x - 4, y, - z) - 4.16666666666667e-2 * u (t, x - 3, y, z) + 7.5e-2 * u (t, x - 2, y, - z) - 2.5e-2 * u (t, x - 1, y, z); - auto temp383 = 3.33333333333333e-2 * (u (t, x, y, z) - u (t, x - 2, y, z)) + 4.16666666666667e-3 * (u (t, x - 3, y, z) - u (t, x + 1, y, 
z)); - auto temp392 = 4.16666666666667e-3 * (u (t, x - 2, y - 2, z) - u (t, x - 2, y + 2, - z)) + 3.33333333333333e-2 * (-u (t, x - 2, - y - 1, - z) + u (t, - x - 2, - y + 1, - z)); - auto temp284 = (((4.16666666666667e-3 * (v (t, x - 2, y, z - 2) - v (t, x + 2, y, z - 2)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z - 2) + v (t, x + 1, y, z - 2))) * ti1 (x, y, - z - 2) + - (-4.16666666666667e-2 * v (t, x, y, z - 2) - 1.25e-2 * v (t, x, y - 1, z - 2) + 7.5e-2 * v (t, x, y + 1, - z - 2) - 2.5e-2 * v (t, x, y + 2, z - 2) + 4.16666666666667e-3 * v (t, x, - y + 3, - z - 2)) * ti3 (x, y, z - 2)) * ti0 (x, y, z - 2) + (-2.5e-2 * v (t, x, y, - z) - - 1.25e-2 * v (t, x, y, - z - 3) - - 4.16666666666667e-2 * - v (t, - x, - y, - z - - - 2) - + 7.5e-2 * v (t, x, - y, - z - - 1) + - 4.16666666666667e-3 * - v (t, - x, - y, - z - + - 1)) - * ti2 (x, y, z - 2)) * ti2 (x, y, z - 2); - auto temp308 = (((4.16666666666667e-3 * (v (t, x - 2, y - 2, z) - v (t, x + 2, y - 2, z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 2, z) + v (t, x + 1, y - 2, z))) * ti1 (x, y - 2, - z) + - (-2.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, y - 3, z) - 4.16666666666667e-2 * v (t, x, y - 2, z) + 7.5e-2 * v (t, x, y - 1, - z) + 4.16666666666667e-3 * v (t, x, y + 1, z)) * ti3 (x, y - 2, - z)) * ti0 (x, - y - 2, - z) + (-4.16666666666667e-2 * v (t, x, y - 2, z) - - 1.25e-2 * v (t, x, y - 2, - z - 1) + 7.5e-2 * v (t, x, y - 2, - z + 1) - - 2.5e-2 * v (t, x, y - 2, - z + 2) + 4.16666666666667e-3 * v (t, - x, - y - - - 2, - z - + - 3)) - * ti2 (x, y - 2, z)) * ti0 (x, y - 2, z) * ti3 (x, y - 2, z); - auto temp92 = (((4.16666666666667e-3 * (v (t, x, y - 2, z + 1) - v (t, x, y + 2, z + 1)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z + 1) + v (t, x, y + 1, z + 1))) * ti3 (x, y, - z + 1) + - (-4.16666666666667e-2 * v (t, x, y, z + 1) - 1.25e-2 * v (t, x - 1, y, z + 1) + 7.5e-2 * v (t, x + 1, y, - z + 1) - 2.5e-2 * v (t, x + 2, y, z + 1) + 4.16666666666667e-3 * v (t, - x + 3, - y, - z + 1)) * ti1 (x, y, z + 1)) * 
ti0 (x, y, - z + 1) + - (3.33333333333333e-2 * (-v (t, x, y, z) + v (t, x, y, z + 2)) + 4.16666666666667e-3 * (v (t, x, y, z - 1) - v (t, x, y, z + 3))) * ti2 (x, y, - z + 1)) * ti2 (x, - y, - z + 1); - auto temp185 = (((4.16666666666667e-3 * (v (t, x - 2, y - 2, z) - v (t, x - 2, y + 2, z)) + 3.33333333333333e-2 * (-v (t, x - 2, y - 1, z) + v (t, x - 2, y + 1, z))) * ti3 (x - 2, y, - z) + - (-2.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x - 3, y, z) - 4.16666666666667e-2 * v (t, x - 2, y, z) + 7.5e-2 * v (t, x - 1, y, - z) + 4.16666666666667e-3 * v (t, x + 1, y, z)) * ti1 (x - 2, y, - z)) * ti0 (x - 2, - y, - z) + - (4.16666666666667e-3 * (v (t, x - 2, y, z - 2) - v (t, x - 2, y, z + 2)) + 3.33333333333333e-2 * (-v (t, x - 2, y, z - 1) + v (t, x - 2, y, z + 1))) * ti2 (x - 2, y, - z)) * ti0 (x - 2, - y, - z) * ti1 (x - 2, y, z); - auto temp283 = (((4.16666666666667e-3 * (v (t, x - 2, y, z - 3) - v (t, x + 2, y, z - 3)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z - 3) + v (t, x + 1, y, z - 3))) * ti1 (x, y, - z - 3) + - (-4.16666666666667e-2 * v (t, x, y, z - 3) - 1.25e-2 * v (t, x, y - 1, z - 3) + 7.5e-2 * v (t, x, y + 1, - z - 3) - 2.5e-2 * v (t, x, y + 2, z - 3) + 4.16666666666667e-3 * v (t, x, - y + 3, - z - - 3)) * - ti3 (x, y, z - 3)) * ti0 (x, y, z - 3) + (4.16666666666667e-3 * v (t, - x, - y, - z) - 1.25e-2 * v (t, x, y, - z - 4) - 4.16666666666667e-2 * v (t, - x, - y, - z - 3) + 7.5e-2 * v (t, x, y, - z - 2) - 2.5e-2 * v (t, x, y, - z - 1)) * ti2 (x, - y, - z - 3)) * ti2 (x, y, z - 3); - auto temp309 = (((4.16666666666667e-3 * (v (t, x - 2, y - 1, z) - v (t, x + 2, y - 1, z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 1, z) + v (t, x + 1, y - 1, z))) * ti1 (x, y - 1, - z) + - (7.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, y - 2, z) - 4.16666666666667e-2 * v (t, x, y - 1, z) - 2.5e-2 * v (t, x, y + 1, - z) + 4.16666666666667e-3 * v (t, x, y + 2, z)) * ti3 (x, y - 1, - z)) * ti0 (x, - y - 1, - z) + (-4.16666666666667e-2 * v (t, x, y - 1, z) - - 1.25e-2 * v (t, 
x, y - 1, z - 1) + 7.5e-2 * v (t, - x, - y - - - 1, - z - + - 1) - - 2.5e-2 * v (t, x, y - 1, - z + 2) + - 4.16666666666667e-3 * v (t, x, - y - 1, - z + - 3)) * - ti2 (x, y - 1, z)) * ti0 (x, y - 1, z) * ti3 (x, y - 1, z); - auto temp310 = (((4.16666666666667e-3 * (v (t, x - 2, y + 1, z) - v (t, x + 2, y + 1, z)) + 3.33333333333333e-2 * (-v (t, x - 1, y + 1, z) + v (t, x + 1, y + 1, z))) * ti1 (x, y + 1, - z) + - (-1.25e-2 * v (t, x, y, z) - 4.16666666666667e-2 * v (t, x, y + 1, z) + 7.5e-2 * v (t, x, y + 2, - z) - 2.5e-2 * v (t, x, y + 3, z) + 4.16666666666667e-3 * v (t, x, y + 4, - z)) * ti3 (x, - y + 1, - z)) * ti0 (x, y + 1, - z) + (-4.16666666666667e-2 * v (t, x, - y + 1, - z) - 1.25e-2 * v (t, x, y + 1, - z - 1) + - 7.5e-2 * v (t, - x, - y + 1, - z + 1) - 2.5e-2 * v (t, x, y + 1, - z + 2) + - 4.16666666666667e-3 * v (t, x, y + 1, - z + 3)) * ti2 (x, - y + 1, - z)) * ti0 (x, - y + 1, - z) * - ti3 (x, y + 1, z); - auto temp307 = (((4.16666666666667e-3 * (v (t, x - 2, y - 3, z) - v (t, x + 2, y - 3, z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 3, z) + v (t, x + 1, y - 3, z))) * ti1 (x, y - 3, - z) + - (4.16666666666667e-3 * v (t, x, y, z) - 1.25e-2 * v (t, x, y - 4, z) - 4.16666666666667e-2 * v (t, x, - y - 3, - z) + 7.5e-2 * v (t, x, y - 2, z) - 2.5e-2 * v (t, x, y - 1, - z)) * ti3 (x, y - 3, - z)) * ti0 (x, - y - 3, - z) + - (-4.16666666666667e-2 * v (t, x, y - 3, z) - 1.25e-2 * v (t, x, y - 3, z - 1) + 7.5e-2 * v (t, x, y - 3, - z + 1) - 2.5e-2 * v (t, x, y - 3, z + 2) + 4.16666666666667e-3 * v (t, x, - y - 3, - z + 3)) * ti2 (x, y - 3, z)) * ti0 (x, y - 3, z) * ti3 (x, y - 3, z); - auto temp298 = (((4.16666666666667e-3 * (v (t, x, y, z) - v (t, x + 4, y, z)) + 3.33333333333333e-2 * (-v (t, x + 1, y, z) + v (t, x + 3, y, z))) * ti1 (x + 2, y, - z) + - (-4.16666666666667e-2 * v (t, x + 2, y, z) - 1.25e-2 * v (t, x + 2, y - 1, z) + 7.5e-2 * v (t, x + 2, y + 1, - z) - 2.5e-2 * v (t, - x + 2, - y + 2, - z) + 4.16666666666667e-3 * v (t, x + 2, y + 3, z)) * ti3 
(x + 2, y, - z)) * ti0 (x + 2, - y, - z) + (-4.16666666666667e-2 * v (t, x + 2, y, - z) - 1.25e-2 * v (t, - x + - 2, - y, - z - - 1) + - 7.5e-2 * v (t, x + 2, y, z + 1) - 2.5e-2 * v (t, - x + - 2, - y, - z + - 2) - + 4.16666666666667e-3 * v (t, x + 2, y, - z + 3)) * ti2 (x + 2, - y, - z)) * - ti0 (x + 2, - y, - z) * ti1 (x + 2, y, z); - auto temp165 = (((4.16666666666667e-3 * (v (t, x - 3, y - 2, z) - v (t, x - 3, y + 2, z)) + 3.33333333333333e-2 * (-v (t, x - 3, y - 1, z) + v (t, x - 3, y + 1, z))) * ti3 (x - 3, y, - z) + - (4.16666666666667e-3 * v (t, x, y, z) - 1.25e-2 * v (t, x - 4, y, z) - 4.16666666666667e-2 * v (t, x - 3, - y, - z) + 7.5e-2 * v (t, x - 2, y, z) - 2.5e-2 * v (t, x - 1, y, - z)) * ti1 (x - 3, y, - z)) * ti0 (x - 3, - y, - z) - + (4.16666666666667e-3 * (v (t, x - 3, y, z - 2) - v (t, x - 3, y, z + 2)) + 3.33333333333333e-2 * (-v (t, x - 3, y, z - 1) + v (t, x - 3, y, z + 1))) * ti2 (x - 3, y, - z)) * ti0 (x - 3, - y, - z) * ti1 (x - 3, y, z); - auto temp154 = (((4.16666666666667e-3 * (v (t, x, y - 2, z - 1) - v (t, x, y + 2, z - 1)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z - 1) + v (t, x, y + 1, z - 1))) * ti3 (x, y, - z - 1) + - (-4.16666666666667e-2 * v (t, x, y, z - 1) - 1.25e-2 * v (t, x - 1, y, z - 1) + 7.5e-2 * v (t, x + 1, y, - z - 1) - 2.5e-2 * v (t, x + 2, y, z - 1) + 4.16666666666667e-3 * v (t, - x + 3, - y, - z - 1)) * ti1 (x, y, z - 1)) * ti0 (x, y, - z - 1) + - (3.33333333333333e-2 * (v (t, x, y, z) - v (t, x, y, z - 2)) + 4.16666666666667e-3 * (v (t, x, y, z - 3) - v (t, x, y, z + 1))) * ti2 (x, y, - z - 1)) * ti2 (x, - y, - z - 1); - auto temp299 = (((3.33333333333333e-2 * (v (t, x, y, z) - v (t, x - 2, y, z)) + 4.16666666666667e-3 * (v (t, x - 3, y, z) - v (t, x + 1, y, z))) * ti1 (x - 1, y, - z) + - (-4.16666666666667e-2 * v (t, x - 1, y, z) - 1.25e-2 * v (t, x - 1, y - 1, z) + 7.5e-2 * v (t, x - 1, y + 1, - z) - 2.5e-2 * v (t, - x - 1, - y + 2, - z) + 4.16666666666667e-3 * v (t, x - 1, y + 3, z)) * ti3 (x - 1, y, - z)) * 
ti0 (x - 1, - y, - z) + (-4.16666666666667e-2 * v (t, x - 1, y, - z) - 1.25e-2 * v (t, - x - - 1, - y, - z - - 1) + - 7.5e-2 * v (t, x - 1, y, z + 1) - 2.5e-2 * v (t, - x - - 1, - y, - z + - 2) - + 4.16666666666667e-3 * v (t, x - 1, y, - z + 3)) * ti2 (x - 1, - y, - z)) * - ti0 (x - 1, - y, - z) * ti1 (x - 1, y, z); - auto temp289 = (((3.33333333333333e-2 * (-v (t, x, y, z) + v (t, x + 2, y, z)) + 4.16666666666667e-3 * (v (t, x - 1, y, z) - v (t, x + 3, y, z))) * ti1 (x + 1, y, - z) + - (-4.16666666666667e-2 * v (t, x + 1, y, z) - 1.25e-2 * v (t, x + 1, y - 1, z) + 7.5e-2 * v (t, x + 1, y + 1, - z) - 2.5e-2 * v (t, - x + 1, - y + 2, - z) + 4.16666666666667e-3 * v (t, x + 1, y + 3, z)) * ti3 (x + 1, y, - z)) * ti0 (x + 1, - y, - z) + (-4.16666666666667e-2 * v (t, x + 1, y, - z) - 1.25e-2 * v (t, - x + - 1, - y, - z - - 1) + - 7.5e-2 * v (t, x + 1, y, z + 1) - 2.5e-2 * v (t, - x + - 1, - y, - z + - 2) - + 4.16666666666667e-3 * v (t, x + 1, y, - z + 3)) * ti2 (x + 1, - y, - z)) * - ti0 (x + 1, - y, - z) * ti1 (x + 1, y, z); - auto temp116 = (((4.16666666666667e-3 * (v (t, x, y - 2, z - 2) - v (t, x, y + 2, z - 2)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z - 2) + v (t, x, y + 1, z - 2))) * ti3 (x, y, - z - 2) + - (-4.16666666666667e-2 * v (t, x, y, z - 2) - 1.25e-2 * v (t, x - 1, y, z - 2) + 7.5e-2 * v (t, x + 1, y, - z - 2) - 2.5e-2 * v (t, x + 2, y, z - 2) + 4.16666666666667e-3 * v (t, - x + 3, - y, - z - 2)) * ti1 (x, y, z - 2)) * ti0 (x, y, - z - 2) + - (4.16666666666667e-3 * (-v (t, x, y, z) + v (t, x, y, z - 4)) + 3.33333333333333e-2 * (-v (t, x, y, z - 3) + v (t, x, y, z - 1))) * ti2 (x, y, - z - 2)) * ti2 (x, - y, - z - 2); - auto temp56 = (((4.16666666666667e-3 * (v (t, x, y - 2, z) - v (t, x, y + 2, z)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z) + v (t, x, y + 1, z))) * ti3 (x, y, - z) + - (-4.16666666666667e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x - 1, y, z) + 7.5e-2 * v (t, x + 1, y, - z) - 2.5e-2 * v (t, - x + 2, - y, - z) + 4.16666666666667e-3 * v (t, 
x + 3, y, z)) * ti1 (x, y, z)) * ti0 (x, - y, - z) - + (4.16666666666667e-3 * (v (t, x, y, z - 2) - v (t, x, y, z + 2)) + 3.33333333333333e-2 * (-v (t, x, y, z - 1) + v (t, x, y, z + 1))) * ti2 (x, y, - z)) * ti0 (x, y, - z) * ti1 (x, y, z); - auto temp204 = (((4.16666666666667e-3 * (v (t, x - 1, y - 2, z) - v (t, x - 1, y + 2, z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 1, z) + v (t, x - 1, y + 1, z))) * ti3 (x - 1, y, - z) + - (7.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x - 2, y, z) - 4.16666666666667e-2 * v (t, x - 1, y, z) - 2.5e-2 * v (t, x + 1, y, - z) + 4.16666666666667e-3 * v (t, x + 2, y, z)) * ti1 (x - 1, y, - z)) * ti0 (x - 1, - y, - z) + - (4.16666666666667e-3 * (v (t, x - 1, y, z - 2) - v (t, x - 1, y, z + 2)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z - 1) + v (t, x - 1, y, z + 1))) * ti2 (x - 1, y, - z)) * ti0 (x - 1, - y, - z) * ti1 (x - 1, y, z); - auto temp288 = (((4.16666666666667e-3 * (v (t, x - 2, y, z + 1) - v (t, x + 2, y, z + 1)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z + 1) + v (t, x + 1, y, z + 1))) * ti1 (x, y, - z + 1) + - (-4.16666666666667e-2 * v (t, x, y, z + 1) - 1.25e-2 * v (t, x, y - 1, z + 1) + 7.5e-2 * v (t, x, y + 1, - z + 1) - 2.5e-2 * v (t, x, y + 2, z + 1) + 4.16666666666667e-3 * v (t, x, - y + 3, - z + - 1)) * ti3 (x, y, z + 1)) * ti0 (x, y, z + 1) + (-1.25e-2 * v (t, x, y, - z) - - 4.16666666666667e-2 * - v (t, - x, - y, - z + 1) + - 7.5e-2 * v (t, x, y, - z + 2) - - 2.5e-2 * v (t, x, y, - z + 3) + - 4.16666666666667e-3 * - v (t, - x, - y, - z - + - 4)) - * ti2 (x, y, z + 1)) * ti2 (x, y, z + 1); + z) - 4.16666666666667e-2 * u (t, x, y - 1, z) - 2.5e-2 * u (t, x, + y + 1, + z) + 4.16666666666667e-3 * u (t, x, y + 2, z); + auto temp372 = + 4.16666666666667e-3 * (u (t, x, y, z) - u (t, x + 4, y, + z)) + 3.33333333333333e-2 * (-u (t, x + 1, y, z) + u (t, x + 3, y, + z)); + auto temp420 = + 2.08333333333333e-2 * ((temp330 * ti1 (x, y, z) + temp333 * ti3 (x, y, + z)) * ti2 (x, y, + z) - (-4.16666666666667e-2 * u (t, x, y, 
z) - 1.25e-2 * u (t, x, y, + z - 1) + 7.5e-2 * u (t, x, y, + z + 1) - 2.5e-2 * u (t, + x, + y, + z + + 2) + 4.16666666666667e-3 * u (t, x, y, z + 3)) * ti0 (x, y, z)); + auto temp387 = + 4.16666666666667e-3 * u (t, x, y, z) - 1.25e-2 * u (t, x - 4, y, + z) - 4.16666666666667e-2 * u (t, x - 3, y, z) + 7.5e-2 * u (t, x - 2, + y, + z) - 2.5e-2 * u (t, x - 1, y, z); + auto temp383 = + 3.33333333333333e-2 * (u (t, x, y, z) - u (t, x - 2, y, + z)) + 4.16666666666667e-3 * (u (t, x - 3, y, z) - u (t, x + 1, y, + z)); + auto temp392 = + 4.16666666666667e-3 * (u (t, x - 2, y - 2, z) - u (t, x - 2, y + 2, + z)) + 3.33333333333333e-2 * (-u (t, x - 2, + y - 1, + z) + u (t, + x - 2, + y + 1, + z)); + auto temp284 = + (((4.16666666666667e-3 * (v (t, x - 2, y, z - 2) - v (t, x + 2, y, + z - 2)) + 3.33333333333333e-2 * (-v (t, x - 1, y, + z - 2) + v (t, x + 1, y, z - 2))) * ti1 (x, y, + z - 2) + (-4.16666666666667e-2 * v (t, x, y, + z - 2) - 1.25e-2 * v (t, x, y - 1, z - 2) + 7.5e-2 * v (t, x, + y + 1, + z - 2) - 2.5e-2 * v (t, x, y + 2, + z - 2) + 4.16666666666667e-3 * v (t, x, + y + 3, + z - 2)) * ti3 (x, y, z - 2)) * ti0 (x, y, + z - 2) + (-2.5e-2 * v (t, x, y, + z) - 1.25e-2 * v (t, x, y, + z - 3) - 4.16666666666667e-2 * v (t, + x, + y, + z - 2) + 7.5e-2 * v (t, x, + y, + z - 1) + 4.16666666666667e-3 * v (t, + x, + y, + z + 1)) * ti2 (x, y, z - 2)) * ti2 (x, y, z - 2); + auto temp308 = + (((4.16666666666667e-3 * (v (t, x - 2, y - 2, z) - v (t, x + 2, y - 2, + z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 2, z) + v (t, + x + 1, y - 2, z))) * ti1 (x, y - 2, + z) + (-2.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, y - 3, + z) - 4.16666666666667e-2 * v (t, x, y - 2, z) + 7.5e-2 * v (t, + x, y - 1, + z) + 4.16666666666667e-3 * v (t, x, y + 1, z)) * ti3 (x, y - 2, + z)) * ti0 (x, + y - 2, + z) + (-4.16666666666667e-2 * v (t, x, y - 2, z) - + 1.25e-2 * v (t, x, y - 2, + z - 1) + 7.5e-2 * v (t, x, y - 2, + z + 1) - 2.5e-2 * v (t, x, y - 2, + z + 2) + 4.16666666666667e-3 * v (t, + x, + y 
- 2, + z + + + 3)) * ti2 (x, y - 2, z)) * ti0 (x, y - 2, z) * ti3 (x, y - 2, z); + auto temp92 = + (((4.16666666666667e-3 * (v (t, x, y - 2, z + 1) - v (t, x, y + 2, + z + 1)) + 3.33333333333333e-2 * (-v (t, x, y - 1, + z + 1) + v (t, x, y + 1, z + 1))) * ti3 (x, y, + z + 1) + (-4.16666666666667e-2 * v (t, x, y, + z + 1) - 1.25e-2 * v (t, x - 1, y, z + 1) + 7.5e-2 * v (t, + x + 1, y, + z + 1) - 2.5e-2 * v (t, x + 2, y, + z + 1) + 4.16666666666667e-3 * v (t, + x + 3, + y, + z + 1)) * ti1 (x, y, z + 1)) * ti0 (x, y, + z + 1) + + (3.33333333333333e-2 * (-v (t, x, y, z) + v (t, x, y, + z + 2)) + 4.16666666666667e-3 * (v (t, x, y, z - 1) - v (t, x, + y, z + 3))) * ti2 (x, y, + z + 1)) * ti2 (x, + y, + z + 1); + auto temp185 = + (((4.16666666666667e-3 * (v (t, x - 2, y - 2, z) - v (t, x - 2, y + 2, + z)) + 3.33333333333333e-2 * (-v (t, x - 2, y - 1, z) + v (t, + x - 2, y + 1, z))) * ti3 (x - 2, y, + z) + (-2.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x - 3, y, + z) - 4.16666666666667e-2 * v (t, x - 2, y, z) + 7.5e-2 * v (t, + x - 1, y, + z) + 4.16666666666667e-3 * v (t, x + 1, y, z)) * ti1 (x - 2, y, + z)) * ti0 (x - 2, + y, + z) + + (4.16666666666667e-3 * (v (t, x - 2, y, z - 2) - v (t, x - 2, y, + z + 2)) + 3.33333333333333e-2 * (-v (t, x - 2, y, z - 1) + v (t, + x - 2, y, z + 1))) * ti2 (x - 2, y, + z)) * ti0 (x - 2, + y, + z) * ti1 (x - 2, y, z); + auto temp283 = + (((4.16666666666667e-3 * (v (t, x - 2, y, z - 3) - v (t, x + 2, y, + z - 3)) + 3.33333333333333e-2 * (-v (t, x - 1, y, + z - 3) + v (t, x + 1, y, z - 3))) * ti1 (x, y, + z - 3) + (-4.16666666666667e-2 * v (t, x, y, + z - 3) - 1.25e-2 * v (t, x, y - 1, z - 3) + 7.5e-2 * v (t, x, + y + 1, + z - 3) - 2.5e-2 * v (t, x, y + 2, + z - 3) + 4.16666666666667e-3 * v (t, x, + y + 3, + z - + 3)) * + ti3 (x, y, z - 3)) * ti0 (x, y, + z - 3) + (4.16666666666667e-3 * v (t, + x, + y, + z) - 1.25e-2 * v (t, x, y, + z - 4) - 4.16666666666667e-2 * v (t, + x, + y, + z - 3) + 7.5e-2 * v (t, x, y, + z - 2) - 2.5e-2 * v (t, x, y, + 
z - 1)) * ti2 (x, + y, + z - 3)) * ti2 (x, y, z - 3); + auto temp309 = + (((4.16666666666667e-3 * (v (t, x - 2, y - 1, z) - v (t, x + 2, y - 1, + z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 1, z) + v (t, + x + 1, y - 1, z))) * ti1 (x, y - 1, + z) + (7.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, y - 2, + z) - 4.16666666666667e-2 * v (t, x, y - 1, z) - 2.5e-2 * v (t, + x, y + 1, + z) + 4.16666666666667e-3 * v (t, x, y + 2, z)) * ti3 (x, y - 1, + z)) * ti0 (x, + y - 1, + z) + (-4.16666666666667e-2 * v (t, x, y - 1, z) - + 1.25e-2 * v (t, x, y - 1, z - 1) + 7.5e-2 * v (t, + x, + y - 1, + z + 1) - 2.5e-2 * v (t, x, y - 1, + z + 2) + 4.16666666666667e-3 * v (t, x, + y - 1, + z + + 3)) * ti2 (x, y - 1, z)) * ti0 (x, y - 1, z) * ti3 (x, y - 1, z); + auto temp310 = + (((4.16666666666667e-3 * (v (t, x - 2, y + 1, z) - v (t, x + 2, y + 1, + z)) + 3.33333333333333e-2 * (-v (t, x - 1, y + 1, z) + v (t, + x + 1, y + 1, z))) * ti1 (x, y + 1, + z) + (-1.25e-2 * v (t, x, y, z) - 4.16666666666667e-2 * v (t, x, + y + 1, z) + 7.5e-2 * v (t, x, y + 2, + z) - 2.5e-2 * v (t, x, y + 3, z) + 4.16666666666667e-3 * v (t, + x, y + 4, + z)) * ti3 (x, + y + 1, + z)) * ti0 (x, y + 1, + z) + (-4.16666666666667e-2 * v (t, x, + y + 1, + z) - 1.25e-2 * v (t, x, y + 1, + z - 1) + 7.5e-2 * v (t, + x, + y + 1, + z + 1) - 2.5e-2 * v (t, x, y + 1, + z + 2) + 4.16666666666667e-3 * v (t, x, y + 1, + z + 3)) * ti2 (x, + y + 1, + z)) * ti0 (x, + y + 1, + z) * ti3 (x, y + 1, z); + auto temp307 = + (((4.16666666666667e-3 * (v (t, x - 2, y - 3, z) - v (t, x + 2, y - 3, + z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 3, z) + v (t, + x + 1, y - 3, z))) * ti1 (x, y - 3, + z) + (4.16666666666667e-3 * v (t, x, y, z) - 1.25e-2 * v (t, x, + y - 4, z) - 4.16666666666667e-2 * v (t, x, + y - 3, + z) + 7.5e-2 * v (t, x, y - 2, z) - 2.5e-2 * v (t, x, y - 1, + z)) * ti3 (x, y - 3, + z)) * ti0 (x, + y - 3, + z) + + (-4.16666666666667e-2 * v (t, x, y - 3, z) - 1.25e-2 * v (t, x, y - 3, + z - 1) + 7.5e-2 * v (t, x, y - 3, + 
z + 1) - 2.5e-2 * v (t, x, y - 3, + z + 2) + 4.16666666666667e-3 * v (t, x, + y - 3, + z + 3)) * ti2 (x, y - 3, z)) * ti0 (x, y - 3, z) * ti3 (x, y - 3, + z); + auto temp298 = + (((4.16666666666667e-3 * (v (t, x, y, z) - v (t, x + 4, y, + z)) + 3.33333333333333e-2 * (-v (t, x + 1, y, z) + v (t, + x + 3, y, z))) * ti1 (x + 2, y, + z) + (-4.16666666666667e-2 * v (t, x + 2, y, z) - 1.25e-2 * v (t, + x + 2, y - 1, z) + 7.5e-2 * v (t, x + 2, y + 1, + z) - 2.5e-2 * v (t, + x + 2, + y + 2, + z) + 4.16666666666667e-3 * v (t, x + 2, y + 3, z)) * ti3 (x + 2, + y, + z)) * ti0 (x + 2, + y, + z) + (-4.16666666666667e-2 * v (t, x + 2, y, + z) - 1.25e-2 * v (t, + x + 2, + y, + z - 1) + 7.5e-2 * v (t, x + 2, y, z + 1) - 2.5e-2 * v (t, + x + 2, + y, + z + 2) + 4.16666666666667e-3 * v (t, x + 2, y, + z + 3)) * ti2 (x + 2, + y, + z)) * ti0 (x + 2, + y, + z) * ti1 (x + 2, y, z); + auto temp165 = + (((4.16666666666667e-3 * (v (t, x - 3, y - 2, z) - v (t, x - 3, y + 2, + z)) + 3.33333333333333e-2 * (-v (t, x - 3, y - 1, z) + v (t, + x - 3, y + 1, z))) * ti3 (x - 3, y, + z) + (4.16666666666667e-3 * v (t, x, y, z) - 1.25e-2 * v (t, + x - 4, y, z) - 4.16666666666667e-2 * v (t, x - 3, + y, + z) + 7.5e-2 * v (t, x - 2, y, z) - 2.5e-2 * v (t, x - 1, y, + z)) * ti1 (x - 3, y, + z)) * ti0 (x - 3, + y, + z) + + (4.16666666666667e-3 * (v (t, x - 3, y, z - 2) - v (t, x - 3, y, + z + 2)) + 3.33333333333333e-2 * (-v (t, x - 3, y, z - 1) + v (t, + x - 3, y, z + 1))) * ti2 (x - 3, y, + z)) * ti0 (x - 3, + y, + z) * ti1 (x - 3, y, z); + auto temp154 = + (((4.16666666666667e-3 * (v (t, x, y - 2, z - 1) - v (t, x, y + 2, + z - 1)) + 3.33333333333333e-2 * (-v (t, x, y - 1, + z - 1) + v (t, x, y + 1, z - 1))) * ti3 (x, y, + z - 1) + (-4.16666666666667e-2 * v (t, x, y, + z - 1) - 1.25e-2 * v (t, x - 1, y, z - 1) + 7.5e-2 * v (t, + x + 1, y, + z - 1) - 2.5e-2 * v (t, x + 2, y, + z - 1) + 4.16666666666667e-3 * v (t, + x + 3, + y, + z - 1)) * ti1 (x, y, z - 1)) * ti0 (x, y, + z - 1) + + (3.33333333333333e-2 * 
(v (t, x, y, z) - v (t, x, y, + z - 2)) + 4.16666666666667e-3 * (v (t, x, y, z - 3) - v (t, x, + y, z + 1))) * ti2 (x, y, + z - 1)) * ti2 (x, + y, + z - 1); + auto temp299 = + (((3.33333333333333e-2 * (v (t, x, y, z) - v (t, x - 2, y, + z)) + 4.16666666666667e-3 * (v (t, x - 3, y, z) - v (t, x + 1, + y, z))) * ti1 (x - 1, y, + z) + (-4.16666666666667e-2 * v (t, x - 1, y, z) - 1.25e-2 * v (t, + x - 1, y - 1, z) + 7.5e-2 * v (t, x - 1, y + 1, + z) - 2.5e-2 * v (t, + x - 1, + y + 2, + z) + 4.16666666666667e-3 * v (t, x - 1, y + 3, z)) * ti3 (x - 1, + y, + z)) * ti0 (x - 1, + y, + z) + (-4.16666666666667e-2 * v (t, x - 1, y, + z) - 1.25e-2 * v (t, + x - 1, + y, + z - 1) + 7.5e-2 * v (t, x - 1, y, z + 1) - 2.5e-2 * v (t, + x - 1, + y, + z + 2) + 4.16666666666667e-3 * v (t, x - 1, y, + z + 3)) * ti2 (x - 1, + y, + z)) * ti0 (x - 1, + y, + z) * ti1 (x - 1, y, z); + auto temp289 = + (((3.33333333333333e-2 * (-v (t, x, y, z) + v (t, x + 2, y, + z)) + 4.16666666666667e-3 * (v (t, x - 1, y, z) - v (t, x + 3, + y, z))) * ti1 (x + 1, y, + z) + (-4.16666666666667e-2 * v (t, x + 1, y, z) - 1.25e-2 * v (t, + x + 1, y - 1, z) + 7.5e-2 * v (t, x + 1, y + 1, + z) - 2.5e-2 * v (t, + x + 1, + y + 2, + z) + 4.16666666666667e-3 * v (t, x + 1, y + 3, z)) * ti3 (x + 1, + y, + z)) * ti0 (x + 1, + y, + z) + (-4.16666666666667e-2 * v (t, x + 1, y, + z) - 1.25e-2 * v (t, + x + 1, + y, + z - 1) + 7.5e-2 * v (t, x + 1, y, z + 1) - 2.5e-2 * v (t, + x + 1, + y, + z + 2) + 4.16666666666667e-3 * v (t, x + 1, y, + z + 3)) * ti2 (x + 1, + y, + z)) * ti0 (x + 1, + y, + z) * ti1 (x + 1, y, z); + auto temp116 = + (((4.16666666666667e-3 * (v (t, x, y - 2, z - 2) - v (t, x, y + 2, + z - 2)) + 3.33333333333333e-2 * (-v (t, x, y - 1, + z - 2) + v (t, x, y + 1, z - 2))) * ti3 (x, y, + z - 2) + (-4.16666666666667e-2 * v (t, x, y, + z - 2) - 1.25e-2 * v (t, x - 1, y, z - 2) + 7.5e-2 * v (t, + x + 1, y, + z - 2) - 2.5e-2 * v (t, x + 2, y, + z - 2) + 4.16666666666667e-3 * v (t, + x + 3, + y, + z - 2)) * ti1 (x, 
y, z - 2)) * ti0 (x, y, + z - 2) + + (4.16666666666667e-3 * (-v (t, x, y, z) + v (t, x, y, + z - 4)) + 3.33333333333333e-2 * (-v (t, x, y, z - 3) + v (t, x, + y, z - 1))) * ti2 (x, y, + z - 2)) * ti2 (x, + y, + z - 2); + auto temp56 = + (((4.16666666666667e-3 * (v (t, x, y - 2, z) - v (t, x, y + 2, + z)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z) + v (t, x, + y + 1, z))) * ti3 (x, y, + z) + (-4.16666666666667e-2 * v (t, x, y, z) - 1.25e-2 * v (t, + x - 1, y, z) + 7.5e-2 * v (t, x + 1, y, + z) - 2.5e-2 * v (t, + x + 2, + y, + z) + 4.16666666666667e-3 * v (t, x + 3, y, z)) * ti1 (x, y, + z)) * ti0 (x, + y, + z) + + (4.16666666666667e-3 * (v (t, x, y, z - 2) - v (t, x, y, + z + 2)) + 3.33333333333333e-2 * (-v (t, x, y, z - 1) + v (t, x, + y, z + 1))) * ti2 (x, y, + z)) * ti0 (x, y, + z) * ti1 (x, y, z); + auto temp204 = + (((4.16666666666667e-3 * (v (t, x - 1, y - 2, z) - v (t, x - 1, y + 2, + z)) + 3.33333333333333e-2 * (-v (t, x - 1, y - 1, z) + v (t, + x - 1, y + 1, z))) * ti3 (x - 1, y, + z) + (7.5e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x - 2, y, + z) - 4.16666666666667e-2 * v (t, x - 1, y, z) - 2.5e-2 * v (t, + x + 1, y, + z) + 4.16666666666667e-3 * v (t, x + 2, y, z)) * ti1 (x - 1, y, + z)) * ti0 (x - 1, + y, + z) + + (4.16666666666667e-3 * (v (t, x - 1, y, z - 2) - v (t, x - 1, y, + z + 2)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z - 1) + v (t, + x - 1, y, z + 1))) * ti2 (x - 1, y, + z)) * ti0 (x - 1, + y, + z) * ti1 (x - 1, y, z); + auto temp288 = + (((4.16666666666667e-3 * (v (t, x - 2, y, z + 1) - v (t, x + 2, y, + z + 1)) + 3.33333333333333e-2 * (-v (t, x - 1, y, + z + 1) + v (t, x + 1, y, z + 1))) * ti1 (x, y, + z + 1) + (-4.16666666666667e-2 * v (t, x, y, + z + 1) - 1.25e-2 * v (t, x, y - 1, z + 1) + 7.5e-2 * v (t, x, + y + 1, + z + 1) - 2.5e-2 * v (t, x, y + 2, + z + 1) + 4.16666666666667e-3 * v (t, x, + y + 3, + z + + 1)) * ti3 (x, y, z + 1)) * ti0 (x, y, z + 1) + (-1.25e-2 * v (t, + x, y, + z) - 4.16666666666667e-2 * v (t, + x, + y, + z + 1) + 
7.5e-2 * v (t, x, y, + z + 2) - 2.5e-2 * v (t, x, y, + z + 3) + 4.16666666666667e-3 * v (t, + x, + y, + z + 4)) * ti2 (x, y, z + 1)) * ti2 (x, y, z + 1); auto temp10 = 8.85879567828298e-1 * damp (x, y, z) - 2.0 * m (x, y, z); - auto temp133 = (((4.16666666666667e-3 * (v (t, x, y - 2, z + 2) - v (t, x, y + 2, z + 2)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z + 2) + v (t, x, y + 1, z + 2))) * ti3 (x, y, - z + 2) + - (-4.16666666666667e-2 * v (t, x, y, z + 2) - 1.25e-2 * v (t, x - 1, y, z + 2) + 7.5e-2 * v (t, x + 1, y, - z + 2) - 2.5e-2 * v (t, x + 2, y, z + 2) + 4.16666666666667e-3 * v (t, - x + 3, - y, - z + 2)) * ti1 (x, y, z + 2)) * ti0 (x, y, - z + 2) + - (4.16666666666667e-3 * (v (t, x, y, z) - v (t, x, y, z + 4)) + 3.33333333333333e-2 * (-v (t, x, y, z + 1) + v (t, x, y, z + 3))) * ti2 (x, y, - z + 2)) * ti2 (x, - y, - z + 2); - auto temp286 = (((4.16666666666667e-3 * (v (t, x - 2, y, z - 1) - v (t, x + 2, y, z - 1)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z - 1) + v (t, x + 1, y, z - 1))) * ti1 (x, y, - z - 1) + - (-4.16666666666667e-2 * v (t, x, y, z - 1) - 1.25e-2 * v (t, x, y - 1, z - 1) + 7.5e-2 * v (t, x, y + 1, - z - 1) - 2.5e-2 * v (t, x, y + 2, z - 1) + 4.16666666666667e-3 * v (t, x, - y + 3, - z - 1)) * ti3 (x, y, z - 1)) * ti0 (x, y, z - 1) + (7.5e-2 * v (t, x, y, - z) - - 1.25e-2 * v (t, x, y, - z - 2) - - 4.16666666666667e-2 * - v (t, - x, - y, - z - - - 1) - - 2.5e-2 * v (t, x, - y, - z + - 1) + - 4.16666666666667e-3 * - v (t, - x, - y, - z - + - 2)) - * ti2 (x, y, z - 1)) * ti2 (x, y, z - 1); - auto temp252 = (((4.16666666666667e-3 * (-v (t, x, y, z) + v (t, x, y - 4, z)) + 3.33333333333333e-2 * (-v (t, x, y - 3, z) + v (t, x, y - 1, z))) * ti3 (x, y - 2, - z) + - (-4.16666666666667e-2 * v (t, x, y - 2, z) - 1.25e-2 * v (t, x - 1, y - 2, z) + 7.5e-2 * v (t, x + 1, y - 2, - z) - 2.5e-2 * v (t, - x + 2, - y - 2, - z) + 4.16666666666667e-3 * v (t, x + 3, y - 2, z)) * ti1 (x, y - 2, - z)) * ti0 (x, - y - 2, - z) - + (4.16666666666667e-3 * (v (t, 
x, y - 2, z - 2) - v (t, x, y - 2, z + 2)) + 3.33333333333333e-2 * (-v (t, x, y - 2, z - 1) + v (t, x, y - 2, z + 1))) * ti2 (x, y - 2, - z)) * ti0 (x, - y - 2, - z) * ti3 (x, y - 2, z); + auto temp133 = + (((4.16666666666667e-3 * (v (t, x, y - 2, z + 2) - v (t, x, y + 2, + z + 2)) + 3.33333333333333e-2 * (-v (t, x, y - 1, + z + 2) + v (t, x, y + 1, z + 2))) * ti3 (x, y, + z + 2) + (-4.16666666666667e-2 * v (t, x, y, + z + 2) - 1.25e-2 * v (t, x - 1, y, z + 2) + 7.5e-2 * v (t, + x + 1, y, + z + 2) - 2.5e-2 * v (t, x + 2, y, + z + 2) + 4.16666666666667e-3 * v (t, + x + 3, + y, + z + 2)) * ti1 (x, y, z + 2)) * ti0 (x, y, + z + 2) + + (4.16666666666667e-3 * (v (t, x, y, z) - v (t, x, y, + z + 4)) + 3.33333333333333e-2 * (-v (t, x, y, z + 1) + v (t, x, + y, z + 3))) * ti2 (x, y, + z + 2)) * ti2 (x, + y, + z + 2); + auto temp286 = + (((4.16666666666667e-3 * (v (t, x - 2, y, z - 1) - v (t, x + 2, y, + z - 1)) + 3.33333333333333e-2 * (-v (t, x - 1, y, + z - 1) + v (t, x + 1, y, z - 1))) * ti1 (x, y, + z - 1) + (-4.16666666666667e-2 * v (t, x, y, + z - 1) - 1.25e-2 * v (t, x, y - 1, z - 1) + 7.5e-2 * v (t, x, + y + 1, + z - 1) - 2.5e-2 * v (t, x, y + 2, + z - 1) + 4.16666666666667e-3 * v (t, x, + y + 3, + z - 1)) * ti3 (x, y, z - 1)) * ti0 (x, y, + z - 1) + (7.5e-2 * v (t, x, y, + z) - 1.25e-2 * v (t, x, y, + z - 2) - 4.16666666666667e-2 * v (t, + x, + y, + z - 1) - 2.5e-2 * v (t, x, + y, + z + 1) + 4.16666666666667e-3 * v (t, + x, + y, + z + 2)) * ti2 (x, y, z - 1)) * ti2 (x, y, z - 1); + auto temp252 = + (((4.16666666666667e-3 * (-v (t, x, y, z) + v (t, x, y - 4, + z)) + 3.33333333333333e-2 * (-v (t, x, y - 3, z) + v (t, x, + y - 1, z))) * ti3 (x, y - 2, + z) + (-4.16666666666667e-2 * v (t, x, y - 2, z) - 1.25e-2 * v (t, + x - 1, y - 2, z) + 7.5e-2 * v (t, x + 1, y - 2, + z) - 2.5e-2 * v (t, + x + 2, + y - 2, + z) + 4.16666666666667e-3 * v (t, x + 3, y - 2, z)) * ti1 (x, + y - 2, + z)) * ti0 (x, + y - 2, + z) + + (4.16666666666667e-3 * (v (t, x, y - 2, z - 2) - v (t, x, 
y - 2, + z + 2)) + 3.33333333333333e-2 * (-v (t, x, y - 2, z - 1) + v (t, + x, y - 2, z + 1))) * ti2 (x, y - 2, + z)) * ti0 (x, + y - 2, + z) * ti3 (x, y - 2, z); auto temp487 = temp420 * ti2 (x, y, z) * ti3 (x, y, - z) + 2.08333333333333e-2 * ((-temp318 * ti1 (x, y, z) + temp325 * ti3 (x, y, z)) * ti3 (x, y, - z) + ((temp318 * ti3 (x, y, z) + temp325 * ti1 (x, y, z)) * ti2 (x, y, - z) - - (4.16666666666667e-3 * (u (t, x, y, z - 2) - u (t, x, y, z + 2)) + - 3.33333333333333e-2 * (-u (t, x, y, z - 1) + u (t, x, y, z + 1))) * ti0 (x, y, - z)) * ti1 (x, - y, - z) * ti2 (x, y, - z)) + - 3.75e-2 * ((temp399 * ti3 (x, y - 1, z) - temp401 * ti1 (x, - y - 1, - z)) * ti1 (x, y - 1, - z) + (((4.16666666666667e-3 * (u (t, x - 2, y, z - 1) - u (t, x + 2, y, - z - 1)) + 3.33333333333333e-2 * (-u (t, - x - 1, - y, - z - 1) + u (t, - x + 1, - y, - z - 1))) * ti1 (x, y, - z - 1) + (-4.16666666666667e-2 * u (t, - x, - y, - z - 1) - - 1.25e-2 * u (t, x, y - 1, - z - 1) + 7.5e-2 * u (t, x, y + 1, - z - 1) - - 2.5e-2 * u (t, x, y + 2, - z - 1) + - 4.16666666666667e-3 * u (t, - x, - y + 3, - z - 1)) * ti3 (x, y, - z - - 1)) * - ti2 (x, y, - z - 1) - (7.5e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x, y, - z - 2) - 4.16666666666667e-2 * u (t, x, y, - z - 1) - 2.5e-2 * u (t, x, y, - z + 1) + 4.16666666666667e-3 * u (t, x, y, - z + 2)) * ti0 (x, y, - z - - 1)) * ti0 (x, y, - z - - 1)) + - 1.25e-2 * ((-temp392 * ti1 (x - 2, - y, - z) + temp394 * ti3 (x - 2, y, - z)) * ti3 (x - 2, y, - z) + ((temp392 * ti3 (x - 2, y, - z) + temp394 * ti1 (x - 2, - y, - z)) * ti2 (x - 2, y, - z) - (4.16666666666667e-3 * (u (t, x - 2, y, - z - 2) - u (t, - x - 2, y, - z + 2)) + 3.33333333333333e-2 * (-u (t, x - 2, y, - z - 1) + u (t, - x - 2, y, - z + 1))) * ti0 (x - 2, y, - z)) * ti1 (x - 2, y, - z) * ti2 (x - 2, - y, - z) + - ((temp395 * ti1 (x, y - 2, - z) + temp396 * ti3 (x, - y - 2, - z)) * ti2 (x, y - 2, - z) - (-4.16666666666667e-2 * u (t, x, y - 2, - z) - 1.25e-2 * u (t, x, - y - 2, - z - 1) + 7.5e-2 * u 
(t, x, - y - 2, - z + 1) - 2.5e-2 * u (t, x, - y - 2, - z + 2) + 4.16666666666667e-3 * u (t, x, y - 2, - z + 3)) * ti0 (x, - y - 2, - z)) * ti2 (x, y - 2, - z) * ti3 (x, - y - 2, - z)) - + 6.25e-3 * ((-temp409 * ti1 (x + 1, y, z) + temp410 * ti3 (x + 1, y, z)) * ti3 (x + 1, - y, - z) + ((temp404 * ti1 (x, y + 1, z) + temp406 * ti3 (x, y + 1, z)) * ti2 (x, - y + 1, - z) - - (-4.16666666666667e-2 * u (t, x, y + 1, z) - 1.25e-2 * u (t, x, y + 1, z - 1) + 7.5e-2 * u (t, x, - y + 1, - z + 1) - 2.5e-2 * u (t, x, y + 1, - z + 2) + 4.16666666666667e-3 * u (t, x, - y + 1, - z + - 3)) * - ti0 (x, y + 1, z)) * ti2 (x, y + 1, z) * ti3 (x, y + 1, - z) + ((temp409 * ti3 (x + 1, y, z) + temp410 * ti1 (x + 1, y, z)) * ti2 (x + 1, - y, - z) - - - (4.16666666666667e-3 * - (u (t, x + 1, y, z - 2) - u (t, x + 1, y, z + 2)) + - 3.33333333333333e-2 * (-u (t, x + 1, y, z - 1) + u (t, x + 1, y, - z + 1))) * ti0 (x + 1, y, - z)) * ti1 (x + 1, - y, - z) * ti2 (x + 1, y, - z)) + - 1.66666666666667e-2 * ((temp343 * ti3 (x + 1, y, z) - temp347 * ti1 (x + 1, y, z)) * ti3 (x + 1, - y, - z) + (-temp377 * ti1 (x, y - 1, z) + temp381 * ti3 (x, y - 1, z)) * ti1 (x, - y - 1, - z) + ((temp336 * ti3 (x, y + 1, z) + temp342 * ti1 (x, y + 1, z)) * ti2 (x, - y + 1, - z) - - (4.16666666666667e-3 * - (u (t, x, y + 1, z - 2) - u (t, x, y + 1, z + 2)) + - 3.33333333333333e-2 * (-u (t, x, y + 1, z - 1) + - u (t, x, y + 1, z + 1))) * ti0 (x, y + 1, - z)) * ti2 (x, - y + 1, - z) * - ti3 (x, y + 1, - z) + ((temp343 * ti1 (x + 1, y, z) + temp347 * ti3 (x + 1, y, z)) * ti2 (x + 1, y, - z) - (-4.16666666666667e-2 * u (t, x + 1, y, z) - 1.25e-2 * u (t, x + 1, y, z - 1) + 7.5e-2 * u (t, - x + 1, - y, - z + 1) - 2.5e-2 * u (t, x + 1, y, - z + 2) + - 4.16666666666667e-3 * u (t, x + 1, - y, - z + 3)) * ti0 (x + 1, y, z)) * ti1 (x + 1, y, z) * ti2 (x + 1, y, - z) + - (((4.16666666666667e-3 * (u (t, x, y - 2, z - 1) - u (t, x, y + 2, z - 1)) + 3.33333333333333e-2 * (-u (t, x, y - 1, z - 1) + u (t, x, y + 1, z - 1))) * ti3 
(x, y, - z - 1) + - (-4.16666666666667e-2 * u (t, x, y, z - 1) - 1.25e-2 * u (t, x - 1, y, z - 1) + 7.5e-2 * u (t, x + 1, y, - z - 1) - 2.5e-2 * u (t, x + 2, y, z - 1) + 4.16666666666667e-3 * u (t, - x + 3, - y, - z - 1)) * ti1 (x, y, z - 1)) * ti2 (x, y, - z - - 1) - - (3.33333333333333e-2 * (u (t, x, y, z) - u (t, x, y, - z - 2)) + 4.16666666666667e-3 * (u (t, x, y, - z - 3) - u (t, x, y, - z + 1))) * ti0 (x, y, - z - 1)) * ti0 (x, - y, - z - - - 1)) - + 2.08333333333333e-3 * ((temp360 * ti3 (x - 2, y, z) - temp365 * ti1 (x - 2, y, z)) * ti3 (x - 2, - y, - z) + (-temp367 * ti1 (x, y + 2, z) + temp370 * ti3 (x, y + 2, z)) * ti1 (x, - y + 2, - z) + (temp388 * ti3 (x, y - 3, z) - temp390 * ti1 (x, y - 3, z)) * ti1 (x, - y - 3, - z) - + ((temp352 * ti3 (x, y - 2, z) + temp357 * ti1 (x, y - 2, z)) * ti2 (x, - y - 2, - z) - - (4.16666666666667e-3 * (u (t, x, y - 2, z - 2) - u (t, x, y - 2, z + 2)) + 3.33333333333333e-2 * (-u (t, x, y - 2, z - 1) + u (t, x, y - 2, z + 1))) * ti0 (x, y - 2, - z)) * ti2 (x, - y - 2, - z) * ti3 (x, y - 2, - z) + - ((temp360 * ti1 (x - 2, y, z) + temp365 * ti3 (x - 2, y, z)) * ti2 (x - 2, y, - z) - (-4.16666666666667e-2 * u (t, x - 2, y, z) - 1.25e-2 * u (t, x - 2, y, z - 1) + 7.5e-2 * u (t, - x - 2, - y, - z + 1) - 2.5e-2 * u (t, x - 2, y, - z + 2) + - 4.16666666666667e-3 * u (t, x - 2, - y, - z + 3)) * ti0 (x - 2, y, z)) * ti1 (x - 2, y, z) * ti2 (x - 2, y, - z) + - (((4.16666666666667e-3 * (u (t, x, y - 2, z + 2) - u (t, x, y + 2, z + 2)) + 3.33333333333333e-2 * (-u (t, x, y - 1, z + 2) + u (t, x, y + 1, z + 2))) * ti3 (x, y, - z + 2) + - (-4.16666666666667e-2 * u (t, x, y, z + 2) - 1.25e-2 * u (t, x - 1, y, z + 2) + 7.5e-2 * u (t, x + 1, y, - z + 2) - 2.5e-2 * u (t, x + 2, y, z + 2) + 4.16666666666667e-3 * u (t, - x + 3, - y, - z + 2)) * ti1 (x, y, z + 2)) * ti2 (x, y, - z + - 2) - - (4.16666666666667e-3 * (u (t, x, y, z) - u (t, x, y, z + 4)) + 3.33333333333333e-2 * (-u (t, x, y, z + 1) + u (t, x, y, z + 3))) * ti0 (x, y, - z + 2)) * 
ti0 (x, - y, - z - + - 2) - + (((4.16666666666667e-3 * (u (t, x - 2, y, z - 3) - u (t, x + 2, y, z - 3)) + 3.33333333333333e-2 * (-u (t, x - 1, y, z - 3) + u (t, x + 1, y, z - 3))) * ti1 (x, y, - z - 3) + - (-4.16666666666667e-2 * u (t, x, y, z - 3) - 1.25e-2 * u (t, x, y - 1, z - 3) + 7.5e-2 * u (t, x, y + 1, - z - 3) - 2.5e-2 * u (t, x, y + 2, z - 3) + 4.16666666666667e-3 * u (t, x, - y + 3, - z - 3)) * ti3 (x, y, z - 3)) * ti2 (x, y, - z - - 3) - - (4.16666666666667e-3 * u (t, x, - y, - z) - 1.25e-2 * u (t, x, y, - z - 4) - 4.16666666666667e-2 * u (t, x, - y, - z - 3) + 7.5e-2 * u (t, x, y, - z - 2) - 2.5e-2 * u (t, x, y, - z - 1)) * ti0 (x, - y, - z - 3)) * ti0 (x, y, z - 3)) - (temp420 * ti0 (x, y, - z) + - 2.08333333333333e-2 * (temp330 * ti3 (x, y, - z) - - temp333 * ti1 (x, y, - z)) * - ti1 (x, y, - z) + - 1.66666666666667e-2 * (-temp336 * ti1 (x, y + 1, - z) + - temp342 * ti3 (x, - y + 1, - z)) * - ti1 (x, y + 1, - z) + - 2.08333333333333e-3 * (-temp352 * ti1 (x, y - 2, - z) + - temp357 * ti3 (x, - y - 2, - z)) * - ti1 (x, y - 2, - z) + - 2.08333333333333e-3 * (temp372 * ti3 (x + 2, y, - z) - - temp375 * ti1 (x + 2, y, - z)) * - ti3 (x + 2, y, - z) + - 1.66666666666667e-2 * (temp383 * ti3 (x - 1, y, - z) - - temp385 * ti1 (x - 1, y, - z)) * - ti3 (x - 1, y, - z) + - 2.08333333333333e-3 * (-temp386 * ti1 (x - 3, y, - z) + - temp387 * ti3 (x - 3, y, - z)) * - ti3 (x - 3, y, - z) + 1.25e-2 * (temp395 * ti3 (x, - y - 2, - z) - - temp396 * ti1 (x, - y - 2, - z)) * - ti1 (x, y - 2, - z) + 6.25e-3 * (temp404 * ti3 (x, - y + 1, - z) - - temp406 * ti1 (x, - y + 1, - z)) * - ti1 (x, y + 1, - z) + 3.75e-2 * (-temp407 * ti1 (x - 1, y, - z) + - temp408 * ti3 (x - 1, y, - z)) * - ti3 (x - 1, y, - z) + - 2.08333333333333e-3 * ((temp367 * ti3 (x, y + 2, - z) + - temp370 * ti1 (x, - y + 2, - z)) - * ti2 (x, y + 2, - z) - - (4.16666666666667e-3 - * (u (t, x, y + 2, - z - 2) - u (t, x, - y + 2, - z + - 2)) + - 3.33333333333333e-2 - * (-u (t, x, - y + 2, - z - 1) + u (t, x, - 
y + 2, - z + - 1))) * - ti0 (x, y + 2, - z)) * ti2 (x, - y + 2, - z) * ti3 (x, - y + - 2, - z) + - 2.08333333333333e-3 * ((temp372 * ti1 (x + 2, y, - z) + - temp375 * ti3 (x + 2, - y, - z)) - * ti2 (x + 2, y, - z) - - (-4.16666666666667e-2 - * u (t, x + 2, y, - z) - 1.25e-2 * u (t, - x + 2, - y, - z - - 1) + - 7.5e-2 * u (t, - x + 2, - y, - z + - 1) - - 2.5e-2 * u (t, - x + 2, - y, - z + - 2) + - 4.16666666666667e-3 - * u (t, x + 2, y, - z + 3)) * ti0 (x + 2, - y, - z)) - * ti1 (x + 2, y, - z) * ti2 (x + 2, - y, - z) + - 1.66666666666667e-2 * ((temp377 * ti3 (x, y - 1, - z) + - temp381 * ti1 (x, - y - 1, - z)) - * ti2 (x, y - 1, - z) - - (4.16666666666667e-3 - * (u (t, x, y - 1, - z - 2) - u (t, x, - y - 1, - z + - 2)) + - 3.33333333333333e-2 - * (-u (t, x, - y - 1, - z - 1) + u (t, x, - y - 1, - z + - 1))) * - ti0 (x, y - 1, - z)) * ti2 (x, - y - 1, - z) * ti3 (x, - y - - 1, - z) + - 1.66666666666667e-2 * ((temp383 * ti1 (x - 1, y, - z) + - temp385 * ti3 (x - 1, - y, - z)) - * ti2 (x - 1, y, - z) - - (-4.16666666666667e-2 - * u (t, x - 1, y, - z) - 1.25e-2 * u (t, - x - 1, - y, - z - - 1) + - 7.5e-2 * u (t, - x - 1, - y, - z + - 1) - - 2.5e-2 * u (t, - x - 1, - y, - z + - 2) + - 4.16666666666667e-3 - * u (t, x - 1, y, - z + 3)) * ti0 (x - 1, - y, - z)) - * ti1 (x - 1, y, - z) * ti2 (x - 1, - y, - z) + - 2.08333333333333e-3 * ((temp386 * ti3 (x - 3, y, - z) + - temp387 * ti1 (x - 3, - y, - z)) - * ti2 (x - 3, y, - z) - - (4.16666666666667e-3 - * (u (t, x - 3, y, - z - 2) - u (t, - x - 3, y, - z + - 2)) + - 3.33333333333333e-2 * (-u (t, - x - - - 3, - y, - z - - - 1) - + - u (t, - x - - 3, - y, - z + - 1))) - * ti0 (x - 3, y, - z)) * ti1 (x - 3, - y, - z) * - ti2 (x - 3, y, - z) + - 2.08333333333333e-3 * ((temp388 * ti1 (x, y - 3, - z) + - temp390 * ti3 (x, - y - 3, - z)) - * ti2 (x, y - 3, - z) - - (-4.16666666666667e-2 - * u (t, x, y - 3, - z) - 1.25e-2 * u (t, x, - y - 3, - z - - 1) + - 7.5e-2 * u (t, x, - y - 3, - z + - 1) - - 2.5e-2 * u (t, x, - y - 3, - z + - 
2) + - 4.16666666666667e-3 - * u (t, x, y - 3, - z + 3)) * ti0 (x, - y - 3, - z)) - * ti2 (x, y - 3, - z) * ti3 (x, - y - 3, - z) + - 3.75e-2 * ((temp399 * ti1 (x, y - 1, - z) + temp401 * ti3 (x, - y - 1, - z)) - * ti2 (x, y - 1, - z) - - (-4.16666666666667e-2 * u (t, x, y - 1, - z) - - 1.25e-2 * u (t, x, - y - 1, - z - 1) + 7.5e-2 * u (t, x, - y - 1, - z + - 1) - - 2.5e-2 * u (t, x, - y - 1, - z + - 2) + - 4.16666666666667e-3 * u (t, x, y - 1, - z + - 3)) * ti0 (x, - y - - 1, - z)) - * ti2 (x, y - 1, - z) * ti3 (x, - y - 1, - z) + - 3.75e-2 * ((temp407 * ti3 (x - 1, y, - z) + temp408 * ti1 (x - 1, - y, - z)) - * ti2 (x - 1, y, - z) - - (4.16666666666667e-3 * (u (t, x - 1, y, - z - 2) - u (t, - x - - 1, - y, - z + - 2)) - + 3.33333333333333e-2 * (-u (t, - x - 1, y, - z - 1) + - u (t, - x - 1, - y, - z + - 1))) * - ti0 (x - 1, y, - z)) * ti1 (x - 1, - y, - z) * ti2 (x - 1, y, - z) + - 2.08333333333333e-3 * - (((4.16666666666667e-3 * - (u (t, x, y - 2, z - 2) - u (t, x, y + 2, - z - 2)) + - 3.33333333333333e-2 * (-u (t, - x, - y - 1, - z - 2) + u (t, x, - y + 1, - z - - 2))) * - ti3 (x, y, - z - 2) + (-4.16666666666667e-2 * u (t, - x, - y, - z - - 2) - - 1.25e-2 * u (t, - x - 1, y, - z - 2) + 7.5e-2 * u (t, - x - + - 1, - y, - z - - - 2) - - 2.5e-2 * u (t, - x + 2, - y, - z - - 2) + - 4.16666666666667e-3 * u (t, - x + 3, - y, - z - - 2)) - * ti1 (x, y, - z - 2)) * ti2 (x, y, - z - 2) - - (4.16666666666667e-3 - * (-u (t, x, y, z) + u (t, x, y, - z - 4)) + - 3.33333333333333e-2 * (-u (t, x, y, - z - 3) + u (t, x, - y, - z - - 1))) - * ti0 (x, - y, - z - 2)) * ti0 (x, y, - z - 2) + - 1.66666666666667e-2 * - (((4.16666666666667e-3 * - (u (t, x, y - 2, z + 1) - u (t, x, y + 2, - z + 1)) + - 3.33333333333333e-2 * (-u (t, - x, - y - 1, - z + 1) + u (t, x, - y + 1, - z + - 1))) * - ti3 (x, y, - z + 1) + (-4.16666666666667e-2 * u (t, - x, - y, - z + - 1) - - 1.25e-2 * u (t, - x - 1, y, - z + 1) + 7.5e-2 * u (t, - x - + - 1, - y, - z - + - 1) - - 2.5e-2 * u (t, - x + 2, - y, - 
z + - 1) + - 4.16666666666667e-3 * u (t, - x + 3, - y, - z + - 1)) - * ti1 (x, y, - z + 1)) * ti2 (x, y, - z + 1) - - (3.33333333333333e-2 - * (-u (t, x, y, z) + u (t, x, y, - z + 2)) + - 4.16666666666667e-3 - * (u (t, x, y, z - 1) - u (t, x, y, - z + 3))) * ti0 (x, - y, - z - + - 1)) - * ti0 (x, y, - z + 1) + - 1.25e-2 * - (((4.16666666666667e-3 * - (u (t, x - 2, y, z - 2) - u (t, x + 2, y, - z - 2)) + - 3.33333333333333e-2 * (-u (t, - x - 1, - y, - z - 2) + u (t, - x + 1, - y, - z - - 2))) * - ti1 (x, y, - z - 2) + (-4.16666666666667e-2 * u (t, - x, - y, - z - - 2) - - 1.25e-2 * u (t, x, - y - 1, - z - - 2) + 7.5e-2 * u (t, x, - y + 1, - z - - 2) - - 2.5e-2 * u (t, x, - y + 2, - z - - 2) + - 4.16666666666667e-3 * u (t, - x, - y + 3, - z - - 2)) - * ti3 (x, y, - z - 2)) * ti2 (x, y, - z - 2) - - (-2.5e-2 * u (t, x, y, - z) - 1.25e-2 * u (t, x, - y, - z - - 3) - - 4.16666666666667e-2 * u (t, x, y, - z - 2) + 7.5e-2 * u (t, x, - y, - z - - 1) + - 4.16666666666667e-3 * u (t, x, y, - z + 1)) * ti0 (x, - y, - z - - 2)) - * ti0 (x, y, - z - 2) + - 6.25e-3 * - (((4.16666666666667e-3 * - (u (t, x - 2, y, z + 1) - u (t, x + 2, y, - z + 1)) + - 3.33333333333333e-2 * (-u (t, - x - 1, - y, - z + 1) + u (t, - x + 1, - y, - z + - 1))) * - ti1 (x, y, - z + 1) + (-4.16666666666667e-2 * u (t, - x, - y, - z + - 1) - - 1.25e-2 * u (t, x, - y - 1, - z + - 1) + 7.5e-2 * u (t, x, - y + 1, - z + - 1) - - 2.5e-2 * u (t, x, - y + 2, - z + - 1) + - 4.16666666666667e-3 * u (t, - x, - y + 3, - z + - 1)) - * ti3 (x, y, - z + 1)) * ti2 (x, y, - z + 1) - - (-1.25e-2 * u (t, x, y, - z) - - 4.16666666666667e-2 * u (t, x, y, - z + 1) + 7.5e-2 * u (t, x, - y, - z + - 2) - - 2.5e-2 * u (t, - x, - y, - z + - 3) + 4.16666666666667e-3 * u (t, x, y, - z + - 4)) * - ti0 (x, - y, - z + 1)) * ti0 (x, y, z + 1)); - auto temp290 = (((4.16666666666667e-3 * (-v (t, x, y, z) + v (t, x - 4, y, z)) + 3.33333333333333e-2 * (-v (t, x - 3, y, z) + v (t, x - 1, y, z))) * ti1 (x - 2, y, - z) + - (-4.16666666666667e-2 
* v (t, x - 2, y, z) - 1.25e-2 * v (t, x - 2, y - 1, z) + 7.5e-2 * v (t, x - 2, y + 1, - z) - 2.5e-2 * v (t, - x - 2, - y + 2, - z) + 4.16666666666667e-3 * v (t, x - 2, y + 3, z)) * ti3 (x - 2, y, - z)) * ti0 (x - 2, - y, - z) + (-4.16666666666667e-2 * v (t, x - 2, y, - z) - 1.25e-2 * v (t, - x - - 2, - y, - z - - 1) + - 7.5e-2 * v (t, x - 2, y, z + 1) - 2.5e-2 * v (t, - x - - 2, - y, - z + - 2) - + 4.16666666666667e-3 * v (t, x - 2, y, - z + 3)) * ti2 (x - 2, - y, - z)) * - ti0 (x - 2, - y, - z) * ti1 (x - 2, y, z); - auto temp275 = (((3.33333333333333e-2 * (v (t, x, y, z) - v (t, x, y - 2, z)) + 4.16666666666667e-3 * (v (t, x, y - 3, z) - v (t, x, y + 1, z))) * ti3 (x, y - 1, - z) + - (-4.16666666666667e-2 * v (t, x, y - 1, z) - 1.25e-2 * v (t, x - 1, y - 1, z) + 7.5e-2 * v (t, x + 1, y - 1, - z) - 2.5e-2 * v (t, - x + 2, - y - 1, - z) + 4.16666666666667e-3 * v (t, x + 3, y - 1, z)) * ti1 (x, y - 1, - z)) * ti0 (x, - y - 1, - z) - + (4.16666666666667e-3 * (v (t, x, y - 1, z - 2) - v (t, x, y - 1, z + 2)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z - 1) + v (t, x, y - 1, z + 1))) * ti2 (x, y - 1, - z)) * ti0 (x, - y - 1, - z) * ti3 (x, y - 1, z); - auto temp237 = (((3.33333333333333e-2 * (-v (t, x, y, z) + v (t, x, y + 2, z)) + 4.16666666666667e-3 * (v (t, x, y - 1, z) - v (t, x, y + 3, z))) * ti3 (x, y + 1, - z) + - (-4.16666666666667e-2 * v (t, x, y + 1, z) - 1.25e-2 * v (t, x - 1, y + 1, z) + 7.5e-2 * v (t, x + 1, y + 1, - z) - 2.5e-2 * v (t, - x + 2, - y + 1, - z) + 4.16666666666667e-3 * v (t, x + 3, y + 1, z)) * ti1 (x, y + 1, - z)) * ti0 (x, - y + 1, - z) - + (4.16666666666667e-3 * (v (t, x, y + 1, z - 2) - v (t, x, y + 1, z + 2)) + 3.33333333333333e-2 * (-v (t, x, y + 1, z - 1) + v (t, x, y + 1, z + 1))) * ti2 (x, y + 1, - z)) * ti0 (x, - y + 1, - z) * ti3 (x, y + 1, z); - auto temp262 = (((4.16666666666667e-3 * (v (t, x, y, z) - v (t, x, y + 4, z)) + 3.33333333333333e-2 * (-v (t, x, y + 1, z) + v (t, x, y + 3, z))) * ti3 (x, y + 2, - z) + - 
(-4.16666666666667e-2 * v (t, x, y + 2, z) - 1.25e-2 * v (t, x - 1, y + 2, z) + 7.5e-2 * v (t, x + 1, y + 2, - z) - 2.5e-2 * v (t, - x + 2, - y + 2, - z) + 4.16666666666667e-3 * v (t, x + 3, y + 2, z)) * ti1 (x, y + 2, - z)) * ti0 (x, - y + 2, - z) - + (4.16666666666667e-3 * (v (t, x, y + 2, z - 2) - v (t, x, y + 2, z + 2)) + 3.33333333333333e-2 * (-v (t, x, y + 2, z - 1) + v (t, x, y + 2, z + 1))) * ti2 (x, y + 2, - z)) * ti0 (x, - y + 2, - z) * ti3 (x, y + 2, z); - auto temp68 = ((4.16666666666667e-3 * (v (t, x - 2, y, z) - v (t, x + 2, y, z)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z) + v (t, x + 1, y, z))) * ti1 (x, y, - z) + - (-4.16666666666667e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, y - 1, z) + 7.5e-2 * v (t, x, y + 1, - z) - 2.5e-2 * v (t, x, - y + 2, - z) + 4.16666666666667e-3 * v (t, x, y + 3, z)) * ti3 (x, y, z)) * ti0 (x, - y, - z) - + (-4.16666666666667e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, y, z - 1) + 7.5e-2 * v (t, x, y, - z + 1) - 2.5e-2 * v (t, - x, - y, - z + 2) + 4.16666666666667e-3 * v (t, x, y, z + 3)) * ti2 (x, y, z); - auto temp6 = 1.0 / (8.85879567828298e-1 * damp (x, y, z) + 2.0 * m (x, y, z)); - auto temp225 = (((4.16666666666667e-3 * (v (t, x + 1, y - 2, z) - v (t, x + 1, y + 2, z)) + 3.33333333333333e-2 * (-v (t, x + 1, y - 1, z) + v (t, x + 1, y + 1, z))) * ti3 (x + 1, y, - z) + - (-1.25e-2 * v (t, x, y, z) - 4.16666666666667e-2 * v (t, x + 1, y, z) + 7.5e-2 * v (t, x + 2, y, - z) - 2.5e-2 * v (t, x + 3, y, z) + 4.16666666666667e-3 * v (t, x + 4, y, - z)) * ti1 (x + 1, - y, - z)) * ti0 (x + 1, y, - z) + - (4.16666666666667e-3 * (v (t, x + 1, y, z - 2) - v (t, x + 1, y, z + 2)) + 3.33333333333333e-2 * (-v (t, x + 1, y, z - 1) + v (t, x + 1, y, z + 1))) * ti2 (x + 1, y, - z)) * ti0 (x + 1, - y, - z) * ti1 (x + 1, y, z); + z) + 2.08333333333333e-2 * ((-temp318 * ti1 (x, y, + z) + temp325 * ti3 (x, y, z)) * ti3 (x, y, + z) + ((temp318 * ti3 (x, y, z) + temp325 * ti1 (x, y, z)) * ti2 (x, + y, + z) - (4.16666666666667e-3 * (u (t, x, y, 
z - 2) - u (t, x, y, + z + 2)) + 3.33333333333333e-2 * (-u (t, x, y, z - 1) + u (t, + x, y, z + 1))) * ti0 (x, y, + z)) * ti1 (x, + y, + z) * ti2 (x, y, + z)) + 3.75e-2 * ((temp399 * ti3 (x, y - 1, z) - temp401 * ti1 (x, + y - 1, + z)) * ti1 (x, y - 1, + z) + (((4.16666666666667e-3 * (u (t, x - 2, y, z - 1) - u (t, x + 2, + y, + z - 1)) + 3.33333333333333e-2 * (-u (t, + x - 1, + y, + z - 1) + u (t, + x + 1, + y, + z - 1))) * ti1 (x, y, + z - 1) + (-4.16666666666667e-2 * u (t, + x, + y, + z - 1) - 1.25e-2 * u (t, x, y - 1, + z - 1) + 7.5e-2 * u (t, x, y + 1, + z - 1) - 2.5e-2 * u (t, x, y + 2, + z - 1) + 4.16666666666667e-3 * u (t, + x, + y + 3, + z - 1)) * ti3 (x, y, + z - 1)) * ti2 (x, y, + z - 1) - (7.5e-2 * u (t, x, y, z) - 1.25e-2 * u (t, x, y, + z - 2) - 4.16666666666667e-2 * u (t, x, y, + z - 1) - 2.5e-2 * u (t, x, y, + z + 1) + 4.16666666666667e-3 * u (t, x, y, + z + 2)) * ti0 (x, y, + z - 1)) * ti0 (x, y, + z - 1)) + 1.25e-2 * ((-temp392 * ti1 (x - 2, + y, + z) + temp394 * ti3 (x - 2, y, + z)) * ti3 (x - 2, y, + z) + ((temp392 * ti3 (x - 2, y, + z) + temp394 * ti1 (x - 2, + y, + z)) * ti2 (x - 2, y, + z) - (4.16666666666667e-3 * (u (t, x - 2, y, + z - 2) - u (t, + x - 2, y, + z + 2)) + 3.33333333333333e-2 * (-u (t, x - 2, y, + z - 1) + u (t, + x - 2, y, + z + 1))) * ti0 (x - 2, y, + z)) * ti1 (x - 2, y, + z) * ti2 (x - 2, + y, + z) + ((temp395 * ti1 (x, y - 2, + z) + temp396 * ti3 (x, + y - 2, + z)) * ti2 (x, y - 2, + z) - (-4.16666666666667e-2 * u (t, x, y - 2, + z) - 1.25e-2 * u (t, x, + y - 2, + z - 1) + 7.5e-2 * u (t, x, + y - 2, + z + 1) - 2.5e-2 * u (t, x, + y - 2, + z + 2) + 4.16666666666667e-3 * u (t, x, y - 2, + z + 3)) * ti0 (x, + y - 2, + z)) * ti2 (x, y - 2, + z) * ti3 (x, + y - 2, + z)) + + 6.25e-3 * ((-temp409 * ti1 (x + 1, y, z) + temp410 * ti3 (x + 1, y, + z)) * ti3 (x + 1, + y, + z) + ((temp404 * ti1 (x, y + 1, z) + temp406 * ti3 (x, y + 1, + z)) * ti2 (x, + y + 1, + z) + - (-4.16666666666667e-2 * u (t, x, y + 1, z) - 1.25e-2 * u (t, x, + y 
+ 1, z - 1) + 7.5e-2 * u (t, x, + y + 1, + z + 1) - 2.5e-2 * u (t, x, y + 1, + z + 2) + 4.16666666666667e-3 * u (t, x, + y + 1, + z + + 3)) * ti0 (x, y + 1, z)) * ti2 (x, y + 1, z) * ti3 (x, y + 1, + z) + ((temp409 * ti3 (x + 1, y, z) + temp410 * ti1 (x + 1, y, + z)) * ti2 (x + 1, + y, + z) + - + (4.16666666666667e-3 * + (u (t, x + 1, y, z - 2) - u (t, x + 1, y, z + 2)) + + 3.33333333333333e-2 * (-u (t, x + 1, y, z - 1) + u (t, x + 1, y, + z + 1))) * ti0 (x + 1, y, + z)) * ti1 (x + 1, + y, + z) * ti2 (x + 1, y, + z)) + + 1.66666666666667e-2 * ((temp343 * ti3 (x + 1, y, + z) - temp347 * ti1 (x + 1, y, z)) * ti3 (x + 1, + y, + z) + (-temp377 * ti1 (x, y - 1, z) + temp381 * ti3 (x, y - 1, + z)) * ti1 (x, + y - 1, + z) + ((temp336 * ti3 (x, y + 1, z) + temp342 * ti1 (x, y + 1, + z)) * ti2 (x, + y + 1, + z) - + (4.16666666666667e-3 * + (u (t, x, y + 1, z - 2) - u (t, x, y + 1, z + 2)) + + 3.33333333333333e-2 * (-u (t, x, y + 1, z - 1) + + u (t, x, y + 1, z + 1))) * ti0 (x, y + 1, + z)) * ti2 (x, + y + 1, + z) * ti3 (x, y + 1, + z) + ((temp343 * ti1 (x + 1, y, z) + temp347 * ti3 (x + 1, y, + z)) * ti2 (x + 1, y, + z) - (-4.16666666666667e-2 * u (t, x + 1, y, z) - 1.25e-2 * u (t, + x + 1, y, z - 1) + 7.5e-2 * u (t, + x + 1, + y, + z + 1) - 2.5e-2 * u (t, x + 1, y, + z + 2) + 4.16666666666667e-3 * u (t, x + 1, + y, + z + 3)) * ti0 (x + 1, y, z)) * ti1 (x + 1, y, z) * ti2 (x + 1, + y, + z) + (((4.16666666666667e-3 * (u (t, x, y - 2, z - 1) - u (t, x, + y + 2, z - 1)) + 3.33333333333333e-2 * (-u (t, x, y - 1, + z - 1) + u (t, x, y + 1, z - 1))) * ti3 (x, y, + z - 1) + (-4.16666666666667e-2 * u (t, x, y, + z - 1) - 1.25e-2 * u (t, x - 1, y, z - 1) + 7.5e-2 * u (t, + x + 1, y, + z - 1) - 2.5e-2 * u (t, x + 2, y, + z - 1) + 4.16666666666667e-3 * u (t, + x + 3, + y, + z - 1)) * ti1 (x, y, z - 1)) * ti2 (x, y, + z - 1) - (3.33333333333333e-2 * (u (t, x, y, z) - u (t, x, y, + z - 2)) + 4.16666666666667e-3 * (u (t, x, y, + z - 3) - u (t, x, y, + z + 1))) * ti0 (x, y, + z - 1)) * 
ti0 (x, + y, + z + - + 1)) + + 2.08333333333333e-3 * ((temp360 * ti3 (x - 2, y, + z) - temp365 * ti1 (x - 2, y, z)) * ti3 (x - 2, + y, + z) + (-temp367 * ti1 (x, y + 2, z) + temp370 * ti3 (x, y + 2, + z)) * ti1 (x, + y + 2, + z) + (temp388 * ti3 (x, y - 3, z) - temp390 * ti1 (x, y - 3, + z)) * ti1 (x, + y - 3, + z) + + ((temp352 * ti3 (x, y - 2, z) + temp357 * ti1 (x, y - 2, + z)) * ti2 (x, + y - 2, + z) + - (4.16666666666667e-3 * (u (t, x, y - 2, z - 2) - u (t, x, y - 2, + z + 2)) + 3.33333333333333e-2 * (-u (t, x, y - 2, + z - 1) + u (t, x, y - 2, z + 1))) * ti0 (x, y - 2, + z)) * ti2 (x, + y - 2, + z) * ti3 (x, y - 2, + z) + + ((temp360 * ti1 (x - 2, y, z) + temp365 * ti3 (x - 2, y, + z)) * ti2 (x - 2, y, + z) - (-4.16666666666667e-2 * u (t, x - 2, y, z) - 1.25e-2 * u (t, + x - 2, y, z - 1) + 7.5e-2 * u (t, + x - 2, + y, + z + 1) - 2.5e-2 * u (t, x - 2, y, + z + 2) + 4.16666666666667e-3 * u (t, x - 2, + y, + z + 3)) * ti0 (x - 2, y, z)) * ti1 (x - 2, y, z) * ti2 (x - 2, + y, + z) + (((4.16666666666667e-3 * (u (t, x, y - 2, z + 2) - u (t, x, + y + 2, z + 2)) + 3.33333333333333e-2 * (-u (t, x, y - 1, + z + 2) + u (t, x, y + 1, z + 2))) * ti3 (x, y, + z + 2) + (-4.16666666666667e-2 * u (t, x, y, + z + 2) - 1.25e-2 * u (t, x - 1, y, z + 2) + 7.5e-2 * u (t, + x + 1, y, + z + 2) - 2.5e-2 * u (t, x + 2, y, + z + 2) + 4.16666666666667e-3 * u (t, + x + 3, + y, + z + 2)) * ti1 (x, y, z + 2)) * ti2 (x, y, + z + + 2) - + (4.16666666666667e-3 * (u (t, x, y, z) - u (t, x, y, + z + 4)) + 3.33333333333333e-2 * (-u (t, x, y, z + 1) + u (t, + x, y, z + 3))) * ti0 (x, y, + z + 2)) * ti0 (x, + y, + z + + + 2) + + (((4.16666666666667e-3 * (u (t, x - 2, y, z - 3) - u (t, x + 2, y, + z - 3)) + 3.33333333333333e-2 * (-u (t, x - 1, y, + z - 3) + u (t, x + 1, y, z - 3))) * ti1 (x, y, + z - 3) + (-4.16666666666667e-2 * u (t, x, y, + z - 3) - 1.25e-2 * u (t, x, y - 1, z - 3) + 7.5e-2 * u (t, x, + y + 1, + z - 3) - 2.5e-2 * u (t, x, y + 2, + z - 3) + 4.16666666666667e-3 * u (t, x, + y + 3, 
+ z - 3)) * ti3 (x, y, z - 3)) * ti2 (x, y, + z - 3) - (4.16666666666667e-3 * u (t, x, + y, + z) - 1.25e-2 * u (t, x, y, + z - 4) - 4.16666666666667e-2 * u (t, x, + y, + z - 3) + 7.5e-2 * u (t, x, y, + z - 2) - 2.5e-2 * u (t, x, y, + z - 1)) * ti0 (x, + y, + z - 3)) * ti0 (x, y, z - 3)) - (temp420 * ti0 (x, y, + z) + 2.08333333333333e-2 * (temp330 * ti3 (x, y, + z) - temp333 * ti1 (x, y, + z)) * ti1 (x, y, + z) + 1.66666666666667e-2 * (-temp336 * ti1 (x, y + 1, + z) + temp342 * ti3 (x, + y + 1, + z)) * ti1 (x, y + 1, + z) + 2.08333333333333e-3 * (-temp352 * ti1 (x, y - 2, + z) + temp357 * ti3 (x, + y - 2, + z)) * ti1 (x, y - 2, + z) + 2.08333333333333e-3 * (temp372 * ti3 (x + 2, y, + z) - temp375 * ti1 (x + 2, y, + z)) * ti3 (x + 2, y, + z) + 1.66666666666667e-2 * (temp383 * ti3 (x - 1, y, + z) - temp385 * ti1 (x - 1, y, + z)) * ti3 (x - 1, y, + z) + 2.08333333333333e-3 * (-temp386 * ti1 (x - 3, y, + z) + temp387 * ti3 (x - 3, y, + z)) * ti3 (x - 3, y, + z) + 1.25e-2 * (temp395 * ti3 (x, + y - 2, + z) - temp396 * ti1 (x, + y - 2, + z)) * ti1 (x, y - 2, + z) + 6.25e-3 * (temp404 * ti3 (x, + y + 1, + z) - temp406 * ti1 (x, + y + 1, + z)) * ti1 (x, y + 1, + z) + 3.75e-2 * (-temp407 * ti1 (x - 1, y, + z) + temp408 * ti3 (x - 1, y, + z)) * ti3 (x - 1, y, + z) + 2.08333333333333e-3 * ((temp367 * ti3 (x, y + 2, + z) + temp370 * ti1 (x, + y + 2, + z)) * ti2 (x, y + 2, + z) - (4.16666666666667e-3 * (u (t, x, y + 2, + z - 2) - u (t, x, + y + 2, + z + 2)) + 3.33333333333333e-2 * (-u (t, x, + y + 2, + z - 1) + u (t, x, + y + 2, + z + 1))) * ti0 (x, y + 2, + z)) * ti2 (x, + y + 2, + z) * ti3 (x, + y + 2, + z) + 2.08333333333333e-3 * ((temp372 * ti1 (x + 2, y, + z) + temp375 * ti3 (x + 2, + y, + z)) * ti2 (x + 2, y, + z) - (-4.16666666666667e-2 * u (t, x + 2, y, + z) - 1.25e-2 * u (t, + x + 2, + y, + z - 1) + 7.5e-2 * u (t, + x + 2, + y, + z + 1) - 2.5e-2 * u (t, + x + 2, + y, + z + 2) + 4.16666666666667e-3 * u (t, x + 2, y, + z + 3)) * ti0 (x + 2, + y, + z)) * ti1 (x + 2, y, + 
z) * ti2 (x + 2, + y, + z) + 1.66666666666667e-2 * ((temp377 * ti3 (x, y - 1, + z) + temp381 * ti1 (x, + y - 1, + z)) * ti2 (x, y - 1, + z) - (4.16666666666667e-3 * (u (t, x, y - 1, + z - 2) - u (t, x, + y - 1, + z + 2)) + 3.33333333333333e-2 * (-u (t, x, + y - 1, + z - 1) + u (t, x, + y - 1, + z + 1))) * ti0 (x, y - 1, + z)) * ti2 (x, + y - 1, + z) * ti3 (x, + y - 1, + z) + 1.66666666666667e-2 * ((temp383 * ti1 (x - 1, y, + z) + temp385 * ti3 (x - 1, + y, + z)) * ti2 (x - 1, y, + z) - (-4.16666666666667e-2 * u (t, x - 1, y, + z) - 1.25e-2 * u (t, + x - 1, + y, + z - 1) + 7.5e-2 * u (t, + x - 1, + y, + z + 1) - 2.5e-2 * u (t, + x - 1, + y, + z + 2) + 4.16666666666667e-3 * u (t, x - 1, y, + z + 3)) * ti0 (x - 1, + y, + z)) * ti1 (x - 1, y, + z) * ti2 (x - 1, + y, + z) + 2.08333333333333e-3 * ((temp386 * ti3 (x - 3, y, + z) + temp387 * ti1 (x - 3, + y, + z)) * ti2 (x - 3, y, + z) - (4.16666666666667e-3 * (u (t, x - 3, y, + z - 2) - u (t, + x - 3, y, + z + 2)) + 3.33333333333333e-2 * (-u (t, + x - 3, + y, + z - 1) + u (t, + x - 3, + y, + z + 1))) * ti0 (x - 3, y, + z)) * ti1 (x - 3, + y, + z) * ti2 (x - 3, y, + z) + 2.08333333333333e-3 * ((temp388 * ti1 (x, y - 3, + z) + temp390 * ti3 (x, + y - 3, + z)) * ti2 (x, y - 3, + z) - (-4.16666666666667e-2 * u (t, x, y - 3, + z) - 1.25e-2 * u (t, x, + y - 3, + z - 1) + 7.5e-2 * u (t, x, + y - 3, + z + 1) - 2.5e-2 * u (t, x, + y - 3, + z + 2) + 4.16666666666667e-3 * u (t, x, y - 3, + z + 3)) * ti0 (x, + y - 3, + z)) * ti2 (x, y - 3, + z) * ti3 (x, + y - 3, + z) + 3.75e-2 * ((temp399 * ti1 (x, y - 1, + z) + temp401 * ti3 (x, + y - 1, + z)) * ti2 (x, y - 1, + z) - (-4.16666666666667e-2 * u (t, x, y - 1, + z) - 1.25e-2 * u (t, x, + y - 1, + z - 1) + 7.5e-2 * u (t, x, + y - 1, + z + 1) - 2.5e-2 * u (t, x, + y - 1, + z + 2) + 4.16666666666667e-3 * u (t, x, y - 1, + z + 3)) * ti0 (x, + y - 1, + z)) * ti2 (x, y - 1, + z) * ti3 (x, + y - 1, + z) + 3.75e-2 * ((temp407 * ti3 (x - 1, y, + z) + temp408 * ti1 (x - 1, + y, + z)) * ti2 (x - 
1, y, + z) - (4.16666666666667e-3 * (u (t, x - 1, y, + z - 2) - u (t, + x - 1, + y, + z + 2)) + 3.33333333333333e-2 * (-u (t, + x - 1, y, + z - 1) + u (t, + x - 1, + y, + z + 1))) * ti0 (x - 1, y, + z)) * ti1 (x - 1, + y, + z) * ti2 (x - 1, y, + z) + + 2.08333333333333e-3 * + (((4.16666666666667e-3 * (u (t, x, y - 2, z - 2) - u (t, x, y + 2, + z - 2)) + 3.33333333333333e-2 * (-u (t, + x, + y - 1, + z - 2) + u (t, x, + y + 1, + z - 2))) * ti3 (x, y, + z - 2) + (-4.16666666666667e-2 * u (t, + x, + y, + z - 2) - 1.25e-2 * u (t, + x - 1, y, + z - 2) + 7.5e-2 * u (t, + x + 1, + y, + z - 2) - 2.5e-2 * u (t, + x + 2, + y, + z - 2) + 4.16666666666667e-3 * u (t, + x + 3, + y, + z - 2)) * ti1 (x, y, + z - 2)) * ti2 (x, y, + z - 2) - (4.16666666666667e-3 * (-u (t, x, y, z) + u (t, x, y, + z - 4)) + 3.33333333333333e-2 * (-u (t, x, y, + z - 3) + u (t, x, + y, + z - 1))) * ti0 (x, + y, + z - 2)) * ti0 (x, y, + z - 2) + + 1.66666666666667e-2 * + (((4.16666666666667e-3 * (u (t, x, y - 2, z + 1) - u (t, x, y + 2, + z + 1)) + 3.33333333333333e-2 * (-u (t, + x, + y - 1, + z + 1) + u (t, x, + y + 1, + z + 1))) * ti3 (x, y, + z + 1) + (-4.16666666666667e-2 * u (t, + x, + y, + z + 1) - 1.25e-2 * u (t, + x - 1, y, + z + 1) + 7.5e-2 * u (t, + x + 1, + y, + z + 1) - 2.5e-2 * u (t, + x + 2, + y, + z + 1) + 4.16666666666667e-3 * u (t, + x + 3, + y, + z + 1)) * ti1 (x, y, + z + 1)) * ti2 (x, y, + z + 1) - (3.33333333333333e-2 * (-u (t, x, y, z) + u (t, x, y, + z + 2)) + + 4.16666666666667e-3 * (u (t, x, y, z - 1) - u (t, x, y, + z + 3))) * ti0 (x, + y, + z + 1)) * ti0 (x, y, + z + 1) + + 1.25e-2 * + (((4.16666666666667e-3 * (u (t, x - 2, y, z - 2) - u (t, x + 2, y, + z - 2)) + 3.33333333333333e-2 * (-u (t, + x - 1, + y, + z - 2) + u (t, + x + 1, + y, + z - 2))) * ti1 (x, y, + z - 2) + (-4.16666666666667e-2 * u (t, + x, + y, + z - 2) - 1.25e-2 * u (t, x, + y - 1, + z - 2) + 7.5e-2 * u (t, x, + y + 1, + z - 2) - 2.5e-2 * u (t, x, + y + 2, + z - 2) + 4.16666666666667e-3 * u (t, + x, + y + 3, + 
z - 2)) * ti3 (x, y, + z - 2)) * ti2 (x, y, + z - 2) - (-2.5e-2 * u (t, x, y, + z) - 1.25e-2 * u (t, x, + y, + z - 3) - 4.16666666666667e-2 * u (t, x, y, + z - 2) + 7.5e-2 * u (t, x, + y, + z - 1) + 4.16666666666667e-3 * u (t, x, y, + z + 1)) * ti0 (x, + y, + z - 2)) * ti0 (x, y, + z - 2) + + 6.25e-3 * + (((4.16666666666667e-3 * (u (t, x - 2, y, z + 1) - u (t, x + 2, y, + z + 1)) + 3.33333333333333e-2 * (-u (t, + x - 1, + y, + z + 1) + u (t, + x + 1, + y, + z + 1))) * ti1 (x, y, + z + 1) + (-4.16666666666667e-2 * u (t, + x, + y, + z + 1) - 1.25e-2 * u (t, x, + y - 1, + z + 1) + 7.5e-2 * u (t, x, + y + 1, + z + 1) - 2.5e-2 * u (t, x, + y + 2, + z + 1) + 4.16666666666667e-3 * u (t, + x, + y + 3, + z + 1)) * ti3 (x, y, + z + 1)) * ti2 (x, y, + z + 1) - (-1.25e-2 * u (t, x, y, + z) - 4.16666666666667e-2 * u (t, x, y, + z + 1) + 7.5e-2 * u (t, x, + y, + z + 2) - 2.5e-2 * u (t, + x, + y, + z + 3) + 4.16666666666667e-3 * u (t, x, y, + z + 4)) * ti0 (x, + y, + z + 1)) * ti0 (x, y, z + 1)); + auto temp290 = + (((4.16666666666667e-3 * (-v (t, x, y, z) + v (t, x - 4, y, + z)) + 3.33333333333333e-2 * (-v (t, x - 3, y, z) + v (t, + x - 1, y, z))) * ti1 (x - 2, y, + z) + (-4.16666666666667e-2 * v (t, x - 2, y, z) - 1.25e-2 * v (t, + x - 2, y - 1, z) + 7.5e-2 * v (t, x - 2, y + 1, + z) - 2.5e-2 * v (t, + x - 2, + y + 2, + z) + 4.16666666666667e-3 * v (t, x - 2, y + 3, z)) * ti3 (x - 2, + y, + z)) * ti0 (x - 2, + y, + z) + (-4.16666666666667e-2 * v (t, x - 2, y, + z) - 1.25e-2 * v (t, + x - 2, + y, + z - 1) + 7.5e-2 * v (t, x - 2, y, z + 1) - 2.5e-2 * v (t, + x - 2, + y, + z + 2) + 4.16666666666667e-3 * v (t, x - 2, y, + z + 3)) * ti2 (x - 2, + y, + z)) * ti0 (x - 2, + y, + z) * ti1 (x - 2, y, z); + auto temp275 = + (((3.33333333333333e-2 * (v (t, x, y, z) - v (t, x, y - 2, + z)) + 4.16666666666667e-3 * (v (t, x, y - 3, z) - v (t, x, + y + 1, z))) * ti3 (x, y - 1, + z) + (-4.16666666666667e-2 * v (t, x, y - 1, z) - 1.25e-2 * v (t, + x - 1, y - 1, z) + 7.5e-2 * v (t, x + 1, y - 1, 
+ z) - 2.5e-2 * v (t, + x + 2, + y - 1, + z) + 4.16666666666667e-3 * v (t, x + 3, y - 1, z)) * ti1 (x, + y - 1, + z)) * ti0 (x, + y - 1, + z) + + (4.16666666666667e-3 * (v (t, x, y - 1, z - 2) - v (t, x, y - 1, + z + 2)) + 3.33333333333333e-2 * (-v (t, x, y - 1, z - 1) + v (t, + x, y - 1, z + 1))) * ti2 (x, y - 1, + z)) * ti0 (x, + y - 1, + z) * ti3 (x, y - 1, z); + auto temp237 = + (((3.33333333333333e-2 * (-v (t, x, y, z) + v (t, x, y + 2, + z)) + 4.16666666666667e-3 * (v (t, x, y - 1, z) - v (t, x, + y + 3, z))) * ti3 (x, y + 1, + z) + (-4.16666666666667e-2 * v (t, x, y + 1, z) - 1.25e-2 * v (t, + x - 1, y + 1, z) + 7.5e-2 * v (t, x + 1, y + 1, + z) - 2.5e-2 * v (t, + x + 2, + y + 1, + z) + 4.16666666666667e-3 * v (t, x + 3, y + 1, z)) * ti1 (x, + y + 1, + z)) * ti0 (x, + y + 1, + z) + + (4.16666666666667e-3 * (v (t, x, y + 1, z - 2) - v (t, x, y + 1, + z + 2)) + 3.33333333333333e-2 * (-v (t, x, y + 1, z - 1) + v (t, + x, y + 1, z + 1))) * ti2 (x, y + 1, + z)) * ti0 (x, + y + 1, + z) * ti3 (x, y + 1, z); + auto temp262 = + (((4.16666666666667e-3 * (v (t, x, y, z) - v (t, x, y + 4, + z)) + 3.33333333333333e-2 * (-v (t, x, y + 1, z) + v (t, x, + y + 3, z))) * ti3 (x, y + 2, + z) + (-4.16666666666667e-2 * v (t, x, y + 2, z) - 1.25e-2 * v (t, + x - 1, y + 2, z) + 7.5e-2 * v (t, x + 1, y + 2, + z) - 2.5e-2 * v (t, + x + 2, + y + 2, + z) + 4.16666666666667e-3 * v (t, x + 3, y + 2, z)) * ti1 (x, + y + 2, + z)) * ti0 (x, + y + 2, + z) + + (4.16666666666667e-3 * (v (t, x, y + 2, z - 2) - v (t, x, y + 2, + z + 2)) + 3.33333333333333e-2 * (-v (t, x, y + 2, z - 1) + v (t, + x, y + 2, z + 1))) * ti2 (x, y + 2, + z)) * ti0 (x, + y + 2, + z) * ti3 (x, y + 2, z); + auto temp68 = + ((4.16666666666667e-3 * (v (t, x - 2, y, z) - v (t, x + 2, y, + z)) + 3.33333333333333e-2 * (-v (t, x - 1, y, z) + v (t, x + 1, + y, z))) * ti1 (x, y, + z) + (-4.16666666666667e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, + y - 1, z) + 7.5e-2 * v (t, x, y + 1, + z) - 2.5e-2 * v (t, x, + y + 2, + z) + 
4.16666666666667e-3 * v (t, x, y + 3, z)) * ti3 (x, y, + z)) * ti0 (x, + y, + z) + + (-4.16666666666667e-2 * v (t, x, y, z) - 1.25e-2 * v (t, x, y, + z - 1) + 7.5e-2 * v (t, x, y, + z + 1) - 2.5e-2 * v (t, + x, + y, + z + 2) + 4.16666666666667e-3 * v (t, x, y, z + 3)) * ti2 (x, y, z); + auto temp6 = + 1.0 / (8.85879567828298e-1 * damp (x, y, z) + 2.0 * m (x, y, z)); + auto temp225 = + (((4.16666666666667e-3 * (v (t, x + 1, y - 2, z) - v (t, x + 1, y + 2, + z)) + 3.33333333333333e-2 * (-v (t, x + 1, y - 1, z) + v (t, + x + 1, y + 1, z))) * ti3 (x + 1, y, + z) + (-1.25e-2 * v (t, x, y, z) - 4.16666666666667e-2 * v (t, + x + 1, y, z) + 7.5e-2 * v (t, x + 2, y, + z) - 2.5e-2 * v (t, x + 3, y, z) + 4.16666666666667e-3 * v (t, + x + 4, y, + z)) * ti1 (x + 1, + y, + z)) * ti0 (x + 1, y, + z) + + (4.16666666666667e-3 * (v (t, x + 1, y, z - 2) - v (t, x + 1, y, + z + 2)) + 3.33333333333333e-2 * (-v (t, x + 1, y, z - 1) + v (t, + x + 1, y, z + 1))) * ti2 (x + 1, y, + z)) * ti0 (x + 1, + y, + z) * ti1 (x + 1, y, z); // Next time-step values. 
u (t + 1, x, y, - z) EQUALS temp6 *(temp10 * u (t - 1, x, y, - z) + - 1.5695652173913 * (temp487 * epsilon (x, y, z) + - (2.08333333333333e-2 * - (temp56 + - temp68 * (ti0 (x, y, z) * - ti3 (x, y, - z) + ti2 (x, y, - z))) + - 1.25e-2 * (temp185 + temp284 + - temp308) - - 3.75e-2 * (temp204 + temp286 + - temp309) + - 6.25e-3 * (temp225 + temp288 + - temp310) + - 1.66666666666667e-2 * - (-temp154 + temp237 - temp275 + - temp289 - temp299 + temp92) + 2.08333333333333e-3 * (temp116 - temp133 - temp165 + temp252 - temp262 - temp283 + temp290 - temp298 - temp307)) * delta (x, y, z)) + 4.0 * m (x, y, z) * u (t, x, y, z)); - v (t + 1, x, y, - z) EQUALS temp6 *(temp10 * v (t - 1, x, y, - z) + 1.5695652173913 * temp487 * delta (x, - y, - z) - + 3.26992753623188e-2 * (temp56 + - temp68 * (ti0 (x, y, z) * - ti3 (x, y, - z) + ti2 (x, - y, - z))) - + 1.96195652173913e-2 * (temp185 + temp284 + - temp308) - - 5.88586956521739e-2 * (temp204 + temp286 + temp309) + - 9.80978260869565e-3 * (temp225 + temp288 + temp310) + - 2.61594202898551e-2 * (-temp154 + temp237 - temp275 + - temp289 - temp299 + temp92) + 3.26992753623188e-3 * (temp116 - temp133 - temp165 + temp252 - temp262 - temp283 + temp290 - temp298 - temp307) + 4.0 * m (x, y, z) * v (t, x, y, z)); + z) EQUALS temp6 *(temp10 * u (t - 1, x, y, + z) + + 1.5695652173913 * (temp487 * epsilon (x, y, z) + + (2.08333333333333e-2 * + (temp56 + + temp68 * (ti0 (x, y, z) * + ti3 (x, y, + z) + ti2 (x, y, + z))) + + 1.25e-2 * (temp185 + temp284 + + temp308) - + 3.75e-2 * (temp204 + temp286 + + temp309) + + 6.25e-3 * (temp225 + temp288 + + temp310) + + 1.66666666666667e-2 * + (-temp154 + temp237 - temp275 + + temp289 - temp299 + temp92) + 2.08333333333333e-3 * (temp116 - + temp133 - temp165 + temp252 - temp262 - temp283 + temp290 - + temp298 - temp307)) * delta (x, y, z)) + 4.0 * m (x, y, + z) * u (t, x, y, z)); + v (t + 1, x, y, z) EQUALS temp6 *(temp10 * v (t - 1, x, y, + z) + 1.5695652173913 * temp487 * delta (x, y, + z) + 
3.26992753623188e-2 * (temp56 + temp68 * (ti0 (x, y, + z) * ti3 (x, y, z) + ti2 (x, y, + z))) + 1.96195652173913e-2 * (temp185 + temp284 + temp308) - + 5.88586956521739e-2 * (temp204 + temp286 + temp309) + + 9.80978260869565e-3 * (temp225 + temp288 + temp310) + + 2.61594202898551e-2 * (-temp154 + temp237 - temp275 + temp289 - + temp299 + temp92) + 3.26992753623188e-3 * (temp116 - temp133 - + temp165 + temp252 - temp262 - temp283 + temp290 - temp298 - + temp307) + 4.0 * m (x, y, z) * v (t, x, y, z)); } @@ -1978,7 +1939,9 @@ namespace break; default: { - yask_exception e ("Error: only radius values of 2 and 4 are currently supported for the TTI stencil example"); + yask_exception + e + ("Error: only radius values of 2 and 4 are currently supported for the TTI stencil example"); throw e; } } diff --git a/src/stencils/TestStencils.cpp b/src/stencils/TestStencils.cpp index 61fc9a90..b74e8d11 100644 --- a/src/stencils/TestStencils.cpp +++ b/src/stencils/TestStencils.cpp @@ -1,7 +1,7 @@ /***************************************************************************** YASK: Yet Another Stencil Kit -Copyright (c) 2014-2021, Intel Corporation +Copyright (c) 2014-2022, Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to @@ -24,6 +24,7 @@ IN THE SOFTWARE. *****************************************************************************/ // Tests for various YASK DSL features. +// Most tests use whole-number calculations for more robust error-checking. // YASK stencil solution(s) in this file will be integrated into the YASK compiler utility. #include "yask_compiler_api.hpp" @@ -47,28 +48,29 @@ namespace { yc_index_node_ptr z = new_domain_index("z"); // spatial dim. // Define some stencils in different dimensions. - // These will be asymmetrical if any of the '*_ext' params are not 0; + // The size is based on the 'radius' option. 
+ // These will be asymmetrical if any of the '*_ext' params are not the same; // Define simple stencil from var 'V' at 't0' centered around 'x0'. // Extend given radius left and/or right w/'*_ext'. virtual yc_number_node_ptr def_1d(yc_var_proxy& V, const yc_number_node_ptr& t0, const yc_number_node_ptr& x0, int left_ext, int right_ext) { + auto r = get_radius(); yc_number_node_ptr v; - int n = 0; - for (int i = -get_radius() - left_ext; i <= get_radius() + right_ext; i++, n++) + for (int i = -r - left_ext; i <= r + right_ext; i++) v += V(t0, x0+i); - return v / n; + return v; } // Define simple stencil from scratch or read-only var 'V' centered // around 'x0'. Similar to 'def_1d()', but doesn't use step var. virtual yc_number_node_ptr def_no_t_1d(yc_var_proxy& V, const yc_number_node_ptr& x0, int left_ext, int right_ext) { + auto r = get_radius(); yc_number_node_ptr v; - int n = 0; - for (int i = -get_radius() - left_ext; i <= get_radius() + right_ext; i++, n++) + for (int i = -r - left_ext; i <= r + right_ext; i++) v += V(x0+i); - return v / n; + return v; } // Define simple stencil from var 'V' at 't0' centered around 'x0', 'y0'. 
@@ -79,14 +81,12 @@ namespace { int x_left_ext, int x_right_ext, const yc_number_node_ptr& y0, int y_left_ext, int y_right_ext) { + auto r = get_radius(); yc_number_node_ptr v; - int n = 0; - for (int i : { -get_radius() - x_left_ext, 0, get_radius() + x_right_ext }) - for (int j : { -get_radius() - y_left_ext, 0, get_radius() + y_right_ext }) { + for (int i : { -r - x_left_ext, 0, r + x_right_ext }) + for (int j : { -r - y_left_ext, 0, r + y_right_ext }) v += V(t0, x0+i, y0+j); - n++; - } - return v / n; + return v; } // Define simple stencil from scratch or read-only var 'V' at 't0' @@ -97,14 +97,12 @@ namespace { int x_left_ext, int x_right_ext, const yc_number_node_ptr& y0, int y_left_ext, int y_right_ext) { + auto r = get_radius(); yc_number_node_ptr v; - int n = 0; - for (int i : { -get_radius() - x_left_ext, 0, get_radius() + x_right_ext }) - for (int j : { -get_radius() - y_left_ext, 0, get_radius() + y_right_ext }) { + for (int i : { -r - x_left_ext, 0, r + x_right_ext }) + for (int j : { -r - y_left_ext, 0, r + y_right_ext }) v += V(x0+i, y0+j); - n++; - } - return v / n; + return v; } // Define simple stencil from var 'V' at 't0' centered around 'x0', 'y0', 'z0'. 
@@ -117,15 +115,13 @@ namespace { int y_left_ext, int y_right_ext, const yc_number_node_ptr& z0, int z_left_ext, int z_right_ext) { - yc_number_node_ptr v; - int n = 0; - for (int i : { -get_radius() - x_left_ext, 0, get_radius() + x_right_ext }) - for (int j : { -get_radius() - y_left_ext, 0, get_radius() + y_right_ext }) - for (int k : { -get_radius() - z_left_ext, 0, get_radius() + z_right_ext }) { + auto r = get_radius(); + yc_number_node_ptr v = V(t0, x0, y0, z0); + for (int i : { -r - x_left_ext, r + x_right_ext }) + for (int j : { -r - y_left_ext, r + y_right_ext }) + for (int k : { -r - z_left_ext, r + z_right_ext }) v += V(t0, x0+i, y0+j, z0+k); - n++; - } - return v / n; + return v; } // Define simple stencil from scratch or read-only var 'V' centered @@ -138,15 +134,13 @@ namespace { int y_left_ext, int y_right_ext, const yc_number_node_ptr& z0, int z_left_ext, int z_right_ext) { - yc_number_node_ptr v; - int n = 0; - for (int i : { -get_radius() - x_left_ext, 0, get_radius() + x_right_ext }) - for (int j : { -get_radius() - y_left_ext, 0, get_radius() + y_right_ext }) - for (int k : { -get_radius() - z_left_ext, 0, get_radius() + z_right_ext }) { + auto r = get_radius(); + yc_number_node_ptr v = V(x0, y0, z0); + for (int i : { -r - x_left_ext, r + x_right_ext }) + for (int j : { -r - y_left_ext, r + y_right_ext }) + for (int k : { -r - z_left_ext, r + z_right_ext }) v += V(x0+i, y0+j, z0+k); - n++; - } - return v / n; + return v; } // Define simple stencil from var 'V' at 't0' centered around 'w0', 'x0', 'y0', 'z0'. 
@@ -161,16 +155,14 @@ namespace { int y_left_ext, int y_right_ext, const yc_number_node_ptr& z0, int z_left_ext, int z_right_ext) { - yc_number_node_ptr v; - int n = 0; - for (int h : { -get_radius() - w_left_ext, 0, get_radius() + w_right_ext }) - for (int i : { -get_radius() - x_left_ext, 0, get_radius() + x_right_ext }) - for (int j : { -get_radius() - y_left_ext, 0, get_radius() + y_right_ext }) - for (int k : { -get_radius() - z_left_ext, 0, get_radius() + z_right_ext }) { + auto r = get_radius(); + yc_number_node_ptr v = V(t0, w0, x0, y0, z0); + for (int h : { -r - w_left_ext, r + w_right_ext }) + for (int i : { -r - x_left_ext, r + x_right_ext }) + for (int j : { -r - y_left_ext, r + y_right_ext }) + for (int k : { -r - z_left_ext, r + z_right_ext }) v += V(t0, w0+h, x0+i, y0+j, z0+k); - n++; - } - return v / n; + return v; } public: @@ -340,7 +332,10 @@ namespace { // Time-varying var. Intermix last domain dim with misc dims to make // sure compiler creates correct layout. - yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x, a, y, b, c }); + yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x, a, y, b, c }); + + // Misc-only var. + yc_var_proxy B = yc_var_proxy("B", get_soln(), { c, b }); public: @@ -352,16 +347,17 @@ namespace { // Define the value at t+1 using asymmetric stencil // with various pos & neg indices in misc dims. 
- yc_number_node_ptr v = A(t, x, 0, y, -1, 2) + 1.0; - for (int r = 1; r <= get_radius(); r++) - v += A(t, x + r, 3, y, 0, 1); - for (int r = 1; r <= get_radius() + 1; r++) - v += A(t, x - r, 4, y, 2, 1); - for (int r = 1; r <= get_radius() + 2; r++) - v += A(t, x, -2, y + r, 2, 0); - for (int r = 1; r <= get_radius() + 3; r++) - v += A(t, x, 0, y - r, 0, -1); - A(t+1, x, 1, y, 2, 3) EQUALS v; + auto r = get_radius(); + yc_number_node_ptr v = A(t, x, 0, y, 1, 2) + 1.0; + for (int i = 1; i <= r; i++) + v += A(t, x + i, 3, y, 0, 3); + for (int i = 1; i <= r + 1; i++) + v += A(t, x - i, 4, y, 2, 2); + for (int i = 1; i <= r + 2; i++) + v += A(t, x, -2, y + i, 2, 2); + for (int i = 1; i <= r + 3; i++) + v += A(t, x, 0, y - i, 0, 3); + A(t+1, x, 1, y, 2, 3) EQUALS v + B(-2, 3) - B(4, -2); } }; @@ -370,14 +366,86 @@ namespace { // '-stencil' commmand-line option or the 'stencil=' build option. static TestMisc2dStencil TestMisc2dStencil_instance; - - // A "stream-like" stencil that just reads and writes + // "Stream-like" stencils that just read and write // with no spatial offsets. // The radius controls how many reads are done in the time domain. // Running with radius=2 should give performance comparable to // (but not identical to) the stream 'triad' benchmark. + class StreamStencil1 : public yc_solution_with_radius_base { + + protected: + + // Indices & dimensions. + yc_index_node_ptr t = new_step_index("t"); // step in time dim. + yc_index_node_ptr x = new_domain_index("x"); // spatial dim. + + // Vars. + yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x }); // time-varying 3D var. + + public: + + StreamStencil1(int radius=2) : + yc_solution_with_radius_base("test_stream_1d", radius) { } + virtual ~StreamStencil1() { } + + // Define equation to read 'get_radius()' values and write one. + virtual void define() { + + yc_number_node_ptr v; + + // Add 'get_radius()' values from past time-steps. 
+ for (int r = 0; r < get_radius(); r++) + v += A(t-r, x); + + // define the value at t+1 to be equivalent to v + 1. + A(t+1, x) EQUALS v + 1; + } + }; + + // Create an object of type 'StreamStencil1', + // making it available in the YASK compiler utility via the + // '-stencil' commmand-line option or the 'stencil=' build option. + static StreamStencil1 StreamStencil1_instance; + + class StreamStencil2 : public yc_solution_with_radius_base { - class StreamStencil : public yc_solution_with_radius_base { + protected: + + // Indices & dimensions. + yc_index_node_ptr t = new_step_index("t"); // step in time dim. + yc_index_node_ptr x = new_domain_index("x"); // spatial dim. + yc_index_node_ptr y = new_domain_index("y"); // spatial dim. + + // Vars. + yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x, y }); // time-varying 3D var. + + public: + + StreamStencil2(int radius=2) : + yc_solution_with_radius_base("test_stream_2d", radius) { } + virtual ~StreamStencil2() { } + + // Define equation to read 'get_radius()' values and write one. + virtual void define() { + + yc_number_node_ptr v; + + // Add 'get_radius()' values from past time-steps. + for (int r = 0; r < get_radius(); r++) + v += A(t-r, x, y); + + // define the value at t+1 to be equivalent to v + 1. + A(t+1, x, y) EQUALS v + 1; + } + }; + + // Create an object of type 'StreamStencil2', + // making it available in the YASK compiler utility via the + // '-stencil' commmand-line option or the 'stencil=' build option. + static StreamStencil2 StreamStencil2_instance; + + + class StreamStencil3 : public yc_solution_with_radius_base { protected: @@ -392,9 +460,9 @@ namespace { public: - StreamStencil(int radius=2) : + StreamStencil3(int radius=2) : yc_solution_with_radius_base("test_stream_3d", radius) { } - virtual ~StreamStencil() { } + virtual ~StreamStencil3() { } // Define equation to read 'get_radius()' values and write one. 
virtual void define() { @@ -410,14 +478,13 @@ namespace { } }; - // Create an object of type 'StreamStencil', + // Create an object of type 'StreamStencil3', // making it available in the YASK compiler utility via the // '-stencil' commmand-line option or the 'stencil=' build option. - static StreamStencil StreamStencil_instance; + static StreamStencil3 StreamStencil3_instance; // Reverse-time stencil. // In this test, A(t-1) depends on A(t). - class TestReverseStencil : public TestBase { protected: @@ -444,15 +511,16 @@ namespace { static TestReverseStencil TestReverseStencil_instance; // Test dependent equations. - // These will create 2 stages that will be applied in sequence + // These will create >= 2 stages that will be applied in sequence // for each time-step. class TestDepStencil1 : public TestBase { protected: // Vars. - yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x }); // time-varying var. - yc_var_proxy B = yc_var_proxy("B", get_soln(), { t, x }); // time-varying var. + yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x }); + yc_var_proxy B = yc_var_proxy("B", get_soln(), { t, x }); + yc_var_proxy C = yc_var_proxy("C", get_soln(), { t, x }); public: @@ -462,11 +530,12 @@ namespace { // Define equation to apply to all points in 'A' and 'B' vars. virtual void define() { - // Define A(t+1) from A(t) & stencil at B(t). - A(t+1, x) EQUALS A(t, x) - def_1d(B, t, x, 0, 1); + // Define A(t+1) and B(t+1). + A(t+1, x) EQUALS -2 * A(t, x); + B(t+1, x) EQUALS def_1d(B, t, x, 0, 1); - // Define B(t+1) from B(t) & stencil at A(t+1). - B(t+1, x) EQUALS B(t, x) - def_1d(A, t+1, x, 3, 2); + // 'C(t+1)' depends on 'A(t+1)', creating a 2nd stage. + C(t+1, x) EQUALS def_1d(A, t+1, x, 1, 0) + C(t, x+1); } }; @@ -482,6 +551,7 @@ namespace { // Vars. yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x, y }); // time-varying var. yc_var_proxy B = yc_var_proxy("B", get_soln(), { t, x, y }); // time-varying var. 
+ yc_var_proxy C = yc_var_proxy("C", get_soln(), { t, x, y }); // time-varying var. public: @@ -491,11 +561,14 @@ namespace { // Define equation to apply to all points in 'A' and 'B' vars. virtual void define() { - // Define A(t+1) from A(t) & stencil at B(t). + // Define A(t+1) from A(t) & B(t). A(t+1, x, y) EQUALS A(t, x, y) - def_2d(B, t, x, 0, 1, y, 2, 1); - // Define B(t+1) from B(t) & stencil at A(t+1). + // Define B(t+1) from B(t) & A(t+1), creating a 2nd stage. B(t+1, x, y) EQUALS B(t, x, y) - def_2d(A, t+1, x, 3, 2, y, 0, 1); + + // Define C(t+1) from B(t+1), creating a 3rd stage. + C(t+1, x, y) EQUALS B(t+1, x-1, y+2); } }; @@ -555,7 +628,7 @@ namespace { // Define equation to apply to all points in 'A' var. virtual void define() { - // Define values in scratch var 'B'. + // Define values in scratch var 'B' based on 'A'. B(x) EQUALS def_1d(A, t, x, 1, 0); // Set 'A' from scratch var values. @@ -568,6 +641,43 @@ namespace { // '-stencil' commmand-line option or the 'stencil=' build option. static TestScratchStencil1 TestScratchStencil1_instance; + class TestScratchStencil2 : public TestBase { + + protected: + + // Vars. + yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x, y }); // time-varying var. + + // Temporary storage. + yc_var_proxy t1 = yc_var_proxy("t1", get_soln(), { x, y }, true); + yc_var_proxy t2 = yc_var_proxy("t2", get_soln(), { x, y }, true); + + public: + + TestScratchStencil2(int radius=2) : + TestBase("test_scratch_2d", radius) { } + + // Define equation to apply to all points in 'A' var. + virtual void define() { + + // Set scratch var. + t1(x, y) EQUALS def_2d(A, t, x, 0, 1, y, 2, 1); + + // Set one scratch var from other scratch var. + t2(x, y) EQUALS t1(x, y+1); + + // Update A from scratch vars. 
+ A(t+1, x, y) EQUALS A(t, x, y) + + def_no_t_2d(t1, x, 2, 0, y, 1, 0) + + def_no_t_2d(t2, x, 1, 0, y, 0, 1); + } + }; + + // Create an object of type 'TestScratchStencil2', + // making it available in the YASK compiler utility via the + // '-stencil' commmand-line option or the 'stencil=' build option. + static TestScratchStencil2 TestScratchStencil2_instance; + class TestScratchStencil3 : public TestBase { protected: @@ -723,10 +833,10 @@ namespace { // Define equation to apply to all points in 'A' var. virtual void define() { - // Time condition. + // Step condition based on step value. auto tc0 = (t % 2 == 0); - // Var condition. + // Step condition based on misc-var contents. auto vc0 = (B(0) > B(1)); // Set A w/different stencils depending on the conditions. It is @@ -787,7 +897,9 @@ namespace { // '-stencil' commmand-line option or the 'stencil=' build option. static TestScratchBoundaryStencil1 TestScratchBoundaryStencil1_instance; - // A stencil that uses svml math functions. + // A stencil that uses math functions. + // This stencil is an exception to the integer-result calculations + // used in most test stencils. class TestFuncStencil1 : public TestBase { protected: @@ -803,8 +915,12 @@ namespace { TestBase("test_func_1d", radius) { } virtual void define() { + + // Define 'A(t+1)' and 'B(t+1)' based on values at 't'. A(t+1, x) EQUALS cos(A(t, x)) - 2 * sin(A(t, x)); B(t+1, x) EQUALS pow(def_1d(B, t, x, 0, 1), 1.0/2.5); + + // 'C(t+1)' depends on 'A(t+1)', creating a 2nd stage. C(t+1, x) EQUALS atan(def_1d(A, t+1, x, 1, 0) + cbrt(C(t, x+1))); } }; @@ -814,44 +930,44 @@ namespace { // '-stencil' commmand-line option or the 'stencil=' build option. static TestFuncStencil1 TestFuncStencil1_instance; - // A stencil that has vars but no stencil equation. - class TestEmptyStencil2 : public TestBase { + // A stencil that no vars and no stencil equation. + // Kernel must be built with domain_dims and step_dim options. 
+ class TestEmptyStencil0: public TestBase { protected: - // Vars. - yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x, y }); // time-varying var. - public: - TestEmptyStencil2(int radius=1) : - TestBase("test_empty_2d", radius) { } + TestEmptyStencil0(int radius=1) : + TestBase("test_empty", radius) { } virtual void define() { } }; - // Create an object of type 'TestEmptyStencil2', + // Create an object of type 'TestEmptyStencil0', // making it available in the YASK compiler utility via the // '-stencil' commmand-line option or the 'stencil=' build option. - static TestEmptyStencil2 TestEmptyStencil2_instance; + static TestEmptyStencil0 TestEmptyStencil0_instance; - // A stencil that no vars and no stencil equation. - // Kernel must be built with domain_dims and step_dim options. - class TestEmptyStencil0: public TestBase { + // A stencil that has vars but no stencil equation. + class TestEmptyStencil2 : public TestBase { protected: + // Vars. + yc_var_proxy A = yc_var_proxy("A", get_soln(), { t, x, y }); // time-varying var. + public: - TestEmptyStencil0(int radius=1) : - TestBase("test_empty", radius) { } + TestEmptyStencil2(int radius=1) : + TestBase("test_empty_2d", radius) { } virtual void define() { } }; - // Create an object of type 'TestEmptyStencil0', + // Create an object of type 'TestEmptyStencil2', // making it available in the YASK compiler utility via the // '-stencil' commmand-line option or the 'stencil=' build option. - static TestEmptyStencil0 TestEmptyStencil0_instance; + static TestEmptyStencil2 TestEmptyStencil2_instance; } // namespace. 
diff --git a/utils/bin/analyze_trace.pl b/utils/bin/analyze_trace.pl index 483712bc..300a2be6 100755 --- a/utils/bin/analyze_trace.pl +++ b/utils/bin/analyze_trace.pl @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to diff --git a/utils/bin/convert_v2_stencil.pl b/utils/bin/convert_v2_stencil.pl index 37f5b31c..bb60832d 100755 --- a/utils/bin/convert_v2_stencil.pl +++ b/utils/bin/convert_v2_stencil.pl @@ -3,7 +3,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to diff --git a/utils/bin/gen_layouts.pl b/utils/bin/gen_layouts.pl index 44d76df4..72af87ec 100755 --- a/utils/bin/gen_layouts.pl +++ b/utils/bin/gen_layouts.pl @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -44,7 +44,8 @@ sub usage { use lib dirname($0)."/lib"; use lib dirname($0)."/../lib"; -print "// Automatically generated; do not edit.\n\n" if $opt ne '-p'; +print "// Automatically generated; do not edit.\n". + "#pragma once\n" if $opt ne '-p'; # permute items in a list. 
# args: block of code to run on each permutation and list to permute. @@ -172,17 +173,17 @@ END public: Layout_$name() { } Layout_$name(const Indices& sizes) : ${basename}(sizes) { } - inline int get_num_sizes() const { + static constexpr int get_num_sizes() { return $n; } // Return 1-D offset from $n-D 'j' indices. - inline idx_t layout(const Indices& j) const { + ALWAYS_INLINE idx_t layout(const Indices& j) const { return $layout; } // Return $n index(indices) based on 1-D 'ai' input. - inline Indices unlayout(idx_t ai) const { + ALWAYS_INLINE Indices unlayout(idx_t ai) const { Indices j(_sizes); $unlayout; return j; diff --git a/utils/bin/gen_loops.pl b/utils/bin/gen_loops.pl index 8370750d..c75747dc 100755 --- a/utils/bin/gen_loops.pl +++ b/utils/bin/gen_loops.pl @@ -3,7 +3,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -24,7 +24,7 @@ ## IN THE SOFTWARE. ############################################################################## -# Purpose: Create loop code. +# Purpose: Create area-scanning code. use strict; use File::Basename; @@ -39,20 +39,34 @@ $| = 1; # autoflush. +########## # Globals. my %OPT; # cmd-line options. -my @dims; # indices of dimensions. -my $inputVar; # input var. +my %macros; # macros from file. +my $inputVar = "LOOP_INDICES"; # input var macro. +my $outputVar = "BODY_INDICES"; # output var. +my $loopPart = "USE_LOOP_PART_"; # macro to enable specified loop part. +my $macroPrefix = ""; # prefix for macros. +my $varPrefix = ""; # prefix for vars. +my $doAlign = 1; # generate alignment code. 
+my @fixed_exprs = ("begin", "end", "stride", "tile_size"); +my @align_exprs = ("align", "align_ofs"); +my @var_exprs = ("start", "stop", "index"); +my $indent = dirname($0)."/yask_indent.sh"; # loop-feature bit fields. my $bSerp = 0x1; # serpentine path my $bSquare = 0x2; # square_wave path -my $bGroup = 0x4; # group path -my $bSimd = 0x8; # simd prefix +my $bTile = 0x4; # tile path +my $bOmpPar = 0x8; # OpenMP parallel +my $bManual = 0x10; # use manual scheduling +my $bNested = 0x20; # use "normal" nested loops +my $bSimd = 0x40; # OpenMP SIMD ########## -# Function to make names of variables based on dimension string(s). +# Various functions to create variable references. +# Create indices from args. # 'idx()' => "". # 'idx(3)' => "[3]". # 'idx(3,5)' => "[3][5]". @@ -63,86 +77,98 @@ sub idx { # Accessors for input struct. # Examples if $inputVar == "block_idxs": # inVar() => "block_idxs". -# inVar("foo") => "block_idxs.foo". -# inVar("foo", 5) => "block_idxs.foo[5]". +# inVar("foo", 5) => "FOO(5)" (using macro). sub inVar { my $vname = shift; - my $part = (defined $vname) ? ".$vname" : ""; - return "$inputVar$part".idx(@_); + if (defined $vname) { + die unless scalar(@_) == 1; + my $em = $macroPrefix.(uc $vname); + return "$em(@_)"; + } + return "$macroPrefix$inputVar"; } -# Accessors for local struct. -# locVar("foo", 5) => "local_indices.foo[5]". -sub locVar { +# Accessors for output struct. +# Examples if $outputVar == "local_indices": +# outVar() => "local_indices". +# outVar("foo", 5) => "local_indices.foo[5]". +sub outVar { my $vname = shift; - my $part = (defined $vname) ? ".$vname" : ""; - return "local_indices$part".idx(@_); + if (defined $vname) { + die unless scalar(@_) == 1; + return "$macroPrefix$outputVar.$vname".idx(@_); + } + return "$macroPrefix$outputVar"; +} + +# Make a local var. +sub locVar { + return $varPrefix . join('_', @_); } -# Access values in input struct. +# Names for vars used in the generated code. +# Arg(s) are loop dim(s). 
sub beginVar { - return inVar("begin", @_); + return locVar("begin", @_); } sub endVar { - return inVar("end", @_); + return locVar("end", @_); } sub strideVar { - return inVar("stride", @_); + return locVar("stride", @_); } sub alignVar { - return inVar("align", @_); + return locVar("align", @_); } sub alignOfsVar { - return inVar("align_ofs", @_); + return locVar("align_ofs", @_); } -sub groupSizeVar { - return inVar("group_size", @_); +sub tileSizeVar { + return locVar("tile_size", @_); } - -# These are generated scalars. sub adjAlignVar { - return join('_', 'adj_align', @_); + return locVar('adj_align', @_); } sub alignBeginVar { - return join('_', 'aligned_begin', @_); + return locVar('aligned_begin', @_); } sub numItersVar { - return join('_', 'num_iters', @_); + return locVar('num_iters', @_); } -sub numGroupsVar { - return join('_', 'num_full_groups', @_); +sub numTilesVar { + return locVar('num_full_tiles', @_); } -sub numFullGroupItersVar { - return join('_', 'num_iters_in_full_group', @_); +sub numFullTileItersVar { + return locVar('num_iters_in_full_tile', @_); } -sub numGroupSetItersVar { - return scalar @_ ? join('_', 'num_iters_in_group_set', @_) : - 'num_iters_in_full_group'; +sub numTileSetItersVar { + return scalar @_ ? locVar('num_iters_in_tile_set', @_) : + locVar('num_iters_in_full_tile'); } sub indexVar { - return join('_', 'index', @_); + return locVar('index', @_); } -sub groupIndexVar { - return join('_', 'index_of_group', @_); +sub tileIndexVar { + return locVar('index_of_tile', @_); } -sub groupSetOffsetVar { - return scalar @_ ? join('_', 'index_offset_within_group_set', @_) : - 'index_offset_within_this_group'; +sub tileSetOffsetVar { + return scalar @_ ? 
locVar('index_offset_within_tile_set', @_) : + locVar('index_offset_within_this_tile'); } -sub groupOffsetVar { - return join('_', 'index_offset_within_this_group', @_); +sub tileOffsetVar { + return locVar('index_offset_within_this_tile', @_); } -sub numLocalGroupItersVar { - return join('_', 'num_iters_in_group', @_); +sub numLocalTileItersVar { + return locVar('num_iters_in_tile', @_); } sub loopIndexVar { - return join('_', 'loop_index', @_); + return locVar('loop_index', @_); } sub startVar { - return join('_', 'start', @_); + return locVar('start', @_); } sub stopVar { - return join('_', 'stop', @_); + return locVar('stop', @_); } # return string of all non-empty args separated by commas. @@ -159,21 +185,87 @@ sub dimStr { return $s; } -# make args for a call. -sub makeArgs { - my @loopDims = @_; +# Conditionally define a macro. +sub macroDef($$$) { + my $mname = shift; + my $margs = shift; + my $mdef = shift; + + $mname = uc $mname; + $margs = (defined $margs) ? "($margs)" : ""; + return + "#ifndef ${macroPrefix}$mname", + "#define ${macroPrefix}$mname$margs $mdef", + "#endif"; +} +sub macroUndef($) { + my $mname = shift; + + $mname = uc $mname; + return + "#ifdef ${macroPrefix}$mname", + "#undef ${macroPrefix}$mname", + "#endif"; +} + +# copy vars from the input. +sub getInVars { + my $tiledDims = shift; # ref to hash. + my @ldims = @_; + my $itype = indexType(); my @stmts; - map { + for my $dim (@ldims) { + + # Vars for input values. + my $bvar = beginVar($dim); + my $evar = endVar($dim); + my $svar = strideVar($dim); + my $tsvar = tileSizeVar($dim); + my $avar = alignVar($dim); + my $aovar = alignOfsVar($dim); push @stmts, - " ".locVar("start", $_)." = ".startVar($_).";", - " ".locVar("stop", $_)." = ".stopVar($_).";", - " ".locVar("index", $_)." = ".indexVar($_).";", - " ".locVar("num_indices", $_)." = ".numItersVar($_).";"; - } @loopDims; - push @stmts, - " ".locVar("linear_indices")." = ".numItersVar(@loopDims).";", - " ".locVar("linear_index")." 
= ".loopIndexVar(@loopDims).";"; + "// Create input vars for dim $dim.", + "const $itype $bvar = ".inVar("begin", $dim).";", + "const $itype $evar = ".inVar("end", $dim).";", + "const $itype $svar = ".inVar("stride", $dim).";"; + push @stmts, + "const $itype $avar = ".inVar("align", $dim).";", + "const $itype $aovar = ".inVar("align_ofs", $dim).";" + if $doAlign; + push @stmts, + "const $itype $tsvar = ".inVar("tile_size", $dim).";" + if defined $$tiledDims{$dim}; + } + return @stmts; +} + +# make macros for the body. +sub makeOutMacros { + my @ldims = @_; + + my @stmts; + for my $expr (@var_exprs) { + my $base = "BODY_".uc($expr); + push @stmts, macroDef($base, "dim_num", "YCAT(".locVar($expr)."_, dim_num)"); + } + + return @stmts; +} + +# set var for the body. +sub setOutVars { + my @ldims = @_; + + my $itype = indexType(); + my @stmts; + + for my $expr (@var_exprs) { + for my $dim (@ldims) { + my $macro = "${macroPrefix}BODY_".uc($expr)."($dim)"; + push @stmts, outVar($expr, $dim)." = $macro;"; + } + } return @stmts; } @@ -181,21 +273,73 @@ sub makeArgs { # Loop-constructing functions. # return type of var needed for loop index. -# args: dimension(s) -- currently ignored. -sub indexType { +sub indexType() { return 'idx_t'; } -# Create and init vars *before* beginning of simple or collapsed loop. -sub addIndexVars1($$$) { +# Adjust features. +# Returns new set of features. +sub adjFeatures($$$) { + my $loopDims = shift; # ref to list of dimensions in loop. + my $features = shift; # feature bits for path types. + my $loopStack = shift; # whole stack at this point, including enclosing dims. + + my $ndims = scalar @$loopDims; + my $outerDim = $loopDims->[0]; # outer dim of these loops. + my $innerDim = $loopDims->[$#$loopDims]; # inner dim of these loops. + + # find enclosing dim outside of these loops if avail. 
+ my $encDim; + map { $encDim = $loopStack->[$_] + if $loopStack->[$_ + 1] eq $outerDim; } 0..($#$loopStack-1); + + if (($features & $bManual) && !($features & $bOmpPar)) { + warn "notice: manual ignored for non-OpenMP parallel loop.\n"; + $features &= ~$bManual; # clear bits. + } + if ($ndims < 2 && ($features & ($bSquare | $bNested))) { + warn "notice: square-wave and nested ignored for loop with only $ndims dim.\n"; + $features &= ~($bSquare | $bNested); # clear bits. + } + if ($ndims < 2 && !defined $encDim && ($features & $bSerp)) { + warn "notice: serpentine ignored for outer loop.\n"; + $features &= ~$bSerp; # clear bit. + } + + if ($features & $bTile) { + + if ($ndims < 2) { + warn "notice: tiling ignored for loop with only $ndims dim.\n"; + $features &= ~$bTile; # clear bit. + } + die "error: serpentine not compatible with tiling.\n" + if $features & $bSerp; + die "error: square-wave not compatible with tiling.\n" + if $features & $bSquare; + } + if ($features & ($bManual | $bNested)) { + die "error: serpentine not compatible with manual or nested.\n" + if $features & $bSerp; + die "error: square-wave not compatible with manual or nested.\n" + if $features & $bSquare; + die "error: tiling not compatible with manual or nested.\n" + if $features & $bTile; + } + return $features; +} + +# Create and init vars *before* beginning of loop(s) in given dim(s). +# These compute loop-invariant values like number of iterations. +sub addIndexVars1($$$$) { my $code = shift; # ref to list of code lines. - my $loopDims = shift; # ref to list of dimensions. + my $loopDims = shift; # ref to list of dimensions in this loop. my $features = shift; # bits for path types. + my $loopStack = shift; # whole stack at this point, including enclosing dims. push @$code, - " // ** Begin scan over ".dimStr(@$loopDims).". **"; + "// ** Begin scan over ".dimStr(@$loopDims).". 
**"; - my $itype = indexType(@$loopDims); + my $itype = indexType(); for my $pass (0..1) { for my $i (0..$#$loopDims) { @@ -204,17 +348,21 @@ ($$$) # Pass 0: iterations. if ($pass == 0) { + + # Vars from the struct. my $bvar = beginVar($dim); my $evar = endVar($dim); my $svar = strideVar($dim); my $avar = alignVar($dim); my $aovar = alignOfsVar($dim); + my $tsvar = tileSizeVar($dim); + + # New vars. my $aavar = adjAlignVar($dim); my $abvar = alignBeginVar($dim); my $nvar = numItersVar($dim); - my $ntvar = numGroupsVar($dim); - my $tsvar = groupSizeVar($dim); - my $ntivar = numFullGroupItersVar($dim); + my $ntvar = numTilesVar($dim); + my $ntivar = numFullTileItersVar($dim); # Example alignment: # bvar = 20. @@ -225,30 +373,37 @@ ($$$) # aavar = min(4, 8) = 4. # abvar = round_down_flr(20 - 15, 4) + 15 = 4 + 15 = 19. - push @$code, - " // Alignment must be less than or equal to stride size.", - " const $itype $aavar = std::min($avar, $svar);", - " // Aligned beginning point such that ($bvar - $svar) < $abvar <= $bvar.", - " const $itype $abvar = yask::round_down_flr($bvar - $aovar, $aavar) + $aovar;", - " // Number of iterations to get from $abvar to (but not including) $evar, striding by $svar.". - " This value is rounded up because the last iteration may cover fewer than $svar strides.", - " const $itype $nvar = yask::ceil_idiv_flr($evar - $abvar, $svar);"; - - # For grouped loops. - if ($features & $bGroup) { - - # loop iterations within one group. + if ($doAlign) { + push @$code, + "// Alignment must be less than or equal to stride size.", + "const $itype $aavar = std::min($avar, $svar);", + "// Aligned beginning point such that ($bvar - $svar) < $abvar <= $bvar.", + "const $itype $abvar = yask::round_down_flr($bvar - $aovar, $aavar) + $aovar;", + "// Number of iterations to get from $abvar to (but not including) $evar, striding by $svar.". 
+ "This value is rounded up because the last iteration may cover fewer than $svar strides.", + "const $itype $nvar = yask::ceil_idiv_flr($evar - $abvar, $svar);"; + } else { + push @$code, + "// Number of iterations to get from $bvar to (but not including) $evar, striding by $svar.". + "This value is rounded up because the last iteration may cover fewer than $svar strides.", + "const $itype $nvar = yask::ceil_idiv_flr($evar - $bvar, $svar);"; + } + + # For tiled loops. + if ($features & $bTile) { + + # loop iterations within one tile. push @$code, - " // Number of iterations in one full group in dimension $dim.". - " This value is rounded up, effectively increasing the group size if needed". - " to a multiple of $svar.". - " A group is considered 'full' if it has the max number of iterations.", - " const $itype $ntivar = std::min(yask::ceil_idiv_flr($tsvar, $svar), $nvar);"; + "// Number of iterations in one full tile in dimension $dim.". + "This value is rounded up, effectively increasing the tile size if needed". + "to a multiple of $svar.". + "A tile is considered 'full' if it has the max number of iterations.", + "const $itype $ntivar = std::min(yask::ceil_idiv_flr($tsvar, $svar), $nvar);"; - # number of full groups. + # number of full tiles. push @$code, - " // Number of full groups in dimension $dim.", - " const $itype $ntvar = $ntivar ? $nvar / $ntivar : 0;"; + "// Number of full tiles in dimension $dim.", + "const $itype $ntvar = $ntivar ? $nvar / $ntivar : 0;"; } } @@ -261,8 +416,8 @@ ($$$) my $snvar = numItersVar(@subDims); my $snval = join(' * ', map { numItersVar($_) } @subDims); push @$code, - " // Number of iterations in $loopStr", - " const $itype $snvar = $snval;"; + "// Number of iterations in $loopStr", + "const $itype $snvar = $snval;"; } } } @@ -273,35 +428,30 @@ ($$$$) my $code = shift; # ref to list of code lines. my $loopDims = shift; # ref to list of dimensions in loop. my $features = shift; # bits for path types. 
- my $loopStack = shift; # whole stack, including enclosing dims. + my $loopStack = shift; # whole stack at this point, including enclosing dims. - my $itype = indexType(@$loopDims); - my $civar = loopIndexVar(@$loopDims); # collapsed index var; everything based on this. + my $itype = indexType(); + my $civar = loopIndexVar(@$loopDims); # multi-dim index var; everything based on this. + my $ndims = scalar @$loopDims; my $outerDim = $loopDims->[0]; # outer dim of these loops. my $innerDim = $loopDims->[$#$loopDims]; # inner dim of these loops. - # Grouping. - if ($features & $bGroup) { - - die "error: serpentine not compatible with grouping.\n" - if $features & $bSerp; - die "error: square-wave not compatible with grouping.\n" - if $features & $bSquare; - - my $ndims = scalar @$loopDims; + # Tiling. + if ($features & $bTile) { # declare local size vars. - push @$code, " // Working vars for iterations in groups.". - " These are initialized to full-group counts and then". - " reduced if we are in a partial group."; + push @$code, + "// Working vars for iterations in tiles.". + "These are initialized to full-tile counts and then". + "reduced if/when in a partial tile."; for my $i (0 .. $ndims-1) { my $dim = $loopDims->[$i]; - my $ltvar = numLocalGroupItersVar($dim); - my $ltval = numFullGroupItersVar($dim); - push @$code, " $itype $ltvar = $ltval;"; + my $ltvar = numLocalTileItersVar($dim); + my $ltval = numFullTileItersVar($dim); + push @$code, "$itype $ltvar = $ltval;"; } - # calculate group indices and sizes and 1D offsets within groups. + # calculate tile indices and sizes and 1D offsets within tiles. my $prevOvar = $civar; # previous offset. for my $i (0 .. $ndims-1) { @@ -318,84 +468,84 @@ ($$$$) my @inDims = @$loopDims[$i + 1 .. $ndims - 1]; my $inStr = dimStr(@inDims); - # Size of group set. - my $tgvar = numGroupSetItersVar(@inDims); + # Size of tile set. 
+ my $tgvar = numTileSetItersVar(@inDims); my $tgval = join(' * ', - (map { numLocalGroupItersVar($_) } @dims), + (map { numLocalTileItersVar($_) } @dims), (map { numItersVar($_) } @inDims)); my $tgStr = @inDims ? - "the set of groups across $inStr" : "this group"; + "the set of tiles across $inStr" : "this tile"; push @$code, - " // Number of iterations in $tgStr.", - " $itype $tgvar = $tgval;"; + "// Number of iterations in $tgStr.", + "$itype $tgvar = $tgval;"; - # Index of this group in this dim. - my $tivar = groupIndexVar($dim); + # Index of this tile in this dim. + my $tivar = tileIndexVar($dim); my $tival = "$tgvar ? $prevOvar / $tgvar : 0"; push @$code, - " // Index of this group in dimension $dim.", - " $itype $tivar = $tival;"; + "// Index of this tile in dimension $dim.", + "$itype $tivar = $tival;"; - # 1D offset within group set. - my $ovar = groupSetOffsetVar(@inDims); + # 1D offset within tile set. + my $ovar = tileSetOffsetVar(@inDims); my $oval = "$prevOvar % $tgvar"; push @$code, - " // Linear offset within $tgStr.", - " $itype $ovar = $oval;"; + "// Linear offset within $tgStr.", + "$itype $ovar = $oval;"; - # Size of this group in this dim. - my $ltvar = numLocalGroupItersVar($dim); + # Size of this tile in this dim. + my $ltvar = numLocalTileItersVar($dim); my $ltval = numItersVar($dim). - " - (".numGroupsVar($dim)." * ".numFullGroupItersVar($dim).")"; + " - (".numTilesVar($dim)." * ".numFullTileItersVar($dim).")"; push @$code, - " // Adjust number of iterations in this group in dimension $dim.", - " if ($tivar >= ".numGroupsVar($dim).")". + "// Adjust number of iterations in this tile in dimension $dim.", + "if ($tivar >= ".numTilesVar($dim).")". " $ltvar = $ltval;"; # for next dim. $prevOvar = $ovar; } - # Calculate nD indices within group and overall. - # TODO: allow different paths *within* group. + # Calculate nD indices within tile and overall. + # TODO: allow different paths *within* tile. for my $i (0 .. 
$ndims-1) { my $dim = $loopDims->[$i]; - my $tivar = groupIndexVar($dim); - my $ovar = groupSetOffsetVar(); # last one calculated above. + my $tivar = tileIndexVar($dim); + my $ovar = tileSetOffsetVar(); # last one calculated above. # dims after (inside of) $i (empty for inner dim) my @inDims = @$loopDims[$i + 1 .. $ndims - 1]; - # Determine offset within this group. - my $dovar = groupOffsetVar($dim); + # Determine offset within this tile. + my $dovar = tileOffsetVar($dim); my $doval = $ovar; # divisor of index is product of sizes of remaining nested dimensions. if (@inDims) { - my $subVal = join(' * ', map { numLocalGroupItersVar($_) } @inDims); + my $subVal = join(' * ', map { numLocalTileItersVar($_) } @inDims); $doval .= " / ($subVal)"; } # mod by size of this dimension (not needed for outer-most dim). if ($i > 0) { - $doval = "($doval) % ".numLocalGroupItersVar($dim); + $doval = "($doval) % ".numLocalTileItersVar($dim); } # output offset in this dim. push @$code, - " // Offset within this group in dimension $dim.", - " $itype $dovar = $doval;"; + "// Offset within this tile in dimension $dim.", + "$itype $dovar = $doval;"; # final index in this dim. my $divar = indexVar($dim); - my $dival = numFullGroupItersVar($dim)." * $tivar + $dovar"; + my $dival = numFullTileItersVar($dim)." * $tivar + $dovar"; push @$code, - " // Zero-based, unit-stride index for ".dimStr($dim).".", - " $itype $divar = $dival;"; + "// Zero-based, unit-stride index for ".dimStr($dim).".", + "$itype $divar = $dival;"; } } - # No grouping. + # No tiling. else { # find enclosing dim outside of these loops if avail. @@ -439,27 +589,27 @@ ($$$$) # output $divar. push @$code, - " // Zero-based, unit-stride index for ".dimStr($dim).".", - " idx_t $divar = $dival;"; + "// Zero-based, unit-stride index for ".dimStr($dim).".", + "$itype $divar = $dival;"; # apply square-wave to inner 2 dimensions if requested. 
- my $isInnerSquare = @$loopDims >=2 && $isInner && ($features & $bSquare); + my $isInnerSquare = $ndims >=2 && $isInner && ($features & $bSquare); if ($isInnerSquare) { my $divar2 = "index_x2"; - my $avar = "lsb"; + my $bvar = "lsb"; push @$code, - " // Modify $prevDivar and $divar for 'square_wave' path.", - " if (($innerNvar > 1) && ($prevDivar/2 < $prevNvar/2)) {", + "// Modify $prevDivar and $divar for 'square_wave' path.", + "if (($innerNvar > 1) && ($prevDivar/2 < $prevNvar/2)) {", " // Compute extended index over 2 iterations of $prevDivar.", - " idx_t $divar2 = $divar + ($nvar * ($prevDivar & 1));", + " $itype $divar2 = $divar + ($nvar * ($prevDivar & 1));", " // Select $divar from 0,0,1,1,2,2,... sequence", " $divar = $divar2 / 2;", " // Select $prevDivar adjustment value from 0,1,1,0,0,1,1, ... sequence.", - " idx_t $avar = ($divar2 & 0x1) ^ (($divar2 & 0x2) >> 1);", + " $itype $bvar = ($divar2 & 0x1) ^ (($divar2 & 0x2) >> 1);", " // Adjust $prevDivar +/-1 by replacing bit 0.", - " $prevDivar = ($prevDivar & (idx_t)-2) | $avar;", - " } // square-wave."; + " $prevDivar = ($prevDivar & $itype(-2)) | $bvar;", + "} // square-wave."; } # reverse order of every-other traversal if requested. 
@@ -467,12 +617,12 @@ ($$$$) if (($features & $bSerp) && defined $prevDivar) { if ($isInnerSquare) { push @$code, - " // Reverse direction of $divar after every-other iteration of $prevDivar for 'square_wave serpentine' path.", - " if (($prevDivar & 2) == 2) $divar = $nvar - $divar - 1;"; + "// Reverse direction of $divar after every-other iteration of $prevDivar for 'square_wave serpentine' path.", + "if (($prevDivar & 2) == 2) $divar = $nvar - $divar - 1;"; } else { push @$code, - " // Reverse direction of $divar after every iteration of $prevDivar for 'serpentine' path.", - " if (($prevDivar & 1) == 1) $divar = $nvar - $divar - 1;"; + "// Reverse direction of $divar after every iteration of $prevDivar for 'serpentine' path.", + "if (($prevDivar & 1) == 1) $divar = $nvar - $divar - 1;"; } } @@ -481,6 +631,16 @@ ($$$$) $prevNvar = $nvar; } } +} + +# Add start/stop variables *inside* the loop. +sub addIndexVars3($$$$) { + my $code = shift; # ref to list of code lines. + my $loopDims = shift; # ref to list of dimensions in loop. + my $features = shift; # bits for path types. + my $loopStack = shift; # whole stack at this point, including enclosing dims. + + my $itype = indexType(); # start and stop vars based on individual begin, end, stride, and index vars. for my $dim (@$loopDims) { @@ -491,38 +651,114 @@ ($$$$) my $abvar = alignBeginVar($dim); my $evar = endVar($dim); my $svar = strideVar($dim); - push @$code, - " // This value of $divar covers ".dimStr($dim)." from $stvar to (but not including) $spvar.", - " idx_t $stvar = std::max($abvar + ($divar * $svar), $bvar);", - " idx_t $spvar = std::min($abvar + (($divar+1) * $svar), $evar);"; + if ($doAlign) { + push @$code, + "// This value of $divar covers ".dimStr($dim)." 
from $stvar to (but not including) $spvar.", + "$itype $stvar = std::max($abvar + ($divar * $svar), $bvar);", + "$itype $spvar = std::min($abvar + (($divar+1) * $svar), $evar);"; + } else { + push @$code, + "// This value of $divar covers ".dimStr($dim)." from $stvar to (but not including) $spvar.", + "$itype $stvar = $bvar + ($divar * $svar);", + "$itype $spvar = std::min($bvar + (($divar+1) * $svar), $evar);"; + } } } -# start simple or collapsed loop body. +# Start of loop(s) over given dim(s). +# Every loop starts with 2 "{"'s to make ending easy. +# TODO: keep track of number of "{"'s used. sub beginLoop($$$$$$$) { - my $code = shift; # ref to list of code lines. - my $loopDims = shift; # ref to list of dimensions. - my $prefix = shift; # ref to list of prefix code. May be undef. - my $beginVal = shift; # beginning of loop. - my $endVal = shift; # end of loop (undef to use default). + my $code = shift; # ref to list of code lines to be added to. + my $loopDims = shift; # ref to list of dimensions for this loop. my $features = shift; # bits for path types. - my $loopStack = shift; # whole stack, including enclosing dims. + my $loopStack = shift; # whole stack of dims so far, including enclosing dims. + my $prefix = shift; # ref to list of prefix code. May be undef. + my $beginVal = shift; # beginning of loop (undef for default). + my $endVal = shift; # end of loop (undef for default). + $beginVal = 0 if !defined $beginVal; $endVal = numItersVar(@$loopDims) if !defined $endVal; - my $itype = indexType(@$loopDims); + $features = adjFeatures($loopDims, $features, $loopStack); + my $itype = indexType(); my $ivar = loopIndexVar(@$loopDims); - push @$code, @$prefix if defined $prefix; - push @$code, " for ($itype $ivar = $beginVal; $ivar < $endVal; $ivar++) {"; + my $ndims = scalar @$loopDims; - # add inner index vars. - addIndexVars2($code, $loopDims, $features, $loopStack); + # Add pre-loop index vars. 
+ addIndexVars1($code, $loopDims, $features, $loopStack); + + # Start "normal" nested loops. + if ($features & $bNested) { + push @$code, @$prefix if defined $prefix; + for my $i (0 .. $ndims-1) { + my $dim = $loopDims->[$i]; + my $nvar = numItersVar($dim); + my $divar = indexVar($dim); + push @$code, + "for ($itype $divar = 0; $divar < $nvar; $divar++)"; + } + push @$code, "{ {"; + } + + # Start a parallel region if using manual distribution. + elsif ($features & $bManual) { + push @$code, + "// Start parallel section.", @$prefix + if defined $prefix; + push @$code, + "{", + "// Number of threads in this parallel section.", + "$itype nthreads = ${macroPrefix}OMP_NUM_THREADS;", + "// Unique 0-based thread index in this parallel section.", + "$itype thread_num = ${macroPrefix}OMP_THREAD_NUM;", + "host_assert(thread_num < nthreads);", + "// Begin and end indices for this thread.", + "$itype thread_begin = $beginVal + ". + "yask::div_equally_cumu_size_n($endVal - $beginVal, nthreads, thread_num - 1);", + "$itype thread_end = $beginVal + ". + "yask::div_equally_cumu_size_n($endVal - $beginVal, nthreads, thread_num);", + "// Starting index.", + "$itype $ivar = thread_begin;"; + + # Add initial-loop index vars for this thread. + addIndexVars2($code, $loopDims, $features, $loopStack); + + # Add sequential loops for this thread thread. + # TODO: use one loop and increment-and-wrap-around code. + push @$code, + "\n // Loop through the ranges of $ndims dim(s) in this thread."; + for my $i (0 .. $ndims-1) { + my $dim = $loopDims->[$i]; + my $nvar = numItersVar($dim); + my $divar = indexVar($dim); + push @$code, + "for (; $divar < $nvar && $ivar < thread_end; $divar++, ". + ($i < $ndims-1 ? indexVar($loopDims->[$i+1])."=0" : "$ivar++"). + ")"; + } + push @$code, " {"; + } + + # Start manually-flattened loop. 
+ else { + push @$code, @$prefix if defined $prefix; + push @$code, + "for ($itype $ivar = $beginVal; $ivar < $endVal; $ivar++) { {"; + + # Add inner-loop index vars for this iteration. + addIndexVars2($code, $loopDims, $features, $loopStack); + } + + # Add start/stop vars for this iteration. + addIndexVars3($code, $loopDims, $features, $loopStack); } -# end simple or collapsed loop body. +# End loops. +# Every loop ends with 2 "{"'s. sub endLoop($) { my $code = shift; # ref to list of code lines. - push @$code, " }"; + push @$code, "} }"; } ########## @@ -531,8 +767,9 @@ ($) # Split a string into tokens, ignoring whitespace. sub tokenize($) { my $str = shift; - my @toks; + # Find tokens. + my @toks; while (length($str)) { # default is 1 char. @@ -552,7 +789,7 @@ ($) my $tok = substr($str, 0, $len, ''); # keep unless WS. - push @toks, $tok unless $tok =~ /^\s$/; + push @toks, $tok unless $tok =~ /^\s+$/; } return @toks; } @@ -606,9 +843,8 @@ ($$) my $toks = shift; # ref to token array. my $ti = shift; # ref to token index (starting at paren). - my $N = scalar(@dims); while (1) { - my $tok = checkToken($toks->[$$ti++], '\w+|N[-+]|\,|\.+|\)', 1); + my $tok = checkToken($toks->[$$ti++], '\w+|\,|\.+|\)', 1); # comma (ignore). if ($tok eq ',') { @@ -621,18 +857,6 @@ ($$) # actual token. else { - - # Handle, e.g., 'N+1', 'N-2'. - if ($tok eq 'N') { - my $oper = checkToken($toks->[$$ti++], '[-+]', 1); - my $tok2 = checkToken($toks->[$$ti++], '\d+', 1); - if ($oper eq '+') { - $tok = $N + $tok2; - } else { - $tok = $N - $tok2; - } - } - return $tok; } } @@ -654,22 +878,35 @@ ($$) } # Handle '..'. 
- elsif ($arg =~ /^\.+$/) { + elsif ($arg =~ /^\.\.+$/) { die "Error: missing token before '$arg'.\n" if !defined $prevArg; die "Error: non-numerical token before '$arg'.\n" - if $prevArg !~ /^\d+$/; + if $prevArg !~ /^[-]?\d+$/; + pop @args; my $arg2 = getNextArg($toks, $ti); die "Error: missing token after '$arg'.\n" if !defined $arg2; die "Error: non-numerical token after '$arg'.\n" - if $arg2 !~ /^\d+$/; - for my $i ($prevArg+1 .. $arg2) { - push @args, $i; + if $arg2 !~ /^[-]?\d+$/; + if ($prevArg == $arg2) { + push @args, $arg2; + } + elsif ($prevArg < $arg2) { + for my $i ($prevArg .. $arg2) { + push @args, $i; + } + } + else { + # Something like 2..1, so return empty list. + # TODO: add an operator to allow reverse ordering. } } + # Should be a number. else { + die "Error: non-numerical token '$arg'.\n" + if $arg !~ /^[-]?\d+$/; push @args, $arg; $prevArg = $arg; } @@ -677,64 +914,74 @@ ($$) return @args; } +# Generate a pragma w/given text. +sub pragma($) { + my $codeString = shift; + return (length($codeString) > 0) ? + "_Pragma(\"$codeString\")" : ""; +} + # Process the loop-code string. # This is where most of the work is done. sub processCode($) { my $codeString = shift; - my @toks = tokenize($codeString); - ##print join "\n", @toks; + # vars across loops. + my $partNum = 0; # macro-part counter. + my %dims; # all dims seen. + my %tiledDims; # dims needing tiles. + my @code; # code to output. - # vars to track loops. + # vars to track one loop. # set at beginning of loop() statements. my @loopStack; # current nesting of dimensions. my @loopCounts; # number of dimensions in each loop. my @loopDims; # dimension(s) of current loop. - my $curInnerDim; # iteration dimension of inner loop (undef if not in inner loop). # modifiers before loop() statements. - my @loopPrefix; # string(s) to put before loop body. my $features = 0; # bits for loop features. - # lists of code parts to be output. - # set at call() statements. 
- my @callStmts; # calculation statements. - - # Lines of code to output. - my @code; + # Subst macros. + while (my ($key, $value) = each (%macros)) { + $codeString =~ s/\b$key\b/$value/g; + } - # Front matter. - push @code, - "#ifndef OMP_PRAGMA", - "#define OMP_PRAGMA _Pragma(\"$OPT{ompConstruct}\")", - "#endif", - "// 'ScanIndices $inputVar' must be set before the following code.", - "{"; - - # loop thru all the tokens ni the input. + # loop thru all the tokens in the input. + my @toks = tokenize($codeString); for (my $ti = 0; $ti <= $#toks; ) { my $tok = checkToken($toks[$ti++], '.*', 1); + # generate simd in next loop. + if (lc $tok eq 'simd') { + + $features |= $bSimd; + print "info: generating SIMD in following loop.\n"; + } + # use OpenMP on next loop. - if (lc $tok eq 'omp') { + elsif (lc $tok eq 'omp') { - push @loopPrefix, - " // Distribute iterations among OpenMP threads.", - " OMP_PRAGMA"; - print "info: using OpenMP on following loop.\n"; + $features |= $bOmpPar; + print "info: using OpenMP on following loop(s).\n"; } - # generate simd in next loop. - elsif (lc $tok eq 'simd') { + # generate manual-scheduling optimizations in next loop. + elsif (lc $tok eq 'manual') { - push @loopPrefix, '_Pragma("simd")'; - $features |= $bSimd; - print "info: generating SIMD in following loop.\n"; + $features |= $bManual; + print "info: using manual-scheduling optimizations.\n"; } - # use grouped path in next loop if possible. - elsif (lc $tok eq 'grouped') { - $features |= $bGroup; + # generate nested loops. + elsif (lc $tok eq 'nested') { + + $features |= $bNested; + print "info: using traditional nested loops.\n"; + } + + # use tiled path in next loop if possible. + elsif (lc $tok eq 'tiled') { + $features |= $bTile; } # use serpentine path in next loop if possible. @@ -747,141 +994,128 @@ ($) $features |= $bSquare; } - # beginning of a loop. - # also eats the args in parens and the following '{'. + # Beginning of a loop. 
+ # Also eats the args in parens and the following '{'. elsif (lc $tok eq 'loop') { - # get loop dimension(s). + # Get loop dimension(s). checkToken($toks[$ti++], '\(', 1); - @loopDims = getArgs(\@toks, \$ti); - die "error: no args for '$tok'.\n" if @loopDims == 0; + @loopDims = getArgs(\@toks, \$ti); # might be empty. checkToken($toks[$ti++], '\{', 1); # eat the '{'. + my $ndims = scalar(@loopDims); # num dims in this loop. - push @loopStack, @loopDims; # all dims including outer loops. - push @loopCounts, scalar(@loopDims); # number of dims in this loop. - - # check for existence of all vars. + # Check index consistency. for my $ld (@loopDims) { - die "Error: loop variable '$ld' not in ".dimStr(@dims).".\n" - if !grep($_ eq $ld, @dims); - } - - # set inner dim if applicable. - undef $curInnerDim; - if (isInInner(\@toks, \$ti)) { - $curInnerDim = $loopDims[$#loopDims]; - } - - # print more info. - print "info: generating scan over ".dimStr(@loopDims)."...\n"; - - # add initial code for index vars, but don't start loop body yet. - addIndexVars1(\@code, \@loopDims, $features); - - # if *not* the inner loop, start the loop body. - # if it is the inner loop, we might need more than one loop body, so - # it will not be generated until the '}' is seen. - if (!defined $curInnerDim) { - beginLoop(\@code, \@loopDims, \@loopPrefix, 0, undef, $features, \@loopStack); - - # clear data for this loop. - undef @loopDims; - undef @loopPrefix; - $features = 0; + $dims{$ld} = 1; + $tiledDims{$ld} = 1 if ($features & $bTile); + for my $ls (@loopStack) { + die "Error: loop variable '$ld' already used.\n" + if $ld == $ls; + } } - } - # Function(s) to call. - # Set @*Stmts* vars. - elsif (lc $tok eq 'call') { - - die "error: '$tok' attempted outside of inner loop.\n" - if !defined $curInnerDim; - - # Process funcs (args to call). 
- checkToken($toks[$ti++], '\(', 1); - my $ncall = 0; - while (1) { - my $arg = getNextArg(\@toks, \$ti); - last if !defined($arg); - $ncall++; - - # standard args to functions. - my $callArgs = $OPT{comArgs}; - - # get optional args from input. - if (checkToken($toks[$ti], '\(', 0)) { - $ti++; - my @oargs = getArgs(\@toks, \$ti); - $callArgs = joinArgs($callArgs, @oargs) if (@oargs); + push @loopStack, @loopDims; # all dims so far. + push @loopCounts, $ndims; # number of dims in each loop. + my @loopPrefix; # string(s) to put before loop body. + + # In inner loop? + my $is_inner = $ndims && isInInner(\@toks, \$ti); + push @loopPrefix, "${macroPrefix}INNER_LOOP_PREFIX" if $is_inner; + + # Add OMP pragma(s). + my $is_omp_nested = 0; + if ($features & $bOmpPar) { + if (($features & $bNested) && $ndims) { + push @code, + macroDef('OMP_NESTED_PRAGMA', undef, pragma("$OPT{omp} collapse($ndims)")); + push @loopPrefix, "${macroPrefix}OMP_NESTED_PRAGMA"; + $is_omp_nested = 1; + } else { + push @loopPrefix, "${macroPrefix}OMP_PRAGMA"; } + } + if ($features & $bSimd) { + push @loopPrefix, '${macroPrefix}OMP_SIMD'; + } - # Code for calls. - # e.g., prefix_fn(...); - push @callStmts, makeArgs(@loopStack) - if $ncall == 1; - push @callStmts, - " $OPT{callPrefix}$arg(". - joinArgs($callArgs, locVar()). ");"; - - } # args - } # call - - # End of loop. - # This is where most of @code is created for inner loops. - elsif ($tok eq '}') { - die "error: attempt to end loop w/o beginning\n" if !@loopStack; + # Start the loop unless there are no indices. + if ($ndims) { + print "info: generating scan over ".dimStr(@loopDims)."...\n"; + + # Start the loop. + beginLoop(\@code, \@loopDims, $features, \@loopStack, \@loopPrefix, undef, undef); + + # Inner-loop-specific code. + if ($is_inner) { + + # Start-stop indices for body. + push @code, + "// Indices for loop body.", + makeOutMacros(@loopStack), + "#ifdef ".outVar(), + "#ifndef ".inVar(), + "#error Cannot create ".outVar()." 
without ".inVar(), + "#endif"; + if ($features & $bOmpPar) { + + # Make a new var so it will become OMP private. + push @code, + "ScanIndices ".outVar()."(false);", + outVar()." = ".inVar().";", + setOutVars(@loopStack); + } else { + + # Just a reference if no OMP. + push @code, + "ScanIndices& ".outVar()." = ".inVar().";", + setOutVars(@loopStack); + } + push @code, "#endif // ".outVar(); + } - # not inner loop? - # just need to end it. - if (!defined $curInnerDim) { + } else { - endLoop(\@code); + # Dummy loop needed when there are no indices in the loop. + # Needed to get nesting and other assumptions right. + print "info: generating dummy loop for empty $tok args...\n"; + my $ivar = "dummy" . scalar(@loopCounts); + push @code, + "// Dummy loop.", + @loopPrefix, + "for (int $ivar = 0; $ivar < 1; $ivar++) { {"; } - # inner loop. - # for each part of loop, need to - # - start it, - # - add to @code, - # - end it. - else { - my $beginVal = 0; - my $endVal = numItersVar(@loopDims); - my $comment = " // Inner loop."; - - # beginning of loop. - push @code, $comment; - push @code, $OPT{innerMod}; - beginLoop(\@code, \@loopDims, \@loopPrefix, - $beginVal, $endVal, $features, \@loopStack); - - # Indices to pass to call. - push @code, - " // Local copy of indices for function calls.", - " ScanIndices ".locVar()."($inputVar);"; - - # loop body. - push @code, @callStmts; - - # end of loop. - endLoop(\@code); - - # clear code buffers. - undef @callStmts; + # Remove temp pragma. + push @code, macroUndef('OMP_NESTED_PRAGMA') if $is_omp_nested; + + # Macro break for inserting code: end one and start next one. + push @code, + macroUndef($loopPart.$partNum), + "#endif // Part $partNum.\n"; + $partNum++; + push @code, + "// Enable part $loopPart by defining the following macro.", + "#ifdef ${macroPrefix}$loopPart$partNum\n"; + + # clear data for this loop so we'll be ready for a nested loop. 
+ undef @loopDims; + undef @loopPrefix; + $features = 0; + } - # clear other data for this loop. - undef $curInnerDim; - undef @loopDims; - undef @loopPrefix; - $features = 0; - } # inner loop. + # End of loop. + elsif ($tok eq '}') { + die "error: attempt to end loop w/o beginning\n" if !@loopCounts; # pop stacks. - my $ndims = pop @loopCounts; + my $ndims = pop @loopCounts; # How many indices in this loop? for my $i (1..$ndims) { my $sdim = pop @loopStack; - #push @code, " // End of $sdim loop."; + push @code, "// End of scan over dim $sdim."; } + + # Emit code. + endLoop(\@code); } # end of a loop. # separator (ignore). @@ -893,57 +1127,79 @@ ($) } else { - die "error: unrecognized token '$tok'\n"; + die "error: unrecognized or unexpected token '$tok'\n"; } } # token-handling loop. die "error: ".(scalar @loopStack)." loop(s) not closed.\n" if @loopStack; + # Sorted list of dims scanned. + my @dims = sort { $a <=> $b } keys %dims; + + # Front matter. + my @fcode; + push @fcode, + "// Enable part 0 by defining the following macro.", + "#ifdef ${macroPrefix}${loopPart}0\n", + "// These macros must be re-defined for each generated loop-nest.", + macroDef('SIMD_PRAGMA', undef, pragma($OPT{simd})), + macroDef('INNER_LOOP_PREFIX', undef, pragma($OPT{inner})), + macroDef('OMP_PRAGMA', undef, pragma($OPT{omp})), + macroDef('OMP_NUM_THREADS', undef, 'omp_get_num_threads()'), + macroDef('OMP_THREAD_NUM', undef, 'omp_get_thread_num()'), + "// Define ".inVar()." to initialize loop from a ScanIndices struct with that name.", + "// Any element of the struct may be overridden by defining the corresponding macro.", + "#ifdef ".inVar(); + for my $expr (@fixed_exprs) { + push @fcode, macroDef($expr, "dim_num", inVar().'.'.$expr.'[dim_num]'); + } + push @fcode, + "#endif //".inVar(), + "{", + getInVars(\%tiledDims, @dims); + unshift @code, @fcode; + # Back matter. push @code, "}", - "#undef OMP_PRAGMA", - "// End of generated code."; - - # indent program avail? 
- my $indent = 'indent'; - if (!defined which($indent)) { - $indent = 'gindent'; - if (!defined which($indent)) { - print "note: cannot find [g]indent utility--output will be unformatted.\n"; - undef $indent; - } + macroUndef($inputVar), + macroUndef($outputVar), + macroUndef('OMP_PRAGMA'), + macroUndef('OMP_NUM_THREADS'), + macroUndef('OMP_THREAD_NUM'), + macroUndef('SIMD_PRAGMA'), + macroUndef('INNER_LOOP_PREFIX'); + for my $expr (@fixed_exprs, @var_exprs) { + push @code, + macroUndef($expr), + macroUndef("BODY_$expr"); } - + push @code, + macroUndef($loopPart.$partNum), + "#endif"; + # open output stream. - my $cmd = defined $indent ? "$indent -fca -o $OPT{output} -" : - "cat > $OPT{output}"; - open OUT, "| $cmd" or die "error: cannot run '$cmd'.\n"; + open OUT, "> $OPT{output}" or die; # header. - print OUT "/*\n", - " * ".scalar(@dims)."-D var-scanning code.\n", + print OUT + "/*\n", + " * Var-scanning code.\n", " * Generated automatically from the following pseudo-code:\n", " *\n", - " * N = ",$#dims,";\n"; - - # format input to show in the header. - my $cmd2 = "echo '$codeString'"; - $cmd2 .= " | $indent -" if (defined $indent); - open IN, "$cmd2 |" or die "error: cannot run '$cmd2'.\n"; - while () { - print OUT " * $_"; - } - close IN; - print OUT " *\n */"; + " * $codeString\n", + " *\n */"; # print out code. for my $line (@code) { print OUT "\n" if $line =~ m=^\s*//=; # blank line before comment. - print OUT " $line\n"; + print OUT " $line\n"; # add space at beginning of every line. 
} + print OUT "// End of generated code.\n"; close OUT; + system("$indent $OPT{output}") if -x $indent; + print "info: output in '$OPT{output}'.\n"; } @@ -952,13 +1208,13 @@ () my(@KNOBS) = ( # knob, description, optional default - [ "ndims=i", "Value of N.", 1], - [ "inVar=s", "Name of input index vars.", 'scanVars'], - [ "comArgs=s", "Common arguments to all calls.", ''], - [ "callPrefix=s", "Common prefix for function call(s).", ''], - [ "ompConstruct=s", "Pragma to use before 'omp' loop(s).", "omp parallel for"], - [ "innerMod=s", "Code to insert before inner loops.", ''], - [ "output=s", "Name of output file.", 'loops.h'], + [ "prefix=s", "Common prefix of generated macros and vars", ''], + [ "inner=s", "Set default INNER_LOOP_PREFIX macro used before inner loop(s)", ''], + [ "omp=s", "Set default OMP_PRAGMA macro used before 'omp' loop(s)", "omp parallel for"], + [ "simd=s", "Set default SIMD_PRAGMA macro used before 'simd' loop(s)", "omp simd"], + [ "align!", "Generate peel code for alignment", 1], + [ "macro_file=s", "Name of input file containing '#define' macros that can be used in ", ''], + [ "output=s", "Name of output file", 'loops.h'], ); my($command_line) = process_command_line(\%OPT, \@KNOBS); print "$command_line\n" if $OPT{verbose}; @@ -968,49 +1224,83 @@ () print "Outputs C++ code to scan N-D vars.\n", "Usage: $script [options] \n", "The contains optionally-nested scans across the given\n", - " indices between 0 and N-1 indicated by 'loop()'\n", - "Indices may be specified as a comma-separated list or range,\n", - " using the variable 'N' as needed.\n", - "Inner loops should contain call statements that generate calls to calculation functions.\n", - "A loop statement with more than one argument will generate a single collapsed loop.\n", + " indices indicated by 'loop()'\n", + "Indices may be specified as a comma-separated list or range.\n", + "The generated code will contain a macro-guarded part before the first loop body\n", + " and a part after 
each loop body.\n", "Optional loop modifiers:\n", - " omp: generate an OpenMP for loop (distribute work across SW threads).*\n", - " grouped: generate grouped scan within a collapsed loop.\n", - " serpentine: generate reverse scan when enclosing loop dimension is odd.*\n", - " square_wave: generate 2D square-wave scan for two innermost dimensions of a collapsed loop.*\n", + " simd: add SIMD_PRAMA before loop (distribute work across SIMD HW).\n", + " omp: add OMP_PRAGMA before loop (distribute work across SW threads).\n", + " nested: use traditional nested loops instead of generating one flattened loop;\n", + " automatically adds 'collapse' clause to OpenMP loops.\n", + " manual: create optimized index calculation for manually-scheduled OpenMP loops;\n", + " must set OMP_PRAGMA to something like 'parallel', not 'parallel for'.\n", + " tiled: generate tiled scan within a >1D loop.\n", + " serpentine: generate reverse scan when enclosing loop index is odd.*\n", + " square_wave: generate 2D square-wave scan for two innermost dims of >1D loop.*\n", " * Do not use these modifiers for YASK rank or block loops because they must\n", " execute with strictly-increasing indices when using temporal tiling.\n", - "A 'ScanIndices' var must be defined in C++ code prior to including the generated code.\n", + " Also, do not combile these modifiers with 'tiled' or 'manual'.\n", + "A 'ScanIndices' type must be defined in C++ code prior to including the generated code.\n", " This struct contains the following 'Indices' elements:\n", " 'begin': [in] first index to scan in each dim.\n", " 'end': [in] value beyond last index to scan in each dim.\n", " 'stride': [in] distance between each scan point in each dim.\n", " 'align': [in] alignment of strides after first one.\n", " 'align_ofs': [in] value to subtract from 'start' before applying alignment.\n", - " 'group_size': [in] min size of each group of points visisted first in a multi-dim loop.\n", - " 'start': [out] set to first scan point 
in called function(s) in inner loop(s).\n", - " 'stop': [out] set to one past last scan point in called function(s) in inner loop(s).\n", + " 'tile_size': [in] size of each tile in each tiled dim (ignored if not tiled).\n", + " 'start': [out] set to first scan point in body of inner loop(s).\n", + " 'stop': [out] set to one past last scan point in body of inner loop(s).\n", " 'index': [out] set to zero on first iteration of loop; increments each iteration.\n", - " Each called function has a 'ScanIndices' variable as a parameter.\n", - " Values in the 'in' arrays in all dimensions are copied from the input.\n", - " Values in the 'out' arrays in any dimension not scanned are copied from the input.\n", " Each array should be the length specified by the largest index used (typically same as -ndims).\n", - " The 'ScanIndices' input var is named with the -inVar option.\n", + " The 'align' and 'align_ofs' elements are ignored if '-no-align' is used.\n", + "A 'ScanIndices' input var must be defined in C++ code prior to including the generated code.\n", + " The 'in' indices control the range of the generaged loop(s).\n", + " The 'ScanIndices' input var is named with the LOOP_INDICES macro.\n", + "Loop indices will be available in the body of the loop in a new 'ScanIndices' var.\n", + " Values in the 'out' indices are set to indicate the range to be covered by the loop body.\n", + " Any values in the struct not explicity set are copied from the input.\n", + " The 'ScanIndices' output var is named with the BODY_INDICES macro.\n", "Options:\n"; print_options_help(\@KNOBS); print "Examples:\n", - " $script -ndims 2 'loop(0,1) { call(f); }'\n", - " $script -ndims 3 'omp loop(0,1) { loop(2) { call(f); } }'\n", - " $script -ndims 3 'omp loop(0) { loop(1,2) { call(f); } }'\n", - " $script -ndims 3 'grouped omp loop(0..N-1) { call(f); }'\n", - " $script -ndims 3 'omp loop(0) { square_wave loop(1..N-1) { call(f); } }'\n", - " $script -ndims 4 'omp loop(0..N+1) { loop(N+2,N-1) { 
call(f); } }'\n"; + " $script 'loop(0,1) { }'\n", + " $script 'omp loop(0,1) { }'\n", + " $script 'omp nested loop(0,1) { }'\n", + " $script 'omp serpentine loop(0,1) { }'\n", + " $script 'omp tiled loop(0,1,2) { }'\n", + " $script 'omp loop(0,1) { loop(2) { } }'\n", + " $script 'omp loop(0) { loop(1,2) { } }'\n", + " $script 'omp loop(0) { square_wave loop(1..2) { } }'\n", + " $script 'omp loop(0,1,2) { loop(3) { } }'\n"; exit 1; } - @dims = 0 .. ($OPT{ndims} - 1); - print "info: generating scanning code for ".scalar(@dims)."-D vars...\n"; - $inputVar = $OPT{inVar}; + $macroPrefix = uc $OPT{prefix}; + $varPrefix = lc $OPT{prefix}; + $doAlign = $OPT{align}; + push @fixed_exprs, @align_exprs if $doAlign; + + # Read macros. + if ($OPT{macro_file}) { + open MF, "< $OPT{macro_file}" or die "cannot open '$OPT{macro_file}'\n"; + while () { + chomp; + s/\s+$//; + + # Macro with name and value. + if (/^\s*#define\s+(\w+)\s+(.*)/) { + $macros{$1} = $2; + } + + # Macro with name only. + elsif (/^\s*#define\s+(\w+)/) { + $macros{$1} = ''; + } + } + close MF; + print "".(scalar keys %macros)." macro(s) read from '$OPT{macro_file}'\n"; + } my $codeString = join(' ', @ARGV); # just concat all non-options params together. processCode($codeString); diff --git a/utils/bin/view_asm.pl b/utils/bin/view_asm.pl index 21e9607a..40c0c90b 100755 --- a/utils/bin/view_asm.pl +++ b/utils/bin/view_asm.pl @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -46,7 +46,7 @@ sub usage { "options:\n". " -p print instrs in addition to stats\n". " -l view only inner loops\n". - " -s view funcs/loops only with SIMD code\n". + " -s view only funcs/loops using SIMD regs\n". 
" -f= view only in matching function\n". " -t= view only if containing matching text\n"; } @@ -105,12 +105,13 @@ () (!$targetFn || $curFn =~ /$targetFn/) && (!$targetText || grep(/$targetText/, @lines))) { print "\n"; + print "Function '$curFn'\n" if defined $curFn; if ($loopsOnly) { print "non-" if !scalar %rstats; - print "SIMD loop:\n"; + print "SIMD inner loop:\n"; } - print "Function '$curFn'\n" if defined $curFn; print @lines if $printAsm; + print "\n".($loopsOnly ? "Inner loop" : "Function")." summary:\n"; print "$ninstrs total instrs\n"; print "Instr counts per instr type (FLOP count is a subtotal):\n"; for my $key (sort keys %istats) { @@ -165,9 +166,11 @@ () chomp; # file name, e.g., - # .file 40 "src/stencil_block_loops.hpp" + # .file 4 "myfile.cpp" + # .file 13 "foo/bar" "src/stencil_block_loops.hpp" if (/^\s*\.file\s+(\d+)\s+"(.*)"/) { my ($fi, $fn) = ($1, $2); + $fn =~ s=" "=/=g; $files{$fi} = basename($fn); my $dir = dirname($fn); $dirs{$fi} = dirname($fn); @@ -177,15 +180,17 @@ () } # location, e.g., - # .loc 40 23 prologue_end is_stmt 1 - elsif (/^\s*\.loc\s+(\d+)\s+(.*)/) { - my ($fi, $info) = ($1, $2); + # .loc 40 23 19 ... + elsif (/^\s*\.loc\s+(\d+)\s+(\d+)?\s+(\d+)?\s+(.*)/) { + my ($fi, $line, $col, $info) = ($1, $2, $3, $4); if (exists $files{$fi}) { $srcFile = $files{$fi}; - $locInfo = "$srcFile:$info"; + $locInfo = "$srcFile"; + $locInfo .= ":$line" if defined $line && $line > 0; + $locInfo .= ":$col" if defined $col && $col > 0; my $srcDir = $dirs{$fi}; if ($srcDir && exists($dirIndices{$srcDir})) { - $locInfo = "/$locInfo"; + $locInfo = "# /$locInfo"; } } else { $srcFile = ""; @@ -194,13 +199,14 @@ () } # begin function. - elsif (/^\#\s+[-]+\s+Begin\s+(.+)/) { - $curFn = $1; + elsif (/#\s+\-+\s+Begin( function)?\s+(.+)/) { + $curFn = $2; + #print ">> function $2\n"; clearStats(); } # end function. 
- elsif (/^\#\s+[-]+\s+End /) { + elsif (/(^\#\s+\-+\s+End)|(\-\- End function)/) { printLines() if $pass && !$loopsOnly; clearStats(); } @@ -218,7 +224,7 @@ () # label, e.g., #..B1.39: # Preds ..B1.54 ..B1.38 - elsif (/^\s*(\S+):/) { + elsif (/^\s*([\w.]+):/) { my $lab = $1; $labels{$lab} = $asmLine; @@ -228,13 +234,15 @@ () # clear previous loop data. clearStats(); } - push @lines, "$_\n" if $lab =~ /\.\.B/; + push @lines, "$_\n" unless $lab =~ /tmp/; } - # line of code, e.g., + # line of asm code, e.g., # kmovw %r10d, %k7 #137.17 - elsif (/^\s+(\w+)\s+(.*)\#(.*)/) { - my ($instr, $args, $comment) = ($1, $2, $3); + elsif (/^\s+(\w+)(\s+(.*))?(\#(.*))?/) { + my ($instr, $args, $comment) = ($1, $3, $5); + $args = "" if !defined $args; + $comment = "" if !defined $comment; $asmLine++; push @lines, "$_\t$locInfo\n"; $ninstrs++; @@ -266,7 +274,7 @@ () # arg stats. (dest is last arg.) my $type = ($args =~ /[xyz]mm/) ? $& : 'non-SIMD'; - #$type .= ' "spill"' if $comment =~ /spill/; + #$type .= ' "spill"' if $comment =~ /spill/i; if ($args =~ /\(.*r[bs]p.*\).*,/) { $astats{"$type stack load"}++; } elsif ($args =~ /,.*\(.*r[bs]p.*\)/) { diff --git a/utils/bin/yask_indent.sh b/utils/bin/yask_indent.sh new file mode 100755 index 00000000..4580d914 --- /dev/null +++ b/utils/bin/yask_indent.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +############################################################################## +## YASK: Yet Another Stencil Kit +## Copyright (c) 2014-2022, Intel Corporation +## +## Permission is hereby granted, free of charge, to any person obtaining a copy +## of this software and associated documentation files (the "Software"), to +## deal in the Software without restriction, including without limitation the +## rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +## sell copies of the Software, and to permit persons to whom the Software is +## furnished to do so, subject to the following conditions: +## +## * The above copyright notice and 
this permission notice shall be included in +## all copies or substantial portions of the Software. +## +## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +## IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +## FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +## AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +## LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +## FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +## IN THE SOFTWARE. +############################################################################## + +# Purpose: indent C/C++ source code. + +args="-fca -nlp" +if command -v gindent >/dev/null; then + set -x; gindent $args $@ +elif command -v indent >/dev/null; then + set -x; indent $args $@ +else + echo "Note: generated file not indented because neither 'gindent' nor 'indent' was found." +fi + diff --git a/utils/bin/yask_log_to_csv.pl b/utils/bin/yask_log_to_csv.pl index ce164ac3..a3bfd052 100755 --- a/utils/bin/yask_log_to_csv.pl +++ b/utils/bin/yask_log_to_csv.pl @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to diff --git a/utils/bin/yask_tuner.pl b/utils/bin/yask_tuner.pl index 799dcf59..c01008ba 100755 --- a/utils/bin/yask_tuner.pl +++ b/utils/bin/yask_tuner.pl @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this 
software and associated documentation files (the "Software"), to @@ -51,7 +51,6 @@ my $sweep = 0; # if true, sweep instead of search. my $testing = 0; # if true, don't run real trials. my $checking = 0; # if true, don't run at all. -my $mic; # set to 0, 1, etc. for KNC mic. my $host; # set to run on a different host. my $sde = 0; # run under sde (for testing). my $sim = 0; # run under any simulator/emulator. @@ -91,9 +90,8 @@ sub usage { " -debugCheck Print detailed results of sanity-checks.\n". " -killCmd= Command to kill runaway jobs (default is '$killCmd').\n". "\ntarget options:\n". - " -arch= Specify target architecture: knc, knl, hsw, ... (default is '$arch').\n". + " -arch= Specify target architecture: avx512, avx3, ... (default is '$arch').\n". " -host= Run binary on host using ssh.\n". - " -mic= Set hostname to current hostname appended with -mic; sets arch to 'knc'.\n". " -sde Run binary on SDE (for testing only).\n". " -makePrefix= Prefix make command with .*\n". " -makeArgs= Pass additional to make command.*\n". @@ -110,13 +108,13 @@ sub usage { " -sweep Use exhausitive search instead of GA.\n". " -= Force to fixed value .\n". " Run with -check for list of genes and default ranges.\n". - " Setting local-domain size (l) also sets upper block and region sizes.\n". + " Setting local-domain size (l) also sets upper block and mega-block sizes.\n". " Leave off 'x', 'y', 'z' suffix to set these 3 vars to same val.\n". " Examples: '-l=512' Set local-domain size to 512^3.\n". " '-bx=64' Set block size to 64 in 'x' dim.\n". " '-ep=0' Disable extra padding.\n". " '-c=1' Allow only one vector in a cluster.\n". - " '-r=0' Allow only one OpenMP region (region size=0 => local-domain size).\n". + " '-Mb=0' Allow only one Mega-block (Mb=0 => Mb=local-domain size).\n". " -=- Restrict between and , inclusive.\n". " Example: '-bx=8-128'.\n". " See the notes above on specification.\n". 
@@ -136,7 +134,7 @@ sub usage { "* indicates options that are invalid if -noBuild is used.\n". "\n". "examples:\n". - " $0 -stencil=iso3dfd -l=768 -r=0 -noPrefetch\n". + " $0 -stencil=iso3dfd -l=768 -Mb=0 -noPrefetch\n". " $0 -stencil=awp -lx=512 -ly=512 -lz=256 -b=4-512:4\n". " $0 -stencil=3axis -mem=8-10 -noBuild\n"; @@ -150,9 +148,9 @@ sub usage { my %geneRanges; my $autoKey = 'def_'; # prefix for default setting. -# control groups. +# control tiles. # TODO: make an option. -my $showGroups = 0; +my $showTiles = 0; # autoflush. $| = 1; @@ -201,11 +199,6 @@ sub usage { elsif ($opt =~ '^-?ranks=(\d+)$') { $nranks = $rhs; } - elsif ($opt =~ '^-?mic=(\d+)$') { - $mic = $rhs; - $arch = 'knc'; - $host = hostname()."-mic$mic"; - } elsif ($opt =~ '^-?arch=(\S+)$') { $arch = $rhs; } @@ -272,8 +265,8 @@ sub usage { # special case for local-domain size: also set default for other max sizes. if ($key =~ /^l[xyz]?$/ && $max > 0) { - my @szs = qw(r b mb sb); - push @szs, qw(bg mbg sbg) if $showGroups; + my @szs = qw(Mb b mb nb); + push @szs, qw(b_tile mb_tile nb_tile) if $showTiles; for my $i (@szs) { my $key2 = $key; $key2 =~ s/^l/$i/; @@ -299,9 +292,6 @@ sub usage { # radius. $radius = 8 if !defined $radius; -# disable folding for DP MIC (no valignq). -$folding = 0 if (defined $mic && $dp); - # dir name. my $searchTypeStr = $sweep ? 'sweep' : 'tuner'; my $hostStr = defined $host ? $host : hostname(); @@ -328,11 +318,11 @@ sub usage { unless $checking; # things to get from the run. -if ($showGroups) { +if ($showTiles) { push @YaskUtils::log_keys, - 'block-group-size', - 'mini-block-group-size'; - 'sub-block-group-size'; + 'block-tile-size', + 'micro-block-tile-size'; + 'nano-block-tile-size'; } # how many individuals to create randomly and then keep at any given time. @@ -358,21 +348,21 @@ sub usage { my $minPoints; my $maxPoints; my $minClustersInBlock = 10; -my $minBlocksInRegion = 10; +my $minBlocksInMegaBlock = 10; # Threads. 
-my $minThreadDivisor = 1; -my $maxThreadDivisor = 4; -my $minBlockThreads = 1; -my $maxBlockThreads = 32; # TODO: set to number of CPUs. +my $minOuterThreads = 8; +my $maxOuterThreads = 64; +my $minInnerThreads = 1; +my $maxInnerThreads = 4; # List of possible loop orders. my @loopOrders = ('123', '132', '213', '231', '312', '321'); # Possible space-filling curve modifiers. -my @pathNamesIncreasing = - ('', 'grouped'); +my @pathNamesIncreasing = (''); +push @pathNamesIncreasing, 'tiled' if $showTiles; my @pathNames = (@pathNamesIncreasing, 'serpentine', 'square_wave serpentine', 'square_wave'); @@ -400,15 +390,11 @@ sub usage { if $folding; } -# OMP. -my @schedules = - ( 'static,1', 'dynamic,1', 'guided,1' ); - # Data structure to describe each gene in the genome. # 2-D array. Each outer array element contains the following elements: # 0. min allowed value; '0' is a special case handled by YASK. # 1. max allowed value. -# 2. step size between values (usually 1). +# 2. step size between values. # 3. name. my @rangesAll = ( @@ -417,27 +403,27 @@ sub usage { [ $minDim, $maxDim, 16, 'ly' ], [ $minDim, $maxDim, 16, 'lz' ], - # region size. - [ 1, $maxTimeBlock, 1, 'rt' ], - [ 0, $maxDim, 1, 'rx' ], - [ 0, $maxDim, 1, 'ry' ], - [ 0, $maxDim, 1, 'rz' ], + # mega-block size. + [ 1, $maxTimeBlock, 1, 'Mbt' ], + [ 0, $maxDim, 4, 'Mbx' ], + [ 0, $maxDim, 4, 'Mby' ], + [ 0, $maxDim, 4, 'Mbz' ], # block size. [ 1, $maxTimeBlock, 1, 'bt' ], - [ 0, $maxDim, 1, 'bx' ], - [ 0, $maxDim, 1, 'by' ], - [ 0, $maxDim, 1, 'bz' ], + [ 0, $maxDim, 4, 'bx' ], + [ 0, $maxDim, 4, 'by' ], + [ 0, $maxDim, 4, 'bz' ], - # mini-block size. - [ 0, $maxDim, 1, 'mbx' ], - [ 0, $maxDim, 1, 'mby' ], - [ 0, $maxDim, 1, 'mbz' ], + # micro-block size. + [ 0, $maxDim, 4, 'mbx' ], + [ 0, $maxDim, 4, 'mby' ], + [ 0, $maxDim, 4, 'mbz' ], - # sub-block size. - [ 0, $maxDim, 1, 'sbx' ], - [ 0, $maxDim, 1, 'sby' ], - [ 0, $maxDim, 1, 'sbz' ], + # nano-block size. 
+ [ 0, $maxDim, 4, 'nbx' ], + [ 0, $maxDim, 4, 'nby' ], + [ 0, $maxDim, 4, 'nbz' ], # extra padding. [ 0, $maxPad, 1, 'epx' ], @@ -445,23 +431,28 @@ sub usage { [ 0, $maxPad, 1, 'epz' ], # threads. - [ $minThreadDivisor, $maxThreadDivisor, 1, 'thread_divisor' ], - [ $minBlockThreads, $maxBlockThreads, 1, 'block_threads' ], - [ 0, 1, 1, 'bind_block_threads' ], + [ $minOuterThreads, $maxOuterThreads, 1, 'outer_threads' ], + [ $minInnerThreads, $maxInnerThreads, 1, 'inner_threads' ], + [ 0, 1, 1, 'bind_inner_threads' ], ); -if ($showGroups) { +if ($showTiles) { push @rangesAll, ( - # block-group size. - [ 0, $maxDim, 1, 'bgx' ], - [ 0, $maxDim, 1, 'bgy' ], - [ 0, $maxDim, 1, 'bgz' ], + # block-tile size. + [ 0, $maxDim, 4, 'b_tilex' ], + [ 0, $maxDim, 4, 'b_tiley' ], + [ 0, $maxDim, 4, 'b_tilez' ], - # sub-block-group size. - [ 0, $maxDim, 1, 'sbgx' ], - [ 0, $maxDim, 1, 'sbgy' ], - [ 0, $maxDim, 1, 'sbgz' ], + # micro-block-tile size. + [ 0, $maxDim, 4, 'mb_tilex' ], + [ 0, $maxDim, 4, 'mb_tiley' ], + [ 0, $maxDim, 4, 'mb_tilez' ], + + # nano-block-tile size. + [ 0, $maxDim, 4, 'sb_tilex' ], + [ 0, $maxDim, 4, 'sb_tiley' ], + [ 0, $maxDim, 4, 'sb_tilez' ], ); } @@ -473,14 +464,14 @@ sub usage { # Loops, from the list above. # Each loop consists of index order and path mods. # Block and rank paths require increasing indices. 
- [ 0, $#loopOrders, 1, 'subBlockOrder' ], - [ 0, $#pathNames, 1, 'subBlockPath' ], - [ 0, $#loopOrders, 1, 'miniBlockOrder' ], - [ 0, $#pathNames, 1, 'miniBlockPath' ], + [ 0, $#loopOrders, 1, 'nanoBlockOrder' ], + [ 0, $#pathNames, 1, 'nanoBlockPath' ], + [ 0, $#loopOrders, 1, 'microBlockOrder' ], + [ 0, $#pathNames, 1, 'microBlockPath' ], [ 0, $#loopOrders, 1, 'blockOrder' ], [ 0, $#pathNamesIncreasing, 1, 'blockPath' ], - [ 0, $#loopOrders, 1, 'regionOrder' ], - [ 0, $#pathNames, 1, 'regionPath' ], + [ 0, $#loopOrders, 1, 'MegaBlockOrder' ], + [ 0, $#pathNames, 1, 'MegaBlockPath' ], [ 0, $#loopOrders, 1, 'rankOrder' ], [ 0, $#pathNamesIncreasing, 1, 'rankPath' ], @@ -497,10 +488,6 @@ sub usage { [ -$maxPfd_l1, $maxPfd_l1, 1, 'pfd_l1' ], [ -$maxPfd_l2, $maxPfd_l2, 1, 'pfd_l2' ], - # other build options. - [ 0, $#schedules, 1, 'ompRegionSchedule' ], # OMP schedule for region loop. - [ 0, $#schedules, 1, 'ompBlockSchedule' ], # OMP schedule for mini-block loop. - ); } @@ -678,7 +665,7 @@ ($$$) my @vals; for my $d (@dirs) { - if ($key =~ /bg$/ && !$showGroups) { + if ($key =~ /bg$/ && !$showTiles) { push @vals, 1; } else { push @vals, readHash($hash, "$key$d", $isBuildVar); @@ -765,12 +752,8 @@ ($) my $exePrefix = $timeCmd; $exePrefix .= " sde -$arch --" if $sde; - my $runCmd = "bin/yask.sh -log $outDir/yask.$stencil.$arch.run".sprintf("%06d",$run).".log"; - if (defined $mic) { - $runCmd .= " -mic $mic"; - } else { - $runCmd .= " -host $host" if defined $host; - } + my $runCmd = "bin/yask.sh -log_dir $outDir/logs -log yask.$stencil.$arch.run".sprintf("%06d",$run).".log"; + $runCmd .= " -host $host" if defined $host; $runCmd .= " -exe_prefix '$exePrefix' -stencil $tag -arch $arch -no-pre_auto_tune -no-print_suffixes"; $runCmd .= " -ranks $nranks" if defined $nranks; return $runCmd; @@ -1037,19 +1020,17 @@ ($$$$) # vars to create. my $order = join(',', @dims); # e.g., '2, 1'. my $outerMods = ''; - my $innerMods = ''; # path gene? 
my $pathKey = $tunerPrefix."Path"; if (exists $h->{$pathKey}) { my $path = readHash($h, $pathKey, 1); - my $pathStr = @pathNames[$path]; # e.g., 'grouped'. + my $pathStr = @pathNames[$path]; # e.g., 'tiled'. $outerMods = $pathStr; } my $loopVars = " ".$makePrefix."_LOOP_ORDER='$order'"; - $loopVars .= " ".$makePrefix."_LOOP_OUTER_MODS='$outerMods'"; - $loopVars .= " ".$makePrefix."_LOOP_INNER_MODS='$innerMods'"; + $loopVars .= " ".$makePrefix."_LOOP_MODS='$outerMods'"; return $loopVars; } @@ -1143,24 +1124,26 @@ sub fitness { # get individual vars from hash or fixed values. my $h = makeHash($values); my @ds = readHashes($h, 'l', 0); - my $rt = readHash($h, 'rt', 1); - my @rs = readHashes($h, 'r', 0); + my $Mbt = readHash($h, 'Mbt', 1); + my @Mbs = readHashes($h, 'Mb', 0); my $bt = readHash($h, 'bt', 1); my @bs = readHashes($h, 'b', 0); my @mbs = readHashes($h, 'mb', 0); - my @sbs = readHashes($h, 'sb', 0); - my @bgs = readHashes($h, 'bg', 0); - my @sbgs = readHashes($h, 'sbg', 0); + my @nbs = readHashes($h, 'nb', 0); + my (@b_tiles, @mb_tiles, @nb_tiles); + if ($showTiles) { + @b_tiles = readHashes($h, 'b_tile', 0); + @mb_tiles = readHashes($h, 'mb_tile', 0); + @nb_tiles = readHashes($h, 'nb_tile', 0); + } my @cvs = readHashes($h, 'c', 1); # in vectors, not in points! my @ps = readHashes($h, 'ep', 0); my $fold = readHash($h, 'fold', 1); - my $thread_divisor = readHash($h, 'thread_divisor', 0); - my $block_threads = readHash($h, 'block_threads', 0); - my $bind_block_threads = readHash($h, 'bind_block_threads', 0); + my $outer_threads = readHash($h, 'outer_threads', 0); + my $inner_threads = readHash($h, 'inner_threads', 0); + my $bind_inner_threads = readHash($h, 'bind_inner_threads', 0); my $pfd_l1 = readHash($h, 'pfd_l1', 1); my $pfd_l2 = readHash($h, 'pfd_l2', 1); - my $ompRegionSchedule = readHash($h, 'ompRegionSchedule', 1); - my $ompBlockSchedule = readHash($h, 'ompBlockSchedule', 1); # fold numbers. 
my $foldNums = $folds[$fold]; @@ -1178,39 +1161,46 @@ sub fitness { my @cs = map { $fs[$_] * $cvs[$_] } 0..$#dirs; # adjust inner sizes to fit in their enclosing sizes. - adjSizes(\@rs, \@ds); # region <= domain. - adjSizes(\@bs, \@rs); # block <= region. - adjSizes(\@mbs, \@bs); # mini-block <= block. - adjSizes(\@sbs, \@mbs); # sub-block <= mini-block. - adjSizes(\@bgs, \@rs); # block-group <= region. - adjSizes(\@sbgs, \@mbs); # sub-block-group <= mini-block. + adjSizes(\@Mbs, \@ds); # Mega-block <= domain. + adjSizes(\@bs, \@Mbs); # block <= Mega-block. + adjSizes(\@mbs, \@bs); # micro-block <= block. + adjSizes(\@nbs, \@mbs); # nano-block <= micro-block. + if ($showTiles) { + adjSizes(\@b_tiles, \@bs); # block-tile <= block. + adjSizes(\@mb_tiles, \@mbs); # micro-block-tile <= micro-block. + adjSizes(\@nb_tiles, \@nbs); # nano-block-tile <= nano-block. + } # 3d sizes in points. my $dPts = mult(@ds); - my $rPts = mult(@rs); + my $MbPts = mult(@Mbs); my $bPts = mult(@bs); my $mbPts = mult(@mbs); - my $sbPts = mult(@sbs); + my $nbPts = mult(@nbs); my $cPts = mult(@cs); my $fPts = mult(@fs); - my $bgPts = mult(@bgs); - my $sbgPts = mult(@sbgs); + my ($b_tilePts, $mb_tilePts, $nb_tilePts); + if ($showTiles) { + $b_tilePts = mult(@b_tiles); + $mb_tilePts = mult(@mb_tiles); + $nb_tilePts = mult(@nb_tiles); + } # Clusters per block. my @bcs = map { ceil($bs[$_] / $cs[$_]) } 0..$#dirs; - my $bCls = mult(@bcs); + my $bcs = mult(@bcs); - # Mini-blocks per block. + # Micro-blocks per block. my @bmbs = map { ceil($bs[$_] / $mbs[$_]) } 0..$#dirs; - my $bMbs = mult(@bmbs); + my $bmbs = mult(@bmbs); - # Blocks per region. - my @rbs = map { ceil($rs[$_] / $bs[$_]) } 0..$#dirs; - my $rBlks = mult(@rbs); + # Blocks per Mega-block. + my @Mbbs = map { ceil($Mbs[$_] / $bs[$_]) } 0..$#dirs; + my $Mbbs = mult(@Mbbs); - # Regions per rank. - my @drs = map { ceil($ds[$_] / $rs[$_]) } 0..$#dirs; - my $dRegs = mult(@drs); + # Mega-blocks per rank. 
+ my @dMbs = map { ceil($ds[$_] / $Mbs[$_]) } 0..$#dirs; + my $dMbs = mult(@dMbs); # mem usage estimate. my $overallSize = calcSize(\@ds, \@ps, \@cs); @@ -1218,19 +1208,20 @@ sub fitness { if ($debugCheck) { print "Sizes:\n"; print " local-domain size = $dPts\n"; - print " region size = $rPts\n"; + print " Mega-block size = $MbPts\n"; print " block size = $bPts\n"; - print " sub-block size = $sbPts\n"; + print " nano-block size = $nbPts\n"; print " cluster size = $cPts\n"; print " fold size = $fPts\n"; - print " regions per local-domain = $dRegs\n"; - print " blocks per region = $rBlks\n"; - print " clusters per block = $bCls\n"; - print " mini-blocks per block = $bMbs\n"; + print " Mega-blocks per local-domain = $dMbs\n"; + print " blocks per Mega-block = $Mbbs\n"; + print " clusters per block = $bcs\n"; + print " micro-blocks per block = $bmbs\n"; print " mem estimate = ".($overallSize/$YaskUtils::oneGi)." GB\n"; - if ($showGroups) { - print " block-group size = $bgPts\n"; - print " sub-block-group size = $sbgPts\n"; + if ($showTiles) { + print " block-tile size = $b_tilePts\n"; + print " micro-block-tile size = $mb_tilePts\n"; + print " nano-block-tile size = $nb_tilePts\n"; } } @@ -1257,16 +1248,16 @@ sub fitness { } # Each block should do minimal work. - if ($bCls < $minClustersInBlock) { - print " $bCls clusters per block < $minClustersInBlock\n" if $debugCheck; + if ($bcs < $minClustersInBlock) { + print " $bcs clusters per block < $minClustersInBlock\n" if $debugCheck; $checkStats{'block size too small'}++; $ok = 0; } # Should be min number of blocks. 
- if ($rBlks < $minBlocksInRegion) { - print " $rBlks blocks per region < $minBlocksInRegion\n" if $debugCheck; - $checkStats{'too few blocks per region'}++; + if ($Mbbs < $minBlocksInMegaBlock) { + print " $Mbbs blocks per Mega-block < $minBlocksInMegaBlock\n" if $debugCheck; + $checkStats{'too few blocks per Mega-block'}++; $ok = 0; } @@ -1276,28 +1267,25 @@ sub fitness { $checkStats{'ok'} += $ok; addStat($ok, 'mem estimate', $overallSize); addStat($ok, 'local-domain size', $dPts); - addStat($ok, 'region size', $rPts); + addStat($ok, 'Mega-block size', $MbPts); addStat($ok, 'block size', $bPts); - addStat($ok, 'mini-block size', $mbPts); - addStat($ok, 'sub-block size', $sbPts); + addStat($ok, 'micro-block size', $mbPts); + addStat($ok, 'nano-block size', $nbPts); addStat($ok, 'cluster size', $cPts); - addStat($ok, 'regions per local-domain', $dRegs); - addStat($ok, 'blocks per region', $rBlks); - addStat($ok, 'clusters per block', $bCls); - addStat($ok, 'mini-blocks per block', $bMbs); + addStat($ok, 'Mega-blocks per local-domain', $dMbs); + addStat($ok, 'blocks per Mega-block', $Mbbs); + addStat($ok, 'clusters per block', $bcs); + addStat($ok, 'micro-blocks per block', $bmbs); addStat($ok, 'vectors per cluster', $cvs); - if ($showGroups) { - addStat($ok, 'block-group size', $bgPts); - addStat($ok, 'sub-block-group size', $sbgPts); + if ($showTiles) { + addStat($ok, 'block-tile size', $b_tilePts); + addStat($ok, 'micro-block-tile size', $mb_tilePts); + addStat($ok, 'nano-block-tile size', $nb_tilePts); } # exit here if just checking. return $ok if $justChecking; - # OMP settings. - my $regionScheduleStr = $schedules[$ompRegionSchedule]; - my $blockScheduleStr = $schedules[$ompBlockSchedule]; - # compile-time settings. my $macros = ''; # string of macros. my $mvars = ''; # other make vars. @@ -1318,13 +1306,10 @@ sub fitness { # gen-loops vars. 
$mvars .= makeLoopVars($h, 'RANK', 'rank', 3); - $mvars .= makeLoopVars($h, 'REGION', 'region', 3); + $mvars .= makeLoopVars($h, 'MEGA_BLOCK', 'MegaBlock', 3); $mvars .= makeLoopVars($h, 'BLOCK', 'block', 3); - $mvars .= makeLoopVars($h, 'MINI_BLOCK', 'miniBlock', 3); - $mvars .= makeLoopVars($h, 'SUB_BLOCK', 'subBlock', 2); - - # other vars. - $mvars .= " omp_region_schedule=$regionScheduleStr omp_block_schedule=$blockScheduleStr"; + $mvars .= makeLoopVars($h, 'MICRO_BLOCK', 'microBlock', 3); + $mvars .= makeLoopVars($h, 'NANO_BLOCK', 'nanoBlock', 3); # how to make. my ( $makeCmd, $tag ) = getMakeCmd($macros, $mvars); @@ -1332,20 +1317,20 @@ sub fitness { # how to run. my $runCmd = getRunCmd($tag); # shell command plus any initial args. my $args = ""; # exe args. - $args .= " -thread_divisor $thread_divisor"; - $args .= " -block_threads $block_threads"; - $args .= ($bind_block_threads ? " " : " -no"). "-bind_block_threads"; + $args .= " -outer_threads $outer_threads -inner_threads $inner_threads -max_threads ".($outer_threads * $inner_threads); + $args .= ($bind_inner_threads ? " " : " -no"). "-bind_inner_threads"; # sizes. 
$args .= " -lx $ds[0] -ly $ds[1] -lz $ds[2]"; - $args .= " -rt $rt -rx $rs[0] -ry $rs[1] -rz $rs[2]"; + $args .= " -Mbt $Mbt -Mbx $Mbs[0] -Mby $Mbs[1] -Mbz $Mbs[2]"; $args .= " -bt $bt -bx $bs[0] -by $bs[1] -bz $bs[2]"; $args .= " -mbx $mbs[0] -mby $mbs[1] -mbz $mbs[2]"; - $args .= " -sbx $sbs[0] -sby $sbs[1] -sbz $sbs[2]"; + $args .= " -nbx $nbs[0] -nby $nbs[1] -nbz $nbs[2]"; $args .= " -epx $ps[0] -epy $ps[1] -epz $ps[2]"; - if ($showGroups) { - $args .= " -bgx $bgs[0] -bgy $bgs[1] -bgz $bgs[2]"; - $args .= " -sbgx $sbgs[0] -sbgy $sbgs[1] -sbgz $sbgs[2]"; + if ($showTiles) { + $args .= " -b_tilex $b_tiles[0] -b_tiley $b_tiles[1] -b_tilez $b_tiles[2]"; + $args .= " -mb_tilex $mb_tiles[0] -mb_tiley $mb_tiles[1] -mb_tilez $mb_tiles[2]"; + $args .= " -nb_tilex $nb_tiles[0] -nb_tiley $nb_tiles[1] -nb_tilez $nb_tiles[2]"; } # num of secs and trials. @@ -1371,13 +1356,13 @@ sub fitness { # Kill time is some overhead plus allocated time multipled by temporal blocking and num runs. my $maxOHead = 60 * 10; - my $killTime = $maxOHead + ($shortTime * max($rt, $bt) * $shortTrials); + my $killTime = $maxOHead + ($shortTime * max($Mbt, $bt) * $shortTrials); # Inject kill command into '-exe_prefix' part of run commands. $shortRunCmd =~ s/$timeCmd/$timeCmd $killCmd $killTime/; # Repeat for long run. 
- $killTime = $maxOHead + ($longTime * max($rt, $bt) * $longTrials); + $killTime = $maxOHead + ($longTime * max($Mbt, $bt) * $longTrials); $longRunCmd =~ s/$timeCmd/$timeCmd $killCmd $killTime/; } } diff --git a/utils/bin/yask_tuner_summary.csh b/utils/bin/yask_tuner_summary.csh index 1a2fa86c..e1775e1f 100755 --- a/utils/bin/yask_tuner_summary.csh +++ b/utils/bin/yask_tuner_summary.csh @@ -2,7 +2,7 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to diff --git a/utils/lib/CmdLine.pm b/utils/lib/CmdLine.pm index 78430241..2aee5946 100644 --- a/utils/lib/CmdLine.pm +++ b/utils/lib/CmdLine.pm @@ -1,6 +1,6 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to diff --git a/utils/lib/YaskUtils.pm b/utils/lib/YaskUtils.pm index 4ffd28e2..ab7341c5 100644 --- a/utils/lib/YaskUtils.pm +++ b/utils/lib/YaskUtils.pm @@ -1,6 +1,6 @@ ############################################################################## ## YASK: Yet Another Stencil Kit -## Copyright (c) 2014-2021, Intel Corporation +## Copyright (c) 2014-2022, Intel Corporation ## ## Permission is hereby granted, free of charge, to any person obtaining a copy ## of this software and associated documentation files (the "Software"), to @@ -28,15 +28,7 @@ package YaskUtils; use strict; use FileHandle; use Carp; - -# Special keys. 
-my $linux_key = "Linux kernel"; -my $nodes_key = "MPI node(s)"; -our @special_log_keys = - ( - $linux_key, - $nodes_key, - ); +use Scalar::Util qw(looks_like_number); # Values to get from log file. # First one should be overall "fitness". @@ -51,45 +43,112 @@ our @log_keys = 'mid throughput (est-FLOPS)', 'mid elapsed time (sec)', 'mid num-steps-done', + 'best throughput (num-points/sec)', 'best throughput (num-reads/sec)', 'best throughput (num-writes/sec)', 'best throughput (est-FLOPS)', 'best elapsed time (sec)', 'best num-steps-done', - 'yask version', - 'target', + + 'num-trials', + 'min-throughput (num-points/sec)', + 'max-throughput (num-points/sec)', + 'ave-throughput (num-points/sec)', + 'std-dev-throughput (num-points/sec)', + 'stencil name', 'stencil description', 'element size', - 'invocation', + 'script invocation', 'binary invocation', + 'yask version', + 'target', + 'num MPI ranks', - 'num ranks', 'num OpenMP threads', # also matches 'Num OpenMP threads used'. - 'num threads per region', - 'num threads per block', + 'num outer threads', + 'num inner threads', + 'device thread limit', + + 'domain size in this rank', + 'total allocation in this rank', + 'overall problem size', 'total overall allocation', + 'inner-layout dim', + 'inner-loop dim', + + 'num mega-blocks per local-domain per step', + 'num blocks per mega-block per step', + 'num micro-blocks per block per step', + 'num nano-blocks per micro-block per step', + 'num pico-blocks per nano-block per step', + + 'L1 prefetch distance', + 'L2 prefetch distance', + 'num temporal block steps', + 'num wave front steps', + 'extra padding', + 'min padding', + + # values from compiler report + 'YASK compiler invocation', + 'YC_STENCIL', + 'YC_TARGET', + 'YK_CXXVER', + 'YK_CXXCMD', + 'YK_CXXOPT', + 'YK_CXXFLAGS', + 'YK_STENCIL', + 'YK_ARCH', + 'YK_TAG', + 'YK_EXEC', + ); + +# Keys set with custom code. 
+my $linux_key = "Linux kernel"; +my $hostname_key = "hostname"; +my $nodes_key = "MPI node(s)"; +my $auto_tuner_key = "Auto-tuner used"; +my $val_key = "validation results"; +my $yask_key = "YASK vars"; +our @special_log_keys = + ( + $hostname_key, + $linux_key, + $nodes_key, + $auto_tuner_key, + $val_key, + $yask_key, + ); + +# Sizes. +our @size_log_keys = + ( 'global-domain size', 'local-domain size', - 'region size', + 'mega-block size', 'block size', - 'mini-block size', - 'sub-block size', + 'micro-block size', + 'nano-block size', + 'pico-block size', 'cluster size', 'vector size', - 'num regions per local-domain per step', - 'num blocks per region per step', - 'num mini-blocks per block per step', - 'num sub-blocks per mini-block per step', - 'extra padding', - 'minimum padding', - 'L1 prefetch distance', - 'L2 prefetch distance', - 'num temporal block steps', - 'num wave front steps', + ); +if (0) { + push @size_log_keys, + ( + 'local-domain tile size', + 'mega-block tile size', + 'block tile size', + 'micro-block tile size', + 'nano-block tile size', + ); +} - # other values from log file. +# System settings. +our @sys_log_keys = + ( 'model name', 'CPU(s)', 'core(s) per socket', @@ -100,8 +159,7 @@ our @log_keys = 'ShMem', ); -our @all_log_keys = ( @log_keys, @special_log_keys ); - +our @all_log_keys = ( @log_keys, @size_log_keys, @sys_log_keys, @special_log_keys ); our $oneKi = 1024; our $oneMi = $oneKi * $oneKi; @@ -123,7 +181,7 @@ our $onef = 1e-15; # Return a number from a number w/suffix. # Examples: -# - removeSuf("2.34K") => 2300. +# - removeSuf("2.34K") => 2340. # - removeSuf("2KiB") => 2048. # - removeSuf("foo") => "foo". sub removeSuf($) { @@ -185,9 +243,9 @@ sub getResultsFromLine($$) { # pre-process keys one time. 
if (scalar keys %proc_keys == 0) { undef %proc_keys; - for my $m (@log_keys) { + for my $k (@log_keys, @size_log_keys, @sys_log_keys) { - my $pm = lc $m; + my $pm = lc $k; $pm =~ s/^\s+//; $pm =~ s/\s+$//; @@ -196,29 +254,49 @@ sub getResultsFromLine($$) { # short key. my $sk = substr $pm,0,$klen; - + # escape regex chars. $pm =~ s/\(/\\(/g; $pm =~ s/\)/\\)/g; - $proc_keys{$sk}{$pm} = $m; + $proc_keys{$sk}{$pm} = $k; } } # Substitutions to handle old formats. + $line =~ s/^Invocation/Script invocation/g if $line !~ /yask_compiler/; $line =~ s/overall.problem/global-domain/g; $line =~ s/rank.domain/local-domain/g; $line =~ s/grid/var/g; $line =~ s/Grid/Var/g; $line =~ s/target.ISA/target/g; + $line =~ s/mini([_-])bl/micro${1}bl/g; + $line =~ s/sub([_-])bl/nano${1}bl/g; + $line =~ s/region/mega-block/g; + $line =~ s/minimum-padding/min-padding/g; + $line =~ s/Num threads per region/num outer threads/g; + $line =~ s/Num threads per block/num inner threads/g; # special cases for manual parsing... - # TODO: catch output of auto-tuner and update relevant results. + # Validation. + if ($line =~ /did not pass internal validation test/i) { + $results->{$val_key} = 'failed'; + } + elsif ($line =~ /passed internal validation test/i) { + $results->{$val_key} = 'passed'; + } + elsif ($line =~ /Results NOT VERIFIED/i) { + $results->{$val_key} = 'not verified'; + } + # Output of 'uname -a' - if ($line =~ /^\s*Linux\s/) { + elsif ($line =~ /^\s*Linux\s/) { my @w = split ' ', $line; - $results->{$linux_key} = $w[2]; # 'Linux' hostname kernel ... + + # 'Linux' hostname kernel ... + $results->{$hostname_key} = $w[1]; + $results->{$linux_key} = $w[2]; } # MPI node names. @@ -229,19 +307,25 @@ sub getResultsFromLine($$) { $results->{$nodes_key} .= $nname; } + # Vars containing "YASK". 
+ elsif ($line =~ /^env:\s+(\w*YASK\w*=.*)/) { + $results->{$yask_key} .= '; ' if + exists $results->{$yask_key}; + $results->{$yask_key} .= $1; + } + # If auto-tuner is run globally, capture updated values. # Invalidate settings overridden by auto-tuner on multiple stages. - elsif ($line =~ /^auto-tuner(.).*size:/) { + elsif ($line =~ /^\s*auto-tuner(.).*size:/) { my $c = $1; + $results->{$auto_tuner_key} = 'TRUE'; - # If colon found above, tuner is global. + # If colon found immediately after "auto-tuner", tuner is global. my $onep = ($c eq ':'); - for my $k ('block size', - 'mini-block size', - 'sub-block size',) { + for my $k (@size_log_keys) { $line =~ s/-size/ size/; - if ($line =~ / (best-)?$k:\s*(t=.*)/i) { + if ($line =~ / (best-)?$k:\s*(.*)/i) { my $val = $onep ? $2 : 'auto-tuned'; $results->{$k} = $val; } @@ -250,23 +334,27 @@ sub getResultsFromLine($$) { # look for matches to all other keys. else { - my ($key, $val) = split /:/,$line,2; + my ($key, $val) = split /[=:]/,$line,2; if (defined $val) { $key = lc $key; $key =~ s/^\s+//; + $key =~ s/\s+$//; $key =~ s/[- ]+/-/g; # relax hyphen and space match. + $val =~ s/^\s+//; + $val =~ s/\s+$//; - # short key. + # short key for quick match. my $sk = substr $key,0,$klen; - # match to short key? + # return if no match to short key. return if !exists $proc_keys{$sk}; - + # look for exact key. for my $m (keys %{$proc_keys{$sk}}) { # match? - # only check that beginning of key matches. + # Only compares found key to beginning of target, + # so beginning of target must be unique. if ($key =~ /^$m/) { $val =~ s/^\s+//; $val =~ s/\s+$//; @@ -275,6 +363,19 @@ sub getResultsFromLine($$) { # Save value w/converted suffix. my $k = $proc_keys{$sk}{$m}; $results->{$k} = $val; + + # More special processing to get env-vars set via script. + # TODO: remove overridden vars. 
+ if ($k eq 'script invocation') { + for my $w (split /\s+/, $val) { + if ($w =~ /(\w*YASK\w*=.*)/) { + $results->{$yask_key} .= '; ' if + exists $results->{$yask_key}; + $results->{$yask_key} .= $1; + } + } + } + last; } } @@ -311,20 +412,28 @@ sub printCsvHeader($) { # Does NOT print newline. sub printCsvValues($$) { my $results = shift; # ref to hash. - my $fh = shift; + my $fh = shift; # file handle. my @cols; for my $m (@all_log_keys) { my $r = $results->{$m}; - $r = '' if !defined $r; - $r = '"'.$r.'"' # add quotes if not a number. - if $r !~ /^[0-9.e+-]+$/ || $r =~ /[.].*[.]/; + + $r = '' if + !defined $r; + + # special-case fix for bogus Excel cell reference. + $r =~ s/-/ -/ if + $r =~ /^"?-[a-zA-Z]/; + + # add quotes if not a number, etc. + $r = '"'.$r.'"' if + !looks_like_number($r) && + $r !~ /^"/ && + $r ne 'TRUE' && $r ne 'FALSE'; push @cols, $r; } print $fh join(',', @cols); } -# return with a 1 so require() will not fail... -# +# return with a 1 so require() will not fail. 1; -