diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs index de1c611b7b..e03b779f14 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceBuiltn.cs @@ -55,14 +55,14 @@ public static string TopoAwareRuntimeDef(CpuTargetOptions options, ulong dataAli public static string CMakeDef(string name) { - var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "cpu_runtime.cmake")); + var cmakePath = CMakePath(Path.Combine(Path.GetDirectoryName(typeof(CSourceBuiltn).Assembly.Location)!, "Runtime", "cmake", "ntt_module.cmake")); var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/CMakeLists.txt.cshtml", new { CMakePath = cmakePath }).Result; return content; } public static string MakeMain(TIR.PrimFunction primFunction, ulong dataAlign, ulong dataUsage, ulong rdataPoolSize, IEnumerable rdataBuffers, CpuTargetOptions options) { - var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/main.cpp.cshtml", new KernelMainModel(primFunction, rdataBuffers.ToArray(), options, dataAlign, dataUsage, rdataPoolSize)).Result; + var content = RazorTemplateEngine.RenderAsync("~/CodeGen/CPU/Templates/thread_main.cpp.cshtml", new KernelMainModel(primFunction, rdataBuffers.ToArray(), options, dataAlign, dataUsage, rdataPoolSize)).Result; return content; } diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs index 3ef3373d57..0f2fd57cae 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/CSourceCompiler.cs @@ -169,9 +169,7 @@ private void ArchSpecific() private string ArgumentsSpecific(string sourcePath, string outPath) { var archConfig = RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? - "-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DBUILD_SHARED=ON" : - RuntimeInformation.IsOSPlatform(OSPlatform.Linux) ? "-DBUILD_SHARED=ON" : - string.Empty; + "-DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl" : string.Empty; #if DEBUG var config = "Debug"; diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs b/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs index c5b9437a8a..1c24067338 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/LinkableModule.cs @@ -56,7 +56,7 @@ public ILinkedModule Link(ILinkContext linkContext) Directory.CreateDirectory(dumpPath); } - using (var fs = File.Open(Path.Join(dumpPath, "main.cpp"), FileMode.Create)) + using (var fs = File.Open(Path.Join(dumpPath, "thread_main.cpp"), FileMode.Create)) { using (var writer = new StreamWriter(fs)) { @@ -112,7 +112,7 @@ public ILinkedModule Link(ILinkContext linkContext) private string CompileCSource(string sourcePath) { var compiler = new CSourceCompiler(); - var binDir = Path.Join(sourcePath, "build", "nncase_cpu_module"); + var binDir = Path.Join(sourcePath, "build", "nncase_ntt_module"); return compiler.Compile(sourcePath, binDir); } } diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml index e6c38fe1a9..df93b610c6 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/CMakeLists.txt.cshtml @@ -12,41 +12,4 @@ endif() include(@Html.Raw(Model.CMakePath)) -if(NOT APPLE AND BUILD_SHARED) - add_library(nncase_cpu_module SHARED main.cpp) - set_target_properties(nncase_cpu_module PROPERTIES PREFIX "" SUFFIX "") - set_target_properties(nncase_cpu_runtime PROPERTIES POSITION_INDEPENDENT_CODE ON) -else() - add_executable(nncase_cpu_module main.cpp) -endif() -target_compile_features(nncase_cpu_module PUBLIC cxx_std_20) -target_link_libraries(nncase_cpu_module PRIVATE nncase_cpu_runtime) -target_compile_definitions(nncase_cpu_module PUBLIC -DNNCASE_CPU_MODULE=1) - -if (MSVC) - set_target_properties(nncase_cpu_module PROPERTIES LINK_FLAGS /SUBSYSTEM:CONSOLE) - target_link_options(nncase_cpu_module PRIVATE /NODEFAULTLIB) - target_link_libraries(nncase_cpu_module PRIVATE libvcruntime msvcrt ucrt "libcpmt$<$:d>") - set_property(TARGET nncase_cpu_module PROPERTY - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") -else() - if (APPLE) - if(BUILD_STANDALONE) - target_link_options(nncase_cpu_module PRIVATE -ld_classic -lc) - else() - target_link_options(nncase_cpu_module PRIVATE -static) - target_link_options(nncase_cpu_module PRIVATE -e _module_entry -bundle -ld_classic -lc) - target_compile_options(nncase_cpu_module PRIVATE "$<$:-O1>") - endif(BUILD_STANDALONE) - else() - if (BUILD_SHARED) - target_link_options(nncase_cpu_module PRIVATE -e module_entry) - else() - if(NOT BUILD_STANDALONE) - target_link_options(nncase_cpu_module PRIVATE -static) - target_link_options(nncase_cpu_module PRIVATE -e module_entry -nostdlib) - endif(NOT BUILD_STANDALONE) - endif() - target_link_libraries(nncase_cpu_module PRIVATE gcc) - endif() -endif() +target_sources(nncase_ntt_module PRIVATE thread_main.cpp) diff --git a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/main.cpp.cshtml b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/thread_main.cpp.cshtml similarity index 92% rename from modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/main.cpp.cshtml rename to modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/thread_main.cpp.cshtml index 30293289a9..ee6a8f5b7e 100644 --- a/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/main.cpp.cshtml +++ b/modules/Nncase.Modules.CPU/CodeGen/CPU/Templates/thread_main.cpp.cshtml @@ -14,7 +14,7 @@ } #include "kernel.h" -alignas(@(Model.Alignment)) static thread_local uint8_t local_data[@(Model.DataSize)]; +//alignas(@(Model.Alignment)) static thread_local uint8_t local_data[@(Model.DataSize)]; extern "C" void thread_main(std::byte *const *inouts, const std::byte *rdata) { /* prepare inputs */ @@ -41,7 +41,9 @@ extern "C" void thread_main(std::byte *const *inouts, const std::byte *rdata) { throw new NotSupportedException($"not support multi form topology!"); } + auto local_data = (uint8_t *)nncase::ntt::runtime::thread_alloc(@Model.DataSize, @Model.Alignment); @(Model.PrimFunction.Name)(@(string.Join(", ", names)), local_data); + nncase::ntt::runtime::thread_free(local_data); } #ifdef NNCASE_STANDALONE diff --git a/ntt/cmake/cpu_runtime.cmake b/ntt/cmake/cpu_runtime.cmake deleted file mode 100644 index 73364b04f7..0000000000 --- a/ntt/cmake/cpu_runtime.cmake +++ /dev/null @@ -1,12 +0,0 @@ -cmake_minimum_required(VERSION 3.15) - -include(${CMAKE_CURRENT_LIST_DIR}/compile_flags.cmake) - -add_library(nncase_cpu_runtime OBJECT ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp) -target_compile_features(nncase_cpu_runtime PUBLIC cxx_std_20) -target_include_directories(nncase_cpu_runtime PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../include) - -if (MSVC) - set_property(TARGET nncase_cpu_runtime PROPERTY - MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") -endif() diff --git a/ntt/cmake/ntt_module.cmake b/ntt/cmake/ntt_module.cmake new file mode 100644 index 0000000000..43e6fdf1fc --- /dev/null +++ b/ntt/cmake/ntt_module.cmake @@ -0,0 +1,33 @@ +cmake_minimum_required(VERSION 3.15) + +include(${CMAKE_CURRENT_LIST_DIR}/compile_flags.cmake) + +if (BUILD_STANDALONE) + add_executable(nncase_ntt_module ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp) +else() + add_library(nncase_ntt_module SHARED ${CMAKE_CURRENT_LIST_DIR}/../src/dummy.cpp) +endif() + +target_compile_features(nncase_ntt_module PUBLIC cxx_std_20) +target_include_directories(nncase_ntt_module PUBLIC ${CMAKE_CURRENT_LIST_DIR}/../include) +set_target_properties(nncase_ntt_module PROPERTIES PREFIX "" SUFFIX "") +set_target_properties(nncase_ntt_module PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_compile_definitions(nncase_ntt_module PUBLIC -DNNCASE_CPU_MODULE=1) + +target_sources(nncase_ntt_module PRIVATE ${CMAKE_CURRENT_LIST_DIR}/../src/cpu_runtime.cpp) + +if (BUILD_STANDALONE) + target_compile_definitions(nncase_ntt_module PUBLIC -DNNCASE_STANDALONE=1) +endif() + +if (MSVC) + set_property(TARGET nncase_ntt_module PROPERTY + MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") + set_target_properties(nncase_ntt_module PROPERTIES LINK_FLAGS /SUBSYSTEM:CONSOLE) + target_link_options(nncase_ntt_module PRIVATE /NODEFAULTLIB) + target_link_libraries(nncase_ntt_module PRIVATE libvcruntime msvcrt ucrt "libcpmt$<$:d>") +elseif(APPLE) + target_link_options(nncase_ntt_module PRIVATE -ld_classic -lc) +else() + target_link_options(nncase_ntt_module PRIVATE -static) +endif() diff --git a/ntt/include/nncase/ntt/runtime.h b/ntt/include/nncase/ntt/runtime.h index ea63253e08..878a37234d 100644 --- a/ntt/include/nncase/ntt/runtime.h +++ b/ntt/include/nncase/ntt/runtime.h @@ -23,16 +23,8 @@ #endif namespace nncase::ntt::runtime { -enum class module_main_reason { - block_main, - thread_main, -}; -} +void *thread_alloc(size_t bytes, size_t alignment); +void thread_free(void *ptr); +} // namespace nncase::ntt::runtime -extern "C" { -extern void thread_main(std::byte *const *inouts, const std::byte *rdata); - -extern NTT_RUNTIME_API void -module_entry(nncase::ntt::runtime::module_main_reason reason, void *params); -using module_entry_t = decltype(module_entry) *; -} +extern "C" void thread_main(std::byte *const *inouts, const std::byte *rdata); diff --git a/ntt/include/nncase/ntt/runtime/cpu_runtime.h b/ntt/include/nncase/ntt/runtime/cpu_runtime.h index 7e32d06ca7..bc4abbeef0 100644 --- a/ntt/include/nncase/ntt/runtime/cpu_runtime.h +++ b/ntt/include/nncase/ntt/runtime/cpu_runtime.h @@ -17,78 +17,61 @@ #include "../runtime.h" #include -extern "C" { -struct nncase_runtime_cpu_mt_t { - float (*acosf)(float v); - float (*acoshf)(float v); - float (*asinf)(float v); - float (*asinhf)(float v); - float (*copysignf)(float mag, float sgn); - float (*cosf)(float v); - float (*coshf)(float v); - float (*erff)(float v); - float (*expf)(float v); - float (*fmodf)(float x, float y); - float (*logf)(float v); - float (*nearbyintf)(float v); - float (*powf)(float x, float y); - float (*roundf)(float v); - float (*sinf)(float v); - float (*sinhf)(float v); - float (*sqrtf)(float v); - float (*tanhf)(float v); - - uint8_t *(*sram_address)(int bid, int tid); - - void (*failfast)(const char *format, va_list args); - -#ifndef WIN32 - void *(*memcpy)(void *dst, const void *src, size_t len); - void *(*memmove)(void *dst, const void *src, size_t len); - void *(*memset)(void *b, int c, size_t len); +#ifdef __APPLE__ +#include #endif -}; -struct nncase_runtime_cpu_block_params_t { - const nncase_runtime_cpu_mt_t *cpu_mt; +namespace nncase::ntt::runtime { +struct cpu_block_entry_params_t { size_t tdim; size_t bdim; size_t cdim; + size_t bid; + size_t cid; + size_t cpu_id_offset; + std::byte *const *inouts; + const std::byte *rdata; +#ifdef __APPLE__ + pthread_key_t cpu_thread_context_key; +#endif }; -struct nncase_runtime_cpu_thread_params_t { +struct cpu_thread_context_t { size_t tid; size_t bid; size_t cid; - std::byte *const *inouts; - const std::byte *rdata; + + static cpu_thread_context_t ¤t() noexcept; }; -} -namespace nncase::ntt::runtime { -extern const nncase_runtime_cpu_mt_t *cpu_mt; extern size_t tdim; extern size_t bdim; extern size_t cdim; - -extern thread_local size_t tid; -extern thread_local size_t bid; -extern thread_local size_t cid; } // namespace nncase::ntt::runtime namespace nncase::ntt { template <> struct program_id_getter<0> { - static size_t id() noexcept { return runtime::tid; } + static size_t id() noexcept { + return runtime::cpu_thread_context_t::current().tid; + } static size_t dim() noexcept { return runtime::tdim; } }; template <> struct program_id_getter<1> { - static size_t id() noexcept { return runtime::bid; } + static size_t id() noexcept { + return runtime::cpu_thread_context_t::current().bid; + } static size_t dim() noexcept { return runtime::bdim; } }; template <> struct program_id_getter<2> { - static size_t id() noexcept { return runtime::cid; } + static size_t id() noexcept { + return runtime::cpu_thread_context_t::current().cid; + } static size_t dim() noexcept { return runtime::cdim; } }; } // namespace nncase::ntt + +extern "C" void +block_entry(const nncase::ntt::runtime::cpu_block_entry_params_t ¶ms); +using block_entry_t = decltype(block_entry) *; diff --git a/ntt/src/cpu_runtime.cpp b/ntt/src/cpu_runtime.cpp index 7568111f78..96c0f58759 100644 --- a/ntt/src/cpu_runtime.cpp +++ b/ntt/src/cpu_runtime.cpp @@ -12,119 +12,116 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include #include #include -#include #include +#include #include #include +#include + +#ifdef WIN32 +#include +#elif defined(__APPLE__) +#include +#include +#else +#include +#endif namespace nncase::ntt::runtime { -const nncase_runtime_cpu_mt_t *cpu_mt; size_t tdim; size_t bdim; size_t cdim; -thread_local size_t tid; -thread_local size_t bid; -thread_local size_t cid; -} // namespace nncase::ntt::runtime - -using namespace nncase::ntt::runtime; - -extern "C" { -#ifndef NNCASE_STANDALONE -// compiler support -#if defined(_MSC_VER) -#pragma function(acosf) -#pragma function(asinf) -#pragma function(cosf) -#pragma function(coshf) -#pragma function(erff) -#pragma function(expf) -#pragma function(fmodf) -#pragma function(logf) -#pragma function(powf) -#pragma function(roundf) -#pragma function(sinf) -#pragma function(sinhf) -#pragma function(sqrtf) -#pragma function(tanhf) +#ifdef __APPLE__ +pthread_key_t cpu_thread_context_key; +#else +thread_local cpu_thread_context_t cpu_thread_context; #endif -float acosf(float v) { return cpu_mt->acosf(v); } -float acoshf(float v) { return cpu_mt->acoshf(v); } -float asinf(float v) { return cpu_mt->asinf(v); } -float asinhf(float v) { return cpu_mt->asinhf(v); } -float copysignf(float mag, float sgn) { return cpu_mt->copysignf(mag, sgn); } -float cosf(float v) { return cpu_mt->cosf(v); } -float coshf(float v) { return cpu_mt->coshf(v); } -float erff(float v) { return cpu_mt->erff(v); } -float expf(float v) { return cpu_mt->expf(v); } -float fmodf(float x, float y) { return cpu_mt->fmodf(x, y); } -float logf(float v) { return cpu_mt->logf(v); } -float nearbyintf(float v) { return cpu_mt->nearbyintf(v); } -float powf(float x, float y) { return cpu_mt->powf(x, y); } -float roundf(float v) { return cpu_mt->roundf(v); } -float sinf(float v) { return cpu_mt->sinf(v); } -float sinhf(float v) { return cpu_mt->sinhf(v); } -float sqrtf(float v) { return cpu_mt->sqrtf(v); } -float tanhf(float v) { return cpu_mt->tanhf(v); } - +void *thread_alloc(size_t bytes, size_t alignment) { #ifdef WIN32 -void _invalid_parameter(wchar_t const *const expression, - wchar_t const *const function_name, - wchar_t const *const file_name, - unsigned int const line_number, - uintptr_t const reserved) { - cpu_mt->failfast("invalid_parameter", (va_list)0); + return _aligned_malloc(bytes, alignment); +#else + size_t mask = alignment - 1; + size_t aligned_bytes = bytes + (-bytes & mask); + auto ptr = aligned_alloc(alignment, aligned_bytes); + if (!ptr) { + std::terminate(); + } + return ptr; +#endif } -int _CrtDbgReport(int reportType, const char *filename, int linenumber, - const char *moduleName, const char *format, ...) { - va_list args; - va_start(args, format); - cpu_mt->failfast(format, args); - va_end(args); - return 0; -} +void thread_free(void *ptr) { +#ifdef WIN32 + _aligned_free(ptr); #else -void *memcpy(void *dst, const void *src, size_t len) { - return cpu_mt->memcpy(dst, src, len); + free(ptr); +#endif } +} // namespace nncase::ntt::runtime + +using namespace nncase::ntt::runtime; -void *memmove(void *dst, const void *src, size_t len) { - return cpu_mt->memmove(dst, src, len); +cpu_thread_context_t &cpu_thread_context_t::current() noexcept { +#ifndef __APPLE__ + return cpu_thread_context; +#else + return *reinterpret_cast( + pthread_getspecific(cpu_thread_context_key)); +#endif } -void *memset(void *b, int c, size_t len) { return cpu_mt->memset(b, c, len); } +extern "C" void block_entry(const cpu_block_entry_params_t ¶ms) { + tdim = params.tdim; + bdim = params.bdim; + cdim = params.cdim; + +#ifdef __APPLE__ + cpu_thread_context_key = params.cpu_thread_context_key; #endif + + std::vector threads; + for (size_t tid = 0; tid < tdim; tid++) { + threads.emplace_back([tid, params] { +#ifdef __APPLE__ + pthread_setspecific(cpu_thread_context_key, + new cpu_thread_context_t +#else + cpu_thread_context_t::current() = +#endif + { + .tid = tid, + .bid = params.bid, + .cid = params.cid, + } +#ifdef __APPLE__ + ); +#else + ; #endif -void module_entry(nncase::ntt::runtime::module_main_reason reason, - void *params) { - switch (reason) { - case nncase::ntt::runtime::module_main_reason::block_main: { - auto block_params = - reinterpret_cast(params); - cpu_mt = block_params->cpu_mt; - tdim = block_params->tdim; - bdim = block_params->bdim; - cdim = block_params->cdim; - break; - } - case nncase::ntt::runtime::module_main_reason::thread_main: { - auto thread_params = - reinterpret_cast(params); - tid = thread_params->tid; - bid = thread_params->bid; - cid = thread_params->cid; - thread_main(thread_params->inouts, thread_params->rdata); - break; - } - default: - break; + size_t cpu_id = params.cpu_id_offset + tid; +#if WIN32 + SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)1 << cpu_id); +#elif defined(__APPLE__) + thread_affinity_policy_data_t policy = {(int)cpu_id}; + thread_policy_set(pthread_mach_thread_np(pthread_self()), + THREAD_AFFINITY_POLICY, (thread_policy_t)&policy, + THREAD_AFFINITY_POLICY_COUNT); +#else + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu_id, &cpuset); + pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); +#endif + cpu_thread_context_t::current().tid = tid; + thread_main(params.inouts, params.rdata); + }); } + + for (auto &t : threads) + t.join(); } -} // extern "C" diff --git a/ntt/src/dummy.cpp b/ntt/src/dummy.cpp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp b/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp index 93d7a345b2..42f8fcaa16 100644 --- a/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp +++ b/src/Native/src/runtime/cpu/loaders/elf/elf_loader.cpp @@ -90,7 +90,7 @@ void elf_loader::load(std::span elf) { throw std::runtime_error("dlopen error:" + std::string(dlerror())); } - entry_ = dlsym(handle_, "module_entry"); + entry_ = dlsym(handle_, "block_entry"); if (!entry_) { throw std::runtime_error("dlsym error:" + std::string(dlerror())); } diff --git a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp index 9cb6b47a8f..f7ae78fdb7 100644 --- a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp +++ b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.cpp @@ -14,6 +14,8 @@ */ #include "macho_loader.h" #include +#include +#include #include #include #include @@ -21,18 +23,31 @@ using namespace nncase::runtime; +#define THROW_SYS_IF_NOT(x) \ + if (!(x)) { \ + throw std::system_error(errno, std::system_category()); \ + } + macho_loader::~macho_loader() { +#if 0 if (!NSUnLinkModule(reinterpret_cast(mod_), NSUNLINKMODULE_OPTION_NONE)) { - // throw std::runtime_error("NSUnLinkModule failed"); + abort(); } if (!NSDestroyObjectFileImage(reinterpret_cast(ofi_))) { - // throw std::runtime_error("NSDestroyObjectFileImage failed"); + + abort(); } +#else + if (mod_) { + dlclose(mod_); + } +#endif } void macho_loader::load(std::span macho) { +#if 0 if (NSCreateObjectFileImageFromMemory( macho.data(), macho.size_bytes(), reinterpret_cast(&ofi_)) != @@ -51,8 +66,32 @@ void macho_loader::load(std::span macho) { if (sym_ == NULL) { throw std::runtime_error("NSLookupSymbolInModule failed"); } +#else + char temp_path[] = "/tmp/nncase.function.cpu.XXXXXX"; + { + auto func_file = mkstemp(temp_path); + THROW_SYS_IF_NOT(func_file != -1); + THROW_SYS_IF_NOT(write(func_file, (char *)macho.data(), macho.size()) != + -1); + THROW_SYS_IF_NOT(close(func_file) != -1); + } + + mod_ = dlopen(temp_path, RTLD_NOW); + if (!mod_) { + throw std::runtime_error("dlopen error:" + std::string(dlerror())); + } + + sym_ = dlsym(mod_, "block_entry"); + if (!sym_) { + throw std::runtime_error("dlsym error:" + std::string(dlerror())); + } +#endif } void *macho_loader::entry() const noexcept { +#if 0 return NSAddressOfSymbol(reinterpret_cast(sym_)); +#else + return sym_; +#endif } diff --git a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h index efba7af607..c91ac39fa7 100644 --- a/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h +++ b/src/Native/src/runtime/cpu/loaders/macho/macho_loader.h @@ -20,14 +20,23 @@ BEGIN_NS_NNCASE_RUNTIME class macho_loader { public: - macho_loader() noexcept : ofi_(nullptr), mod_(nullptr), sym_(nullptr) {} + macho_loader() noexcept + : +#if 0 + ofi_(nullptr), +#endif + mod_(nullptr), + sym_(nullptr) { + } ~macho_loader(); void load(std::span macho); void *entry() const noexcept; private: +#if 0 void *ofi_; +#endif void *mod_; void *sym_; }; diff --git a/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp b/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp index fa6d7a8127..19120fb538 100644 --- a/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp +++ b/src/Native/src/runtime/cpu/loaders/pe/pe_loader.cpp @@ -73,7 +73,7 @@ void pe_loader::load(std::span pe) { image_ = LoadLibraryW(temp_filename); THROW_WIN32_IF_NOT(image_); - entry_ = (void *)GetProcAddress((HMODULE)image_, "module_entry"); + entry_ = (void *)GetProcAddress((HMODULE)image_, "block_entry"); THROW_WIN32_IF_NOT(entry_); #else auto dos_header = reinterpret_cast(pe.data()); diff --git a/src/Native/src/runtime/cpu/runtime_function.cpp b/src/Native/src/runtime/cpu/runtime_function.cpp index 5ca027ec75..55a88e2dd7 100644 --- a/src/Native/src/runtime/cpu/runtime_function.cpp +++ b/src/Native/src/runtime/cpu/runtime_function.cpp @@ -27,6 +27,7 @@ using namespace nncase; using namespace nncase::runtime; using namespace nncase::runtime::cpu; +using namespace nncase::ntt::runtime; typedef struct { uint32_t tdim; @@ -36,7 +37,7 @@ typedef struct { } desc_header; cpu_runtime_function::cpu_runtime_function(runtime_module &rt_module) - : runtime_function(rt_module), module_entry_(nullptr), tdim_(0), bdim_(0) {} + : runtime_function(rt_module), block_entry_(nullptr), tdim_(0), bdim_(0) {} cpu_runtime_function::~cpu_runtime_function() {} @@ -57,7 +58,7 @@ result cpu_runtime_function::initialize_core( auto text = module().text().subspan(context.header().entrypoint, context.header().text_size); loader_.load(text); - module_entry_ = (module_entry_t)loader_.entry(); + block_entry_ = (block_entry_t)loader_.entry(); return ok(); } diff --git a/src/Native/src/runtime/cpu/runtime_function.h b/src/Native/src/runtime/cpu/runtime_function.h index 925c1455f3..51b3b1f165 100644 --- a/src/Native/src/runtime/cpu/runtime_function.h +++ b/src/Native/src/runtime/cpu/runtime_function.h @@ -54,7 +54,7 @@ class cpu_runtime_function final : public runtime_function { elf_loader loader_; #endif - module_entry_t module_entry_; + block_entry_t block_entry_; uint64_t tdim_; uint64_t bdim_; uint64_t cdim_; diff --git a/src/Native/src/runtime/cpu/runtime_function.run.cpp b/src/Native/src/runtime/cpu/runtime_function.run.cpp index f9fa2bc27f..072f11b2c6 100644 --- a/src/Native/src/runtime/cpu/runtime_function.run.cpp +++ b/src/Native/src/runtime/cpu/runtime_function.run.cpp @@ -21,114 +21,40 @@ #include #include #include - -#ifdef WIN32 -#include -#elif defined(__APPLE__) -#include -#include -#else -#include -#endif +#include using namespace nncase; using namespace nncase::runtime; using namespace nncase::runtime::cpu; - -namespace { -#define SRAM_SIZE_PER_BLOCK (1024 * 1024 * 4) -#define SRAM_SIZE_PER_THREAD (SRAM_SIZE_PER_BLOCK) - -static uint8_t _sram[1][SRAM_SIZE_PER_BLOCK]; -static uint8_t *_block_sram_ptr[] = {_sram[0]}; -static uint8_t *sram_address(int bid, int tid) { - return _block_sram_ptr[bid] + (SRAM_SIZE_PER_BLOCK * tid); -} - -static void failfast(const char *foramt, va_list args) { - char buffer[1024]; - vsprintf(buffer, foramt, args); - throw std::runtime_error(buffer); -} - -nncase_runtime_cpu_mt_t nncase_cpu_mt_ = { - .acosf = acosf, - .acoshf = acoshf, - .asinf = asinf, - .asinhf = asinhf, - .copysignf = copysignf, - .cosf = cosf, - .coshf = coshf, - .erff = erff, - .expf = expf, - .fmodf = fmodf, - .logf = logf, - .nearbyintf = nearbyintf, - .powf = powf, - .roundf = roundf, - .sinf = sinf, - .sinhf = sinhf, - .sqrtf = sqrtf, - .tanhf = tanhf, - .sram_address = sram_address, - .failfast = failfast, - -#ifndef WIN32 - .memcpy = memcpy, - .memmove = memmove, - .memset = memset, -#endif -}; -} // namespace +using namespace nncase::ntt::runtime; result cpu_runtime_function::run(std::span params) noexcept { - std::vector threads; + std::vector blocks; for (size_t cid = 0; cid < cdim_; cid++) { for (size_t bid = 0; bid < bdim_; bid++) { - nncase_runtime_cpu_block_params_t block_params{ - .cpu_mt = &nncase_cpu_mt_, - .tdim = tdim_, - .bdim = bdim_, - .cdim = cdim_, - }; - module_entry_(ntt::runtime::module_main_reason::block_main, - &block_params); - for (size_t tid = 0; tid < tdim_; tid++) { - threads.emplace_back([this, cid, tid, bid, params] { - size_t cpu_id = (cid * bdim_ + bid) * tdim_ + tid; -#if WIN32 - SetThreadAffinityMask(GetCurrentThread(), - (DWORD_PTR)1 << cpu_id); -#elif defined(__APPLE__) - thread_affinity_policy_data_t policy = {(int)cpu_id}; - thread_policy_set(pthread_mach_thread_np(pthread_self()), - THREAD_AFFINITY_POLICY, - (thread_policy_t)&policy, - THREAD_AFFINITY_POLICY_COUNT); -#else - cpu_set_t cpuset; - CPU_ZERO(&cpuset); - CPU_SET(cpu_id, &cpuset); - pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), - &cpuset); + blocks.emplace_back([cid, bid, params, this] { + cpu_block_entry_params_t block_entry_params{ + .tdim = tdim_, + .bdim = bdim_, + .cdim = cdim_, + .bid = bid, + .cid = cid, + .cpu_id_offset = (cid * bdim_ + bid) * tdim_, + .inouts = params.data(), + .rdata = module().rdata().data(), +#ifdef __APPLE__ + .cpu_thread_context_key = module().cpu_thread_context_key(), #endif + }; - nncase_runtime_cpu_thread_params_t thread_params{ - .tid = tid, - .bid = bid, - .cid = cid, - .inouts = params.data(), - .rdata = module().rdata().data(), - }; - module_entry_(ntt::runtime::module_main_reason::thread_main, - &thread_params); - }); - } + block_entry_(block_entry_params); + }); } } - for (auto &t : threads) - t.join(); + for (auto &block : blocks) { + block.join(); + } return ok(); } diff --git a/src/Native/src/runtime/cpu/runtime_module.cpp b/src/Native/src/runtime/cpu/runtime_module.cpp index 8747b3ba79..9ac44f3271 100644 --- a/src/Native/src/runtime/cpu/runtime_module.cpp +++ b/src/Native/src/runtime/cpu/runtime_module.cpp @@ -14,6 +14,7 @@ */ #include "runtime_module.h" #include "runtime_function.h" +#include #include #include #include @@ -22,6 +23,20 @@ using namespace nncase; using namespace nncase::runtime; using namespace nncase::runtime::cpu; +using namespace nncase::ntt::runtime; + +cpu_runtime_module::cpu_runtime_module() noexcept { +#ifdef __APPLE__ + pthread_key_create(&cpu_thread_context_key_, + [](void *ptr) { delete (cpu_thread_context_t *)ptr; }); +#endif +} + +cpu_runtime_module::~cpu_runtime_module() { +#ifdef __APPLE__ + pthread_key_delete(cpu_thread_context_key_); +#endif +} result cpu_runtime_module::initialize_before_functions( runtime_module_init_context &context) noexcept { diff --git a/src/Native/src/runtime/cpu/runtime_module.h b/src/Native/src/runtime/cpu/runtime_module.h index af15d2fa8a..ce8c6f03db 100644 --- a/src/Native/src/runtime/cpu/runtime_module.h +++ b/src/Native/src/runtime/cpu/runtime_module.h @@ -17,15 +17,28 @@ #include #include +#ifdef __APPLE__ +#include +#endif + BEGIN_NS_NNCASE_RT_MODULE(cpu) class cpu_runtime_module : public runtime_module { public: + cpu_runtime_module() noexcept; + virtual ~cpu_runtime_module(); + kernels::kernel_context &kernel_context() noexcept; std::span text() const noexcept { return text_; } std::span rdata() const noexcept { return rdata_; } +#ifdef __APPLE__ + pthread_key_t cpu_thread_context_key() const noexcept { + return cpu_thread_context_key_; + } +#endif + protected: result initialize_before_functions( runtime_module_init_context &context) noexcept override; @@ -37,6 +50,10 @@ class cpu_runtime_module : public runtime_module { std::span rdata_; host_buffer_t text_storage_; host_buffer_t rdata_storage_; + +#ifdef __APPLE__ + pthread_key_t cpu_thread_context_key_ = {}; +#endif }; END_NS_NNCASE_RT_MODULE