diff --git a/.github/workflows/scripts/linux/build-dependencies-qt.sh b/.github/workflows/scripts/linux/build-dependencies-qt.sh index 9c1f55651913c..edad131e659a5 100755 --- a/.github/workflows/scripts/linux/build-dependencies-qt.sh +++ b/.github/workflows/scripts/linux/build-dependencies-qt.sh @@ -16,10 +16,10 @@ fi LIBBACKTRACE=ad106d5fdd5d960bd33fae1c48a351af567fd075 LIBJPEG=9f -LIBPNG=1.6.43 +LIBPNG=1.6.44 LIBWEBP=1.4.0 LZ4=b8fd2d15309dd4e605070bd4486e26b6ef814e29 -SDL=SDL2-2.30.8 +SDL=SDL2-2.30.9 QT=6.8.0 ZSTD=1.5.6 @@ -34,10 +34,10 @@ cd deps-build cat > SHASUMS < SHASUMS < SHASUMS <series == cpuinfo_arm_chipset_series_unisoc_t && chipset->model == 310) { cpuinfo_log_warning("VDOT instructions disabled: cause occasional SIGILL on Unisoc T310"); + } else if (chipset->series == cpuinfo_arm_chipset_series_unisoc_ums && chipset->model == 312) { + cpuinfo_log_warning("VDOT instructions disabled: cause occasional SIGILL on Unisoc UMS312"); } else { switch (midr & (CPUINFO_ARM_MIDR_IMPLEMENTER_MASK | CPUINFO_ARM_MIDR_PART_MASK)) { case UINT32_C(0x4100D0B0): /* Cortex-A76 */ diff --git a/3rdparty/cpuinfo/src/arm/linux/aarch64-isa.c b/3rdparty/cpuinfo/src/arm/linux/aarch64-isa.c index db5349ecf8f99..bc2186f61c05a 100644 --- a/3rdparty/cpuinfo/src/arm/linux/aarch64-isa.c +++ b/3rdparty/cpuinfo/src/arm/linux/aarch64-isa.c @@ -3,9 +3,11 @@ #include #include +#include + void cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo( uint32_t features, - uint32_t features2, + uint64_t features2, uint32_t midr, const struct cpuinfo_arm_chipset chipset[restrict static 1], struct cpuinfo_arm_isa isa[restrict static 1]) { @@ -142,6 +144,27 @@ void cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo( if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SVE2) { isa->sve2 = true; } + if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SME) { + isa->sme = true; + } + if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SME2) { + isa->sme2 = true; + } + if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SME2P1) { + isa->sme2p1 = true; + } + if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SME_I16I32) { + isa->sme_i16i32 = true; + } + if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SME_BI32I32) { + isa->sme_bi32i32 = true; + } + if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SME_B16B16) { + isa->sme_b16b16 = true; + } + if (features2 & CPUINFO_ARM_LINUX_FEATURE2_SME_F16F16) { + isa->sme_f16f16 = true; + } // SVEBF16 is set iff SVE and BF16 are both supported, but the SVEBF16 // feature flag was added in Linux kernel before the BF16 feature flag, // so we check for either. @@ -151,4 +174,21 @@ void cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo( if (features & CPUINFO_ARM_LINUX_FEATURE_ASIMDFHM) { isa->fhm = true; } + +#ifndef PR_SVE_GET_VL +#define PR_SVE_GET_VL 51 +#endif + +#ifndef PR_SVE_VL_LEN_MASK +#define PR_SVE_VL_LEN_MASK 0xffff +#endif + + int ret = prctl(PR_SVE_GET_VL); + if (ret < 0) { + cpuinfo_log_warning("No SVE support on this machine"); + isa->svelen = 0; // Assume no SVE support if the call fails + } else { + // Mask out the SVE vector length bits + isa->svelen = ret & PR_SVE_VL_LEN_MASK; + } } diff --git a/3rdparty/cpuinfo/src/arm/linux/api.h b/3rdparty/cpuinfo/src/arm/linux/api.h index 365fea6c7aaa5..14fed7ceb562d 100644 --- a/3rdparty/cpuinfo/src/arm/linux/api.h +++ b/3rdparty/cpuinfo/src/arm/linux/api.h @@ -137,6 +137,13 @@ struct cpuinfo_arm_linux_proc_cpuinfo_cache { #define CPUINFO_ARM_LINUX_FEATURE2_DGH UINT32_C(0x00008000) #define CPUINFO_ARM_LINUX_FEATURE2_RNG UINT32_C(0x00010000) #define CPUINFO_ARM_LINUX_FEATURE2_BTI UINT32_C(0x00020000) +#define CPUINFO_ARM_LINUX_FEATURE2_SME UINT32_C(0x00800000) +#define CPUINFO_ARM_LINUX_FEATURE2_SME2 UINT64_C(0x0000002000000000) +#define CPUINFO_ARM_LINUX_FEATURE2_SME2P1 UINT64_C(0x0000004000000000) +#define CPUINFO_ARM_LINUX_FEATURE2_SME_I16I32 UINT64_C(0x0000008000000000) +#define CPUINFO_ARM_LINUX_FEATURE2_SME_BI32I32 UINT64_C(0x0000010000000000) +#define CPUINFO_ARM_LINUX_FEATURE2_SME_B16B16 UINT64_C(0x0000020000000000) +#define CPUINFO_ARM_LINUX_FEATURE2_SME_F16F16 UINT64_C(0x0000040000000000) #endif #define CPUINFO_ARM_LINUX_VALID_ARCHITECTURE UINT32_C(0x00010000) @@ -172,7 +179,7 @@ struct cpuinfo_arm_linux_processor { struct cpuinfo_arm_linux_proc_cpuinfo_cache proc_cpuinfo_cache; #endif uint32_t features; - uint32_t features2; + uint64_t features2; /** * Main ID Register value. */ @@ -295,14 +302,14 @@ CPUINFO_INTERNAL bool cpuinfo_arm_linux_parse_proc_cpuinfo( #if CPUINFO_ARCH_ARM CPUINFO_INTERNAL bool cpuinfo_arm_linux_hwcap_from_getauxval( uint32_t hwcap[restrict static 1], - uint32_t hwcap2[restrict static 1]); + uint64_t hwcap2[restrict static 1]); CPUINFO_INTERNAL bool cpuinfo_arm_linux_hwcap_from_procfs( uint32_t hwcap[restrict static 1], - uint32_t hwcap2[restrict static 1]); + uint64_t hwcap2[restrict static 1]); CPUINFO_INTERNAL void cpuinfo_arm_linux_decode_isa_from_proc_cpuinfo( uint32_t features, - uint32_t features2, + uint64_t features2, uint32_t midr, uint32_t architecture_version, uint32_t architecture_flags, @@ -311,11 +318,11 @@ CPUINFO_INTERNAL void cpuinfo_arm_linux_decode_isa_from_proc_cpuinfo( #elif CPUINFO_ARCH_ARM64 CPUINFO_INTERNAL void cpuinfo_arm_linux_hwcap_from_getauxval( uint32_t hwcap[restrict static 1], - uint32_t hwcap2[restrict static 1]); + uint64_t hwcap2[restrict static 1]); CPUINFO_INTERNAL void cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo( uint32_t features, - uint32_t features2, + uint64_t features2, uint32_t midr, const struct cpuinfo_arm_chipset chipset[restrict static 1], struct cpuinfo_arm_isa isa[restrict static 1]); diff --git a/3rdparty/cpuinfo/src/arm/linux/chipset.c b/3rdparty/cpuinfo/src/arm/linux/chipset.c index 1f93351dddb0e..c4977c388880b 100644 --- a/3rdparty/cpuinfo/src/arm/linux/chipset.c +++ b/3rdparty/cpuinfo/src/arm/linux/chipset.c @@ -85,6 +85,7 @@ static enum cpuinfo_arm_chipset_vendor chipset_series_vendor[cpuinfo_arm_chipset [cpuinfo_arm_chipset_series_telechips_tcc] = cpuinfo_arm_chipset_vendor_telechips, [cpuinfo_arm_chipset_series_texas_instruments_omap] = cpuinfo_arm_chipset_vendor_texas_instruments, [cpuinfo_arm_chipset_series_unisoc_t] = cpuinfo_arm_chipset_vendor_unisoc, + [cpuinfo_arm_chipset_series_unisoc_ums] = cpuinfo_arm_chipset_vendor_unisoc, [cpuinfo_arm_chipset_series_wondermedia_wm] = cpuinfo_arm_chipset_vendor_wondermedia, }; @@ -959,6 +960,70 @@ static bool match_t(const char* start, const char* end, struct cpuinfo_arm_chips return true; } +/** + * Tries to match, case-sentitively, /Unisoc UMS\d{3,4}/ signature for Unisoc UMS + * chipset. If match successful, extracts model information into \p chipset + * argument. + * + * @param start - start of the platform identifier (/proc/cpuinfo Hardware + * string, ro.product.board, ro.board.platform, or ro.chipname) to match. + * @param end - end of the platform identifier (/proc/cpuinfo Hardware string, + * ro.product.board, ro.board.platform, or ro.chipname) to match. + * @param[out] chipset - location where chipset information will be stored upon + * a successful match. + * + * @returns true if signature matched, false otherwise. + */ +static bool match_ums(const char* start, const char* end, struct cpuinfo_arm_chipset chipset[restrict static 1]) { + /* Expect 13-14 symbols: "Unisoc UMS" (10 symbols) + 3-4-digit model number + */ + const size_t length = end - start; + switch (length) { + case 13: + case 14: + break; + default: + return false; + } + + /* Check that string starts with "Unisoc UMS". The first four characters + * are loaded as 32-bit little endian word */ + const uint32_t expected_unis = load_u32le(start); + if (expected_unis != UINT32_C(0x73696E55) /* "sinU" = reverse("Unis") */) { + return false; + } + + /* The next four characters are loaded as 32-bit little endian word */ + const uint32_t expected_oc_u = load_u32le(start + 4); + if (expected_oc_u != UINT32_C(0x5520636F) /* "U co" = reverse("oc U") */) { + return false; + } + + /* The next four characters are loaded as 16-bit little endian word */ + const uint16_t expected_ms = load_u16le(start + 8); + if (expected_ms != UINT16_C(0x534D) /* "SM" = reverse("MS") */) { + return false; + } + + /* Validate and parse 3-4 digit model number */ + uint32_t model = 0; + for (uint32_t i = 10; i < length; i++) { + const uint32_t digit = (uint32_t)(uint8_t)start[i] - '0'; + if (digit >= 10) { + /* Not really a digit */ + return false; + } + model = model * 10 + digit; + } + + *chipset = (struct cpuinfo_arm_chipset){ + .vendor = cpuinfo_arm_chipset_vendor_unisoc, + .series = cpuinfo_arm_chipset_series_unisoc_ums, + .model = model, + }; + return true; +} + /** * Tries to match /lc\d{4}[a-z]?$/ signature for Leadcore LC chipsets. * If match successful, extracts model information into \p chipset argument. @@ -2508,6 +2573,16 @@ struct cpuinfo_arm_chipset cpuinfo_arm_linux_decode_chipset_from_proc_cpuinfo_ha return chipset; } + /* Check Unisoc UMS signature */ + if (match_ums(hardware, hardware_end, &chipset)) { + cpuinfo_log_debug( + "matched Unisoc UMS signature in /proc/cpuinfo Hardware string \"%.*s\"", + (int)hardware_length, + hardware); + + return chipset; + } + #if CPUINFO_ARCH_ARM /* Check Marvell PXA signature */ if (match_pxa(hardware, hardware_end, &chipset)) { @@ -3726,6 +3801,7 @@ static const char* chipset_series_string[cpuinfo_arm_chipset_series_max] = { [cpuinfo_arm_chipset_series_telechips_tcc] = "TCC", [cpuinfo_arm_chipset_series_texas_instruments_omap] = "OMAP", [cpuinfo_arm_chipset_series_unisoc_t] = "T", + [cpuinfo_arm_chipset_series_unisoc_ums] = "UMS", [cpuinfo_arm_chipset_series_wondermedia_wm] = "WM", }; diff --git a/3rdparty/cpuinfo/src/arm/linux/hwcap.c b/3rdparty/cpuinfo/src/arm/linux/hwcap.c index e836548db190a..7f7b4dfddb46b 100644 --- a/3rdparty/cpuinfo/src/arm/linux/hwcap.c +++ b/3rdparty/cpuinfo/src/arm/linux/hwcap.c @@ -31,8 +31,8 @@ void cpuinfo_set_hwcap(uint32_t hwcap) { mock_hwcap = hwcap; } -static uint32_t mock_hwcap2 = 0; -void cpuinfo_set_hwcap2(uint32_t hwcap2) { +static uint64_t mock_hwcap2 = 0; +void cpuinfo_set_hwcap2(uint64_t hwcap2) { mock_hwcap2 = hwcap2; } #endif @@ -40,7 +40,7 @@ void cpuinfo_set_hwcap2(uint32_t hwcap2) { #if CPUINFO_ARCH_ARM typedef unsigned long (*getauxval_function_t)(unsigned long); -bool cpuinfo_arm_linux_hwcap_from_getauxval(uint32_t hwcap[restrict static 1], uint32_t hwcap2[restrict static 1]) { +bool cpuinfo_arm_linux_hwcap_from_getauxval(uint32_t hwcap[restrict static 1], uint64_t hwcap2[restrict static 1]) { #if CPUINFO_MOCK *hwcap = mock_hwcap; *hwcap2 = mock_hwcap2; @@ -83,13 +83,13 @@ bool cpuinfo_arm_linux_hwcap_from_getauxval(uint32_t hwcap[restrict static 1], u } #ifdef __ANDROID__ -bool cpuinfo_arm_linux_hwcap_from_procfs(uint32_t hwcap[restrict static 1], uint32_t hwcap2[restrict static 1]) { +bool cpuinfo_arm_linux_hwcap_from_procfs(uint32_t hwcap[restrict static 1], uint64_t hwcap2[restrict static 1]) { #if CPUINFO_MOCK *hwcap = mock_hwcap; *hwcap2 = mock_hwcap2; return true; #else - uint32_t hwcaps[2] = {0, 0}; + uint64_t hwcaps[2] = {0, 0}; bool result = false; int file = -1; @@ -113,7 +113,7 @@ bool cpuinfo_arm_linux_hwcap_from_procfs(uint32_t hwcap[restrict static 1], uint hwcaps[0] = (uint32_t)elf_auxv.a_un.a_val; break; case AT_HWCAP2: - hwcaps[1] = (uint32_t)elf_auxv.a_un.a_val; + hwcaps[1] = (uint64_t)elf_auxv.a_un.a_val; break; } } else { @@ -141,13 +141,13 @@ bool cpuinfo_arm_linux_hwcap_from_procfs(uint32_t hwcap[restrict static 1], uint } #endif /* __ANDROID__ */ #elif CPUINFO_ARCH_ARM64 -void cpuinfo_arm_linux_hwcap_from_getauxval(uint32_t hwcap[restrict static 1], uint32_t hwcap2[restrict static 1]) { +void cpuinfo_arm_linux_hwcap_from_getauxval(uint32_t hwcap[restrict static 1], uint64_t hwcap2[restrict static 1]) { #if CPUINFO_MOCK *hwcap = mock_hwcap; *hwcap2 = mock_hwcap2; #else *hwcap = (uint32_t)getauxval(AT_HWCAP); - *hwcap2 = (uint32_t)getauxval(AT_HWCAP2); + *hwcap2 = (uint64_t)getauxval(AT_HWCAP2); return; #endif } diff --git a/3rdparty/cpuinfo/src/arm/linux/init.c b/3rdparty/cpuinfo/src/arm/linux/init.c index 988f05aaa787d..1eab69d5fce92 100644 --- a/3rdparty/cpuinfo/src/arm/linux/init.c +++ b/3rdparty/cpuinfo/src/arm/linux/init.c @@ -247,7 +247,8 @@ void cpuinfo_arm_linux_init(void) { #endif #if CPUINFO_ARCH_ARM - uint32_t isa_features = 0, isa_features2 = 0; + uint32_t isa_features = 0; + uint64_t isa_features2 = 0; #ifdef __ANDROID__ /* * On Android before API 20, libc.so does not provide getauxval @@ -299,7 +300,8 @@ void cpuinfo_arm_linux_init(void) { &chipset, &cpuinfo_isa); #elif CPUINFO_ARCH_ARM64 - uint32_t isa_features = 0, isa_features2 = 0; + uint32_t isa_features = 0; + uint64_t isa_features2 = 0; /* getauxval is always available on ARM64 Android */ cpuinfo_arm_linux_hwcap_from_getauxval(&isa_features, &isa_features2); cpuinfo_arm64_linux_decode_isa_from_proc_cpuinfo( @@ -333,18 +335,52 @@ void cpuinfo_arm_linux_init(void) { } /* Propagate topology group IDs among siblings */ + bool detected_core_siblings_list_node = false; + bool detected_cluster_cpus_list_node = false; for (uint32_t i = 0; i < arm_linux_processors_count; i++) { if (!bitmask_all(arm_linux_processors[i].flags, CPUINFO_LINUX_FLAG_VALID)) { continue; } - if (arm_linux_processors[i].flags & CPUINFO_LINUX_FLAG_PACKAGE_ID) { + if (!bitmask_all(arm_linux_processors[i].flags, CPUINFO_LINUX_FLAG_PACKAGE_ID)) { + continue; + } + + /* Use the cluster_cpus_list topology node if available. If not + * found, cache the result to avoid repeatedly attempting to + * read the non-existent paths. + * */ + if (!detected_core_siblings_list_node && !detected_cluster_cpus_list_node) { + if (cpuinfo_linux_detect_cluster_cpus( + arm_linux_processors_count, + i, + (cpuinfo_siblings_callback)cluster_siblings_parser, + arm_linux_processors)) { + detected_cluster_cpus_list_node = true; + continue; + } else { + detected_core_siblings_list_node = true; + } + } + + /* The cached result above will guarantee only one of the blocks + * below will execute, with a bias towards cluster_cpus_list. + **/ + if (detected_core_siblings_list_node) { cpuinfo_linux_detect_core_siblings( arm_linux_processors_count, i, (cpuinfo_siblings_callback)cluster_siblings_parser, arm_linux_processors); } + + if (detected_cluster_cpus_list_node) { + cpuinfo_linux_detect_cluster_cpus( + arm_linux_processors_count, + i, + (cpuinfo_siblings_callback)cluster_siblings_parser, + arm_linux_processors); + } } /* Propagate all cluster IDs */ diff --git a/3rdparty/cpuinfo/src/arm/mach/init.c b/3rdparty/cpuinfo/src/arm/mach/init.c index 9d83c05a95742..3fb62414e0f4b 100644 --- a/3rdparty/cpuinfo/src/arm/mach/init.c +++ b/3rdparty/cpuinfo/src/arm/mach/init.c @@ -399,6 +399,16 @@ void cpuinfo_arm_mach_init(void) { cpuinfo_isa.i8mm = true; } + const uint32_t has_feat_sme = get_sys_info_by_name("hw.optional.arm.FEAT_SME"); + if (has_feat_sme != 0) { + cpuinfo_isa.sme = true; + } + + const uint32_t has_feat_sme2 = get_sys_info_by_name("hw.optional.arm.FEAT_SME2"); + if (has_feat_sme2 != 0) { + cpuinfo_isa.sme2 = true; + } + uint32_t num_clusters = 1; for (uint32_t i = 0; i < mach_topology.cores; i++) { cores[i] = (struct cpuinfo_core){ diff --git a/3rdparty/cpuinfo/src/freebsd/topology.c b/3rdparty/cpuinfo/src/freebsd/topology.c index da941e9cb243e..675a81f8bb467 100644 --- a/3rdparty/cpuinfo/src/freebsd/topology.c +++ b/3rdparty/cpuinfo/src/freebsd/topology.c @@ -24,8 +24,10 @@ static char* sysctl_str(const char* name) { size_t value_size = 0; if (sysctlbyname(name, NULL, &value_size, NULL, 0) != 0) { cpuinfo_log_error("sysctlbyname(\"%s\") failed: %s", name, strerror(errno)); + return NULL; } else if (value_size <= 0) { cpuinfo_log_error("sysctlbyname(\"%s\") returned invalid value size %zu", name, value_size); + return NULL; } value_size += 1; char* value = calloc(value_size, 1); @@ -52,29 +54,22 @@ struct cpuinfo_freebsd_topology cpuinfo_freebsd_detect_topology(void) { if (!topology_spec) { return topology; } - const char* group_tag = ""; - char* p = strstr(topology_spec, group_tag); - while (p) { - const char* cpu_tag = "cpu count=\""; - char* q = strstr(p, cpu_tag); - if (q) { - p = q + strlen(cpu_tag); - topology.packages += atoi(p); - } else { - break; - } - } - if (topology.packages == 0) { - const char* group_tag = "", " 0) { + break; + } } + if (topology.packages == 0) { - cpuinfo_log_error("failed to parse topology_spec:%s", topology_spec); + cpuinfo_log_error("failed to parse topology_spec: %s", topology_spec); free(topology_spec); goto fail; } @@ -84,6 +79,7 @@ struct cpuinfo_freebsd_topology cpuinfo_freebsd_detect_topology(void) { goto fail; } if (topology.cores < topology.packages) { + cpuinfo_log_error("invalid numbers of package and core: %d %d", topology.packages, topology.cores); goto fail; } topology.threads_per_core = sysctl_int("kern.smp.threads_per_core"); diff --git a/3rdparty/cpuinfo/src/x86/freebsd/init.c b/3rdparty/cpuinfo/src/x86/freebsd/init.c index c6c6d7533be9a..797fa24b95a55 100644 --- a/3rdparty/cpuinfo/src/x86/freebsd/init.c +++ b/3rdparty/cpuinfo/src/x86/freebsd/init.c @@ -135,6 +135,10 @@ void cpuinfo_x86_freebsd_init(void) { if (x86_processor.cache.l1i.size != 0 || x86_processor.cache.l1d.size != 0) { /* Assume that threads on the same core share L1 */ threads_per_l1 = freebsd_topology.threads / freebsd_topology.cores; + if (threads_per_l1 == 0) { + cpuinfo_log_error("failed to detect threads_per_l1"); + goto cleanup; + } cpuinfo_log_warning( "freebsd kernel did not report number of " "threads sharing L1 cache; assume %" PRIu32, @@ -154,6 +158,10 @@ void cpuinfo_x86_freebsd_init(void) { * the same package share L2 */ threads_per_l2 = freebsd_topology.threads / freebsd_topology.packages; } + if (threads_per_l2 == 0) { + cpuinfo_log_error("failed to detect threads_per_l1"); + goto cleanup; + } cpuinfo_log_warning( "freebsd kernel did not report number of " "threads sharing L2 cache; assume %" PRIu32, @@ -170,6 +178,10 @@ void cpuinfo_x86_freebsd_init(void) { * may be L4 cache as well) */ threads_per_l3 = freebsd_topology.threads / freebsd_topology.packages; + if (threads_per_l3 == 0) { + cpuinfo_log_error("failed to detect threads_per_l3"); + goto cleanup; + } cpuinfo_log_warning( "freebsd kernel did not report number of " "threads sharing L3 cache; assume %" PRIu32, @@ -187,6 +199,10 @@ void cpuinfo_x86_freebsd_init(void) { * shared L4 (like on IBM POWER8). */ threads_per_l4 = freebsd_topology.threads; + if (threads_per_l4 == 0) { + cpuinfo_log_error("failed to detect threads_per_l4"); + goto cleanup; + } cpuinfo_log_warning( "freebsd kernel did not report number of " "threads sharing L4 cache; assume %" PRIu32, @@ -203,7 +219,7 @@ void cpuinfo_x86_freebsd_init(void) { "%" PRIu32 " L1I caches", l1_count * sizeof(struct cpuinfo_cache), l1_count); - return; + goto cleanup; } for (uint32_t c = 0; c < l1_count; c++) { l1i[c] = (struct cpuinfo_cache){ @@ -230,7 +246,7 @@ void cpuinfo_x86_freebsd_init(void) { "%" PRIu32 " L1D caches", l1_count * sizeof(struct cpuinfo_cache), l1_count); - return; + goto cleanup; } for (uint32_t c = 0; c < l1_count; c++) { l1d[c] = (struct cpuinfo_cache){ @@ -257,7 +273,7 @@ void cpuinfo_x86_freebsd_init(void) { "%" PRIu32 " L2 caches", l2_count * sizeof(struct cpuinfo_cache), l2_count); - return; + goto cleanup; } for (uint32_t c = 0; c < l2_count; c++) { l2[c] = (struct cpuinfo_cache){ @@ -284,7 +300,7 @@ void cpuinfo_x86_freebsd_init(void) { "%" PRIu32 " L3 caches", l3_count * sizeof(struct cpuinfo_cache), l3_count); - return; + goto cleanup; } for (uint32_t c = 0; c < l3_count; c++) { l3[c] = (struct cpuinfo_cache){ @@ -311,7 +327,7 @@ void cpuinfo_x86_freebsd_init(void) { "%" PRIu32 " L4 caches", l4_count * sizeof(struct cpuinfo_cache), l4_count); - return; + goto cleanup; } for (uint32_t c = 0; c < l4_count; c++) { l4[c] = (struct cpuinfo_cache){ diff --git a/3rdparty/lzma/include/7zCrc.h b/3rdparty/lzma/include/7zCrc.h index 4afaeae4a8450..3e6d408b9287f 100644 --- a/3rdparty/lzma/include/7zCrc.h +++ b/3rdparty/lzma/include/7zCrc.h @@ -1,5 +1,5 @@ /* 7zCrc.h -- CRC32 calculation -2023-04-02 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_7Z_CRC_H #define ZIP7_INC_7Z_CRC_H @@ -20,7 +20,8 @@ void Z7_FASTCALL CrcGenerateTable(void); UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size); UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size); -typedef UInt32 (Z7_FASTCALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table); +typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size); +Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo); EXTERN_C_END diff --git a/3rdparty/lzma/include/7zTypes.h b/3rdparty/lzma/include/7zTypes.h index 1fcb2473e18b0..5b77420a3b2fb 100644 --- a/3rdparty/lzma/include/7zTypes.h +++ b/3rdparty/lzma/include/7zTypes.h @@ -1,5 +1,5 @@ /* 7zTypes.h -- Basic types -2023-04-02 : Igor Pavlov : Public domain */ +2024-01-24 : Igor Pavlov : Public domain */ #ifndef ZIP7_7Z_TYPES_H #define ZIP7_7Z_TYPES_H @@ -530,20 +530,20 @@ struct ISzAlloc #define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m) */ #if defined (__clang__) || defined(__GNUC__) -#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \ +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") -#define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL \ +#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \ _Pragma("GCC diagnostic pop") #else -#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL -#define Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL +#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL #endif #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \ - Z7_DIAGNOSCTIC_IGNORE_BEGIN_CAST_QUAL \ + Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \ type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \ - Z7_DIAGNOSCTIC_IGNORE_END_CAST_QUAL + Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \ Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p) diff --git a/3rdparty/lzma/include/7zVersion.h b/3rdparty/lzma/include/7zVersion.h index 7549239614e52..1ddef80b08d21 100644 --- a/3rdparty/lzma/include/7zVersion.h +++ b/3rdparty/lzma/include/7zVersion.h @@ -1,7 +1,7 @@ -#define MY_VER_MAJOR 23 -#define MY_VER_MINOR 01 +#define MY_VER_MAJOR 24 +#define MY_VER_MINOR 8 #define MY_VER_BUILD 0 -#define MY_VERSION_NUMBERS "23.01" +#define MY_VERSION_NUMBERS "24.08" #define MY_VERSION MY_VERSION_NUMBERS #ifdef MY_CPU_NAME @@ -10,12 +10,12 @@ #define MY_VERSION_CPU MY_VERSION #endif -#define MY_DATE "2023-06-20" +#define MY_DATE "2024-08-11" #undef MY_COPYRIGHT #undef MY_VERSION_COPYRIGHT_DATE #define MY_AUTHOR_NAME "Igor Pavlov" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" -#define MY_COPYRIGHT_CR "Copyright (c) 1999-2023 Igor Pavlov" +#define MY_COPYRIGHT_CR "Copyright (c) 1999-2024 Igor Pavlov" #ifdef USE_COPYRIGHT_CR #define MY_COPYRIGHT MY_COPYRIGHT_CR diff --git a/3rdparty/lzma/include/Alloc.h b/3rdparty/lzma/include/Alloc.h index fac5b62fb9f5a..01bf6b7dd6b1c 100644 --- a/3rdparty/lzma/include/Alloc.h +++ b/3rdparty/lzma/include/Alloc.h @@ -1,5 +1,5 @@ /* Alloc.h -- Memory allocation functions -2023-03-04 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_ALLOC_H #define ZIP7_INC_ALLOC_H @@ -22,6 +22,9 @@ void *MyAlloc(size_t size); void MyFree(void *address); void *MyRealloc(void *address, size_t size); +void *z7_AlignedAlloc(size_t size); +void z7_AlignedFree(void *p); + #ifdef _WIN32 #ifdef Z7_LARGE_PAGES @@ -33,12 +36,14 @@ void MidFree(void *address); void *BigAlloc(size_t size); void BigFree(void *address); +/* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */ + #else -#define MidAlloc(size) MyAlloc(size) -#define MidFree(address) MyFree(address) -#define BigAlloc(size) MyAlloc(size) -#define BigFree(address) MyFree(address) +#define MidAlloc(size) z7_AlignedAlloc(size) +#define MidFree(address) z7_AlignedFree(address) +#define BigAlloc(size) z7_AlignedAlloc(size) +#define BigFree(address) z7_AlignedFree(address) #endif diff --git a/3rdparty/lzma/include/Bra.h b/3rdparty/lzma/include/Bra.h index a4ee568e215a0..b47112cedc3db 100644 --- a/3rdparty/lzma/include/Bra.h +++ b/3rdparty/lzma/include/Bra.h @@ -1,5 +1,5 @@ /* Bra.h -- Branch converters for executables -2023-04-02 : Igor Pavlov : Public domain */ +2024-01-20 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_BRA_H #define ZIP7_INC_BRA_H @@ -8,8 +8,12 @@ EXTERN_C_BEGIN -#define Z7_BRANCH_CONV_DEC(name) z7_BranchConv_ ## name ## _Dec -#define Z7_BRANCH_CONV_ENC(name) z7_BranchConv_ ## name ## _Enc +/* #define PPC BAD_PPC_11 // for debug */ + +#define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec +#define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc +#define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name) +#define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name) #define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec #define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc @@ -20,19 +24,20 @@ typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv)); typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt)); #define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0 -Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_DEC(X86)); -Z7_BRANCH_CONV_ST_DECL(Z7_BRANCH_CONV_ST_ENC(X86)); +Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86)); +Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86)); #define Z7_BRANCH_FUNCS_DECL(name) \ -Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_DEC(name)); \ -Z7_BRANCH_CONV_DECL(Z7_BRANCH_CONV_ENC(name)); +Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \ +Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name)); -Z7_BRANCH_FUNCS_DECL(ARM64) -Z7_BRANCH_FUNCS_DECL(ARM) -Z7_BRANCH_FUNCS_DECL(ARMT) -Z7_BRANCH_FUNCS_DECL(PPC) -Z7_BRANCH_FUNCS_DECL(SPARC) -Z7_BRANCH_FUNCS_DECL(IA64) +Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64) +Z7_BRANCH_FUNCS_DECL (BranchConv_ARM) +Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT) +Z7_BRANCH_FUNCS_DECL (BranchConv_PPC) +Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC) +Z7_BRANCH_FUNCS_DECL (BranchConv_IA64) +Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV) /* These functions convert data that contain CPU instructions. @@ -49,14 +54,14 @@ and one for decoding (_Enc/_Dec postfixes in function name). In params: data : data buffer size : size of data - pc : current virtual Program Counter (Instruction Pinter) value + pc : current virtual Program Counter (Instruction Pointer) value In/Out param: state : pointer to state variable (for X86 converter only) Return: The pointer to position in (data) buffer after last byte that was processed. If the caller calls converter again, it must call it starting with that position. - But the caller is allowed to move data in buffer. so pointer to + But the caller is allowed to move data in buffer. So pointer to current processed position also will be changed for next call. Also the caller must increase internal (pc) value for next call. @@ -65,6 +70,7 @@ Each converter has some characteristics: Endian, Alignment, LookAhead. X86 little 1 4 ARMT little 2 2 + RISCV little 2 6 ARM little 4 0 ARM64 little 4 0 PPC big 4 0 diff --git a/3rdparty/lzma/include/Compiler.h b/3rdparty/lzma/include/Compiler.h index 185a52deb5049..2a9c2b7a0896c 100644 --- a/3rdparty/lzma/include/Compiler.h +++ b/3rdparty/lzma/include/Compiler.h @@ -1,5 +1,5 @@ /* Compiler.h : Compiler specific defines and pragmas -2023-04-02 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_COMPILER_H #define ZIP7_INC_COMPILER_H @@ -25,11 +25,79 @@ #define Z7_MINGW #endif +#if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__)) +#define Z7_MCST_LCC +#define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__) +#endif + +/* +#if defined(__AVX2__) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \ + || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \ + || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400) + #define Z7_COMPILER_AVX2_SUPPORTED + #endif +#endif +*/ + // #pragma GCC diagnostic ignored "-Wunknown-pragmas" #ifdef __clang__ // padding size of '' with 4 bytes to alignment boundary #pragma GCC diagnostic ignored "-Wpadded" + +#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \ + && defined(__FreeBSD__) +// freebsd: +#pragma GCC diagnostic ignored "-Wexcess-padding" +#endif + +#if __clang_major__ >= 16 +#pragma GCC diagnostic ignored "-Wunsafe-buffer-usage" +#endif + +#if __clang_major__ == 13 +#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) +// cheri +#pragma GCC diagnostic ignored "-Wcapability-to-integer-cast" +#endif +#endif + +#if __clang_major__ == 13 + // for + #pragma GCC diagnostic ignored "-Wreserved-identifier" +#endif + +#endif // __clang__ + +#if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16 +// #pragma GCC diagnostic ignored "-Wcast-function-type-strict" +#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \ + _Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"") +#else +#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION +#endif + +typedef void (*Z7_void_Function)(void); +#if defined(__clang__) || defined(__GNUC__) +#define Z7_CAST_FUNC_C (Z7_void_Function) +#elif defined(_MSC_VER) && _MSC_VER > 1920 +#define Z7_CAST_FUNC_C (void *) +// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' +#else +#define Z7_CAST_FUNC_C +#endif +/* +#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) + // #pragma GCC diagnostic ignored "-Wcast-function-type" +#endif +*/ +#ifdef __GNUC__ +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000) +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif #endif @@ -101,7 +169,8 @@ _Pragma("clang loop unroll(disable)") \ _Pragma("clang loop vectorize(disable)") #define Z7_ATTRIB_NO_VECTORIZE -#elif defined(__GNUC__) && (__GNUC__ >= 5) +#elif defined(__GNUC__) && (__GNUC__ >= 5) \ + && (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610)) #define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) // __attribute__((optimize("no-unroll-loops"))); #define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE @@ -142,15 +211,23 @@ #endif -#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 36000)) -#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ +#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600)) + +#if (Z7_CLANG_VERSION < 130000) +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"") +#else +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"") -#define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ +#endif + +#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \ _Pragma("GCC diagnostic pop") #else -#define Z7_DIAGNOSCTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER -#define Z7_DIAGNOSCTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER #endif #define UNUSED_VAR(x) (void)x; diff --git a/3rdparty/lzma/include/CpuArch.h b/3rdparty/lzma/include/CpuArch.h index 8e5d8a543f878..683cfaa86279d 100644 --- a/3rdparty/lzma/include/CpuArch.h +++ b/3rdparty/lzma/include/CpuArch.h @@ -1,5 +1,5 @@ /* CpuArch.h -- CPU specific code -2023-04-02 : Igor Pavlov : Public domain */ +2024-06-17 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_CPU_ARCH_H #define ZIP7_INC_CPU_ARCH_H @@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) */ +#if !defined(_M_ARM64EC) #if defined(_M_X64) \ || defined(_M_AMD64) \ || defined(__x86_64__) \ @@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #endif #define MY_CPU_64BIT #endif +#endif #if defined(_M_IX86) \ @@ -47,17 +49,26 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #if defined(_M_ARM64) \ + || defined(_M_ARM64EC) \ || defined(__AARCH64EL__) \ || defined(__AARCH64EB__) \ || defined(__aarch64__) #define MY_CPU_ARM64 - #ifdef __ILP32__ +#if defined(__ILP32__) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) #define MY_CPU_NAME "arm64-32" #define MY_CPU_SIZEOF_POINTER 4 - #else +#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) + #define MY_CPU_NAME "arm64-128" + #define MY_CPU_SIZEOF_POINTER 16 +#else +#if defined(_M_ARM64EC) + #define MY_CPU_NAME "arm64ec" +#else #define MY_CPU_NAME "arm64" +#endif #define MY_CPU_SIZEOF_POINTER 8 - #endif +#endif #define MY_CPU_64BIT #endif @@ -133,8 +144,36 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #endif +#if defined(__sparc__) \ + || defined(__sparc) + #define MY_CPU_SPARC + #if defined(__LP64__) \ + || defined(_LP64) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) + #define MY_CPU_NAME "sparcv9" + #define MY_CPU_SIZEOF_POINTER 8 + #define MY_CPU_64BIT + #elif defined(__sparc_v9__) \ + || defined(__sparcv9) + #define MY_CPU_64BIT + #if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "sparcv9-32" + #else + #define MY_CPU_NAME "sparcv9m" + #endif + #elif defined(__sparc_v8__) \ + || defined(__sparcv8) + #define MY_CPU_NAME "sparcv8" + #define MY_CPU_SIZEOF_POINTER 4 + #else + #define MY_CPU_NAME "sparc" + #endif +#endif + + #if defined(__riscv) \ || defined(__riscv__) + #define MY_CPU_RISCV #if __riscv_xlen == 32 #define MY_CPU_NAME "riscv32" #elif __riscv_xlen == 64 @@ -145,6 +184,39 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #endif +#if defined(__loongarch__) + #define MY_CPU_LOONGARCH + #if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64) + #define MY_CPU_64BIT + #endif + #if defined(__loongarch64) + #define MY_CPU_NAME "loongarch64" + #define MY_CPU_LOONGARCH64 + #else + #define MY_CPU_NAME "loongarch" + #endif +#endif + + +// #undef MY_CPU_NAME +// #undef MY_CPU_SIZEOF_POINTER +// #define __e2k__ +// #define __SIZEOF_POINTER__ 4 +#if defined(__e2k__) + #define MY_CPU_E2K + #if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4) + #define MY_CPU_NAME "e2k-32" + #define MY_CPU_SIZEOF_POINTER 4 + #else + #define MY_CPU_NAME "e2k" + #if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8) + #define MY_CPU_SIZEOF_POINTER 8 + #endif + #endif + #define MY_CPU_64BIT +#endif + + #if defined(MY_CPU_X86) || defined(MY_CPU_AMD64) #define MY_CPU_X86_OR_AMD64 #endif @@ -175,6 +247,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. || defined(MY_CPU_ARM_LE) \ || defined(MY_CPU_ARM64_LE) \ || defined(MY_CPU_IA64_LE) \ + || defined(_LITTLE_ENDIAN) \ || defined(__LITTLE_ENDIAN__) \ || defined(__ARMEL__) \ || defined(__THUMBEL__) \ @@ -251,6 +324,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #ifndef MY_CPU_NAME + // #define MY_CPU_IS_UNKNOWN #ifdef MY_CPU_LE #define MY_CPU_NAME "LE" #elif defined(MY_CPU_BE) @@ -295,9 +369,19 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #define Z7_BSWAP64(v) _byteswap_uint64(v) #define Z7_CPU_FAST_BSWAP_SUPPORTED -#elif (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ - || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) - +/* GCC can generate slow code that calls function for __builtin_bswap32() for: + - GCC for RISCV, if Zbb/XTHeadBb extension is not used. + - GCC for SPARC. + The code from CLANG for SPARC also is not fastest. + So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases. +*/ +#elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb) || defined(__riscv_xtheadbb)) \ + && !defined(MY_CPU_SPARC) \ + && ( \ + (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \ + || (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \ + ) + #define Z7_BSWAP16(v) __builtin_bswap16(v) #define Z7_BSWAP32(v) __builtin_bswap32(v) #define Z7_BSWAP64(v) __builtin_bswap64(v) @@ -329,13 +413,48 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #ifdef MY_CPU_LE #if defined(MY_CPU_X86_OR_AMD64) \ - || defined(MY_CPU_ARM64) + || defined(MY_CPU_ARM64) \ + || defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \ + || defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6) #define MY_CPU_LE_UNALIGN #define MY_CPU_LE_UNALIGN_64 #elif defined(__ARM_FEATURE_UNALIGNED) - /* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. - So we can't use unaligned 64-bit operations. */ - #define MY_CPU_LE_UNALIGN +/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions. + Description of problems: +problem-1 : 32-bit ARM architecture: + multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM) + require 32-bit (WORD) alignment (by 32-bit ARM architecture). + So there is "Alignment fault exception", if data is not aligned for 32-bit. + +problem-2 : 32-bit kernels and arm64 kernels: + 32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception". + So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux. + + But some arm64 kernels do not handle these faults in 32-bit programs. + So we have unhandled exception for such instructions. + Probably some new arm64 kernels have fixed it, and unaligned + paired-access instructions work in new kernels? + +problem-3 : compiler for 32-bit arm: + Compilers use LDRD/STRD/LDM/STM for UInt64 accesses + and for another cases where two 32-bit accesses are fused + to one multi-access instruction. + So UInt64 variables must be aligned for 32-bit, and each + 32-bit access must be aligned for 32-bit, if we want to + avoid "Alignment fault" exception (handled or unhandled). + +problem-4 : performace: + Even if unaligned access is handled by kernel, it will be slow. + So if we allow unaligned access, we can get fast unaligned + single-access, and slow unaligned paired-access. + + We don't allow unaligned access on 32-bit arm, because compiler + genarates paired-access instructions that require 32-bit alignment, + and some arm64 kernels have no handler for these instructions. + Also unaligned paired-access instructions will be slow, if kernel handles them. +*/ + // it must be disabled: + // #define MY_CPU_LE_UNALIGN #endif #endif @@ -439,11 +558,13 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #if defined(MY_CPU_BE) +#define GetBe64a(p) (*(const UInt64 *)(const void *)(p)) #define GetBe32a(p) (*(const UInt32 *)(const void *)(p)) #define GetBe16a(p) (*(const UInt16 *)(const void *)(p)) #define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); } #define SetBe16a(p, v) { *(UInt16 *)(void *)(p) = (v); } +#define GetUi64a(p) GetUi64(p) #define GetUi32a(p) GetUi32(p) #define GetUi16a(p) GetUi16(p) #define SetUi32a(p, v) SetUi32(p, v) @@ -451,11 +572,13 @@ MY_CPU_64BIT means that processor can work with 64-bit registers. #elif defined(MY_CPU_LE) +#define GetUi64a(p) (*(const UInt64 *)(const void *)(p)) #define GetUi32a(p) (*(const UInt32 *)(const void *)(p)) #define GetUi16a(p) (*(const UInt16 *)(const void *)(p)) #define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); } #define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); } +#define GetBe64a(p) GetBe64(p) #define GetBe32a(p) GetBe32(p) #define GetBe16a(p) GetBe16(p) #define SetBe32a(p, v) SetBe32(p, v) @@ -486,6 +609,7 @@ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void); BoolInt CPU_IsSupported_AES(void); BoolInt CPU_IsSupported_AVX(void); BoolInt CPU_IsSupported_AVX2(void); +BoolInt CPU_IsSupported_AVX512F_AVX512VL(void); BoolInt CPU_IsSupported_VAES_AVX2(void); BoolInt CPU_IsSupported_CMOV(void); BoolInt CPU_IsSupported_SSE(void); diff --git a/3rdparty/lzma/include/LzFind.h b/3rdparty/lzma/include/LzFind.h index a3f72c9870018..67e8a6e0286ee 100644 --- a/3rdparty/lzma/include/LzFind.h +++ b/3rdparty/lzma/include/LzFind.h @@ -1,5 +1,5 @@ /* LzFind.h -- Match finder for LZ algorithms -2023-03-04 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_LZ_FIND_H #define ZIP7_INC_LZ_FIND_H @@ -144,7 +144,8 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable); void MatchFinder_Init_LowHash(CMatchFinder *p); void MatchFinder_Init_HighHash(CMatchFinder *p); void MatchFinder_Init_4(CMatchFinder *p); -void MatchFinder_Init(CMatchFinder *p); +// void MatchFinder_Init(CMatchFinder *p); +void MatchFinder_Init(void *p); UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); diff --git a/3rdparty/lzma/include/LzFindMt.h b/3rdparty/lzma/include/LzFindMt.h index db5923ea05b73..fcb479da9e555 100644 --- a/3rdparty/lzma/include/LzFindMt.h +++ b/3rdparty/lzma/include/LzFindMt.h @@ -1,5 +1,5 @@ /* LzFindMt.h -- multithreaded Match finder for LZ algorithms -2023-03-05 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_LZ_FIND_MT_H #define ZIP7_INC_LZ_FIND_MT_H @@ -31,7 +31,10 @@ typedef struct // UInt32 numBlocks_Sent; } CMtSync; -typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances); + +struct CMatchFinderMt_; + +typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances); /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ #define kMtCacheLineDummy 128 @@ -39,7 +42,7 @@ typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distance typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); -typedef struct +typedef struct CMatchFinderMt_ { /* LZ */ const Byte *pointerToCurPos; diff --git a/3rdparty/lzma/include/Precomp.h b/3rdparty/lzma/include/Precomp.h index 69afb2ffd100a..7747fdd74c64b 100644 --- a/3rdparty/lzma/include/Precomp.h +++ b/3rdparty/lzma/include/Precomp.h @@ -1,10 +1,127 @@ -/* Precomp.h -- StdAfx -2023-04-02 : Igor Pavlov : Public domain */ +/* Precomp.h -- precompilation file +2024-01-25 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_PRECOMP_H #define ZIP7_INC_PRECOMP_H +/* + this file must be included before another *.h files and before . + this file is included from the following files: + C\*.c + C\Util\*\Precomp.h <- C\Util\*\*.c + CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp + + this file can set the following macros: + Z7_LARGE_PAGES 1 + Z7_LONG_PATH 1 + Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip + _WIN32_WINNT 0x0500 (or higher) + WINVER _WIN32_WINNT + UNICODE 1 + _UNICODE 1 +*/ + #include "Compiler.h" -/* #include "7zTypes.h" */ + +#ifdef _MSC_VER +// #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty +#if _MSC_VER >= 1912 +// #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception. +#endif +#endif + +/* +// for debug: +#define UNICODE 1 +#define _UNICODE 1 +#define _WIN32_WINNT 0x0500 // win2000 +#ifndef WINVER + #define WINVER _WIN32_WINNT +#endif +*/ + +#ifdef _WIN32 +/* + this "Precomp.h" file must be included before , + if we want to define _WIN32_WINNT before . +*/ + +#ifndef Z7_LARGE_PAGES +#ifndef Z7_NO_LARGE_PAGES +#define Z7_LARGE_PAGES 1 +#endif +#endif + +#ifndef Z7_LONG_PATH +#ifndef Z7_NO_LONG_PATH +#define Z7_LONG_PATH 1 +#endif +#endif + +#ifndef Z7_DEVICE_FILE +#ifndef Z7_NO_DEVICE_FILE +// #define Z7_DEVICE_FILE 1 +#endif +#endif + +// we don't change macros if included after +#ifndef _WINDOWS_ + +#ifndef Z7_WIN32_WINNT_MIN + #if defined(_M_ARM64) || defined(__aarch64__) + // #define Z7_WIN32_WINNT_MIN 0x0a00 // win10 + #define Z7_WIN32_WINNT_MIN 0x0600 // vista + #elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT) + // #define Z7_WIN32_WINNT_MIN 0x0602 // win8 + #define Z7_WIN32_WINNT_MIN 0x0600 // vista + #elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64) + #define Z7_WIN32_WINNT_MIN 0x0503 // win2003 + // #elif defined(_M_IX86) || defined(__i386__) + // #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 + #else // x86 and another(old) systems + #define Z7_WIN32_WINNT_MIN 0x0500 // win2000 + // #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug + #endif +#endif // Z7_WIN32_WINNT_MIN + + +#ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT +#ifdef _WIN32_WINNT + // #error Stop_Compiling_Bad_WIN32_WINNT +#else + #ifndef Z7_NO_DEFINE_WIN32_WINNT +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define _WIN32_WINNT Z7_WIN32_WINNT_MIN +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER + #endif +#endif // _WIN32_WINNT + +#ifndef WINVER + #define WINVER _WIN32_WINNT +#endif +#endif // Z7_DO_NOT_DEFINE_WIN32_WINNT + + +#ifndef _MBCS +#ifndef Z7_NO_UNICODE +// UNICODE and _UNICODE are used by and by 7-zip code. + +#ifndef UNICODE +#define UNICODE 1 +#endif + +#ifndef _UNICODE +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#define _UNICODE 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // Z7_NO_UNICODE +#endif // _MBCS +#endif // _WINDOWS_ + +// #include "7zWindows.h" + +#endif // _WIN32 #endif diff --git a/3rdparty/lzma/include/Threads.h b/3rdparty/lzma/include/Threads.h index 4028464a337c1..c1484a277378b 100644 --- a/3rdparty/lzma/include/Threads.h +++ b/3rdparty/lzma/include/Threads.h @@ -1,5 +1,5 @@ /* Threads.h -- multithreading library -2023-04-02 : Igor Pavlov : Public domain */ +2024-03-28 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_THREADS_H #define ZIP7_INC_THREADS_H @@ -9,12 +9,21 @@ #else +#include "Compiler.h" + +// #define Z7_AFFINITY_DISABLE #if defined(__linux__) #if !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) #ifndef Z7_AFFINITY_DISABLE #define Z7_AFFINITY_SUPPORTED // #pragma message(" ==== Z7_AFFINITY_SUPPORTED") -// #define _GNU_SOURCE +#if !defined(_GNU_SOURCE) +// #pragma message(" ==== _GNU_SOURCE set") +// we need _GNU_SOURCE for cpu_set_t, if we compile for MUSL +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#define _GNU_SOURCE +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif #endif #endif #endif @@ -173,7 +182,7 @@ WRes CriticalSection_Init(CCriticalSection *p); #else // _WIN32 -typedef struct _CEvent +typedef struct { int _created; int _manual_reset; @@ -199,7 +208,7 @@ WRes Event_Wait(CEvent *p); WRes Event_Close(CEvent *p); -typedef struct _CSemaphore +typedef struct { int _created; UInt32 _count; @@ -219,7 +228,7 @@ WRes Semaphore_Wait(CSemaphore *p); WRes Semaphore_Close(CSemaphore *p); -typedef struct _CCriticalSection +typedef struct { pthread_mutex_t _mutex; } CCriticalSection; @@ -230,6 +239,7 @@ void CriticalSection_Enter(CCriticalSection *cs); void CriticalSection_Leave(CCriticalSection *cs); LONG InterlockedIncrement(LONG volatile *addend); +LONG InterlockedDecrement(LONG volatile *addend); #endif // _WIN32 diff --git a/3rdparty/lzma/include/Xz.h b/3rdparty/lzma/include/Xz.h index d5001f6cace8c..42bc685341a51 100644 --- a/3rdparty/lzma/include/Xz.h +++ b/3rdparty/lzma/include/Xz.h @@ -1,5 +1,5 @@ /* Xz.h - Xz interface -2023-04-13 : Igor Pavlov : Public domain */ +2024-01-26 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_XZ_H #define ZIP7_INC_XZ_H @@ -18,6 +18,7 @@ EXTERN_C_BEGIN #define XZ_ID_ARMT 8 #define XZ_ID_SPARC 9 #define XZ_ID_ARM64 0xa +#define XZ_ID_RISCV 0xb #define XZ_ID_LZMA2 0x21 unsigned Xz_ReadVarInt(const Byte *p, size_t maxSize, UInt64 *value); @@ -233,13 +234,13 @@ typedef enum typedef struct { EXzState state; - UInt32 pos; + unsigned pos; unsigned alignPos; unsigned indexPreSize; CXzStreamFlags streamFlags; - UInt32 blockHeaderSize; + unsigned blockHeaderSize; UInt64 packSize; UInt64 unpackSize; diff --git a/3rdparty/lzma/include/XzCrc64.h b/3rdparty/lzma/include/XzCrc64.h index ca46869a644a8..04f8153df4ac3 100644 --- a/3rdparty/lzma/include/XzCrc64.h +++ b/3rdparty/lzma/include/XzCrc64.h @@ -1,5 +1,5 @@ /* XzCrc64.h -- CRC64 calculation -2023-04-02 : Igor Pavlov : Public domain */ +2023-12-08 : Igor Pavlov : Public domain */ #ifndef ZIP7_INC_XZ_CRC64_H #define ZIP7_INC_XZ_CRC64_H @@ -10,16 +10,16 @@ EXTERN_C_BEGIN -extern UInt64 g_Crc64Table[]; +// extern UInt64 g_Crc64Table[]; void Z7_FASTCALL Crc64GenerateTable(void); #define CRC64_INIT_VAL UINT64_CONST(0xFFFFFFFFFFFFFFFF) #define CRC64_GET_DIGEST(crc) ((crc) ^ CRC64_INIT_VAL) -#define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +// #define CRC64_UPDATE_BYTE(crc, b) (g_Crc64Table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) UInt64 Z7_FASTCALL Crc64Update(UInt64 crc, const void *data, size_t size); -UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); +// UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size); EXTERN_C_END diff --git a/3rdparty/lzma/src/7zArcIn.c b/3rdparty/lzma/src/7zArcIn.c index 43fa7c21454aa..23f294992235e 100644 --- a/3rdparty/lzma/src/7zArcIn.c +++ b/3rdparty/lzma/src/7zArcIn.c @@ -1,5 +1,5 @@ /* 7zArcIn.c -- 7z Input functions -2023-05-11 : Igor Pavlov : Public domain */ +2023-09-07 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -301,7 +301,7 @@ static SRes RememberBitVector(CSzData *sd, UInt32 numItems, const Byte **v) static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) { - Byte b = 0; + unsigned b = 0; unsigned m = 0; UInt32 sum = 0; for (; numItems != 0; numItems--) @@ -312,7 +312,7 @@ static UInt32 CountDefinedBits(const Byte *bits, UInt32 numItems) m = 8; } m--; - sum += ((b >> m) & 1); + sum += (UInt32)((b >> m) & 1); } return sum; } diff --git a/3rdparty/lzma/src/7zCrc.c b/3rdparty/lzma/src/7zCrc.c index ccfd01f9e5932..6e2db9eab192b 100644 --- a/3rdparty/lzma/src/7zCrc.c +++ b/3rdparty/lzma/src/7zCrc.c @@ -1,93 +1,96 @@ /* 7zCrc.c -- CRC32 calculation and init -2023-04-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "7zCrc.h" #include "CpuArch.h" -#define kCrcPoly 0xEDB88320 +// for debug: +// #define __ARM_FEATURE_CRC32 1 -#ifdef MY_CPU_LE - #define CRC_NUM_TABLES 8 -#else - #define CRC_NUM_TABLES 9 +#ifdef __ARM_FEATURE_CRC32 +// #pragma message("__ARM_FEATURE_CRC32") +#define Z7_CRC_HW_FORCE +#endif - UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table); - UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table); +// #define Z7_CRC_DEBUG_BE +#ifdef Z7_CRC_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE #endif -#ifndef MY_CPU_BE - UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); - UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); +#ifdef Z7_CRC_HW_FORCE + #define Z7_CRC_NUM_TABLES_USE 1 +#else +#ifdef Z7_CRC_NUM_TABLES + #define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES +#else + #define Z7_CRC_NUM_TABLES_USE 12 +#endif #endif -/* -extern -CRC_FUNC g_CrcUpdateT4; -CRC_FUNC g_CrcUpdateT4; -*/ -extern -CRC_FUNC g_CrcUpdateT8; -CRC_FUNC g_CrcUpdateT8; -extern -CRC_FUNC g_CrcUpdateT0_32; -CRC_FUNC g_CrcUpdateT0_32; -extern -CRC_FUNC g_CrcUpdateT0_64; -CRC_FUNC g_CrcUpdateT0_64; -extern -CRC_FUNC g_CrcUpdate; -CRC_FUNC g_CrcUpdate; - -UInt32 g_CrcTable[256 * CRC_NUM_TABLES]; - -UInt32 Z7_FASTCALL CrcUpdate(UInt32 v, const void *data, size_t size) -{ - return g_CrcUpdate(v, data, size, g_CrcTable); -} +#if Z7_CRC_NUM_TABLES_USE < 1 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif -UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) -{ - return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL; -} +#if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1) + #define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE +#else + #define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1) +#endif -#if CRC_NUM_TABLES < 4 \ - || (CRC_NUM_TABLES == 4 && defined(MY_CPU_BE)) \ +#ifndef Z7_CRC_HW_FORCE + +#if Z7_CRC_NUM_TABLES_USE == 1 \ || (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) -#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table); -UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table) +#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1 +static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size) { + const UInt32 *table = g_CrcTable; const Byte *p = (const Byte *)data; - const Byte *pEnd = p + size; - for (; p != pEnd; p++) + const Byte *lim = p + size; + for (; p != lim; p++) v = CRC_UPDATE_BYTE_2(v, *p); return v; } #endif + +#if Z7_CRC_NUM_TABLES_USE != 1 +#ifndef MY_CPU_BE + #define FUNC_NAME_LE_2(s) CrcUpdateT ## s + #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) + #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE) + UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif +#ifndef MY_CPU_LE + #define FUNC_NAME_BE_2(s) CrcUpdateT1_BeT ## s + #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) + #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE) + UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table); +#endif +#endif + +#endif // Z7_CRC_HW_FORCE + /* ---------- hardware CRC ---------- */ #ifdef MY_CPU_LE #if defined(MY_CPU_ARM_OR_ARM64) - // #pragma message("ARM*") - #if defined(_MSC_VER) && !defined(__clang__) - #if defined(MY_CPU_ARM64) - #if (_MSC_VER >= 1910) - #ifndef __clang__ - #define USE_ARM64_CRC - #include - #endif - #endif - #endif - #elif (defined(__clang__) && (__clang_major__ >= 3)) \ - || (defined(__GNUC__) && (__GNUC__ > 4)) + #if (defined(__clang__) && (__clang_major__ >= 3)) \ + || defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \ + || defined(__GNUC__) && (__GNUC__ >= 8) #if !defined(__ARM_FEATURE_CRC32) +// #pragma message("!defined(__ARM_FEATURE_CRC32)") +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER #define __ARM_FEATURE_CRC32 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRC32_WAS_SET #if defined(__clang__) #if defined(MY_CPU_ARM64) #define ATTRIB_CRC __attribute__((__target__("crc"))) @@ -96,100 +99,120 @@ UInt32 Z7_FASTCALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UI #endif #else #if defined(MY_CPU_ARM64) +#if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000) #define ATTRIB_CRC __attribute__((__target__("+crc"))) +#endif #else +#if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8) +#if defined(__ARM_FP) && __GNUC__ >= 8 +// for -mfloat-abi=hard: similar to + #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd"))) +#else #define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) +#endif +#endif #endif #endif #endif #if defined(__ARM_FEATURE_CRC32) - #define USE_ARM64_CRC + // #pragma message("") +/* +arm_acle.h (GGC): + before Nov 17, 2017: +#ifdef __ARM_FEATURE_CRC32 + + Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked" +#if __ARM_ARCH >= 8 +#pragma GCC target ("arch=armv8-a+crc") + + Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1: +#ifdef __ARM_FEATURE_CRC32 +#ifdef __ARM_FP +#pragma GCC target ("arch=armv8-a+crc+simd") +#else +#pragma GCC target ("arch=armv8-a+crc") +#endif +*/ +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 +#if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") +#undef __ARM_ARCH +#define __ARM_ARCH 8 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif + #define Z7_CRC_HW_USE #include #endif + #elif defined(_MSC_VER) + #if defined(MY_CPU_ARM64) + #if (_MSC_VER >= 1910) + #ifdef __clang__ + // #define Z7_CRC_HW_USE + // #include + #else + #define Z7_CRC_HW_USE + #include + #endif + #endif + #endif #endif -#else - -// no hardware CRC - -// #define USE_CRC_EMU - -#ifdef USE_CRC_EMU - -#pragma message("ARM64 CRC emulation") - -Z7_FORCE_INLINE -UInt32 __crc32b(UInt32 v, UInt32 data) -{ - const UInt32 *table = g_CrcTable; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); - return v; -} +#else // non-ARM* -Z7_FORCE_INLINE -UInt32 __crc32w(UInt32 v, UInt32 data) -{ - const UInt32 *table = g_CrcTable; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - return v; -} +// #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code +#ifdef Z7_CRC_HW_USE +#include "7zCrcEmu.h" +#endif -Z7_FORCE_INLINE -UInt32 __crc32d(UInt32 v, UInt64 data) -{ - const UInt32 *table = g_CrcTable; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8; - return v; -} +#endif // non-ARM* -#endif // USE_CRC_EMU -#endif // defined(MY_CPU_ARM64) && defined(MY_CPU_LE) +#if defined(Z7_CRC_HW_USE) +// #pragma message("USE ARM HW CRC") -#if defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) +#ifdef MY_CPU_64BIT + #define CRC_HW_WORD_TYPE UInt64 + #define CRC_HW_WORD_FUNC __crc32d +#else + #define CRC_HW_WORD_TYPE UInt32 + #define CRC_HW_WORD_FUNC __crc32w +#endif -#define T0_32_UNROLL_BYTES (4 * 4) -#define T0_64_UNROLL_BYTES (4 * 8) +#define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4) -#ifndef ATTRIB_CRC -#define ATTRIB_CRC +#ifdef ATTRIB_CRC + ATTRIB_CRC #endif -// #pragma message("USE ARM HW CRC") - -ATTRIB_CRC -UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table); -ATTRIB_CRC -UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table) +Z7_NO_INLINE +#ifdef Z7_CRC_HW_FORCE + UInt32 Z7_FASTCALL CrcUpdate +#else + static UInt32 Z7_FASTCALL CrcUpdate_HW +#endif + (UInt32 v, const void *data, size_t size) { const Byte *p = (const Byte *)data; - UNUSED_VAR(table); - - for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_32_UNROLL_BYTES - 1)) != 0; size--) + for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--) v = __crc32b(v, *p++); - - if (size >= T0_32_UNROLL_BYTES) + if (size >= CRC_HW_UNROLL_BYTES) { const Byte *lim = p + size; - size &= (T0_32_UNROLL_BYTES - 1); + size &= CRC_HW_UNROLL_BYTES - 1; lim -= size; do { - v = __crc32w(v, *(const UInt32 *)(const void *)(p)); - v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; - v = __crc32w(v, *(const UInt32 *)(const void *)(p)); - v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); + p += 2 * sizeof(CRC_HW_WORD_TYPE); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p)); + v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE))); + p += 2 * sizeof(CRC_HW_WORD_TYPE); } while (p != lim); } @@ -200,46 +223,86 @@ UInt32 Z7_FASTCALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const return v; } -ATTRIB_CRC -UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table); -ATTRIB_CRC -UInt32 Z7_FASTCALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table) +#ifdef Z7_ARM_FEATURE_CRC32_WAS_SET +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#undef __ARM_FEATURE_CRC32 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#undef Z7_ARM_FEATURE_CRC32_WAS_SET +#endif + +#endif // defined(Z7_CRC_HW_USE) +#endif // MY_CPU_LE + + + +#ifndef Z7_CRC_HW_FORCE + +#if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) +/* +typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC) + (UInt32 v, const void *data, size_t size, const UInt32 *table); +Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate; +*/ +static unsigned g_Crc_Algo; +#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) +static unsigned g_Crc_Be; +#endif +#endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) + + + +Z7_NO_INLINE +#ifdef Z7_CRC_HW_USE + static UInt32 Z7_FASTCALL CrcUpdate_Base +#else + UInt32 Z7_FASTCALL CrcUpdate +#endif + (UInt32 crc, const void *data, size_t size) { - const Byte *p = (const Byte *)data; - UNUSED_VAR(table); +#if Z7_CRC_NUM_TABLES_USE == 1 + return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); +#else // Z7_CRC_NUM_TABLES_USE != 1 +#ifdef Z7_CRC_UPDATE_T1_FUNC_NAME + if (g_Crc_Algo == 1) + return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size); +#endif - for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_64_UNROLL_BYTES - 1)) != 0; size--) - v = __crc32b(v, *p++); +#ifdef MY_CPU_LE + return FUNC_NAME_LE(crc, data, size, g_CrcTable); +#elif defined(MY_CPU_BE) + return FUNC_NAME_BE(crc, data, size, g_CrcTable); +#else + if (g_Crc_Be) + return FUNC_NAME_BE(crc, data, size, g_CrcTable); + else + return FUNC_NAME_LE(crc, data, size, g_CrcTable); +#endif +#endif // Z7_CRC_NUM_TABLES_USE != 1 +} - if (size >= T0_64_UNROLL_BYTES) - { - const Byte *lim = p + size; - size &= (T0_64_UNROLL_BYTES - 1); - lim -= size; - do - { - v = __crc32d(v, *(const UInt64 *)(const void *)(p)); - v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; - v = __crc32d(v, *(const UInt64 *)(const void *)(p)); - v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8; - } - while (p != lim); - } - - for (; size != 0; size--) - v = __crc32b(v, *p++); - return v; +#ifdef Z7_CRC_HW_USE +Z7_NO_INLINE +UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size) +{ + if (g_Crc_Algo == 0) + return CrcUpdate_HW(crc, data, size); + return CrcUpdate_Base(crc, data, size); } +#endif -#undef T0_32_UNROLL_BYTES -#undef T0_64_UNROLL_BYTES +#endif // !defined(Z7_CRC_HW_FORCE) -#endif // defined(USE_ARM64_CRC) || defined(USE_CRC_EMU) -#endif // MY_CPU_LE + +UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size) +{ + return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL; +} +MY_ALIGN(64) +UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL]; void Z7_FASTCALL CrcGenerateTable(void) @@ -247,94 +310,111 @@ void Z7_FASTCALL CrcGenerateTable(void) UInt32 i; for (i = 0; i < 256; i++) { +#if defined(Z7_CRC_HW_FORCE) + g_CrcTable[i] = __crc32b(i, 0); +#else + #define kCrcPoly 0xEDB88320 UInt32 r = i; unsigned j; for (j = 0; j < 8; j++) r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); g_CrcTable[i] = r; +#endif } - for (i = 256; i < 256 * CRC_NUM_TABLES; i++) + for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++) { const UInt32 r = g_CrcTable[(size_t)i - 256]; g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); } - #if CRC_NUM_TABLES < 4 - g_CrcUpdate = CrcUpdateT1; - #elif defined(MY_CPU_LE) - // g_CrcUpdateT4 = CrcUpdateT4; - #if CRC_NUM_TABLES < 8 - g_CrcUpdate = CrcUpdateT4; - #else // CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT8; - /* - #ifdef MY_CPU_X86_OR_AMD64 - if (!CPU_Is_InOrder()) - #endif - */ - g_CrcUpdate = CrcUpdateT8; - #endif - #else +#if !defined(Z7_CRC_HW_FORCE) && \ + (defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE)) + +#if Z7_CRC_NUM_TABLES_USE <= 1 + g_Crc_Algo = 1; +#else // Z7_CRC_NUM_TABLES_USE <= 1 + +#if defined(MY_CPU_LE) + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; +#else // !defined(MY_CPU_LE) { - #ifndef MY_CPU_BE +#ifndef MY_CPU_BE UInt32 k = 0x01020304; const Byte *p = (const Byte *)&k; if (p[0] == 4 && p[1] == 3) - { - #if CRC_NUM_TABLES < 8 - // g_CrcUpdateT4 = CrcUpdateT4; - g_CrcUpdate = CrcUpdateT4; - #else // CRC_NUM_TABLES >= 8 - g_CrcUpdateT8 = CrcUpdateT8; - g_CrcUpdate = CrcUpdateT8; - #endif - } + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; else if (p[0] != 1 || p[1] != 2) - g_CrcUpdate = CrcUpdateT1; + g_Crc_Algo = 1; else - #endif // MY_CPU_BE +#endif // MY_CPU_BE { - for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--) + for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--) { const UInt32 x = g_CrcTable[(size_t)i - 256]; g_CrcTable[i] = Z7_BSWAP32(x); } - #if CRC_NUM_TABLES <= 4 - g_CrcUpdate = CrcUpdateT1; - #elif CRC_NUM_TABLES <= 8 - // g_CrcUpdateT4 = CrcUpdateT1_BeT4; - g_CrcUpdate = CrcUpdateT1_BeT4; - #else // CRC_NUM_TABLES > 8 - g_CrcUpdateT8 = CrcUpdateT1_BeT8; - g_CrcUpdate = CrcUpdateT1_BeT8; - #endif +#if defined(Z7_CRC_UPDATE_T1_FUNC_NAME) + g_Crc_Algo = Z7_CRC_NUM_TABLES_USE; +#endif +#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE)) + g_Crc_Be = 1; +#endif } } - #endif // CRC_NUM_TABLES < 4 +#endif // !defined(MY_CPU_LE) - #ifdef MY_CPU_LE - #ifdef USE_ARM64_CRC - if (CPU_IsSupported_CRC32()) - { - g_CrcUpdateT0_32 = CrcUpdateT0_32; - g_CrcUpdateT0_64 = CrcUpdateT0_64; - g_CrcUpdate = - #if defined(MY_CPU_ARM) - CrcUpdateT0_32; - #else - CrcUpdateT0_64; - #endif - } - #endif - - #ifdef USE_CRC_EMU - g_CrcUpdateT0_32 = CrcUpdateT0_32; - g_CrcUpdateT0_64 = CrcUpdateT0_64; - g_CrcUpdate = CrcUpdateT0_64; - #endif +#ifdef MY_CPU_LE +#ifdef Z7_CRC_HW_USE + if (CPU_IsSupported_CRC32()) + g_Crc_Algo = 0; +#endif // Z7_CRC_HW_USE +#endif // MY_CPU_LE + +#endif // Z7_CRC_NUM_TABLES_USE <= 1 +#endif // g_Crc_Algo was declared +} + +Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo) +{ + if (algo == 0) + return &CrcUpdate; + +#if defined(Z7_CRC_HW_USE) + if (algo == sizeof(CRC_HW_WORD_TYPE) * 8) + { +#ifdef Z7_CRC_HW_FORCE + return &CrcUpdate; +#else + if (g_Crc_Algo == 0) + return &CrcUpdate_HW; +#endif + } +#endif + +#ifndef Z7_CRC_HW_FORCE + if (algo == Z7_CRC_NUM_TABLES_USE) + return + #ifdef Z7_CRC_HW_USE + &CrcUpdate_Base; + #else + &CrcUpdate; #endif +#endif + + return NULL; } #undef kCrcPoly -#undef CRC64_NUM_TABLES +#undef Z7_CRC_NUM_TABLES_USE +#undef Z7_CRC_NUM_TABLES_TOTAL #undef CRC_UPDATE_BYTE_2 +#undef FUNC_NAME_LE_2 +#undef FUNC_NAME_LE_1 +#undef FUNC_NAME_LE +#undef FUNC_NAME_BE_2 +#undef FUNC_NAME_BE_1 +#undef FUNC_NAME_BE + +#undef CRC_HW_UNROLL_BYTES +#undef CRC_HW_WORD_FUNC +#undef CRC_HW_WORD_TYPE diff --git a/3rdparty/lzma/src/7zCrcOpt.c b/3rdparty/lzma/src/7zCrcOpt.c index 9c649290df720..9408017ed4fc1 100644 --- a/3rdparty/lzma/src/7zCrcOpt.c +++ b/3rdparty/lzma/src/7zCrcOpt.c @@ -1,117 +1,199 @@ -/* 7zCrcOpt.c -- CRC32 calculation -2023-04-02 : Igor Pavlov : Public domain */ +/* 7zCrcOpt.c -- CRC32 calculation (optimized functions) +2023-12-07 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "CpuArch.h" +#if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1 + +// for debug only : define Z7_CRC_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC_DEBUG_BE +#ifdef Z7_CRC_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +// the value Z7_CRC_NUM_TABLES_USE must be defined to same value as in 7zCrc.c +#ifdef Z7_CRC_NUM_TABLES +#define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES +#else +#define Z7_CRC_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC_NUM_TABLES_USE % 4 || \ + Z7_CRC_NUM_TABLES_USE < 4 * 1 || \ + Z7_CRC_NUM_TABLES_USE > 4 * 6 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif + + #ifndef MY_CPU_BE -#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) -UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); -UInt32 Z7_FASTCALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table) -{ - const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) - v = CRC_UPDATE_BYTE_2(v, *p); - for (; size >= 4; size -= 4, p += 4) - { - v ^= *(const UInt32 *)(const void *)p; - v = - (table + 0x300)[((v ) & 0xFF)] - ^ (table + 0x200)[((v >> 8) & 0xFF)] - ^ (table + 0x100)[((v >> 16) & 0xFF)] - ^ (table + 0x000)[((v >> 24))]; - } - for (; size > 0; size--, p++) - v = CRC_UPDATE_BYTE_2(v, *p); - return v; -} +#define Q(n, d) \ + ( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) + +#define R(a) *((const UInt32 *)(const void *)p + (a)) + +#define CRC_FUNC_PRE_LE2(step) \ +UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) -UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); -UInt32 Z7_FASTCALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table) +#define CRC_FUNC_PRE_LE(step) \ + CRC_FUNC_PRE_LE2(step); \ + CRC_FUNC_PRE_LE2(step) + +CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) + const Byte *lim; + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC_UPDATE_BYTE_2(v, *p); - for (; size >= 8; size -= 8, p += 8) + lim = p + size; + if (size >= Z7_CRC_NUM_TABLES_USE) { - UInt32 d; - v ^= *(const UInt32 *)(const void *)p; - v = - (table + 0x700)[((v ) & 0xFF)] - ^ (table + 0x600)[((v >> 8) & 0xFF)] - ^ (table + 0x500)[((v >> 16) & 0xFF)] - ^ (table + 0x400)[((v >> 24))]; - d = *((const UInt32 *)(const void *)p + 1); - v ^= - (table + 0x300)[((d ) & 0xFF)] - ^ (table + 0x200)[((d >> 8) & 0xFF)] - ^ (table + 0x100)[((d >> 16) & 0xFF)] - ^ (table + 0x000)[((d >> 24))]; + lim -= Z7_CRC_NUM_TABLES_USE; + do + { + v ^= R(0); + { +#if Z7_CRC_NUM_TABLES_USE == 1 * 4 + v = Q(0, v); +#else +#define U2(r, op) \ + { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } + UInt32 d, x; + U2(1, =) +#if Z7_CRC_NUM_TABLES_USE >= 3 * 4 +#define U(r) U2(r, ^=) + U(2) +#if Z7_CRC_NUM_TABLES_USE >= 4 * 4 + U(3) +#if Z7_CRC_NUM_TABLES_USE >= 5 * 4 + U(4) +#if Z7_CRC_NUM_TABLES_USE >= 6 * 4 + U(5) +#if Z7_CRC_NUM_TABLES_USE >= 7 * 4 +#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif +#endif +#endif +#endif +#endif +#undef U +#undef U2 + v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); +#endif + } + p += Z7_CRC_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC_UPDATE_BYTE_2(v, *p); return v; } +#undef CRC_UPDATE_BYTE_2 +#undef R +#undef Q +#undef CRC_FUNC_PRE_LE +#undef CRC_FUNC_PRE_LE2 + #endif + + #ifndef MY_CPU_LE -#define CRC_UINT32_SWAP(v) Z7_BSWAP32(v) +#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8)) -#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8)) +#define Q(n, d) \ + ( (table + ((n) * 4 + 0) * 0x100)[((d)) & 0xFF] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) -UInt32 Z7_FASTCALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table) -{ - const Byte *p = (const Byte *)data; - table += 0x100; - v = CRC_UINT32_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) - v = CRC_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 4; size -= 4, p += 4) - { - v ^= *(const UInt32 *)(const void *)p; - v = - (table + 0x000)[((v ) & 0xFF)] - ^ (table + 0x100)[((v >> 8) & 0xFF)] - ^ (table + 0x200)[((v >> 16) & 0xFF)] - ^ (table + 0x300)[((v >> 24))]; - } - for (; size > 0; size--, p++) - v = CRC_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT32_SWAP(v); -} +#ifdef Z7_CRC_DEBUG_BE + #define R(a) GetBe32a((const UInt32 *)(const void *)p + (a)) +#else + #define R(a) *((const UInt32 *)(const void *)p + (a)) +#endif + + +#define CRC_FUNC_PRE_BE2(step) \ +UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table) -UInt32 Z7_FASTCALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table) +#define CRC_FUNC_PRE_BE(step) \ + CRC_FUNC_PRE_BE2(step); \ + CRC_FUNC_PRE_BE2(step) + +CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; + const Byte *lim; table += 0x100; - v = CRC_UINT32_SWAP(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) + v = Z7_BSWAP32(v); + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 8; size -= 8, p += 8) + lim = p + size; + if (size >= Z7_CRC_NUM_TABLES_USE) { - UInt32 d; - v ^= *(const UInt32 *)(const void *)p; - v = - (table + 0x400)[((v ) & 0xFF)] - ^ (table + 0x500)[((v >> 8) & 0xFF)] - ^ (table + 0x600)[((v >> 16) & 0xFF)] - ^ (table + 0x700)[((v >> 24))]; - d = *((const UInt32 *)(const void *)p + 1); - v ^= - (table + 0x000)[((d ) & 0xFF)] - ^ (table + 0x100)[((d >> 8) & 0xFF)] - ^ (table + 0x200)[((d >> 16) & 0xFF)] - ^ (table + 0x300)[((d >> 24))]; + lim -= Z7_CRC_NUM_TABLES_USE; + do + { + v ^= R(0); + { +#if Z7_CRC_NUM_TABLES_USE == 1 * 4 + v = Q(0, v); +#else +#define U2(r, op) \ + { d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); } + UInt32 d, x; + U2(1, =) +#if Z7_CRC_NUM_TABLES_USE >= 3 * 4 +#define U(r) U2(r, ^=) + U(2) +#if Z7_CRC_NUM_TABLES_USE >= 4 * 4 + U(3) +#if Z7_CRC_NUM_TABLES_USE >= 5 * 4 + U(4) +#if Z7_CRC_NUM_TABLES_USE >= 6 * 4 + U(5) +#if Z7_CRC_NUM_TABLES_USE >= 7 * 4 +#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES +#endif +#endif +#endif +#endif +#endif +#undef U +#undef U2 + v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v); +#endif + } + p += Z7_CRC_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC_UPDATE_BYTE_2_BE(v, *p); - return CRC_UINT32_SWAP(v); + return Z7_BSWAP32(v); } +#undef CRC_UPDATE_BYTE_2_BE +#undef R +#undef Q +#undef CRC_FUNC_PRE_BE +#undef CRC_FUNC_PRE_BE2 + +#endif +#undef Z7_CRC_NUM_TABLES_USE #endif diff --git a/3rdparty/lzma/src/7zDec.c b/3rdparty/lzma/src/7zDec.c index 96c60359a4779..c9b4064e3ffb4 100644 --- a/3rdparty/lzma/src/7zDec.c +++ b/3rdparty/lzma/src/7zDec.c @@ -1,5 +1,5 @@ /* 7zDec.c -- Decoding from 7z folder -2023-04-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -51,6 +51,7 @@ #ifndef Z7_NO_METHODS_FILTERS #define k_Delta 3 +#define k_RISCV 0xb #define k_BCJ 0x3030103 #define k_PPC 0x3030205 #define k_IA64 0x3030401 @@ -362,6 +363,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f) case k_IA64: case k_SPARC: case k_ARM: + case k_RISCV: #endif #ifdef Z7_USE_FILTER_ARM64 case k_ARM64: @@ -535,10 +537,10 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, } } } - #if defined(Z7_USE_BRANCH_FILTER) +#if defined(Z7_USE_BRANCH_FILTER) else if (ci == 1) { - #if !defined(Z7_NO_METHODS_FILTERS) +#if !defined(Z7_NO_METHODS_FILTERS) if (coder->MethodID == k_Delta) { if (coder->PropsSize != 1) @@ -550,22 +552,43 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, } continue; } - #endif +#endif - #ifdef Z7_USE_FILTER_ARM64 +#ifdef Z7_USE_FILTER_ARM64 if (coder->MethodID == k_ARM64) { UInt32 pc = 0; if (coder->PropsSize == 4) + { pc = GetUi32(propsData + coder->PropsOffset); + if (pc & 3) + return SZ_ERROR_UNSUPPORTED; + } else if (coder->PropsSize != 0) return SZ_ERROR_UNSUPPORTED; z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc); continue; } - #endif - - #if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) + if (coder->MethodID == k_RISCV) + { + UInt32 pc = 0; + if (coder->PropsSize == 4) + { + pc = GetUi32(propsData + coder->PropsOffset); + if (pc & 1) + return SZ_ERROR_UNSUPPORTED; + } + else if (coder->PropsSize != 0) + return SZ_ERROR_UNSUPPORTED; + z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc); + continue; + } +#endif + +#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT) { if (coder->PropsSize != 0) return SZ_ERROR_UNSUPPORTED; @@ -579,7 +602,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0 break; } - CASE_BRA_CONV(PPC) + case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0; + // CASE_BRA_CONV(PPC) CASE_BRA_CONV(IA64) CASE_BRA_CONV(SPARC) CASE_BRA_CONV(ARM) @@ -592,9 +616,9 @@ static SRes SzFolder_Decode2(const CSzFolder *folder, } continue; } - #endif +#endif } // (c == 1) - #endif +#endif // Z7_USE_BRANCH_FILTER else return SZ_ERROR_UNSUPPORTED; } diff --git a/3rdparty/lzma/src/Aes.c b/3rdparty/lzma/src/Aes.c index bcaafab1188f6..abc5d24b48500 100644 --- a/3rdparty/lzma/src/Aes.c +++ b/3rdparty/lzma/src/Aes.c @@ -1,5 +1,5 @@ /* Aes.c -- AES encryption / decryption -2023-04-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -13,7 +13,9 @@ AES_CODE_FUNC g_AesCtr_Code; UInt32 g_Aes_SupportedFunctions_Flags; #endif +MY_ALIGN(64) static UInt32 T[256 * 4]; +MY_ALIGN(64) static const Byte Sbox[256] = { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, @@ -33,7 +35,9 @@ static const Byte Sbox[256] = { 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; +MY_ALIGN(64) static UInt32 D[256 * 4]; +MY_ALIGN(64) static Byte InvS[256]; #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) @@ -54,24 +58,54 @@ static Byte InvS[256]; // #define Z7_SHOW_AES_STATUS #ifdef MY_CPU_X86_OR_AMD64 - #define USE_HW_AES -#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) - #if defined(__clang__) - #if (__clang_major__ >= 8) // fix that check - #define USE_HW_AES - #endif - #elif defined(__GNUC__) - #if (__GNUC__ >= 6) // fix that check + + #if defined(__INTEL_COMPILER) + #if (__INTEL_COMPILER >= 1110) #define USE_HW_AES + #if (__INTEL_COMPILER >= 1900) + #define USE_HW_VAES + #endif #endif + #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) + #define USE_HW_AES + #if defined(__clang__) && (__clang_major__ >= 8) \ + || defined(__GNUC__) && (__GNUC__ >= 8) + #define USE_HW_VAES + #endif #elif defined(_MSC_VER) - #if _MSC_VER >= 1910 + #define USE_HW_AES + #define USE_HW_VAES + #endif + +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_AES) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_AES + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) #define USE_HW_AES #endif + #endif + #endif #endif #endif #ifdef USE_HW_AES +// #pragma message("=== Aes.c USE_HW_AES === ") #ifdef Z7_SHOW_AES_STATUS #include #define PRF(x) x @@ -136,6 +170,7 @@ void AesGenTables(void) #endif #ifdef MY_CPU_X86_OR_AMD64 + #ifdef USE_HW_VAES if (CPU_IsSupported_VAES_AVX2()) { PRF(printf("\n===vaes avx2\n")); @@ -146,6 +181,7 @@ void AesGenTables(void) #endif } #endif + #endif } #endif diff --git a/3rdparty/lzma/src/AesOpt.c b/3rdparty/lzma/src/AesOpt.c index e48a6d641fbd2..58769ea05912f 100644 --- a/3rdparty/lzma/src/AesOpt.c +++ b/3rdparty/lzma/src/AesOpt.c @@ -1,5 +1,5 @@ /* AesOpt.c -- AES optimized code for x86 AES hardware instructions -2023-04-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -15,8 +15,8 @@ #define USE_INTEL_VAES #endif #endif - #elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \ - || defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4) + #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400) #define USE_INTEL_AES #if !defined(__AES__) #define ATTRIB_AES __attribute__((__target__("aes"))) @@ -35,27 +35,37 @@ #define USE_INTEL_VAES #endif #endif + #ifndef USE_INTEL_AES + #define Z7_USE_AES_HW_STUB + #endif + #ifndef USE_INTEL_VAES + #define Z7_USE_VAES_HW_STUB + #endif #endif -#ifndef ATTRIB_AES - #define ATTRIB_AES -#endif -#ifndef ATTRIB_VAES - #define ATTRIB_VAES -#endif + #ifndef USE_INTEL_AES + // #define Z7_USE_AES_HW_STUB // for debug + #endif + #ifndef USE_INTEL_VAES + // #define Z7_USE_VAES_HW_STUB // for debug + #endif #ifdef USE_INTEL_AES #include -#ifndef USE_INTEL_VAES +#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) #define AES_TYPE_keys UInt32 #define AES_TYPE_data Byte // #define AES_TYPE_keys __m128i // #define AES_TYPE_data __m128i #endif +#ifndef ATTRIB_AES + #define ATTRIB_AES +#endif + #define AES_FUNC_START(name) \ void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks) // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks) @@ -69,8 +79,6 @@ AES_FUNC_START (name) #define MM_OP_m(op, src) MM_OP(op, m, src) #define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src) -#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) - AES_FUNC_START2 (AesCbc_Encode_HW) { @@ -139,11 +147,6 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]) #endif -#define AVX_DECLARE_VAR(reg, ii) __m256i reg; -#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; -#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; -#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) - #define MM_OP_key(op, reg) MM_OP(op, reg, key); #define AES_DEC( reg, ii) MM_OP_key (_mm_aesdec_si128, reg) @@ -152,27 +155,13 @@ AES_FUNC_START2 (AesCbc_Encode_HW) #define AES_ENC_LAST( reg, ii) MM_OP_key (_mm_aesenclast_si128, reg) #define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg) - -#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) -#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) -#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) -#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) -#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) - #define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr; #define CTR_END( reg, ii) MM_XOR (data[ii], reg) -#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); -#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) - #define WOP_KEY(op, n) { \ const __m128i key = w[n]; \ WOP(op); } -#define AVX_WOP_KEY(op, n) { \ - const __m256i key = w[n]; \ - WOP(op); } - #define WIDE_LOOP_START \ dataEnd = data + numBlocks; \ @@ -190,6 +179,40 @@ AES_FUNC_START2 (AesCbc_Encode_HW) for (; data < dataEnd; data++) + +#ifdef USE_INTEL_VAES + +#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src) +#define AVX_DECLARE_VAR(reg, ii) __m256i reg; +#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii]; +#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg; +/* +AVX_XOR_data_M1() needs unaligned memory load +if (we don't use _mm256_loadu_si256() here) +{ + Most compilers with enabled optimizations generate fused AVX (LOAD + OP) + instruction that can load unaligned data. + But GCC and CLANG without -O2 or -O1 optimizations can generate separated + LOAD-ALIGNED (vmovdqa) instruction that will fail on execution. +} +Note: some compilers generate more instructions, if we use _mm256_loadu_si256() here. +v23.02: we use _mm256_loadu_si256() here, because we need compatibility with any compiler. +*/ +#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, _mm256_loadu_si256(&(((const __m256i *)(const void *)(data - 1))[ii]))) +// for debug only: the following code will fail on execution, if compiled by some compilers: +// #define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii])) + +#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg) +#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg) +#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg) +#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg) +#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg) +#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key); +#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg) +#define AVX_WOP_KEY(op, n) { \ + const __m256i key = w[n]; \ + WOP(op); } + #define NUM_AES_KEYS_MAX 15 #define WIDE_LOOP_START_AVX(OP) \ @@ -214,6 +237,9 @@ AES_FUNC_START2 (AesCbc_Encode_HW) /* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified, MSVC still can insert vzeroupper instruction. */ +#endif + + AES_FUNC_START2 (AesCbc_Decode_HW) { @@ -380,6 +406,9 @@ required that must be included before . #endif #endif // __clang__ && _MSC_VER +#ifndef ATTRIB_VAES + #define ATTRIB_VAES +#endif #define VAES_FUNC_START2(name) \ AES_FUNC_START (name); \ @@ -519,10 +548,18 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) /* no USE_INTEL_AES */ +#if defined(Z7_USE_AES_HW_STUB) +// We can compile this file with another C compiler, +// or we can compile asm version. +// So we can generate real code instead of this stub function. +// #if defined(_MSC_VER) #pragma message("AES HW_SW stub was used") +// #endif +#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB) #define AES_TYPE_keys UInt32 #define AES_TYPE_data Byte +#endif #define AES_FUNC_START(name) \ void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \ @@ -535,13 +572,16 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256) AES_COMPAT_STUB (AesCbc_Encode) AES_COMPAT_STUB (AesCbc_Decode) AES_COMPAT_STUB (AesCtr_Code) +#endif // Z7_USE_AES_HW_STUB #endif // USE_INTEL_AES #ifndef USE_INTEL_VAES - +#if defined(Z7_USE_VAES_HW_STUB) +// #if defined(_MSC_VER) #pragma message("VAES HW_SW stub was used") +// #endif #define VAES_COMPAT_STUB(name) \ void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \ @@ -550,36 +590,59 @@ AES_COMPAT_STUB (AesCtr_Code) VAES_COMPAT_STUB (AesCbc_Decode_HW) VAES_COMPAT_STUB (AesCtr_Code_HW) - +#endif #endif // ! USE_INTEL_VAES + + #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) - #if defined(__clang__) - #if (__clang_major__ >= 8) // fix that check + #if defined(__ARM_FEATURE_AES) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_AES + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) #define USE_HW_AES #endif - #elif defined(__GNUC__) - #if (__GNUC__ >= 6) // fix that check - #define USE_HW_AES #endif - #elif defined(_MSC_VER) - #if _MSC_VER >= 1910 - #define USE_HW_AES #endif #endif #ifdef USE_HW_AES // #pragma message("=== AES HW === ") +// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES #if defined(__clang__) || defined(__GNUC__) +#if !defined(__ARM_FEATURE_AES) && \ + !defined(__ARM_FEATURE_CRYPTO) #ifdef MY_CPU_ARM64 - #define ATTRIB_AES __attribute__((__target__("+crypto,aes"))) +#if defined(__clang__) + #define ATTRIB_AES __attribute__((__target__("crypto"))) +#else + #define ATTRIB_AES __attribute__((__target__("+crypto"))) +#endif #else +#if defined(__clang__) + #define ATTRIB_AES __attribute__((__target__("armv8-a,aes"))) +#else #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) +#endif #endif +#endif #else // _MSC_VER // for arm32 @@ -590,12 +653,60 @@ VAES_COMPAT_STUB (AesCtr_Code_HW) #define ATTRIB_AES #endif -#if defined(_MSC_VER) && !defined(__clang__) && defined(MY_CPU_ARM64) +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else +/* + clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese + clang + 3.8.1 : __ARM_NEON : defined(__ARM_FEATURE_CRYPTO) + 7.0.1 : __ARM_NEON : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) + 11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) + 13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES) + 16 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 +*/ +#if defined(__clang__) && __clang_major__ < 16 +#if !defined(__ARM_FEATURE_AES) && \ + !defined(__ARM_FEATURE_CRYPTO) +// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 +// #if defined(__clang__) && __clang_major__ < 13 + #define __ARM_FEATURE_CRYPTO 1 +// #else + #define __ARM_FEATURE_AES 1 +// #endif + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif // clang + +#if defined(__clang__) + +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") + #undef __ARM_ARCH + #define __ARM_ARCH 8 + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // clang + #include + +#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ + defined(__ARM_FEATURE_CRYPTO) && \ + defined(__ARM_FEATURE_AES) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #undef __ARM_FEATURE_CRYPTO + #undef __ARM_FEATURE_AES + #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") #endif +#endif // Z7_MSC_VER_ORIGINAL + typedef uint8x16_t v128; #define AES_FUNC_START(name) \ @@ -620,7 +731,7 @@ AES_FUNC_START (name) AES_FUNC_START2 (AesCbc_Encode_HW) { - v128 *p = (v128*)(void*)ivAes; + v128 * const p = (v128*)(void*)ivAes; v128 *data = (v128*)(void*)data8; v128 m = *p; const v128 k0 = p[2]; @@ -639,7 +750,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) const v128 k_z0 = w[2]; for (; numBlocks != 0; numBlocks--, data++) { - MM_XOR_m (*data); + MM_XOR_m (*data) AES_E_MC_m (k0) AES_E_MC_m (k1) AES_E_MC_m (k2) @@ -660,7 +771,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW) } } AES_E_m (k_z1) - MM_XOR_m (k_z0); + MM_XOR_m (k_z0) *data = m; } *p = m; @@ -745,7 +856,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW) while (w != p); WOP_KEY (AES_D, 1) WOP_KEY (AES_XOR, 0) - MM_XOR (m0, iv); + MM_XOR (m0, iv) WOP_M1 (XOR_data_M1) iv = data[NUM_WAYS - 1]; WOP (STORE_data) @@ -759,14 +870,14 @@ AES_FUNC_START2 (AesCbc_Decode_HW) AES_D_IMC_m (w[2]) do { - AES_D_IMC_m (w[1]); - AES_D_IMC_m (w[0]); + AES_D_IMC_m (w[1]) + AES_D_IMC_m (w[0]) w -= 2; } while (w != p); - AES_D_m (w[1]); - MM_XOR_m (w[0]); - MM_XOR_m (iv); + AES_D_m (w[1]) + MM_XOR_m (w[0]) + MM_XOR_m (iv) iv = *data; *data = m; } @@ -783,6 +894,12 @@ AES_FUNC_START2 (AesCtr_Code_HW) const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2; const v128 *dataEnd; uint64x2_t one = vdupq_n_u64(0); + +// the bug in clang: +// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2); +#if defined(__clang__) && (__clang_major__ <= 9) +#pragma GCC diagnostic ignored "-Wvector-conversion" +#endif one = vsetq_lane_u64(1, one, 0); p += 2; @@ -809,11 +926,11 @@ AES_FUNC_START2 (AesCtr_Code_HW) { const v128 *w = p; v128 m; - CTR_START (m, 0); + CTR_START (m, 0) do { - AES_E_MC_m (w[0]); - AES_E_MC_m (w[1]); + AES_E_MC_m (w[0]) + AES_E_MC_m (w[1]) w += 2; } while (w != wEnd); diff --git a/3rdparty/lzma/src/Alloc.c b/3rdparty/lzma/src/Alloc.c index d841bf20a3519..63e1a121e7dd2 100644 --- a/3rdparty/lzma/src/Alloc.c +++ b/3rdparty/lzma/src/Alloc.c @@ -1,5 +1,5 @@ /* Alloc.c -- Memory allocation functions -2023-04-02 : Igor Pavlov : Public domain */ +2024-02-18 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -10,19 +10,18 @@ #include "Alloc.h" -#ifdef _WIN32 -#ifdef Z7_LARGE_PAGES -#if defined(__clang__) || defined(__GNUC__) -typedef void (*Z7_voidFunction)(void); -#define MY_CAST_FUNC (Z7_voidFunction) -#elif defined(_MSC_VER) && _MSC_VER > 1920 -#define MY_CAST_FUNC (void *) -// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' -#else -#define MY_CAST_FUNC +#if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \ + (!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64) + #define Z7_USE_DYN_GetLargePageMinimum +#endif + +// for debug: +#if 0 +#if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16) +// #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ") +#define Z7_ALLOC_NO_OFFSET_ALLOCATOR +#endif #endif -#endif // Z7_LARGE_PAGES -#endif // _WIN32 // #define SZ_ALLOC_DEBUG /* #define SZ_ALLOC_DEBUG */ @@ -146,7 +145,9 @@ static void PrintAddr(void *p) #define PRINT_FREE(name, cnt, ptr) #define Print(s) #define PrintLn() +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR #define PrintHex(v, align) +#endif #define PrintAddr(p) #endif @@ -246,9 +247,9 @@ void MidFree(void *address) #ifdef Z7_LARGE_PAGES #ifdef MEM_LARGE_PAGES - #define MY__MEM_LARGE_PAGES MEM_LARGE_PAGES + #define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES #else - #define MY__MEM_LARGE_PAGES 0x20000000 + #define MY_MEM_LARGE_PAGES 0x20000000 #endif extern @@ -258,19 +259,23 @@ typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID); void SetLargePageSize(void) { - #ifdef Z7_LARGE_PAGES SIZE_T size; +#ifdef Z7_USE_DYN_GetLargePageMinimum +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION + const Func_GetLargePageMinimum fn = - (Func_GetLargePageMinimum) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), + (Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "GetLargePageMinimum"); if (!fn) return; size = fn(); +#else + size = GetLargePageMinimum(); +#endif if (size == 0 || (size & (size - 1)) != 0) return; g_LargePageSize = size; - #endif } #endif // Z7_LARGE_PAGES @@ -292,7 +297,7 @@ void *BigAlloc(size_t size) size2 = (size + ps) & ~ps; if (size2 >= size) { - void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY__MEM_LARGE_PAGES, PAGE_READWRITE); + void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE); if (p) { PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p) @@ -328,20 +333,7 @@ const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; #endif -/* - uintptr_t : C99 (optional) - : unsupported in VS6 -*/ - -#ifdef _WIN32 - typedef UINT_PTR UIntPtr; -#else - /* - typedef uintptr_t UIntPtr; - */ - typedef ptrdiff_t UIntPtr; -#endif - +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR #define ADJUST_ALLOC_SIZE 0 /* @@ -352,14 +344,36 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; MyAlloc() can return address that is NOT multiple of sizeof(void *). */ - /* -#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1)))) + uintptr_t : C99 (optional) + : unsupported in VS6 */ -#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) +typedef + #ifdef _WIN32 + UINT_PTR + #elif 1 + uintptr_t + #else + ptrdiff_t + #endif + MY_uintptr_t; + +#if 0 \ + || (defined(__CHERI__) \ + || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8)) +// for 128-bit pointers (cheri): +#define MY_ALIGN_PTR_DOWN(p, align) \ + ((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1)))) +#else +#define MY_ALIGN_PTR_DOWN(p, align) \ + ((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1)))) +#endif +#endif -#if !defined(_WIN32) && defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L) +#if !defined(_WIN32) \ + && (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \ + || defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)) #define USE_posix_memalign #endif @@ -399,14 +413,13 @@ static int posix_memalign(void **ptr, size_t align, size_t size) #define ALLOC_ALIGN_SIZE ((size_t)1 << 7) -static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) +void *z7_AlignedAlloc(size_t size) { - #ifndef USE_posix_memalign +#ifndef USE_posix_memalign void *p; void *pAligned; size_t newSize; - UNUSED_VAR(pp) /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned block to prevent cache line sharing with another allocated blocks */ @@ -431,10 +444,9 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) return pAligned; - #else +#else void *p; - UNUSED_VAR(pp) if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) return NULL; @@ -443,19 +455,37 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) return p; - #endif +#endif +} + + +void z7_AlignedFree(void *address) +{ +#ifndef USE_posix_memalign + if (address) + MyFree(((void **)address)[-1]); +#else + free(address); +#endif +} + + +static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) +{ + UNUSED_VAR(pp) + return z7_AlignedAlloc(size); } static void SzAlignedFree(ISzAllocPtr pp, void *address) { UNUSED_VAR(pp) - #ifndef USE_posix_memalign +#ifndef USE_posix_memalign if (address) MyFree(((void **)address)[-1]); - #else +#else free(address); - #endif +#endif } @@ -463,16 +493,44 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree }; -#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) - /* we align ptr to support cases where CAlignOffsetAlloc::offset is not multiply of sizeof(void *) */ -#define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] -/* -#define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1] -*/ +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR +#if 1 + #define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *)) + #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] +#else + // we can use this simplified code, + // if (CAlignOffsetAlloc::offset == (k * sizeof(void *)) + #define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1]) +#endif +#endif + + +#if 0 +#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR +#include +static void PrintPtr(const char *s, const void *p) +{ + const Byte *p2 = (const Byte *)&p; + unsigned i; + printf("%s %p ", s, p); + for (i = sizeof(p); i != 0;) + { + i--; + printf("%02x", p2[i]); + } + printf("\n"); +} +#endif +#endif + static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) { +#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) + UNUSED_VAR(pp) + return z7_AlignedAlloc(size); +#else const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); void *adr; void *pAligned; @@ -501,6 +559,12 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; +#if 0 + printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size); + PrintPtr("base", adr); + PrintPtr("alig", pAligned); +#endif + PrintLn(); Print("- Aligned: "); Print(" size="); PrintHex(size, 8); @@ -512,11 +576,16 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) REAL_BLOCK_PTR_VAR(pAligned) = adr; return pAligned; +#endif } static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) { +#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) + UNUSED_VAR(pp) + z7_AlignedFree(address); +#else if (address) { const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt); @@ -525,6 +594,7 @@ static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) PrintLn(); ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); } +#endif } diff --git a/3rdparty/lzma/src/Bra.c b/3rdparty/lzma/src/Bra.c index 22e0e478ecae3..e61edf8f12e1a 100644 --- a/3rdparty/lzma/src/Bra.c +++ b/3rdparty/lzma/src/Bra.c @@ -1,11 +1,11 @@ /* Bra.c -- Branch converters for RISC code -2023-04-02 : Igor Pavlov : Public domain */ +2024-01-20 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "Bra.h" -#include "CpuArch.h" #include "RotateDefs.h" +#include "CpuArch.h" #if defined(MY_CPU_SIZEOF_POINTER) \ && ( MY_CPU_SIZEOF_POINTER == 4 \ @@ -26,7 +26,7 @@ #define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c; // #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c; -#define Z7_BRANCH_CONV(name) z7_BranchConv_ ## name +#define Z7_BRANCH_CONV(name) z7_ ## name #define Z7_BRANCH_FUNC_MAIN(name) \ static \ @@ -42,11 +42,11 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ #ifdef Z7_EXTRACT_ONLY #define Z7_BRANCH_FUNCS_IMP(name) \ - Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) #else #define Z7_BRANCH_FUNCS_IMP(name) \ - Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) \ - Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC, 1) + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \ + Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1) #endif #if defined(__clang__) @@ -72,7 +72,7 @@ Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \ #endif -Z7_BRANCH_FUNC_MAIN(ARM64) +Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64) { // Byte *p = data; const Byte *lim; @@ -121,10 +121,10 @@ Z7_BRANCH_FUNC_MAIN(ARM64) } } } -Z7_BRANCH_FUNCS_IMP(ARM64) +Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64) -Z7_BRANCH_FUNC_MAIN(ARM) +Z7_BRANCH_FUNC_MAIN(BranchConv_ARM) { // Byte *p = data; const Byte *lim; @@ -152,10 +152,10 @@ Z7_BRANCH_FUNC_MAIN(ARM) } } } -Z7_BRANCH_FUNCS_IMP(ARM) +Z7_BRANCH_FUNCS_IMP(BranchConv_ARM) -Z7_BRANCH_FUNC_MAIN(PPC) +Z7_BRANCH_FUNC_MAIN(BranchConv_PPC) { // Byte *p = data; const Byte *lim; @@ -192,14 +192,14 @@ Z7_BRANCH_FUNC_MAIN(PPC) } } } -Z7_BRANCH_FUNCS_IMP(PPC) +Z7_BRANCH_FUNCS_IMP(BranchConv_PPC) #ifdef Z7_CPU_FAST_ROTATE_SUPPORTED #define BR_SPARC_USE_ROTATE #endif -Z7_BRANCH_FUNC_MAIN(SPARC) +Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC) { // Byte *p = data; const Byte *lim; @@ -254,10 +254,10 @@ Z7_BRANCH_FUNC_MAIN(SPARC) } } } -Z7_BRANCH_FUNCS_IMP(SPARC) +Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC) -Z7_BRANCH_FUNC_MAIN(ARMT) +Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT) { // Byte *p = data; Byte *lim; @@ -335,12 +335,12 @@ Z7_BRANCH_FUNC_MAIN(ARMT) // return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2)); // return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2)); } -Z7_BRANCH_FUNCS_IMP(ARMT) +Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT) // #define BR_IA64_NO_INLINE -Z7_BRANCH_FUNC_MAIN(IA64) +Z7_BRANCH_FUNC_MAIN(BranchConv_IA64) { // Byte *p = data; const Byte *lim; @@ -417,4 +417,293 @@ Z7_BRANCH_FUNC_MAIN(IA64) } } } -Z7_BRANCH_FUNCS_IMP(IA64) +Z7_BRANCH_FUNCS_IMP(BranchConv_IA64) + + +#define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET; +#define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET; + +#if 1 && defined(MY_CPU_LE_UNALIGN) + #define RISCV_USE_UNALIGNED_LOAD +#endif + +#ifdef RISCV_USE_UNALIGNED_LOAD + #define RISCV_GET_UI32(p) GetUi32(p) + #define RISCV_SET_UI32(p, v) { SetUi32(p, v) } +#else + #define RISCV_GET_UI32(p) \ + ((UInt32)GetUi16a(p) + \ + ((UInt32)GetUi16a((p) + 2) << 16)) + #define RISCV_SET_UI32(p, v) { \ + SetUi16a(p, (UInt16)(v)) \ + SetUi16a((p) + 2, (UInt16)(v >> 16)) } +#endif + +#if 1 && defined(MY_CPU_LE) + #define RISCV_USE_16BIT_LOAD +#endif + +#ifdef RISCV_USE_16BIT_LOAD + #define RISCV_LOAD_VAL(p) GetUi16a(p) +#else + #define RISCV_LOAD_VAL(p) (*(p)) +#endif + +#define RISCV_INSTR_SIZE 2 +#define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE) +#define RISCV_STEP_2 4 +#define RISCV_REG_VAL (2 << 7) +#define RISCV_CMD_VAL 3 +#if 1 + // for code size optimization: + #define RISCV_DELTA_7F 0x7f +#else + #define RISCV_DELTA_7F 0 +#endif + +#define RISCV_CHECK_1(v, b) \ + (((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0) + +#if 1 + #define RISCV_CHECK_2(v, r) \ + ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \ + << 18) \ + < ((r) & 0x1d)) +#else + // this branch gives larger code, because + // compilers generate larger code for big constants. + #define RISCV_CHECK_2(v, r) \ + ((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ + & ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \ + < ((r) & 0x1d)) +#endif + + +#define RISCV_SCAN_LOOP \ + Byte *lim; \ + size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \ + if (size <= 6) return p; \ + size -= 6; \ + lim = p + size; \ + BR_PC_INIT \ + for (;;) \ + { \ + UInt32 a, v; \ + /* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \ + for (;;) \ + { \ + if Z7_UNLIKELY(p >= lim) { return p; } \ + a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \ + if ((a & 0x77) == 0) break; \ + a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \ + p += RISCV_INSTR_SIZE * 2; \ + if ((a & 0x77) == 0) \ + { \ + p -= RISCV_INSTR_SIZE; \ + if Z7_UNLIKELY(p >= lim) { return p; } \ + break; \ + } \ + } +// (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL +// (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL +// (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC +// (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC + +Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc) +{ + RISCV_SCAN_LOOP + v = a; + a = RISCV_GET_UI32(p); +#ifndef RISCV_USE_16BIT_LOAD + v += (UInt32)p[1] << 8; +#endif + + if ((v & 8) == 0) // JAL + { + if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80) + { + p += RISCV_INSTR_SIZE; + continue; + } + { + v = ((a & 1u << 31) >> 11) + | ((a & 0x3ff << 21) >> 20) + | ((a & 1 << 20) >> 9) + | (a & 0xff << 12); + BR_CONVERT_VAL_ENC(v) + // ((v & 1) == 0) + // v: bits [1 : 20] contain offset bits +#if 0 && defined(RISCV_USE_UNALIGNED_LOAD) + a &= 0xfff; + a |= ((UInt32)(v << 23)) + | ((UInt32)(v << 7) & ((UInt32)0xff << 16)) + | ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8)); + RISCV_SET_UI32(p, a) +#else // aligned +#if 0 + SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff))) +#else + p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf)); +#endif + +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + v <<= 15; + v = Z7_BSWAP32(v); + SetUi16a(p + 2, (UInt16)v) +#else + p[2] = (Byte)(v >> 9); + p[3] = (Byte)(v >> 1); +#endif +#endif // aligned + } + p += 4; + continue; + } // JAL + + { + // AUIPC + if (v & 0xe80) // (not x0) and (not x2) + { + const UInt32 b = RISCV_GET_UI32(p + 4); + if (RISCV_CHECK_1(v, b)) + { + { + const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL); + RISCV_SET_UI32(p, temp) + } + a &= 0xfffff000; + { +#if 1 + const int t = -1 >> 1; + if (t != -1) + a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation + else +#endif + a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension). + } + BR_CONVERT_VAL_ENC(a) +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + a = Z7_BSWAP32(a); + RISCV_SET_UI32(p + 4, a) +#else + SetBe32(p + 4, a) +#endif + p += 8; + } + else + p += RISCV_STEP_1; + } + else + { + UInt32 r = a >> 27; + if (RISCV_CHECK_2(v, r)) + { + v = RISCV_GET_UI32(p + 4); + r = (r << 7) + 0x17 + (v & 0xfffff000); + a = (a >> 12) | (v << 20); + RISCV_SET_UI32(p, r) + RISCV_SET_UI32(p + 4, a) + p += 8; + } + else + p += RISCV_STEP_2; + } + } + } // for +} + + +Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc) +{ + RISCV_SCAN_LOOP +#ifdef RISCV_USE_16BIT_LOAD + if ((a & 8) == 0) + { +#else + v = a; + a += (UInt32)p[1] << 8; + if ((v & 8) == 0) + { +#endif + // JAL + a -= 0x100 - RISCV_DELTA_7F; + if (a & 0xd80) + { + p += RISCV_INSTR_SIZE; + continue; + } + { + const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff; +#if 0 // unaligned + a = GetUi32(p); + v = (UInt32)(a >> 23) & ((UInt32)0xff << 1) + | (UInt32)(a >> 7) & ((UInt32)0xff << 9) +#elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + v = GetUi16a(p + 2); + v = Z7_BSWAP32(v) >> 15 +#else + v = (UInt32)p[3] << 1 + | (UInt32)p[2] << 9 +#endif + | (UInt32)((a & 0xf000) << 5); + BR_CONVERT_VAL_DEC(v) + a = a_old + | (v << 11 & 1u << 31) + | (v << 20 & 0x3ff << 21) + | (v << 9 & 1 << 20) + | (v & 0xff << 12); + RISCV_SET_UI32(p, a) + } + p += 4; + continue; + } // JAL + + { + // AUIPC + v = a; +#if 1 && defined(RISCV_USE_UNALIGNED_LOAD) + a = GetUi32(p); +#else + a |= (UInt32)GetUi16a(p + 2) << 16; +#endif + if ((v & 0xe80) == 0) // x0/x2 + { + const UInt32 r = a >> 27; + if (RISCV_CHECK_2(v, r)) + { + UInt32 b; +#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE) + b = RISCV_GET_UI32(p + 4); + b = Z7_BSWAP32(b); +#else + b = GetBe32(p + 4); +#endif + v = a >> 12; + BR_CONVERT_VAL_DEC(b) + a = (r << 7) + 0x17; + a += (b + 0x800) & 0xfffff000; + v |= b << 20; + RISCV_SET_UI32(p, a) + RISCV_SET_UI32(p + 4, v) + p += 8; + } + else + p += RISCV_STEP_2; + } + else + { + const UInt32 b = RISCV_GET_UI32(p + 4); + if (!RISCV_CHECK_1(v, b)) + p += RISCV_STEP_1; + else + { + v = (a & 0xfffff000) | (b >> 20); + a = (b << 12) | (0x17 + RISCV_REG_VAL); + RISCV_SET_UI32(p, a) + RISCV_SET_UI32(p + 4, v) + p += 8; + } + } + } + } // for +} diff --git a/3rdparty/lzma/src/CpuArch.c b/3rdparty/lzma/src/CpuArch.c index 33f8a3ab4c1e9..e792f39deb4dc 100644 --- a/3rdparty/lzma/src/CpuArch.c +++ b/3rdparty/lzma/src/CpuArch.c @@ -1,5 +1,5 @@ /* CpuArch.c -- CPU specific code -2023-05-18 : Igor Pavlov : Public domain */ +2024-07-04 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -226,7 +226,7 @@ void __declspec(naked) Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) DON'T remove Z7_NO_INLINE and Z7_FASTCALL for MY_cpuidex_HACK(): !!! */ static -Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(UInt32 subFunction, UInt32 func, int *CPUInfo) +Z7_NO_INLINE void Z7_FASTCALL MY_cpuidex_HACK(Int32 subFunction, Int32 func, Int32 *CPUInfo) { UNUSED_VAR(subFunction) __cpuid(CPUInfo, func); @@ -242,13 +242,13 @@ Z7_NO_INLINE #endif void Z7_FASTCALL z7_x86_cpuid(UInt32 p[4], UInt32 func) { - MY_cpuidex((int *)p, (int)func, 0); + MY_cpuidex((Int32 *)p, (Int32)func, 0); } Z7_NO_INLINE UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void) { - int a[4]; + Int32 a[4]; MY_cpuidex(a, 0, 0); return a[0]; } @@ -384,7 +384,7 @@ BoolInt CPU_IsSupported_CMOV(void) UInt32 a[4]; if (!x86cpuid_Func_1(&a[0])) return 0; - return (a[3] >> 15) & 1; + return (BoolInt)(a[3] >> 15) & 1; } BoolInt CPU_IsSupported_SSE(void) @@ -393,7 +393,7 @@ BoolInt CPU_IsSupported_SSE(void) CHECK_SYS_SSE_SUPPORT if (!x86cpuid_Func_1(&a[0])) return 0; - return (a[3] >> 25) & 1; + return (BoolInt)(a[3] >> 25) & 1; } BoolInt CPU_IsSupported_SSE2(void) @@ -402,7 +402,7 @@ BoolInt CPU_IsSupported_SSE2(void) CHECK_SYS_SSE_SUPPORT if (!x86cpuid_Func_1(&a[0])) return 0; - return (a[3] >> 26) & 1; + return (BoolInt)(a[3] >> 26) & 1; } #endif @@ -419,17 +419,17 @@ static UInt32 x86cpuid_Func_1_ECX(void) BoolInt CPU_IsSupported_AES(void) { - return (x86cpuid_Func_1_ECX() >> 25) & 1; + return (BoolInt)(x86cpuid_Func_1_ECX() >> 25) & 1; } BoolInt CPU_IsSupported_SSSE3(void) { - return (x86cpuid_Func_1_ECX() >> 9) & 1; + return (BoolInt)(x86cpuid_Func_1_ECX() >> 9) & 1; } BoolInt CPU_IsSupported_SSE41(void) { - return (x86cpuid_Func_1_ECX() >> 19) & 1; + return (BoolInt)(x86cpuid_Func_1_ECX() >> 19) & 1; } BoolInt CPU_IsSupported_SHA(void) @@ -441,7 +441,7 @@ BoolInt CPU_IsSupported_SHA(void) { UInt32 d[4]; z7_x86_cpuid(d, 7); - return (d[1] >> 29) & 1; + return (BoolInt)(d[1] >> 29) & 1; } } @@ -638,10 +638,10 @@ BoolInt CPU_IsSupported_AVX(void) { const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); - // printf("\n=== XGetBV=%d\n", bm); + // printf("\n=== XGetBV=0x%x\n", bm); return 1 - & (bm >> 1) // SSE state is supported (set by OS) for storing/restoring - & (bm >> 2); // AVX state is supported (set by OS) for storing/restoring + & (BoolInt)(bm >> 1) // SSE state is supported (set by OS) for storing/restoring + & (BoolInt)(bm >> 2); // AVX state is supported (set by OS) for storing/restoring } // since Win7SP1: we can use GetEnabledXStateFeatures(); } @@ -658,10 +658,39 @@ BoolInt CPU_IsSupported_AVX2(void) z7_x86_cpuid(d, 7); // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); return 1 - & (d[1] >> 5); // avx2 + & (BoolInt)(d[1] >> 5); // avx2 } } +#if 0 +BoolInt CPU_IsSupported_AVX512F_AVX512VL(void) +{ + if (!CPU_IsSupported_AVX()) + return False; + if (z7_x86_cpuid_GetMaxFunc() < 7) + return False; + { + UInt32 d[4]; + BoolInt v; + z7_x86_cpuid(d, 7); + // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); + v = 1 + & (BoolInt)(d[1] >> 16) // avx512f + & (BoolInt)(d[1] >> 31); // avx512vl + if (!v) + return False; + } + { + const UInt32 bm = (UInt32)x86_xgetbv_0(MY_XCR_XFEATURE_ENABLED_MASK); + // printf("\n=== XGetBV=0x%x\n", bm); + return 1 + & (BoolInt)(bm >> 5) // OPMASK + & (BoolInt)(bm >> 6) // ZMM upper 256-bit + & (BoolInt)(bm >> 7); // ZMM16 ... ZMM31 + } +} +#endif + BoolInt CPU_IsSupported_VAES_AVX2(void) { if (!CPU_IsSupported_AVX()) @@ -673,9 +702,9 @@ BoolInt CPU_IsSupported_VAES_AVX2(void) z7_x86_cpuid(d, 7); // printf("\ncpuid(7): ebx=%8x ecx=%8x\n", d[1], d[2]); return 1 - & (d[1] >> 5) // avx2 + & (BoolInt)(d[1] >> 5) // avx2 // & (d[1] >> 31) // avx512vl - & (d[2] >> 9); // vaes // VEX-256/EVEX + & (BoolInt)(d[2] >> 9); // vaes // VEX-256/EVEX } } @@ -688,7 +717,7 @@ BoolInt CPU_IsSupported_PageGB(void) if (d[0] < 0x80000001) return False; z7_x86_cpuid(d, 0x80000001); - return (d[3] >> 26) & 1; + return (BoolInt)(d[3] >> 26) & 1; } } @@ -760,33 +789,70 @@ BoolInt CPU_IsSupported_AES (void) { return APPLE_CRYPTO_SUPPORT_VAL; } #else // __APPLE__ -#include +#if defined(__GLIBC__) && (__GLIBC__ * 100 + __GLIBC_MINOR__ >= 216) + #define Z7_GETAUXV_AVAILABLE +#else +// #pragma message("=== is not NEW GLIBC === ") + #if defined __has_include + #if __has_include () +// #pragma message("=== sys/auxv.h is avail=== ") + #define Z7_GETAUXV_AVAILABLE + #endif + #endif +#endif +#ifdef Z7_GETAUXV_AVAILABLE +// #pragma message("=== Z7_GETAUXV_AVAILABLE === ") +#include #define USE_HWCAP +#endif #ifdef USE_HWCAP +#if defined(__FreeBSD__) +static unsigned long MY_getauxval(int aux) +{ + unsigned long val; + if (elf_aux_info(aux, &val, sizeof(val))) + return 0; + return val; +} +#else +#define MY_getauxval getauxval + #if defined __has_include + #if __has_include () #include + #endif + #endif +#endif #define MY_HWCAP_CHECK_FUNC_2(name1, name2) \ - BoolInt CPU_IsSupported_ ## name1() { return (getauxval(AT_HWCAP) & (HWCAP_ ## name2)) ? 1 : 0; } + BoolInt CPU_IsSupported_ ## name1(void) { return (MY_getauxval(AT_HWCAP) & (HWCAP_ ## name2)); } #ifdef MY_CPU_ARM64 #define MY_HWCAP_CHECK_FUNC(name) \ MY_HWCAP_CHECK_FUNC_2(name, name) +#if 1 || defined(__ARM_NEON) + BoolInt CPU_IsSupported_NEON(void) { return True; } +#else MY_HWCAP_CHECK_FUNC_2(NEON, ASIMD) +#endif // MY_HWCAP_CHECK_FUNC (ASIMD) #elif defined(MY_CPU_ARM) #define MY_HWCAP_CHECK_FUNC(name) \ - BoolInt CPU_IsSupported_ ## name() { return (getauxval(AT_HWCAP2) & (HWCAP2_ ## name)) ? 1 : 0; } + BoolInt CPU_IsSupported_ ## name(void) { return (MY_getauxval(AT_HWCAP2) & (HWCAP2_ ## name)); } MY_HWCAP_CHECK_FUNC_2(NEON, NEON) #endif #else // USE_HWCAP #define MY_HWCAP_CHECK_FUNC(name) \ - BoolInt CPU_IsSupported_ ## name() { return 0; } + BoolInt CPU_IsSupported_ ## name(void) { return 0; } +#if defined(__ARM_NEON) + BoolInt CPU_IsSupported_NEON(void) { return True; } +#else MY_HWCAP_CHECK_FUNC(NEON) +#endif #endif // USE_HWCAP diff --git a/3rdparty/lzma/src/DllSecur.c b/3rdparty/lzma/src/DllSecur.c index 02a0f977eaddf..bbbfc0a7638a2 100644 --- a/3rdparty/lzma/src/DllSecur.c +++ b/3rdparty/lzma/src/DllSecur.c @@ -1,5 +1,5 @@ /* DllSecur.c -- DLL loading security -2023-04-02 : Igor Pavlov : Public domain */ +2023-12-03 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -11,19 +11,7 @@ #ifndef UNDER_CE -#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__) - // #pragma GCC diagnostic ignored "-Wcast-function-type" -#endif - -#if defined(__clang__) || defined(__GNUC__) -typedef void (*Z7_voidFunction)(void); -#define MY_CAST_FUNC (Z7_voidFunction) -#elif defined(_MSC_VER) && _MSC_VER > 1920 -#define MY_CAST_FUNC (void *) -// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()' -#else -#define MY_CAST_FUNC -#endif +Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); @@ -61,7 +49,7 @@ static const char * const g_Dlls = if ((UInt16)GetVersion() != 6) { \ const \ Func_SetDefaultDllDirectories setDllDirs = \ - (Func_SetDefaultDllDirectories) MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ + (Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \ "SetDefaultDllDirectories"); \ if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; } diff --git a/3rdparty/lzma/src/LzFind.c b/3rdparty/lzma/src/LzFind.c index 799d6227ba830..1ce404648e2f1 100644 --- a/3rdparty/lzma/src/LzFind.c +++ b/3rdparty/lzma/src/LzFind.c @@ -1,5 +1,5 @@ /* LzFind.c -- Match finder for LZ algorithms -2023-03-14 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -108,9 +108,15 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all return (p->bufBase != NULL); } -static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } +static const Byte *MatchFinder_GetPointerToCurrentPos(void *p) +{ + return ((CMatchFinder *)p)->buffer; +} -static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); } +static UInt32 MatchFinder_GetNumAvailableBytes(void *p) +{ + return GET_AVAIL_BYTES((CMatchFinder *)p); +} Z7_NO_INLINE @@ -571,8 +577,9 @@ void MatchFinder_Init_4(CMatchFinder *p) #define CYC_TO_POS_OFFSET 0 // #define CYC_TO_POS_OFFSET 1 // for debug -void MatchFinder_Init(CMatchFinder *p) +void MatchFinder_Init(void *_p) { + CMatchFinder *p = (CMatchFinder *)_p; MatchFinder_Init_HighHash(p); MatchFinder_Init_LowHash(p); MatchFinder_Init_4(p); @@ -607,16 +614,16 @@ void MatchFinder_Init(CMatchFinder *p) #endif #endif -// #elif defined(MY_CPU_ARM_OR_ARM64) -#elif defined(MY_CPU_ARM64) +#elif defined(MY_CPU_ARM64) \ + /* || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) */ - #if defined(__clang__) && (__clang_major__ >= 8) \ - || defined(__GNUC__) && (__GNUC__ >= 8) + #if defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) #define USE_LZFIND_SATUR_SUB_128 #ifdef MY_CPU_ARM64 // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__(""))) #else - // #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) + #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=neon"))) #endif #elif defined(_MSC_VER) @@ -625,7 +632,7 @@ void MatchFinder_Init(CMatchFinder *p) #endif #endif - #if defined(_MSC_VER) && !defined(__clang__) && defined(MY_CPU_ARM64) + #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else #include @@ -1082,9 +1089,11 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const #define MOVE_POS \ - ++p->cyclicBufferPos; \ + p->cyclicBufferPos++; \ p->buffer++; \ - { const UInt32 pos1 = p->pos + 1; p->pos = pos1; if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } + { const UInt32 pos1 = p->pos + 1; \ + p->pos = pos1; \ + if (pos1 == p->posLimit) MatchFinder_CheckLimits(p); } #define MOVE_POS_RET MOVE_POS return distances; @@ -1103,20 +1112,26 @@ static void MatchFinder_MovePos(CMatchFinder *p) } #define GET_MATCHES_HEADER2(minLen, ret_op) \ - unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \ - lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ + UInt32 hv; const Byte *cur; UInt32 curMatch; \ + UInt32 lenLimit = p->lenLimit; \ + if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; } \ cur = p->buffer; #define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return distances) -#define SKIP_HEADER(minLen) do { GET_MATCHES_HEADER2(minLen, continue) +#define SKIP_HEADER(minLen) \ + do { GET_MATCHES_HEADER2(minLen, continue) -#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue +#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, \ + p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue -#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num); +#define SKIP_FOOTER \ + SkipMatchesSpec(MF_PARAMS(p)); \ + MOVE_POS \ + } while (--num); #define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \ - distances = func(MF_PARAMS(p), \ - distances, (UInt32)_maxLen_); MOVE_POS_RET + distances = func(MF_PARAMS(p), distances, (UInt32)_maxLen_); \ + MOVE_POS_RET #define GET_MATCHES_FOOTER_BT(_maxLen_) \ GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1) @@ -1133,8 +1148,9 @@ static void MatchFinder_MovePos(CMatchFinder *p) for (; c != lim; c++) if (*(c + diff) != *c) break; \ maxLen = (unsigned)(c - cur); } -static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt2_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; GET_MATCHES_HEADER(2) HASH2_CALC curMatch = p->hash[hv]; @@ -1158,8 +1174,9 @@ UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) mmm = pos; -static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt3_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; UInt32 mmm; UInt32 h2, d2, pos; unsigned maxLen; @@ -1199,8 +1216,9 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt4_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; UInt32 mmm; UInt32 h2, h3, d2, d3, pos; unsigned maxLen; @@ -1267,10 +1285,12 @@ static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Bt5_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; UInt32 mmm; - UInt32 h2, h3, d2, d3, maxLen, pos; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(5) @@ -1339,8 +1359,9 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32* Hc4_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; UInt32 mmm; UInt32 h2, h3, d2, d3, pos; unsigned maxLen; @@ -1407,10 +1428,12 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) +static UInt32 * Hc5_MatchFinder_GetMatches(void *_p, UInt32 *distances) { + CMatchFinder *p = (CMatchFinder *)_p; UInt32 mmm; - UInt32 h2, h3, d2, d3, maxLen, pos; + UInt32 h2, h3, d2, d3, pos; + unsigned maxLen; UInt32 *hash; GET_MATCHES_HEADER(5) @@ -1466,7 +1489,7 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) if (*(cur - d2 + 3) != cur[3]) break; UPDATE_maxLen - distances[-2] = maxLen; + distances[-2] = (UInt32)maxLen; if (maxLen == lenLimit) { p->son[p->cyclicBufferPos] = curMatch; @@ -1489,8 +1512,9 @@ UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances) } -static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt2_MatchFinder_Skip(void *_p, UInt32 num) { + CMatchFinder *p = (CMatchFinder *)_p; SKIP_HEADER(2) { HASH2_CALC @@ -1511,8 +1535,9 @@ void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) SKIP_FOOTER } -static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt3_MatchFinder_Skip(void *_p, UInt32 num) { + CMatchFinder *p = (CMatchFinder *)_p; SKIP_HEADER(3) { UInt32 h2; @@ -1526,8 +1551,9 @@ static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num) SKIP_FOOTER } -static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt4_MatchFinder_Skip(void *_p, UInt32 num) { + CMatchFinder *p = (CMatchFinder *)_p; SKIP_HEADER(4) { UInt32 h2, h3; @@ -1542,8 +1568,9 @@ static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) SKIP_FOOTER } -static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Bt5_MatchFinder_Skip(void *_p, UInt32 num) { + CMatchFinder *p = (CMatchFinder *)_p; SKIP_HEADER(5) { UInt32 h2, h3; @@ -1589,8 +1616,9 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) }} while(num); \ -static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Hc4_MatchFinder_Skip(void *_p, UInt32 num) { + CMatchFinder *p = (CMatchFinder *)_p; HC_SKIP_HEADER(4) UInt32 h2, h3; @@ -1604,8 +1632,9 @@ static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num) } -static void Hc5_MatchFinder_Skip(CMatchFinder *p, UInt32 num) +static void Hc5_MatchFinder_Skip(void *_p, UInt32 num) { + CMatchFinder *p = (CMatchFinder *)_p; HC_SKIP_HEADER(5) UInt32 h2, h3; @@ -1634,41 +1663,41 @@ void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num) void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable) { - vTable->Init = (Mf_Init_Func)MatchFinder_Init; - vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; - vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; + vTable->Init = MatchFinder_Init; + vTable->GetNumAvailableBytes = MatchFinder_GetNumAvailableBytes; + vTable->GetPointerToCurrentPos = MatchFinder_GetPointerToCurrentPos; if (!p->btMode) { if (p->numHashBytes <= 4) { - vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; + vTable->GetMatches = Hc4_MatchFinder_GetMatches; + vTable->Skip = Hc4_MatchFinder_Skip; } else { - vTable->GetMatches = (Mf_GetMatches_Func)Hc5_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Hc5_MatchFinder_Skip; + vTable->GetMatches = Hc5_MatchFinder_GetMatches; + vTable->Skip = Hc5_MatchFinder_Skip; } } else if (p->numHashBytes == 2) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; + vTable->GetMatches = Bt2_MatchFinder_GetMatches; + vTable->Skip = Bt2_MatchFinder_Skip; } else if (p->numHashBytes == 3) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; + vTable->GetMatches = Bt3_MatchFinder_GetMatches; + vTable->Skip = Bt3_MatchFinder_Skip; } else if (p->numHashBytes == 4) { - vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; + vTable->GetMatches = Bt4_MatchFinder_GetMatches; + vTable->Skip = Bt4_MatchFinder_Skip; } else { - vTable->GetMatches = (Mf_GetMatches_Func)Bt5_MatchFinder_GetMatches; - vTable->Skip = (Mf_Skip_Func)Bt5_MatchFinder_Skip; + vTable->GetMatches = Bt5_MatchFinder_GetMatches; + vTable->Skip = Bt5_MatchFinder_Skip; } } diff --git a/3rdparty/lzma/src/LzFindMt.c b/3rdparty/lzma/src/LzFindMt.c index 5253e6ebb35d5..ac9d59d0fd42b 100644 --- a/3rdparty/lzma/src/LzFindMt.c +++ b/3rdparty/lzma/src/LzFindMt.c @@ -1,5 +1,5 @@ /* LzFindMt.c -- multithreaded Match finder for LZ algorithms -2023-04-02 : Igor Pavlov : Public domain */ +2024-01-22 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -94,7 +94,7 @@ static void MtSync_Construct(CMtSync *p) } -#define DEBUG_BUFFER_LOCK // define it to debug lock state +// #define DEBUG_BUFFER_LOCK // define it to debug lock state #ifdef DEBUG_BUFFER_LOCK #include @@ -877,8 +877,9 @@ SRes MatchFinderMt_InitMt(CMatchFinderMt *p) } -static void MatchFinderMt_Init(CMatchFinderMt *p) +static void MatchFinderMt_Init(void *_p) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; CMatchFinder *mf = MF(p); p->btBufPos = @@ -981,8 +982,9 @@ static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) -static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) +static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; return p->pointerToCurPos; } @@ -990,8 +992,9 @@ static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); -static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) +static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; if (p->btBufPos != p->btBufPosLimit) return p->btNumAvailBytes; return MatchFinderMt_GetNextBlock_Bt(p); @@ -1243,8 +1246,9 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) } -static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) +static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; const UInt32 *bt = p->btBufPos; const UInt32 len = *bt++; const UInt32 *btLim = bt + len; @@ -1267,8 +1271,9 @@ static UInt32 * MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) -static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) +static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; const UInt32 *bt = p->btBufPos; UInt32 len = *bt++; const UInt32 avail = p->btNumAvailBytes - 1; @@ -1315,14 +1320,16 @@ static UInt32 * MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; #define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); -static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt0_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER2_MT { p->btNumAvailBytes--; SKIP_FOOTER_MT } -static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt2_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER_MT(2) UInt32 h2; MT_HASH2_CALC @@ -1330,8 +1337,9 @@ static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) SKIP_FOOTER_MT } -static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) +static void MatchFinderMt3_Skip(void *_p, UInt32 num) { + CMatchFinderMt *p = (CMatchFinderMt *)_p; SKIP_HEADER_MT(3) UInt32 h2, h3; MT_HASH3_CALC @@ -1361,39 +1369,39 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num) void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) { - vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; - vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; - vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; - vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; + vTable->Init = MatchFinderMt_Init; + vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes; + vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos; + vTable->GetMatches = MatchFinderMt_GetMatches; switch (MF(p)->numHashBytes) { case 2: p->GetHeadsFunc = GetHeads2; - p->MixMatchesFunc = (Mf_Mix_Matches)NULL; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip; - vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; + p->MixMatchesFunc = NULL; + vTable->Skip = MatchFinderMt0_Skip; + vTable->GetMatches = MatchFinderMt2_GetMatches; break; case 3: p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; + p->MixMatchesFunc = MixMatches2; + vTable->Skip = MatchFinderMt2_Skip; break; case 4: p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; // it's fast inline version of GetMatches() - // vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4; + // vTable->GetMatches = MatchFinderMt_GetMatches_Bt4; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; - vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; + p->MixMatchesFunc = MixMatches3; + vTable->Skip = MatchFinderMt3_Skip; break; default: p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; - p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; + p->MixMatchesFunc = MixMatches4; vTable->Skip = - (Mf_Skip_Func)MatchFinderMt3_Skip; - // (Mf_Skip_Func)MatchFinderMt4_Skip; + MatchFinderMt3_Skip; + // MatchFinderMt4_Skip; break; } } diff --git a/3rdparty/lzma/src/Lzma2Dec.c b/3rdparty/lzma/src/Lzma2Dec.c index 388cbc71732c9..8bf54e499edf7 100644 --- a/3rdparty/lzma/src/Lzma2Dec.c +++ b/3rdparty/lzma/src/Lzma2Dec.c @@ -1,5 +1,5 @@ /* Lzma2Dec.c -- LZMA2 Decoder -2023-03-03 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ /* #define SHOW_DEBUG_INFO */ @@ -157,8 +157,10 @@ static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b) p->decoder.prop.lp = (Byte)lp; return LZMA2_STATE_DATA; } + + default: + return LZMA2_STATE_ERROR; } - return LZMA2_STATE_ERROR; } static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) diff --git a/3rdparty/lzma/src/LzmaEnc.c b/3rdparty/lzma/src/LzmaEnc.c index 6d13cac8ce2c5..37b2787db6871 100644 --- a/3rdparty/lzma/src/LzmaEnc.c +++ b/3rdparty/lzma/src/LzmaEnc.c @@ -1,5 +1,5 @@ /* LzmaEnc.c -- LZMA Encoder -2023-04-13: Igor Pavlov : Public domain */ +2024-01-24: Igor Pavlov : Public domain */ #include "Precomp.h" @@ -195,11 +195,11 @@ unsigned GetPosSlot1(UInt32 pos); unsigned GetPosSlot1(UInt32 pos) { unsigned res; - BSR2_RET(pos, res); + BSR2_RET(pos, res) return res; } -#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } -#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } +#define GetPosSlot2(pos, res) { BSR2_RET(pos, res) } +#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res) } #else // ! LZMA_LOG_BSR @@ -512,7 +512,7 @@ struct CLzmaEnc COPY_ARR(d, s, posEncoders) \ (d)->lenProbs = (s)->lenProbs; \ (d)->repLenProbs = (s)->repLenProbs; \ - memcpy((d)->litProbs, (s)->litProbs, ((UInt32)0x300 << (p)->lclp) * sizeof(CLzmaProb)); + memcpy((d)->litProbs, (s)->litProbs, ((size_t)0x300 * sizeof(CLzmaProb)) << (p)->lclp); void LzmaEnc_SaveState(CLzmaEncHandle p) { @@ -1040,14 +1040,14 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables( UInt32 price = b; do { - unsigned bit = sym & 1; + const unsigned bit = sym & 1; sym >>= 1; price += GET_PRICEa(probs[sym], bit); } while (sym >= 2); { - unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; + const unsigned prob = probs[(size_t)i + (1 << (kLenNumHighBits - 1))]; prices[(size_t)i * 2 ] = price + GET_PRICEa_0(prob); prices[(size_t)i * 2 + 1] = price + GET_PRICEa_1(prob); } @@ -1056,7 +1056,7 @@ Z7_NO_INLINE static void Z7_FASTCALL LenPriceEnc_UpdateTables( { unsigned posState; - size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); + const size_t num = (p->tableSize - kLenNumLowSymbols * 2) * sizeof(p->prices[0][0]); for (posState = 1; posState < numPosStates; posState++) memcpy(p->prices[posState] + kLenNumLowSymbols * 2, p->prices[0] + kLenNumLowSymbols * 2, num); } @@ -2696,12 +2696,12 @@ static SRes LzmaEnc_Alloc(CLzmaEnc *p, UInt32 keepWindowSize, ISzAllocPtr alloc, #endif { - unsigned lclp = p->lc + p->lp; + const unsigned lclp = p->lc + p->lp; if (!p->litProbs || !p->saveState.litProbs || p->lclp != lclp) { LzmaEnc_FreeLits(p, alloc); - p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); - p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((UInt32)0x300 << lclp) * sizeof(CLzmaProb)); + p->litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); + p->saveState.litProbs = (CLzmaProb *)ISzAlloc_Alloc(alloc, ((size_t)0x300 * sizeof(CLzmaProb)) << lclp); if (!p->litProbs || !p->saveState.litProbs) { LzmaEnc_FreeLits(p, alloc); @@ -2802,8 +2802,8 @@ static void LzmaEnc_Init(CLzmaEnc *p) } { - UInt32 num = (UInt32)0x300 << (p->lp + p->lc); - UInt32 k; + const size_t num = (size_t)0x300 << (p->lp + p->lc); + size_t k; CLzmaProb *probs = p->litProbs; for (k = 0; k < num; k++) probs[k] = kProbInitValue; diff --git a/3rdparty/lzma/src/MtCoder.c b/3rdparty/lzma/src/MtCoder.c index 6f58abb2a3905..03959b6cad682 100644 --- a/3rdparty/lzma/src/MtCoder.c +++ b/3rdparty/lzma/src/MtCoder.c @@ -1,5 +1,5 @@ /* MtCoder.c -- Multi-thread Coder -2023-04-13 : Igor Pavlov : Public domain */ +2023-09-07 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -430,7 +430,7 @@ SRes MtCoder_Code(CMtCoder *p) SRes res = SZ_OK; if (numThreads > MTCODER_THREADS_MAX) - numThreads = MTCODER_THREADS_MAX; + numThreads = MTCODER_THREADS_MAX; numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads); if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; @@ -438,7 +438,7 @@ SRes MtCoder_Code(CMtCoder *p) if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; if (numBlocksMax > MTCODER_BLOCKS_MAX) - numBlocksMax = MTCODER_BLOCKS_MAX; + numBlocksMax = MTCODER_BLOCKS_MAX; if (p->blockSize != p->allocatedBufsSize) { @@ -469,7 +469,7 @@ SRes MtCoder_Code(CMtCoder *p) { RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent)) - RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax)) + RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax)) } for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++) diff --git a/3rdparty/lzma/src/MtDec.c b/3rdparty/lzma/src/MtDec.c index 782069926d5aa..96274b6f3aec2 100644 --- a/3rdparty/lzma/src/MtDec.c +++ b/3rdparty/lzma/src/MtDec.c @@ -1,5 +1,5 @@ /* MtDec.c -- Multi-thread Decoder -2023-04-02 : Igor Pavlov : Public domain */ +2024-02-20 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -809,6 +809,16 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t) #endif +typedef + #ifdef _WIN32 + UINT_PTR + #elif 1 + uintptr_t + #else + ptrdiff_t + #endif + MY_uintptr_t; + static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) { WRes res; @@ -821,7 +831,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) res = MtDec_ThreadFunc2(t); p = t->mtDec; if (res == 0) - return (THREAD_FUNC_RET_TYPE)(UINT_PTR)p->exitThreadWRes; + return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes; { // it's unexpected situation for some threading function error if (p->exitThreadWRes == 0) @@ -832,7 +842,7 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp) Event_Set(&p->threads[0].canWrite); MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); } - return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res; + return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res; } static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp) @@ -1072,7 +1082,7 @@ SRes MtDec_Code(CMtDec *p) if (wres == 0) { wres = Event_Set(&nextThread->canWrite); if (wres == 0) { wres = Event_Set(&nextThread->canRead); if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread); - wres = (WRes)(UINT_PTR)res; + wres = (WRes)(MY_uintptr_t)res; if (wres != 0) { p->needContinue = False; diff --git a/3rdparty/lzma/src/Ppmd7.c b/3rdparty/lzma/src/Ppmd7.c index 6e1307e2d391c..efcc5d86e9586 100644 --- a/3rdparty/lzma/src/Ppmd7.c +++ b/3rdparty/lzma/src/Ppmd7.c @@ -1,5 +1,5 @@ /* Ppmd7.c -- PPMdH codec -2023-04-02 : Igor Pavlov : Public domain +2023-09-07 : Igor Pavlov : Public domain This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ #include "Precomp.h" @@ -302,8 +302,17 @@ static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx) #define MEM_12_CPY(dest, src, num) \ - { UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ - do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } + { UInt32 *d = (UInt32 *)(dest); \ + const UInt32 *z = (const UInt32 *)(src); \ + unsigned n = (num); \ + do { \ + d[0] = z[0]; \ + d[1] = z[1]; \ + d[2] = z[2]; \ + z += 3; \ + d += 3; \ + } while (--n); \ + } /* @@ -711,8 +720,8 @@ void Ppmd7_UpdateModel(CPpmd7 *p) if ((ns1 & 1) == 0) { /* Expand for one UNIT */ - unsigned oldNU = ns1 >> 1; - unsigned i = U2I(oldNU); + const unsigned oldNU = ns1 >> 1; + const unsigned i = U2I(oldNU); if (i != U2I((size_t)oldNU + 1)) { void *ptr = Ppmd7_AllocUnits(p, i + 1); @@ -731,7 +740,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p) sum = c->Union2.SummFreq; /* max increase of Escape_Freq is 3 here. total increase of Union2.SummFreq for all symbols is less than 256 here */ - sum += (UInt32)(2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)); + sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1))); /* original PPMdH uses 16-bit variable for (sum) here. But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ // sum = (UInt16)sum; @@ -761,7 +770,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p) // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context s->Freq = (Byte)freq; // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here - sum = freq + p->InitEsc + (ns > 3); + sum = (UInt32)(freq + p->InitEsc + (ns > 3)); } } @@ -933,10 +942,10 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq) p->HiBitsFlag; { // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ - unsigned summ = (UInt16)see->Summ; // & 0xFFFF - unsigned r = (summ >> see->Shift); + const unsigned summ = (UInt16)see->Summ; // & 0xFFFF + const unsigned r = (summ >> see->Shift); see->Summ = (UInt16)(summ - r); - *escFreq = r + (r == 0); + *escFreq = (UInt32)(r + (r == 0)); } } else @@ -981,9 +990,9 @@ void Ppmd7_Update1_0(CPpmd7 *p) CPpmd_State *s = p->FoundState; CPpmd7_Context *mc = p->MinContext; unsigned freq = s->Freq; - unsigned summFreq = mc->Union2.SummFreq; + const unsigned summFreq = mc->Union2.SummFreq; p->PrevSuccess = (2 * freq > summFreq); - p->RunLength += (int)p->PrevSuccess; + p->RunLength += (Int32)p->PrevSuccess; mc->Union2.SummFreq = (UInt16)(summFreq + 4); freq += 4; s->Freq = (Byte)freq; diff --git a/3rdparty/lzma/src/Ppmd7Dec.c b/3rdparty/lzma/src/Ppmd7Dec.c index 832382828d125..081ab89557a8d 100644 --- a/3rdparty/lzma/src/Ppmd7Dec.c +++ b/3rdparty/lzma/src/Ppmd7Dec.c @@ -1,5 +1,5 @@ /* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder -2023-04-02 : Igor Pavlov : Public domain +2023-09-07 : Igor Pavlov : Public domain This code is based on: PPMd var.H (2001): Dmitry Shkarin : Public domain */ @@ -58,7 +58,7 @@ static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) void Ppmd7_UpdateModel(CPpmd7 *p); -#define MASK(sym) ((unsigned char *)charMask)[sym] +#define MASK(sym) ((Byte *)charMask)[sym] // Z7_FORCE_INLINE // static int Ppmd7z_DecodeSymbol(CPpmd7 *p) @@ -120,8 +120,8 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) MASK(s->Symbol) = 0; do { - unsigned sym0 = s2[0].Symbol; - unsigned sym1 = s2[1].Symbol; + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; s2 += 2; MASK(sym0) = 0; MASK(sym1) = 0; @@ -209,17 +209,17 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) unsigned num2 = num / 2; num &= 1; - hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); + hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num); s += num; p->MinContext = mc; do { - unsigned sym0 = s[0].Symbol; - unsigned sym1 = s[1].Symbol; + const unsigned sym0 = s[0].Symbol; + const unsigned sym1 = s[1].Symbol; s += 2; - hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); - hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); + hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0))); + hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1))); } while (--num2); } @@ -238,13 +238,13 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p) s = Ppmd7_GetStats(p, p->MinContext); hiCnt = count; - // count -= s->Freq & (unsigned)(MASK(s->Symbol)); + // count -= s->Freq & (UInt32)(MASK(s->Symbol)); // if ((Int32)count >= 0) { for (;;) { - count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; - // count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; + // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; } } s--; diff --git a/3rdparty/lzma/src/Ppmd7Enc.c b/3rdparty/lzma/src/Ppmd7Enc.c index 41106bab459e2..49cbbe649969b 100644 --- a/3rdparty/lzma/src/Ppmd7Enc.c +++ b/3rdparty/lzma/src/Ppmd7Enc.c @@ -1,5 +1,5 @@ /* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder -2023-04-02 : Igor Pavlov : Public domain +2023-09-07 : Igor Pavlov : Public domain This code is based on: PPMd var.H (2001): Dmitry Shkarin : Public domain */ @@ -82,7 +82,7 @@ void Ppmd7z_Flush_RangeEnc(CPpmd7 *p) void Ppmd7_UpdateModel(CPpmd7 *p); -#define MASK(sym) ((unsigned char *)charMask)[sym] +#define MASK(sym) ((Byte *)charMask)[sym] Z7_FORCE_INLINE static @@ -139,8 +139,8 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) MASK(s->Symbol) = 0; do { - unsigned sym0 = s2[0].Symbol; - unsigned sym1 = s2[1].Symbol; + const unsigned sym0 = s2[0].Symbol; + const unsigned sym1 = s2[1].Symbol; s2 += 2; MASK(sym0) = 0; MASK(sym1) = 0; @@ -265,16 +265,15 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) if (num2 != 0) { s += i; - for (;;) + do { - unsigned sym0 = s[0].Symbol; - unsigned sym1 = s[1].Symbol; + const unsigned sym0 = s[0].Symbol; + const unsigned sym1 = s[1].Symbol; s += 2; sum += (s[-2].Freq & (unsigned)(MASK(sym0))); sum += (s[-1].Freq & (unsigned)(MASK(sym1))); - if (--num2 == 0) - break; } + while (--num2); } diff --git a/3rdparty/lzma/src/Sha256.c b/3rdparty/lzma/src/Sha256.c index 018cf6f4b4b7f..14d3be9c6a0f0 100644 --- a/3rdparty/lzma/src/Sha256.c +++ b/3rdparty/lzma/src/Sha256.c @@ -1,5 +1,5 @@ /* Sha256.c -- SHA-256 Hash -2023-04-02 : Igor Pavlov : Public domain +2024-03-01 : Igor Pavlov : Public domain This code is based on public domain code from Wei Dai's Crypto++ library. */ #include "Precomp.h" @@ -15,35 +15,35 @@ This code is based on public domain code from Wei Dai's Crypto++ library. */ #endif #ifdef MY_CPU_X86_OR_AMD64 - #ifdef _MSC_VER - #if _MSC_VER >= 1200 + #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \ + || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \ + || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \ + || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \ + || defined(_MSC_VER) && (_MSC_VER >= 1200) #define Z7_COMPILER_SHA256_SUPPORTED - #endif - #elif defined(__clang__) - #if (__clang_major__ >= 8) // fix that check - #define Z7_COMPILER_SHA256_SUPPORTED - #endif - #elif defined(__GNUC__) - #if (__GNUC__ >= 8) // fix that check - #define Z7_COMPILER_SHA256_SUPPORTED - #endif - #elif defined(__INTEL_COMPILER) - #if (__INTEL_COMPILER >= 1800) // fix that check - #define Z7_COMPILER_SHA256_SUPPORTED - #endif #endif -#elif defined(MY_CPU_ARM_OR_ARM64) - #ifdef _MSC_VER - #if _MSC_VER >= 1910 +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_SHA2) \ + || defined(__ARM_FEATURE_CRYPTO) + #define Z7_COMPILER_SHA256_SUPPORTED + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) #define Z7_COMPILER_SHA256_SUPPORTED #endif - #elif defined(__clang__) - #if (__clang_major__ >= 8) // fix that check - #define Z7_COMPILER_SHA256_SUPPORTED #endif - #elif defined(__GNUC__) - #if (__GNUC__ >= 6) // fix that check - #define Z7_COMPILER_SHA256_SUPPORTED #endif #endif #endif @@ -224,8 +224,6 @@ void Sha256_Init(CSha256 *p) #endif -void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); - // static extern MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64]; diff --git a/3rdparty/lzma/src/Sha256Opt.c b/3rdparty/lzma/src/Sha256Opt.c index afb351d2786c9..eb3816664674a 100644 --- a/3rdparty/lzma/src/Sha256Opt.c +++ b/3rdparty/lzma/src/Sha256Opt.c @@ -1,5 +1,5 @@ /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions -2023-04-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "Compiler.h" @@ -11,6 +11,8 @@ #endif #endif +// #define Z7_USE_HW_SHA_STUB // for debug + #ifdef MY_CPU_X86_OR_AMD64 #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check #define USE_HW_SHA @@ -32,9 +34,14 @@ #endif #if (_MSC_VER >= USE_VER_MIN) #define USE_HW_SHA + #else + #define Z7_USE_HW_SHA_STUB #endif #endif // #endif // MY_CPU_X86_OR_AMD64 +#ifndef USE_HW_SHA + // #define Z7_USE_HW_SHA_STUB // for debug +#endif #ifdef USE_HW_SHA @@ -202,19 +209,28 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #endif // USE_HW_SHA -#elif defined(MY_CPU_ARM_OR_ARM64) - - #if defined(__clang__) - #if (__clang_major__ >= 8) // fix that check +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + + #if defined(__ARM_FEATURE_SHA2) \ + || defined(__ARM_FEATURE_CRYPTO) + #define USE_HW_SHA + #else + #if defined(MY_CPU_ARM64) \ + || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \ + || defined(Z7_MSC_VER_ORIGINAL) + #if defined(__ARM_FP) && \ + ( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \ + || defined(__GNUC__) && (__GNUC__ >= 6) \ + ) \ + || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) && \ + (Z7_CLANG_VERSION < 170000 || \ + Z7_CLANG_VERSION > 170001) #define USE_HW_SHA #endif - #elif defined(__GNUC__) - #if (__GNUC__ >= 6) // fix that check - #define USE_HW_SHA #endif - #elif defined(_MSC_VER) - #if _MSC_VER >= 1910 - #define USE_HW_SHA #endif #endif @@ -222,24 +238,88 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ // #pragma message("=== Sha256 HW === ") + #if defined(__clang__) || defined(__GNUC__) +#if !defined(__ARM_FEATURE_SHA2) && \ + !defined(__ARM_FEATURE_CRYPTO) #ifdef MY_CPU_ARM64 - #define ATTRIB_SHA __attribute__((__target__("+crypto,sha2"))) +#if defined(__clang__) + #define ATTRIB_SHA __attribute__((__target__("crypto"))) +#else + #define ATTRIB_SHA __attribute__((__target__("+crypto"))) +#endif #else +#if defined(__clang__) && (__clang_major__ >= 1) + #define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2"))) +#else #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) +#endif #endif +#endif #else // _MSC_VER // for arm32 #define _ARM_USE_NEW_NEON_INTRINSICS #endif -#if defined(_MSC_VER) && !defined(__clang__) && defined(MY_CPU_ARM64) + + + + +#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else + + + + + + + + + +#if defined(__clang__) && __clang_major__ < 16 +#if !defined(__ARM_FEATURE_SHA2) && \ + !defined(__ARM_FEATURE_CRYPTO) +// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ") + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1 +// #if defined(__clang__) && __clang_major__ < 13 + #define __ARM_FEATURE_CRYPTO 1 +// #else + #define __ARM_FEATURE_SHA2 1 +// #endif + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif // clang + +#if defined(__clang__) + +#if defined(__ARM_ARCH) && __ARM_ARCH < 8 + Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #pragma message("#define __ARM_ARCH 8") + #undef __ARM_ARCH + #define __ARM_ARCH 8 + Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif + +#endif // clang + #include + +#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \ + defined(__ARM_FEATURE_CRYPTO) && \ + defined(__ARM_FEATURE_SHA2) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER + #undef __ARM_FEATURE_CRYPTO + #undef __ARM_FEATURE_SHA2 + #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ") #endif +#endif // Z7_MSC_VER_ORIGINAL + typedef uint32x4_t v128; // typedef __n128 v128; // MSVC @@ -316,10 +396,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ LOAD_SHUFFLE (m2, 2) LOAD_SHUFFLE (m3, 3) - R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); - R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); - R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); - R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); + R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ) + R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ) + R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ) state0 = vaddq_u32(state0, state0_save); state1 = vaddq_u32(state1, state1_save); @@ -337,16 +417,17 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #endif // MY_CPU_ARM_OR_ARM64 -#ifndef USE_HW_SHA - +#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB) // #error Stop_Compiling_UNSUPPORTED_SHA // #include - +// We can compile this file with another C compiler, +// or we can compile asm version. +// So we can generate real code instead of this stub function. // #include "Sha256.h" -void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); - +// #if defined(_MSC_VER) #pragma message("Sha256 HW-SW stub was used") - +// #endif +void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) { @@ -359,7 +440,6 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ return; */ } - #endif @@ -384,3 +464,4 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_ #undef USE_HW_SHA #undef ATTRIB_SHA #undef USE_VER_MIN +#undef Z7_USE_HW_SHA_STUB diff --git a/3rdparty/lzma/src/SwapBytes.c b/3rdparty/lzma/src/SwapBytes.c index 7901bbaa87e4a..9290592b7da7b 100644 --- a/3rdparty/lzma/src/SwapBytes.c +++ b/3rdparty/lzma/src/SwapBytes.c @@ -1,5 +1,5 @@ /* SwapBytes.c -- Byte Swap conversion filter -2023-04-07 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -305,11 +305,12 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr) msvc 19.30+ (VS2022): replaces _mm256_set_m128i(m,m) to vbroadcastf128(m) as we want */ // _mm256_broadcastsi128_si256(*mask128_ptr); - /* +#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000) #define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1) - MY_mm256_set_m128i - */ - _mm256_set_m128i( +#else + #define MY_mm256_set_m128i _mm256_set_m128i +#endif + MY_mm256_set_m128i( *(const __m128i *)mask128_ptr, *(const __m128i *)mask128_ptr); #endif @@ -330,32 +331,59 @@ ShufBytes_256(void *items8, const void *lim8, const void *mask128_ptr) // compile message "NEON intrinsics not available with the soft-float ABI" -#elif defined(MY_CPU_ARM_OR_ARM64) || \ - (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) -// #elif defined(MY_CPU_ARM64) +#elif defined(MY_CPU_ARM_OR_ARM64) \ + && defined(MY_CPU_LE) \ + && !defined(Z7_DISABLE_ARM_NEON) #if defined(__clang__) && (__clang_major__ >= 8) \ - || defined(__GNUC__) && (__GNUC__ >= 8) - #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)) \ + || defined(__GNUC__) && (__GNUC__ >= 6) + #if defined(__ARM_FP) + #if (defined(__ARM_ARCH) && (__ARM_ARCH >= 4)) \ || defined(MY_CPU_ARM64) + #if defined(MY_CPU_ARM64) \ + || !defined(Z7_CLANG_VERSION) \ + || defined(__ARM_NEON) #define USE_SWAP_128 - #endif #ifdef MY_CPU_ARM64 // #define SWAP_ATTRIB_NEON __attribute__((__target__(""))) #else - // #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) - #endif +#if defined(Z7_CLANG_VERSION) + // #define SWAP_ATTRIB_NEON __attribute__((__target__("neon"))) +#else + // #pragma message("SWAP_ATTRIB_NEON __attribute__((__target__(fpu=neon))") + #define SWAP_ATTRIB_NEON __attribute__((__target__("fpu=neon"))) +#endif + #endif // MY_CPU_ARM64 + #endif // __ARM_NEON + #endif // __ARM_ARCH + #endif // __ARM_FP + #elif defined(_MSC_VER) #if (_MSC_VER >= 1910) #define USE_SWAP_128 #endif #endif - #if defined(_MSC_VER) && defined(MY_CPU_ARM64) + #ifdef USE_SWAP_128 + #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64) #include #else + +/* +#if !defined(__ARM_NEON) +#if defined(Z7_GCC_VERSION) && (__GNUC__ < 5) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 90201) \ + || defined(Z7_GCC_VERSION) && (__GNUC__ == 5) && (Z7_GCC_VERSION < 100100) +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +#pragma message("#define __ARM_NEON 1") +// #define __ARM_NEON 1 +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER +#endif +#endif +*/ #include #endif + #endif #ifndef USE_SWAP_128 #define FORCE_SWAP_MODE @@ -464,6 +492,13 @@ Z7_ATTRIB_NO_VECTOR \ void Z7_FASTCALL +#if defined(MY_CPU_ARM_OR_ARM64) +#if defined(__clang__) +#pragma GCC diagnostic ignored "-Wlanguage-extension-token" +#endif +#endif + + #ifdef MY_CPU_64BIT #if defined(MY_CPU_ARM64) \ diff --git a/3rdparty/lzma/src/Threads.c b/3rdparty/lzma/src/Threads.c index cf52bd303075f..464efeca496ab 100644 --- a/3rdparty/lzma/src/Threads.c +++ b/3rdparty/lzma/src/Threads.c @@ -1,5 +1,5 @@ /* Threads.c -- multithreading library -2023-03-04 : Igor Pavlov : Public domain */ +2024-03-28 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -195,20 +195,19 @@ WRes CriticalSection_Init(CCriticalSection *p) // ---------- POSIX ---------- -#ifndef __APPLE__ +#if defined(__linux__) && !defined(__APPLE__) && !defined(_AIX) && !defined(__ANDROID__) #ifndef Z7_AFFINITY_DISABLE // _GNU_SOURCE can be required for pthread_setaffinity_np() / CPU_ZERO / CPU_SET // clang < 3.6 : unknown warning group '-Wreserved-id-macro' // clang 3.6 - 12.01 : gives warning "macro name is a reserved identifier" // clang >= 13 : do not give warning #if !defined(_GNU_SOURCE) - #if defined(__clang__) && (__clang_major__ >= 4) && (__clang_major__ <= 12) - #pragma GCC diagnostic ignored "-Wreserved-id-macro" - #endif -#define _GNU_SOURCE +Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER +// #define _GNU_SOURCE +Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER #endif // !defined(_GNU_SOURCE) #endif // Z7_AFFINITY_DISABLE -#endif // __APPLE__ +#endif // __linux__ #include "Threads.h" @@ -244,8 +243,9 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, { if (cpuSet) { - #ifdef Z7_AFFINITY_SUPPORTED - + // pthread_attr_setaffinity_np() is not supported for MUSL compile. + // so we check for __GLIBC__ here +#if defined(Z7_AFFINITY_SUPPORTED) && defined( __GLIBC__) /* printf("\n affinity :"); unsigned i; @@ -267,7 +267,7 @@ WRes Thread_Create_With_CpuSet(CThread *p, THREAD_FUNC_TYPE func, LPVOID param, // ret2 = pthread_attr_setaffinity_np(&attr, sizeof(*cpuSet), cpuSet); // if (ret2) ret = ret2; - #endif +#endif } ret = pthread_create(&p->_tid, &attr, func, param); @@ -369,13 +369,20 @@ WRes AutoResetEvent_CreateNotSignaled(CAutoResetEvent *p) { return AutoResetEvent_Create(p, 0); } +#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) +// freebsd: +#pragma GCC diagnostic ignored "-Wthread-safety-analysis" +#endif + WRes Event_Set(CEvent *p) { RINOK(pthread_mutex_lock(&p->_mutex)) p->_state = True; - int res1 = pthread_cond_broadcast(&p->_cond); - int res2 = pthread_mutex_unlock(&p->_mutex); - return (res2 ? res2 : res1); + { + const int res1 = pthread_cond_broadcast(&p->_cond); + const int res2 = pthread_mutex_unlock(&p->_mutex); + return (res2 ? res2 : res1); + } } WRes Event_Reset(CEvent *p) @@ -408,8 +415,8 @@ WRes Event_Close(CEvent *p) return 0; p->_created = 0; { - int res1 = pthread_mutex_destroy(&p->_mutex); - int res2 = pthread_cond_destroy(&p->_cond); + const int res1 = pthread_mutex_destroy(&p->_mutex); + const int res2 = pthread_cond_destroy(&p->_cond); return (res1 ? res1 : res2); } } @@ -487,8 +494,8 @@ WRes Semaphore_Close(CSemaphore *p) return 0; p->_created = 0; { - int res1 = pthread_mutex_destroy(&p->_mutex); - int res2 = pthread_cond_destroy(&p->_cond); + const int res1 = pthread_mutex_destroy(&p->_mutex); + const int res2 = pthread_cond_destroy(&p->_cond); return (res1 ? res1 : res2); } } @@ -549,6 +556,18 @@ LONG InterlockedIncrement(LONG volatile *addend) #endif } +LONG InterlockedDecrement(LONG volatile *addend) +{ + // Print("InterlockedDecrement") + #ifdef USE_HACK_UNSAFE_ATOMIC + LONG val = *addend - 1; + *addend = val; + return val; + #else + return __sync_sub_and_fetch(addend, 1); + #endif +} + #endif // _WIN32 WRes AutoResetEvent_OptCreate_And_Reset(CAutoResetEvent *p) diff --git a/3rdparty/lzma/src/Xz.c b/3rdparty/lzma/src/Xz.c index 4ad071060a7fd..d07550d097c92 100644 --- a/3rdparty/lzma/src/Xz.c +++ b/3rdparty/lzma/src/Xz.c @@ -1,5 +1,5 @@ /* Xz.c - Xz -2023-04-02 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -52,6 +52,7 @@ void XzCheck_Init(CXzCheck *p, unsigned mode) case XZ_CHECK_CRC32: p->crc = CRC_INIT_VAL; break; case XZ_CHECK_CRC64: p->crc64 = CRC64_INIT_VAL; break; case XZ_CHECK_SHA256: Sha256_Init(&p->sha); break; + default: break; } } @@ -62,6 +63,7 @@ void XzCheck_Update(CXzCheck *p, const void *data, size_t size) case XZ_CHECK_CRC32: p->crc = CrcUpdate(p->crc, data, size); break; case XZ_CHECK_CRC64: p->crc64 = Crc64Update(p->crc64, data, size); break; case XZ_CHECK_SHA256: Sha256_Update(&p->sha, (const Byte *)data, size); break; + default: break; } } diff --git a/3rdparty/lzma/src/XzCrc64.c b/3rdparty/lzma/src/XzCrc64.c index c2fad6cdaaa99..94fc1afb37242 100644 --- a/3rdparty/lzma/src/XzCrc64.c +++ b/3rdparty/lzma/src/XzCrc64.c @@ -1,5 +1,5 @@ /* XzCrc64.c -- CRC64 calculation -2023-04-02 : Igor Pavlov : Public domain */ +2023-12-08 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -8,36 +8,76 @@ #define kCrc64Poly UINT64_CONST(0xC96C5795D7870F42) -#ifdef MY_CPU_LE - #define CRC64_NUM_TABLES 4 +// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC64_DEBUG_BE +#ifdef Z7_CRC64_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#ifdef Z7_CRC64_NUM_TABLES + #define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES #else - #define CRC64_NUM_TABLES 5 + #define Z7_CRC64_NUM_TABLES_USE 12 +#endif - UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); +#if Z7_CRC64_NUM_TABLES_USE < 1 + #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES #endif + +#if Z7_CRC64_NUM_TABLES_USE != 1 + #ifndef MY_CPU_BE - UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); + #define FUNC_NAME_LE_2(s) XzCrc64UpdateT ## s + #define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s) + #define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC64_NUM_TABLES_USE) + UInt64 Z7_FASTCALL FUNC_NAME_LE (UInt64 v, const void *data, size_t size, const UInt64 *table); +#endif +#ifndef MY_CPU_LE + #define FUNC_NAME_BE_2(s) XzCrc64UpdateBeT ## s + #define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s) + #define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC64_NUM_TABLES_USE) + UInt64 Z7_FASTCALL FUNC_NAME_BE (UInt64 v, const void *data, size_t size, const UInt64 *table); #endif -typedef UInt64 (Z7_FASTCALL *CRC64_FUNC)(UInt64 v, const void *data, size_t size, const UInt64 *table); +#if defined(MY_CPU_LE) + #define FUNC_REF FUNC_NAME_LE +#elif defined(MY_CPU_BE) + #define FUNC_REF FUNC_NAME_BE +#else + #define FUNC_REF g_Crc64Update + static UInt64 (Z7_FASTCALL *FUNC_REF)(UInt64 v, const void *data, size_t size, const UInt64 *table); +#endif + +#endif + + +MY_ALIGN(64) +static UInt64 g_Crc64Table[256 * Z7_CRC64_NUM_TABLES_USE]; -static CRC64_FUNC g_Crc64Update; -UInt64 g_Crc64Table[256 * CRC64_NUM_TABLES]; UInt64 Z7_FASTCALL Crc64Update(UInt64 v, const void *data, size_t size) { - return g_Crc64Update(v, data, size, g_Crc64Table); +#if Z7_CRC64_NUM_TABLES_USE == 1 + #define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) + const UInt64 *table = g_Crc64Table; + const Byte *p = (const Byte *)data; + const Byte *lim = p + size; + for (; p != lim; p++) + v = CRC64_UPDATE_BYTE_2(v, *p); + return v; + #undef CRC64_UPDATE_BYTE_2 +#else + return FUNC_REF (v, data, size, g_Crc64Table); +#endif } -UInt64 Z7_FASTCALL Crc64Calc(const void *data, size_t size) -{ - return g_Crc64Update(CRC64_INIT_VAL, data, size, g_Crc64Table) ^ CRC64_INIT_VAL; -} +Z7_NO_INLINE void Z7_FASTCALL Crc64GenerateTable(void) { - UInt32 i; + unsigned i; for (i = 0; i < 256; i++) { UInt64 r = i; @@ -46,35 +86,55 @@ void Z7_FASTCALL Crc64GenerateTable(void) r = (r >> 1) ^ (kCrc64Poly & ((UInt64)0 - (r & 1))); g_Crc64Table[i] = r; } - for (i = 256; i < 256 * CRC64_NUM_TABLES; i++) + +#if Z7_CRC64_NUM_TABLES_USE != 1 +#if 1 || 1 && defined(MY_CPU_X86) // low register count + for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i++) { - const UInt64 r = g_Crc64Table[(size_t)i - 256]; - g_Crc64Table[i] = g_Crc64Table[r & 0xFF] ^ (r >> 8); + const UInt64 r0 = g_Crc64Table[(size_t)i]; + g_Crc64Table[(size_t)i + 256] = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); } - - #ifdef MY_CPU_LE - - g_Crc64Update = XzCrc64UpdateT4; +#else + for (i = 0; i < 256 * (Z7_CRC64_NUM_TABLES_USE - 1); i += 2) + { + UInt64 r0 = g_Crc64Table[(size_t)(i) ]; + UInt64 r1 = g_Crc64Table[(size_t)(i) + 1]; + r0 = g_Crc64Table[(Byte)r0] ^ (r0 >> 8); + r1 = g_Crc64Table[(Byte)r1] ^ (r1 >> 8); + g_Crc64Table[(size_t)i + 256 ] = r0; + g_Crc64Table[(size_t)i + 256 + 1] = r1; + } +#endif - #else +#ifndef MY_CPU_LE { - #ifndef MY_CPU_BE +#ifndef MY_CPU_BE UInt32 k = 1; if (*(const Byte *)&k == 1) - g_Crc64Update = XzCrc64UpdateT4; + FUNC_REF = FUNC_NAME_LE; else - #endif +#endif { - for (i = 256 * CRC64_NUM_TABLES - 1; i >= 256; i--) +#ifndef MY_CPU_BE + FUNC_REF = FUNC_NAME_BE; +#endif + for (i = 0; i < 256 * Z7_CRC64_NUM_TABLES_USE; i++) { - const UInt64 x = g_Crc64Table[(size_t)i - 256]; + const UInt64 x = g_Crc64Table[i]; g_Crc64Table[i] = Z7_BSWAP64(x); } - g_Crc64Update = XzCrc64UpdateT1_BeT4; } } - #endif +#endif // ndef MY_CPU_LE +#endif // Z7_CRC64_NUM_TABLES_USE != 1 } #undef kCrc64Poly -#undef CRC64_NUM_TABLES +#undef Z7_CRC64_NUM_TABLES_USE +#undef FUNC_REF +#undef FUNC_NAME_LE_2 +#undef FUNC_NAME_LE_1 +#undef FUNC_NAME_LE +#undef FUNC_NAME_BE_2 +#undef FUNC_NAME_BE_1 +#undef FUNC_NAME_BE diff --git a/3rdparty/lzma/src/XzCrc64Opt.c b/3rdparty/lzma/src/XzCrc64Opt.c index d03374c006332..0c1fc2ffecb89 100644 --- a/3rdparty/lzma/src/XzCrc64Opt.c +++ b/3rdparty/lzma/src/XzCrc64Opt.c @@ -1,61 +1,261 @@ -/* XzCrc64Opt.c -- CRC64 calculation -2023-04-02 : Igor Pavlov : Public domain */ +/* XzCrc64Opt.c -- CRC64 calculation (optimized functions) +2023-12-08 : Igor Pavlov : Public domain */ #include "Precomp.h" #include "CpuArch.h" +#if !defined(Z7_CRC64_NUM_TABLES) || Z7_CRC64_NUM_TABLES > 1 + +// for debug only : define Z7_CRC64_DEBUG_BE to test big-endian code in little-endian cpu +// #define Z7_CRC64_DEBUG_BE +#ifdef Z7_CRC64_DEBUG_BE +#undef MY_CPU_LE +#define MY_CPU_BE +#endif + +#if defined(MY_CPU_64BIT) +#define Z7_CRC64_USE_64BIT +#endif + +// the value Z7_CRC64_NUM_TABLES_USE must be defined to same value as in XzCrc64.c +#ifdef Z7_CRC64_NUM_TABLES +#define Z7_CRC64_NUM_TABLES_USE Z7_CRC64_NUM_TABLES +#else +#define Z7_CRC64_NUM_TABLES_USE 12 +#endif + +#if Z7_CRC64_NUM_TABLES_USE % 4 || \ + Z7_CRC64_NUM_TABLES_USE < 4 || \ + Z7_CRC64_NUM_TABLES_USE > 4 * 4 + #error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + + #ifndef MY_CPU_BE -#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) +#define CRC64_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) + +#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) -UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table); -UInt64 Z7_FASTCALL XzCrc64UpdateT4(UInt64 v, const void *data, size_t size, const UInt64 *table) +#define Q64LE(n, d) \ + ( (table + ((n) * 8 + 7) * 0x100)[((d) ) & 0xFF] \ + ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 3 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 4 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 5 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 6 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 0) * 0x100)[((d) >> 7 * 8)] ) + +#define R64(a) *((const UInt64 *)(const void *)p + (a)) + +#else + +#define Q32LE(n, d) \ + ( (table + ((n) * 4 + 3) * 0x100)[((d) ) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] ) + +#define R32(a) *((const UInt32 *)(const void *)p + (a)) + +#endif + + +#define CRC64_FUNC_PRE_LE2(step) \ +UInt64 Z7_FASTCALL XzCrc64UpdateT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) + +#define CRC64_FUNC_PRE_LE(step) \ + CRC64_FUNC_PRE_LE2(step); \ + CRC64_FUNC_PRE_LE2(step) + +CRC64_FUNC_PRE_LE(Z7_CRC64_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) + const Byte *lim; + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC64_UPDATE_BYTE_2(v, *p); - for (; size >= 4; size -= 4, p += 4) + lim = p + size; + if (size >= Z7_CRC64_NUM_TABLES_USE) { - const UInt32 d = (UInt32)v ^ *(const UInt32 *)(const void *)p; - v = (v >> 32) - ^ (table + 0x300)[((d ) & 0xFF)] - ^ (table + 0x200)[((d >> 8) & 0xFF)] - ^ (table + 0x100)[((d >> 16) & 0xFF)] - ^ (table + 0x000)[((d >> 24))]; + lim -= Z7_CRC64_NUM_TABLES_USE; + do + { +#if Z7_CRC64_NUM_TABLES_USE == 4 + const UInt32 d = (UInt32)v ^ R32(0); + v = (v >> 32) ^ Q32LE(0, d); +#elif Z7_CRC64_NUM_TABLES_USE == 8 +#ifdef Z7_CRC64_USE_64BIT + v ^= R64(0); + v = Q64LE(0, v); +#else + UInt32 v0, v1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + v = Q32LE(1, v0) ^ Q32LE(0, v1); +#endif +#elif Z7_CRC64_NUM_TABLES_USE == 12 + UInt32 w; + UInt32 v0, v1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + w = R32(2); + v = Q32LE(0, w); + v ^= Q32LE(2, v0) ^ Q32LE(1, v1); +#elif Z7_CRC64_NUM_TABLES_USE == 16 +#ifdef Z7_CRC64_USE_64BIT + UInt64 w; + UInt64 x; + w = R64(1); x = Q64LE(0, w); + v ^= R64(0); v = x ^ Q64LE(1, v); +#else + UInt32 v0, v1; + UInt32 r0, r1; + v0 = (UInt32)v ^ R32(0); + v1 = (UInt32)(v >> 32) ^ R32(1); + r0 = R32(2); + r1 = R32(3); + v = Q32LE(1, r0) ^ Q32LE(0, r1); + v ^= Q32LE(3, v0) ^ Q32LE(2, v1); +#endif +#else +#error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + p += Z7_CRC64_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC64_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC64_UPDATE_BYTE_2(v, *p); return v; } +#undef CRC64_UPDATE_BYTE_2 +#undef R32 +#undef R64 +#undef Q32LE +#undef Q64LE +#undef CRC64_FUNC_PRE_LE +#undef CRC64_FUNC_PRE_LE2 + #endif + + #ifndef MY_CPU_LE -#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[(Byte)((crc) >> 56) ^ (b)] ^ ((crc) << 8)) +#define CRC64_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 56) ^ (b)] ^ ((crc) << 8)) + +#if defined(Z7_CRC64_USE_64BIT) && (Z7_CRC64_NUM_TABLES_USE % 8 == 0) + +#define Q64BE(n, d) \ + ( (table + ((n) * 8 + 0) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 8 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 3) * 0x100)[((d) >> 3 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 4) * 0x100)[((d) >> 4 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 5) * 0x100)[((d) >> 5 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 6) * 0x100)[((d) >> 6 * 8) & 0xFF] \ + ^ (table + ((n) * 8 + 7) * 0x100)[((d) >> 7 * 8)] ) + +#ifdef Z7_CRC64_DEBUG_BE + #define R64BE(a) GetBe64a((const UInt64 *)(const void *)p + (a)) +#else + #define R64BE(a) *((const UInt64 *)(const void *)p + (a)) +#endif + +#else + +#define Q32BE(n, d) \ + ( (table + ((n) * 4 + 0) * 0x100)[(Byte)(d)] \ + ^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \ + ^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] ) -UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table); -UInt64 Z7_FASTCALL XzCrc64UpdateT1_BeT4(UInt64 v, const void *data, size_t size, const UInt64 *table) +#ifdef Z7_CRC64_DEBUG_BE + #define R32BE(a) GetBe32a((const UInt32 *)(const void *)p + (a)) +#else + #define R32BE(a) *((const UInt32 *)(const void *)p + (a)) +#endif + +#endif + +#define CRC64_FUNC_PRE_BE2(step) \ +UInt64 Z7_FASTCALL XzCrc64UpdateBeT ## step (UInt64 v, const void *data, size_t size, const UInt64 *table) + +#define CRC64_FUNC_PRE_BE(step) \ + CRC64_FUNC_PRE_BE2(step); \ + CRC64_FUNC_PRE_BE2(step) + +CRC64_FUNC_PRE_BE(Z7_CRC64_NUM_TABLES_USE) { const Byte *p = (const Byte *)data; - table += 0x100; + const Byte *lim; v = Z7_BSWAP64(v); - for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) + for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC64_NUM_TABLES_USE & 4))) != 0; size--, p++) v = CRC64_UPDATE_BYTE_2_BE(v, *p); - for (; size >= 4; size -= 4, p += 4) + lim = p + size; + if (size >= Z7_CRC64_NUM_TABLES_USE) { - const UInt32 d = (UInt32)(v >> 32) ^ *(const UInt32 *)(const void *)p; - v = (v << 32) - ^ (table + 0x000)[((d ) & 0xFF)] - ^ (table + 0x100)[((d >> 8) & 0xFF)] - ^ (table + 0x200)[((d >> 16) & 0xFF)] - ^ (table + 0x300)[((d >> 24))]; + lim -= Z7_CRC64_NUM_TABLES_USE; + do + { +#if Z7_CRC64_NUM_TABLES_USE == 4 + const UInt32 d = (UInt32)(v >> 32) ^ R32BE(0); + v = (v << 32) ^ Q32BE(0, d); +#elif Z7_CRC64_NUM_TABLES_USE == 12 + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + const UInt32 w = R32BE(2); + v = Q32BE(0, w); + v ^= Q32BE(2, d1) ^ Q32BE(1, d0); + +#elif Z7_CRC64_NUM_TABLES_USE == 8 + #ifdef Z7_CRC64_USE_64BIT + v ^= R64BE(0); + v = Q64BE(0, v); + #else + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + v = Q32BE(1, d1) ^ Q32BE(0, d0); + #endif +#elif Z7_CRC64_NUM_TABLES_USE == 16 + #ifdef Z7_CRC64_USE_64BIT + const UInt64 w = R64BE(1); + v ^= R64BE(0); + v = Q64BE(0, w) ^ Q64BE(1, v); + #else + const UInt32 d1 = (UInt32)(v >> 32) ^ R32BE(0); + const UInt32 d0 = (UInt32)(v ) ^ R32BE(1); + const UInt32 w1 = R32BE(2); + const UInt32 w0 = R32BE(3); + v = Q32BE(1, w1) ^ Q32BE(0, w0); + v ^= Q32BE(3, d1) ^ Q32BE(2, d0); + #endif +#elif +#error Stop_Compiling_Bad_CRC64_NUM_TABLES +#endif + p += Z7_CRC64_NUM_TABLES_USE; + } + while (p <= lim); + lim += Z7_CRC64_NUM_TABLES_USE; } - for (; size > 0; size--, p++) + for (; p < lim; p++) v = CRC64_UPDATE_BYTE_2_BE(v, *p); return Z7_BSWAP64(v); } +#undef CRC64_UPDATE_BYTE_2_BE +#undef R32BE +#undef R64BE +#undef Q32BE +#undef Q64BE +#undef CRC64_FUNC_PRE_BE +#undef CRC64_FUNC_PRE_BE2 + +#endif +#undef Z7_CRC64_NUM_TABLES_USE #endif diff --git a/3rdparty/lzma/src/XzDec.c b/3rdparty/lzma/src/XzDec.c index a5f703966d60f..3d1c98e63194f 100644 --- a/3rdparty/lzma/src/XzDec.c +++ b/3rdparty/lzma/src/XzDec.c @@ -1,5 +1,5 @@ /* XzDec.c -- Xz Decode -2023-04-13 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -105,30 +105,32 @@ static SRes XzBcFilterState_SetProps(void *pp, const Byte *props, size_t propSiz { if (propSize != 1) return SZ_ERROR_UNSUPPORTED; - p->delta = (unsigned)props[0] + 1; + p->delta = (UInt32)props[0] + 1; } else { if (propSize == 4) { - UInt32 v = GetUi32(props); + const UInt32 v = GetUi32(props); switch (p->methodId) { case XZ_ID_PPC: case XZ_ID_ARM: case XZ_ID_SPARC: case XZ_ID_ARM64: - if ((v & 3) != 0) + if (v & 3) return SZ_ERROR_UNSUPPORTED; break; case XZ_ID_ARMT: - if ((v & 1) != 0) + case XZ_ID_RISCV: + if (v & 1) return SZ_ERROR_UNSUPPORTED; break; case XZ_ID_IA64: - if ((v & 0xF) != 0) + if (v & 0xf) return SZ_ERROR_UNSUPPORTED; break; + default: break; } p->ip = v; } @@ -151,12 +153,13 @@ static void XzBcFilterState_Init(void *pp) static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Dec[] = { - Z7_BRANCH_CONV_DEC(PPC), - Z7_BRANCH_CONV_DEC(IA64), - Z7_BRANCH_CONV_DEC(ARM), - Z7_BRANCH_CONV_DEC(ARMT), - Z7_BRANCH_CONV_DEC(SPARC), - Z7_BRANCH_CONV_DEC(ARM64) + Z7_BRANCH_CONV_DEC_2 (BranchConv_PPC), + Z7_BRANCH_CONV_DEC_2 (BranchConv_IA64), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARMT), + Z7_BRANCH_CONV_DEC_2 (BranchConv_SPARC), + Z7_BRANCH_CONV_DEC_2 (BranchConv_ARM64), + Z7_BRANCH_CONV_DEC_2 (BranchConv_RISCV) }; static SizeT XzBcFilterStateBase_Filter_Dec(CXzBcFilterStateBase *p, Byte *data, SizeT size) @@ -262,7 +265,7 @@ static SRes XzBcFilterState_Code2(void *pp, #define XZ_IS_SUPPORTED_FILTER_ID(id) \ - ((id) >= XZ_ID_Delta && (id) <= XZ_ID_ARM64) + ((id) >= XZ_ID_Delta && (id) <= XZ_ID_RISCV) SRes Xz_StateCoder_Bc_SetFromMethod_Func(IStateCoder *p, UInt64 id, Xz_Func_BcFilterStateBase_Filter func, ISzAllocPtr alloc) @@ -541,13 +544,12 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met { IStateCoder *sc = &p->coders[coderIndex]; p->ids[coderIndex] = methodId; - switch (methodId) - { - case XZ_ID_LZMA2: return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); - #ifdef USE_SUBBLOCK - case XZ_ID_Subblock: return SbState_SetFromMethod(sc, p->alloc); - #endif - } + if (methodId == XZ_ID_LZMA2) + return Lzma2State_SetFromMethod(sc, outBuf, outBufSize, p->alloc); +#ifdef USE_SUBBLOCK + if (methodId == XZ_ID_Subblock) + return SbState_SetFromMethod(sc, p->alloc); +#endif if (coderIndex == 0) return SZ_ERROR_UNSUPPORTED; return Xz_StateCoder_Bc_SetFromMethod_Func(sc, methodId, @@ -558,10 +560,8 @@ static SRes MixCoder_SetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 met static SRes MixCoder_ResetFromMethod(CMixCoder *p, unsigned coderIndex, UInt64 methodId, Byte *outBuf, size_t outBufSize) { IStateCoder *sc = &p->coders[coderIndex]; - switch (methodId) - { - case XZ_ID_LZMA2: return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); - } + if (methodId == XZ_ID_LZMA2) + return Lzma2State_ResetOutBuf(sc, outBuf, outBufSize); return SZ_ERROR_UNSUPPORTED; } @@ -804,7 +804,7 @@ static BoolInt Xz_CheckFooter(CXzStreamFlags flags, UInt64 indexSize, const Byte } #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ + { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ if (s == 0) return SZ_ERROR_ARCHIVE; \ pos += s; } @@ -1034,7 +1034,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, SRes res; ECoderFinishMode finishMode2 = finishMode; - BoolInt srcFinished2 = srcFinished; + BoolInt srcFinished2 = (BoolInt)srcFinished; BoolInt destFinish = False; if (p->block.packSize != (UInt64)(Int64)-1) @@ -1127,7 +1127,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, return SZ_OK; } - switch (p->state) + switch ((int)p->state) { case XZ_STATE_STREAM_HEADER: { @@ -1172,15 +1172,15 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, p->state = XZ_STATE_STREAM_INDEX; break; } - p->blockHeaderSize = ((UInt32)p->buf[0] << 2) + 4; + p->blockHeaderSize = ((unsigned)p->buf[0] << 2) + 4; break; } if (p->pos != p->blockHeaderSize) { - UInt32 cur = p->blockHeaderSize - p->pos; + unsigned cur = p->blockHeaderSize - p->pos; if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1222,8 +1222,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } else { - UInt32 checkSize = XzFlags_GetCheckSize(p->streamFlags); - UInt32 cur = checkSize - p->pos; + const unsigned checkSize = XzFlags_GetCheckSize(p->streamFlags); + unsigned cur = checkSize - p->pos; if (cur != 0) { if (srcRem == 0) @@ -1232,7 +1232,7 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, return SZ_OK; } if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1321,9 +1321,9 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, case XZ_STATE_STREAM_FOOTER: { - UInt32 cur = XZ_STREAM_FOOTER_SIZE - p->pos; + unsigned cur = XZ_STREAM_FOOTER_SIZE - p->pos; if (cur > srcRem) - cur = (UInt32)srcRem; + cur = (unsigned)srcRem; memcpy(p->buf + p->pos, src, cur); p->pos += cur; (*srcLen) += cur; @@ -1358,6 +1358,8 @@ SRes XzUnpacker_Code(CXzUnpacker *p, Byte *dest, SizeT *destLen, } case XZ_STATE_BLOCK: break; /* to disable GCC warning */ + + default: return SZ_ERROR_FAIL; } } /* @@ -1773,10 +1775,10 @@ static void XzDecMt_Callback_Parse(void *obj, unsigned coderIndex, CMtDecCallbac } } { - UInt64 packSize = block->packSize; - UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); - UInt32 checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); - UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; + const UInt64 packSize = block->packSize; + const UInt64 packSizeAligned = packSize + ((0 - (unsigned)packSize) & 3); + const unsigned checkSize = XzFlags_GetCheckSize(coder->dec.streamFlags); + const UInt64 blockPackSum = coder->inPreSize + packSizeAligned + checkSize; // if (blockPackSum <= me->props.inBlockMax) // unpackBlockMaxSize { @@ -2381,7 +2383,7 @@ static SRes XzDecMt_Decode_ST(CXzDecMt *p if (tMode) { XzDecMt_FreeOutBufs(p); - tMode = MtDec_PrepareRead(&p->mtc); + tMode = (BoolInt)MtDec_PrepareRead(&p->mtc); } #endif @@ -2644,7 +2646,7 @@ SRes XzDecMt_Decode(CXzDecMtHandle p, p->outSize = *outDataSize; } - p->finishMode = finishMode; + p->finishMode = (BoolInt)finishMode; // p->outSize = 457; p->outSize_Defined = True; p->finishMode = False; // for test diff --git a/3rdparty/lzma/src/XzEnc.c b/3rdparty/lzma/src/XzEnc.c index 22408e26189a2..c1affadfa6f14 100644 --- a/3rdparty/lzma/src/XzEnc.c +++ b/3rdparty/lzma/src/XzEnc.c @@ -1,5 +1,5 @@ /* XzEnc.c -- Xz Encode -2023-04-13 : Igor Pavlov : Public domain */ +2024-03-01 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -29,8 +29,9 @@ #define XZ_GET_PAD_SIZE(dataSize) ((4 - ((unsigned)(dataSize) & 3)) & 3) -/* max pack size for LZMA2 block + check-64bytrs: */ -#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + 64) +#define XZ_CHECK_SIZE_MAX 64 +/* max pack size for LZMA2 block + pad4 + check_size: */ +#define XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize) ((unpackSize) + ((unpackSize) >> 10) + 16 + XZ_CHECK_SIZE_MAX) #define XZ_GET_ESTIMATED_BLOCK_TOTAL_PACK_SIZE(unpackSize) (XZ_BLOCK_HEADER_SIZE_MAX + XZ_GET_MAX_BLOCK_PACK_SIZE(unpackSize)) @@ -325,12 +326,13 @@ typedef struct static const z7_Func_BranchConv g_Funcs_BranchConv_RISC_Enc[] = { - Z7_BRANCH_CONV_ENC(PPC), - Z7_BRANCH_CONV_ENC(IA64), - Z7_BRANCH_CONV_ENC(ARM), - Z7_BRANCH_CONV_ENC(ARMT), - Z7_BRANCH_CONV_ENC(SPARC), - Z7_BRANCH_CONV_ENC(ARM64) + Z7_BRANCH_CONV_ENC_2 (BranchConv_PPC), + Z7_BRANCH_CONV_ENC_2 (BranchConv_IA64), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARMT), + Z7_BRANCH_CONV_ENC_2 (BranchConv_SPARC), + Z7_BRANCH_CONV_ENC_2 (BranchConv_ARM64), + Z7_BRANCH_CONV_ENC_2 (BranchConv_RISCV) }; static SizeT XzBcFilterStateBase_Filter_Enc(CXzBcFilterStateBase *p, Byte *data, SizeT size) @@ -888,9 +890,9 @@ static SRes Xz_CompressBlock( blockSizes->unpackSize = checkInStream.processed; } { - Byte buf[4 + 64]; - unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); - UInt64 packSize = seqSizeOutStream.processed; + Byte buf[4 + XZ_CHECK_SIZE_MAX]; + const unsigned padSize = XZ_GET_PAD_SIZE(seqSizeOutStream.processed); + const UInt64 packSize = seqSizeOutStream.processed; buf[0] = 0; buf[1] = 0; @@ -898,7 +900,8 @@ static SRes Xz_CompressBlock( buf[3] = 0; SeqCheckInStream_GetDigest(&checkInStream, buf + 4); - RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) + RINOK(WriteBytes(&seqSizeOutStream.vt, buf + (4 - padSize), + padSize + XzFlags_GetCheckSize((CXzStreamFlags)props->checkId))) blockSizes->totalSize = seqSizeOutStream.processed - padSize; @@ -1083,18 +1086,19 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf CXzEnc *me = (CXzEnc *)pp; SRes res; CMtProgressThunk progressThunk; - - Byte *dest = me->outBufs[outBufIndex]; - + Byte *dest; UNUSED_VAR(finished) - { CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; bInfo->totalSize = 0; bInfo->unpackSize = 0; bInfo->headerSize = 0; + // v23.02: we don't compress empty blocks + // also we must ignore that empty block in XzEnc_MtCallback_Write() + if (srcSize == 0) + return SZ_OK; } - + dest = me->outBufs[outBufIndex]; if (!dest) { dest = (Byte *)ISzAlloc_Alloc(me->alloc, me->outBufSize); @@ -1140,18 +1144,20 @@ static SRes XzEnc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBuf static SRes XzEnc_MtCallback_Write(void *pp, unsigned outBufIndex) { CXzEnc *me = (CXzEnc *)pp; - const CXzEncBlockInfo *bInfo = &me->EncBlocks[outBufIndex]; - const Byte *data = me->outBufs[outBufIndex]; - - RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) - + // v23.02: we don't write empty blocks + // note: if (bInfo->unpackSize == 0) then there is no compressed data of block + if (bInfo->unpackSize == 0) + return SZ_OK; { - UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); - RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) + const Byte *data = me->outBufs[outBufIndex]; + RINOK(WriteBytes(me->outStream, data, bInfo->headerSize)) + { + const UInt64 totalPackFull = bInfo->totalSize + XZ_GET_PAD_SIZE(bInfo->totalSize); + RINOK(WriteBytes(me->outStream, data + XZ_BLOCK_HEADER_SIZE_MAX, (size_t)totalPackFull - bInfo->headerSize)) + } + return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); } - - return XzEncIndex_AddIndexRecord(&me->xzIndex, bInfo->unpackSize, bInfo->totalSize, me->alloc); } #endif diff --git a/3rdparty/lzma/src/XzIn.c b/3rdparty/lzma/src/XzIn.c index d0fc763679ee5..b68af965c1724 100644 --- a/3rdparty/lzma/src/XzIn.c +++ b/3rdparty/lzma/src/XzIn.c @@ -1,5 +1,5 @@ /* XzIn.c - Xz input -2023-04-02 : Igor Pavlov : Public domain */ +2023-09-07 : Igor Pavlov : Public domain */ #include "Precomp.h" @@ -27,7 +27,7 @@ SRes Xz_ReadHeader(CXzStreamFlags *p, ISeqInStreamPtr inStream) } #define READ_VARINT_AND_CHECK(buf, pos, size, res) \ - { unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ + { const unsigned s = Xz_ReadVarInt(buf + pos, size - pos, res); \ if (s == 0) return SZ_ERROR_ARCHIVE; \ pos += s; } @@ -37,7 +37,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, unsigned headerSize; *headerSizeRes = 0; RINOK(SeqInStream_ReadByte(inStream, &header[0])) - headerSize = (unsigned)header[0]; + headerSize = header[0]; if (headerSize == 0) { *headerSizeRes = 1; @@ -47,7 +47,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, *isIndex = False; headerSize = (headerSize << 2) + 4; - *headerSizeRes = headerSize; + *headerSizeRes = (UInt32)headerSize; { size_t processedSize = headerSize - 1; RINOK(SeqInStream_ReadMax(inStream, header + 1, &processedSize)) @@ -58,7 +58,7 @@ SRes XzBlock_ReadHeader(CXzBlock *p, ISeqInStreamPtr inStream, BoolInt *isIndex, } #define ADD_SIZE_CHECK(size, val) \ - { UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } + { const UInt64 newSize = size + (val); if (newSize < size) return XZ_SIZE_OVERFLOW; size = newSize; } UInt64 Xz_GetUnpackSize(const CXzStream *p) { diff --git a/3rdparty/xbyak/xbyak/xbyak.h b/3rdparty/xbyak/xbyak/xbyak.h index 4a8effe62c4d7..28af005a3edf7 100644 --- a/3rdparty/xbyak/xbyak/xbyak.h +++ b/3rdparty/xbyak/xbyak/xbyak.h @@ -155,7 +155,7 @@ namespace Xbyak { enum { DEFAULT_MAX_CODE_SIZE = 4096, - VERSION = 0x7060 /* 0xABCD = A.BC(.D) */ + VERSION = 0x7210 /* 0xABCD = A.BC(.D) */ }; #ifndef MIE_INTEGER_TYPE_DEFINED @@ -232,6 +232,7 @@ enum { ERR_CANT_USE_REX2, ERR_INVALID_DFV, ERR_INVALID_REG_IDX, + ERR_BAD_ENCODING_MODE, ERR_INTERNAL // Put it at last. }; @@ -290,6 +291,7 @@ inline const char *ConvertErrorToString(int err) "can't use rex2", "invalid dfv", "invalid reg index", + "bad encoding mode", "internal error" }; assert(ERR_INTERNAL + 1 == sizeof(errTbl) / sizeof(*errTbl)); @@ -1673,7 +1675,9 @@ inline const uint8_t* Label::getAddress() const typedef enum { DefaultEncoding, VexEncoding, - EvexEncoding + EvexEncoding, + PreAVX10v2Encoding, + AVX10v2Encoding } PreferredEncoding; class CodeGenerator : public CodeArray { @@ -1730,10 +1734,10 @@ class CodeGenerator : public CodeArray { { return op1.isREG(i32e) && ((op2.isREG(i32e) && op1.getBit() == op2.getBit()) || op2.isMEM()); } - static inline bool isValidSSE(const Operand& op1) + static inline bool isValidSSE(const Operand& op) { // SSE instructions do not support XMM16 - XMM31 - return !(op1.isXMM() && op1.getIdx() >= 16); + return !(op.isXMM() && op.getIdx() >= 16); } static inline uint8_t rexRXB(int bit, int bit3, const Reg& r, const Reg& b, const Reg& x = Reg()) { @@ -1867,16 +1871,19 @@ class CodeGenerator : public CodeArray { } db(code); } - void verifySAE(const Reg& r, uint64_t type) const + // Allow YMM embedded rounding for AVX10.2 to minimize flag modifications + bool verifySAE(const Reg& r, const Reg& b, uint64_t type) const { - if (((type & T_SAE_X) && r.isXMM()) || ((type & T_SAE_Y) && r.isYMM()) || ((type & T_SAE_Z) && r.isZMM())) return; - XBYAK_THROW(ERR_SAE_IS_INVALID) + if (((type & T_SAE_X) && (r.isYMM() && b.isXMM())) || ((type & T_SAE_Y) && b.isXMM()) || ((type & T_SAE_Z) && b.isYMM())) return true; + if (((type & T_SAE_X) && b.isXMM()) || ((type & T_SAE_Y) && b.isYMM()) || ((type & T_SAE_Z) && b.isZMM())) return false; + XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false) } - void verifyER(const Reg& r, uint64_t type) const + bool verifyER(const Reg& r, const Reg& b, uint64_t type) const { - if ((type & T_ER_R) && r.isREG(32|64)) return; - if (((type & T_ER_X) && r.isXMM()) || ((type & T_ER_Y) && r.isYMM()) || ((type & T_ER_Z) && r.isZMM())) return; - XBYAK_THROW(ERR_ER_IS_INVALID) + if ((type & T_ER_R) && b.isREG(32|64)) return false; + if (((type & T_ER_X) && (r.isYMM() && b.isXMM())) || ((type & T_ER_Y) && b.isXMM()) || ((type & T_ER_Z) && b.isYMM())) return true; + if (((type & T_ER_X) && b.isXMM()) || ((type & T_ER_Y) && b.isYMM()) || ((type & T_ER_Z) && b.isZMM())) return false; + XBYAK_THROW_RET(ERR_SAE_IS_INVALID, false) } // (a, b, c) contains non zero two or three values then err int verifyDuplicate(int a, int b, int c, int err) @@ -1897,19 +1904,21 @@ class CodeGenerator : public CodeArray { bool R = reg.isExtIdx(); bool X3 = (x && x->isExtIdx()) || (base.isSIMD() && base.isExtIdx2()); - bool B4 = base.isREG() && base.isExtIdx2(); - bool X4 = x && (x->isREG() && x->isExtIdx2()); + uint8_t B4 = (base.isREG() && base.isExtIdx2()) ? 8 : 0; + uint8_t U = (x && (x->isREG() && x->isExtIdx2())) ? 0 : 4; bool B = base.isExtIdx(); bool Rp = reg.isExtIdx2(); int LL; int rounding = verifyDuplicate(reg.getRounding(), base.getRounding(), v ? v->getRounding() : 0, ERR_ROUNDING_IS_ALREADY_SET); int disp8N = 1; if (rounding) { + bool isUzero = false; if (rounding == EvexModifierRounding::T_SAE) { - verifySAE(base, type); LL = 0; + isUzero = verifySAE(reg, base, type); LL = 0; } else { - verifyER(base, type); LL = rounding - 1; + isUzero = verifyER(reg, base, type); LL = rounding - 1; } + if (isUzero) U = 0; // avx10.2 Evex.U b = true; } else { if (v) VL = (std::max)(VL, v->getBit()); @@ -1935,8 +1944,8 @@ class CodeGenerator : public CodeArray { if (aaa == 0) aaa = verifyDuplicate(base.getOpmaskIdx(), reg.getOpmaskIdx(), (v ? v->getOpmaskIdx() : 0), ERR_OPMASK_IS_ALREADY_SET); if (aaa == 0) z = 0; // clear T_z if mask is not set db(0x62); - db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | (B4 ? 8 : 0) | mmm); - db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | (X4 ? 0 : 4) | (pp & 3)); + db((R ? 0 : 0x80) | (X3 ? 0 : 0x40) | (B ? 0 : 0x20) | (Rp ? 0 : 0x10) | B4 | mmm); + db((w == 1 ? 0x80 : 0) | ((vvvv & 15) << 3) | U | (pp & 3)); db((z ? 0x80 : 0) | ((LL & 3) << 5) | (b ? 0x10 : 0) | (V4 ? 0 : 8) | (aaa & 7)); db(code); return disp8N; @@ -2163,7 +2172,7 @@ class CodeGenerator : public CodeArray { } } } - void opSSE(const Reg& r, const Operand& op, uint64_t type, int code, bool isValid(const Operand&, const Operand&), int imm8 = NONE) + void opSSE(const Reg& r, const Operand& op, uint64_t type, int code, bool isValid(const Operand&, const Operand&) = 0, int imm8 = NONE) { if (isValid && !isValid(r, op)) XBYAK_THROW(ERR_BAD_COMBINATION) if (!isValidSSE(r) || !isValidSSE(op)) XBYAK_THROW(ERR_NOT_SUPPORTED) @@ -2554,6 +2563,18 @@ class CodeGenerator : public CodeArray { Operand::Kind kind = op.isBit(128) ? Operand::XMM : op.isBit(256) ? Operand::YMM : Operand::ZMM; opVex(x.copyAndSetKind(kind), &xm0, op, type, code); } + // (x, x, x/m), (x, y, y/m), (y, z, z/m) + void opCvt6(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code) + { + int b1 = x1.getBit(); + int b2 = x2.getBit(); + int b3 = op.getBit(); + if ((b1 == 128 && (b2 == 128 || b2 == 256) && (b2 == b3 || op.isMEM())) || (b1 == 256 && b2 == 512 && (b3 == b2 || op.isMEM()))) { + opVex(x1, &x2, op, type, code); + return; + } + XBYAK_THROW(ERR_BAD_COMBINATION); + } const Xmm& cvtIdx0(const Operand& x) const { return x.isZMM() ? zm0 : x.isYMM() ? ym0 : xm0; @@ -2644,21 +2665,24 @@ class CodeGenerator : public CodeArray { if (addr.getRegExp().getIndex().getKind() != kind) XBYAK_THROW(ERR_BAD_VSIB_ADDRESSING) opVex(x, 0, addr, type, code); } - void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding encoding) + void opEncoding(const Xmm& x1, const Xmm& x2, const Operand& op, uint64_t type, int code, PreferredEncoding enc, int imm = NONE, uint64_t typeVex = 0, uint64_t typeEvex = 0, int sel = 0) { - opAVX_X_X_XM(x1, x2, op, type | orEvexIf(encoding), code); + opAVX_X_X_XM(x1, x2, op, type | orEvexIf(enc, typeVex, typeEvex, sel), code, imm); } - int orEvexIf(PreferredEncoding encoding) { - if (encoding == DefaultEncoding) { - encoding = defaultEncoding_; + PreferredEncoding getEncoding(PreferredEncoding enc, int sel) const + { + if (enc == DefaultEncoding) { + enc = defaultEncoding_[sel]; } - if (encoding == EvexEncoding) { + if ((sel == 0 && enc != VexEncoding && enc != EvexEncoding) || (sel == 1 && enc != PreAVX10v2Encoding && enc != AVX10v2Encoding)) XBYAK_THROW_RET(ERR_BAD_ENCODING_MODE, VexEncoding) #ifdef XBYAK_DISABLE_AVX512 - XBYAK_THROW(ERR_EVEX_IS_INVALID) + if (enc == EvexEncoding || enc == AVX10v2Encoding) XBYAK_THROW(ERR_EVEX_IS_INVALID) #endif - return T_MUST_EVEX; - } - return 0; + return enc; + } + uint64_t orEvexIf(PreferredEncoding enc, uint64_t typeVex, uint64_t typeEvex, int sel) { + enc = getEncoding(enc, sel); + return ((sel == 0 && enc == VexEncoding) || (sel == 1 && enc != AVX10v2Encoding)) ? typeVex : (T_MUST_EVEX | typeEvex); } void opInOut(const Reg& a, const Reg& d, uint8_t code) { @@ -2770,6 +2794,31 @@ class CodeGenerator : public CodeArray { } opSSE(x, op, type1, code1, isXMM_XMMorMEM, imm); } + // AVX10 zero-extending for vmovd, vmovw + void opAVX10ZeroExt(const Operand& op1, const Operand& op2, const uint64_t typeTbl[4], const int codeTbl[4], PreferredEncoding enc, int bit) + { + const Operand *p1 = &op1; + const Operand *p2 = &op2; + bool rev = false; + if (p1->isMEM()) { + std::swap(p1, p2); + rev = true; + } + if (p1->isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) + if (p1->isXMM()) { + std::swap(p1, p2); + rev = !rev; + } + enc = getEncoding(enc, 1); + int sel = -1; + if (p1->isXMM() || (p1->isMEM() && enc == AVX10v2Encoding)) { + sel = 2 + int(rev); + } else if (p1->isREG(bit) || p1->isMEM()) { + sel = int(rev); + } + if (sel == -1) XBYAK_THROW(ERR_BAD_COMBINATION) + opAVX_X_X_XM(*static_cast(p2), xm0, *p1, typeTbl[sel], codeTbl[sel]); + } public: unsigned int getVersion() const { return VERSION; } using CodeArray::db; @@ -2828,7 +2877,7 @@ class CodeGenerator : public CodeArray { #endif private: bool isDefaultJmpNEAR_; - PreferredEncoding defaultEncoding_; + PreferredEncoding defaultEncoding_[2]; // 0:vnni, 1:vmpsadbw public: void L(const std::string& label) { labelMgr_.defineSlabel(label); } void L(Label& label) { labelMgr_.defineClabel(label); } @@ -2999,6 +3048,7 @@ class CodeGenerator : public CodeArray { rex(*p2, *p1); db(0x90 | (p2->getIdx() & 7)); return; } + if (p1->isREG() && p2->isREG()) std::swap(p1, p2); // adapt to NASM 2.16.03 behavior to pass tests opRO(static_cast(*p1), *p2, 0, 0x86 | (p1->isBit(8) ? 0 : 1), (p1->isREG() && (p1->getBit() == p2->getBit()))); } @@ -3113,8 +3163,9 @@ class CodeGenerator : public CodeArray { , es(Segment::es), cs(Segment::cs), ss(Segment::ss), ds(Segment::ds), fs(Segment::fs), gs(Segment::gs) #endif , isDefaultJmpNEAR_(false) - , defaultEncoding_(EvexEncoding) { + setDefaultEncoding(); + setDefaultEncodingAVX10(); labelMgr_.set(this); } void reset() @@ -3151,13 +3202,20 @@ class CodeGenerator : public CodeArray { #undef jnl #endif - // set default encoding to select Vex or Evex - void setDefaultEncoding(PreferredEncoding encoding) { defaultEncoding_ = encoding; } - - void sha1msg12(const Xmm& x, const Operand& op) + // set default encoding of VNNI + // EvexEncoding : AVX512_VNNI, VexEncoding : AVX-VNNI + void setDefaultEncoding(PreferredEncoding enc = EvexEncoding) + { + if (enc != VexEncoding && enc != EvexEncoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE) + defaultEncoding_[0] = enc; + } + // default : PreferredEncoding : AVX-VNNI-INT8/AVX512-FP16 + void setDefaultEncodingAVX10(PreferredEncoding enc = PreAVX10v2Encoding) { - opROO(Reg(), op, x, T_MUST_EVEX, 0xD9); + if (enc != PreAVX10v2Encoding && enc != AVX10v2Encoding) XBYAK_THROW(ERR_BAD_ENCODING_MODE) + defaultEncoding_[1] = enc; } + void bswap(const Reg32e& r) { int idx = r.getIdx(); @@ -3170,6 +3228,24 @@ class CodeGenerator : public CodeArray { } db(0xC8 + (idx & 7)); } + void vmovd(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding) + { + const uint64_t typeTbl[] = { + T_EVEX|T_66|T_0F|T_W0|T_N4, T_EVEX|T_66|T_0F|T_W0|T_N4, // legacy, avx, avx512 + T_MUST_EVEX|T_66|T_0F|T_EW0|T_N4, T_MUST_EVEX|T_F3|T_0F|T_EW0|T_N4, // avx10.2 + }; + const int codeTbl[] = { 0x7E, 0x6E, 0xD6, 0x7E }; + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 32); + } + void vmovw(const Operand& op1, const Operand& op2, PreferredEncoding enc = DefaultEncoding) + { + const uint64_t typeTbl[] = { + T_MUST_EVEX|T_66|T_MAP5|T_N2, T_MUST_EVEX|T_66|T_MAP5|T_N2, // avx512-fp16 + T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, T_MUST_EVEX|T_F3|T_MAP5|T_EW0|T_N2, // avx10.2 + }; + const int codeTbl[] = { 0x7E, 0x6E, 0x7E, 0x6E }; + opAVX10ZeroExt(op1, op2, typeTbl, codeTbl, enc, 16|32|64); + } /* use single byte nop if useMultiByteNop = false */ diff --git a/3rdparty/xbyak/xbyak/xbyak_mnemonic.h b/3rdparty/xbyak/xbyak/xbyak_mnemonic.h index eb5fd7f0eaa7f..64930ab4ecf63 100644 --- a/3rdparty/xbyak/xbyak/xbyak_mnemonic.h +++ b/3rdparty/xbyak/xbyak/xbyak_mnemonic.h @@ -1,4 +1,4 @@ -const char *getVersionString() const { return "7.06"; } +const char *getVersionString() const { return "7.21"; } void aadd(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38, 0x0FC, T_APX); } void aand(const Address& addr, const Reg32e ®) { opMR(addr, reg, T_0F38|T_66, 0x0FC, T_APX|T_66); } void adc(const Operand& op, uint32_t imm) { opOI(op, imm, 0x10, 2); } @@ -645,7 +645,7 @@ void jz(const char *label, LabelType type = T_AUTO) { jz(std::string(label), typ void jz(const void *addr) { opJmpAbs(addr, T_NEAR, 0x74, 0x84, 0x0F); }//-V524 void jz(std::string label, LabelType type = T_AUTO) { opJmp(label, type, 0x74, 0x84, 0x0F); }//-V524 void lahf() { db(0x9F); } -void lddqu(const Xmm& xmm, const Address& addr) { opMR(addr, xmm, T_F2 | T_0F, 0xF0); } +void lddqu(const Xmm& xmm, const Address& addr) { opSSE(xmm, addr, T_F2 | T_0F, 0xF0); } void ldmxcsr(const Address& addr) { opMR(addr, Reg32(2), T_0F, 0xAE); } void lea(const Reg& reg, const Address& addr) { if (!reg.isBit(16 | i32e)) XBYAK_THROW(ERR_BAD_SIZE_OF_REGISTER) opMR(addr, reg, 0, 0x8D); } void leave() { db(0xC9); } @@ -667,8 +667,8 @@ void loopne(const char *label) { loopne(std::string(label)); } void loopne(std::string label) { opJmp(label, T_SHORT, 0xE0, 0, 0); } void lss(const Reg& reg, const Address& addr) { opLoadSeg(addr, reg, T_0F, 0xB2); } void lzcnt(const Reg®, const Operand& op) { if (opROO(Reg(), op, reg, T_APX|T_NF, 0xF5)) return; opCnt(reg, op, 0xBD); } -void maskmovdqu(const Xmm& reg1, const Xmm& reg2) { opRR(reg1, reg2, T_66|T_0F, 0xF7); } -void maskmovq(const Mmx& reg1, const Mmx& reg2) { if (!reg1.isMMX() || !reg2.isMMX()) XBYAK_THROW(ERR_BAD_COMBINATION) opRR(reg1, reg2, T_0F, 0xF7); } +void maskmovdqu(const Xmm& reg1, const Xmm& reg2) { opSSE(reg1, reg2, T_66|T_0F, 0xF7); } +void maskmovq(const Mmx& reg1, const Mmx& reg2) { opSSE(reg1, reg2, T_0F, 0xF7); } void maxpd(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_0F | T_66, 0x5F, isXMM_XMMorMEM); } void maxps(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_0F, 0x5F, isXMM_XMMorMEM); } void maxsd(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_0F | T_F2, 0x5F, isXMM_XMMorMEM); } @@ -680,54 +680,52 @@ void minsd(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_0F | T_F2, 0x5D void minss(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_0F | T_F3, 0x5D, isXMM_XMMorMEM); } void monitor() { db(0x0F); db(0x01); db(0xC8); } void monitorx() { db(0x0F); db(0x01); db(0xFA); } -void movapd(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_66, 0x29); } +void movapd(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_66, 0x29); } void movapd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x28, T_0F, T_66); } -void movaps(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_NONE, 0x29); } +void movaps(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_NONE, 0x29); } void movaps(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x28, T_0F, T_NONE); } void movbe(const Address& addr, const Reg& reg) { opMR(addr, reg, T_0F38, 0xF1, T_APX, 0x61); } void movbe(const Reg& reg, const Address& addr) { opMR(addr, reg, T_0F38, 0xF0, T_APX, 0x60); } -void movd(const Address& addr, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opMR(addr, mmx, T_0F, 0x7E); } -void movd(const Mmx& mmx, const Address& addr) { if (mmx.isXMM()) db(0x66); opMR(addr, mmx, T_0F, 0x6E); } -void movd(const Mmx& mmx, const Reg32& reg) { if (mmx.isXMM()) db(0x66); opRR(mmx, reg, T_0F, 0x6E); } -void movd(const Reg32& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opRR(mmx, reg, T_0F, 0x7E); } +void movd(const Mmx& mmx, const Operand& op) { if (!(op.isMEM() || op.isREG(32))) XBYAK_THROW(ERR_BAD_COMBINATION) if (mmx.isXMM()) db(0x66); opSSE(mmx, op, T_0F, 0x6E); } +void movd(const Operand& op, const Mmx& mmx) { if (!(op.isMEM() || op.isREG(32))) XBYAK_THROW(ERR_BAD_COMBINATION) if (mmx.isXMM()) db(0x66); opSSE(mmx, op, T_0F, 0x7E); } void movddup(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12, isXMM_XMMorMEM, NONE); } void movdir64b(const Reg& reg, const Address& addr) { opMR(addr, reg.cvt32(), T_66|T_0F38, 0xF8, T_APX|T_66); } void movdiri(const Address& addr, const Reg32e& reg) { opMR(addr, reg, T_0F38, 0xF9, T_APX); } -void movdq2q(const Mmx& mmx, const Xmm& xmm) { opRR(mmx, xmm, T_F2 | T_0F, 0xD6); } -void movdqa(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_66, 0x7F); } +void movdq2q(const Mmx& mmx, const Xmm& xmm) { opSSE(mmx, xmm, T_F2 | T_0F, 0xD6); } +void movdqa(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_66, 0x7F); } void movdqa(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, T_0F, T_66); } -void movdqu(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_F3, 0x7F); } +void movdqu(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_F3, 0x7F); } void movdqu(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x6F, T_0F, T_F3); } -void movhlps(const Xmm& reg1, const Xmm& reg2) { opRR(reg1, reg2, T_0F, 0x12); } +void movhlps(const Xmm& reg1, const Xmm& reg2) { opSSE(reg1, reg2, T_0F, 0x12); } void movhpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, T_66|T_0F, 0x16); } void movhps(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, T_0F, 0x16); } -void movlhps(const Xmm& reg1, const Xmm& reg2) { opRR(reg1, reg2, T_0F, 0x16); } +void movlhps(const Xmm& reg1, const Xmm& reg2) { opSSE(reg1, reg2, T_0F, 0x16); } void movlpd(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, T_66|T_0F, 0x12); } void movlps(const Operand& op1, const Operand& op2) { opMovXMM(op1, op2, T_0F, 0x12); } void movmskpd(const Reg32e& reg, const Xmm& xmm) { db(0x66); movmskps(reg, xmm); } -void movmskps(const Reg32e& reg, const Xmm& xmm) { opRR(reg, xmm, T_0F, 0x50); } -void movntdq(const Address& addr, const Xmm& reg) { opMR(addr, Reg16(reg.getIdx()), T_0F, 0xE7); } -void movntdqa(const Xmm& xmm, const Address& addr) { opMR(addr, xmm, T_66 | T_0F38, 0x2A); } +void movmskps(const Reg32e& reg, const Xmm& xmm) { opSSE(reg, xmm, T_0F, 0x50); } +void movntdq(const Address& addr, const Xmm& reg) { if (reg.getIdx() >= 16) XBYAK_THROW(ERR_BAD_PARAMETER) opSSE(Reg16(reg.getIdx()), addr, T_0F, 0xE7); } +void movntdqa(const Xmm& xmm, const Address& addr) { opSSE(xmm, addr, T_66 | T_0F38, 0x2A); } void movnti(const Address& addr, const Reg32e& reg) { opMR(addr, reg, T_0F, 0xC3); } -void movntpd(const Address& addr, const Xmm& reg) { opMR(addr, Reg16(reg.getIdx()), T_0F, 0x2B); } -void movntps(const Address& addr, const Xmm& xmm) { opMR(addr, Mmx(xmm.getIdx()), T_0F, 0x2B); } -void movntq(const Address& addr, const Mmx& mmx) { if (!mmx.isMMX()) XBYAK_THROW(ERR_BAD_COMBINATION) opMR(addr, mmx, T_0F, 0xE7); } -void movq(const Address& addr, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opMR(addr, mmx, T_0F, mmx.isXMM() ? 0xD6 : 0x7F); } -void movq(const Mmx& mmx, const Operand& op) { if (mmx.isXMM()) db(0xF3); opRO(mmx, op, T_0F, mmx.isXMM() ? 0x7E : 0x6F, mmx.getKind() == op.getKind()); } -void movq2dq(const Xmm& xmm, const Mmx& mmx) { opRR(xmm, mmx, T_F3 | T_0F, 0xD6); } +void movntpd(const Address& addr, const Xmm& reg) { if (reg.getIdx() >= 16) XBYAK_THROW(ERR_BAD_PARAMETER) opSSE(Reg16(reg.getIdx()), addr, T_0F, 0x2B); } +void movntps(const Address& addr, const Xmm& xmm) { opSSE(Xmm(xmm.getIdx()), addr, T_0F, 0x2B); } +void movntq(const Address& addr, const Mmx& mmx) { if (!mmx.isMMX()) XBYAK_THROW(ERR_BAD_COMBINATION) opSSE(mmx, addr, T_0F, 0xE7); } +void movq(const Address& addr, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opSSE(mmx, addr, T_0F, mmx.isXMM() ? 0xD6 : 0x7F); } +void movq(const Mmx& mmx, const Operand& op) { if (!op.isMEM() && mmx.getKind() != op.getKind()) XBYAK_THROW(ERR_BAD_COMBINATION) if (mmx.isXMM()) db(0xF3); opSSE(mmx, op, T_0F, mmx.isXMM() ? 0x7E : 0x6F); } +void movq2dq(const Xmm& xmm, const Mmx& mmx) { opSSE(xmm, mmx, T_F3 | T_0F, 0xD6); } void movsb() { db(0xA4); } void movsd() { db(0xA5); } -void movsd(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_F2, 0x11); } +void movsd(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_F2, 0x11); } void movsd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, T_0F, T_F2); } void movshdup(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_F3|T_0F|T_EW0|T_YMM|T_EVEX, 0x16, isXMM_XMMorMEM, NONE); } void movsldup(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_F3|T_0F|T_EW0|T_YMM|T_EVEX, 0x12, isXMM_XMMorMEM, NONE); } -void movss(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_F3, 0x11); } +void movss(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_F3, 0x11); } void movss(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, T_0F, T_F3); } void movsw() { db(0x66); db(0xA5); } void movsx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xBE); } -void movupd(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_66, 0x11); } +void movupd(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_66, 0x11); } void movupd(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, T_0F, T_66); } -void movups(const Address& addr, const Xmm& xmm) { opMR(addr, xmm, T_0F|T_NONE, 0x11); } +void movups(const Address& addr, const Xmm& xmm) { opSSE(xmm, addr, T_0F|T_NONE, 0x11); } void movups(const Xmm& xmm, const Operand& op) { opMMX(xmm, op, 0x10, T_0F, T_NONE); } void movzx(const Reg& reg, const Operand& op) { opMovxx(reg, op, 0xB6); } void mpsadbw(const Xmm& xmm, const Operand& op, int imm) { opSSE(xmm, op, T_66 | T_0F3A, 0x42, isXMM_XMMorMEM, static_cast(imm)); } @@ -823,7 +821,7 @@ void pminsw(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xEA); } void pminub(const Mmx& mmx, const Operand& op) { opMMX(mmx, op, 0xDA); } void pminud(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_66 | T_0F38, 0x3B, isXMM_XMMorMEM); } void pminuw(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_66 | T_0F38, 0x3A, isXMM_XMMorMEM); } -void pmovmskb(const Reg32e& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opRR(reg, mmx, T_0F, 0xD7); } +void pmovmskb(const Reg32e& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opSSE(reg, mmx, T_0F, 0xD7); } void pmovsxbd(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_N4|T_N_VL|T_66|T_0F38|T_YMM|T_EVEX, 0x21, isXMM_XMMorMEM, NONE); } void pmovsxbq(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_N2|T_N_VL|T_66|T_0F38|T_YMM|T_EVEX, 0x22, isXMM_XMMorMEM, NONE); } void pmovsxbw(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_N8|T_N_VL|T_66|T_0F38|T_YMM|T_EVEX, 0x20, isXMM_XMMorMEM, NONE); } @@ -1059,10 +1057,10 @@ void vaesenc(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) void vaesenclast(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_66|T_0F38|T_YMM|T_EVEX, 0xDD); } void vaesimc(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_W0, 0xDB); } void vaeskeygenassist(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0xDF, imm); } -void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x55); } -void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x55); } -void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x54); } -void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x54); } +void vandnpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x55); } +void vandnps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x55); } +void vandpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x54); } +void vandps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x54); } void vbcstnebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3|T_0F38|T_W0|T_YMM|T_B16, 0xB1); } void vbcstnesh2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66|T_0F38|T_W0|T_YMM|T_B16, 0xB1); } void vblendpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x0D, imm); } @@ -1213,7 +1211,6 @@ void vcvtneebf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F3| void vcvtneeph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_66|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneobf162ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_F2|T_0F38|T_W0|T_YMM, 0xB0); } void vcvtneoph2ps(const Xmm& x, const Address& addr) { opVex(x, 0, addr, T_0F38|T_W0|T_YMM, 0xB0); } -void vcvtneps2bf16(const Xmm& x, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32|orEvexIf(encoding), 0x72); } void vcvtpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_F2 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0xE6); } void vcvtpd2ps(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F | T_66 | T_YMM | T_EVEX | T_EW1 | T_B64 | T_ER_Z, 0x5A); } void vcvtph2ps(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_0F38 | T_66 | T_W0 | T_EVEX | T_EW0 | T_N8 | T_N_VL | T_SAE_Y, 0x13); } @@ -1226,7 +1223,7 @@ void vcvtsi2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, void vcvtsi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2, op, T_0F | T_F3 | T_EVEX | T_ER_X, T_W1 | T_EW1 | T_N8, T_W0 | T_EW0 | T_N4, 0x2A); } void vcvtss2sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_F3|T_0F|T_EW0|T_EVEX|T_SAE_X, 0x5A); } void vcvtss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_ER_X | T_N8, 0x2D); } -void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_ER_Z, 0xE6); } +void vcvttpd2dq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_66 | T_0F | T_YMM | T_EVEX |T_EW1 | T_B64 | T_SAE_Z, 0xE6); } void vcvttps2dq(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_F3|T_0F|T_EW0|T_YMM|T_EVEX|T_SAE_Z|T_B32, 0x5B); } void vcvttsd2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F2 | T_W0 | T_EVEX | T_EW0 | T_N4 | T_SAE_X, 0x2C); } void vcvttss2si(const Reg32& r, const Operand& op) { opAVX_X_X_XM(Xmm(r.getIdx()), xm0, op, T_0F | T_F3 | T_W0 | T_EVEX | T_EW0 | T_SAE_X | T_N8, 0x2C); } @@ -1239,64 +1236,64 @@ void vdpps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX void vextractf128(const Operand& op, const Ymm& y, uint8_t imm) { if (!(op.isXMEM() && y.isYMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x19, imm); } void vextracti128(const Operand& op, const Ymm& y, uint8_t imm) { if (!(op.isXMEM() && y.isYMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y, 0, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x39, imm); } void vextractps(const Operand& op, const Xmm& x, uint8_t imm) { if (!((op.isREG(32) || op.isMEM()) && x.isXMM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x, 0, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_N4, 0x17, imm); } -void vfmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x98); } -void vfmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x98); } +void vfmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x98); } +void vfmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x98); } void vfmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x99); } void vfmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x99); } -void vfmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA8); } -void vfmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA8); } +void vfmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA8); } +void vfmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA8); } void vfmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xA9); } void vfmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xA9); } -void vfmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB8); } -void vfmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB8); } +void vfmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB8); } +void vfmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB8); } void vfmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xB9); } void vfmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xB9); } -void vfmaddsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x96); } -void vfmaddsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x96); } -void vfmaddsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA6); } -void vfmaddsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA6); } -void vfmaddsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB6); } -void vfmaddsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB6); } -void vfmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9A); } -void vfmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9A); } +void vfmaddsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x96); } +void vfmaddsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x96); } +void vfmaddsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA6); } +void vfmaddsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA6); } +void vfmaddsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB6); } +void vfmaddsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB6); } +void vfmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9A); } +void vfmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9A); } void vfmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9B); } void vfmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9B); } -void vfmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAA); } -void vfmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAA); } +void vfmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAA); } +void vfmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAA); } void vfmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAB); } void vfmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAB); } -void vfmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBA); } -void vfmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBA); } +void vfmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBA); } +void vfmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBA); } void vfmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBB); } void vfmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBB); } -void vfmsubadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x97); } -void vfmsubadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x97); } -void vfmsubadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xA7); } -void vfmsubadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xA7); } -void vfmsubadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xB7); } -void vfmsubadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xB7); } -void vfnmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9C); } -void vfnmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9C); } +void vfmsubadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x97); } +void vfmsubadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x97); } +void vfmsubadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xA7); } +void vfmsubadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xA7); } +void vfmsubadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xB7); } +void vfmsubadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xB7); } +void vfnmadd132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9C); } +void vfnmadd132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9C); } void vfnmadd132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9D); } void vfnmadd132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9D); } -void vfnmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAC); } -void vfnmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAC); } +void vfnmadd213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAC); } +void vfnmadd213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAC); } void vfnmadd213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAD); } void vfnmadd213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAD); } -void vfnmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBC); } -void vfnmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBC); } +void vfnmadd231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBC); } +void vfnmadd231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBC); } void vfnmadd231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBD); } void vfnmadd231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBD); } -void vfnmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0x9E); } -void vfnmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x9E); } +void vfnmsub132pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0x9E); } +void vfnmsub132ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0x9E); } void vfnmsub132sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0x9F); } void vfnmsub132ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0x9F); } -void vfnmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xAE); } -void vfnmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xAE); } +void vfnmsub213pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xAE); } +void vfnmsub213ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xAE); } void vfnmsub213sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xAF); } void vfnmsub213ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xAF); } -void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_B64, 0xBE); } -void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0xBE); } +void vfnmsub231pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W1|T_EW1|T_YMM|T_EVEX|T_ER_Z|T_B64, 0xBE); } +void vfnmsub231ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_ER_Z|T_B32, 0xBE); } void vfnmsub231sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_W1|T_EW1|T_EVEX|T_ER_X, 0xBF); } void vfnmsub231ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_W0|T_EW0|T_EVEX|T_ER_X, 0xBF); } void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, T_0F38 | T_66 | T_YMM | T_VSIB | T_W1, 0x92, 0); } @@ -1320,20 +1317,18 @@ void vmaskmovpd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_X void vmaskmovpd(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2D); } void vmaskmovps(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2E); } void vmaskmovps(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x2C); } -void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5F); } -void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5F); } -void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5F); } -void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5F); } -void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x5D); } -void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x5D); } -void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x5D); } -void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x5D); } +void vmaxpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0x5F); } +void vmaxps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE_Z | T_B32, 0x5F); } +void vmaxsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_SAE_X | T_N8, 0x5F); } +void vmaxss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_SAE_X | T_N4, 0x5F); } +void vminpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_SAE_Z | T_B64, 0x5D); } +void vminps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_SAE_Z | T_B32, 0x5D); } +void vminsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_SAE_X | T_N8, 0x5D); } +void vminss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_SAE_X | T_N4, 0x5D); } void vmovapd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_EW1|T_YMM|T_EVEX|T_M_K, 0x29); } void vmovapd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x28); } void vmovaps(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x29); } void vmovaps(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x28); } -void vmovd(const Operand& op, const Xmm& x) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x7E); } -void vmovd(const Xmm& x, const Operand& op) { if (!op.isREG(32) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_0F | T_66 | T_W0 | T_EVEX | T_N4, 0x6E); } void vmovddup(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_DUP|T_F2|T_0F|T_EW1|T_YMM|T_EVEX|T_ER_X|T_ER_Y|T_ER_Z, 0x12); } void vmovdqa(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_66|T_0F|T_YMM, 0x7F); } void vmovdqa(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_YMM, 0x6F); } @@ -1370,13 +1365,12 @@ void vmovupd(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_ void vmovupd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX, 0x10); } void vmovups(const Address& addr, const Xmm& xmm) { opAVX_X_XM_IMM(xmm, addr, T_0F|T_EW0|T_YMM|T_EVEX|T_M_K, 0x11); } void vmovups(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_0F|T_EW0|T_YMM|T_EVEX, 0x10); } -void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_W0|T_YMM, 0x42, imm); } void vmulpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x59); } void vmulps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x59); } void vmulsd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F2 | T_EW1 | T_EVEX | T_ER_X | T_N8, 0x59); } void vmulss(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_F3 | T_EW0 | T_EVEX | T_ER_X | T_N4, 0x59); } -void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x56); } -void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x56); } +void vorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x56); } +void vorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x56); } void vpabsb(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_YMM|T_EVEX, 0x1C); } void vpabsd(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_EW0|T_YMM|T_EVEX|T_B32, 0x1E); } void vpabsw(const Xmm& xm, const Operand& op) { opAVX_X_XM_IMM(xm, op, T_66|T_0F38|T_YMM|T_EVEX, 0x1D); } @@ -1421,22 +1415,10 @@ void vpcmpgtq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpcmpgtw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM, 0x65); } void vpcmpistri(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x63, imm); } void vpcmpistrm(const Xmm& xm, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(xm, op, T_66|T_0F3A, 0x62, imm); } -void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_W0|T_YMM, 0x51); } -void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0x51); } void vpdpbusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x50, encoding); } void vpdpbusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x51, encoding); } -void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x50); } -void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0x51); } void vpdpwssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x52, encoding); } void vpdpwssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_B32, 0x53, encoding); } -void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_W0|T_YMM, 0xD3); } -void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD2); } -void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_W0|T_YMM, 0xD3); } void vperm2f128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x06, imm); } void vperm2i128(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { if (!(y1.isYMM() && y2.isYMM() && op.isYMEM())) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(y1, &y2, op, T_0F3A | T_66 | T_W0 | T_YMM, 0x46, imm); } void vpermd(const Ymm& y1, const Ymm& y2, const Operand& op) { opAVX_X_X_XM(y1, y2, op, T_66|T_0F38|T_W0|T_EW0|T_YMM|T_EVEX|T_B32, 0x36); } @@ -1468,8 +1450,6 @@ void vpinsrb(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if void vpinsrd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W0 | T_EVEX | T_EW0 | T_N4, 0x22, imm); } void vpinsrq(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(64) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F3A | T_66 | T_W1 | T_EVEX | T_EW1 | T_N8, 0x22, imm); } void vpinsrw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { if (!(x1.isXMM() && x2.isXMM() && (op.isREG(32) || op.isMEM()))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(x1, &x2, op, T_0F | T_66 | T_EVEX | T_N2, 0xC4, imm); } -void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB5, encoding); } -void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_B64, 0xB4, encoding); } void vpmaddubsw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_YMM|T_EVEX, 0x04); } void vpmaddwd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_YMM|T_EVEX, 0xF5); } void vpmaskmovd(const Address& addr, const Xmm& x1, const Xmm& x2) { opAVX_X_X_XM(x2, x1, addr, T_0F38 | T_66 | T_W0 | T_YMM, 0x8E); } @@ -1593,8 +1573,8 @@ void vunpckhpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x void vunpckhps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F|T_EW0|T_YMM|T_EVEX|T_B32, 0x15); } void vunpcklpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F|T_EW1|T_YMM|T_EVEX|T_B64, 0x14); } void vunpcklps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F|T_EW0|T_YMM|T_EVEX|T_B32, 0x14); } -void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_ER_Z | T_B64, 0x57); } -void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_ER_Z | T_B32, 0x57); } +void vxorpd(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_66 | T_EW1 | T_YMM | T_EVEX | T_B64, 0x57); } +void vxorps(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_0F | T_EW0 | T_YMM | T_EVEX | T_B32, 0x57); } void vzeroall() { db(0xC5); db(0xFC); db(0x77); } void vzeroupper() { db(0xC5); db(0xF8); db(0x77); } void wait() { db(0x9B); } @@ -1612,6 +1592,8 @@ void xor_(const Reg& d, const Operand& op, uint32_t imm) { opROI(d, op, imm, T_N void xor_(const Reg& d, const Operand& op1, const Operand& op2) { opROO(d, op1, op2, T_NF|T_CODE1_IF1, 0x30); } void xorpd(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_0F | T_66, 0x57, isXMM_XMMorMEM); } void xorps(const Xmm& xmm, const Operand& op) { opSSE(xmm, op, T_0F, 0x57, isXMM_XMMorMEM); } +void xresldtrk() { db(0xF2); db(0x0F); db(0x01); db(0xE9); } +void xsusldtrk() { db(0xF2); db(0x0F); db(0x01); db(0xE8); } #ifdef XBYAK_ENABLE_OMITTED_OPERAND void vblendpd(const Xmm& x, const Operand& op, uint8_t imm) { vblendpd(x, x, op, imm); } void vblendps(const Xmm& x, const Operand& op, uint8_t imm) { vblendps(x, x, op, imm); } @@ -1892,8 +1874,8 @@ void testui() { db(0xF3); db(0x0F); db(0x01); db(0xED); } void uiret() { db(0xF3); db(0x0F); db(0x01); db(0xEC); } void cmpxchg16b(const Address& addr) { opMR(addr, Reg64(1), T_0F, 0xC7); } void fxrstor64(const Address& addr) { opMR(addr, Reg64(1), T_0F, 0xAE); } -void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opRR(mmx, reg, T_0F, 0x7E); } -void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opRR(mmx, reg, T_0F, 0x6E); } +void movq(const Reg64& reg, const Mmx& mmx) { if (mmx.isXMM()) db(0x66); opSSE(mmx, reg, T_0F, 0x7E); } +void movq(const Mmx& mmx, const Reg64& reg) { if (mmx.isXMM()) db(0x66); opSSE(mmx, reg, T_0F, 0x6E); } void movsxd(const Reg64& reg, const Operand& op) { if (!op.isBit(32)) XBYAK_THROW(ERR_BAD_COMBINATION) opRO(reg, op, 0, 0x63); } void pextrq(const Operand& op, const Xmm& xmm, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opSSE(Reg64(xmm.getIdx()), op, T_66 | T_0F3A, 0x16, 0, imm); } void pinsrq(const Xmm& xmm, const Operand& op, uint8_t imm) { if (!op.isREG(64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opSSE(Reg64(xmm.getIdx()), op, T_66 | T_0F3A, 0x22, 0, imm); } @@ -1935,6 +1917,10 @@ void aesencwide128kl(const Address& addr) { opSSE_APX(xmm0, addr, T_F3|T_0F38, 0 void aesencwide256kl(const Address& addr) { opSSE_APX(xmm2, addr, T_F3|T_0F38, 0xD8, T_F3|T_MUST_EVEX, 0xD8); } void encodekey128(const Reg32& r1, const Reg32& r2) { opEncodeKey(r1, r2, 0xFA, 0xDA); } void encodekey256(const Reg32& r1, const Reg32& r2) { opEncodeKey(r1, r2, 0xFB, 0xDB); } +void rdfsbase(const Reg32e& r) { opRR(eax, r, T_F3|T_0F, 0xAE); } +void rdgsbase(const Reg32e& r) { opRR(ecx, r, T_F3|T_0F, 0xAE); } +void wrfsbase(const Reg32e& r) { opRR(edx, r, T_F3|T_0F, 0xAE); } +void wrgsbase(const Reg32e& r) { opRR(ebx, r, T_F3|T_0F, 0xAE); } void ldtilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_0F38|T_W0, 0x49); } void sttilecfg(const Address& addr) { if (opROO(Reg(), addr, tmm0, T_APX|T_66|T_0F38|T_W0, 0x49)) return; opVex(tmm0, &tmm0, addr, T_66|T_0F38 | T_W0, 0x49); } void tileloadd(const Tmm& tm, const Address& addr) { opAMX(tm, addr, T_F2|T_0F38|T_W0, 0x4B); } @@ -2041,6 +2027,7 @@ void v4fmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM void v4fmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0x9B); } void v4fnmaddps(const Zmm& z1, const Zmm& z2, const Address& addr) { opAVX_X_X_XM(z1, z2, addr, T_0F38 | T_F2 | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0xAA); } void v4fnmaddss(const Xmm& x1, const Xmm& x2, const Address& addr) { opAVX_X_X_XM(x1, x2, addr, T_0F38 | T_F2 | T_EW0 | T_MUST_EVEX | T_N16, 0xAB); } +void vaddnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x58); } void vaddph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x58); } void vaddsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x58); } void valignd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x03, imm); } @@ -2169,6 +2156,7 @@ void vcmpordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x, void vcmpordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 7); } void vcmpordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 7); } void vcmpordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 7); } +void vcmppbf16(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opVex(k, &x, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0xC2, imm); } void vcmppd(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0xC2, imm); } void vcmpph(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0xC2, imm); } void vcmpps(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0xC2, imm); } @@ -2191,18 +2179,39 @@ void vcmpunordpd(const Opmask& k, const Xmm& x, const Operand& op) { vcmppd(k, x void vcmpunordps(const Opmask& k, const Xmm& x, const Operand& op) { vcmpps(k, x, op, 3); } void vcmpunordsd(const Opmask& k, const Xmm& x, const Operand& op) { vcmpsd(k, x, op, 3); } void vcmpunordss(const Opmask& k, const Xmm& x, const Operand& op) { vcmpss(k, x, op, 3); } -void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2F); } -void vcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x63); } +void vcomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } void vcompresspd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8A); } void vcompressps(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8A); } -void vcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x63); } +void vcomsbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_MAP5|T_EW0|T_MUST_EVEX, 0x2F); } +void vcomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F2|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2F); } +void vcvt2ps2phx(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x67); } +void vcvtbiasph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtbiasph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtbiasph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt6(x1, x2, op, T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtdq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x5B); } +void vcvthf82ph(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_MUST_EVEX | T_F2 | T_MAP5 | T_EW0 | T_YMM | T_N1, 0x1E); } +void vcvtne2ph2bf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2bf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtne2ph2hf8(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtne2ph2hf8s(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N1|T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } void vcvtne2ps2bf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } +void vcvtnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x69); } +void vcvtnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6B); } +void vcvtneph2bf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2bf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x74); } +void vcvtneph2hf8(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x18); } +void vcvtneph2hf8s(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x1B); } +void vcvtneps2bf16(const Xmm& x, const Operand& op) { opCvt2(x, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x72); } void vcvtpd2ph(const Xmm& x, const Operand& op) { opCvt5(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW1|T_ER_Z|T_MUST_EVEX|T_B64, 0x5A); } void vcvtpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x7B); } void vcvtpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } void vcvtpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x79); } void vcvtph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B16, 0x5B); } +void vcvtph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x69); } +void vcvtph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6B); } void vcvtph2pd(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x5A); } void vcvtph2psx(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x13); } void vcvtph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x7B); } @@ -2210,6 +2219,8 @@ void vcvtph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, void vcvtph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_ER_X|T_MUST_EVEX|T_B16, 0x79); } void vcvtph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vcvtph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } +void vcvtps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x69); } +void vcvtps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6B); } void vcvtps2phx(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_66|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x1D); } void vcvtps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_ER_Y|T_MUST_EVEX|T_B32, 0x7B); } void vcvtps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x79); } @@ -2226,22 +2237,40 @@ void vcvtsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3 void vcvtsi2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { if (!(x1.isXMM() && x2.isXMM() && op.isBit(32|64))) XBYAK_THROW(ERR_BAD_COMBINATION) uint64_t type = (T_F3|T_MAP5|T_ER_R|T_MUST_EVEX|T_M_K) | (op.isBit(32) ? (T_EW0 | T_N4) : (T_EW1 | T_N8)); opVex(x1, &x2, op, type, 0x2A); } void vcvtss2sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x1D); } void vcvtss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_ER_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x79); } +void vcvttnebf162ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x68); } +void vcvttnebf162iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x6A); } +void vcvttpd2dqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); } void vcvttpd2qq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x7A); } +void vcvttpd2qqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6D); } void vcvttpd2udq(const Xmm& x, const Operand& op) { opCvt2(x, op, T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } +void vcvttpd2udqs(const Xmm& x, const Operand& op) { opCvt2(x, op, T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); } void vcvttpd2uqq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x78); } +void vcvttpd2uqqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x6C); } void vcvttph2dq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x5B); } +void vcvttph2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x68); } +void vcvttph2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B16, 0x6A); } void vcvttph2qq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x7A); } void vcvttph2udq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B16, 0x78); } void vcvttph2uqq(const Xmm& x, const Operand& op) { if (!op.isXMM() && !op.isMEM()) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(x, 0, op, T_N4|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_MUST_EVEX|T_B16, 0x78); } void vcvttph2uw(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); } void vcvttph2w(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x7C); } +void vcvttps2dqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6D); } +void vcvttps2ibs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x68); } +void vcvttps2iubs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_B32, 0x6A); } void vcvttps2qq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x7A); } +void vcvttps2qqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6D); } void vcvttps2udq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_0F|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x78); } +void vcvttps2udqs(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x6C); } void vcvttps2uqq(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_0F|T_EW0|T_YMM|T_SAE_Y|T_MUST_EVEX|T_B32, 0x78); } +void vcvttps2uqqs(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_66|T_MAP5|T_EW0|T_YMM|T_SAE_X|T_SAE_Y|T_MUST_EVEX|T_B32, 0x6C); } +void vcvttsd2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); } void vcvttsd2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttsd2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N8|T_F2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); } void vcvttsh2si(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x2C); } void vcvttsh2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttss2sis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6D); } void vcvttss2usi(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_0F|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x78); } +void vcvttss2usis(const Reg32e& r, const Operand& op) { uint64_t type = (T_N4|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX) | (r.isREG(64) ? T_EW1 : T_EW0); opVex(r, &xm0, op, type, 0x6C); } void vcvtudq2pd(const Xmm& x, const Operand& op) { checkCvt1(x, op); opVex(x, 0, op, T_N8|T_N_VL|T_F3|T_0F|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x7A); } void vcvtudq2ph(const Xmm& x, const Operand& op) { checkCvt4(x, op); opCvt(x, op, T_N16|T_N_VL|T_F2|T_MAP5|T_EW0|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); } void vcvtudq2ps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x7A); } @@ -2254,9 +2283,11 @@ void vcvtusi2ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opCvt3(x1, x2 void vcvtuw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vcvtw2ph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F3|T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x7D); } void vdbpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x42, imm); } +void vdivnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5E); } void vdivph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5E); } void vdivsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5E); } void vdpbf16ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52); } +void vdpphps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x52); } void vexp2pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64 | T_SAE_Z, 0xC8); } void vexp2ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xC8); } void vexpandpd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x88); } @@ -2275,38 +2306,51 @@ void vfixupimmpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { void vfixupimmps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x54, imm); } void vfixupimmsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } void vfixupimmss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_Z|T_MUST_EVEX, 0x55, imm); } +void vfmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x98); } void vfmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x99); } +void vfmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xA8); } void vfmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA8); } void vfmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xA9); } +void vfmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB8); } void vfmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xB9); } void vfmaddcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x56); } void vfmaddsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x96); } void vfmaddsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA6); } void vfmaddsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB6); } +void vfmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9A); } void vfmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9B); } +void vfmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAA); } void vfmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAB); } +void vfmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBA); } void vfmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBB); } void vfmsubadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x97); } void vfmsubadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xA7); } void vfmsubadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xB7); } void vfmulcph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_F3|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0xD6); } +void vfnmadd132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9C); } void vfnmadd132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9D); } +void vfnmadd213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAC); } void vfnmadd213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAD); } +void vfnmadd231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBC); } void vfnmadd231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBD); } +void vfnmsub132nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x9E); } void vfnmsub132sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0x9F); } +void vfnmsub213nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xAE); } void vfnmsub213sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xAF); } +void vfnmsub231nepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231ph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0xBE); } void vfnmsub231sh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_ER_X|T_MUST_EVEX, 0xBF); } +void vfpclasspbf16(const Opmask& k, const Operand& op, uint8_t imm) { opVex(k.changeBit(op.getBit()), 0, op, T_MUST_EVEX|T_F2|T_0F3A|T_EW0|T_YMM|T_B16, 0x66, imm); } void vfpclasspd(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, 0x66, imm); } void vfpclassph(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B16, 0x66, imm); } void vfpclassps(const Opmask& k, const Operand& op, uint8_t imm) { if (!op.isBit(128|256|512)) XBYAK_THROW(ERR_BAD_MEM_SIZE) opVex(k.changeBit(op.getBit()), 0, op, T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, 0x66, imm); } @@ -2325,12 +2369,14 @@ void vgatherpf1qpd(const Address& addr) { opGatherFetch(addr, zm2, T_N8|T_66|T_0 void vgatherpf1qps(const Address& addr) { opGatherFetch(addr, zm2, T_N4|T_66|T_0F38|T_EW0|T_MUST_EVEX|T_M_K|T_VSIB, 0xC7, Operand::ZMM); } void vgatherqpd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 0); } void vgatherqps(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_VSIB, 0x93, 2); } +void vgetexppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x42); } void vgetexppd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x42); } void vgetexpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x42); } void vgetexpps(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x42); } void vgetexpsd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } void vgetexpss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0x43); } +void vgetmantpbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantpd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x26, imm); } void vgetmantph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x26, imm); } void vgetmantps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x26, imm); } @@ -2345,10 +2391,19 @@ void vinserti32x4(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) void vinserti32x8(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX, 0x3A, imm); } void vinserti64x2(const Ymm& r1, const Ymm& r2, const Operand& op, uint8_t imm) {if (!(r1.getKind() == r2.getKind() && op.is(Operand::MEM | Operand::XMM))) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N16|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x38, imm); } void vinserti64x4(const Zmm& r1, const Zmm& r2, const Operand& op, uint8_t imm) {if (!op.is(Operand::MEM | Operand::YMM)) XBYAK_THROW(ERR_BAD_COMBINATION) opVex(r1, &r2, op, T_N32|T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3A, imm); } -void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5F); } -void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5F); } -void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5D); } -void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5D); } +void vmaxpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5F); } +void vmaxph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5F); } +void vmaxsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5F); } +void vminmaxnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x52, imm); } +void vminmaxpd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B64, 0x52, imm); } +void vminmaxph(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B16, 0x52, imm); } +void vminmaxps(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Y|T_SAE_Z|T_MUST_EVEX|T_B32, 0x52, imm); } +void vminmaxsd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x53, imm); } +void vminmaxsh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); } +void vminmaxss(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x53, imm); } +void vminpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5D); } +void vminph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_SAE_Z | T_B16, 0x5D); } +void vminsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_SAE_X | T_N2, 0x5D); } void vmovdqa32(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); } void vmovdqa32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F|T_EW0|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX, 0x6F); } void vmovdqa64(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_66|T_0F|T_EW1|T_YMM|T_ER_X|T_ER_Y|T_ER_Z|T_MUST_EVEX|T_M_K, 0x7F); } @@ -2364,9 +2419,8 @@ void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2|T_0F void vmovsh(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX|T_M_K, 0x11); } void vmovsh(const Xmm& x, const Address& addr) { opAVX_X_X_XM(x, xm0, addr, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); } void vmovsh(const Xmm& x1, const Xmm& x2, const Xmm& x3) { opAVX_X_X_XM(x1, x2, x3, T_N2|T_F3|T_MAP5|T_EW0|T_MUST_EVEX, 0x10); } -void vmovw(const Address& addr, const Xmm& x) { opAVX_X_XM_IMM(x, addr, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } -void vmovw(const Reg32e& r, const Xmm& x) { opAVX_X_X_XM(x, xm0, r, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x7E); } -void vmovw(const Xmm& x, const Operand& op) { if (!op.isREG(32|64) && !op.isMEM()) XBYAK_THROW(ERR_BAD_COMBINATION) opAVX_X_X_XM(x, xm0, op, T_N2|T_66|T_MAP5|T_MUST_EVEX, 0x6E); } +void vmpsadbw(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F3A|T_YMM, 0x42, encoding, imm, T_66|T_W0|T_YMM, T_F3|T_0F3A|T_EW0|T_B32, 1); } +void vmulnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x59); } void vmulph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x59); } void vmulsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x59); } void vp2intersectd(const Opmask& k, const Xmm& x, const Operand& op) { if (k.getOpmaskIdx() != 0) XBYAK_THROW(ERR_OPMASK_IS_ALREADY_SET) opAVX_K_X_XM(k, x, op, T_F2 | T_0F38 | T_YMM | T_EVEX | T_EW0 | T_B32, 0x68); } @@ -2403,10 +2457,24 @@ void vpcmpud(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { op void vpcmpuq(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x1E, imm); } void vpcmpuw(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3E, imm); } void vpcmpw(const Opmask& k, const Xmm& x, const Operand& op, uint8_t imm) { opAVX_K_X_XM(k, x, op, T_66|T_0F3A|T_EW1|T_YMM|T_MUST_EVEX, 0x3F, imm); } +void vpcompressb(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N1|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x63); } void vpcompressd(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N4|T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8B); } void vpcompressq(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x8B); } +void vpcompressw(const Operand& op, const Xmm& x) { opAVX_X_XM_IMM(x, op, T_N2|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x63); } void vpconflictd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0xC4); } void vpconflictq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xC4); } +void vpdpbssd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbssds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F2|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x50, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpbuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0x51, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwsuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_F3|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusd(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwusds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_66|T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuud(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD2, encoding, NONE, T_W0, T_EW0|T_B32, 1); } +void vpdpwuuds(const Xmm& x1, const Xmm& x2, const Operand& op, PreferredEncoding encoding = DefaultEncoding) { opEncoding(x1, x2, op, T_0F38|T_YMM, 0xD3, encoding, NONE, T_W0, T_EW0|T_B32, 1); } void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x8D); } void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x76); } @@ -2431,6 +2499,8 @@ void vpgatherqd(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N4|T_6 void vpgatherqq(const Xmm& x, const Address& addr) { opGather2(x, addr, T_N8|T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_VSIB, 0x91, 0); } void vplzcntd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW0|T_YMM|T_MUST_EVEX|T_B32, 0x44); } void vplzcntq(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x44); } +void vpmadd52huq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB5); } +void vpmadd52luq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0xB4); } void vpmaxsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3D); } void vpmaxuq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x3F); } void vpminsq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX|T_B64, 0x39); } @@ -2524,14 +2594,17 @@ void vrcp28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_ void vrcp28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCA); } void vrcp28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCB); } void vrcp28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCB); } +void vrcppbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4C); } void vrcpsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4D); } +void vreducenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x56, imm); } void vreducepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x56, imm); } void vreduceph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x56, imm); } void vreduceps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x56, imm); } void vreducesd(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F3A|T_EW1|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducesh(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N2|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } void vreducess(const Xmm& x1, const Xmm& x2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F3A|T_EW0|T_SAE_X|T_MUST_EVEX, 0x57, imm); } +void vrndscalenepbf16(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_F2|T_0F3A|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscalepd(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW1|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B64, 0x09, imm); } void vrndscaleph(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B16, 0x08, imm); } void vrndscaleps(const Xmm& x, const Operand& op, uint8_t imm) { opAVX_X_XM_IMM(x, op, T_66|T_0F3A|T_EW0|T_YMM|T_SAE_Z|T_MUST_EVEX|T_B32, 0x08, imm); } @@ -2546,8 +2619,11 @@ void vrsqrt28pd(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | void vrsqrt28ps(const Zmm& z, const Operand& op) { opAVX_X_XM_IMM(z, op, T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0 | T_B32 | T_SAE_Z, 0xCC); } void vrsqrt28sd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N8|T_66|T_0F38|T_EW1|T_SAE_X|T_MUST_EVEX, 0xCD); } void vrsqrt28ss(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N4|T_66|T_0F38|T_EW0|T_SAE_X|T_MUST_EVEX, 0xCD); } +void vrsqrtpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x4E); } void vrsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_66|T_MAP6|T_EW0|T_MUST_EVEX, 0x4F); } +void vscalefpbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); } +void vscalefpbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_MAP6|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x2C); } void vscalefpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW1|T_YMM|T_ER_Z|T_MUST_EVEX|T_B64, 0x2C); } void vscalefph(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP6|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x2C); } void vscalefps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_0F38|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B32, 0x2C); } @@ -2570,11 +2646,16 @@ void vshuff32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { void vshuff64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x23, imm); } void vshufi32x4(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, 0x43, imm); } void vshufi64x2(const Ymm& y1, const Ymm& y2, const Operand& op, uint8_t imm) { opAVX_X_X_XM(y1, y2, op, T_66 | T_0F3A | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, 0x43, imm); } +void vsqrtnepbf16(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x51); } void vsqrtph(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5|T_EW0|T_YMM|T_ER_Z|T_MUST_EVEX|T_B16, 0x51); } void vsqrtsh(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_N2|T_F3|T_MAP5|T_EW0|T_ER_X|T_MUST_EVEX, 0x51); } +void vsubnepbf16(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66|T_MAP5|T_EW0|T_YMM|T_MUST_EVEX|T_B16, 0x5C); } void vsubph(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_EW0 | T_YMM | T_MUST_EVEX | T_ER_Z | T_B16, 0x5C); } void vsubsh(const Xmm& xmm, const Operand& op1, const Operand& op2 = Operand()) { opAVX_X_X_XM(xmm, op1, op2, T_MAP5 | T_F3 | T_EW0 | T_MUST_EVEX | T_ER_X | T_N2, 0x5C); } -void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_MAP5 | T_MUST_EVEX | T_EW0 | T_SAE_X | T_N2, 0x2E); } +void vucomish(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsd(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N8|T_F2|T_0F|T_EW1|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxsh(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N2|T_F3|T_MAP5|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } +void vucomxss(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_N4|T_F3|T_0F|T_EW0|T_SAE_X|T_MUST_EVEX, 0x2E); } #ifdef XBYAK64 void kmovq(const Reg64& r, const Opmask& k) { opKmov(k, r, true, 64); } void vpbroadcastq(const Xmm& x, const Reg64& r) { opVex(x, 0, r, T_66|T_0F38|T_EW1|T_YMM|T_MUST_EVEX, 0x7C); } diff --git a/3rdparty/xbyak/xbyak/xbyak_util.h b/3rdparty/xbyak/xbyak/xbyak_util.h index dd428797723e0..1f138e3b4a382 100644 --- a/3rdparty/xbyak/xbyak/xbyak_util.h +++ b/3rdparty/xbyak/xbyak/xbyak_util.h @@ -547,6 +547,7 @@ class Cpu { XBYAK_DEFINE_TYPE(87, tKEYLOCKER_WIDE); XBYAK_DEFINE_TYPE(88, tSSE4a); XBYAK_DEFINE_TYPE(89, tCLWB); + XBYAK_DEFINE_TYPE(90, tTSXLDTRK); #undef XBYAK_SPLIT_ID #undef XBYAK_DEFINE_TYPE @@ -684,6 +685,7 @@ class Cpu { if (ECX & (1U << 28)) type_ |= tMOVDIR64B; if (EDX & (1U << 5)) type_ |= tUINTR; if (EDX & (1U << 14)) type_ |= tSERIALIZE; + if (EDX & (1U << 16)) type_ |= tTSXLDTRK; if (EDX & (1U << 22)) type_ |= tAMX_BF16; if (EDX & (1U << 24)) type_ |= tAMX_TILE; if (EDX & (1U << 25)) type_ |= tAMX_INT8;