diff --git a/pcsx2/vtlb.cpp b/pcsx2/vtlb.cpp index ee14fce04307d..1ff8a20253d47 100644 --- a/pcsx2/vtlb.cpp +++ b/pcsx2/vtlb.cpp @@ -119,50 +119,105 @@ __noinline int CheckCache(u32 addr) } const size_t size = cachedTlbs.count; - const int stride = 4; - - __m128i addr_vec = _mm_set1_epi32(addr); + const int stride = 8; + //__m128i addr_vec = _mm_set1_epi32(addr); + __m256i addr_vec = _mm256_set1_epi32(addr); size_t i = 0; for (; i + stride <= size; i += stride) { + /* __m128i pfn1_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.PFN1s[i])); __m128i pfn0_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.PFN0s[i])); __m128i mask_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.PageMasks[i])); + */ + + __m256i pfn1_vec = _mm256_loadu_si256(reinterpret_cast(&cachedTlbs.PFN1s[i])); + __m256i pfn0_vec = _mm256_loadu_si256(reinterpret_cast(&cachedTlbs.PFN0s[i])); + __m256i mask_vec = _mm256_loadu_si256(reinterpret_cast(&cachedTlbs.PageMasks[i])); + /* __m128i cached1_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.CacheEnabled1[i])); __m128i cached0_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.CacheEnabled0[i])); + */ + __m256i cached1_vec = _mm256_loadu_si256(reinterpret_cast(&cachedTlbs.CacheEnabled1[i])); + __m256i cached0_vec = _mm256_loadu_si256(reinterpret_cast(&cachedTlbs.CacheEnabled0[i])); + + /* __m128i pfn1_end_vec = _mm_add_epi32(pfn1_vec, mask_vec); __m128i pfn0_end_vec = _mm_add_epi32(pfn0_vec, mask_vec); + */ + + __m256i pfn1_end_vec = _mm256_add_epi32(pfn1_vec, mask_vec); + __m256i pfn0_end_vec = _mm256_add_epi32(pfn0_vec, mask_vec); + /* __m128i gteLowerBound0 = _mm_or_si128( _mm_cmpgt_epi32(addr_vec, pfn0_vec), _mm_cmpeq_epi32(addr_vec, pfn0_vec)); __m128i gteUpperBound0 = _mm_or_si128( _mm_cmpgt_epi32(pfn0_end_vec, addr_vec), _mm_cmpeq_epi32(pfn0_end_vec, addr_vec)); + */ + __m256i gteLowerBound0 = _mm256_or_si256( + _mm256_cmpgt_epi32(addr_vec, pfn0_vec), + _mm256_cmpeq_epi32(addr_vec, pfn0_vec)); + + __m256i gteUpperBound0 = _mm256_or_si256( + _mm256_cmpgt_epi32(pfn0_end_vec, addr_vec), + _mm256_cmpeq_epi32(pfn0_end_vec, addr_vec)); + + /* __m128i gteUpperBound1 = _mm_or_si128( _mm_cmpgt_epi32(pfn1_end_vec, addr_vec), _mm_cmpeq_epi32(pfn1_end_vec, addr_vec)); __m128i gteLowerBound1 = _mm_or_si128( _mm_cmpgt_epi32(addr_vec, pfn1_vec), _mm_cmpeq_epi32(addr_vec, pfn1_vec)); + */ + __m256i gteUpperBound1 = _mm256_or_si256( + _mm256_cmpgt_epi32(pfn1_end_vec, addr_vec), + _mm256_cmpeq_epi32(pfn1_end_vec, addr_vec)); + + __m256i gteLowerBound1 = _mm256_or_si256( + _mm256_cmpgt_epi32(addr_vec, pfn1_vec), + _mm256_cmpeq_epi32(addr_vec, pfn1_vec)); + + /* __m128i cmp0 = _mm_and_si128(gteLowerBound0, gteUpperBound0); __m128i cmp1 = _mm_and_si128(gteLowerBound1, gteUpperBound1); + */ + + __m256i cmp0 = _mm256_and_si256(gteLowerBound0, gteUpperBound0); + __m256i cmp1 = _mm256_and_si256(gteLowerBound1, gteUpperBound1); - cmp1 = _mm_and_si128(cmp1, cached1_vec); - cmp0 = _mm_and_si128(cmp0, cached0_vec); - __m128i cmp = _mm_or_si128(cmp1, cmp0); + //cmp1 = _mm_and_si128(cmp1, cached1_vec); + //cmp0 = _mm_and_si128(cmp0, cached0_vec); + cmp1 = _mm256_and_si256(cmp1, cached1_vec); + cmp0 = _mm256_and_si256(cmp0, cached0_vec); + + //__m128i cmp = _mm_or_si128(cmp1, cmp0); + + __m256i cmp = _mm256_or_si256(cmp1, cmp0); + //__m128i cmp = _mm_or_si128(cmp1, cmp0); + + /* if (!_mm_testz_si128(cmp, cmp)) { return true; } + */ + + if (!_mm256_testz_si256(cmp, cmp)) + { + return true; + } } for (; i < size; i++)