-
Notifications
You must be signed in to change notification settings - Fork 61
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
optimize popcount implementation #348
Conversation
1841ab7
to
5ae3d85
Compare
5ae3d85
to
78c7ac3
Compare
Is the asm the same after your last fix? |
The asm should be the same. For completeness' sake: u8
Before: popcount_u8:
mov eax, edi
and edi, 85
shr al
and eax, 85
add edi, eax
mov edx, edi
and edi, 51
shr dl, 2
and edx, 51
add edx, edi
mov eax, edx
shr dl, 4
and eax, 15
add eax, edx
movzx eax, al
ret After: popcount_u8:
movzx eax, dil
xor edx, edx
popcnt eax, eax
test dil, dil
cmove eax, edx
ret u16
Before: popcount_u16:
mov eax, edi
and di, 21845
shr ax
and ax, 21845
add edi, eax
mov eax, edi
and di, 13107
shr ax, 2
and ax, 13107
add eax, edi
mov edx, eax
and ax, 3855
shr dx, 4
and dx, 3855
add edx, eax
movzx eax, dl
shr dx, 8
add eax, edx
movzx eax, ax
ret After: popcount_u16:
xor edx, edx
popcnt ax, di
test di, di
movzx eax, ax
cmove eax, edx
ret u32
Before: popcount_u32:
mov eax, edi
and edi, 1431655765
shr eax
and eax, 1431655765
add edi, eax
mov edx, edi
and edi, 858993459
shr edx, 2
and edx, 858993459
add edx, edi
mov eax, edx
and edx, 252645135
shr eax, 4
and eax, 252645135
add eax, edx
mov edx, eax
and eax, 16711935
shr edx, 8
and edx, 16711935
add edx, eax
movzx eax, dx
shr edx, 16
add eax, edx
ret After: popcount_u32:
xor eax, eax
xor edx, edx
popcnt eax, edi
test edi, edi
cmove eax, edx
ret u64
Before: popcount_u64:
movabs rdx, 6148914691236517205
mov rax, rdi
movabs rcx, 1085102592571150095
shr rax
and rdi, rdx
and rax, rdx
movabs rdx, 3689348814741910323
add rdi, rax
mov rax, rdi
and rdi, rdx
shr rax, 2
and rax, rdx
add rax, rdi
mov rdx, rax
and rax, rcx
shr rdx, 4
and rdx, rcx
movabs rcx, 71777214294589695
add rdx, rax
mov rax, rdx
and rdx, rcx
shr rax, 8
and rax, rcx
movabs rcx, 281470681808895
add rax, rdx
mov rdx, rax
and rax, rcx
shr rdx, 16
and rdx, rcx
add rax, rdx
mov rdx, rax
shr rdx, 32
add eax, edx
ret After: popcount_u64:
xor edx, edx
xor eax, eax
popcnt rdx, rdi
test rdi, rdi
cmovne eax, edx
ret u128
Before: popcount_128:
popcnt rsi, rsi
popcnt rdi, rdi
lea eax, [rdi+rsi]
ret After: popcount_128:
xor edx, edx
xor eax, eax
popcnt rdx, rsi
test rsi, rsi
cmovne eax, edx
xor edx, edx
popcnt rdx, rdi
add edx, eax
test rdi, rdi
cmovne eax, edx
ret Looks like the 128-bit case might have worse codegen, but considering we're no longer using built-in functions, gcc shouldn't ever emit a function call here. Not sure which is better here, thoughts? |
I found out how to optimize this better.
So, by changing I'm not sure why it doesn't work for the <= 32-bit cases: it should work as the GCC test cases above show. If you'd like to investigate, that's awesome; otherwise, we can just merge as is. Btw, the case for
(only one If you would like to take a look, that's good; otherwise I can fix it myself. |
78c7ac3
to
144fdba
Compare
I'm not able to reproduce the u128 issue, but I've found codegen to be less verbose if we don't special case it. One problem I ran into when developing this is assuming that the ctpop intrinsic always returns the same type, no matter the input type. From what I'm able to determine, the input type should be the same as the output, so I think we need to keep I'm now getting this: popcount_u8:
movzx eax, dil
xor edx, edx
popcnt eax, eax
test dil, dil
cmove eax, edx
ret
popcount_u16:
xor edx, edx
popcnt ax, di
test di, di
movzx eax, ax
cmove eax, edx
ret
popcount_u32:
xor eax, eax
xor edx, edx
popcnt eax, edi
test edi, edi
cmove eax, edx
ret
popcount_u64:
xor edx, edx
xor eax, eax
popcnt rdx, rdi
test rdi, rdi
cmovne eax, edx
ret
popcount_128:
mov rdx, rdi
or rdx, rsi
je .L68
xor ecx, ecx
popcnt rsi, rsi
popcnt rcx, rdi
lea eax, [rsi+rcx]
ret
.L68:
xor eax, eax
ret Sadly, it looks like we might be running into a gcc bug - there might not be a whole lot we can do here to get rid of the branching that gets generated until that bug gets fixed. |
Did you rebase on master?
You're talking about the Rust intrinsic? If so, I was able to change the |
144fdba
to
10e97a3
Compare
Just rebased on
Yes - the ui test |
Ok, thanks for testing. Is this ready to be merged? |
At this point, there's not much more I think I can do. I want to revisit codegen for |
I checked out your branch directly and I still get the bad codegen for u128: edit: I was using the wrong branch of gcc.
edit: the following was with a bad gcc branch (forget about this):
|
For the record: I've been testing against commit a033810a6062a1f8ebe51e224eaa4faa0c7e173c in your gcc fork. It doesn't look like changes since then should cause any difference, but I'll retest against the latest commit there and see if there's any change on my side. |
hmmm, latest commit spits out a lot of warnings:
|
Yeah, sorry about that. I merged a GCC PR, but not rust-lang/gccjit.rs#23, so this is causing some issues. I think it's best to wait until I merge it, so probably tomorrow. |
No worries - take your time. |
This is fixed. You can rebase with master. |
10e97a3
to
15c20c0
Compare
I've rebased, looks like CI passes. Can you make sure everything looks good from your side? |
Yes. I'll look at that tonight. |
Currently, the gcc backend of rustc emits the following for a function that implements popcount for a u32 (x86_64 targeting AVX2, using standard unix calling convention): popcount: mov eax, edi and edi, 1431655765 shr eax and eax, 1431655765 add edi, eax mov edx, edi and edi, 858993459 shr edx, 2 and edx, 858993459 add edx, edi mov eax, edx and edx, 252645135 shr eax, 4 and eax, 252645135 add eax, edx mov edx, eax and eax, 16711935 shr edx, 8 and edx, 16711935 add edx, eax movzx eax, dx shr edx, 16 add eax, edx ret Rather than using this implementation, gcc could be told to use Wegner's algorithm. This would give the same function the following implementation: popcount: xor eax, eax xor edx, edx popcnt eax, edi test edi, edi cmove eax, edx ret This patch implements the popcount operation in terms of Wegner's algorithm in all cases. Signed-off-by: Andy Sadler <[email protected]>
15c20c0
to
64abf58
Compare
Applied your suggestions with 64abf58. |
I'm still getting the wrong asm for u128 and i128:
It has only one |
I haven't been able to reproduce that behavior. Do you have the source for a test I can use locally? |
Yes, that's what I use:
I could also try to add this to the int tests to see if the CI fails in the same way that it fails locally. |
Thanks - I'll try to see if I can reproduce locally.
FWIW |
Oh, I think I was misled by some optimizations that are made since the value sent to
generates the correct:
Does that make sense? |
That looks to be what's happening, yes. If I replace Details
popcount_test::popcount_i8.constprop.0:
mov eax, 8
ret
popcount_test::popcount_u8.constprop.0:
mov eax, 8
ret
popcount_test::popcount_i16.constprop.0:
mov eax, 16
ret
popcount_test::popcount_u16.constprop.0:
mov eax, 16
ret
popcount_test::popcount_i32.constprop.0:
mov eax, 32
ret
popcount_test::popcount_u32.constprop.0:
mov eax, 32
ret
popcount_test::popcount_i64.constprop.0:
mov eax, 64
ret
popcount_test::popcount_u64.constprop.0:
mov eax, 64
ret
popcount_test::popcount_u128.constprop.0:
mov eax, 64
ret
popcount_test::popcount_i128.constprop.0:
mov eax, 64
ret Looks like the optimizer was smarter than I realized! |
Your earlier comment shows a shorter asm for the 128-bit case (with a jump, though). Edit: Oh, I think you reverted that change. |
I reverted that change since it was causing issues on builds where 128-bit integers weren't natively supported. I have a patch re-implementing and gating it behind 128-bit integers being natively supported by libgccjit. If you want, I can post it in this PR, otherwise I'll make another PR for it. |
Thanks a lot for your contribution! You can open a new PR for this. I'd be curious to know what's causing the issues on non-native 128-bit integers. |
AIUI it's because 128-bit integers aren't actually numbers when we emulate support for them, and that broke a few assumptions. You can find the logs for that CI run here. |
These errors seem to be caused by the fact that we call this libgccjit function directly instead of the wrapper in the |
Currently, the gcc backend of rustc emits the
following for a function that implements popcount for a u32 (x86_64 targeting
AVX2, using standard unix calling convention):
Rather than using this implementation, gcc could be told to use Wegner's
algorithm. This would give the same function the following implementation:
This patch implements the popcount operation in terms of Wegner's algorithm in
all cases.