[PATCH] D156029: [InstCombine] icmp udiv transform
Maksim Kita via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 24 03:29:29 PDT 2023
kitaisreal marked 8 inline comments as done.
kitaisreal added inline comments.
================
Comment at: llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp:4341
+
+ if (IntegerBitWidth > 128)
+ return nullptr;
----------------
goldstein.w.n wrote:
> Why the > 128 threshold?
It seems this check is not required at all.
For i128 (https://llvm.godbolt.org/z/rK7Wvfrso) and i256 (https://llvm.godbolt.org/z/xeEPa6voY), applying the transformation still produces much better assembly.
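For example, a minimal IR sketch of the i128 case after the transform (function name is illustrative; the operands are widened to i256 so the multiply cannot wrap):
```
define i1 @udiv_ult_i128(i128 %x, i128 %y, i128 %z) {
  ; x / y < z  <=>  x < y * z for unsigned division
  %x.wide = zext i128 %x to i256
  %y.wide = zext i128 %y to i256
  %z.wide = zext i128 %z to i256
  %mul = mul i256 %y.wide, %z.wide
  %r = icmp ult i256 %x.wide, %mul
  ret i1 %r
}
```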
================
Comment at: llvm/test/Transforms/InstCombine/icmp-udiv.ll:99
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i128 [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = and i1 [[TMP5]], [[TMP7]]
+; CHECK-NEXT: ret i1 [[TMP8]]
----------------
nikic wrote:
> goldstein.w.n wrote:
> > It's not clear this is more canonical or universally better. It's true we occasionally add instructions to avoid div, but this is a lot. Maybe this transform belongs in the backend?
> Yeah, this does not seem like a good canonicalization from an IR perspective.
>
> I'm also wondering whether it would be preferable to use MULO instead of extending to double-width. Though it seems like codegen ends up being very similar for most targets (https://llvm.godbolt.org/z/73Phv83hT). Apparently muls that set overflow flags without computing a wide multiply aren't really a thing.
It seems that for `less`, `greater`, `less or equal`, and `greater or equal` this optimization is universally better.
We trade the `udiv` for 3 `zext` and 1 `mul`; for `greater` and `greater or equal` we also need 1 `add`. The final assembly also looks much better.
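As a concrete sketch (i64 operands, illustrative names): `x / y > z` rewrites to `x >= y * z + y` in the double-width type, which is where the extra `add` comes from:
```
define i1 @udiv_ugt_i64(i64 %x, i64 %y, i64 %z) {
  ; x / y > z  <=>  x >= y * (z + 1)  <=>  x >= y * z + y;
  ; computed in i128 so neither the mul nor the add can wrap
  %x.wide = zext i64 %x to i128
  %y.wide = zext i64 %y to i128
  %z.wide = zext i64 %z to i128
  %mul = mul i128 %y.wide, %z.wide
  %sum = add i128 %mul, %y.wide
  %r = icmp uge i128 %x.wide, %sum
  ret i1 %r
}
```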
================
Comment at: llvm/test/Transforms/InstCombine/icmp-udiv.ll:99
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i128 [[TMP6]], [[TMP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = and i1 [[TMP5]], [[TMP7]]
+; CHECK-NEXT: ret i1 [[TMP8]]
----------------
kitaisreal wrote:
> nikic wrote:
> > goldstein.w.n wrote:
> > > It's not clear this is more canonical or universally better. It's true we occasionally add instructions to avoid div, but this is a lot. Maybe this transform belongs in the backend?
> > Yeah, this does not seem like a good canonicalization from an IR perspective.
> >
> > I'm also wondering whether it would be preferable to use MULO instead of extending to double-width. Though it seems like codegen ends up being very similar for most targets (https://llvm.godbolt.org/z/73Phv83hT). Apparently muls that set overflow flags without computing a wide multiply aren't really a thing.
> It seems that for `less`, `greater`, `less or equal`, and `greater or equal` this optimization is universally better.
> We trade the `udiv` for 3 `zext` and 1 `mul`; for `greater` and `greater or equal` we also need 1 `add`. The final assembly also looks much better.
>
For `equals` and `not equals` it seems to produce slightly more instructions (see the sketch below), but I still think that avoiding the division is worth it.
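A sketch of the `equals` form on i64 (illustrative names), matching the shape of the CHECK lines above: `x / y == z` holds iff `y * z <= x < y * z + y`:
```
define i1 @udiv_eq_i64(i64 %x, i64 %y, i64 %z) {
  ; x / y == z  <=>  y * z <= x  and  x < y * z + y, in i128 to avoid wrap
  %x.wide = zext i64 %x to i128
  %y.wide = zext i64 %y to i128
  %z.wide = zext i64 %z to i128
  %mul = mul i128 %y.wide, %z.wide
  %lower = icmp ule i128 %mul, %x.wide
  %bound = add i128 %mul, %y.wide
  %upper = icmp ugt i128 %bound, %x.wide
  %r = and i1 %lower, %upper
  ret i1 %r
}
```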
I think it would be okay to move this optimization into the backend, although I am afraid that if we do not keep it in IR, such constructs will not be autovectorized.
For such code:
```
#include <stddef.h>
#include <stdint.h>

void test(uint32_t * __restrict x, uint32_t * __restrict y, uint8_t * result, uint32_t z, size_t size) {
for (size_t i = 0; i < size; ++i) {
result[i] = x[i] / y[i] == z;
}
}
```
The resulting vectorized loop after this patch:
```
.LBB0_4: # %vector.body
# =>This Inner Loop Header: Depth=1
movdqu (%rdi,%r9,4), %xmm4
movdqu (%rsi,%r9,4), %xmm9
movdqa %xmm4, %xmm5
punpckldq %xmm1, %xmm5 # xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
punpckhdq %xmm1, %xmm4 # xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
movdqa %xmm9, %xmm10
punpckhdq %xmm1, %xmm10 # xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
punpckldq %xmm1, %xmm9 # xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
movdqa %xmm0, %xmm6
pmuludq %xmm9, %xmm6
movdqa %xmm0, %xmm8
pmuludq %xmm10, %xmm8
pxor %xmm2, %xmm4
movdqa %xmm8, %xmm11
pxor %xmm2, %xmm11
movdqa %xmm11, %xmm12
pcmpgtd %xmm4, %xmm12
pxor %xmm2, %xmm5
movdqa %xmm6, %xmm13
pxor %xmm2, %xmm13
movdqa %xmm13, %xmm7
pcmpgtd %xmm5, %xmm7
movdqa %xmm7, %xmm14
shufps $136, %xmm12, %xmm14 # xmm14 = xmm14[0,2],xmm12[0,2]
pcmpeqd %xmm4, %xmm11
pcmpeqd %xmm5, %xmm13
shufps $221, %xmm11, %xmm13 # xmm13 = xmm13[1,3],xmm11[1,3]
andps %xmm14, %xmm13
shufps $221, %xmm12, %xmm7 # xmm7 = xmm7[1,3],xmm12[1,3]
orps %xmm13, %xmm7
paddq %xmm9, %xmm6
paddq %xmm10, %xmm8
pxor %xmm2, %xmm8
movdqa %xmm8, %xmm9
pcmpgtd %xmm4, %xmm9
pxor %xmm2, %xmm6
movdqa %xmm6, %xmm10
pcmpgtd %xmm5, %xmm10
movdqa %xmm10, %xmm11
shufps $136, %xmm9, %xmm11 # xmm11 = xmm11[0,2],xmm9[0,2]
pcmpeqd %xmm4, %xmm8
pcmpeqd %xmm5, %xmm6
shufps $221, %xmm8, %xmm6 # xmm6 = xmm6[1,3],xmm8[1,3]
andps %xmm11, %xmm6
shufps $221, %xmm9, %xmm10 # xmm10 = xmm10[1,3],xmm9[1,3]
orps %xmm6, %xmm10
andnps %xmm10, %xmm7
andps %xmm3, %xmm7
packuswb %xmm7, %xmm7
packuswb %xmm7, %xmm7
movd %xmm7, (%rdx,%r9)
addq $4, %r9
cmpq %r9, %rcx
jne .LBB0_4
```
For comparison, the resulting loop with clang-16:
```
.LBB0_7: # =>This Inner Loop Header: Depth=1
movl (%rdi,%r10,4), %eax
xorl %edx, %edx
divl (%rsi,%r10,4)
cmpl %ecx, %eax
sete (%r9,%r10)
movl 4(%rdi,%r10,4), %eax
xorl %edx, %edx
divl 4(%rsi,%r10,4)
cmpl %ecx, %eax
sete 1(%r9,%r10)
addq $2, %r10
cmpq %r10, %r11
jne .LBB0_7
```
================
Comment at: llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll:111
%t0 = udiv i8 -1, %x
%r = icmp ugt i8 %t0, %y ; not ult
ret i1 %r
----------------
goldstein.w.n wrote:
> Can you add some vector tests?
I am not sure vector tests are needed here, as I only regenerated the existing output. A vector variant of the test above might look like the sketch below, though.
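A hypothetical vector test mirroring the scalar one above (name illustrative):
```
define <2 x i1> @t0_vec(<2 x i8> %x, <2 x i8> %y) {
  %t0 = udiv <2 x i8> <i8 -1, i8 -1>, %x
  %r = icmp ugt <2 x i8> %t0, %y ; not ult
  ret <2 x i1> %r
}
```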
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D156029/new/
https://reviews.llvm.org/D156029