[PATCH] D67799: [InstCombine] Fold a shifty implementation of clamp0.
Huihui Zhang via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 20 16:35:51 PDT 2019
huihuiz marked 2 inline comments as done.
huihuiz added a comment.
For the X86, AArch64 and ARM targets, the backend produces better assembly with this transformation. Please refer to the examples below:
--- Scalar Test ---
X86 target:
Test input; Run command : clang -O2 -target x86_64 -march=skylake -S clamp0.ll -o -
define i32 @clamp0(i32 %v) {
%sub = sub nsw i32 0, %v
%shr = ashr i32 %sub, 31
%and = and i32 %shr, %v
ret i32 %and
}
before
clamp0: # @clamp0
# %bb.0:
movl %edi, %eax
negl %eax
sarl $31, %eax
andl %edi, %eax
retq
After this optimization
clamp0: # @clamp0
# %bb.0:
movl %edi, %eax
sarl $31, %eax
andnl %edi, %eax, %eax
retq
AArch64 target:
Same test input; Run command : clang -O2 -target aarch64 -march=armv8a -S clamp0.ll -o -
before
clamp0: // @clamp0
// %bb.0:
neg w8, w0
and w0, w0, w8, asr #31
ret
After this optimization
clamp0: // @clamp0
// %bb.0:
bic w0, w0, w0, asr #31
ret
ARM target:
Same input; run : clang -O2 -target arm -march=armv8.1a -S clamp0.ll -o -
before:
clamp0:
.fnstart
@ %bb.0:
rsb r1, r0, #0
and r0, r0, r1, asr #31
bx lr
After this optimization
clamp0:
.fnstart
@ %bb.0:
bic r0, r0, r0, asr #31
bx lr
--- Vector Test ---
X86 target:
Test input; Run command : clang -O2 -target x86_64 -march=skylake -S clamp0-vec.ll -o -
define <4 x i32> @clamp0-vec(<4 x i32> %v) {
%sub = sub nsw <4 x i32> zeroinitializer, %v
%shr = ashr <4 x i32> %sub, <i32 31, i32 31, i32 31, i32 31>
%and = and <4 x i32> %shr, %v
ret <4 x i32> %and
}
before
"clamp0-vec": # @clamp0-vec
# %bb.0:
vpxor %xmm1, %xmm1, %xmm1
vpsubd %xmm0, %xmm1, %xmm1
vpsrad $31, %xmm1, %xmm1
vpand %xmm0, %xmm1, %xmm0
retq
After this optimization
"clamp0-vec": # @clamp0-vec
# %bb.0:
vpxor %xmm1, %xmm1, %xmm1
vpmaxsd %xmm1, %xmm0, %xmm0
retq
AArch64 target:
Same test input; Run : clang -O2 -target aarch64 -march=armv8a -S clamp0-vec.ll -o -
before
"clamp0-vec": // @clamp0-vec
// %bb.0:
neg v1.4s, v0.4s
sshr v1.4s, v1.4s, #31
and v0.16b, v1.16b, v0.16b
ret
After this optimization
"clamp0-vec": // @clamp0-vec
// %bb.0:
movi v1.2d, #0000000000000000
smax v0.4s, v0.4s, v1.4s
ret
ARM target:
Same input; Run : clang -O2 -target arm -march=armv8.1a -S clamp0-vec.ll -o -
before
"clamp0-vec":
.fnstart
@ %bb.0:
rsb r12, r0, #0
and r0, r0, r12, asr #31
rsb r12, r1, #0
and r1, r1, r12, asr #31
rsb r12, r2, #0
and r2, r2, r12, asr #31
rsb r12, r3, #0
and r3, r3, r12, asr #31
bx lr
After this optimization
"clamp0-vec":
.fnstart
@ %bb.0:
bic r0, r0, r0, asr #31
bic r1, r1, r1, asr #31
bic r2, r2, r2, asr #31
bic r3, r3, r3, asr #31
bx lr
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D67799/new/
https://reviews.llvm.org/D67799
More information about the llvm-commits
mailing list