[PATCH] D67800: [InstCombine] Fold a shifty implementation of clamp positive to allOnesValue.
Huihui Zhang via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 23 00:49:44 PDT 2019
huihuiz added a comment.
llvm-mca performance results for the general folding:
--- Scalar Tests ---
test input
define i32 @clamp255(i32 %v, i32 %x) {
%sub = sub nsw i32 %x, %v
%shr = ashr i32 %sub, 31
%or = or i32 %shr, %v
ret i32 %or
}
X86 skylake: cmov latency 1
clang clampPositiveToMinusOne.ll -O2 -target x86_64 -march=skylake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake
before
Iterations: 100
Instructions: 500
Total Cycles: 159
Total uOps: 700
Dispatch Width: 6
uOps Per Cycle: 4.40
IPC: 3.14
Block RThroughput: 1.2
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 movl %esi, %eax
1 1 0.25 subl %edi, %eax
1 1 0.50 sarl $31, %eax
1 1 0.25 orl %edi, %eax
3 7 1.00 U retq
After
Iterations: 100
Instructions: 400
Total Cycles: 135
Total uOps: 600
Dispatch Width: 6
uOps Per Cycle: 4.44
IPC: 2.96
Block RThroughput: 1.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 cmpl %esi, %edi
1 1 0.25 movl $-1, %eax
1 1 0.50 cmovlel %edi, %eax
3 7 1.00 U retq
AMD znver2
clang clampPositiveToMinusOne.ll -O2 -target x86_64 -march=znver2 -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2
before
Iterations: 100
Instructions: 500
Total Cycles: 155
Total uOps: 600
Dispatch Width: 4
uOps Per Cycle: 3.87
IPC: 3.23
Block RThroughput: 1.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 movl %esi, %eax
1 1 0.25 subl %edi, %eax
1 1 0.25 sarl $31, %eax
1 1 0.25 orl %edi, %eax
2 1 0.50 U retq
After
Iterations: 100
Instructions: 400
Total Cycles: 137
Total uOps: 500
Dispatch Width: 4
uOps Per Cycle: 3.65
IPC: 2.92
Block RThroughput: 1.3
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 cmpl %esi, %edi
1 1 0.25 movl $-1, %eax
1 1 0.25 cmovlel %edi, %eax
2 1 0.50 U retq
AArch64 cortex-a57
clang clampPositiveToMinusOne.ll -O2 -target aarch64 -mcpu=cortex-a57 -S -o - | llvm-mca -mtriple=aarch64 -mcpu=cortex-a57
before
Iterations: 100
Instructions: 300
Total Cycles: 303
Total uOps: 300
Dispatch Width: 3
uOps Per Cycle: 0.99
IPC: 0.99
Block RThroughput: 1.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.50 sub w8, w1, w0
1 2 1.00 orr w0, w0, w8, asr #31
1 1 1.00 U ret
After
Iterations: 100
Instructions: 300
Total Cycles: 203
Total uOps: 300
Dispatch Width: 3
uOps Per Cycle: 1.48
IPC: 1.48
Block RThroughput: 1.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.50 cmp w0, w1
1 1 0.50 csinv w0, w0, wzr, le
1 1 1.00 U ret
--- Vector Tests ---
Input
define <4 x i32> @clamp255(<4 x i32> %v, <4 x i32> %x) {
%sub = sub nsw <4 x i32> %x, %v
%shr = ashr <4 x i32> %sub, <i32 31, i32 31, i32 31, i32 31>
%or = or <4 x i32> %shr, %v
ret <4 x i32> %or
}
X86 skylake
clang clampPositiveToMinusOne-vec.ll -O2 -target x86_64 -march=skylake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake
before
Iterations: 100
Instructions: 400
Total Cycles: 303
Total uOps: 600
Dispatch Width: 6
uOps Per Cycle: 1.98
IPC: 1.32
Block RThroughput: 1.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.33 vpsubd %xmm0, %xmm1, %xmm1
1 1 0.50 vpsrad $31, %xmm1, %xmm1
1 1 0.33 vpor %xmm0, %xmm1, %xmm0
3 7 1.00 U retq
After
Iterations: 100
Instructions: 300
Total Cycles: 203
Total uOps: 500
Dispatch Width: 6
uOps Per Cycle: 2.46
IPC: 1.48
Block RThroughput: 1.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.50 vpcmpgtd %xmm1, %xmm0, %xmm1
1 1 0.33 vpor %xmm0, %xmm1, %xmm0
3 7 1.00 U retq
AMD znver2
clang clampPositiveToMinusOne-vec.ll -O2 -target x86_64 -march=znver2 -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2
before
Iterations: 100
Instructions: 400
Total Cycles: 303
Total uOps: 500
Dispatch Width: 4
uOps Per Cycle: 1.65
IPC: 1.32
Block RThroughput: 1.3
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 vpsubd %xmm0, %xmm1, %xmm1
1 1 0.25 vpsrad $31, %xmm1, %xmm1
1 1 0.25 vpor %xmm0, %xmm1, %xmm0
2 1 0.50 U retq
After
Iterations: 100
Instructions: 300
Total Cycles: 203
Total uOps: 400
Dispatch Width: 4
uOps Per Cycle: 1.97
IPC: 1.48
Block RThroughput: 1.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 vpcmpgtd %xmm1, %xmm0, %xmm1
1 1 0.25 vpor %xmm0, %xmm1, %xmm0
2 1 0.50 U retq
AArch64 cortex-a57
clang clampPositiveToMinusOne-vec.ll -O2 -target aarch64 -mcpu=cortex-a57 -S -o - | llvm-mca -mtriple=aarch64 -mcpu=cortex-a57
before
Iterations: 100
Instructions: 400
Total Cycles: 903
Total uOps: 400
Dispatch Width: 3
uOps Per Cycle: 0.44
IPC: 0.44
Block RThroughput: 1.5
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 3 0.50 sub v1.4s, v1.4s, v0.4s
1 3 0.50 sshr v1.4s, v1.4s, #31
1 3 0.50 orr v0.16b, v1.16b, v0.16b
1 1 1.00 U ret
After
Iterations: 100
Instructions: 300
Total Cycles: 603
Total uOps: 300
Dispatch Width: 3
uOps Per Cycle: 0.50
IPC: 0.50
Block RThroughput: 1.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 3 0.50 cmgt v1.4s, v0.4s, v1.4s
1 3 0.50 orr v0.16b, v0.16b, v1.16b
1 1 1.00 U ret
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D67800/new/
https://reviews.llvm.org/D67800
More information about the llvm-commits
mailing list