[PATCH] D67800: [InstCombine] Fold a shifty implementation of clamp positive to allOnesValue.

Mon Sep 23 00:49:44 PDT 2019

huihuiz added a comment.

llvm-mca performance results for the general fold:

--- Scalar Tests ---

Test input

  define i32 @clamp255(i32 %v, i32 %x) {
    %sub = sub nsw i32 %x, %v
    %shr = ashr i32 %sub, 31
    %or = or i32 %shr, %v
    ret i32 %or
  }

X86 skylake: cmov latency 1
clang clampPositiveToMinusOne.ll -O2 -target x86_64 -march=skylake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake
Before

  Iterations:        100
  Instructions:      500
  Total Cycles:      159
  Total uOps:        700

  Dispatch Width:    6
  uOps Per Cycle:    4.40
  IPC:               3.14
  Block RThroughput: 1.2

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.25                        movl  %esi, %eax
   1      1     0.25                        subl  %edi, %eax
   1      1     0.50                        sarl  $31, %eax
   1      1     0.25                        orl   %edi, %eax
   3      7     1.00                  U     retq

After

  Iterations:        100
  Instructions:      400
  Total Cycles:      135
  Total uOps:        600

  Dispatch Width:    6
  uOps Per Cycle:    4.44
  IPC:               2.96
  Block RThroughput: 1.0

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.25                        cmpl  %esi, %edi
   1      1     0.25                        movl  $-1, %eax
   1      1     0.50                        cmovlel       %edi, %eax
   3      7     1.00                  U     retq

AMD znver2
clang clampPositiveToMinusOne.ll -O2 -target x86_64 -march=znver2 -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2
Before

  Iterations:        100
  Instructions:      500
  Total Cycles:      155
  Total uOps:        600

  Dispatch Width:    4
  uOps Per Cycle:    3.87
  IPC:               3.23
  Block RThroughput: 1.5

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.25                        movl  %esi, %eax
   1      1     0.25                        subl  %edi, %eax
   1      1     0.25                        sarl  $31, %eax
   1      1     0.25                        orl   %edi, %eax
   2      1     0.50                  U     retq

After

  Iterations:        100
  Instructions:      400
  Total Cycles:      137
  Total uOps:        500

  Dispatch Width:    4
  uOps Per Cycle:    3.65
  IPC:               2.92
  Block RThroughput: 1.3

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.25                        cmpl  %esi, %edi
   1      1     0.25                        movl  $-1, %eax
   1      1     0.25                        cmovlel       %edi, %eax
   2      1     0.50                  U     retq

AArch64 cortex-a57
clang clampPositiveToMinusOne.ll -O2 -target aarch64 -mcpu=cortex-a57 -S -o - | llvm-mca -mtriple=aarch64 -mcpu=cortex-a57    
Before

  Iterations:        100
  Instructions:      300
  Total Cycles:      303
  Total uOps:        300

  Dispatch Width:    3
  uOps Per Cycle:    0.99
  IPC:               0.99
  Block RThroughput: 1.0

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.50                        sub   w8, w1, w0
   1      2     1.00                        orr   w0, w0, w8, asr #31
   1      1     1.00                  U     ret

After

  Iterations:        100
  Instructions:      300
  Total Cycles:      203
  Total uOps:        300

  Dispatch Width:    3
  uOps Per Cycle:    1.48
  IPC:               1.48
  Block RThroughput: 1.0

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.50                        cmp   w0, w1
   1      1     0.50                        csinv w0, w0, wzr, le
   1      1     1.00                  U     ret

--- Vector Tests ---

Input

  define <4 x i32> @clamp255(<4 x i32> %v, <4 x i32> %x) {
    %sub = sub nsw <4 x i32> %x, %v
    %shr = ashr <4 x i32> %sub, <i32 31, i32 31, i32 31, i32 31>
    %or = or <4 x i32> %shr, %v
    ret <4 x i32> %or
  }

X86 skylake
clang clampPositiveToMinusOne-vec.ll -O2 -target x86_64 -march=skylake -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake

Before

  Iterations:        100
  Instructions:      400
  Total Cycles:      303
  Total uOps:        600

  Dispatch Width:    6
  uOps Per Cycle:    1.98
  IPC:               1.32
  Block RThroughput: 1.0

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.33                        vpsubd        %xmm0, %xmm1, %xmm1
   1      1     0.50                        vpsrad        $31, %xmm1, %xmm1
   1      1     0.33                        vpor  %xmm0, %xmm1, %xmm0
   3      7     1.00                  U     retq

After

  Iterations:        100
  Instructions:      300
  Total Cycles:      203
  Total uOps:        500

  Dispatch Width:    6
  uOps Per Cycle:    2.46
  IPC:               1.48
  Block RThroughput: 1.0

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.50                        vpcmpgtd      %xmm1, %xmm0, %xmm1
   1      1     0.33                        vpor  %xmm0, %xmm1, %xmm0
   3      7     1.00                  U     retq

AMD znver2
clang clampPositiveToMinusOne-vec.ll -O2 -target x86_64 -march=znver2 -S -o - | llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2
Before

  Iterations:        100
  Instructions:      400
  Total Cycles:      303
  Total uOps:        500

  Dispatch Width:    4
  uOps Per Cycle:    1.65
  IPC:               1.32
  Block RThroughput: 1.3

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.25                        vpsubd        %xmm0, %xmm1, %xmm1
   1      1     0.25                        vpsrad        $31, %xmm1, %xmm1
   1      1     0.25                        vpor  %xmm0, %xmm1, %xmm0
   2      1     0.50                  U     retq

After

  Iterations:        100
  Instructions:      300
  Total Cycles:      203
  Total uOps:        400

  Dispatch Width:    4
  uOps Per Cycle:    1.97
  IPC:               1.48
  Block RThroughput: 1.0

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      1     0.25                        vpcmpgtd      %xmm1, %xmm0, %xmm1
   1      1     0.25                        vpor  %xmm0, %xmm1, %xmm0
   2      1     0.50                  U     retq

AArch64 cortex-a57
clang clampPositiveToMinusOne-vec.ll -O2 -target aarch64 -mcpu=cortex-a57 -S -o - | llvm-mca -mtriple=aarch64 -mcpu=cortex-a57
Before

  Iterations:        100
  Instructions:      400
  Total Cycles:      903
  Total uOps:        400

  Dispatch Width:    3
  uOps Per Cycle:    0.44
  IPC:               0.44
  Block RThroughput: 1.5

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      3     0.50                        sub   v1.4s, v1.4s, v0.4s
   1      3     0.50                        sshr  v1.4s, v1.4s, #31
   1      3     0.50                        orr   v0.16b, v1.16b, v0.16b
   1      1     1.00                  U     ret

After

  Iterations:        100
  Instructions:      300
  Total Cycles:      603
  Total uOps:        300

  Dispatch Width:    3
  uOps Per Cycle:    0.50
  IPC:               0.50
  Block RThroughput: 1.0

  Instruction Info:
  [1]: #uOps
  [2]: Latency
  [3]: RThroughput
  [4]: MayLoad
  [5]: MayStore
  [6]: HasSideEffects (U)

  [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
   1      3     0.50                        cmgt  v1.4s, v0.4s, v1.4s
   1      3     0.50                        orr   v0.16b, v0.16b, v1.16b
   1      1     1.00                  U     ret

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D67800/new/

https://reviews.llvm.org/D67800