[llvm] 146d35b - [ARM] CSEL generation

David Green via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 16 03:12:12 PDT 2020


Author: David Green
Date: 2020-07-16T11:10:53+01:00
New Revision: 146d35b6eeb5b360217d2f14a18c87b1a0aca77e

URL: https://github.com/llvm/llvm-project/commit/146d35b6eeb5b360217d2f14a18c87b1a0aca77e
DIFF: https://github.com/llvm/llvm-project/commit/146d35b6eeb5b360217d2f14a18c87b1a0aca77e.diff

LOG: [ARM] CSEL generation

This adds a peephole optimisation to turn a t2MOVccr that could not be
folded into any other instruction into a CSEL on 8.1-m. The t2MOVccr
would usually be expanded into a conditional mov, that becomes an IT;
MOV pair. We can instead generate a CSEL instruction, which can
potentially be smaller and allows better register allocation freedom,
which can help reduce codesize. Performance is more variable and may
depend on the microarchitecture details, but initial results look good.
If we need to control this per-cpu, we can add a subtarget feature as we
need it.

Original patch by David Penry.

Differential Revision: https://reviews.llvm.org/D83566

Added: 
    

Modified: 
    llvm/lib/Target/ARM/ARMInstrThumb2.td
    llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
    llvm/lib/Target/ARM/Thumb2InstrInfo.h
    llvm/test/CodeGen/Thumb2/csel.ll
    llvm/test/CodeGen/Thumb2/float-ops.ll
    llvm/test/CodeGen/Thumb2/mve-abs.ll
    llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
    llvm/test/CodeGen/Thumb2/mve-vmaxv.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 7137e8ee66b8..d5143adaac17 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -5446,6 +5446,7 @@ class CS<string iname, bits<4> opcode, list<dag> pattern=[]>
   let Inst{3-0} = Rm{3-0};
 
   let Uses = [CPSR];
+  let hasSideEffects = 0;
 }
 
 def t2CSEL  : CS<"csel",  0b1000>;

diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index 48c6b47f2154..6ada546e5f48 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -12,6 +12,7 @@
 
 #include "Thumb2InstrInfo.h"
 #include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -118,6 +119,31 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
   return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL;
 }
 
+MachineInstr *
+Thumb2InstrInfo::optimizeSelect(MachineInstr &MI,
+                                SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                                bool PreferFalse) const {
+  // Try to use the base optimizeSelect, which uses canFoldIntoMOVCC to fold the
+  // MOVCC into another instruction. If that fails on 8.1-M fall back to using a
+  // CSEL.
+  MachineInstr *RV = ARMBaseInstrInfo::optimizeSelect(MI, SeenMIs, PreferFalse);
+  if (!RV && getSubtarget().hasV8_1MMainlineOps()) {
+    Register DestReg = MI.getOperand(0).getReg();
+
+    if (!DestReg.isVirtual())
+      return nullptr;
+
+    MachineInstrBuilder NewMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+                                        get(ARM::t2CSEL), DestReg)
+                                    .add(MI.getOperand(2))
+                                    .add(MI.getOperand(1))
+                                    .add(MI.getOperand(3));
+    SeenMIs.insert(NewMI);
+    return NewMI;
+  }
+  return RV;
+}
+
 void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I,
                                   const DebugLoc &DL, MCRegister DestReg,

diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index ec3763632239..e31c49a38959 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -60,6 +60,10 @@ class Thumb2InstrInfo : public ARMBaseInstrInfo {
   ///
   const ThumbRegisterInfo &getRegisterInfo() const override { return RI; }
 
+  MachineInstr *optimizeSelect(MachineInstr &MI,
+                               SmallPtrSetImpl<MachineInstr *> &SeenMIs,
+                               bool) const override;
+
 private:
   void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
 };

diff --git a/llvm/test/CodeGen/Thumb2/csel.ll b/llvm/test/CodeGen/Thumb2/csel.ll
index f2cf3e839a80..5a56fb6f692d 100644
--- a/llvm/test/CodeGen/Thumb2/csel.ll
+++ b/llvm/test/CodeGen/Thumb2/csel.ll
@@ -107,9 +107,7 @@ define i32 @csel_var(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: csel_var:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    cmp r0, #45
-; CHECK-NEXT:    it le
-; CHECK-NEXT:    movle r1, r2
-; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    csel r0, r1, r2, gt
 ; CHECK-NEXT:    bx lr
 entry:
   %cmp = icmp sgt i32 %a, 45

diff --git a/llvm/test/CodeGen/Thumb2/float-ops.ll b/llvm/test/CodeGen/Thumb2/float-ops.ll
index fdd1b659d007..709bd49f2286 100644
--- a/llvm/test/CodeGen/Thumb2/float-ops.ll
+++ b/llvm/test/CodeGen/Thumb2/float-ops.ll
@@ -278,8 +278,10 @@ define double @select_d(double %a, double %b, i1 %c) {
 ; CHECK-LABEL: select_d:
 ; NONE: ldr{{(.w)?}}     [[REG:r[0-9]+]], [sp]
 ; NONE: ands    [[REG]], [[REG]], #1
-; NONE-DAG: moveq   r0, r2
-; NONE-DAG: moveq   r1, r3
+; NOREGS-DAG: moveq   r0, r2
+; NOREGS-DAG: moveq   r1, r3
+; ONLYREGS-DAG: csel   r0, r0, r2
+; ONLYREGS-DAG: csel   r1, r1, r3
 ; SP: ands r0, r0, #1
 ; SP-DAG: vmov [[ALO:r[0-9]+]], [[AHI:r[0-9]+]], d0
 ; SP-DAG: vmov [[BLO:r[0-9]+]], [[BHI:r[0-9]+]], d1

diff --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll
index 29878063a8ca..0b5dcbced1a5 100644
--- a/llvm/test/CodeGen/Thumb2/mve-abs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll
@@ -42,33 +42,30 @@ define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) {
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    .save {r7, lr}
 ; CHECK-NEXT:    push {r7, lr}
-; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vmov r1, s0
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    vmov lr, s4
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    rsbs.w r3, lr, #0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    rsbs.w lr, r1, #0
 ; CHECK-NEXT:    sbc.w r2, r12, r0
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r1, mi
-; CHECK-NEXT:    ands r1, r1, #1
-; CHECK-NEXT:    itt eq
-; CHECK-NEXT:    moveq r2, r0
-; CHECK-NEXT:    moveq r3, lr
-; CHECK-NEXT:    vmov lr, s6
-; CHECK-NEXT:    vmov.32 q0[0], r3
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.32 q0[1], r2
-; CHECK-NEXT:    rsbs.w r2, lr, #0
-; CHECK-NEXT:    sbc.w r3, r12, r0
+; CHECK-NEXT:    cset r3, mi
+; CHECK-NEXT:    ands r3, r3, #1
+; CHECK-NEXT:    csel r1, lr, r1, ne
+; CHECK-NEXT:    csel r0, r2, r0, ne
+; CHECK-NEXT:    vmov.32 q1[0], r1
+; CHECK-NEXT:    vmov r1, s2
+; CHECK-NEXT:    vmov.32 q1[1], r0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    rsbs r2, r1, #0
+; CHECK-NEXT:    sbc.w r12, r12, r0
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    cset r1, mi
-; CHECK-NEXT:    ands r1, r1, #1
-; CHECK-NEXT:    it eq
-; CHECK-NEXT:    moveq r2, lr
-; CHECK-NEXT:    vmov.32 q0[2], r2
-; CHECK-NEXT:    it eq
-; CHECK-NEXT:    moveq r3, r0
-; CHECK-NEXT:    vmov.32 q0[3], r3
+; CHECK-NEXT:    cset r3, mi
+; CHECK-NEXT:    ands r3, r3, #1
+; CHECK-NEXT:    csel r1, r2, r1, ne
+; CHECK-NEXT:    csel r0, r12, r0, ne
+; CHECK-NEXT:    vmov.32 q1[2], r1
+; CHECK-NEXT:    vmov.32 q1[3], r0
+; CHECK-NEXT:    vmov q0, q1
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
   %0 = icmp slt <2 x i64> %s1, zeroinitializer

diff --git a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
index 291c13543d14..9897b607d6b3 100644
--- a/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll
@@ -125,14 +125,12 @@ define arm_aapcs_vfpcc void @ssatmul_s_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    movlt r5, #1
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    itt eq
-; CHECK-NEXT:    moveq r3, r0
-; CHECK-NEXT:    moveq r4, r1
+; CHECK-NEXT:    csel r4, r4, r1, ne
+; CHECK-NEXT:    csel r3, r3, r0, ne
 ; CHECK-NEXT:    subs r5, r4, r2
 ; CHECK-NEXT:    sbcs r3, r3, #0
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r4, r2
-; CHECK-NEXT:    str r4, [r11], #4
+; CHECK-NEXT:    csel r3, r4, r2, lt
+; CHECK-NEXT:    str r3, [r11], #4
 ; CHECK-NEXT:    le lr, .LBB0_7
 ; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
@@ -406,22 +404,20 @@ define arm_aapcs_vfpcc void @ssatmul_4_q31(i32* nocapture readonly %pSrcA, i32*
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r2, [r12], #4
 ; CHECK-NEXT:    ldr r4, [r10], #4
-; CHECK-NEXT:    smull r4, r5, r4, r2
-; CHECK-NEXT:    asrl r4, r5, #31
-; CHECK-NEXT:    subs r2, r1, r4
-; CHECK-NEXT:    sbcs.w r2, r0, r5
-; CHECK-NEXT:    mov.w r2, #0
+; CHECK-NEXT:    smull r2, r5, r4, r2
+; CHECK-NEXT:    asrl r2, r5, #31
+; CHECK-NEXT:    subs r4, r1, r2
+; CHECK-NEXT:    sbcs.w r4, r0, r5
+; CHECK-NEXT:    mov.w r4, #0
 ; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r2, #1
-; CHECK-NEXT:    cmp r2, #0
-; CHECK-NEXT:    itt eq
-; CHECK-NEXT:    moveq r5, r0
-; CHECK-NEXT:    moveq r4, r1
-; CHECK-NEXT:    subs r2, r4, r3
-; CHECK-NEXT:    sbcs r2, r5, #0
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r4, r3
-; CHECK-NEXT:    str r4, [r11], #4
+; CHECK-NEXT:    movlt r4, #1
+; CHECK-NEXT:    cmp r4, #0
+; CHECK-NEXT:    csel r2, r2, r1, ne
+; CHECK-NEXT:    csel r4, r5, r0, ne
+; CHECK-NEXT:    subs r5, r2, r3
+; CHECK-NEXT:    sbcs r4, r4, #0
+; CHECK-NEXT:    csel r2, r2, r3, lt
+; CHECK-NEXT:    str r2, [r11], #4
 ; CHECK-NEXT:    le lr, .LBB1_7
 ; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    add sp, #8
@@ -1158,9 +1154,8 @@ define arm_aapcs_vfpcc void @ssatmul_4_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    asrlt r3, r2, #15
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r3, r1
-; CHECK-NEXT:    strh r3, [r4], #2
+; CHECK-NEXT:    csel r2, r3, r1, lt
+; CHECK-NEXT:    strh r2, [r4], #2
 ; CHECK-NEXT:    le lr, .LBB5_7
 ; CHECK-NEXT:  .LBB5_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
@@ -1300,9 +1295,8 @@ define arm_aapcs_vfpcc void @ssatmul_8_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    asrlt r3, r2, #15
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r3, r1
-; CHECK-NEXT:    strh r3, [r4], #2
+; CHECK-NEXT:    csel r2, r3, r1, lt
+; CHECK-NEXT:    strh r2, [r4], #2
 ; CHECK-NEXT:    le lr, .LBB6_7
 ; CHECK-NEXT:  .LBB6_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}
@@ -1439,9 +1433,8 @@ define arm_aapcs_vfpcc void @ssatmul_8i_q15(i16* nocapture readonly %pSrcA, i16*
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    asrlt r3, r2, #15
 ; CHECK-NEXT:    cmp r3, r1
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r3, r1
-; CHECK-NEXT:    strh r3, [r4], #2
+; CHECK-NEXT:    csel r2, r3, r1, lt
+; CHECK-NEXT:    strh r2, [r4], #2
 ; CHECK-NEXT:    le lr, .LBB7_7
 ; CHECK-NEXT:  .LBB7_8: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, r5, r6, pc}

diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
index 29e441e3e90c..0d22a7f3cd99 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll
@@ -732,8 +732,7 @@ define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
 ; CHECK-NEXT:    cmp r2, r1
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r2, r1
+; CHECK-NEXT:    csel r2, r2, r1, lt
 ; CHECK-NEXT:    le lr, .LBB7_8
 ; CHECK-NEXT:  .LBB7_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
@@ -819,8 +818,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    mvn r4, #-2147483648
 ; CHECK-NEXT:    vminv.s32 r4, q0
 ; CHECK-NEXT:    cmp r0, r4
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r0, r4
+; CHECK-NEXT:    csel r0, r0, r4, lt
 ; CHECK-NEXT:    le lr, .LBB8_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
 ; CHECK-NEXT:    cmp r3, r1
@@ -834,8 +832,7 @@ define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r2, [r1], #4
 ; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    it ge
-; CHECK-NEXT:    movge r0, r2
+; CHECK-NEXT:    csel r0, r0, r2, lt
 ; CHECK-NEXT:    le lr, .LBB8_8
 ; CHECK-NEXT:  .LBB8_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -933,8 +930,7 @@ define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
 ; CHECK-NEXT:    cmp r2, r1
-; CHECK-NEXT:    it le
-; CHECK-NEXT:    movle r2, r1
+; CHECK-NEXT:    csel r2, r2, r1, gt
 ; CHECK-NEXT:    le lr, .LBB9_8
 ; CHECK-NEXT:  .LBB9_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
@@ -1020,8 +1016,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    mov.w r4, #-2147483648
 ; CHECK-NEXT:    vmaxv.s32 r4, q0
 ; CHECK-NEXT:    cmp r0, r4
-; CHECK-NEXT:    it le
-; CHECK-NEXT:    movle r0, r4
+; CHECK-NEXT:    csel r0, r0, r4, gt
 ; CHECK-NEXT:    le lr, .LBB10_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
 ; CHECK-NEXT:    cmp r3, r1
@@ -1035,8 +1030,7 @@ define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r2, [r1], #4
 ; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    it le
-; CHECK-NEXT:    movle r0, r2
+; CHECK-NEXT:    csel r0, r0, r2, gt
 ; CHECK-NEXT:    le lr, .LBB10_8
 ; CHECK-NEXT:  .LBB10_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -1134,8 +1128,7 @@ define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
 ; CHECK-NEXT:    cmp r2, r1
-; CHECK-NEXT:    it hs
-; CHECK-NEXT:    movhs r2, r1
+; CHECK-NEXT:    csel r2, r2, r1, lo
 ; CHECK-NEXT:    le lr, .LBB11_8
 ; CHECK-NEXT:  .LBB11_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
@@ -1221,8 +1214,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    mov.w r4, #-1
 ; CHECK-NEXT:    vminv.u32 r4, q0
 ; CHECK-NEXT:    cmp r0, r4
-; CHECK-NEXT:    it hs
-; CHECK-NEXT:    movhs r0, r4
+; CHECK-NEXT:    csel r0, r0, r4, lo
 ; CHECK-NEXT:    le lr, .LBB12_5
 ; CHECK-NEXT:  @ %bb.6: @ %middle.block
 ; CHECK-NEXT:    cmp r3, r1
@@ -1236,8 +1228,7 @@ define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r2, [r1], #4
 ; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    it ls
-; CHECK-NEXT:    movls r0, r2
+; CHECK-NEXT:    csel r0, r0, r2, hi
 ; CHECK-NEXT:    le lr, .LBB12_8
 ; CHECK-NEXT:  .LBB12_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}
@@ -1335,8 +1326,7 @@ define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r1, [r0], #4
 ; CHECK-NEXT:    cmp r2, r1
-; CHECK-NEXT:    it ls
-; CHECK-NEXT:    movls r2, r1
+; CHECK-NEXT:    csel r2, r2, r1, hi
 ; CHECK-NEXT:    le lr, .LBB13_8
 ; CHECK-NEXT:  .LBB13_9: @ %for.cond.cleanup
 ; CHECK-NEXT:    mov r0, r2
@@ -1418,8 +1408,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    vmaxv.u32 r4, q0
 ; CHECK-NEXT:    cmp r0, r4
-; CHECK-NEXT:    it ls
-; CHECK-NEXT:    movls r0, r4
+; CHECK-NEXT:    csel r0, r0, r4, hi
 ; CHECK-NEXT:    le lr, .LBB14_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    cmp r3, r1
@@ -1433,8 +1422,7 @@ define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldr r2, [r1], #4
 ; CHECK-NEXT:    cmp r0, r2
-; CHECK-NEXT:    it ls
-; CHECK-NEXT:    movls r0, r2
+; CHECK-NEXT:    csel r0, r0, r2, hi
 ; CHECK-NEXT:    le lr, .LBB14_6
 ; CHECK-NEXT:  @ %bb.7: @ %for.cond.cleanup
 ; CHECK-NEXT:    pop {r4, pc}

diff --git a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
index 36c201cced56..eca5f44904a1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmaxv.ll
@@ -145,8 +145,7 @@ define arm_aapcs_vfpcc i8 @vmaxv_s_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK-NEXT:    vmaxv.s8 r1, q0
 ; CHECK-NEXT:    sxtb r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, gt
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1)
   %c = icmp sgt i8 %r, %s2
@@ -161,8 +160,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    vmaxv.s8 r1, q0
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, gt
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %s1)
   %rs = sext i8 %r to i32
@@ -180,8 +178,7 @@ define arm_aapcs_vfpcc i16 @vmaxv_s_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK-NEXT:    vmaxv.s16 r1, q0
 ; CHECK-NEXT:    sxth r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, gt
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1)
   %c = icmp sgt i16 %r, %s2
@@ -197,8 +194,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    vmaxv.s16 r1, q0
 ; CHECK-NEXT:    sxth r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, gt
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %s1)
   %rs = sext i16 %r to i32
@@ -213,8 +209,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_s_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK-NEXT:    mov.w r1, #-2147483648
 ; CHECK-NEXT:    vmaxv.s32 r1, q0
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it gt
-; CHECK-NEXT:    movgt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, gt
 ; CHECK-NEXT:    bx lr
   %r = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %s1)
   %c = icmp sgt i32 %r, %s2
@@ -230,8 +225,7 @@ define arm_aapcs_vfpcc i8 @vmaxv_u_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK-NEXT:    vmaxv.u8 r1, q0
 ; CHECK-NEXT:    uxtb r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it hi
-; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, hi
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1)
   %c = icmp ugt i8 %r, %s2
@@ -246,8 +240,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    vmaxv.u8 r1, q0
 ; CHECK-NEXT:    uxtb r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it hi
-; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, hi
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %s1)
   %rs = zext i8 %r to i32
@@ -264,8 +257,7 @@ define arm_aapcs_vfpcc i16 @vmaxv_u_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK-NEXT:    vmaxv.u16 r1, q0
 ; CHECK-NEXT:    uxth r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it hi
-; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, hi
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1)
   %c = icmp ugt i16 %r, %s2
@@ -280,8 +272,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    vmaxv.u16 r1, q0
 ; CHECK-NEXT:    uxth r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it hi
-; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, hi
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %s1)
   %rs = zext i16 %r to i32
@@ -296,8 +287,7 @@ define arm_aapcs_vfpcc i32 @vmaxv_u_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK-NEXT:    movs r1, #0
 ; CHECK-NEXT:    vmaxv.u32 r1, q0
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it hi
-; CHECK-NEXT:    movhi r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, hi
 ; CHECK-NEXT:    bx lr
   %r = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %s1)
   %c = icmp ugt i32 %r, %s2
@@ -313,8 +303,7 @@ define arm_aapcs_vfpcc i8 @vminv_s_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK-NEXT:    vminv.s8 r1, q0
 ; CHECK-NEXT:    sxtb r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lt
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1)
   %c = icmp slt i8 %r, %s2
@@ -329,8 +318,7 @@ define arm_aapcs_vfpcc i32 @vminv_s_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    vminv.s8 r1, q0
 ; CHECK-NEXT:    sxtb r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lt
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %s1)
   %rs = sext i8 %r to i32
@@ -347,8 +335,7 @@ define arm_aapcs_vfpcc i16 @vminv_s_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK-NEXT:    vminv.s16 r1, q0
 ; CHECK-NEXT:    sxth r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lt
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1)
   %c = icmp slt i16 %r, %s2
@@ -363,8 +350,7 @@ define arm_aapcs_vfpcc i32 @vminv_s_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    vminv.s16 r1, q0
 ; CHECK-NEXT:    sxth r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lt
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %s1)
   %rs = sext i16 %r to i32
@@ -379,8 +365,7 @@ define arm_aapcs_vfpcc i32 @vminv_s_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK-NEXT:    mvn r1, #-2147483648
 ; CHECK-NEXT:    vminv.s32 r1, q0
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it lt
-; CHECK-NEXT:    movlt r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lt
 ; CHECK-NEXT:    bx lr
   %r = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %s1)
   %c = icmp slt i32 %r, %s2
@@ -396,8 +381,7 @@ define arm_aapcs_vfpcc i8 @vminv_u_v16i8_i8(<16 x i8> %s1, i8 %s2) {
 ; CHECK-NEXT:    vminv.u8 r1, q0
 ; CHECK-NEXT:    uxtb r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lo
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1)
   %c = icmp ult i8 %r, %s2
@@ -412,8 +396,7 @@ define arm_aapcs_vfpcc i32 @vminv_u_v16i8_i32(<16 x i8> %s1, i32 %s2) {
 ; CHECK-NEXT:    vminv.u8 r1, q0
 ; CHECK-NEXT:    uxtb r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lo
 ; CHECK-NEXT:    bx lr
   %r = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %s1)
   %rs = zext i8 %r to i32
@@ -430,8 +413,7 @@ define arm_aapcs_vfpcc i16 @vminv_u_v8i16_i16(<8 x i16> %s1, i16 %s2) {
 ; CHECK-NEXT:    vminv.u16 r1, q0
 ; CHECK-NEXT:    uxth r2, r1
 ; CHECK-NEXT:    cmp r2, r3
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lo
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1)
   %c = icmp ult i16 %r, %s2
@@ -446,8 +428,7 @@ define arm_aapcs_vfpcc i32 @vminv_u_v8i16_i32(<8 x i16> %s1, i32 %s2) {
 ; CHECK-NEXT:    vminv.u16 r1, q0
 ; CHECK-NEXT:    uxth r1, r1
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lo
 ; CHECK-NEXT:    bx lr
   %r = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %s1)
   %rs = zext i16 %r to i32
@@ -462,8 +443,7 @@ define arm_aapcs_vfpcc i32 @vminv_u_v4i32_i32(<4 x i32> %s1, i32 %s2) {
 ; CHECK-NEXT:    mov.w r1, #-1
 ; CHECK-NEXT:    vminv.u32 r1, q0
 ; CHECK-NEXT:    cmp r1, r0
-; CHECK-NEXT:    it lo
-; CHECK-NEXT:    movlo r0, r1
+; CHECK-NEXT:    csel r0, r1, r0, lo
 ; CHECK-NEXT:    bx lr
   %r = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %s1)
   %c = icmp ult i32 %r, %s2


        


More information about the llvm-commits mailing list