[llvm] [CodeGen] Avoid sinking vector comparisons during CodeGenPrepare (PR #113158)

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 5 03:10:07 PST 2024


https://github.com/david-arm updated https://github.com/llvm/llvm-project/pull/113158

>From 608fee8f5ae8a01ef3f04ad5068b93102886e19e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 5 Nov 2024 11:07:32 +0000
Subject: [PATCH 1/2] Add tests

---
 .../CodeGen/AArch64/no-sink-vector-cmp.ll     | 153 ++++++++++++++++++
 .../CodeGen/Thumb2/mve-sink-vector-cmp.ll     | 145 +++++++++++++++++
 2 files changed, 298 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-sink-vector-cmp.ll

diff --git a/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
new file mode 100644
index 00000000000000..78f4d4d195c1bd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
+; CHECK-LABEL: no_sink_simple:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    tbz w0, #0, .LBB0_2
+; CHECK-NEXT:  // %bb.1: // %s
+; CHECK-NEXT:    cmgt v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    str q0, [x1]
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %t
+; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    ret
+  %d = icmp slt <4 x i32> %a, %b
+  br i1 %c, label %s, label %t
+
+s:
+  %s1 = select <4 x i1> %d, <4 x i32> %a, <4 x i32> zeroinitializer
+  store <4 x i32> %s1, ptr %p
+  ret <4 x i32> %s1
+
+t:
+  %s2 = select <4 x i1> %d, <4 x i32> %b, <4 x i32> zeroinitializer
+  ret <4 x i32> %s2
+}
+
+define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    mov w10, #4 // =0x4
+; CHECK-NEXT:    adrp x9, .LCPI1_0
+; CHECK-NEXT:    adrp x11, .LCPI1_1
+; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    dup v1.2d, x10
+; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI1_0]
+; CHECK-NEXT:    ldr q3, [x11, :lo12:.LCPI1_1]
+; CHECK-NEXT:    add x9, x0, #8
+; CHECK-NEXT:    mov w10, #16 // =0x10
+; CHECK-NEXT:    mov w11, #1 // =0x1
+; CHECK-NEXT:    b .LBB1_2
+; CHECK-NEXT:  .LBB1_1: // %pred.store.continue18
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    add v2.2d, v2.2d, v1.2d
+; CHECK-NEXT:    add v3.2d, v3.2d, v1.2d
+; CHECK-NEXT:    subs x10, x10, #4
+; CHECK-NEXT:    add x9, x9, #16
+; CHECK-NEXT:    b.eq .LBB1_10
+; CHECK-NEXT:  .LBB1_2: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmhi v4.2d, v0.2d, v3.2d
+; CHECK-NEXT:    xtn v4.2s, v4.2d
+; CHECK-NEXT:    uzp1 v4.4h, v4.4h, v0.4h
+; CHECK-NEXT:    umov w12, v4.h[0]
+; CHECK-NEXT:    tbz w12, #0, .LBB1_4
+; CHECK-NEXT:  // %bb.3: // %pred.store.if
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    stur w11, [x9, #-8]
+; CHECK-NEXT:  .LBB1_4: // %pred.store.continue
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    dup v4.2d, x8
+; CHECK-NEXT:    cmhi v4.2d, v4.2d, v3.2d
+; CHECK-NEXT:    xtn v4.2s, v4.2d
+; CHECK-NEXT:    uzp1 v4.4h, v4.4h, v0.4h
+; CHECK-NEXT:    umov w12, v4.h[1]
+; CHECK-NEXT:    tbz w12, #0, .LBB1_6
+; CHECK-NEXT:  // %bb.5: // %pred.store.if5
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    stur w11, [x9, #-4]
+; CHECK-NEXT:  .LBB1_6: // %pred.store.continue6
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    dup v4.2d, x8
+; CHECK-NEXT:    cmhi v4.2d, v4.2d, v2.2d
+; CHECK-NEXT:    xtn v4.2s, v4.2d
+; CHECK-NEXT:    uzp1 v4.4h, v0.4h, v4.4h
+; CHECK-NEXT:    umov w12, v4.h[2]
+; CHECK-NEXT:    tbz w12, #0, .LBB1_8
+; CHECK-NEXT:  // %bb.7: // %pred.store.if7
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    str w11, [x9]
+; CHECK-NEXT:  .LBB1_8: // %pred.store.continue8
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    dup v4.2d, x8
+; CHECK-NEXT:    cmhi v4.2d, v4.2d, v2.2d
+; CHECK-NEXT:    xtn v4.2s, v4.2d
+; CHECK-NEXT:    uzp1 v4.4h, v0.4h, v4.4h
+; CHECK-NEXT:    umov w12, v4.h[3]
+; CHECK-NEXT:    tbz w12, #0, .LBB1_1
+; CHECK-NEXT:  // %bb.9: // %pred.store.if9
+; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; CHECK-NEXT:    str w11, [x9, #4]
+; CHECK-NEXT:    b .LBB1_1
+; CHECK-NEXT:  .LBB1_10: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+  %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+  %0 = icmp ult <4 x i64> %vec.ind, <i64 15, i64 15, i64 15, i64 15>
+  %1 = extractelement <4 x i1> %0, i64 0
+  br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+  %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 1, ptr %2, align 4
+  br label %pred.store.continue
+
+pred.store.continue:
+  %3 = extractelement <4 x i1> %0, i64 1
+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1
+  %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:
+  %6 = extractelement <4 x i1> %0, i64 2
+  br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+  %7 = or disjoint i64 %index, 2
+  %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+  store i32 1, ptr %8, align 4
+  br label %pred.store.continue8
+
+pred.store.continue8:
+  %9 = extractelement <4 x i1> %0, i64 3
+  br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+  %10 = or disjoint i64 %index, 3
+  %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+  store i32 1, ptr %11, align 4
+  br label %pred.store.continue18
+
+pred.store.continue18:
+  %index.next = add i64 %index, 4
+  %vec.ind.next = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
+  %24 = icmp eq i64 %index.next, 16
+  br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-sink-vector-cmp.ll b/llvm/test/CodeGen/Thumb2/mve-sink-vector-cmp.ll
new file mode 100644
index 00000000000000..ca049763c5d2f2
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-sink-vector-cmp.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve < %s | FileCheck %s
+
+define arm_aapcs_vfpcc <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
+; CHECK-LABEL: no_sink_simple:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    lsls r0, r0, #31
+; CHECK-NEXT:    beq .LBB0_2
+; CHECK-NEXT:  @ %bb.1: @ %s
+; CHECK-NEXT:    vcmp.s32 gt, q1, q0
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vpsel q0, q0, q1
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+; CHECK-NEXT:  .LBB0_2: @ %t
+; CHECK-NEXT:    vcmp.s32 gt, q1, q0
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    bx lr
+  %d = icmp slt <4 x i32> %a, %b
+  br i1 %c, label %s, label %t
+
+s:
+  %s1 = select <4 x i1> %d, <4 x i32> %a, <4 x i32> zeroinitializer
+  store <4 x i32> %s1, ptr %p
+  ret <4 x i32> %s1
+
+t:
+  %s2 = select <4 x i1> %d, <4 x i32> %b, <4 x i32> zeroinitializer
+  ret <4 x i32> %s2
+}
+
+define arm_aapcs_vfpcc void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
+; CHECK-LABEL: vector_loop_with_icmp:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    adr r1, .LCPI1_0
+; CHECK-NEXT:    vmov.i32 q1, #0xf
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    mov.w lr, #1
+; CHECK-NEXT:    vmov.i32 q2, #0xf
+; CHECK-NEXT:    vmov.i32 q3, #0xf
+; CHECK-NEXT:    vmov.i32 q4, #0xf
+; CHECK-NEXT:    mov.w r12, #4
+; CHECK-NEXT:    movs r3, #0
+; CHECK-NEXT:  .LBB1_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vcmp.u32 hi, q1, q0
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    vcmp.u32 hi, q2, q0
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    it ne
+; CHECK-NEXT:    strne.w lr, [r0, r1, lsl #2]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    vcmp.u32 hi, q3, q0
+; CHECK-NEXT:    lsrs r2, r2, #4
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    orrne r2, r1, #1
+; CHECK-NEXT:    strne.w lr, [r0, r2, lsl #2]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    vcmp.u32 hi, q4, q0
+; CHECK-NEXT:    vadd.i32 q0, q0, r12
+; CHECK-NEXT:    lsrs r2, r2, #8
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    orrne r2, r1, #2
+; CHECK-NEXT:    strne.w lr, [r0, r2, lsl #2]
+; CHECK-NEXT:    vmrs r2, p0
+; CHECK-NEXT:    lsrs r2, r2, #12
+; CHECK-NEXT:    lsls r2, r2, #31
+; CHECK-NEXT:    itt ne
+; CHECK-NEXT:    orrne r2, r1, #3
+; CHECK-NEXT:    strne.w lr, [r0, r2, lsl #2]
+; CHECK-NEXT:    adds r1, #4
+; CHECK-NEXT:    adc r3, r3, #0
+; CHECK-NEXT:    eor r2, r1, #16
+; CHECK-NEXT:    orrs r2, r3
+; CHECK-NEXT:    bne .LBB1_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.3:
+; CHECK-NEXT:  .LCPI1_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next, %pred.store.continue18 ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %entry ], [ %vec.ind.next, %pred.store.continue18 ]
+  %0 = icmp ult <4 x i32> %vec.ind, <i32 15, i32 15, i32 15, i32 15>
+  %1 = extractelement <4 x i1> %0, i64 0
+  br i1 %1, label %pred.store.if, label %pred.store.continue
+
+pred.store.if:
+  %2 = getelementptr inbounds i32, ptr %dest, i64 %index
+  store i32 1, ptr %2, align 4
+  br label %pred.store.continue
+
+pred.store.continue:
+  %3 = extractelement <4 x i1> %0, i64 1
+  br i1 %3, label %pred.store.if5, label %pred.store.continue6
+
+pred.store.if5:
+  %4 = or disjoint i64 %index, 1
+  %5 = getelementptr inbounds i32, ptr %dest, i64 %4
+  store i32 1, ptr %5, align 4
+  br label %pred.store.continue6
+
+pred.store.continue6:
+  %6 = extractelement <4 x i1> %0, i64 2
+  br i1 %6, label %pred.store.if7, label %pred.store.continue8
+
+pred.store.if7:
+  %7 = or disjoint i64 %index, 2
+  %8 = getelementptr inbounds i32, ptr %dest, i64 %7
+  store i32 1, ptr %8, align 4
+  br label %pred.store.continue8
+
+pred.store.continue8:
+  %9 = extractelement <4 x i1> %0, i64 3
+  br i1 %9, label %pred.store.if9, label %pred.store.continue18
+
+pred.store.if9:
+  %10 = or disjoint i64 %index, 3
+  %11 = getelementptr inbounds i32, ptr %dest, i64 %10
+  store i32 1, ptr %11, align 4
+  br label %pred.store.continue18
+
+pred.store.continue18:
+  %index.next = add i64 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %24 = icmp eq i64 %index.next, 16
+  br i1 %24, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:
+  ret void
+}

>From e4f714daeb8799e65b84518caf2a9382dc86f2c4 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Tue, 5 Nov 2024 11:08:46 +0000
Subject: [PATCH 2/2] [CodeGen] Avoid sinking vector comparisons during
 CodeGenPrepare

Whilst reviewing PR #109289 and doing some analysis with various
tests involving predicated blocks, I noticed that we're making
codegen and performance worse by sinking vector comparisons
multiple times into blocks. It looks like sinkCmpExpression in
CodeGenPrepare was written for scalar comparisons, where there is
only a single condition register, whereas vector comparisons
typically produce a vector result. For some targets, such as NEON
or SVE, there are multiple allocatable vector registers that can
store the result, so we should avoid sinking in that case.
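For illustration, here is a minimal IR sketch of the sinking this
patch now avoids for vector types (it mirrors the no_sink_simple test
added in the first patch; the @sink_example and %cmp1/%cmp2 names are
purely illustrative). Before CodeGenPrepare there is a single vector
compare feeding users in two blocks:

  define <4 x i32> @sink_example(<4 x i32> %a, <4 x i32> %b, i1 %c) {
  entry:
    %cmp = icmp slt <4 x i32> %a, %b
    br i1 %c, label %bb1, label %bb2
  bb1:
    %s1 = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> zeroinitializer
    ret <4 x i32> %s1
  bb2:
    %s2 = select <4 x i1> %cmp, <4 x i32> %b, <4 x i32> zeroinitializer
    ret <4 x i32> %s2
  }

With sinking, the compare is cloned into each user block, so the
backend ends up materialising the vector compare twice:

  bb1:
    %cmp1 = icmp slt <4 x i32> %a, %b
    %s1 = select <4 x i1> %cmp1, <4 x i32> %a, <4 x i32> zeroinitializer
    ret <4 x i32> %s1
  bb2:
    %cmp2 = icmp slt <4 x i32> %a, %b
    %s2 = select <4 x i1> %cmp2, <4 x i32> %b, <4 x i32> zeroinitializer
    ret <4 x i32> %s2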
---
 llvm/include/llvm/CodeGen/TargetLowering.h    | 26 ++----
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |  8 +-
 llvm/lib/CodeGen/TargetLoweringBase.cpp       |  1 -
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  4 +
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  8 --
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   | 10 +++
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp   | 10 ++-
 llvm/lib/Target/PowerPC/PPCISelLowering.h     |  2 +
 .../CodeGen/AArch64/no-sink-vector-cmp.ll     | 88 +++++++++----------
 9 files changed, 76 insertions(+), 81 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8e0cdc6f1a5e77..92999a90019170 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -497,10 +497,10 @@ class TargetLoweringBase {
     return true;
   }
 
-  /// Return true if multiple condition registers are available.
-  bool hasMultipleConditionRegisters() const {
-    return HasMultipleConditionRegisters;
-  }
+  /// Return true if multiple (allocatable) predicate registers are available
+  /// for \p VT. If there is only a single register the code generator will
+  /// sink comparisons into the blocks of their users.
+  virtual bool hasMultiplePredicateRegisters(EVT VT) const { return false; }
 
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
@@ -2389,7 +2389,7 @@ class TargetLoweringBase {
                                                EVT VT) const {
     // If a target has multiple condition registers, then it likely has logical
     // operations on those registers.
-    if (hasMultipleConditionRegisters())
+    if (hasMultiplePredicateRegisters(MVT::i1))
       return false;
     // Only do the transform if the value won't be split into multiple
     // registers.
@@ -2496,15 +2496,6 @@ class TargetLoweringBase {
     StackPointerRegisterToSaveRestore = R;
   }
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
-    HasMultipleConditionRegisters = hasManyRegs;
-  }
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
@@ -3473,13 +3464,6 @@ class TargetLoweringBase {
 private:
   const TargetMachine &TM;
 
-  /// Tells the code generator that the target has multiple (allocatable)
-  /// condition registers that can be used to store the results of comparisons
-  /// for use by selects and conditional branches. With multiple condition
-  /// registers, the code generator will not aggressively sink comparisons into
-  /// the blocks of their users.
-  bool HasMultipleConditionRegisters;
-
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index f1ac3d95a8dd87..48dfe86327b08f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -1770,8 +1770,10 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
 /// lose; some adjustment may be wanted there.
 ///
 /// Return true if any changes are made.
-static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) {
-  if (TLI.hasMultipleConditionRegisters())
+static bool sinkCmpExpression(const DataLayout &DL, CmpInst *Cmp,
+                              const TargetLowering &TLI) {
+  EVT ResVT = TLI.getValueType(DL, Cmp->getType());
+  if (TLI.hasMultiplePredicateRegisters(ResVT))
     return false;
 
   // Avoid sinking soft-FP comparisons, since this can move them into a loop.
@@ -2176,7 +2178,7 @@ static bool adjustIsPower2Test(CmpInst *Cmp, const TargetLowering &TLI,
 }
 
 bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) {
-  if (sinkCmpExpression(Cmp, *TLI))
+  if (sinkCmpExpression(*DL, Cmp, *TLI))
     return true;
 
   if (combineToUAddWithOverflow(Cmp, ModifiedDT))
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 5bcde0e1bbec88..a7d22ccd451758 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -625,7 +625,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
   MaxGluedStoresPerMemcpy = 0;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
       MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
-  HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index d696355bb062a8..f1350e1eb53f33 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1346,6 +1346,10 @@ class AArch64TargetLowering : public TargetLowering {
   unsigned getMinimumJumpTableEntries() const override;
 
   bool softPromoteHalfType() const override { return true; }
+
+  virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
+    return VT.isVector();
+  }
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e4b54c7d72b083..6e592e47e6a687 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -583,14 +583,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
 
-  // FIXME: This is only partially true. If we have to do vector compares, any
-  // SGPR pair can be a condition register. If we have a uniform condition, we
-  // are better off doing SALU operations, where there is only one SCC. For now,
-  // we don't have a way of knowing during instruction selection if a condition
-  // will be uniform and we always use vector compares. Assume we are using
-  // vector compares until that is fixed.
-  setHasMultipleConditionRegisters(true);
-
   setMinCmpXchgSizeInBits(32);
   setSupportsUnalignedAtomics(false);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index b2fd31cb2346eb..166a9099a0d471 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -387,6 +387,16 @@ class AMDGPUTargetLowering : public TargetLowering {
   MVT getFenceOperandTy(const DataLayout &DL) const override {
     return MVT::i32;
   }
+
+  virtual bool hasMultiplePredicateRegisters(EVT VT) const override {
+    // FIXME: This is only partially true. If we have to do vector compares,
+    // any SGPR pair can be a condition register. If we have a uniform
+    // condition, we are better off doing SALU operations, where there is only
+    // one SCC. For now, we don't have a way of knowing during instruction
+    // selection if a condition will be uniform and we always use vector
+    // compares. Assume we are using vector compares until that is fixed.
+    return true;
+  }
 };
 
 namespace AMDGPUISD {
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d8f3095ed7fb68..fc4ca03eb85f75 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1461,10 +1461,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
 
   // With 32 condition bits, we don't need to sink (and duplicate) compares
   // aggressively in CodeGenPrep.
-  if (Subtarget.useCRBits()) {
-    setHasMultipleConditionRegisters();
+  if (Subtarget.useCRBits())
     setJumpIsExpensive();
-  }
 
   // TODO: The default entry number is set to 64. This stops most jump table
   // generation on PPC. But it is good for current PPC HWs because the indirect
@@ -19137,3 +19135,9 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
   return Builder.CreateOr(
       Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
 }
+
+bool PPCTargetLowering::hasMultiplePredicateRegisters(EVT VT) const {
+  // With 32 condition bits, we don't need to sink (and duplicate) compares
+  // aggressively in CodeGenPrep.
+  return Subtarget.useCRBits();
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index dde45e4cf6f4ae..7f2287f6599905 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1492,6 +1492,8 @@ namespace llvm {
     /// through to determine the optimal load/store instruction format.
     unsigned computeMOFlags(const SDNode *Parent, SDValue N,
                             SelectionDAG &DAG) const;
+
+    virtual bool hasMultiplePredicateRegisters(EVT VT) const override;
   }; // end class PPCTargetLowering
 
   namespace PPC {
diff --git a/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
index 78f4d4d195c1bd..8b94cb2414f0db 100644
--- a/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
+++ b/llvm/test/CodeGen/AArch64/no-sink-vector-cmp.ll
@@ -6,14 +6,16 @@ target triple = "aarch64-unknown-linux-gnu"
 define <4 x i32> @no_sink_simple(<4 x i32> %a, <4 x i32> %b, i1 %c, ptr %p) {
 ; CHECK-LABEL: no_sink_simple:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmgt v2.4s, v1.4s, v0.4s
+; CHECK-NEXT:    xtn v2.4h, v2.4s
 ; CHECK-NEXT:    tbz w0, #0, .LBB0_2
 ; CHECK-NEXT:  // %bb.1: // %s
-; CHECK-NEXT:    cmgt v1.4s, v1.4s, v0.4s
+; CHECK-NEXT:    sshll v1.4s, v2.4h, #0
 ; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    str q0, [x1]
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_2: // %t
-; CHECK-NEXT:    cmgt v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    sshll v0.4s, v2.4h, #0
 ; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
   %d = icmp slt <4 x i32> %a, %b
@@ -32,68 +34,64 @@ t:
 define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
 ; CHECK-LABEL: vector_loop_with_icmp:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    mov w8, #15 // =0xf
+; CHECK-NEXT:    mov w9, #15 // =0xf
 ; CHECK-NEXT:    mov w10, #4 // =0x4
-; CHECK-NEXT:    adrp x9, .LCPI1_0
+; CHECK-NEXT:    adrp x8, .LCPI1_0
 ; CHECK-NEXT:    adrp x11, .LCPI1_1
-; CHECK-NEXT:    dup v0.2d, x8
+; CHECK-NEXT:    dup v0.2d, x9
 ; CHECK-NEXT:    dup v1.2d, x10
-; CHECK-NEXT:    ldr q2, [x9, :lo12:.LCPI1_0]
+; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI1_0]
 ; CHECK-NEXT:    ldr q3, [x11, :lo12:.LCPI1_1]
-; CHECK-NEXT:    add x9, x0, #8
-; CHECK-NEXT:    mov w10, #16 // =0x10
-; CHECK-NEXT:    mov w11, #1 // =0x1
+; CHECK-NEXT:    add x8, x0, #8
+; CHECK-NEXT:    mov w9, #16 // =0x10
+; CHECK-NEXT:    mov w10, #1 // =0x1
 ; CHECK-NEXT:    b .LBB1_2
 ; CHECK-NEXT:  .LBB1_1: // %pred.store.continue18
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
 ; CHECK-NEXT:    add v2.2d, v2.2d, v1.2d
 ; CHECK-NEXT:    add v3.2d, v3.2d, v1.2d
-; CHECK-NEXT:    subs x10, x10, #4
-; CHECK-NEXT:    add x9, x9, #16
+; CHECK-NEXT:    subs x9, x9, #4
+; CHECK-NEXT:    add x8, x8, #16
 ; CHECK-NEXT:    b.eq .LBB1_10
 ; CHECK-NEXT:  .LBB1_2: // %vector.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    cmhi v4.2d, v0.2d, v3.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v4.4h, v0.4h
-; CHECK-NEXT:    umov w12, v4.h[0]
-; CHECK-NEXT:    tbz w12, #0, .LBB1_4
-; CHECK-NEXT:  // %bb.3: // %pred.store.if
+; CHECK-NEXT:    cmhi v4.2d, v0.2d, v2.2d
+; CHECK-NEXT:    cmhi v5.2d, v0.2d, v3.2d
+; CHECK-NEXT:    uzp1 v4.4s, v5.4s, v4.4s
+; CHECK-NEXT:    xtn v4.4h, v4.4s
+; CHECK-NEXT:    umov w11, v4.h[0]
+; CHECK-NEXT:    tbnz w11, #0, .LBB1_6
+; CHECK-NEXT:  // %bb.3: // %pred.store.continue
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    stur w11, [x9, #-8]
-; CHECK-NEXT:  .LBB1_4: // %pred.store.continue
+; CHECK-NEXT:    umov w11, v4.h[1]
+; CHECK-NEXT:    tbnz w11, #0, .LBB1_7
+; CHECK-NEXT:  .LBB1_4: // %pred.store.continue6
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmhi v4.2d, v4.2d, v3.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v4.4h, v0.4h
-; CHECK-NEXT:    umov w12, v4.h[1]
-; CHECK-NEXT:    tbz w12, #0, .LBB1_6
-; CHECK-NEXT:  // %bb.5: // %pred.store.if5
+; CHECK-NEXT:    umov w11, v4.h[2]
+; CHECK-NEXT:    tbnz w11, #0, .LBB1_8
+; CHECK-NEXT:  .LBB1_5: // %pred.store.continue8
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    stur w11, [x9, #-4]
-; CHECK-NEXT:  .LBB1_6: // %pred.store.continue6
+; CHECK-NEXT:    umov w11, v4.h[3]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_1
+; CHECK-NEXT:    b .LBB1_9
+; CHECK-NEXT:  .LBB1_6: // %pred.store.if
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmhi v4.2d, v4.2d, v2.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v0.4h, v4.4h
-; CHECK-NEXT:    umov w12, v4.h[2]
-; CHECK-NEXT:    tbz w12, #0, .LBB1_8
-; CHECK-NEXT:  // %bb.7: // %pred.store.if7
+; CHECK-NEXT:    stur w10, [x8, #-8]
+; CHECK-NEXT:    umov w11, v4.h[1]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_4
+; CHECK-NEXT:  .LBB1_7: // %pred.store.if5
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    str w11, [x9]
-; CHECK-NEXT:  .LBB1_8: // %pred.store.continue8
+; CHECK-NEXT:    stur w10, [x8, #-4]
+; CHECK-NEXT:    umov w11, v4.h[2]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_5
+; CHECK-NEXT:  .LBB1_8: // %pred.store.if7
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    dup v4.2d, x8
-; CHECK-NEXT:    cmhi v4.2d, v4.2d, v2.2d
-; CHECK-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEXT:    uzp1 v4.4h, v0.4h, v4.4h
-; CHECK-NEXT:    umov w12, v4.h[3]
-; CHECK-NEXT:    tbz w12, #0, .LBB1_1
-; CHECK-NEXT:  // %bb.9: // %pred.store.if9
+; CHECK-NEXT:    str w10, [x8]
+; CHECK-NEXT:    umov w11, v4.h[3]
+; CHECK-NEXT:    tbz w11, #0, .LBB1_1
+; CHECK-NEXT:  .LBB1_9: // %pred.store.if9
 ; CHECK-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT:    str w11, [x9, #4]
+; CHECK-NEXT:    str w10, [x8, #4]
 ; CHECK-NEXT:    b .LBB1_1
 ; CHECK-NEXT:  .LBB1_10: // %for.cond.cleanup
 ; CHECK-NEXT:    ret


