[llvm] ce3e7eb - [AArch64][SVE] Fix bad PTEST(PG, OP(PG, ...)) optimization

Cullen Rhodes via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 15 02:54:30 PST 2022


Author: Cullen Rhodes
Date: 2022-11-15T10:34:23Z
New Revision: ce3e7eb96888f55347a1e1714c5ae65e7a472a70

URL: https://github.com/llvm/llvm-project/commit/ce3e7eb96888f55347a1e1714c5ae65e7a472a70
DIFF: https://github.com/llvm/llvm-project/commit/ce3e7eb96888f55347a1e1714c5ae65e7a472a70.diff

LOG: [AArch64][SVE] Fix bad PTEST(PG, OP(PG, ...)) optimization

AArch64InstrInfo::optimizePTestInstr attempts to remove a PTEST of a
predicate-generating operation that implicitly sets the flags in an
identical manner.

When the PTEST and the predicate-generating operation use the same mask,
the PTEST is currently removed. This is incorrect because it does not
take element size into account. PTEST operates on 8-bit predicates, but
for instructions such as compare that also support 16/32/64-bit
predicates, the implicit PTEST performed by the instruction considers
fewer lanes for these element sizes and can therefore set different
first-active or last-active flags.

For example, consider the following instruction sequence:

  ptrue p0.b			; P0=1111-1111-1111-1111
  index z0.s, #0, #1		; Z0=<0,1,2,3>
  index z1.s, #1, #1		; Z1=<1,2,3,4>
  cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
				;       ^ last active
  ptest p0, p1.b		; P1=0001-0001-0001-0001
				;     ^ last active

where the compare generates a canonical all-active 32-bit predicate
(equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last-active
flag, whereas the PTEST instruction with the same mask does not.

This patch restricts the optimization to instructions operating on 8-bit
predicates. One caveat: the optimization remains safe regardless of element
size when only the 'any active' flag is tested; that case will be addressed
in a later patch.
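
For reference, the core of the added guard in
AArch64InstrInfo::optimizePTestInstr is small; the following is a condensed
sketch mirroring the diff below, using the variable names from that function:

  // PTEST(PG, PTEST_LIKE(PG, ...)): only drop the PTEST when the
  // PTEST-like instruction also operates on 8-bit predicates, since its
  // implicit PTEST may otherwise consider fewer lanes than the PTEST.
  auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
  uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
  if ((Mask != PTestLikeMask) ||
      (PredElementSize != AArch64::ElementSizeB))
    return false;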

Reviewed By: bsmith

Differential Revision: https://reviews.llvm.org/D137716

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpge.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpgt.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphi.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphs.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplo.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpls.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplt.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpne.ll
    llvm/test/CodeGen/AArch64/sve-ptest-removal-pfirst-pnext.ll
    llvm/test/CodeGen/AArch64/sve-setcc.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f558cf2c2cbf6..934cceb72c2b5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1325,10 +1325,30 @@ bool AArch64InstrInfo::optimizePTestInstr(
 
     // Fallthough to simply remove the PTEST.
   } else if (PredIsPTestLike) {
-    // For PTEST(PG_1, PTEST_LIKE(PG2, ...)), PTEST is redundant when both
-    // instructions use the same predicate.
+    // For PTEST(PG, PTEST_LIKE(PG, ...)), the PTEST is redundant since the
+    // flags are set based on the same mask 'PG', but PTEST_LIKE must operate
+    // on 8-bit predicates like the PTEST.  Otherwise, for instructions like
+    // compare that also support 16/32/64-bit predicates, the implicit PTEST
+    // performed by the compare could consider fewer lanes for these element
+    // sizes.
+    //
+    // For example, consider
+    //
+    //   ptrue p0.b                    ; P0=1111-1111-1111-1111
+    //   index z0.s, #0, #1            ; Z0=<0,1,2,3>
+    //   index z1.s, #1, #1            ; Z1=<1,2,3,4>
+    //   cmphi p1.s, p0/z, z1.s, z0.s  ; P1=0001-0001-0001-0001
+    //                                 ;       ^ last active
+    //   ptest p0, p1.b                ; P1=0001-0001-0001-0001
+    //                                 ;     ^ last active
+    //
+    // where the compare generates a canonical all active 32-bit predicate
+    // (equivalent to 'ptrue p1.s, all'). The implicit PTEST sets the last
+    // active flag, whereas the PTEST instruction with the same mask doesn't.
     auto PTestLikeMask = MRI->getUniqueVRegDef(Pred->getOperand(1).getReg());
-    if (Mask != PTestLikeMask)
+    uint64_t PredElementSize = getElementSizeForOpcode(PredOpcode);
+    if ((Mask != PTestLikeMask) ||
+        (PredElementSize != AArch64::ElementSizeB))
       return false;
 
     // Fallthough to simply remove the PTEST.

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.ll
index d0ea1ddb252bb..2eae4f324ee24 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.ll
@@ -20,7 +20,8 @@ define i32 @cmpeq_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale
 define i32 @cmpeq_nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmpeq_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpeq.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -65,7 +66,8 @@ define i32 @cmpeq_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmpeq_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpeq_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -79,7 +81,8 @@ define i32 @cmpeq_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmpeq_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpeq_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir
index 5df55777995a1..48ef72c9b3899 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpeq.mir
@@ -65,7 +65,7 @@ body:             |
     liveins: $p0, $z0, $z1
 
     ; CHECK-LABEL: name: cmpeq_nxv8i16
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %2:zpr = COPY $z1
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
@@ -101,7 +101,7 @@ body:             |
     liveins: $p0, $z0, $z1
 
     ; CHECK-LABEL: name: cmpeq_nxv4i32
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %2:zpr = COPY $z1
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
@@ -137,7 +137,7 @@ body:             |
     liveins: $p0, $z0, $z1
 
     ; CHECK-LABEL: name: cmpeq_nxv2i64
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %2:zpr = COPY $z1
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
@@ -204,7 +204,7 @@ body:             |
     liveins: $p0, $z0
 
     ; CHECK-LABEL: name: cmpeq_imm_nxv8i16
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
     %2:ppr = CMPEQ_PPzZI_H %0, %1, 0, implicit-def dead $nzcv
@@ -237,7 +237,7 @@ body:             |
     liveins: $p0, $z0
 
     ; CHECK-LABEL: name: cmpeq_imm_nxv4i32
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
     %2:ppr = CMPEQ_PPzZI_S %0, %1, 0, implicit-def dead $nzcv
@@ -270,7 +270,7 @@ body:             |
     liveins: $p0, $z0
 
     ; CHECK-LABEL: name: cmpeq_imm_nxv2i64
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
     %2:ppr = CMPEQ_PPzZI_D %0, %1, 0, implicit-def dead $nzcv
@@ -339,7 +339,7 @@ body:             |
     liveins: $p0, $z0, $z1
 
     ; CHECK-LABEL: name: cmpeq_wide_nxv8i16
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %2:zpr = COPY $z1
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0
@@ -375,7 +375,7 @@ body:             |
     liveins: $p0, $z0, $z1
 
     ; CHECK-LABEL: name: cmpeq_wide_nxv4i32
-    ; CHECK-NOT: PTEST
+    ; CHECK: PTEST
     %2:zpr = COPY $z1
     %1:zpr = COPY $z0
     %0:ppr_3b = COPY $p0

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpge.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpge.ll
index 5dae689b82a72..a99ded97a4408 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpge.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpge.ll
@@ -20,7 +20,8 @@ define i32 @cmpge_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale
 define i32 @cmpge_nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmpge_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpge.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -65,7 +66,8 @@ define i32 @cmpge_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmpge_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpge_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpge p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmpge p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -79,7 +81,8 @@ define i32 @cmpge_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmpge_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpge_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmpge p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpgt.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpgt.ll
index c2dc452ad88f6..09878914b24b7 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpgt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpgt.ll
@@ -20,7 +20,8 @@ define i32 @cmpgt_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale
 define i32 @cmpgt_nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmpgt_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpgt p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpgt p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpgt.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -65,7 +66,8 @@ define i32 @cmpgt_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmpgt_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpgt_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpgt p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmpgt p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -79,7 +81,8 @@ define i32 @cmpgt_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmpgt_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpgt_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpgt p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmpgt p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphi.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphi.ll
index e4b45921ece68..6fca640bda2ed 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphi.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphi.ll
@@ -20,7 +20,8 @@ define i32 @cmphi_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale
 define i32 @cmphi_nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmphi_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmphi p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmphi p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmphi.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -66,7 +67,8 @@ define i32 @cmphi_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmphi_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmphi_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmphi p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmphi p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -80,7 +82,8 @@ define i32 @cmphi_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmphi_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmphi_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmphi p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmphi p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphs.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphs.ll
index 42906f6e9703d..3b2f679dd2fb0 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphs.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmphs.ll
@@ -20,7 +20,8 @@ define i32 @cmphs_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale
 define i32 @cmphs_nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmphs_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmphs p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmphs p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmphs.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -65,7 +66,8 @@ define i32 @cmphs_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmphs_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmphs_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmphs p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmphs p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -79,7 +81,8 @@ define i32 @cmphs_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmphs_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmphs_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmphs p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmphs p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll
index 0ccd882a28086..d0e5a1fea839b 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll
@@ -37,7 +37,8 @@ define i32 @cmple_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmple_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmple_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmple p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmple p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -51,7 +52,8 @@ define i32 @cmple_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmple_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmple_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmple p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmple p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
@@ -114,7 +116,8 @@ define i1 @cmp8_ptest_any_px(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vsc
 define i1 @cmp32_ptest_first_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmp32_ptest_first_px:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, mi
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
@@ -130,7 +133,8 @@ define i1 @cmp32_ptest_first_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <
 define i1 @cmp32_ptest_last_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmp32_ptest_last_px:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, lo
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)
@@ -146,7 +150,8 @@ define i1 @cmp32_ptest_last_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <v
 define i1 @cmp32_ptest_any_px(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmp32_ptest_any_px:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpge p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpge p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplo.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplo.ll
index 3379e8637193a..c813acfd36525 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplo.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplo.ll
@@ -37,7 +37,8 @@ define i32 @cmplo_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmplo_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmplo_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmplo p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmplo p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -51,7 +52,8 @@ define i32 @cmplo_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmplo_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmplo_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmplo p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmplo p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpls.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpls.ll
index dbbabe9e7d0fa..53cd251f97eb6 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpls.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpls.ll
@@ -37,7 +37,8 @@ define i32 @cmpls_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmpls_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpls_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpls p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmpls p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -51,7 +52,8 @@ define i32 @cmpls_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmpls_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpls_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpls p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmpls p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplt.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplt.ll
index cf15a3572070b..5b8e739e6b306 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmplt.ll
@@ -37,7 +37,8 @@ define i32 @cmplt_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmplt_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmplt_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmplt p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmplt p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -51,7 +52,8 @@ define i32 @cmplt_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmplt_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmplt_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmplt p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmplt p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpne.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpne.ll
index ba4bd4b497d2c..12c82a0ebf7aa 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpne.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmpne.ll
@@ -20,7 +20,8 @@ define i32 @cmpne_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale
 define i32 @cmpne_nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: cmpne_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, z1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.cmpne.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
@@ -65,7 +66,8 @@ define i32 @cmpne_wide_nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <v
 define i32 @cmpne_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpne_wide_nxv8i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    cmpne p1.h, p0/z, z0.h, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
@@ -79,7 +81,8 @@ define i32 @cmpne_wide_nxv8i16(<vscale x 16 x i1> %pg, <vscale x 8 x i16> %a, <v
 define i32 @cmpne_wide_nxv4i32(<vscale x 16 x i1> %pg, <vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
 ; CHECK-LABEL: cmpne_wide_nxv4i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    cmpne p1.s, p0/z, z0.s, z1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg)

diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-pfirst-pnext.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-pfirst-pnext.ll
index 55af332eacf9a..2255a36239a0b 100644
--- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-pfirst-pnext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-pfirst-pnext.ll
@@ -17,6 +17,7 @@ define i32 @pnext_2(<vscale x 2 x i1> %pg, <vscale x 2 x i1> %a) {
 ; CHECK-LABEL: pnext_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    pnext p1.d, p0, p1.d
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.pnext.nxv2i1(<vscale x 2 x i1> %pg, <vscale x 2 x i1> %a)
@@ -29,6 +30,7 @@ define i32 @pnext_4(<vscale x 4 x i1> %pg, <vscale x 4 x i1> %a) {
 ; CHECK-LABEL: pnext_4:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    pnext p1.s, p0, p1.s
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.pnext.nxv4i1(<vscale x 4 x i1> %pg, <vscale x 4 x i1> %a)
@@ -41,6 +43,7 @@ define i32 @pnext_8(<vscale x 8 x i1> %pg, <vscale x 8 x i1> %a) {
 ; CHECK-LABEL: pnext_8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    pnext p1.h, p0, p1.h
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    cset w0, ne
 ; CHECK-NEXT:    ret
   %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.pnext.nxv8i1(<vscale x 8 x i1> %pg, <vscale x 8 x i1> %a)

diff --git a/llvm/test/CodeGen/AArch64/sve-setcc.ll b/llvm/test/CodeGen/AArch64/sve-setcc.ll
index 60ee9b34d1760..36688ea9ce8c0 100644
--- a/llvm/test/CodeGen/AArch64/sve-setcc.ll
+++ b/llvm/test/CodeGen/AArch64/sve-setcc.ll
@@ -6,6 +6,7 @@ define void @sve_cmplt_setcc(<vscale x 8 x i16>* %out, <vscale x 8 x i16> %in, <
 ; CHECK-LABEL: sve_cmplt_setcc:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmplt p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    b.eq .LBB0_2
 ; CHECK-NEXT:  // %bb.1: // %if.then
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
@@ -29,6 +30,7 @@ define void @sve_cmplt_setcc_inverted(<vscale x 8 x i16>* %out, <vscale x 8 x i1
 ; CHECK-LABEL: sve_cmplt_setcc_inverted:
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cmplt p1.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ptest p0, p1.b
 ; CHECK-NEXT:    b.ne .LBB1_2
 ; CHECK-NEXT:  // %bb.1: // %if.then
 ; CHECK-NEXT:    st1h { z0.h }, p0, [x0]

