[llvm] 871815e - [AArch64][SVE2p1] Add SVE2.1 while (predicate-pair) intrinsics

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Jan 19 01:32:30 PST 2023


Author: David Sherwood
Date: 2023-01-19T09:32:20Z
New Revision: 871815e062a9e1d143f29333e6129f1cad0f83bb

URL: https://github.com/llvm/llvm-project/commit/871815e062a9e1d143f29333e6129f1cad0f83bb
DIFF: https://github.com/llvm/llvm-project/commit/871815e062a9e1d143f29333e6129f1cad0f83bb.diff

LOG: [AArch64][SVE2p1] Add SVE2.1 while (predicate-pair) intrinsics

Adds intrinsics for the following instructions:

* WHILEGE (predicate pair)
* WHILEGT (predicate pair)
* WHILEHI (predicate pair)
* WHILEHS (predicate pair)
* WHILELE (predicate pair)
* WHILELO (predicate pair)
* WHILELS (predicate pair)
* WHILELT (predicate pair)

I've added an opcode selector called SelectOpcodeFromVT to
AArch64ISelDAGToDAG.cpp that we will extend in the future to
select opcodes based on different MVTs. For now, the only use
is for selecting predicate types.
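
For reference, here is a minimal IR sketch (mirroring the tests added
below) of how one of these intrinsics is used; the function name is
illustrative. The element count of the overloaded result type selects
the instruction form (nxv16i1 -> .b, nxv8i1 -> .h, nxv4i1 -> .s,
nxv2i1 -> .d):

  define <vscale x 16 x i1> @example(i64 %m, i64 %n) {
    ; selected as: whilege { p0.b, p1.b }, x0, x1
    %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 %m, i64 %n)
    %first = extractvalue { <vscale x 16 x i1>, <vscale x 16 x i1> } %pp, 0
    ret <vscale x 16 x i1> %first
  }

  declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64, i64)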

NOTE: These intrinsics are still in development and are subject
to future changes.

Differential Revision: https://reviews.llvm.org/D141936

Added: 
    llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsAArch64.td
    llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a6ecbb66a5d7b..172d844598528 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2683,4 +2683,13 @@ let TargetPrefix = "aarch64" in {
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                               [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                LLVMMatchType<0>, llvm_i32_ty]>;
+
+  //
+  // Predicate-pair intrinsics
+  //
+  foreach cmp = ["ge", "gt", "hi", "hs", "le", "lo", "ls", "lt"] in {
+    def int_aarch64_sve_while # cmp # _x2
+        : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                                [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>;
+  }
 }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 2ee78b9861ec3..666b6292e28fc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -356,6 +356,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
   void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
                             unsigned Opc_rr, unsigned Opc_ri,
                             bool IsIntr = false);
+  void SelectWhilePair(SDNode *N, unsigned Opc);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
@@ -1688,6 +1689,65 @@ AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, unsigned Opc_rr,
   return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
 }
 
+enum class SelectTypeKind {
+  Int1 = 0,
+};
+
+/// This function selects an opcode from a list of opcodes, which are
+/// expected to be the opcodes for { 8-bit, 16-bit, 32-bit, 64-bit }
+/// element types, in this order.
+template <SelectTypeKind Kind>
+static unsigned SelectOpcodeFromVT(EVT VT, ArrayRef<unsigned> Opcodes) {
+  // Only match scalable vector VTs
+  if (!VT.isScalableVector())
+    return 0;
+
+  EVT EltVT = VT.getVectorElementType();
+  switch (Kind) {
+  case SelectTypeKind::Int1:
+    if (EltVT != MVT::i1)
+      return 0;
+    break;
+  }
+
+  unsigned Offset;
+  switch (VT.getVectorMinNumElements()) {
+  case 16: // 8-bit
+    Offset = 0;
+    break;
+  case 8: // 16-bit
+    Offset = 1;
+    break;
+  case 4: // 32-bit
+    Offset = 2;
+    break;
+  case 2: // 64-bit
+    Offset = 3;
+    break;
+  default:
+    return 0;
+  }
+
+  return (Opcodes.size() <= Offset) ? 0 : Opcodes[Offset];
+}
+
+void AArch64DAGToDAGISel::SelectWhilePair(SDNode *N, unsigned Opc) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+
+  SDValue Ops[] = {N->getOperand(1), N->getOperand(2)};
+
+  SDNode *WhilePair = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
+  SDValue SuperReg = SDValue(WhilePair, 0);
+
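+  // Copy each result predicate out of the pair super-register (psub0/psub1).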
+  for (unsigned I = 0; I < 2; ++I)
+    ReplaceUses(SDValue(N, I), CurDAG->getTargetExtractSubreg(
+                                   AArch64::psub0 + I, DL, VT, SuperReg));
+
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
                                                unsigned Scale, unsigned Opc_ri,
                                                unsigned Opc_rr, bool IsIntr) {
@@ -4623,6 +4682,62 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
       if (tryMULLV64LaneV128(IntNo, Node))
         return;
       break;
+    case Intrinsic::aarch64_sve_whilege_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILEGE_2PXX_B, AArch64::WHILEGE_2PXX_H,
+               AArch64::WHILEGE_2PXX_S, AArch64::WHILEGE_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
+    case Intrinsic::aarch64_sve_whilegt_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILEGT_2PXX_B, AArch64::WHILEGT_2PXX_H,
+               AArch64::WHILEGT_2PXX_S, AArch64::WHILEGT_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
+    case Intrinsic::aarch64_sve_whilehi_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILEHI_2PXX_B, AArch64::WHILEHI_2PXX_H,
+               AArch64::WHILEHI_2PXX_S, AArch64::WHILEHI_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
+    case Intrinsic::aarch64_sve_whilehs_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILEHS_2PXX_B, AArch64::WHILEHS_2PXX_H,
+               AArch64::WHILEHS_2PXX_S, AArch64::WHILEHS_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
+    case Intrinsic::aarch64_sve_whilele_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILELE_2PXX_B, AArch64::WHILELE_2PXX_H,
+               AArch64::WHILELE_2PXX_S, AArch64::WHILELE_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
+    case Intrinsic::aarch64_sve_whilelo_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILELO_2PXX_B, AArch64::WHILELO_2PXX_H,
+               AArch64::WHILELO_2PXX_S, AArch64::WHILELO_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
+    case Intrinsic::aarch64_sve_whilels_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILELS_2PXX_B, AArch64::WHILELS_2PXX_H,
+               AArch64::WHILELS_2PXX_S, AArch64::WHILELS_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
+    case Intrinsic::aarch64_sve_whilelt_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int1>(
+              Node->getValueType(0),
+              {AArch64::WHILELT_2PXX_B, AArch64::WHILELT_2PXX_H,
+               AArch64::WHILELT_2PXX_S, AArch64::WHILELT_2PXX_D}))
+        SelectWhilePair(Node, Op);
+      return;
     }
     break;
   }

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d8a3738df505f..324d1cf0d007c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -299,6 +299,14 @@ static bool isZeroingInactiveLanes(SDValue Op) {
     case Intrinsic::aarch64_sve_whilelt:
     case Intrinsic::aarch64_sve_match:
     case Intrinsic::aarch64_sve_nmatch:
+    case Intrinsic::aarch64_sve_whilege_x2:
+    case Intrinsic::aarch64_sve_whilegt_x2:
+    case Intrinsic::aarch64_sve_whilehi_x2:
+    case Intrinsic::aarch64_sve_whilehs_x2:
+    case Intrinsic::aarch64_sve_whilele_x2:
+    case Intrinsic::aarch64_sve_whilelo_x2:
+    case Intrinsic::aarch64_sve_whilels_x2:
+    case Intrinsic::aarch64_sve_whilelt_x2:
       return true;
     }
   }

diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll
new file mode 100644
index 0000000000000..ab70f57b48874
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-while-pp.ll
@@ -0,0 +1,663 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve2p1 < %s | FileCheck %s
+
+; == WHILEGE ==
+
+define <vscale x 16 x i1> @whilege_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilege_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilege { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilege_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilege_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilege { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilege_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilege_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilege { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilege_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilege_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilege { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; == WHILEGT ==
+
+define <vscale x 16 x i1> @whilegt_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilegt_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilegt { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilegt_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilegt_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilegt { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilegt_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilegt_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilegt { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilegt_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilegt_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilegt { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; == WHILEHI ==
+
+define <vscale x 16 x i1> @whilehi_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehi_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehi { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilehi_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehi_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehi { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilehi_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehi_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehi { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilehi_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehi_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehi { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; == WHILEHS ==
+
+define <vscale x 16 x i1> @whilehs_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehs_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehs { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilehs_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehs_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehs { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilehs_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehs_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehs { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilehs_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehs_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehs { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; == WHILELE ==
+
+define <vscale x 16 x i1> @whilele_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilele_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilele { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilele_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilele_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilele { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilele_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilele_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilele { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilele_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilele_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilele { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; == WHILELO ==
+
+define <vscale x 16 x i1> @whilelo_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelo_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelo { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilelo_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelo_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelo { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilelo_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelo_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelo { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilelo_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelo_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelo { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; == WHILELS ==
+
+define <vscale x 16 x i1> @whilels_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilels_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilels { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilels_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilels_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilels { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilels_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilels_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilels { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilels_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilels_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilels { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; == WHILELT ==
+
+define <vscale x 16 x i1> @whilelt_x2_nxv16i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelt_x2_nxv16i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelt { p0.b, p1.b }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilelt_x2_nxv8i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelt_x2_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelt { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilelt_x2_nxv4i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelt_x2_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelt { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilelt_x2_nxv2i1(i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelt_x2_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelt { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    // kill: def $p0 killed $p0 killed $p0_p1
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 %m, i64 %n)
+  %res = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  ret <vscale x 2 x i1> %res
+}
+
+
+; Test that we get good code quality when using the while intrinsics in combination with other intrinsics.
+
+define <vscale x 32 x i1> @codegen_whilege_b16_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilege_b16_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilege { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+define <vscale x 32 x i1> @codegen_whilegt_b32_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilegt_b32_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilegt { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+define <vscale x 32 x i1> @codegen_whilehi_b64_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilehi_b64_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilehi { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 2 x i1>, <vscale x 2 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 2 x i1>, <vscale x 2 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+define <vscale x 32 x i1> @codegen_whilehs_b16_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilehs_b16_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilehs { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+define <vscale x 32 x i1> @codegen_whilele_b32_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilele_b32_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilele { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+define <vscale x 32 x i1> @codegen_whilelo_b64_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilelo_b64_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilelo { p0.d, p1.d }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 2 x i1>, <vscale x 2 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 2 x i1>, <vscale x 2 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+define <vscale x 32 x i1> @codegen_whilels_b16_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilels_b16_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilels { p0.h, p1.h }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 8 x i1>, <vscale x 8 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+define <vscale x 32 x i1> @codegen_whilelt_b32_x2(i64 noundef %op1, i64 noundef %op2) nounwind {
+; CHECK-LABEL: codegen_whilelt_b32_x2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilelt { p0.s, p1.s }, x0, x1
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64 %op1, i64 %op2)
+  %1 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %0, 0
+  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
+  %3 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> poison, <vscale x 16 x i1> %2, i64 0)
+  %4 = extractvalue { <vscale x 4 x i1>, <vscale x 4 x i1> } %0, 1
+  %5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %4)
+  %6 = tail call <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1> %3, <vscale x 16 x i1> %5, i64 16)
+  ret <vscale x 32 x i1> %6
+}
+
+
+; == Test that predicate pairs are allocated starting at an even-numbered register ==
+
+define <vscale x 16 x i1> @whilege_x2_nxv16i1_reg_off(<vscale x 16 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilege_x2_nxv16i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilege { p2.b, p3.b }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  %res = and <vscale x 16 x i1> %part1, %p0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilegt_x2_nxv8i1_reg_off(<vscale x 8 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilegt_x2_nxv8i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilegt { p2.h, p3.h }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  %res = and <vscale x 8 x i1> %part1, %p0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilehi_x2_nxv4i1_reg_off(<vscale x 4 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehi_x2_nxv4i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehi { p2.s, p3.s }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  %res = and <vscale x 4 x i1> %part1, %p0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilehs_x2_nxv2i1_reg_off(<vscale x 2 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilehs_x2_nxv2i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilehs { p2.d, p3.d }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  %res = and <vscale x 2 x i1> %part1, %p0
+  ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 16 x i1> @whilele_x2_nxv16i1_reg_off(<vscale x 16 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilele_x2_nxv16i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilele { p2.b, p3.b }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pp, 0
+  %res = and <vscale x 16 x i1> %part1, %p0
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @whilelo_x2_nxv8i1_reg_off(<vscale x 8 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelo_x2_nxv8i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelo { p2.h, p3.h }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 8 x i1>, <vscale x 8 x i1>} %pp, 0
+  %res = and <vscale x 8 x i1> %part1, %p0
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @whilels_x2_nxv4i1_reg_off(<vscale x 4 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilels_x2_nxv4i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilels { p2.s, p3.s }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 4 x i1>, <vscale x 4 x i1>} %pp, 0
+  %res = and <vscale x 4 x i1> %part1, %p0
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @whilelt_x2_nxv2i1_reg_off(<vscale x 2 x i1> %p0, i64 %m, i64 %n) nounwind {
+; CHECK-LABEL: whilelt_x2_nxv2i1_reg_off:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    whilelt { p2.d, p3.d }, x0, x1
+; CHECK-NEXT:    and p0.b, p2/z, p2.b, p0.b
+; CHECK-NEXT:    ret
+  %pp = call { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64 %m, i64 %n)
+  %part1 = extractvalue {<vscale x 2 x i1>, <vscale x 2 x i1>} %pp, 0
+  %res = and <vscale x 2 x i1> %part1, %p0
+  ret <vscale x 2 x i1> %res
+}
+
+; == WHILEGE ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilege.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilege.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilege.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilege.x2.nxv2i1(i64, i64)
+
+; == WHILEGT ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilegt.x2.nxv2i1(i64, i64)
+
+; == WHILEHI ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilehi.x2.nxv2i1(i64, i64)
+
+; == WHILEHS ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilehs.x2.nxv2i1(i64, i64)
+
+; == WHILELE ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilele.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilele.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilele.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilele.x2.nxv2i1(i64, i64)
+
+; == WHILELO ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilelo.x2.nxv2i1(i64, i64)
+
+; == WHILELS ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilels.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilels.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilels.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilels.x2.nxv2i1(i64, i64)
+
+; == WHILELT ==
+declare { <vscale x 16 x i1>, <vscale x 16 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv16i1(i64, i64)
+declare { <vscale x 8 x i1>, <vscale x 8 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv8i1(i64, i64)
+declare { <vscale x 4 x i1>, <vscale x 4 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv4i1(i64, i64)
+declare { <vscale x 2 x i1>, <vscale x 2 x i1> } @llvm.aarch64.sve.whilelt.x2.nxv2i1(i64, i64)
+
+; == SVBOOL CONVERSION ==
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+
+; == VECTOR INSERTS ==
+declare <vscale x 32 x i1> @llvm.vector.insert.nxv32i1.nxv16i1(<vscale x 32 x i1>, <vscale x 16 x i1>, i64 immarg)