[llvm] [AArch64] Consider runtime mode when deciding to use SVE for fixed-length vectors. (PR #96081)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 19 08:10:04 PDT 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-aarch64

Author: Sander de Smalen (sdesmalen-arm)

<details>
<summary>Changes</summary>

This also fixes the case where an SVE div is incorrectly assumed to be available in non-streaming mode with SME.

---

Patch is 65.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96081.diff


56 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+18-8) 
- (modified) llvm/lib/Target/AArch64/AArch64Subtarget.h (+9-3) 
- (modified) llvm/lib/Target/AArch64/AArch64TargetMachine.cpp (+7-1) 
- (modified) llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll (+216-100) 
- (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-mova-insert.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-and-combine.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitcast.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bitselect.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-build-vector.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-concat.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-vector-elt.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll (+2-3) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-extends.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-log.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mul.ll (+1-2) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-limit-duplane.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-gather-scatter.ll (+48-43) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-subvector.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc-stores.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll (+1-1) 
- (modified) llvm/test/CodeGen/AArch64/sve-streaming-mode-test-register-mov.ll (+1-1) 


``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c790209cc221f..a4fa25ffdd6ff 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1418,7 +1418,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
-  if (Subtarget->hasSVEorSME()) {
+  if (Subtarget->isSVEorStreamingSVEAvailable()) {
     for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
       setOperationAction(ISD::BITREVERSE, VT, Custom);
       setOperationAction(ISD::BSWAP, VT, Custom);
@@ -1528,14 +1528,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       }
     }
 
-    // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
-    for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
-                    MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
-                    MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+    // NEON doesn't support masked loads/stores, but SME and SVE do.
+    for (auto VT :
+         {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
+          MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+          MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::MSTORE, VT, Custom);
-      setOperationAction(ISD::MGATHER, VT, Custom);
-      setOperationAction(ISD::MSCATTER, VT, Custom);
+    }
+
+    // NEON doesn't support masked gathers/scatters, but SVE does.
+    if (Subtarget->isSVEAvailable()) {
+      for (auto VT :
+           {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
+            MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
+            MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
+        setOperationAction(ISD::MGATHER, VT, Custom);
+        setOperationAction(ISD::MSCATTER, VT, Custom);
+      }
     }
 
     // Firstly, exclude all scalable vector extending loads/truncating stores,
@@ -6986,7 +6996,7 @@ bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
 
   // NEON-sized vectors can be emulated using SVE instructions.
   if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
-    return Subtarget->hasSVEorSME();
+    return Subtarget->isSVEorStreamingSVEAvailable();
 
   // Ensure NEON MVTs only belong to a single register class.
   if (VT.getFixedSizeInBits() <= 128)
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 7ef7a89b5749f..5e1a370778914 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -185,6 +185,12 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
            (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
   }
 
+  /// Returns true if the target has access to either the full range of SVE instructions,
+  /// or the streaming-compatible subset of SVE instructions.
+  bool isSVEorStreamingSVEAvailable() const {
+    return hasSVE() || hasSMEFA64() || (hasSME() && isStreaming());
+  }
+
   unsigned getMinVectorRegisterBitWidth() const {
     // Don't assume any minimum vector size when PSTATE.SM may not be 0, because
     // we don't yet support streaming-compatible codegen support that we trust
@@ -374,11 +380,11 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   }
 
   bool useSVEForFixedLengthVectors() const {
-    if (!isNeonAvailable())
-      return hasSVEorSME();
+    if (!isSVEorStreamingSVEAvailable())
+      return false;
 
     // Prefer NEON unless larger SVE registers are available.
-    return hasSVEorSME() && getMinSVEVectorSizeInBits() >= 256;
+    return !isNeonAvailable() || getMinSVEVectorSizeInBits() >= 256;
   }
 
   bool useSVEForFixedLengthVectors(EVT VT) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 7de9071476e7f..f94fa037a42c4 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -187,6 +187,11 @@ static cl::opt<unsigned> SVEVectorBitsMinOpt(
              "with zero meaning no minimum size is assumed."),
     cl::init(0), cl::Hidden);
 
+static cl::opt<bool> ForceStreaming(
+    "force-streaming",
+    cl::desc("Force the use of streaming code for all functions"),
+    cl::init(false), cl::Hidden);
+
 static cl::opt<bool> ForceStreamingCompatible(
     "force-streaming-compatible",
     cl::desc("Force the use of streaming-compatible code for all functions"),
@@ -412,7 +417,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
   StringRef FS = FSAttr.isValid() ? FSAttr.getValueAsString() : TargetFS;
   bool HasMinSize = F.hasMinSize();
 
-  bool IsStreaming = F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
+  bool IsStreaming = ForceStreaming ||
+                     F.hasFnAttribute("aarch64_pstate_sm_enabled") ||
                      F.hasFnAttribute("aarch64_pstate_sm_body");
   bool IsStreamingCompatible =
       F.hasFnAttribute("aarch64_pstate_sm_compatible") ||
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 9c72afd84fa7c..cdf2a962f9322 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,NONSTREAMING
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,STREAMING
 
 ; WITH VSCALE RANGE
 
@@ -362,145 +362,261 @@ define i32 @add_i32_ctz_nxv16i1_poison(<vscale x 16 x i1> %a, i32 %b) {
 ; FIXED-WIDTH VECTOR TYPES
 
 define i32 @ctz_v16i1(<16 x i1> %a) {
-; CHECK-LABEL: ctz_v16i1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.b
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v16i1:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.16b, v0.16b, #7
+; NONSTREAMING-NEXT:    ptrue p0.b, vl16
+; NONSTREAMING-NEXT:    ptrue p1.b
+; NONSTREAMING-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONSTREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.b
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v16i1:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $q0 killed $q0 def $z0
+; STREAMING-NEXT:    ptrue p0.b, vl16
+; STREAMING-NEXT:    lsl z0.b, z0.b, #7
+; STREAMING-NEXT:    ptrue p1.b
+; STREAMING-NEXT:    asr z0.b, z0.b, #7
+; STREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.b
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
   ret i32 %res
 }
 
 define i32 @ctz_v16i1_poison(<16 x i1> %a) {
-; CHECK-LABEL: ctz_v16i1_poison:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.b
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v16i1_poison:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.16b, v0.16b, #7
+; NONSTREAMING-NEXT:    ptrue p0.b, vl16
+; NONSTREAMING-NEXT:    ptrue p1.b
+; NONSTREAMING-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONSTREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.b
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v16i1_poison:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $q0 killed $q0 def $z0
+; STREAMING-NEXT:    ptrue p0.b, vl16
+; STREAMING-NEXT:    lsl z0.b, z0.b, #7
+; STREAMING-NEXT:    ptrue p1.b
+; STREAMING-NEXT:    asr z0.b, z0.b, #7
+; STREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.b
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1)
   ret i32 %res
 }
 
 define i64 @add_i64_ctz_v16i1_poison(<16 x i1> %a, i64 %b) {
-; CHECK-LABEL: add_i64_ctz_v16i1_poison:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.16b, v0.16b, #7
-; CHECK-NEXT:    ptrue p0.b, vl16
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    cmlt v0.16b, v0.16b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    incp x0, p0.b
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: add_i64_ctz_v16i1_poison:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.16b, v0.16b, #7
+; NONSTREAMING-NEXT:    ptrue p0.b, vl16
+; NONSTREAMING-NEXT:    ptrue p1.b
+; NONSTREAMING-NEXT:    cmlt v0.16b, v0.16b, #0
+; NONSTREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    incp x0, p0.b
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: add_i64_ctz_v16i1_poison:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $q0 killed $q0 def $z0
+; STREAMING-NEXT:    ptrue p0.b, vl16
+; STREAMING-NEXT:    lsl z0.b, z0.b, #7
+; STREAMING-NEXT:    ptrue p1.b
+; STREAMING-NEXT:    asr z0.b, z0.b, #7
+; STREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    incp x0, p0.b
+; STREAMING-NEXT:    ret
   %res = call i64 @llvm.experimental.cttz.elts.i64.v16i1(<16 x i1> %a, i1 1)
   %add = add i64 %res, %b
   ret i64 %add
 }
 
 define i32 @ctz_v8i1(<8 x i1> %a) {
-; CHECK-LABEL: ctz_v8i1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.8b, v0.8b, #7
-; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.b
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v8i1:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.8b, v0.8b, #7
+; NONSTREAMING-NEXT:    ptrue p0.b, vl8
+; NONSTREAMING-NEXT:    ptrue p1.b
+; NONSTREAMING-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONSTREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.b
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v8i1:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT:    ptrue p0.b, vl8
+; STREAMING-NEXT:    lsl z0.b, z0.b, #7
+; STREAMING-NEXT:    ptrue p1.b
+; STREAMING-NEXT:    asr z0.b, z0.b, #7
+; STREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.b
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 0)
   ret i32 %res
 }
 
 define i32 @ctz_v8i1_poison(<8 x i1> %a) {
-; CHECK-LABEL: ctz_v8i1_poison:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.8b, v0.8b, #7
-; CHECK-NEXT:    ptrue p0.b, vl8
-; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    cmlt v0.8b, v0.8b, #0
-; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.b
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v8i1_poison:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.8b, v0.8b, #7
+; NONSTREAMING-NEXT:    ptrue p0.b, vl8
+; NONSTREAMING-NEXT:    ptrue p1.b
+; NONSTREAMING-NEXT:    cmlt v0.8b, v0.8b, #0
+; NONSTREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.b
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v8i1_poison:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT:    ptrue p0.b, vl8
+; STREAMING-NEXT:    lsl z0.b, z0.b, #7
+; STREAMING-NEXT:    ptrue p1.b
+; STREAMING-NEXT:    asr z0.b, z0.b, #7
+; STREAMING-NEXT:    cmpne p0.b, p0/z, z0.b, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.b
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v8i1(<8 x i1> %a, i1 1)
   ret i32 %res
 }
 
 define i32 @ctz_v4i1(<4 x i1> %a) {
-; CHECK-LABEL: ctz_v4i1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.4h, v0.4h, #15
-; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.h
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v4i1:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.4h, v0.4h, #15
+; NONSTREAMING-NEXT:    ptrue p0.h, vl4
+; NONSTREAMING-NEXT:    ptrue p1.h
+; NONSTREAMING-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONSTREAMING-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.h
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v4i1:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT:    ptrue p0.h, vl4
+; STREAMING-NEXT:    lsl z0.h, z0.h, #15
+; STREAMING-NEXT:    ptrue p1.h
+; STREAMING-NEXT:    asr z0.h, z0.h, #15
+; STREAMING-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.h
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 0)
   ret i32 %res
 }
 
 define i32 @ctz_v4i1_poison(<4 x i1> %a) {
-; CHECK-LABEL: ctz_v4i1_poison:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.4h, v0.4h, #15
-; CHECK-NEXT:    ptrue p0.h, vl4
-; CHECK-NEXT:    ptrue p1.h
-; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
-; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.h
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v4i1_poison:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.4h, v0.4h, #15
+; NONSTREAMING-NEXT:    ptrue p0.h, vl4
+; NONSTREAMING-NEXT:    ptrue p1.h
+; NONSTREAMING-NEXT:    cmlt v0.4h, v0.4h, #0
+; NONSTREAMING-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.h
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v4i1_poison:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT:    ptrue p0.h, vl4
+; STREAMING-NEXT:    lsl z0.h, z0.h, #15
+; STREAMING-NEXT:    ptrue p1.h
+; STREAMING-NEXT:    asr z0.h, z0.h, #15
+; STREAMING-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.h
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v4i1(<4 x i1> %a, i1 1)
   ret i32 %res
 }
 
 define i32 @ctz_v2i1(<2 x i1> %a) {
-; CHECK-LABEL: ctz_v2i1:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.2s, v0.2s, #31
-; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.s
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v2i1:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.2s, v0.2s, #31
+; NONSTREAMING-NEXT:    ptrue p0.s, vl2
+; NONSTREAMING-NEXT:    ptrue p1.s
+; NONSTREAMING-NEXT:    cmlt v0.2s, v0.2s, #0
+; NONSTREAMING-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.s
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v2i1:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT:    ptrue p0.s, vl2
+; STREAMING-NEXT:    lsl z0.s, z0.s, #31
+; STREAMING-NEXT:    ptrue p1.s
+; STREAMING-NEXT:    asr z0.s, z0.s, #31
+; STREAMING-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.s
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 0)
   ret i32 %res
 }
 
 define i32 @ctz_v2i1_poison(<2 x i1> %a) {
-; CHECK-LABEL: ctz_v2i1_poison:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    shl v0.2s, v0.2s, #31
-; CHECK-NEXT:    ptrue p0.s, vl2
-; CHECK-NEXT:    ptrue p1.s
-; CHECK-NEXT:    cmlt v0.2s, v0.2s, #0
-; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
-; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
-; CHECK-NEXT:    cntp x0, p0, p0.s
-; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
-; CHECK-NEXT:    ret
+; NONSTREAMING-LABEL: ctz_v2i1_poison:
+; NONSTREAMING:       // %bb.0:
+; NONSTREAMING-NEXT:    shl v0.2s, v0.2s, #31
+; NONSTREAMING-NEXT:    ptrue p0.s, vl2
+; NONSTREAMING-NEXT:    ptrue p1.s
+; NONSTREAMING-NEXT:    cmlt v0.2s, v0.2s, #0
+; NONSTREAMING-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; NONSTREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; NONSTREAMING-NEXT:    cntp x0, p0, p0.s
+; NONSTREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; NONSTREAMING-NEXT:    ret
+;
+; STREAMING-LABEL: ctz_v2i1_poison:
+; STREAMING:       // %bb.0:
+; STREAMING-NEXT:    // kill: def $d0 killed $d0 def $z0
+; STREAMING-NEXT:    ptrue p0.s, vl2
+; STREAMING-NEXT:    lsl z0.s, z0.s, #31
+; STREAMING-NEXT:    ptrue p1.s
+; STREAMING-NEXT:    asr z0.s, z0.s, #31
+; STREAMING-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; STREAMING-NEXT:    brkb p0.b, p1/z, p0.b
+; STREAMING-NEXT:    cntp x0, p0, p0.s
+; STREAMING-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; STREAMING-NEXT:    ret
   %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1)
   ret i32 %res
 }
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
index 25f3540766618..48fbd14bd8540 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-mova-extract.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 16 x i8> @extract_row_b(<vscale x 16 x i8> %zd, <vscale x 16 x i1> %pg, i32 %tileslice) {
 ; CHECK-LABEL: extract_row_b:
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/96081


More information about the llvm-commits mailing list