[llvm] [AArch64] Lower factor-of-2 interleaved stores to STNP (PR #177938)

Tomer Shafir via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 10 08:16:17 PST 2026


https://github.com/tomershafir updated https://github.com/llvm/llvm-project/pull/177938

>From 33d647c3795a1c8fac55be44e6728fc619bf9002 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Tue, 10 Feb 2026 13:54:55 +0200
Subject: [PATCH 1/3] [AArch64] Lower factor-of-2 interleaved stores to STNP

This patch prioritizes lowering to `stnp` over `st2` for store instructions marked `!nontemporal`.

In terms of performance, we should conservatively prioritize STNP lowering for non-temporal stores, because NT stores currently require explicit usage of the `__builtin_nontemporal_store()` intrinsic, so I think it's reasonable to assume the developer explicitly intends to optimize D-cache usage of some hot non-temporal execution. They can roll back if it doesn't help.

The cost here is that it adds a few instructions to the code size (thus we only apply it when not optimizing for code size), a few extra fast instructions to execute, a few extra short dependency chains (which should commonly be handled by out-of-order execution), I-cache alignment effects, and a few extra registers. In the future we may be able to approximate a cost model to select by.

The patch implements an AArch64-specific static function to model which NT stores are currently directly legal in the backend, and updates
`AArch64TargetLowering::lowerInterleavedStore` to conditionally skip st2 lowering.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |   63 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |    4 +-
 .../AArch64/AArch64TargetTransformInfo.h      |    2 +-
 .../nontemporal-store-interleaved-optsize.ll  |   97 ++
 .../AArch64/nontemporal-store-interleaved.ll  | 1014 +++++++++++++++++
 5 files changed, 1176 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/nontemporal-store-interleaved-optsize.ll
 create mode 100644 llvm/test/CodeGen/AArch64/nontemporal-store-interleaved.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6c0544005e1dd..c30488122ccd6 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7335,7 +7335,7 @@ static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
 //
 // Coordinated with STNP constraints in
 // `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
-// `AArch64TargetLowering::ReplaceNodeResults`
+// `isLegalNTStore`
 static SDValue LowerNTStore(StoreSDNode *StoreNode, EVT VT, EVT MemVT,
                             const SDLoc &DL, SelectionDAG &DAG) {
   assert(StoreNode && "Expected a store operation");
@@ -18465,6 +18465,43 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
   return false;
 }
 
+// Coordinated with STNP handling in
+// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
+// `LowerNTStore`
+static bool isLegalNTStore(Type *DataType, Align Alignment,
+                           const DataLayout &DL) {
+  // Currently we only support NT stores lowering for little-endian targets.
+  if (!DL.isLittleEndian())
+    return false;
+
+  // The backend can lower to STNPWi in this case
+  if (DataType->isIntegerTy(64))
+    return true;
+
+  if (auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType)) {
+    unsigned NumElements = DataTypeTy->getNumElements();
+    unsigned EltSizeBits = DataTypeTy->getElementType()->getScalarSizeInBits();
+    unsigned TotalSizeBits =
+        DataTypeTy->getPrimitiveSizeInBits().getFixedValue();
+
+    // Currently only power-of-2 vectors are supported
+    if (!isPowerOf2_64(NumElements) || !isPowerOf2_64(EltSizeBits))
+      return false;
+
+    // The backend can lower to STNPSi or STNPDi in this case
+    // via `llvm/lib/Target/AArch64/AArch64InstrInfo.td`
+    if (TotalSizeBits == 64u || TotalSizeBits == 128u)
+      return true;
+
+    // The backend can lower to STNPQi in this case via `LowerNTStore`
+    if (TotalSizeBits == 256u && (EltSizeBits == 8u || EltSizeBits == 16u ||
+                                  EltSizeBits == 32u || EltSizeBits == 64u))
+      return true;
+  }
+
+  return false;
+}
+
 /// Lower an interleaved store into a stN intrinsic.
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
@@ -18573,6 +18610,30 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                             BaseAddr, DL)))
     return false;
 
+  // Conditionally skip nontemporal stores, because in that case we should
+  // prioritize emitting non-temporal store instructions, but AArch64 doesn't
+  // have non-temporal interleaved stores.
+  //
+  // Currently, STNP lowering can only either keep or increase code size,
+  // thus we predicate it to not apply when optimizing for code size.
+  //
+  // The check is conservative:
+  //
+  // - Don't skip if the interleaving factor is greater than 2, as the shuffling
+  // overhead becomes higher.
+  // - Don't skip if the stored value type is not directly legal. Such types
+  // may theoretically be split by legalization and lowered to STNPs, but they
+  // can also match only partially in the worst case and actually emit temporal
+  // stores.
+  //
+  // We may need to revisit this heuristic using an approximated cost model,
+  // also for higher factors.
+  Function *F = SI->getFunction();
+  if (Factor == 2 && SI->hasMetadata(LLVMContext::MD_nontemporal) &&
+      !F->hasOptSize() && !F->hasMinSize() &&
+      isLegalNTStore(SI->getValueOperand()->getType(), SI->getAlign(), DL))
+    return false;
+
   Type *PtrTy = SI->getPointerOperandType();
   Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
                                  STVTy->getElementCount());
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index dc5af3fa7f01c..06e635d43b172 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10753,8 +10753,8 @@ def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
 //
 // Currently we only support NT stores lowering for little-endian targets.
 //
-// Coordinated with STNP constraints in `AArch64TargetLowering::LowerNTStore`
-// and `AArch64TTIImpl::isLegalNTStore`.
+// Coordinated with STNP constraints in `LowerNTStore`
+// and `isLegalNTStore` from `AArch64ISelLowering.cpp`.
 //
 // Currently, STNP lowering can only either keep or increase code size, thus
 // we predicate it to not apply when optimizing for code size.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 494da4e1ff330..41173467a4046 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -403,7 +403,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
     //
     // Coordinated with LDNP and STNP constraints in
     // `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
-    // `AArch64TargetLowering`
+    // `AArch64ISelLowering.cpp`
     if (!ST->isLittleEndian())
       return false;
 
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-store-interleaved-optsize.ll b/llvm/test/CodeGen/AArch64/nontemporal-store-interleaved-optsize.ll
new file mode 100644
index 0000000000000..326cb4e1677b6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/nontemporal-store-interleaved-optsize.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64 | FileCheck %s --check-prefix CHECK-LE
+; RUN: llc < %s -mtriple aarch64_be | FileCheck %s --check-prefix CHECK-BE
+
+; Note: Currently, interleaved stores are only directly supported on AArch64 for 64 bit or 128 bit vector data.
+
+; Representative tests for each store type class where STNP shouldn't be lowered
+; when optimizing for size.
+
+define void @test_stnp_interleaved_store2_v2i32_Os(<2 x i32> %v0, <2 x i32> %v1, ptr %ptr) #0 {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2i32_Os:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1
+; CHECK-LE-NEXT:    st2 { v0.2s, v1.2s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2i32_Os:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v1.2s, v0.2s
+; CHECK-BE-NEXT:    st2 { v1.2s, v2.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x i32> %v0, <2 x i32> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v2i32_Oz(<2 x i32> %v0, <2 x i32> %v1, ptr %ptr) #1 {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2i32_Oz:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1 def $d0_d1
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1 def $d0_d1
+; CHECK-LE-NEXT:    st2 { v0.2s, v1.2s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2i32_Oz:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v1.2s, v0.2s
+; CHECK-BE-NEXT:    st2 { v1.2s, v2.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x i32> %v0, <2 x i32> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v2i64_Os(<2 x i64> %v0, <2 x i64> %v1, ptr %ptr) #0 {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2i64_Os:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-LE-NEXT:    st2 { v0.2d, v1.2d }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2i64_Os:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.2d, v2.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x i64> %v0, <2 x i64> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i64> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v2i64_Oz(<2 x i64> %v0, <2 x i64> %v1, ptr %ptr) #1 {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2i64_Oz:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-LE-NEXT:    st2 { v0.2d, v1.2d }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2i64_Oz:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.2d, v2.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x i64> %v0, <2 x i64> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i64> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+!0 = !{ i32 1 }
+
+attributes #0 = { optsize }
+attributes #1 = { minsize optsize }
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-store-interleaved.ll b/llvm/test/CodeGen/AArch64/nontemporal-store-interleaved.ll
new file mode 100644
index 0000000000000..398ffc5d2c941
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/nontemporal-store-interleaved.ll
@@ -0,0 +1,1014 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64 | FileCheck %s --check-prefixes=CHECK-LE
+; RUN: llc < %s -mtriple aarch64_be | FileCheck %s --check-prefixes=CHECK-BE
+
+; Note: Currently, interleaved stores are only directly supported on AArch64 for 64 bit or 128 bit vector data.
+; Thus, this test cannot cover lowering to `STNPSi`.
+
+define void @test_stnp_interleaved_store2_v2i32(<2 x i32> %v0, <2 x i32> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-LE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    mov d1, v0.d[1]
+; CHECK-LE-NEXT:    stnp d0, d1, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v1.2s, v0.2s
+; CHECK-BE-NEXT:    st2 { v1.2s, v2.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x i32> %v0, <2 x i32> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v4i16(<4 x i16> %v0, <4 x i16> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v4i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-LE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT:    mov d1, v0.d[1]
+; CHECK-LE-NEXT:    stnp d0, d1, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v4i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.4h, v1.4h
+; CHECK-BE-NEXT:    rev64 v1.4h, v0.4h
+; CHECK-BE-NEXT:    st2 { v1.4h, v2.4h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <4 x i16> %v0, <4 x i16> %v1,
+                    <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i16> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v8i8(<8 x i8> %v0, <8 x i8> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v8i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-LE-NEXT:    zip1 v0.16b, v0.16b, v1.16b
+; CHECK-LE-NEXT:    mov d1, v0.d[1]
+; CHECK-LE-NEXT:    stnp d0, d1, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v8i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.8b, v1.8b
+; CHECK-BE-NEXT:    rev64 v1.8b, v0.8b
+; CHECK-BE-NEXT:    st2 { v1.8b, v2.8b }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <8 x i8> %v0, <8 x i8> %v1,
+                    <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
+                                i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i8> %shuffle, ptr %ptr, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v2f32(<2 x float> %v0, <2 x float> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-LE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    mov d1, v0.d[1]
+; CHECK-LE-NEXT:    stnp d0, d1, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v1.2s, v0.2s
+; CHECK-BE-NEXT:    st2 { v1.2s, v2.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x float> %v0, <2 x float> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x float> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v4f16(<4 x half> %v0, <4 x half> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v4f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-LE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT:    mov d1, v0.d[1]
+; CHECK-LE-NEXT:    stnp d0, d1, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v4f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.4h, v1.4h
+; CHECK-BE-NEXT:    rev64 v1.4h, v0.4h
+; CHECK-BE-NEXT:    st2 { v1.4h, v2.4h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <4 x half> %v0, <4 x half> %v1,
+                    <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x half> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v2i64(<2 x i64> %v0, <2 x i64> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2i64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    zip2 v2.2d, v0.2d, v1.2d
+; CHECK-LE-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-LE-NEXT:    stnp q0, q2, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2i64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.2d, v2.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x i64> %v0, <2 x i64> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i64> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v4i32(<4 x i32> %v0, <4 x i32> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v4i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    stnp q0, q2, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v4i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.4s, v2.4s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <4 x i32> %v0, <4 x i32> %v1,
+                    <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v8i16(<8 x i16> %v0, <8 x i16> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v8i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT:    stnp q0, q2, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v8i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
+; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.8h, v2.8h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <8 x i16> %v0, <8 x i16> %v1,
+                    <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
+                               i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i16> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v16i8(<16 x i8> %v0, <16 x i8> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v16i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    adrp x8, .LCPI8_0
+; CHECK-LE-NEXT:    adrp x9, .LCPI8_1
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-LE-NEXT:    ldr q2, [x8, :lo12:.LCPI8_0]
+; CHECK-LE-NEXT:    ldr q3, [x9, :lo12:.LCPI8_1]
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-LE-NEXT:    tbl v2.16b, { v0.16b, v1.16b }, v2.16b
+; CHECK-LE-NEXT:    tbl v0.16b, { v0.16b }, v3.16b
+; CHECK-LE-NEXT:    stnp q0, q2, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v16i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
+; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v2.16b, v1.16b, #8
+; CHECK-BE-NEXT:    st2 { v2.16b, v3.16b }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <16 x i8> %v0, <16 x i8> %v1,
+                    <32 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
+                                i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15,
+                                i32 8, i32 16, i32 9, i32 17, i32 10, i32 18, i32 11, i32 19,
+                                i32 12, i32 20, i32 13, i32 21, i32 14, i32 22, i32 15, i32 23>
+  store <32 x i8> %shuffle, ptr %ptr, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v2f64(<2 x double> %v0, <2 x double> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2f64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    zip2 v2.2d, v0.2d, v1.2d
+; CHECK-LE-NEXT:    zip1 v0.2d, v0.2d, v1.2d
+; CHECK-LE-NEXT:    stnp q0, q2, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2f64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.2d, v2.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x double> %v0, <2 x double> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x double> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v4f32(<4 x float> %v0, <4 x float> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v4f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    stnp q0, q2, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v4f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.4s, v2.4s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <4 x float> %v0, <4 x float> %v1,
+                    <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x float> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v8f16(<8 x half> %v0, <8 x half> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v8f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    zip2 v2.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT:    zip1 v0.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT:    stnp q0, q2, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v8f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
+; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-BE-NEXT:    ext v2.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st2 { v1.8h, v2.8h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <8 x half> %v0, <8 x half> %v1,
+                    <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
+                               i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x half> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v2i32(<2 x i32> %v0, <2 x i32> %v1, <2 x i32> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v2i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    st3 { v0.2s, v1.2s, v2.2s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v2i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v4.2s, v2.2s
+; CHECK-BE-NEXT:    rev64 v3.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v2.2s, v0.2s
+; CHECK-BE-NEXT:    st3 { v2.2s, v3.2s, v4.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  %shuffle = shufflevector <4 x i32> %s0, <4 x i32> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v4i16(<4 x i16> %v0, <4 x i16> %v1, <4 x i16> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v4i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    st3 { v0.4h, v1.4h, v2.4h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v4i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v4.4h, v2.4h
+; CHECK-BE-NEXT:    rev64 v3.4h, v1.4h
+; CHECK-BE-NEXT:    rev64 v2.4h, v0.4h
+; CHECK-BE-NEXT:    st3 { v2.4h, v3.4h, v4.4h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i16> %v2, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <8 x i16> %s0, <8 x i16> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i16> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v8i8(<8 x i8> %v0, <8 x i8> %v1, <8 x i8> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v8i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    st3 { v0.8b, v1.8b, v2.8b }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v8i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v4.8b, v2.8b
+; CHECK-BE-NEXT:    rev64 v3.8b, v1.8b
+; CHECK-BE-NEXT:    rev64 v2.8b, v0.8b
+; CHECK-BE-NEXT:    st3 { v2.8b, v3.8b, v4.8b }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i8> %v2, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <16 x i8> %s0, <16 x i8> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i8> %shuffle, ptr %ptr, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v2f32(<2 x float> %v0, <2 x float> %v1, <2 x float> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v2f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    st3 { v0.2s, v1.2s, v2.2s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v2f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v4.2s, v2.2s
+; CHECK-BE-NEXT:    rev64 v3.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v2.2s, v0.2s
+; CHECK-BE-NEXT:    st3 { v2.2s, v3.2s, v4.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x float> %v0, <2 x float> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x float> %v2, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  %shuffle = shufflevector <4 x float> %s0, <4 x float> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x float> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v4f16(<4 x half> %v0, <4 x half> %v1, <4 x half> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v4f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2 def $d0_d1_d2
+; CHECK-LE-NEXT:    st3 { v0.4h, v1.4h, v2.4h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v4f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v4.4h, v2.4h
+; CHECK-BE-NEXT:    rev64 v3.4h, v1.4h
+; CHECK-BE-NEXT:    rev64 v2.4h, v0.4h
+; CHECK-BE-NEXT:    st3 { v2.4h, v3.4h, v4.4h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x half> %v2, <4 x half> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <8 x half> %s0, <8 x half> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x half> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v2i64(<2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v2i64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    st3 { v0.2d, v1.2d, v2.2d }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v2i64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st3 { v2.2d, v3.2d, v4.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i64> %v2, <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  %shuffle = shufflevector <4 x i64> %s0, <4 x i64> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x i64> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v4i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    st3 { v0.4s, v1.4s, v2.4s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v4i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.4s, v2.4s
+; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-BE-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st3 { v2.4s, v3.4s, v4.4s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v8i16(<8 x i16> %v0, <8 x i16> %v1, <8 x i16> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v8i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    st3 { v0.8h, v1.8h, v2.8h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v8i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
+; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
+; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-BE-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st3 { v2.8h, v3.8h, v4.8h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i16> %v2, <8 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <16 x i16> %s0, <16 x i16> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i16> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v16i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    st3 { v0.16b, v1.16b, v2.16b }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v16i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.16b, v2.16b
+; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
+; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-BE-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st3 { v2.16b, v3.16b, v4.16b }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s1 = shufflevector <16 x i8> %v2, <16 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <32 x i8> %s0, <32 x i8> %s1, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
+  store <48 x i8> %shuffle, ptr %ptr, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v2f64(<2 x double> %v0, <2 x double> %v1, <2 x double> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v2f64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    st3 { v0.2d, v1.2d, v2.2d }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v2f64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st3 { v2.2d, v3.2d, v4.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x double> %v2, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+  %shuffle = shufflevector <4 x double> %s0, <4 x double> %s1, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+  store <6 x double> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v4f32(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v4f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    st3 { v0.4s, v1.4s, v2.4s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v4f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.4s, v2.4s
+; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-BE-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st3 { v2.4s, v3.4s, v4.4s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x float> %v2, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <8 x float> %s0, <8 x float> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+  store <12 x float> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store3_v8f16(<8 x half> %v0, <8 x half> %v1, <8 x half> %v2, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store3_v8f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2 def $q0_q1_q2
+; CHECK-LE-NEXT:    st3 { v0.8h, v1.8h, v2.8h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store3_v8f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
+; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
+; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-BE-NEXT:    ext v4.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st3 { v2.8h, v3.8h, v4.8h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <8 x half> %v0, <8 x half> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x half> %v2, <8 x half> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle = shufflevector <16 x half> %s0, <16 x half> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x half> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+; Test conservative lowering of st4-matching patterns
+
+define void @test_stnp_interleaved_store4_v2i32(<2 x i32> %v0, <2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v2i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d3 killed $d3 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v2i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v6.2s, v3.2s
+; CHECK-BE-NEXT:    rev64 v5.2s, v2.2s
+; CHECK-BE-NEXT:    rev64 v4.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v3.2s, v0.2s
+; CHECK-BE-NEXT:    st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i32> %v2, <2 x i32> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle = shufflevector <4 x i32> %s0, <4 x i32> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v4i16(<4 x i16> %v0, <4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v4i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d3 killed $d3 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v4i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v6.4h, v3.4h
+; CHECK-BE-NEXT:    rev64 v5.4h, v2.4h
+; CHECK-BE-NEXT:    rev64 v4.4h, v1.4h
+; CHECK-BE-NEXT:    rev64 v3.4h, v0.4h
+; CHECK-BE-NEXT:    st4 { v3.4h, v4.4h, v5.4h, v6.4h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i16> %v2, <4 x i16> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i16> %s0, <8 x i16> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i16> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v8i8(<8 x i8> %v0, <8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v8i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d3 killed $d3 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    st4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v8i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v6.8b, v3.8b
+; CHECK-BE-NEXT:    rev64 v5.8b, v2.8b
+; CHECK-BE-NEXT:    rev64 v4.8b, v1.8b
+; CHECK-BE-NEXT:    rev64 v3.8b, v0.8b
+; CHECK-BE-NEXT:    st4 { v3.8b, v4.8b, v5.8b, v6.8b }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i8> %v2, <8 x i8> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle = shufflevector <16 x i8> %s0, <16 x i8> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i8> %shuffle, ptr %ptr, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v2f32(<2 x float> %v0, <2 x float> %v1, <2 x float> %v2, <2 x float> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v2f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d3 killed $d3 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    st4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v2f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v6.2s, v3.2s
+; CHECK-BE-NEXT:    rev64 v5.2s, v2.2s
+; CHECK-BE-NEXT:    rev64 v4.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v3.2s, v0.2s
+; CHECK-BE-NEXT:    st4 { v3.2s, v4.2s, v5.2s, v6.2s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x float> %v0, <2 x float> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x float> %v2, <2 x float> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle = shufflevector <4 x float> %s0, <4 x float> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x float> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v4f16(<4 x half> %v0, <4 x half> %v1, <4 x half> %v2, <4 x half> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v4f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d3 killed $d3 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d2 killed $d2 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 killed $d0_d1_d2_d3 def $d0_d1_d2_d3
+; CHECK-LE-NEXT:    st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v4f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v6.4h, v3.4h
+; CHECK-BE-NEXT:    rev64 v5.4h, v2.4h
+; CHECK-BE-NEXT:    rev64 v4.4h, v1.4h
+; CHECK-BE-NEXT:    rev64 v3.4h, v0.4h
+; CHECK-BE-NEXT:    st4 { v3.4h, v4.4h, v5.4h, v6.4h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x half> %v2, <4 x half> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x half> %s0, <8 x half> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x half> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v2i64(<2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v2i64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v2i64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st4 { v3.2d, v4.2d, v5.2d, v6.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x i64> %v2, <2 x i64> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle = shufflevector <4 x i64> %s0, <4 x i64> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x i64> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v4i32(<4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v4i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v4i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v3.4s, v3.4s
+; CHECK-BE-NEXT:    rev64 v2.4s, v2.4s
+; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-BE-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st4 { v3.4s, v4.4s, v5.4s, v6.4s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v8i16(<8 x i16> %v0, <8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v8i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v8i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v3.8h, v3.8h
+; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
+; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
+; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-BE-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st4 { v3.8h, v4.8h, v5.8h, v6.8h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i16> %v2, <8 x i16> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle = shufflevector <16 x i16> %s0, <16 x i16> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i16> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v16i8(<16 x i8> %v0, <16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v16i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    st4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v16i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v3.16b, v3.16b
+; CHECK-BE-NEXT:    rev64 v2.16b, v2.16b
+; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
+; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
+; CHECK-BE-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st4 { v3.16b, v4.16b, v5.16b, v6.16b }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %s1 = shufflevector <16 x i8> %v2, <16 x i8> %v3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %shuffle = shufflevector <32 x i8> %s0, <32 x i8> %s1, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
+  store <64 x i8> %shuffle, ptr %ptr, align 1, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v2f64(<2 x double> %v0, <2 x double> %v1, <2 x double> %v2, <2 x double> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v2f64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    st4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v2f64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st4 { v3.2d, v4.2d, v5.2d, v6.2d }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <2 x double> %v0, <2 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <2 x double> %v2, <2 x double> %v3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle = shufflevector <4 x double> %s0, <4 x double> %s1, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  store <8 x double> %shuffle, ptr %ptr, align 8, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v4f32(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, <4 x float> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v4f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    st4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v4f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v3.4s, v3.4s
+; CHECK-BE-NEXT:    rev64 v2.4s, v2.4s
+; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-BE-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st4 { v3.4s, v4.4s, v5.4s, v6.4s }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <4 x float> %v0, <4 x float> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %s1 = shufflevector <4 x float> %v2, <4 x float> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %shuffle = shufflevector <8 x float> %s0, <8 x float> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+  store <16 x float> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store4_v8f16(<8 x half> %v0, <8 x half> %v1, <8 x half> %v2, <8 x half> %v3, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store4_v8f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    // kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-LE-NEXT:    st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store4_v8f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v3.8h, v3.8h
+; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
+; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
+; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
+; CHECK-BE-NEXT:    ext v6.16b, v3.16b, v3.16b, #8
+; CHECK-BE-NEXT:    ext v5.16b, v2.16b, v2.16b, #8
+; CHECK-BE-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    st4 { v3.8h, v4.8h, v5.8h, v6.8h }, [x0]
+; CHECK-BE-NEXT:    ret
+entry:
+  %s0 = shufflevector <8 x half> %v0, <8 x half> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x half> %v2, <8 x half> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %shuffle = shufflevector <16 x half> %s0, <16 x half> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x half> %shuffle, ptr %ptr, align 2, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v3i32_non_pow2_elt_count(<3 x i32> %v0, <3 x i32> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v3i32_non_pow2_elt_count:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    zip1 v2.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    zip2 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT:    mov d3, v2.d[1]
+; CHECK-LE-NEXT:    str d0, [x0, #16]
+; CHECK-LE-NEXT:    stnp d2, d3, [x0]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v3i32_non_pow2_elt_count:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
+; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
+; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
+; CHECK-BE-NEXT:    zip2 v2.4s, v0.4s, v1.4s
+; CHECK-BE-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-BE-NEXT:    rev64 v1.4s, v2.4s
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x0]
+; CHECK-BE-NEXT:    str d1, [x0, #16]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <3 x i32> %v0, <3 x i32> %v1,
+                    <6 x i32> <i32 0, i32 3, i32 1, i32 4, i32 2, i32 5>
+  store <6 x i32> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+define void @test_stnp_interleaved_store2_v2i24_non_pow2_elt_size(<2 x i24> %v0, <2 x i24> %v1, ptr %ptr) {
+; CHECK-LE-LABEL: test_stnp_interleaved_store2_v2i24_non_pow2_elt_size:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-LE-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-LE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
+; CHECK-LE-NEXT:    fmov w9, s0
+; CHECK-LE-NEXT:    str h0, [x0]
+; CHECK-LE-NEXT:    mov w8, v1.s[1]
+; CHECK-LE-NEXT:    dup v1.2s, v0.s[1]
+; CHECK-LE-NEXT:    lsr w9, w9, #16
+; CHECK-LE-NEXT:    mov w10, v2.s[1]
+; CHECK-LE-NEXT:    strb w9, [x0, #2]
+; CHECK-LE-NEXT:    fmov w9, s1
+; CHECK-LE-NEXT:    sturh w8, [x0, #9]
+; CHECK-LE-NEXT:    lsr w8, w8, #16
+; CHECK-LE-NEXT:    str h1, [x0, #6]
+; CHECK-LE-NEXT:    strb w8, [x0, #11]
+; CHECK-LE-NEXT:    lsr w8, w10, #16
+; CHECK-LE-NEXT:    lsr w9, w9, #16
+; CHECK-LE-NEXT:    sturh w10, [x0, #3]
+; CHECK-LE-NEXT:    strb w8, [x0, #5]
+; CHECK-LE-NEXT:    strb w9, [x0, #8]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_stnp_interleaved_store2_v2i24_non_pow2_elt_size:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    rev64 v1.2s, v1.2s
+; CHECK-BE-NEXT:    rev64 v0.2s, v0.2s
+; CHECK-BE-NEXT:    zip1 v2.2s, v0.2s, v1.2s
+; CHECK-BE-NEXT:    fmov w9, s0
+; CHECK-BE-NEXT:    mov w8, v1.s[1]
+; CHECK-BE-NEXT:    dup v1.2s, v0.s[1]
+; CHECK-BE-NEXT:    stur b0, [x0, #2]
+; CHECK-BE-NEXT:    lsr w9, w9, #8
+; CHECK-BE-NEXT:    mov w10, v2.s[1]
+; CHECK-BE-NEXT:    strb w8, [x0, #11]
+; CHECK-BE-NEXT:    lsr w8, w8, #8
+; CHECK-BE-NEXT:    strh w9, [x0]
+; CHECK-BE-NEXT:    fmov w9, s1
+; CHECK-BE-NEXT:    sturh w8, [x0, #9]
+; CHECK-BE-NEXT:    stur b1, [x0, #8]
+; CHECK-BE-NEXT:    lsr w8, w10, #8
+; CHECK-BE-NEXT:    lsr w9, w9, #8
+; CHECK-BE-NEXT:    strb w10, [x0, #5]
+; CHECK-BE-NEXT:    sturh w8, [x0, #3]
+; CHECK-BE-NEXT:    strh w9, [x0, #6]
+; CHECK-BE-NEXT:    ret
+entry:
+  %shuffle = shufflevector <2 x i24> %v0, <2 x i24> %v1,
+                    <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  store <4 x i24> %shuffle, ptr %ptr, align 4, !nontemporal !0
+  ret void
+}
+
+!0 = !{ i32 1 }

>From f7d0e87683e4f6e419864ccc942c93340b69b12c Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Tue, 10 Feb 2026 17:09:53 +0200
Subject: [PATCH 2/3] small cleanups and add an assert

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 98 +++++++++----------
 1 file changed, 48 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index c30488122ccd6..1a21aeba30158 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7331,6 +7331,46 @@ static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
   }
 }
 
+// Coordinated with STNP handling in
+// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
+// `LowerNTStore`
+static bool isLegalNTStore(Type *DataType, Align Alignment,
+                           const DataLayout &DL) {
+  // Currently we only support NT store lowering for little-endian targets.
+  if (!DL.isLittleEndian())
+    return false;
+
+  // The backend can lower to STNPWi in this case
+  if (DataType->isIntegerTy(64))
+    return true;
+
+  auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType);
+  if (!DataTypeTy)
+    return false;
+
+  // Check fixed vector legality
+  unsigned NumElements = DataTypeTy->getNumElements();
+  unsigned EltSizeBits = DataTypeTy->getElementType()->getScalarSizeInBits();
+
+  // Currently only power-of-2 vectors are supported
+  if (!isPowerOf2_64(NumElements) || !isPowerOf2_64(EltSizeBits))
+    return false;
+
+  unsigned TotalSizeBits = DataTypeTy->getPrimitiveSizeInBits().getFixedValue();
+
+  // The backend can lower to STNPSi or STNPDi in this case
+  // via `llvm/lib/Target/AArch64/AArch64InstrInfo.td`
+  if (TotalSizeBits == 64u || TotalSizeBits == 128u)
+    return true;
+
+  // The backend can lower to STNPQi in this case via `LowerNTStore`
+  if (TotalSizeBits == 256u && (EltSizeBits == 8u || EltSizeBits == 16u ||
+                                EltSizeBits == 32u || EltSizeBits == 64u))
+    return true;
+
+  return false;
+}
+
 // Lower non-temporal stores that would otherwise be broken by legalization.
 //
 // Coordinated with STNP constraints in
@@ -7374,6 +7414,9 @@ static SDValue LowerNTStore(StoreSDNode *StoreNode, EVT VT, EVT MemVT,
           {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
            DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
           StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+      assert(isLegalNTStore(MemVT.getTypeForEVT(*DAG.getContext()),
+                            StoreNode->getAlign(), DAG.getDataLayout()) &&
+             "Lowering should be consistent with legality");
       return Result;
     }
   }
@@ -18465,43 +18508,6 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
   return false;
 }
 
-// Coordinated with STNP handling in
-// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
-// `LowerNTStore`
-static bool isLegalNTStore(Type *DataType, Align Alignment,
-                           const DataLayout &DL) {
-  // Currently we only support NT stores lowering for little-endian targets.
-  if (!DL.isLittleEndian())
-    return false;
-
-  // The backend can lower to STNPWi in this case
-  if (DataType->isIntegerTy(64))
-    return true;
-
-  if (auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType)) {
-    unsigned NumElements = DataTypeTy->getNumElements();
-    unsigned EltSizeBits = DataTypeTy->getElementType()->getScalarSizeInBits();
-    unsigned TotalSizeBits =
-        DataTypeTy->getPrimitiveSizeInBits().getFixedValue();
-
-    // Currently only power-of-2 vectors are supported
-    if (!isPowerOf2_64(NumElements) || !isPowerOf2_64(EltSizeBits))
-      return false;
-
-    // The backend can lower to STNPSi or STNPDi in this case
-    // via `llvm/lib/Target/AArch64/AArch64InstrInfo.td`
-    if (TotalSizeBits == 64u || TotalSizeBits == 128u)
-      return true;
-
-    // The backend can lower to STNPQi in this case via `LowerNTStore`
-    if (TotalSizeBits == 256u && (EltSizeBits == 8u || EltSizeBits == 16u ||
-                                  EltSizeBits == 32u || EltSizeBits == 64u))
-      return true;
-  }
-
-  return false;
-}
-
 /// Lower an interleaved store into a stN intrinsic.
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
@@ -18610,24 +18616,16 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                             BaseAddr, DL)))
     return false;
 
-  // Conditionally skip nontemporal stores, because in that case we should
-  // prioritize emitting non-temporal store instructions, but AArch64 doesn't
-  // have non-temporal interleaved stores.
-  //
-  // Currently, STNP lowering can only either keep or increase code size,
-  // thus we predicate it to not apply when optimizing for code size.
+  // Conditionally skip st2 lowering for nontemporal stores so that STNP can be
+  // emitted instead, since AArch64 doesn't have non-temporal interleaved
+  // stores.
   //
   // The check is conservative:
   //
+  // - Only when not optimizing for size, as STNP lowering can increase size.
   // - Don't skip if the interleaving factor is greater than 2, as the shuffling
   // overhead becomes higher.
-  // - Don't skip if the store value types which are not directly legal. They
-  // may theoratically be split by legalization and lowered to STNPs, but they
-  // can also match only partially in the worst case and actually emit temporal
-  // stores.
-  //
-  // We may need to revisit this heuristic using an approximated cost model,
-  // also for higher factors.
+  // - Don't skip if the store value type is not directly legal.
   Function *F = SI->getFunction();
   if (Factor == 2 && SI->hasMetadata(LLVMContext::MD_nontemporal) &&
       !F->hasOptSize() && !F->hasMinSize() &&

>From 8786787bae783b6d00bc841b0023e5db483ff825 Mon Sep 17 00:00:00 2001
From: tomershafir <tomer.shafir8 at gmail.com>
Date: Tue, 10 Feb 2026 17:40:02 +0200
Subject: [PATCH 3/3] add test for unprioritized nt loads

---
 .../AArch64/nontemporal-load-interleaved.ll   | 999 ++++++++++++++++++
 1 file changed, 999 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/nontemporal-load-interleaved.ll

diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load-interleaved.ll b/llvm/test/CodeGen/AArch64/nontemporal-load-interleaved.ll
new file mode 100644
index 0000000000000..bda66d43e4b32
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load-interleaved.ll
@@ -0,0 +1,999 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple aarch64 | FileCheck %s --check-prefixes=CHECK-LE
+; RUN: llc < %s -mtriple aarch64_be | FileCheck %s --check-prefixes=CHECK-BE
+
+; Note: These tests show that LDNP is not (yet) prioritized for interleaved loads.
+
+define void @test_ldnp_interleaved_load2_v2i32(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v2i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v2i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2s }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <4 x i32>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <4 x i32> %loaded, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x i32> %loaded, <4 x i32> poison, <2 x i32> <i32 1, i32 3>
+  store <2 x i32> %v0, ptr %out0
+  store <2 x i32> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v4i16(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v4i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.4h, v1.4h }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v4i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.4h, v1.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4h }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x i16>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <8 x i16> %loaded, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v1 = shufflevector <8 x i16> %loaded, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  store <4 x i16> %v0, ptr %out0
+  store <4 x i16> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v8i8(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v8i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.8b, v1.8b }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v8i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.8b, v1.8b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8b }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <16 x i8>, ptr %ptr, align 1, !nontemporal !0
+  %v0 = shufflevector <16 x i8> %loaded, <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i8> %loaded, <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x i8> %v0, ptr %out0
+  store <8 x i8> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v2f32(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v2f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v2f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2s }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <4 x float>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <4 x float> %loaded, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x float> %loaded, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  store <2 x float> %v0, ptr %out0
+  store <2 x float> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v4f16(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v4f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.4h, v1.4h }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v4f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.4h, v1.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4h }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x half>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <8 x half> %loaded, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v1 = shufflevector <8 x half> %loaded, <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  store <4 x half> %v0, ptr %out0
+  store <4 x half> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v2i64(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v2i64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.2d, v1.2d }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v2i64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.2d, v1.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <4 x i64>, ptr %ptr, align 8, !nontemporal !0
+  %v0 = shufflevector <4 x i64> %loaded, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x i64> %loaded, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
+  store <2 x i64> %v0, ptr %out0
+  store <2 x i64> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v4i32(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v4i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.4s, v1.4s }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v4i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.4s, v1.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x i32>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <8 x i32> %loaded, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v1 = shufflevector <8 x i32> %loaded, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  store <4 x i32> %v0, ptr %out0
+  store <4 x i32> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v8i16(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v8i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.8h, v1.8h }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v8i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.8h, v1.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8h }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <16 x i16>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <16 x i16> %loaded, <16 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i16> %loaded, <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x i16> %v0, ptr %out0
+  store <8 x i16> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v16i8(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v16i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.16b, v1.16b }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v16i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.16b, v1.16b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.16b }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <32 x i8>, ptr %ptr, align 1, !nontemporal !0
+  %v0 = shufflevector <32 x i8> %loaded, <32 x i8> poison,
+                      <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
+                                  i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %v1 = shufflevector <32 x i8> %loaded, <32 x i8> poison,
+                      <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
+                                  i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  store <16 x i8> %v0, ptr %out0
+  store <16 x i8> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v2f64(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v2f64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.2d, v1.2d }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v2f64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.2d, v1.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <4 x double>, ptr %ptr, align 8, !nontemporal !0
+  %v0 = shufflevector <4 x double> %loaded, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x double> %loaded, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  store <2 x double> %v0, ptr %out0
+  store <2 x double> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v4f32(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v4f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.4s, v1.4s }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v4f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.4s, v1.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x float>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <8 x float> %loaded, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v1 = shufflevector <8 x float> %loaded, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  store <4 x float> %v0, ptr %out0
+  store <4 x float> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load2_v8f16(ptr %ptr, ptr %out0, ptr %out1) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load2_v8f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld2 { v0.8h, v1.8h }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load2_v8f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld2 { v0.8h, v1.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8h }, [x2]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <16 x half>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <16 x half> %loaded, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x half> %loaded, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  store <8 x half> %v0, ptr %out0
+  store <8 x half> %v1, ptr %out1
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v2i32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v2i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.2s, v1.2s, v2.2s }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v2i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.2s, v1.2s, v2.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2s }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <6 x i32>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <6 x i32> %loaded, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <6 x i32> %loaded, <6 x i32> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <6 x i32> %loaded, <6 x i32> poison, <2 x i32> <i32 2, i32 5>
+  store <2 x i32> %v0, ptr %out0
+  store <2 x i32> %v1, ptr %out1
+  store <2 x i32> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v4i16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v4i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.4h, v1.4h, v2.4h }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v4i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.4h, v1.4h, v2.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4h }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <12 x i16>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <12 x i16> %loaded, <12 x i16> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i16> %loaded, <12 x i16> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i16> %loaded, <12 x i16> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  store <4 x i16> %v0, ptr %out0
+  store <4 x i16> %v1, ptr %out1
+  store <4 x i16> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v8i8(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v8i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.8b, v1.8b, v2.8b }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v8i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.8b, v1.8b, v2.8b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8b }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.8b }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <24 x i8>, ptr %ptr, align 1, !nontemporal !0
+  %v0 = shufflevector <24 x i8> %loaded, <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v1 = shufflevector <24 x i8> %loaded, <24 x i8> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v2 = shufflevector <24 x i8> %loaded, <24 x i8> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  store <8 x i8> %v0, ptr %out0
+  store <8 x i8> %v1, ptr %out1
+  store <8 x i8> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v2f32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v2f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.2s, v1.2s, v2.2s }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v2f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.2s, v1.2s, v2.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2s }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <6 x float>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <6 x float> %loaded, <6 x float> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <6 x float> %loaded, <6 x float> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <6 x float> %loaded, <6 x float> poison, <2 x i32> <i32 2, i32 5>
+  store <2 x float> %v0, ptr %out0
+  store <2 x float> %v1, ptr %out1
+  store <2 x float> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v4f16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v4f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.4h, v1.4h, v2.4h }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v4f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.4h, v1.4h, v2.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4h }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <12 x half>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <12 x half> %loaded, <12 x half> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x half> %loaded, <12 x half> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x half> %loaded, <12 x half> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  store <4 x half> %v0, ptr %out0
+  store <4 x half> %v1, ptr %out1
+  store <4 x half> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v2i64(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v2i64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v2i64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <6 x i64>, ptr %ptr, align 8, !nontemporal !0
+  %v0 = shufflevector <6 x i64> %loaded, <6 x i64> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <6 x i64> %loaded, <6 x i64> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <6 x i64> %loaded, <6 x i64> poison, <2 x i32> <i32 2, i32 5>
+  store <2 x i64> %v0, ptr %out0
+  store <2 x i64> %v1, ptr %out1
+  store <2 x i64> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v4i32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v4i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v4i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4s }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <12 x i32>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <12 x i32> %loaded, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x i32> %loaded, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x i32> %loaded, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  store <4 x i32> %v0, ptr %out0
+  store <4 x i32> %v1, ptr %out1
+  store <4 x i32> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v8i16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v8i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.8h, v1.8h, v2.8h }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v8i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.8h, v1.8h, v2.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.8h }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <24 x i16>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <24 x i16> %loaded, <24 x i16> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v1 = shufflevector <24 x i16> %loaded, <24 x i16> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v2 = shufflevector <24 x i16> %loaded, <24 x i16> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  store <8 x i16> %v0, ptr %out0
+  store <8 x i16> %v1, ptr %out1
+  store <8 x i16> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v16i8(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v16i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.16b, v1.16b, v2.16b }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v16i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.16b, v1.16b, v2.16b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.16b }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.16b }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <48 x i8>, ptr %ptr, align 1, !nontemporal !0
+  %v0 = shufflevector <48 x i8> %loaded, <48 x i8> poison,
+                      <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21,
+                                  i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
+  %v1 = shufflevector <48 x i8> %loaded, <48 x i8> poison,
+                      <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22,
+                                  i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46>
+  %v2 = shufflevector <48 x i8> %loaded, <48 x i8> poison,
+                      <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23,
+                                  i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47>
+  store <16 x i8> %v0, ptr %out0
+  store <16 x i8> %v1, ptr %out1
+  store <16 x i8> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v2f64(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v2f64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v2f64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.2d, v1.2d, v2.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <6 x double>, ptr %ptr, align 8, !nontemporal !0
+  %v0 = shufflevector <6 x double> %loaded, <6 x double> poison, <2 x i32> <i32 0, i32 3>
+  %v1 = shufflevector <6 x double> %loaded, <6 x double> poison, <2 x i32> <i32 1, i32 4>
+  %v2 = shufflevector <6 x double> %loaded, <6 x double> poison, <2 x i32> <i32 2, i32 5>
+  store <2 x double> %v0, ptr %out0
+  store <2 x double> %v1, ptr %out1
+  store <2 x double> %v2, ptr %out2
+  ret void
+}
+
+define void @test_ldnp_interleaved_load3_v4f32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v4f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v4f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4s }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <12 x float>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <12 x float> %loaded, <12 x float> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+  %v1 = shufflevector <12 x float> %loaded, <12 x float> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+  %v2 = shufflevector <12 x float> %loaded, <12 x float> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+  store <4 x float> %v0, ptr %out0
+  store <4 x float> %v1, ptr %out1
+  store <4 x float> %v2, ptr %out2
+  ret void
+}
+
+; Factor-3 de-interleave of a !nontemporal <24 x half> load into three
+; <8 x half> values (lanes i, i+3, ..., i+21 for i = 0, 1, 2), each stored
+; through its own pointer. Both sets of expected output contain a single ld3
+; with the .8h arrangement followed by three plain stores.
+define void @test_ldnp_interleaved_load3_v8f16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load3_v8f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld3 { v0.8h, v1.8h, v2.8h }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load3_v8f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld3 { v0.8h, v1.8h, v2.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.8h }, [x3]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <24 x half>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <24 x half> %loaded, <24 x half> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v1 = shufflevector <24 x half> %loaded, <24 x half> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v2 = shufflevector <24 x half> %loaded, <24 x half> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  store <8 x half> %v0, ptr %out0
+  store <8 x half> %v1, ptr %out1
+  store <8 x half> %v2, ptr %out2
+  ret void
+}
+
+; Test conservative lowering of patterns that match ld4
+
+; Factor-4 de-interleave of a !nontemporal <8 x i32> load into four <2 x i32>
+; values (lanes i and i+4 for i = 0..3), each stored through its own pointer.
+; Both sets of expected output contain a single ld4 with the .2s arrangement
+; followed by four plain 64-bit stores.
+define void @test_ldnp_interleaved_load4_v2i32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v2i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    str d3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v2i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2s }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.2s }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x i32>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <8 x i32> %loaded, <8 x i32> poison, <2 x i32> <i32 0, i32 4>
+  %v1 = shufflevector <8 x i32> %loaded, <8 x i32> poison, <2 x i32> <i32 1, i32 5>
+  %v2 = shufflevector <8 x i32> %loaded, <8 x i32> poison, <2 x i32> <i32 2, i32 6>
+  %v3 = shufflevector <8 x i32> %loaded, <8 x i32> poison, <2 x i32> <i32 3, i32 7>
+  store <2 x i32> %v0, ptr %out0
+  store <2 x i32> %v1, ptr %out1
+  store <2 x i32> %v2, ptr %out2
+  store <2 x i32> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <16 x i16> load into four <4 x i16>
+; values (lanes i, i+4, i+8, i+12 for i = 0..3), each stored through its own
+; pointer. Both sets of expected output contain a single ld4 with the .4h
+; arrangement followed by four plain 64-bit stores.
+define void @test_ldnp_interleaved_load4_v4i16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v4i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    str d3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v4i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4h }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.4h }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <16 x i16>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <16 x i16> %loaded, <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x i16> %loaded, <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x i16> %loaded, <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x i16> %loaded, <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  store <4 x i16> %v0, ptr %out0
+  store <4 x i16> %v1, ptr %out1
+  store <4 x i16> %v2, ptr %out2
+  store <4 x i16> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <32 x i8> load into four <8 x i8>
+; values (lanes i, i+4, ..., i+28 for i = 0..3), each stored through its own
+; pointer. Both sets of expected output contain a single ld4 with the .8b
+; arrangement followed by four plain 64-bit stores.
+define void @test_ldnp_interleaved_load4_v8i8(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v8i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    str d3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v8i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.8b, v1.8b, v2.8b, v3.8b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8b }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8b }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.8b }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.8b }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <32 x i8>, ptr %ptr, align 1, !nontemporal !0
+  %v0 = shufflevector <32 x i8> %loaded, <32 x i8> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v1 = shufflevector <32 x i8> %loaded, <32 x i8> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v2 = shufflevector <32 x i8> %loaded, <32 x i8> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v3 = shufflevector <32 x i8> %loaded, <32 x i8> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  store <8 x i8> %v0, ptr %out0
+  store <8 x i8> %v1, ptr %out1
+  store <8 x i8> %v2, ptr %out2
+  store <8 x i8> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <8 x float> load into four
+; <2 x float> values (lanes i and i+4 for i = 0..3), each stored through its
+; own pointer. Both sets of expected output contain a single ld4 with the .2s
+; arrangement followed by four plain 64-bit stores.
+define void @test_ldnp_interleaved_load4_v2f32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v2f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    str d3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v2f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.2s, v1.2s, v2.2s, v3.2s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2s }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.2s }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x float>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <8 x float> %loaded, <8 x float> poison, <2 x i32> <i32 0, i32 4>
+  %v1 = shufflevector <8 x float> %loaded, <8 x float> poison, <2 x i32> <i32 1, i32 5>
+  %v2 = shufflevector <8 x float> %loaded, <8 x float> poison, <2 x i32> <i32 2, i32 6>
+  %v3 = shufflevector <8 x float> %loaded, <8 x float> poison, <2 x i32> <i32 3, i32 7>
+  store <2 x float> %v0, ptr %out0
+  store <2 x float> %v1, ptr %out1
+  store <2 x float> %v2, ptr %out2
+  store <2 x float> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <16 x half> load into four
+; <4 x half> values (lanes i, i+4, i+8, i+12 for i = 0..3), each stored through
+; its own pointer. Both sets of expected output contain a single ld4 with the
+; .4h arrangement followed by four plain 64-bit stores.
+define void @test_ldnp_interleaved_load4_v4f16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v4f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+; CHECK-LE-NEXT:    str d0, [x1]
+; CHECK-LE-NEXT:    str d1, [x2]
+; CHECK-LE-NEXT:    str d2, [x3]
+; CHECK-LE-NEXT:    str d3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v4f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4h }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.4h }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <16 x half>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <16 x half> %loaded, <16 x half> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x half> %loaded, <16 x half> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x half> %loaded, <16 x half> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x half> %loaded, <16 x half> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  store <4 x half> %v0, ptr %out0
+  store <4 x half> %v1, ptr %out1
+  store <4 x half> %v2, ptr %out2
+  store <4 x half> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <8 x i64> load into four <2 x i64>
+; values (lanes i and i+4 for i = 0..3), each stored through its own pointer.
+; Both sets of expected output contain a single ld4 with the .2d arrangement
+; followed by four plain 128-bit stores.
+define void @test_ldnp_interleaved_load4_v2i64(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v2i64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    str q3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v2i64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x i64>, ptr %ptr, align 8, !nontemporal !0
+  %v0 = shufflevector <8 x i64> %loaded, <8 x i64> poison, <2 x i32> <i32 0, i32 4>
+  %v1 = shufflevector <8 x i64> %loaded, <8 x i64> poison, <2 x i32> <i32 1, i32 5>
+  %v2 = shufflevector <8 x i64> %loaded, <8 x i64> poison, <2 x i32> <i32 2, i32 6>
+  %v3 = shufflevector <8 x i64> %loaded, <8 x i64> poison, <2 x i32> <i32 3, i32 7>
+  store <2 x i64> %v0, ptr %out0
+  store <2 x i64> %v1, ptr %out1
+  store <2 x i64> %v2, ptr %out2
+  store <2 x i64> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <16 x i32> load into four <4 x i32>
+; values (lanes i, i+4, i+8, i+12 for i = 0..3), each stored through its own
+; pointer. Both sets of expected output contain a single ld4 with the .4s
+; arrangement followed by four plain 128-bit stores.
+define void @test_ldnp_interleaved_load4_v4i32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v4i32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    str q3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v4i32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4s }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.4s }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <16 x i32>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <16 x i32> %loaded, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x i32> %loaded, <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x i32> %loaded, <16 x i32> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x i32> %loaded, <16 x i32> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  store <4 x i32> %v0, ptr %out0
+  store <4 x i32> %v1, ptr %out1
+  store <4 x i32> %v2, ptr %out2
+  store <4 x i32> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <32 x i16> load into four <8 x i16>
+; values (lanes i, i+4, ..., i+28 for i = 0..3), each stored through its own
+; pointer. Both sets of expected output contain a single ld4 with the .8h
+; arrangement followed by four plain 128-bit stores.
+define void @test_ldnp_interleaved_load4_v8i16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v8i16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    str q3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v8i16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.8h }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.8h }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <32 x i16>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <32 x i16> %loaded, <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v1 = shufflevector <32 x i16> %loaded, <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v2 = shufflevector <32 x i16> %loaded, <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v3 = shufflevector <32 x i16> %loaded, <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  store <8 x i16> %v0, ptr %out0
+  store <8 x i16> %v1, ptr %out1
+  store <8 x i16> %v2, ptr %out2
+  store <8 x i16> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <64 x i8> load into four <16 x i8>
+; values (lanes i, i+4, ..., i+60 for i = 0..3), each stored through its own
+; pointer. Both sets of expected output contain a single ld4 with the .16b
+; arrangement followed by four plain 128-bit stores.
+define void @test_ldnp_interleaved_load4_v16i8(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v16i8:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    str q3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v16i8:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.16b }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.16b }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.16b }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.16b }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <64 x i8>, ptr %ptr, align 1, !nontemporal !0
+  %v0 = shufflevector <64 x i8> %loaded, <64 x i8> poison,
+                      <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28,
+                                  i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
+  %v1 = shufflevector <64 x i8> %loaded, <64 x i8> poison,
+                      <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29,
+                                  i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
+  %v2 = shufflevector <64 x i8> %loaded, <64 x i8> poison,
+                      <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30,
+                                  i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
+  %v3 = shufflevector <64 x i8> %loaded, <64 x i8> poison,
+                      <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31,
+                                  i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
+  store <16 x i8> %v0, ptr %out0
+  store <16 x i8> %v1, ptr %out1
+  store <16 x i8> %v2, ptr %out2
+  store <16 x i8> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <8 x double> load into four
+; <2 x double> values (lanes i and i+4 for i = 0..3), each stored through its
+; own pointer. Both sets of expected output contain a single ld4 with the .2d
+; arrangement followed by four plain 128-bit stores.
+define void @test_ldnp_interleaved_load4_v2f64(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v2f64:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    str q3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v2f64:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.2d }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.2d }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.2d }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.2d }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <8 x double>, ptr %ptr, align 8, !nontemporal !0
+  %v0 = shufflevector <8 x double> %loaded, <8 x double> poison, <2 x i32> <i32 0, i32 4>
+  %v1 = shufflevector <8 x double> %loaded, <8 x double> poison, <2 x i32> <i32 1, i32 5>
+  %v2 = shufflevector <8 x double> %loaded, <8 x double> poison, <2 x i32> <i32 2, i32 6>
+  %v3 = shufflevector <8 x double> %loaded, <8 x double> poison, <2 x i32> <i32 3, i32 7>
+  store <2 x double> %v0, ptr %out0
+  store <2 x double> %v1, ptr %out1
+  store <2 x double> %v2, ptr %out2
+  store <2 x double> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <16 x float> load into four
+; <4 x float> values (lanes i, i+4, i+8, i+12 for i = 0..3), each stored
+; through its own pointer. Both sets of expected output contain a single ld4
+; with the .4s arrangement followed by four plain 128-bit stores.
+define void @test_ldnp_interleaved_load4_v4f32(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v4f32:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    str q3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v4f32:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.4s, v1.4s, v2.4s, v3.4s }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.4s }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.4s }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.4s }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.4s }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <16 x float>, ptr %ptr, align 4, !nontemporal !0
+  %v0 = shufflevector <16 x float> %loaded, <16 x float> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+  %v1 = shufflevector <16 x float> %loaded, <16 x float> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+  %v2 = shufflevector <16 x float> %loaded, <16 x float> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+  %v3 = shufflevector <16 x float> %loaded, <16 x float> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+  store <4 x float> %v0, ptr %out0
+  store <4 x float> %v1, ptr %out1
+  store <4 x float> %v2, ptr %out2
+  store <4 x float> %v3, ptr %out3
+  ret void
+}
+
+; Factor-4 de-interleave of a !nontemporal <32 x half> load into four
+; <8 x half> values (lanes i, i+4, ..., i+28 for i = 0..3), each stored through
+; its own pointer. Both sets of expected output contain a single ld4 with the
+; .8h arrangement followed by four plain 128-bit stores.
+define void @test_ldnp_interleaved_load4_v8f16(ptr %ptr, ptr %out0, ptr %out1, ptr %out2, ptr %out3) {
+; CHECK-LE-LABEL: test_ldnp_interleaved_load4_v8f16:
+; CHECK-LE:       // %bb.0: // %entry
+; CHECK-LE-NEXT:    ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+; CHECK-LE-NEXT:    str q0, [x1]
+; CHECK-LE-NEXT:    str q1, [x2]
+; CHECK-LE-NEXT:    str q2, [x3]
+; CHECK-LE-NEXT:    str q3, [x4]
+; CHECK-LE-NEXT:    ret
+;
+; CHECK-BE-LABEL: test_ldnp_interleaved_load4_v8f16:
+; CHECK-BE:       // %bb.0: // %entry
+; CHECK-BE-NEXT:    ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
+; CHECK-BE-NEXT:    st1 { v0.8h }, [x1]
+; CHECK-BE-NEXT:    st1 { v1.8h }, [x2]
+; CHECK-BE-NEXT:    st1 { v2.8h }, [x3]
+; CHECK-BE-NEXT:    st1 { v3.8h }, [x4]
+; CHECK-BE-NEXT:    ret
+entry:
+  %loaded = load <32 x half>, ptr %ptr, align 2, !nontemporal !0
+  %v0 = shufflevector <32 x half> %loaded, <32 x half> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v1 = shufflevector <32 x half> %loaded, <32 x half> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v2 = shufflevector <32 x half> %loaded, <32 x half> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v3 = shufflevector <32 x half> %loaded, <32 x half> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  store <8 x half> %v0, ptr %out0
+  store <8 x half> %v1, ptr %out1
+  store <8 x half> %v2, ptr %out2
+  store <8 x half> %v3, ptr %out3
+  ret void
+}
+
+; Metadata node referenced by every `!nontemporal !0` attachment on the loads
+; in the tests above.
+!0 = !{ i32 1 }



More information about the llvm-commits mailing list