[llvm] [AArch64] New subtarget features to control ldp and stp formation, fo… (PR #66098)

Manos Anagnostakis via llvm-commits llvm-commits at lists.llvm.org
Thu Sep 14 03:18:23 PDT 2023


https://github.com/manosanaggh updated https://github.com/llvm/llvm-project/pull/66098:

>From 4791aad2489a6987fe8767da1a67af2f2b8dda86 Mon Sep 17 00:00:00 2001
From: Manos Anagnostakis <manos.anagnostakis at vrull.eu>
Date: Thu, 14 Sep 2023 12:09:54 +0200
Subject: [PATCH] [AArch64] New subtarget features to control ldp and stp
 formation, focused on ampere1 and ampere1a.

On some AArch64 cores, including Ampere's ampere1 and ampere1a, load and
store pair instructions are faster than the equivalent individual
loads/stores only when the alignment of the pair is at least twice that
of the element being loaded or stored.

Based on that, this patch introduces four new subtarget features, two
controlling ldp formation and two controlling stp formation, to cover
the ampere1 and ampere1a alignment requirements and to enable optional
fine-grained control over ldp and stp generation in general. Other CPUs
can also make use of these features if a policy different from the
compiler's default is beneficial for them.

More specifically, the new features are, for ldp and stp respectively:

- disable-ldp/disable-stp: Do not emit ldp/stp.
- ldp-aligned-only/stp-aligned-only: Emit ldp/stp only if the source
pointer is aligned to at least double the alignment of the type.
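
As a worked example of the aligned-only rule (the numbers mirror the new
test cases): an i64 load has an element size of 8 bytes, so
ldp-aligned-only requires the source pointer to be aligned to at least
16 bytes. A load from a 128-byte-aligned pointer is therefore paired,
while a load known to be only 8-byte aligned stays as two ldr
instructions.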

Therefore, ldp-aligned-only/stp-aligned-only become the defaults for
-mcpu=ampere1 and -mcpu=ampere1a, since those cores benefit from the
alignment restriction, whereas for all other CPUs the default behaviour
of the compiler is maintained.
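
The new attributes can also be toggled explicitly on the command line,
for example (illustrative invocations mirroring the RUN lines of the
added test; input.ll is a placeholder input file):

  llc -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-ldp input.ll
  llc -mtriple=aarch64 -mattr=+ldp-aligned-only,+stp-aligned-only input.ll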
---
 llvm/lib/Target/AArch64/AArch64.td            |  20 +-
 .../AArch64/AArch64LoadStoreOptimizer.cpp     |  33 ++
 .../AArch64/ldp-stp-control-features.ll       | 390 ++++++++++++++++++
 3 files changed, 441 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll

diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 9a7cc283b5c15cc..d66a8a896bae46a 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -570,6 +570,18 @@ def FeatureD128 : SubtargetFeature<"d128", "HasD128",
     "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
     [FeatureLSE128]>;
 
+def FeatureDisableLdp : SubtargetFeature<"disable-ldp", "HasDisableLdp",
+    "true", "Do not emit ldp">;
+
+def FeatureDisableStp : SubtargetFeature<"disable-stp", "HasDisableStp",
+    "true", "Do not emit stp">;
+
+def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedOnly",
+    "true", "In order to emit ldp, first check if the load will be aligned to 2 * element_size">;
+
+def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
+    "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -1239,7 +1251,9 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
                                    FeatureArithmeticBccFusion,
                                    FeatureCmpBccFusion,
                                    FeatureFuseAddress,
-                                   FeatureFuseLiterals]>;
+                                   FeatureFuseLiterals,
+                                   FeatureLdpAlignedOnly,
+                                   FeatureStpAlignedOnly]>;
 
 def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     "Ampere Computing Ampere-1A processors", [
@@ -1252,7 +1266,9 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     FeatureCmpBccFusion,
                                     FeatureFuseAddress,
                                     FeatureFuseLiterals,
-                                    FeatureFuseLiterals]>;
+                                    FeatureFuseLiterals,
+                                    FeatureLdpAlignedOnly,
+                                    FeatureStpAlignedOnly]>;
 
 def ProcessorFeatures {
   list<SubtargetFeature> A53  = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 41af5522d967dbf..dc2965178bc284e 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -2136,6 +2136,14 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   if (!TII->isCandidateToMergeOrPair(MI))
     return false;
 
+  // If the disable-ldp subtarget feature is set, do not emit ldp.
+  if (MI.mayLoad() && Subtarget->hasDisableLdp())
+    return false;
+
+  // If the disable-stp subtarget feature is set, do not emit stp.
+  if (MI.mayStore() && Subtarget->hasDisableStp())
+    return false;
+
   // Early exit if the offset is not possible to match. (6 bits of positive
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
@@ -2159,6 +2167,31 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
     // Keeping the iterator straight is a pain, so we let the merge routine tell
     // us what the next instruction is after it's done mucking about.
     auto Prev = std::prev(MBBI);
+
+    // Fetch the memoperand of the load/store that is a candidate for
+    // combination.
+    MachineMemOperand *MemOp =
+        MI.memoperands_empty() ? nullptr : MI.memoperands().front();
+
+    // Get the alignments needed for the ldp-aligned-only/stp-aligned-only
+    // checks below.
+    uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1;
+    uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1;
+
+    // If this is a load and the ldp-aligned-only feature is set, only form
+    // ldp when the alignment of the source pointer is at least double the
+    // alignment of the type.
+    if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp &&
+        MemAlignment < 2 * TypeAlignment)
+      return false;
+
+    // If this is a store and the stp-aligned-only feature is set, only form
+    // stp when the alignment of the pointer is at least double the
+    // alignment of the type.
+    if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp &&
+        MemAlignment < 2 * TypeAlignment)
+      return false;
+
     MBBI = mergePairedInsns(MBBI, Paired, Flags);
     // Collect liveness info for instructions between Prev and the new position
     // MBBI.
diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
new file mode 100644
index 000000000000000..6f87df90d9177d0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
@@ -0,0 +1,390 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -O2 -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-LABEL-DEFAULT,CHECK-DEFAULT,CHECK-NEXT-DEFAULT
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-LDP,CHECK-DISABLE-LDP,CHECK-NEXT-DISABLE-LDP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-STP,CHECK-DISABLE-STP,CHECK-NEXT-DISABLE-STP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-LDP,CHECK-DISABLE-LDP,CHECK-NEXT-DISABLE-LDP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-STP,CHECK-DISABLE-STP,CHECK-NEXT-DISABLE-STP
+
+define i32 @ldp_aligned_int32_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    ldp w9, w8, [x8]
+; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_aligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    ldp w9, w8, [x8]
+; CHECK-NEXT-DEFAULT:    add w0, w8, w9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_aligned_int32_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-LDP:    ldr w9, [x8]
+; CHECK-NEXT-DISABLE-LDP:    ldr w8, [x8, #4]
+; CHECK-NEXT-DISABLE-LDP:    add w0, w8, w9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -64
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load i32, ptr %4, align 64
+  %6 = getelementptr inbounds i32, ptr %4, i64 1
+  %7 = load i32, ptr %6, align 4
+  %8 = add nsw i32 %7, %5
+  ret i32 %8
+}
+
+define i64 @ldp_aligned_int64_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    ldp x9, x8, [x8]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_aligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    ldp x9, x8, [x8]
+; CHECK-NEXT-DEFAULT:    add x0, x8, x9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_aligned_int64_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-LDP:    ldr x9, [x8]
+; CHECK-NEXT-DISABLE-LDP:    ldr x8, [x8, #8]
+; CHECK-NEXT-DISABLE-LDP:    add x0, x8, x9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -128
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load i64, ptr %4, align 128
+  %6 = getelementptr inbounds i64, ptr %4, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = add nsw i64 %7, %5
+  ret i64 %8
+}
+
+define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    ldp q0, q1, [x8]
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_aligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    ldp q0, q1, [x8]
+; CHECK-NEXT-DEFAULT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_aligned_v4si:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-LDP:    ldr q0, [x8]
+; CHECK-NEXT-DISABLE-LDP:    ldr q1, [x8, #16]
+; CHECK-NEXT-DISABLE-LDP:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -256
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load <4 x i32>, ptr %4, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
+  %7 = load <4 x i32>, ptr %6, align 16
+  %8 = add <4 x i32> %7, %5
+  ret <4 x i32> %8
+}
+
+define i32 @ldp_unaligned_int32_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    ldr w9, [x8, #4]
+; CHECK-NEXT:    ldr w8, [x8, #8]
+; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_unaligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    ldp w9, w8, [x8, #4]
+; CHECK-NEXT-DEFAULT:    add w0, w8, w9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_int32_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-LDP:    ldr w9, [x8, #4]
+; CHECK-NEXT-DISABLE-LDP:    ldr w8, [x8, #8]
+; CHECK-NEXT-DISABLE-LDP:    add w0, w8, w9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -64
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds i32, ptr %4, i64 1
+  %6 = load i32, ptr %5, align 4
+  %7 = getelementptr inbounds i32, ptr %4, i64 2
+  %8 = load i32, ptr %7, align 8
+  %9 = add nsw i32 %8, %6
+  ret i32 %9
+}
+
+define i64 @ldp_unaligned_int64_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    ldr x9, [x8, #8]
+; CHECK-NEXT:    ldr x8, [x8, #16]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_unaligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    ldp x9, x8, [x8, #8]
+; CHECK-NEXT-DEFAULT:    add x0, x8, x9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_int64_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-LDP:    ldr x9, [x8, #8]
+; CHECK-NEXT-DISABLE-LDP:    ldr x8, [x8, #16]
+; CHECK-NEXT-DISABLE-LDP:    add x0, x8, x9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -128
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds i64, ptr %4, i64 1
+  %6 = load i64, ptr %5, align 8
+  %7 = getelementptr inbounds i64, ptr %4, i64 2
+  %8 = load i64, ptr %7, align 16
+  %9 = add nsw i64 %8, %6
+  ret i64 %9
+}
+
+define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    ldr q0, [x8, #16]
+; CHECK-NEXT:    ldr q1, [x8, #32]
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_unaligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    ldp q0, q1, [x8, #16]
+; CHECK-NEXT-DEFAULT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_v4si:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-LDP:    ldr q0, [x8, #16]
+; CHECK-NEXT-DISABLE-LDP:    ldr q1, [x8, #32]
+; CHECK-NEXT-DISABLE-LDP:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -256
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
+  %6 = load <4 x i32>, ptr %5, align 16
+  %7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2
+  %8 = load <4 x i32>, ptr %7, align 32
+  %9 = add <4 x i32> %8, %6
+  ret <4 x i32> %9
+}
+
+define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_aligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    stp w1, w1, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_aligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    stp w1, w1, [x0]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_aligned_int32_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-STP:    str w1, [x0]
+; CHECK-NEXT-DISABLE-STP:    str w1, [x0, #4]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  store i32 %1, ptr %5, align 64
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  ret ptr %5
+}
+
+define dso_local ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_aligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT:    stp x1, x1, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_aligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    stp x1, x1, [x0]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_aligned_int64_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-STP:    str x1, [x0]
+; CHECK-NEXT-DISABLE-STP:    str x1, [x0, #8]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  store i64 %1, ptr %5, align 128
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  ret ptr %5
+}
+
+define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_aligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT:    stp q0, q0, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_aligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    stp q0, q0, [x0]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_aligned_v4si:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-STP:    str q0, [x0]
+; CHECK-NEXT-DISABLE-STP:    str q0, [x0, #16]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  store <4 x i32> %1, ptr %5, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  ret ptr %5
+}
+
+define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    orr x0, x8, #0x4
+; CHECK-NEXT:    str w1, [x8, #4]
+; CHECK-NEXT:    str w1, [x8, #8]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_unaligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x4
+; CHECK-NEXT-DEFAULT:    stp w1, w1, [x8, #4]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_unaligned_int32_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x4
+; CHECK-NEXT-DISABLE-STP:    str w1, [x8, #4]
+; CHECK-NEXT-DISABLE-STP:    str w1, [x8, #8]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  %7 = getelementptr inbounds i32, ptr %5, i64 2
+  store i32 %1, ptr %7, align 8
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    orr x0, x8, #0x8
+; CHECK-NEXT:    str x1, [x8, #8]
+; CHECK-NEXT:    str x1, [x8, #16]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_unaligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x8
+; CHECK-NEXT-DEFAULT:    stp x1, x1, [x8, #8]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_unaligned_int64_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x8
+; CHECK-NEXT-DISABLE-STP:    str x1, [x8, #8]
+; CHECK-NEXT-DISABLE-STP:    str x1, [x8, #16]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  %7 = getelementptr inbounds i64, ptr %5, i64 2
+  store i64 %1, ptr %7, align 16
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_unaligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    orr x0, x8, #0x10
+; CHECK-NEXT:    str q0, [x8, #16]
+; CHECK-NEXT:    str q0, [x8, #32]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_unaligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x10
+; CHECK-NEXT-DEFAULT:    stp q0, q0, [x8, #16]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_unaligned_v4si:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x10
+; CHECK-NEXT-DISABLE-STP:    str q0, [x8, #16]
+; CHECK-NEXT-DISABLE-STP:    str q0, [x8, #32]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  %7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2
+  store <4 x i32> %1, ptr %7, align 32
+  ret ptr %6
+}
+



More information about the llvm-commits mailing list