[llvm] [AArch64] New subtarget features to control ldp and stp formation, fo… (PR #66098)
    Manos Anagnostakis via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Wed Sep 13 12:10:07 PDT 2023
    
    
  
https://github.com/manosanaggh updated https://github.com/llvm/llvm-project/pull/66098:
>From 487a01d8193468d90c95eafa51a49f6eb575231e Mon Sep 17 00:00:00 2001
From: Manos Anagnostakis <manos.anagnostakis at vrull.eu>
Date: Wed, 13 Sep 2023 21:06:39 +0200
Subject: [PATCH] [AArch64] New subtarget features to control ldp and stp
 formation, focused on ampere1 and ampere1a.
On some AArch64 cores, including Ampere's ampere1 and ampere1a
architectures, load and store pair instructions are faster compared
to simple loads/stores only when the alignment of the pair is at least
twice that of the individual element being loaded.
Based on that, this patch introduces four new subtarget features,
two for controlling ldp and two for controlling stp, to cover
the ampere1 and ampere1a alignment needs and to enable optional
fine-grained control over ldp and stp generation in general.
The latter can be utilized by another cpu, if there are possible benefits
with a different policy than the default provided by the compiler.
More specifically, for each of the ldp and stp respectively we have:
- disable-ldp/disable-stp: Do not emit ldp/stp.
- ldp-aligned-only/stp-aligned-only: Emit ldp/stp only if the source
pointer is aligned to at least double the alignment of the type.
Therefore, for -mcpu=ampere1 and -mcpu=ampere1a
ldp-aligned-only/stp-aligned-only become the defaults because
of the benefit from the alignment, whereas for the rest
of the cpus the default behaviour of the compiler is maintained.
---
 ...arget-features-to-control-ldp-and-st.patch | 527 ++++++++++++++++++
 llvm/lib/Target/AArch64/AArch64.td            |  20 +-
 .../AArch64/AArch64LoadStoreOptimizer.cpp     |  30 +
 .../AArch64/ldp-stp-control-features.ll       | 390 +++++++++++++
 4 files changed, 965 insertions(+), 2 deletions(-)
 create mode 100644 0001-AArch64-New-subtarget-features-to-control-ldp-and-st.patch
 create mode 100644 llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
diff --git a/0001-AArch64-New-subtarget-features-to-control-ldp-and-st.patch b/0001-AArch64-New-subtarget-features-to-control-ldp-and-st.patch
new file mode 100644
index 000000000000000..b5f6cba6d4e9550
--- /dev/null
+++ b/0001-AArch64-New-subtarget-features-to-control-ldp-and-st.patch
@@ -0,0 +1,527 @@
+From 2b39b2a699a573e422e7c1c010207a9fb1ff6d04 Mon Sep 17 00:00:00 2001
+From: Manos Anagnostakis <manos.anagnostakis at vrull.eu>
+Date: Tue, 12 Sep 2023 12:37:17 +0200
+Subject: [PATCH] [AArch64] New subtarget features to control ldp and stp
+ formation, focused on ampere1 and ampere1a.
+
+On some AArch64 cores, including Ampere's ampere1 and ampere1a
+architectures, load and store pair instructions are faster compared
+to simple loads/stores only when the alignment of the pair is at least
+twice that of the individual element being loaded.
+
+Based on that, this patch introduces four new subtarget features,
+two for controlling ldp and two for controlling stp, to cover
+the ampere1 and ampere1a alignment needs and to enable optional
+fine-grained control over ldp and stp generation in general.
+The latter can be utilized by another cpu, if there are possible benefits
+with a different policy than the default provided by the compiler.
+
+More specifically, for each of the ldp and stp respectively we have:
+
+- disable-ldp/disable-stp: Do not emit ldp/stp.
+- ldp-aligned-only/stp-aligned-only: Emit ldp/stp only if the source
+pointer is aligned to at least double the alignment of the type.
+
+Therefore, for -mcpu=ampere1 and -mcpu=ampere1a
+ldp-aligned-only/stp-aligned-only become the defaults because
+of the benefit from the alignment, whereas for the rest
+of the cpus the default behaviour of the compiler is maintained.
+---
+ llvm/lib/Target/AArch64/AArch64.td            |  20 +-
+ .../AArch64/AArch64LoadStoreOptimizer.cpp     |  30 ++
+ .../AArch64/ldp-stp-control-features.ll       | 390 ++++++++++++++++++
+ 3 files changed, 438 insertions(+), 2 deletions(-)
+ create mode 100644 llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
+
+diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
+index 9a7cc283b5c1..d66a8a896bae 100644
+--- a/llvm/lib/Target/AArch64/AArch64.td
++++ b/llvm/lib/Target/AArch64/AArch64.td
+@@ -570,6 +570,18 @@ def FeatureD128 : SubtargetFeature<"d128", "HasD128",
+     "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
+     [FeatureLSE128]>;
+ 
++def FeatureDisableLdp : SubtargetFeature<"disable-ldp", "HasDisableLdp",
++    "true", "Do not emit ldp">;
++
++def FeatureDisableStp : SubtargetFeature<"disable-stp", "HasDisableStp",
++    "true", "Do not emit stp">;
++
++def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedOnly",
++    "true", "In order to emit ldp, first check if the load will be aligned to 2 * element_size">;
++
++def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
++    "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">;
++
+ //===----------------------------------------------------------------------===//
+ // Architectures.
+ //
+@@ -1239,7 +1251,9 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
+                                    FeatureArithmeticBccFusion,
+                                    FeatureCmpBccFusion,
+                                    FeatureFuseAddress,
+-                                   FeatureFuseLiterals]>;
++                                   FeatureFuseLiterals,
++			           FeatureLdpAlignedOnly,
++                                   FeatureStpAlignedOnly]>;
+ 
+ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
+                                     "Ampere Computing Ampere-1A processors", [
+@@ -1252,7 +1266,9 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
+                                     FeatureCmpBccFusion,
+                                     FeatureFuseAddress,
+                                     FeatureFuseLiterals,
+-                                    FeatureFuseLiterals]>;
++                                    FeatureFuseLiterals,
++                                    FeatureLdpAlignedOnly,
++                                    FeatureStpAlignedOnly]>;
+ 
+ def ProcessorFeatures {
+   list<SubtargetFeature> A53  = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
+diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+index 41af5522d967..e1f38cfbcb01 100644
+--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
++++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+@@ -2136,6 +2136,18 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
+   if (!TII->isCandidateToMergeOrPair(MI))
+     return false;
+ 
++  // Fetch the memoperand of the load/store that is a candidate for
++  // combination.
++  MachineMemOperand *MemOp = MI.memoperands().front();
++
++  // If disable-ldp feature is opted, do not emit ldp.
++  if (MI.mayLoad() && Subtarget->hasDisableLdp())
++    return false;
++
++  // If disable-stp feature is opted, do not emit stp.
++  if (MI.mayStore() && Subtarget->hasDisableStp())
++    return false;
++
+   // Early exit if the offset is not possible to match. (6 bits of positive
+   // range, plus allow an extra one in case we find a later insn that matches
+   // with Offset-1)
+@@ -2148,6 +2160,24 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
+   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
+     return false;
+ 
++  // Get the needed alignments to check them if
++  // ldp-aligned-only/stp-aligned-only features are opted.
++  uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1;
++  uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1;
++
++  // If a load arrives and ldp-aligned-only feature is opted, check that the
++  // alignment of the source pointer is at least double the alignment of the
++  // type.
++  if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp &&
++      MemAlignment < 2 * TypeAlignment)
++    return false;
++  // If a store arrives and stp-aligned-only feature is opted, check that the
++  // alignment of the source pointer is at least double the alignment of the
++  // type.
++  if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp &&
++      MemAlignment < 2 * TypeAlignment)
++    return false;
++
+   // Look ahead up to LdStLimit instructions for a pairable instruction.
+   LdStPairFlags Flags;
+   MachineBasicBlock::iterator Paired =
+diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
+new file mode 100644
+index 000000000000..6f87df90d917
+--- /dev/null
++++ b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
+@@ -0,0 +1,390 @@
++; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
++; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 | FileCheck %s --check-prefixes=CHECK
++; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a | FileCheck %s --check-prefixes=CHECK
++; RUN: llc < %s -O2 -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-LABEL-DEFAULT,CHECK-DEFAULT,CHECK-NEXT-DEFAULT
++; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-LDP,CHECK-DISABLE-LDP,CHECK-NEXT-DISABLE-LDP
++; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-STP,CHECK-DISABLE-STP,CHECK-NEXT-DISABLE-STP
++; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-LDP,CHECK-DISABLE-LDP,CHECK-NEXT-DISABLE-LDP
++; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-STP,CHECK-DISABLE-STP,CHECK-NEXT-DISABLE-STP
++
++define i32 @ldp_aligned_int32_t(ptr %0) #0 {
++; CHECK-LABEL: ldp_aligned_int32_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT:    ldp w9, w8, [x8]
++; CHECK-NEXT:    add w0, w8, w9
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: ldp_aligned_int32_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DEFAULT:    ldp w9, w8, [x8]
++; CHECK-NEXT-DEFAULT:    add w0, w8, w9
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-LDP: ldp_aligned_int32_t:
++; CHECK-DISABLE-LDP:       // %bb.0:
++; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DISABLE-LDP:    ldr w9, [x8]
++; CHECK-NEXT-DISABLE-LDP:    ldr w8, [x8, #4]
++; CHECK-NEXT-DISABLE-LDP:    add w0, w8, w9
++; CHECK-NEXT-DISABLE-LDP:    ret
++  %2 = ptrtoint ptr %0 to i64
++  %3 = and i64 %2, -64
++  %4 = inttoptr i64 %3 to ptr
++  %5 = load i32, ptr %4, align 64
++  %6 = getelementptr inbounds i32, ptr %4, i64 1
++  %7 = load i32, ptr %6, align 4
++  %8 = add nsw i32 %7, %5
++  ret i32 %8
++}
++
++define i64 @ldp_aligned_int64_t(ptr %0) #0 {
++; CHECK-LABEL: ldp_aligned_int64_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT:    ldp x9, x8, [x8]
++; CHECK-NEXT:    add x0, x8, x9
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: ldp_aligned_int64_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT-DEFAULT:    ldp x9, x8, [x8]
++; CHECK-NEXT-DEFAULT:    add x0, x8, x9
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-LDP: ldp_aligned_int64_t:
++; CHECK-DISABLE-LDP:       // %bb.0:
++; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT-DISABLE-LDP:    ldr x9, [x8]
++; CHECK-NEXT-DISABLE-LDP:    ldr x8, [x8, #8]
++; CHECK-NEXT-DISABLE-LDP:    add x0, x8, x9
++; CHECK-NEXT-DISABLE-LDP:    ret
++  %2 = ptrtoint ptr %0 to i64
++  %3 = and i64 %2, -128
++  %4 = inttoptr i64 %3 to ptr
++  %5 = load i64, ptr %4, align 128
++  %6 = getelementptr inbounds i64, ptr %4, i64 1
++  %7 = load i64, ptr %6, align 8
++  %8 = add nsw i64 %7, %5
++  ret i64 %8
++}
++
++define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 {
++; CHECK-LABEL: ldp_aligned_v4si:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT:    ldp q0, q1, [x8]
++; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: ldp_aligned_v4si:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT-DEFAULT:    ldp q0, q1, [x8]
++; CHECK-NEXT-DEFAULT:    add v0.4s, v1.4s, v0.4s
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-LDP: ldp_aligned_v4si:
++; CHECK-DISABLE-LDP:       // %bb.0:
++; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT-DISABLE-LDP:    ldr q0, [x8]
++; CHECK-NEXT-DISABLE-LDP:    ldr q1, [x8, #16]
++; CHECK-NEXT-DISABLE-LDP:    add v0.4s, v1.4s, v0.4s
++; CHECK-NEXT-DISABLE-LDP:    ret
++  %2 = ptrtoint ptr %0 to i64
++  %3 = and i64 %2, -256
++  %4 = inttoptr i64 %3 to ptr
++  %5 = load <4 x i32>, ptr %4, align 256
++  %6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
++  %7 = load <4 x i32>, ptr %6, align 16
++  %8 = add <4 x i32> %7, %5
++  ret <4 x i32> %8
++}
++
++define i32 @ldp_unaligned_int32_t(ptr %0) #0 {
++; CHECK-LABEL: ldp_unaligned_int32_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT:    ldr w9, [x8, #4]
++; CHECK-NEXT:    ldr w8, [x8, #8]
++; CHECK-NEXT:    add w0, w8, w9
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: ldp_unaligned_int32_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DEFAULT:    ldp w9, w8, [x8, #4]
++; CHECK-NEXT-DEFAULT:    add w0, w8, w9
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_int32_t:
++; CHECK-DISABLE-LDP:       // %bb.0:
++; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DISABLE-LDP:    ldr w9, [x8, #4]
++; CHECK-NEXT-DISABLE-LDP:    ldr w8, [x8, #8]
++; CHECK-NEXT-DISABLE-LDP:    add w0, w8, w9
++; CHECK-NEXT-DISABLE-LDP:    ret
++  %2 = ptrtoint ptr %0 to i64
++  %3 = and i64 %2, -64
++  %4 = inttoptr i64 %3 to ptr
++  %5 = getelementptr inbounds i32, ptr %4, i64 1
++  %6 = load i32, ptr %5, align 4
++  %7 = getelementptr inbounds i32, ptr %4, i64 2
++  %8 = load i32, ptr %7, align 8
++  %9 = add nsw i32 %8, %6
++  ret i32 %9
++}
++
++define i64 @ldp_unaligned_int64_t(ptr %0) #0 {
++; CHECK-LABEL: ldp_unaligned_int64_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT:    ldr x9, [x8, #8]
++; CHECK-NEXT:    ldr x8, [x8, #16]
++; CHECK-NEXT:    add x0, x8, x9
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: ldp_unaligned_int64_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT-DEFAULT:    ldp x9, x8, [x8, #8]
++; CHECK-NEXT-DEFAULT:    add x0, x8, x9
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_int64_t:
++; CHECK-DISABLE-LDP:       // %bb.0:
++; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT-DISABLE-LDP:    ldr x9, [x8, #8]
++; CHECK-NEXT-DISABLE-LDP:    ldr x8, [x8, #16] 
++; CHECK-NEXT-DISABLE-LDP:    add x0, x8, x9
++; CHECK-NEXT-DISABLE-LDP:    ret
++  %2 = ptrtoint ptr %0 to i64
++  %3 = and i64 %2, -128
++  %4 = inttoptr i64 %3 to ptr
++  %5 = getelementptr inbounds i64, ptr %4, i64 1
++  %6 = load i64, ptr %5, align 8
++  %7 = getelementptr inbounds i64, ptr %4, i64 2
++  %8 = load i64, ptr %7, align 16
++  %9 = add nsw i64 %8, %6
++  ret i64 %9
++}
++
++define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 {
++; CHECK-LABEL: ldp_unaligned_v4si:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT:    ldr q0, [x8, #16]
++; CHECK-NEXT:    ldr q1, [x8, #32]
++; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: ldp_unaligned_v4si:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT-DEFAULT:    ldp q0, q1, [x8, #16]
++; CHECK-NEXT-DEFAULT:    add v0.4s, v1.4s, v0.4s
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_v4si:
++; CHECK-DISABLE-LDP:       // %bb.0: 
++; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT-DISABLE-LDP:    ldr q0, [x8, #16]
++; CHECK-NEXT-DISABLE-LDP:    ldr q1, [x8, #32]
++; CHECK-NEXT-DISABLE-LDP:    add v0.4s, v1.4s, v0.4s
++; CHECK-NEXT-DISABLE-LDP:    ret
++  %2 = ptrtoint ptr %0 to i64
++  %3 = and i64 %2, -256
++  %4 = inttoptr i64 %3 to ptr
++  %5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
++  %6 = load <4 x i32>, ptr %5, align 16
++  %7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2
++  %8 = load <4 x i32>, ptr %7, align 32
++  %9 = add <4 x i32> %8, %6
++  ret <4 x i32> %9
++}
++
++define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 {
++; CHECK-LABEL: stp_aligned_int32_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x0, x0, #0xffffffffffffffc0
++; CHECK-NEXT:    stp w1, w1, [x0]
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: stp_aligned_int32_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DEFAULT:    stp w1, w1, [x0]
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE: stp_aligned_int32_t:
++; CHECK-DISABLE:       // %bb.0:
++; CHECK-NEXT-DISABLE:    and x0, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DISABLE:    str w1, [x0]
++; CHECK-NEXT-DISABLE:    str w1, [x0, #4]
++; CHECK-NEXT-DISABLE:    ret
++  %3 = ptrtoint ptr %0 to i64
++  %4 = and i64 %3, -64
++  %5 = inttoptr i64 %4 to ptr
++  store i32 %1, ptr %5, align 64
++  %6 = getelementptr inbounds i32, ptr %5, i64 1
++  store i32 %1, ptr %6, align 4
++  ret ptr %5
++}
++
++define dso_local ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 {
++; CHECK-LABEL: stp_aligned_int64_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x0, x0, #0xffffffffffffff80
++; CHECK-NEXT:    stp x1, x1, [x0]
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: stp_aligned_int64_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffff80
++; CHECK-NEXT-DEFAULT:    stp x1, x1, [x0]
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE: stp_aligned_int64_t:
++; CHECK-DISABLE:       // %bb.0: 
++; CHECK-NEXT-DISABLE:    and x0, x0, #0xffffffffffffff80
++; CHECK-NEXT-DISABLE:    str x1, [x0]
++; CHECK-NEXT-DISABLE:    str x1, [x0, #8]
++; CHECK-NEXT-DISABLE:    ret
++  %3 = ptrtoint ptr %0 to i64
++  %4 = and i64 %3, -128
++  %5 = inttoptr i64 %4 to ptr
++  store i64 %1, ptr %5, align 128
++  %6 = getelementptr inbounds i64, ptr %5, i64 1
++  store i64 %1, ptr %6, align 8
++  ret ptr %5
++}
++
++define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 {
++; CHECK-LABEL: stp_aligned_v4si:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x0, x0, #0xffffffffffffff00
++; CHECK-NEXT:    stp q0, q0, [x0]
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: stp_aligned_v4si: 
++; CHECK-DEFAULT:       // %bb.0: 
++; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffff00
++; CHECK-NEXT-DEFAULT:    stp q0, q0, [x0]
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-STP: stp_aligned_v4si:
++; CHECK-DISABLE-STP:       // %bb.0:
++; CHECK-NEXT-DISABLE-STP:    and x0, x0, #0xffffffffffffff00
++; CHECK-NEXT-DISABLE-STP:    str q0, [x0]
++; CHECK-NEXT-DISABLE-STP:    str q0, [x0, #16]
++; CHECK-NEXT-DISABLE-STP:    ret
++  %3 = ptrtoint ptr %0 to i64
++  %4 = and i64 %3, -256
++  %5 = inttoptr i64 %4 to ptr
++  store <4 x i32> %1, ptr %5, align 256
++  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
++  store <4 x i32> %1, ptr %6, align 16
++  ret ptr %5
++}
++
++define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 {
++; CHECK-LABEL: stp_unaligned_int32_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT:    orr x0, x8, #0x4
++; CHECK-NEXT:    str w1, [x8, #4]
++; CHECK-NEXT:    str w1, [x8, #8]
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: stp_unaligned_int32_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x4
++; CHECK-NEXT-DEFAULT:    stp w1, w1, [x8, #4]
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-STP: stp_unaligned_int32_t:
++; CHECK-DISABLE-STP:       // %bb.0:
++; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffffc0
++; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x4
++; CHECK-NEXT-DISABLE-STP:    str w1, [x8, #4]
++; CHECK-NEXT-DISABLE-STP:    str w1, [x8, #8]
++; CHECK-NEXT-DISABLE-STP:    ret
++  %3 = ptrtoint ptr %0 to i64
++  %4 = and i64 %3, -64
++  %5 = inttoptr i64 %4 to ptr
++  %6 = getelementptr inbounds i32, ptr %5, i64 1
++  store i32 %1, ptr %6, align 4
++  %7 = getelementptr inbounds i32, ptr %5, i64 2
++  store i32 %1, ptr %7, align 8
++  ret ptr %6
++}
++
++define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 {
++; CHECK-LABEL: stp_unaligned_int64_t:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT:    orr x0, x8, #0x8
++; CHECK-NEXT:    str x1, [x8, #8]
++; CHECK-NEXT:    str x1, [x8, #16]
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: stp_unaligned_int64_t:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x8
++; CHECK-NEXT-DEFAULT:    stp x1, x1, [x8, #8]
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-STP: stp_unaligned_int64_t:
++; CHECK-DISABLE-STP:       // %bb.0:
++; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffff80
++; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x8
++; CHECK-NEXT-DISABLE-STP:    str x1, [x8, #8]
++; CHECK-NEXT-DISABLE-STP:    str x1, [x8, #16]
++; CHECK-NEXT-DISABLE-STP:    ret
++  %3 = ptrtoint ptr %0 to i64
++  %4 = and i64 %3, -128
++  %5 = inttoptr i64 %4 to ptr
++  %6 = getelementptr inbounds i64, ptr %5, i64 1
++  store i64 %1, ptr %6, align 8
++  %7 = getelementptr inbounds i64, ptr %5, i64 2
++  store i64 %1, ptr %7, align 16
++  ret ptr %6
++}
++
++define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 {
++; CHECK-LABEL: stp_unaligned_v4si:
++; CHECK:       // %bb.0:
++; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT:    orr x0, x8, #0x10
++; CHECK-NEXT:    str q0, [x8, #16]
++; CHECK-NEXT:    str q0, [x8, #32]
++; CHECK-NEXT:    ret
++;
++; CHECK-LABEL-DEFAULT: stp_unaligned_v4si:
++; CHECK-DEFAULT:       // %bb.0:
++; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x10
++; CHECK-NEXT-DEFAULT:    stp q0, q0, [x8, #16]
++; CHECK-NEXT-DEFAULT:    ret
++;
++; CHECK-LABEL-DISABLE-STP: stp_unaligned_v4si:
++; CHECK-DISABLE-STP:       // %bb.0:
++; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffff00
++; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x10
++; CHECK-NEXT-DISABLE-STP:    str q0, [x8, #16]
++; CHECK-NEXT-DISABLE-STP:    str q0, [x8, #32]
++; CHECK-NEXT-DISABLE-STP:    ret
++  %3 = ptrtoint ptr %0 to i64
++  %4 = and i64 %3, -256
++  %5 = inttoptr i64 %4 to ptr
++  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
++  store <4 x i32> %1, ptr %6, align 16
++  %7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2
++  store <4 x i32> %1, ptr %7, align 32
++  ret ptr %6
++}
++
+-- 
+2.40.1
+
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 9a7cc283b5c15cc..d66a8a896bae46a 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -570,6 +570,18 @@ def FeatureD128 : SubtargetFeature<"d128", "HasD128",
     "and Instructions (FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128)",
     [FeatureLSE128]>;
 
+def FeatureDisableLdp : SubtargetFeature<"disable-ldp", "HasDisableLdp",
+    "true", "Do not emit ldp">;
+
+def FeatureDisableStp : SubtargetFeature<"disable-stp", "HasDisableStp",
+    "true", "Do not emit stp">;
+
+def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedOnly",
+    "true", "In order to emit ldp, first check if the load will be aligned to 2 * element_size">;
+
+def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly",
+    "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -1239,7 +1251,9 @@ def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1",
                                    FeatureArithmeticBccFusion,
                                    FeatureCmpBccFusion,
                                    FeatureFuseAddress,
-                                   FeatureFuseLiterals]>;
+                                   FeatureFuseLiterals,
+			           FeatureLdpAlignedOnly,
+                                   FeatureStpAlignedOnly]>;
 
 def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     "Ampere Computing Ampere-1A processors", [
@@ -1252,7 +1266,9 @@ def TuneAmpere1A : SubtargetFeature<"ampere1a", "ARMProcFamily", "Ampere1A",
                                     FeatureCmpBccFusion,
                                     FeatureFuseAddress,
                                     FeatureFuseLiterals,
-                                    FeatureFuseLiterals]>;
+                                    FeatureFuseLiterals,
+                                    FeatureLdpAlignedOnly,
+                                    FeatureStpAlignedOnly]>;
 
 def ProcessorFeatures {
   list<SubtargetFeature> A53  = [HasV8_0aOps, FeatureCRC, FeatureCrypto,
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 41af5522d967dbf..e1f38cfbcb01ff7 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -2136,6 +2136,18 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   if (!TII->isCandidateToMergeOrPair(MI))
     return false;
 
+  // Fetch the memoperand of the load/store that is a candidate for
+  // combination.
+  MachineMemOperand *MemOp = MI.memoperands().front();
+
+  // If disable-ldp feature is opted, do not emit ldp.
+  if (MI.mayLoad() && Subtarget->hasDisableLdp())
+    return false;
+
+  // If disable-stp feature is opted, do not emit stp.
+  if (MI.mayStore() && Subtarget->hasDisableStp())
+    return false;
+
   // Early exit if the offset is not possible to match. (6 bits of positive
   // range, plus allow an extra one in case we find a later insn that matches
   // with Offset-1)
@@ -2148,6 +2160,24 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
     return false;
 
+  // Get the needed alignments to check them if
+  // ldp-aligned-only/stp-aligned-only features are opted.
+  uint64_t MemAlignment = MemOp ? MemOp->getAlign().value() : -1;
+  uint64_t TypeAlignment = MemOp ? Align(MemOp->getSize()).value() : -1;
+
+  // If a load arrives and ldp-aligned-only feature is opted, check that the
+  // alignment of the source pointer is at least double the alignment of the
+  // type.
+  if (MI.mayLoad() && Subtarget->hasLdpAlignedOnly() && MemOp &&
+      MemAlignment < 2 * TypeAlignment)
+    return false;
+  // If a store arrives and stp-aligned-only feature is opted, check that the
+  // alignment of the source pointer is at least double the alignment of the
+  // type.
+  if (MI.mayStore() && Subtarget->hasStpAlignedOnly() && MemOp &&
+      MemAlignment < 2 * TypeAlignment)
+    return false;
+
   // Look ahead up to LdStLimit instructions for a pairable instruction.
   LdStPairFlags Flags;
   MachineBasicBlock::iterator Paired =
diff --git a/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
new file mode 100644
index 000000000000000..6f87df90d9177d0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldp-stp-control-features.ll
@@ -0,0 +1,390 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -O2 -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-LABEL-DEFAULT,CHECK-DEFAULT,CHECK-NEXT-DEFAULT
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-LDP,CHECK-DISABLE-LDP,CHECK-NEXT-DISABLE-LDP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1 -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-STP,CHECK-DISABLE-STP,CHECK-NEXT-DISABLE-STP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-ldp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-LDP,CHECK-DISABLE-LDP,CHECK-NEXT-DISABLE-LDP
+; RUN: llc < %s -O2 -mtriple=aarch64 -mcpu=ampere1a -mattr=+disable-stp | FileCheck %s --check-prefixes=CHECK-LABEL-DISABLE-STP,CHECK-DISABLE-STP,CHECK-NEXT-DISABLE-STP
+
+define i32 @ldp_aligned_int32_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    ldp w9, w8, [x8]
+; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_aligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    ldp w9, w8, [x8]
+; CHECK-NEXT-DEFAULT:    add w0, w8, w9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_aligned_int32_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-LDP:    ldr w9, [x8]
+; CHECK-NEXT-DISABLE-LDP:    ldr w8, [x8, #4]
+; CHECK-NEXT-DISABLE-LDP:    add w0, w8, w9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -64
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load i32, ptr %4, align 64
+  %6 = getelementptr inbounds i32, ptr %4, i64 1
+  %7 = load i32, ptr %6, align 4
+  %8 = add nsw i32 %7, %5
+  ret i32 %8
+}
+
+define i64 @ldp_aligned_int64_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    ldp x9, x8, [x8]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_aligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    ldp x9, x8, [x8]
+; CHECK-NEXT-DEFAULT:    add x0, x8, x9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_aligned_int64_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-LDP:    ldr x9, [x8]
+; CHECK-NEXT-DISABLE-LDP:    ldr x8, [x8, #8]
+; CHECK-NEXT-DISABLE-LDP:    add x0, x8, x9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -128
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load i64, ptr %4, align 128
+  %6 = getelementptr inbounds i64, ptr %4, i64 1
+  %7 = load i64, ptr %6, align 8
+  %8 = add nsw i64 %7, %5
+  ret i64 %8
+}
+
+define <4 x i32> @ldp_aligned_v4si(ptr %0) #0 {
+; CHECK-LABEL: ldp_aligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    ldp q0, q1, [x8]
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_aligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    ldp q0, q1, [x8]
+; CHECK-NEXT-DEFAULT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_aligned_v4si:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-LDP:    ldr q0, [x8]
+; CHECK-NEXT-DISABLE-LDP:    ldr q1, [x8, #16]
+; CHECK-NEXT-DISABLE-LDP:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -256
+  %4 = inttoptr i64 %3 to ptr
+  %5 = load <4 x i32>, ptr %4, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
+  %7 = load <4 x i32>, ptr %6, align 16
+  %8 = add <4 x i32> %7, %5
+  ret <4 x i32> %8
+}
+
+define i32 @ldp_unaligned_int32_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    ldr w9, [x8, #4]
+; CHECK-NEXT:    ldr w8, [x8, #8]
+; CHECK-NEXT:    add w0, w8, w9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_unaligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    ldp w9, w8, [x8, #4]
+; CHECK-NEXT-DEFAULT:    add w0, w8, w9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_int32_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-LDP:    ldr w9, [x8, #4]
+; CHECK-NEXT-DISABLE-LDP:    ldr w8, [x8, #8]
+; CHECK-NEXT-DISABLE-LDP:    add w0, w8, w9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -64
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds i32, ptr %4, i64 1
+  %6 = load i32, ptr %5, align 4
+  %7 = getelementptr inbounds i32, ptr %4, i64 2
+  %8 = load i32, ptr %7, align 8
+  %9 = add nsw i32 %8, %6
+  ret i32 %9
+}
+
+define i64 @ldp_unaligned_int64_t(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    ldr x9, [x8, #8]
+; CHECK-NEXT:    ldr x8, [x8, #16]
+; CHECK-NEXT:    add x0, x8, x9
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_unaligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    ldp x9, x8, [x8, #8]
+; CHECK-NEXT-DEFAULT:    add x0, x8, x9
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_int64_t:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-LDP:    ldr x9, [x8, #8]
+; CHECK-NEXT-DISABLE-LDP:    ldr x8, [x8, #16]
+; CHECK-NEXT-DISABLE-LDP:    add x0, x8, x9
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -128
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds i64, ptr %4, i64 1
+  %6 = load i64, ptr %5, align 8
+  %7 = getelementptr inbounds i64, ptr %4, i64 2
+  %8 = load i64, ptr %7, align 16
+  %9 = add nsw i64 %8, %6
+  ret i64 %9
+}
+
+define <4 x i32> @ldp_unaligned_v4si(ptr %0) #0 {
+; CHECK-LABEL: ldp_unaligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    ldr q0, [x8, #16]
+; CHECK-NEXT:    ldr q1, [x8, #32]
+; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: ldp_unaligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    ldp q0, q1, [x8, #16]
+; CHECK-NEXT-DEFAULT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-LDP: ldp_unaligned_v4si:
+; CHECK-DISABLE-LDP:       // %bb.0:
+; CHECK-NEXT-DISABLE-LDP:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-LDP:    ldr q0, [x8, #16]
+; CHECK-NEXT-DISABLE-LDP:    ldr q1, [x8, #32]
+; CHECK-NEXT-DISABLE-LDP:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT-DISABLE-LDP:    ret
+  %2 = ptrtoint ptr %0 to i64
+  %3 = and i64 %2, -256
+  %4 = inttoptr i64 %3 to ptr
+  %5 = getelementptr inbounds <4 x i32>, ptr %4, i64 1
+  %6 = load <4 x i32>, ptr %5, align 16
+  %7 = getelementptr inbounds <4 x i32>, ptr %4, i64 2
+  %8 = load <4 x i32>, ptr %7, align 32
+  %9 = add <4 x i32> %8, %6
+  ret <4 x i32> %9
+}
+
+define ptr @stp_aligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_aligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    stp w1, w1, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_aligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    stp w1, w1, [x0]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_aligned_int32_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x0, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-STP:    str w1, [x0]
+; CHECK-NEXT-DISABLE-STP:    str w1, [x0, #4]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  store i32 %1, ptr %5, align 64
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  ret ptr %5
+}
+
+define dso_local ptr @stp_aligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_aligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT:    stp x1, x1, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_aligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    stp x1, x1, [x0]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_aligned_int64_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x0, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-STP:    str x1, [x0]
+; CHECK-NEXT-DISABLE-STP:    str x1, [x0, #8]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  store i64 %1, ptr %5, align 128
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  ret ptr %5
+}
+
+define ptr @stp_aligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_aligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT:    stp q0, q0, [x0]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_aligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    stp q0, q0, [x0]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_aligned_v4si:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x0, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-STP:    str q0, [x0]
+; CHECK-NEXT-DISABLE-STP:    str q0, [x0, #16]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  store <4 x i32> %1, ptr %5, align 256
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  ret ptr %5
+}
+
+define ptr @stp_unaligned_int32_t(ptr %0, i32 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int32_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT:    orr x0, x8, #0x4
+; CHECK-NEXT:    str w1, [x8, #4]
+; CHECK-NEXT:    str w1, [x8, #8]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_unaligned_int32_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x4
+; CHECK-NEXT-DEFAULT:    stp w1, w1, [x8, #4]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_unaligned_int32_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffffc0
+; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x4
+; CHECK-NEXT-DISABLE-STP:    str w1, [x8, #4]
+; CHECK-NEXT-DISABLE-STP:    str w1, [x8, #8]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -64
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i32, ptr %5, i64 1
+  store i32 %1, ptr %6, align 4
+  %7 = getelementptr inbounds i32, ptr %5, i64 2
+  store i32 %1, ptr %7, align 8
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_int64_t(ptr %0, i64 %1) #0 {
+; CHECK-LABEL: stp_unaligned_int64_t:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT:    orr x0, x8, #0x8
+; CHECK-NEXT:    str x1, [x8, #8]
+; CHECK-NEXT:    str x1, [x8, #16]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_unaligned_int64_t:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x8
+; CHECK-NEXT-DEFAULT:    stp x1, x1, [x8, #8]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_unaligned_int64_t:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffff80
+; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x8
+; CHECK-NEXT-DISABLE-STP:    str x1, [x8, #8]
+; CHECK-NEXT-DISABLE-STP:    str x1, [x8, #16]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -128
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds i64, ptr %5, i64 1
+  store i64 %1, ptr %6, align 8
+  %7 = getelementptr inbounds i64, ptr %5, i64 2
+  store i64 %1, ptr %7, align 16
+  ret ptr %6
+}
+
+define ptr @stp_unaligned_v4si(ptr %0, <4 x i32> %1) #0 {
+; CHECK-LABEL: stp_unaligned_v4si:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT:    orr x0, x8, #0x10
+; CHECK-NEXT:    str q0, [x8, #16]
+; CHECK-NEXT:    str q0, [x8, #32]
+; CHECK-NEXT:    ret
+;
+; CHECK-LABEL-DEFAULT: stp_unaligned_v4si:
+; CHECK-DEFAULT:       // %bb.0:
+; CHECK-NEXT-DEFAULT:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DEFAULT:    orr x0, x8, #0x10
+; CHECK-NEXT-DEFAULT:    stp q0, q0, [x8, #16]
+; CHECK-NEXT-DEFAULT:    ret
+;
+; CHECK-LABEL-DISABLE-STP: stp_unaligned_v4si:
+; CHECK-DISABLE-STP:       // %bb.0:
+; CHECK-NEXT-DISABLE-STP:    and x8, x0, #0xffffffffffffff00
+; CHECK-NEXT-DISABLE-STP:    orr x0, x8, #0x10
+; CHECK-NEXT-DISABLE-STP:    str q0, [x8, #16]
+; CHECK-NEXT-DISABLE-STP:    str q0, [x8, #32]
+; CHECK-NEXT-DISABLE-STP:    ret
+  %3 = ptrtoint ptr %0 to i64
+  %4 = and i64 %3, -256
+  %5 = inttoptr i64 %4 to ptr
+  %6 = getelementptr inbounds <4 x i32>, ptr %5, i64 1
+  store <4 x i32> %1, ptr %6, align 16
+  %7 = getelementptr inbounds <4 x i32>, ptr %5, i64 2
+  store <4 x i32> %1, ptr %7, align 32
+  ret ptr %6
+}
+
    
    
More information about the llvm-commits
mailing list