[llvm] [AArch64] Fixes for BigEndian 128bit volatile, atomic and non-temporal loads/stores (PR #67413)

David Green via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 26 03:08:48 PDT 2023


https://github.com/davemgreen created https://github.com/llvm/llvm-project/pull/67413

This fixes up the generation of 128bit atomic, volatile and non-temporal loads/stores, under the assumption that their operands should usually be the same as standard loads/stores.
https://godbolt.org/z/xxc89eMKE

Non-temporal stores were disabled under BE to keep things simple, bringing them in line with the LE versions. Atomic and volatile STP and LDP nodes have their operands swapped to make sure they end up storing/loading data in the same order as the non-atomic/non-volatile versions.

Fixes #64580

>From a0259d57e85a3644e260aa8ff82c7ca8d6901a43 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Tue, 26 Sep 2023 10:36:09 +0100
Subject: [PATCH] [AArch64] Fixes for BigEndian 128bit volatile, atomic and
 non-temporal loads/stores

This fixes up the generation of 128bit atomic, volatile and non-temporal
loads/stores, under the assumption that they should usually be the same as
standard versions.
https://godbolt.org/z/xxc89eMKE

Fixes #64580
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  19 +--
 .../Atomics/aarch64_be-atomic-load-lse2.ll    |  16 +--
 .../aarch64_be-atomic-load-lse2_lse128.ll     |  16 +--
 .../Atomics/aarch64_be-atomic-load-rcpc3.ll   |  16 +--
 .../Atomics/aarch64_be-atomic-store-lse2.ll   |   8 +-
 .../aarch64_be-atomic-store-lse2_lse128.ll    |   4 +-
 .../Atomics/aarch64_be-atomic-store-rcpc3.ll  |   8 +-
 .../AArch64/i128_volatile_load_store.ll       |  29 ++---
 llvm/test/CodeGen/AArch64/nontemporal.ll      | 108 ++++--------------
 9 files changed, 79 insertions(+), 145 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3de6bd1ec94a82a..3199a971d13859a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -5705,11 +5705,11 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
     // legalization will break up 256 bit inputs.
     ElementCount EC = MemVT.getVectorElementCount();
     if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
-        EC.isKnownEven() &&
-        ((MemVT.getScalarSizeInBits() == 8u ||
-          MemVT.getScalarSizeInBits() == 16u ||
-          MemVT.getScalarSizeInBits() == 32u ||
-          MemVT.getScalarSizeInBits() == 64u))) {
+        EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
+        (MemVT.getScalarSizeInBits() == 8u ||
+         MemVT.getScalarSizeInBits() == 16u ||
+         MemVT.getScalarSizeInBits() == 32u ||
+         MemVT.getScalarSizeInBits() == 64u)) {
       SDValue Lo =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
@@ -5769,6 +5769,8 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
   SDLoc DL(Op);
   auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
   unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
+  if (DAG.getDataLayout().isBigEndian())
+    std::swap(StoreValue.first, StoreValue.second);
   SDValue Result = DAG.getMemIntrinsicNode(
       Opcode, DL, DAG.getVTList(MVT::Other),
       {StoreNode->getChain(), StoreValue.first, StoreValue.second,
@@ -24169,8 +24171,11 @@ void AArch64TargetLowering::ReplaceNodeResults(
           {LoadNode->getChain(), LoadNode->getBasePtr()},
           LoadNode->getMemoryVT(), LoadNode->getMemOperand());
 
-      SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
-                                 Result.getValue(0), Result.getValue(1));
+      unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
+
+      SDValue Pair =
+          DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+                      Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
       Results.append({Pair, Result.getValue(2) /* Chain */});
     }
     return;
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2.ll
index 2fd70537a394647..08f7dd32c128079 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2.ll
@@ -229,35 +229,35 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_unordered:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_unordered_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_monotonic:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_monotonic_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_acquire:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ishld
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
@@ -265,7 +265,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_acquire_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ishld
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
@@ -273,7 +273,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr)
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_seq_cst:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
@@ -281,7 +281,7 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2_lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2_lse128.ll
index 32c7507d1ce706b..74e612d59858fd6 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2_lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-lse2_lse128.ll
@@ -229,35 +229,35 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_unordered:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_unordered_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_monotonic:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_monotonic_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_acquire:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ishld
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
@@ -265,7 +265,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_acquire_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ishld
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
@@ -273,7 +273,7 @@ define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr)
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_seq_cst:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
@@ -281,7 +281,7 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
index a09b4c69755d599..de83b702d988ac6 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-load-rcpc3.ll
@@ -229,49 +229,49 @@ define dso_local i64 @load_atomic_i64_aligned_seq_cst_const(ptr readonly %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_unordered(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_unordered:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_unordered_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_unordered_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr unordered, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_monotonic:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_monotonic_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_monotonic_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr monotonic, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_acquire:
-; CHECK:    ldiapp x1, x0, [x0]
+; CHECK:    ldiapp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_acquire_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_acquire_const:
-; CHECK:    ldiapp x1, x0, [x0]
+; CHECK:    ldiapp x0, x1, [x0]
     %r = load atomic i128, ptr %ptr acquire, align 16
     ret i128 %r
 }
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_seq_cst:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
@@ -279,7 +279,7 @@ define dso_local i128 @load_atomic_i128_aligned_seq_cst(ptr %ptr) {
 
 define dso_local i128 @load_atomic_i128_aligned_seq_cst_const(ptr readonly %ptr) {
 ; CHECK-LABEL: load_atomic_i128_aligned_seq_cst_const:
-; CHECK:    ldp x1, x0, [x0]
+; CHECK:    ldp x0, x1, [x0]
 ; CHECK:    dmb ish
     %r = load atomic i128, ptr %ptr seq_cst, align 16
     ret i128 %r
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2.ll
index 5652cc52f024a99..c9c9de4f884b7cf 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2.ll
@@ -117,14 +117,14 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_unordered:
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr unordered, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_monotonic:
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr monotonic, align 16
     ret void
 }
@@ -132,7 +132,7 @@ define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_release:
 ; CHECK:    dmb ish
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
@@ -140,7 +140,7 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_seq_cst:
 ; CHECK:    dmb ish
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
 ; CHECK:    dmb ish
     store atomic i128 %value, ptr %ptr seq_cst, align 16
     ret void
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2_lse128.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2_lse128.ll
index 59b5a1aa038ab58..29d6b15c09022ca 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2_lse128.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-lse2_lse128.ll
@@ -117,14 +117,14 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_unordered:
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr unordered, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_monotonic:
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr monotonic, align 16
     ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
index f8f8fe7cd6fa557..84a1f38d423c254 100644
--- a/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
+++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64_be-atomic-store-rcpc3.ll
@@ -117,21 +117,21 @@ define dso_local void @store_atomic_i64_aligned_seq_cst(i64 %value, ptr %ptr) {
 
 define dso_local void @store_atomic_i128_aligned_unordered(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_unordered:
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr unordered, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_monotonic(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_monotonic:
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr monotonic, align 16
     ret void
 }
 
 define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_release:
-; CHECK:    stilp x1, x0, [x2]
+; CHECK:    stilp x0, x1, [x2]
     store atomic i128 %value, ptr %ptr release, align 16
     ret void
 }
@@ -139,7 +139,7 @@ define dso_local void @store_atomic_i128_aligned_release(i128 %value, ptr %ptr)
 define dso_local void @store_atomic_i128_aligned_seq_cst(i128 %value, ptr %ptr) {
 ; CHECK-LABEL: store_atomic_i128_aligned_seq_cst:
 ; CHECK:    dmb ish
-; CHECK:    stp x1, x0, [x2]
+; CHECK:    stp x0, x1, [x2]
 ; CHECK:    dmb ish
     store atomic i128 %value, ptr %ptr seq_cst, align 16
     ret void
diff --git a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
index c3a03b2cb35426b..302eaeb98540b4f 100644
--- a/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
+++ b/llvm/test/CodeGen/AArch64/i128_volatile_load_store.ll
@@ -129,15 +129,10 @@ entry:
 }
 
 define i128 @load_vol(i32, i32, ptr %p) {
-; CHECK-LE-LABEL: load_vol:
-; CHECK-LE:       // %bb.0: // %entry
-; CHECK-LE-NEXT:    ldp x0, x1, [x2]
-; CHECK-LE-NEXT:    ret
-;
-; CHECK-BE-LABEL: load_vol:
-; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    ldp x1, x0, [x2]
-; CHECK-BE-NEXT:    ret
+; CHECK-LABEL: load_vol:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldp x0, x1, [x2]
+; CHECK-NEXT:    ret
 entry:
   %l = load volatile i128, ptr %p, align 16
   ret i128 %l
@@ -154,16 +149,14 @@ entry:
 }
 
 define void @loadstore_vol(i128 %a, ptr %p) {
-; CHECK-LE-LABEL: loadstore_vol:
-; CHECK-LE:       // %bb.0: // %entry
-; CHECK-LE-NEXT:    stp x0, x1, [x2]
-; CHECK-LE-NEXT:    ret
-;
-; CHECK-BE-LABEL: loadstore_vol:
-; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    stp x1, x0, [x2]
-; CHECK-BE-NEXT:    ret
+; CHECK-LABEL: loadstore_vol:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    stp x0, x1, [x2]
+; CHECK-NEXT:    ret
 entry:
   store volatile i128 %a, ptr %p, align 16
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-BE: {{.*}}
+; CHECK-LE: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll
index fe19ca7e2cc43d1..f8ba150a0405ff2 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal.ll
@@ -10,9 +10,7 @@ define void @test_stnp_v4i64(ptr %p, <4 x i64> %v) #0 {
 ;
 ; CHECK-BE-LABEL: test_stnp_v4i64:
 ; CHECK-BE:       // %bb.0:
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
 ; CHECK-BE-NEXT:    ret
   store <4 x i64> %v, ptr %p, align 1, !nontemporal !0
   ret void
@@ -565,11 +563,7 @@ define void @test_stnp_v32i8(<32 x i8> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v32i8:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
-; CHECK-BE-NEXT:    rev64 v0.16b, v0.16b
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
 ; CHECK-BE-NEXT:    ret
 entry:
   store <32 x i8> %v, ptr %ptr, align 4, !nontemporal !0
@@ -585,16 +579,8 @@ define void @test_stnp_v32i16(<32 x i16> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v32i16:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    rev64 v3.8h, v3.8h
-; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
-; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
-; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-BE-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q2, q3, [x0, #32]
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q2, q3, [x0, #32]
 ; CHECK-BE-NEXT:    ret
 entry:
   store <32 x i16> %v, ptr %ptr, align 4, !nontemporal !0
@@ -610,16 +596,8 @@ define void @test_stnp_v32f16(<32 x half> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v32f16:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    rev64 v3.8h, v3.8h
-; CHECK-BE-NEXT:    rev64 v2.8h, v2.8h
-; CHECK-BE-NEXT:    rev64 v1.8h, v1.8h
-; CHECK-BE-NEXT:    rev64 v0.8h, v0.8h
-; CHECK-BE-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q2, q3, [x0, #32]
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q2, q3, [x0, #32]
 ; CHECK-BE-NEXT:    ret
 entry:
   store <32 x half> %v, ptr %ptr, align 4, !nontemporal !0
@@ -635,16 +613,8 @@ define void @test_stnp_v16i32(<16 x i32> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v16i32:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    rev64 v3.4s, v3.4s
-; CHECK-BE-NEXT:    rev64 v2.4s, v2.4s
-; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
-; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-BE-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q2, q3, [x0, #32]
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q2, q3, [x0, #32]
 ; CHECK-BE-NEXT:    ret
 entry:
   store <16 x i32> %v, ptr %ptr, align 4, !nontemporal !0
@@ -660,16 +630,8 @@ define void @test_stnp_v16f32(<16 x float> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v16f32:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    rev64 v3.4s, v3.4s
-; CHECK-BE-NEXT:    rev64 v2.4s, v2.4s
-; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
-; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-BE-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q2, q3, [x0, #32]
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q2, q3, [x0, #32]
 ; CHECK-BE-NEXT:    ret
 entry:
   store <16 x float> %v, ptr %ptr, align 4, !nontemporal !0
@@ -776,20 +738,10 @@ define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v16i32_invalid_offset:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    rev64 v3.4s, v3.4s
-; CHECK-BE-NEXT:    rev64 v2.4s, v2.4s
-; CHECK-BE-NEXT:    mov w8, #32032 // =0x7d20
-; CHECK-BE-NEXT:    rev64 v1.4s, v1.4s
-; CHECK-BE-NEXT:    rev64 v0.4s, v0.4s
-; CHECK-BE-NEXT:    mov w9, #32000 // =0x7d00
-; CHECK-BE-NEXT:    add x8, x0, x8
-; CHECK-BE-NEXT:    add x9, x0, x9
-; CHECK-BE-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q2, q3, [x8]
-; CHECK-BE-NEXT:    stnp q0, q1, [x9]
+; CHECK-BE-NEXT:    str q3, [x0, #32048]
+; CHECK-BE-NEXT:    str q2, [x0, #32032]
+; CHECK-BE-NEXT:    str q1, [x0, #32016]
+; CHECK-BE-NEXT:    str q0, [x0, #32000]
 ; CHECK-BE-NEXT:    ret
 entry:
   %gep = getelementptr <16 x i32>, ptr %ptr, i32 500
@@ -808,18 +760,10 @@ define void @test_stnp_v16f64(<16 x double> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v16f64:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    ext v7.16b, v7.16b, v7.16b, #8
-; CHECK-BE-NEXT:    ext v6.16b, v6.16b, v6.16b, #8
-; CHECK-BE-NEXT:    ext v5.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
-; CHECK-BE-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q6, q7, [x0, #96]
-; CHECK-BE-NEXT:    stnp q4, q5, [x0, #64]
-; CHECK-BE-NEXT:    stnp q2, q3, [x0, #32]
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q2, q3, [x0, #32]
+; CHECK-BE-NEXT:    stp q4, q5, [x0, #64]
+; CHECK-BE-NEXT:    stp q6, q7, [x0, #96]
 ; CHECK-BE-NEXT:    ret
 entry:
   store <16 x double> %v, ptr %ptr, align 4, !nontemporal !0
@@ -837,18 +781,10 @@ define void @test_stnp_v16i64(<16 x i64> %v, ptr %ptr) {
 ;
 ; CHECK-BE-LABEL: test_stnp_v16i64:
 ; CHECK-BE:       // %bb.0: // %entry
-; CHECK-BE-NEXT:    ext v7.16b, v7.16b, v7.16b, #8
-; CHECK-BE-NEXT:    ext v6.16b, v6.16b, v6.16b, #8
-; CHECK-BE-NEXT:    ext v5.16b, v5.16b, v5.16b, #8
-; CHECK-BE-NEXT:    ext v4.16b, v4.16b, v4.16b, #8
-; CHECK-BE-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
-; CHECK-BE-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
-; CHECK-BE-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
-; CHECK-BE-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
-; CHECK-BE-NEXT:    stnp q6, q7, [x0, #96]
-; CHECK-BE-NEXT:    stnp q4, q5, [x0, #64]
-; CHECK-BE-NEXT:    stnp q2, q3, [x0, #32]
-; CHECK-BE-NEXT:    stnp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q0, q1, [x0]
+; CHECK-BE-NEXT:    stp q2, q3, [x0, #32]
+; CHECK-BE-NEXT:    stp q4, q5, [x0, #64]
+; CHECK-BE-NEXT:    stp q6, q7, [x0, #96]
 ; CHECK-BE-NEXT:    ret
 entry:
   store <16 x i64> %v, ptr %ptr, align 4, !nontemporal !0



More information about the llvm-commits mailing list