[llvm] [AArch64][GISel] Optimize i128 stores by splitting into two i64 stores (PR #170276)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 2 08:12:07 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-aarch64
Author: None (ayank227)
<details>
<summary>Changes</summary>
This optimization splits i128 G_STORE(G_MERGE_VALUES(x, y)) into two i64 G_STOREs, so the two stores can later be paired into a single STP instruction instead of going through FP/vector registers.
---
Patch is 36.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/170276.diff
11 Files Affected:
- (modified) llvm/lib/Target/AArch64/AArch64Combine.td (+10-1)
- (modified) llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp (+63)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll (+15-45)
- (modified) llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll (+11-31)
- (modified) llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll (+11-15)
- (modified) llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll (+5-13)
- (modified) llvm/test/CodeGen/AArch64/dup.ll (+30-33)
- (modified) llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll (+39-63)
- (modified) llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll (+39-63)
- (modified) llvm/test/CodeGen/AArch64/insertextract.ll (+19-23)
- (modified) llvm/test/CodeGen/AArch64/store.ll (+2-5)
``````````diff
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 278314792bfb9..32955f3a6f952 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -280,6 +280,14 @@ def form_truncstore : GICombineRule<
(apply [{ applyFormTruncstore(*${root}, MRI, B, Observer, ${matchinfo}); }])
>;
+def split_store_128_matchdata : GIDefMatchData<"std::pair<Register, Register>">;
+def split_store_128 : GICombineRule<
+ (defs root:$root, split_store_128_matchdata:$matchinfo),
+ (match (G_STORE $src, $addr):$root,
+ [{ return matchSplitStore128(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applySplitStore128(*${root}, MRI, B, Observer, ${matchinfo}); }])
+>;
+
def fold_merge_to_zext : GICombineRule<
(defs root:$d),
(match (wip_match_opcode G_MERGE_VALUES):$d,
@@ -339,7 +347,8 @@ def AArch64PostLegalizerLowering
: GICombiner<"AArch64PostLegalizerLoweringImpl",
[shuffle_vector_lowering, vashr_vlshr_imm,
icmp_lowering, build_vector_lowering,
- lower_vector_fcmp, form_truncstore, fconstant_to_constant,
+ lower_vector_fcmp, form_truncstore, split_store_128,
+ fconstant_to_constant,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mulv2s64,
vector_unmerge_lowering, insertelt_nonconst,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4fba593b3d0fb..7152558580763 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -1119,6 +1119,69 @@ void applyFormTruncstore(MachineInstr &MI, MachineRegisterInfo &MRI,
Observer.changedInstr(MI);
}
+/// Optimize i128 stores by splitting into two i64 stores for STP pairing
+bool matchSplitStore128(MachineInstr &MI, MachineRegisterInfo &MRI,
+ std::pair<Register, Register> &Parts) {
+ assert(MI.getOpcode() == TargetOpcode::G_STORE);
+ GStore &Store = cast<GStore>(MI);
+
+ Register ValueReg = Store.getValueReg();
+ LLT ValueTy = MRI.getType(ValueReg);
+
+ // Only handle scalar types
+ if (!ValueTy.isScalar())
+ return false;
+
+ if (ValueTy.getSizeInBits() != 128)
+ return false;
+
+ // Check if the value comes from G_MERGE_VALUES
+ MachineInstr *DefMI = MRI.getVRegDef(ValueReg);
+ if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_MERGE_VALUES)
+ return false;
+
+ // Get the two i64 parts
+ if (DefMI->getNumOperands() != 3) // Dst + 2 sources
+ return false;
+
+ Register Part0 = DefMI->getOperand(1).getReg();
+ Register Part1 = DefMI->getOperand(2).getReg();
+
+ if (MRI.getType(Part0) != LLT::scalar(64) ||
+ MRI.getType(Part1) != LLT::scalar(64))
+ return false;
+
+ Parts = {Part0, Part1};
+ return true;
+}
+
+void applySplitStore128(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, GISelChangeObserver &Observer,
+ std::pair<Register, Register> &Parts) {
+ assert(MI.getOpcode() == TargetOpcode::G_STORE);
+ GStore &Store = cast<GStore>(MI);
+
+ B.setInstrAndDebugLoc(MI);
+
+ Register PtrReg = Store.getPointerReg();
+ MachineMemOperand &MMO = Store.getMMO();
+
+ // Create two i64 stores
+ // Store low part at [ptr]
+ B.buildStore(Parts.first, PtrReg, MMO.getPointerInfo(), MMO.getAlign(),
+ MMO.getFlags());
+
+ // Calculate offset for high part: ptr + 8
+ auto Offset = B.buildConstant(LLT::scalar(64), 8);
+ auto PtrHi = B.buildPtrAdd(MRI.getType(PtrReg), PtrReg, Offset);
+
+ // Store high part at [ptr + 8]
+ B.buildStore(Parts.second, PtrHi, MMO.getPointerInfo().getWithOffset(8),
+ commonAlignment(MMO.getAlign(), 8), MMO.getFlags());
+
+ MI.eraseFromParent();
+}
+
// Lower vector G_SEXT_INREG back to shifts for selection. We allowed them to
// form in the first place for combine opportunities, so any remaining ones
// at this stage need be lowered back.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
index be51210882eaa..ac18249950e9e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-atomic-128.ll
@@ -27,9 +27,7 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
; CHECK-LLSC-O1-NEXT: stxp w10, x4, x5, [x0]
; CHECK-LLSC-O1-NEXT: cbnz w10, .LBB0_1
; CHECK-LLSC-O1-NEXT: .LBB0_4:
-; CHECK-LLSC-O1-NEXT: mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT: mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT: str q0, [x0]
+; CHECK-LLSC-O1-NEXT: stp x8, x9, [x0]
; CHECK-LLSC-O1-NEXT: ret
;
; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap:
@@ -45,9 +43,7 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19
; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT: stp x0, x1, [x19]
; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; CHECK-OUTLINE-LLSC-O1-NEXT: ret
;
@@ -58,9 +54,7 @@ define void @val_compare_and_swap(ptr %p, i128 %oldval, i128 %newval) {
; CHECK-CAS-O1-NEXT: // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
; CHECK-CAS-O1-NEXT: // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
; CHECK-CAS-O1-NEXT: caspa x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT: mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT: mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT: str q0, [x0]
+; CHECK-CAS-O1-NEXT: stp x2, x3, [x0]
; CHECK-CAS-O1-NEXT: ret
;
; CHECK-LLSC-O0-LABEL: val_compare_and_swap:
@@ -154,9 +148,7 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
; CHECK-LLSC-O1-NEXT: stlxp w10, x4, x5, [x0]
; CHECK-LLSC-O1-NEXT: cbnz w10, .LBB1_1
; CHECK-LLSC-O1-NEXT: .LBB1_4:
-; CHECK-LLSC-O1-NEXT: mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT: mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT: str q0, [x0]
+; CHECK-LLSC-O1-NEXT: stp x8, x9, [x0]
; CHECK-LLSC-O1-NEXT: ret
;
; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic_seqcst:
@@ -172,9 +164,7 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19
; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq_rel
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT: stp x0, x1, [x19]
; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; CHECK-OUTLINE-LLSC-O1-NEXT: ret
;
@@ -185,9 +175,7 @@ define void @val_compare_and_swap_monotonic_seqcst(ptr %p, i128 %oldval, i128 %n
; CHECK-CAS-O1-NEXT: // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
; CHECK-CAS-O1-NEXT: // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
; CHECK-CAS-O1-NEXT: caspal x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT: mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT: mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT: str q0, [x0]
+; CHECK-CAS-O1-NEXT: stp x2, x3, [x0]
; CHECK-CAS-O1-NEXT: ret
;
; CHECK-LLSC-O0-LABEL: val_compare_and_swap_monotonic_seqcst:
@@ -281,9 +269,7 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
; CHECK-LLSC-O1-NEXT: stlxp w10, x4, x5, [x0]
; CHECK-LLSC-O1-NEXT: cbnz w10, .LBB2_1
; CHECK-LLSC-O1-NEXT: .LBB2_4:
-; CHECK-LLSC-O1-NEXT: mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT: mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT: str q0, [x0]
+; CHECK-LLSC-O1-NEXT: stp x8, x9, [x0]
; CHECK-LLSC-O1-NEXT: ret
;
; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_release_acquire:
@@ -299,9 +285,7 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19
; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq_rel
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT: stp x0, x1, [x19]
; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; CHECK-OUTLINE-LLSC-O1-NEXT: ret
;
@@ -312,9 +296,7 @@ define void @val_compare_and_swap_release_acquire(ptr %p, i128 %oldval, i128 %ne
; CHECK-CAS-O1-NEXT: // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
; CHECK-CAS-O1-NEXT: // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
; CHECK-CAS-O1-NEXT: caspal x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT: mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT: mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT: str q0, [x0]
+; CHECK-CAS-O1-NEXT: stp x2, x3, [x0]
; CHECK-CAS-O1-NEXT: ret
;
; CHECK-LLSC-O0-LABEL: val_compare_and_swap_release_acquire:
@@ -408,9 +390,7 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
; CHECK-LLSC-O1-NEXT: stlxp w10, x4, x5, [x0]
; CHECK-LLSC-O1-NEXT: cbnz w10, .LBB3_1
; CHECK-LLSC-O1-NEXT: .LBB3_4:
-; CHECK-LLSC-O1-NEXT: mov v0.d[0], x8
-; CHECK-LLSC-O1-NEXT: mov v0.d[1], x9
-; CHECK-LLSC-O1-NEXT: str q0, [x0]
+; CHECK-LLSC-O1-NEXT: stp x8, x9, [x0]
; CHECK-LLSC-O1-NEXT: ret
;
; CHECK-OUTLINE-LLSC-O1-LABEL: val_compare_and_swap_monotonic:
@@ -426,9 +406,7 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x3, x5
; CHECK-OUTLINE-LLSC-O1-NEXT: mov x4, x19
; CHECK-OUTLINE-LLSC-O1-NEXT: bl __aarch64_cas16_acq_rel
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x0
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x1
-; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x19]
+; CHECK-OUTLINE-LLSC-O1-NEXT: stp x0, x1, [x19]
; CHECK-OUTLINE-LLSC-O1-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; CHECK-OUTLINE-LLSC-O1-NEXT: ret
;
@@ -439,9 +417,7 @@ define void @val_compare_and_swap_monotonic(ptr %p, i128 %oldval, i128 %newval)
; CHECK-CAS-O1-NEXT: // kill: def $x3 killed $x3 killed $x2_x3 def $x2_x3
; CHECK-CAS-O1-NEXT: // kill: def $x5 killed $x5 killed $x4_x5 def $x4_x5
; CHECK-CAS-O1-NEXT: caspal x2, x3, x4, x5, [x0]
-; CHECK-CAS-O1-NEXT: mov v0.d[0], x2
-; CHECK-CAS-O1-NEXT: mov v0.d[1], x3
-; CHECK-CAS-O1-NEXT: str q0, [x0]
+; CHECK-CAS-O1-NEXT: stp x2, x3, [x0]
; CHECK-CAS-O1-NEXT: ret
;
; CHECK-LLSC-O0-LABEL: val_compare_and_swap_monotonic:
@@ -525,9 +501,7 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
; CHECK-LLSC-O1-NEXT: stxp w10, x9, x8, [x2]
; CHECK-LLSC-O1-NEXT: cbnz w10, .LBB4_1
; CHECK-LLSC-O1-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-LLSC-O1-NEXT: mov v0.d[0], x9
-; CHECK-LLSC-O1-NEXT: mov v0.d[1], x8
-; CHECK-LLSC-O1-NEXT: str q0, [x3]
+; CHECK-LLSC-O1-NEXT: stp x9, x8, [x3]
; CHECK-LLSC-O1-NEXT: ret
;
; CHECK-OUTLINE-LLSC-O1-LABEL: atomic_load_relaxed:
@@ -538,9 +512,7 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
; CHECK-OUTLINE-LLSC-O1-NEXT: stxp w10, x9, x8, [x2]
; CHECK-OUTLINE-LLSC-O1-NEXT: cbnz w10, .LBB4_1
; CHECK-OUTLINE-LLSC-O1-NEXT: // %bb.2: // %atomicrmw.end
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[0], x9
-; CHECK-OUTLINE-LLSC-O1-NEXT: mov v0.d[1], x8
-; CHECK-OUTLINE-LLSC-O1-NEXT: str q0, [x3]
+; CHECK-OUTLINE-LLSC-O1-NEXT: stp x9, x8, [x3]
; CHECK-OUTLINE-LLSC-O1-NEXT: ret
;
; CHECK-CAS-O1-LABEL: atomic_load_relaxed:
@@ -548,9 +520,7 @@ define void @atomic_load_relaxed(i64, i64, ptr %p, ptr %p2) {
; CHECK-CAS-O1-NEXT: mov x0, xzr
; CHECK-CAS-O1-NEXT: mov x1, xzr
; CHECK-CAS-O1-NEXT: casp x0, x1, x0, x1, [x2]
-; CHECK-CAS-O1-NEXT: mov v0.d[0], x0
-; CHECK-CAS-O1-NEXT: mov v0.d[1], x1
-; CHECK-CAS-O1-NEXT: str q0, [x3]
+; CHECK-CAS-O1-NEXT: stp x0, x1, [x3]
; CHECK-CAS-O1-NEXT: ret
;
; CHECK-LLSC-O0-LABEL: atomic_load_relaxed:
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll b/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
index fe7e24c2d8ba5..a4dcd0155a449 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/v8.4-atomic-128.ll
@@ -5,56 +5,42 @@ define void @test_atomic_load(ptr %addr) {
; CHECK-LABEL: test_atomic_load:
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%res.0 = load atomic i128, ptr %addr monotonic, align 16
store i128 %res.0, ptr %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%res.1 = load atomic i128, ptr %addr unordered, align 16
store i128 %res.1, ptr %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
-; CHECK: dmb ish
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: dmb ishld
+; CHECK: stp [[LO]], [[HI]], [x0]
%res.2 = load atomic i128, ptr %addr acquire, align 16
store i128 %res.2, ptr %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0]
; CHECK: dmb ish
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%res.3 = load atomic i128, ptr %addr seq_cst, align 16
store i128 %res.3, ptr %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #8]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, ptr %addr, i32 8
%res.5 = load atomic i128, ptr %addr8.1 monotonic, align 16
store i128 %res.5, ptr %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #504]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.2 = getelementptr i8, ptr %addr, i32 504
%res.6 = load atomic i128, ptr %addr8.2 monotonic, align 16
store i128 %res.6, ptr %addr
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0, #-512]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.3 = getelementptr i8, ptr %addr, i32 -512
%res.7 = load atomic i128, ptr %addr8.3 monotonic, align 16
store i128 %res.7, ptr %addr
@@ -76,9 +62,7 @@ define void @test_nonfolded_load1(ptr %addr) {
; CHECK: add x[[ADDR:[0-9]+]], x0, #4
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, ptr %addr, i32 4
%res.1 = load atomic i128, ptr %addr8.1 monotonic, align 16
store i128 %res.1, ptr %addr
@@ -91,9 +75,7 @@ define void @test_nonfolded_load2(ptr %addr) {
; CHECK: add x[[ADDR:[0-9]+]], x0, #512
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, ptr %addr, i32 512
%res.1 = load atomic i128, ptr %addr8.1 monotonic, align 16
store i128 %res.1, ptr %addr
@@ -106,9 +88,7 @@ define void @test_nonfolded_load3(ptr %addr) {
; CHECK: sub x[[ADDR:[0-9]+]], x0, #520
; CHECK: ldp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x[[ADDR]]]
-; CHECK: mov v[[Q:[0-9]+]].d[0], [[LO]]
-; CHECK: mov v[[Q]].d[1], [[HI]]
-; CHECK: str q[[Q]], [x0]
+; CHECK: stp [[LO]], [[HI]], [x0]
%addr8.1 = getelementptr i8, ptr %addr, i32 -520
%res.1 = load atomic i128, ptr %addr8.1 monotonic, align 16
store i128 %res.1, ptr %addr
diff --git a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
index 63dcafed2320a..a188a1cfa7502 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -383,13 +383,11 @@ define i128 @gep4(ptr %p, i128 %a, i64 %b) {
; CHECK0-GISEL-LABEL: gep4:
; CHECK0-GISEL: // %bb.0:
; CHECK0-GISEL-NEXT: add x8, x0, x4, lsl #4
-; CHECK0-GISEL-NEXT: mov v0.d[0], x2
-; CHECK0-GISEL-NEXT: ldr q1, [x8]
-; CHECK0-GISEL-NEXT: mov d2, v1.d[1]
-; CHECK0-GISEL-NEXT: mov v0.d[1], x3
-; CHECK0-GISEL-NEXT: fmov x0, d1
-; CHECK0-GISEL-NEXT: fmov x1, d2
-; CHECK0-GISEL-NEXT: str q0, [x8]
+; CHECK0-GISEL-NEXT: ldr q0, [x8]
+; CHECK0-GISEL-NEXT: stp x2, x3, [x8]
+; CHECK0-GISEL-NEXT: mov d1, v0.d[1]
+; CHECK0-GISEL-NEXT: fmov x0, d0
+; CHECK0-GISEL-NEXT: fmov x1, d1
; CHECK0-GISEL-NEXT: ret
;
; CHECK3-SDAG-LABEL: gep4:
@@ -401,14 +399,12 @@ define i128 @gep4(ptr %p, i128 %a, i64 %b) {
;
; CHECK3-GISEL-LABEL: gep4:
; CHECK3-GISEL: // %bb.0:
-; CHECK3-GISEL-NEXT: ldr q1, [x0, x4, lsl #4]
-; CHECK3-GISEL-NEXT: mov v0.d[0], x2
-; CHECK3-GISEL-NEXT: mov x8, x0
-; CHECK3-GISEL-NEXT: mov d2, v1.d[1]
-; CHECK3-GISEL-NEXT: fmov x0, d1
-; CHECK3-GISEL-NEXT: mov v0.d[1], x3
-; CHECK3-GISEL-NEXT: fmov x1, d2
-; CHECK3-GISEL-NEXT: str q0, [x8, x4, lsl #4]
+; CHECK3-GISEL-NEXT: ldr q0, [x0, x4, lsl #4]
+; CHECK3-GISEL-NEXT: add x8, x0, x4, lsl #4
+; CHECK3-GISEL-NEXT: mov d1, v0.d[1]
+; CHECK3-GISEL-NEXT: fmov x0, d0
+; CHECK3-GISEL-NEXT: stp x2, x3, [x8]
+; CHECK3-GISEL-NEXT: fmov x1, d1
; CHECK3-GISEL-NEXT: ret
%g = getelementptr inbounds i128, ptr %p, i64 %b
%l = load i128, ptr %g
diff --git a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
index 0e1e15f9b6b91..cbcc6184182ae 100644
--- a/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-sli-sri-opt.ll
@@ -663,19 +663,11 @@ define void @testRightBad2x64(<2 x i64> %src1, <2 x i64> %src2, ptr %dest) nounw
}
define void @testLeftShouldNotCreateSLI1x128(<1 x i128> %src1, <1 x i128> %src2, ptr %dest) nounwind {
-; CHECK-SD-LABEL: testLeftShouldNotCreateSLI1x128:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: bfi x1, x2, #6, #58
-; CHECK-SD-NEXT: stp x0, x1, [x4]
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: testLeftShouldNotCreateSLI1x128:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: mov.d v0[0], x0
-; CHECK-GI-NEXT: bfi x1, x2, #6, #58
-; CHECK-GI-NEXT: mov.d v0[1], x1
-; CHECK-GI-NEXT: str q0, [x4]
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: testLeftShouldNotCreateSLI1x128:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bfi x1, x2, #6, #58
+; CHECK-NEXT: stp x0, x1, [x4]
+; CHECK-NEXT: ret
%and.i = and <1 x i128> %src1, <i128 1180591620717411303423>
%vshl_n = shl <1 x i128> %src2, <i128 70>
%result = or <1 x i128> %and.i, %vshl_n
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 6df6d76fb0592..6b0c62490a5db 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -1252,16 +1252,15 @@ define <2 x i128> @loaddup_str_v2i128(ptr %p) {
;
; CHECK-GI-LABEL: loaddup_str_v2i128:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: ldr q1, [x0]
-; CHECK-GI-NEXT: mov v0.d[0], xzr
+; CHECK-GI-NEXT: ldr q0, [x0]
; CHECK-GI-NEXT: mov x8, x0
-; CHECK-GI-NEXT: mov d2, v1.d[1]
-; CHECK-GI-NEXT: fmov x0, d1
-; CHECK-GI-NEXT: fmov x2, d1
-; CHECK-GI-NEXT: mov v0.d[1], xzr
-; CHECK-GI-NEXT: fmov x1, d2
-; CHECK-GI-NEXT: fmov x3, d2
-; CHECK-GI-NEXT: str q0, [x8]
+; CHECK-GI-NEXT: str xzr, [x0]
+; CHECK-GI-NEXT: str xzr, [x8, #8]
+; CHECK-GI-NEXT: mov d1, v0.d[1]
+; CHECK-GI-NEXT: fmov x0, d0
+; CHECK-GI-NEXT: fmov x2, d0
+; CHECK-GI-NEXT: fmov x1, d1
+; CHECK-GI-NEXT: fmov x3, d1
; CHECK-GI-NEXT: ret
entry:
%a = load i128, ptr %p
@@ -1340,18 +1339,17 ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/170276
More information about the llvm-commits
mailing list