[clang] [llvm] [AArch64][SVE] Lower unpredicated loads/stores as LDR/STR. (PR #127837)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 10:04:34 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Ricardo Jesus (rj-jesus)
Currently, given:
```cpp
svuint8_t foo(uint8_t *x) {
return svld1(svptrue_b8(), x);
}
```
We generate:
```gas
foo:
ptrue p0.b
ld1b { z0.b }, p0/z, [x0]
ret
```
On little-endian targets, we can instead use LDR:
```gas
foo:
ldr z0, [x0]
ret
```
The second form avoids the predicate dependency.
The same applies to other element types and to stores, for instance:
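As a minimal sketch of the store case (the function name `bar` is illustrative, not from the patch), using the overloaded `svst1` intrinsic:
```cpp
#include <arm_sve.h>

// Hypothetical example: store a vector through an all-true predicate.
void bar(uint8_t *x, svuint8_t v) {
  svst1(svptrue_b8(), x, v);
}
```
This previously emitted a `ptrue` followed by `st1b`; with the patch it should lower to:
```gas
bar:
str z0, [x0] // was: ptrue p0.b + st1b { z0.b }, p0, [x0]
ret
```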
This generates a fair number of test changes, but all but
`llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll`
seem benign.
---
Patch is 413.30 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/127837.diff
69 Files Affected:
- (modified) clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c (+6-3)
- (modified) llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td (+14)
- (modified) llvm/lib/Target/AArch64/SVEInstrFormats.td (+1)
- (modified) llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll (+24-28)
- (modified) llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll (+6-7)
- (modified) llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll (+18-18)
- (modified) llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll (+1-2)
- (modified) llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll (+8-8)
- (modified) llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll (+48-53)
- (modified) llvm/test/CodeGen/AArch64/nontemporal-load.ll (+10-12)
- (modified) llvm/test/CodeGen/AArch64/sinksplat.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll (+161-137)
- (modified) llvm/test/CodeGen/AArch64/sme-peephole-opts.ll (+2-4)
- (modified) llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll (+43-86)
- (modified) llvm/test/CodeGen/AArch64/sme-streaming-interface.ll (+1-2)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-faminmax.ll (+6-12)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll (+4-6)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll (+22-37)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll (+14-28)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll (+14-28)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll (+9-18)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll (+8-16)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll (+4-8)
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-vdot.ll (+6-12)
- (modified) llvm/test/CodeGen/AArch64/spillfill-sve.ll (+32-32)
- (modified) llvm/test/CodeGen/AArch64/split-vector-insert.ll (+24-28)
- (modified) llvm/test/CodeGen/AArch64/stack-guard-sve.ll (+6-8)
- (modified) llvm/test/CodeGen/AArch64/stack-hazard.ll (+9-18)
- (modified) llvm/test/CodeGen/AArch64/sve-aliasing.ll (+26-26)
- (modified) llvm/test/CodeGen/AArch64/sve-alloca.ll (+2-3)
- (modified) llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll (+6-11)
- (modified) llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll (+152-168)
- (modified) llvm/test/CodeGen/AArch64/sve-dead-masked-store.ll (+2-4)
- (modified) llvm/test/CodeGen/AArch64/sve-extload-icmp.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll (+44-53)
- (modified) llvm/test/CodeGen/AArch64/sve-forward-st-to-ld.ll (+7-13)
- (modified) llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll (+11-11)
- (modified) llvm/test/CodeGen/AArch64/sve-fp.ll (+2-3)
- (modified) llvm/test/CodeGen/AArch64/sve-fpext-load.ll (+2-4)
- (modified) llvm/test/CodeGen/AArch64/sve-fptrunc-store.ll (+10-11)
- (modified) llvm/test/CodeGen/AArch64/sve-insert-element.ll (+4-4)
- (modified) llvm/test/CodeGen/AArch64/sve-insert-vector.ll (+49-61)
- (modified) llvm/test/CodeGen/AArch64/sve-int-arith.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/sve-ld1-addressing-mode-reg-imm.ll (+10-22)
- (modified) llvm/test/CodeGen/AArch64/sve-ld1r.ll (+5-9)
- (modified) llvm/test/CodeGen/AArch64/sve-llrint.ll (+655-648)
- (modified) llvm/test/CodeGen/AArch64/sve-load-store-strict-align.ll (+8-12)
- (modified) llvm/test/CodeGen/AArch64/sve-lrint.ll (+655-648)
- (modified) llvm/test/CodeGen/AArch64/sve-lsrchain.ll (+20-20)
- (modified) llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll (+8-9)
- (modified) llvm/test/CodeGen/AArch64/sve-min-max-pred.ll (+1-1)
- (modified) llvm/test/CodeGen/AArch64/sve-pr92779.ll (+1-2)
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll (+3-3)
- (modified) llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll (+3-3)
- (modified) llvm/test/CodeGen/AArch64/sve-reassocadd.ll (+9-18)
- (modified) llvm/test/CodeGen/AArch64/sve-redundant-store.ll (+8-14)
- (modified) llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll (+18-25)
- (modified) llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll (+28-33)
- (modified) llvm/test/CodeGen/AArch64/sve-split-load.ll (+17-21)
- (modified) llvm/test/CodeGen/AArch64/sve-split-store.ll (+14-17)
- (modified) llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll (+10-22)
- (modified) llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll (+9-14)
- (modified) llvm/test/CodeGen/AArch64/sve-unaligned-load-store-strict-align.ll (+2-3)
- (modified) llvm/test/CodeGen/AArch64/sve-vector-compress.ll (+3-3)
- (modified) llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll (+11-11)
- (modified) llvm/test/CodeGen/AArch64/sve2-rsh.ll (+1-2)
- (modified) llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll (+16-24)
- (modified) llvm/test/CodeGen/AArch64/zext-to-tbl.ll (+6-6)
- (modified) llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll (+29-38)
``````````diff
diff --git a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c
index 692d11d97f486..0ed14b4b3b793 100644
--- a/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c
+++ b/clang/test/CodeGen/AArch64/sve-vector-bits-codegen.c
@@ -13,9 +13,12 @@
void func(int *restrict a, int *restrict b) {
// CHECK-LABEL: func
-// CHECK256-COUNT-8: st1w
-// CHECK512-COUNT-4: st1w
-// CHECK1024-COUNT-2: st1w
+// CHECK256-COUNT-1: str
+// CHECK256-COUNT-7: st1w
+// CHECK512-COUNT-1: str
+// CHECK512-COUNT-3: st1w
+// CHECK1024-COUNT-1: str
+// CHECK1024-COUNT-1: st1w
// CHECK2048-COUNT-1: st1w
#pragma clang loop vectorize(enable)
for (int i = 0; i < 64; ++i)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 28aecd14e33fa..d1393aebe3ad9 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2977,14 +2977,28 @@ let Predicates = [HasSVE_or_SME] in {
// Allow using the reg+reg form of ld1b/st1b for memory accesses with the
// same width as nxv16i8. This saves an add in cases where we would
// otherwise compute the address separately.
+ // Also allow using LDR/STR to avoid the predicate dependence.
multiclass unpred_loadstore_bitcast<ValueType Ty> {
let Predicates = [IsLE] in {
def : Pat<(Ty (load (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset))),
(LD1B (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
def : Pat<(store Ty:$val, (am_sve_regreg_lsl0 GPR64sp:$base, GPR64:$offset)),
(ST1B ZPR:$val, (PTRUE_B 31), GPR64sp:$base, GPR64:$offset)>;
+
+ let AddedComplexity = 2 in {
+ def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
+ (LDR_ZXI GPR64sp:$base, simm9:$offset)>;
+ def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
+ (STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
+ }
+
+ def : Pat<(Ty (load GPR64sp:$base)),
+ (LDR_ZXI GPR64sp:$base, (i64 0))>;
+ def : Pat<(store Ty:$val, GPR64sp:$base),
+ (STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>;
}
}
+ defm : unpred_loadstore_bitcast<nxv16i8>;
defm : unpred_loadstore_bitcast<nxv8i16>;
defm : unpred_loadstore_bitcast<nxv8f16>;
defm : unpred_loadstore_bitcast<nxv8bf16>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index e443c5ab150bd..48f71297f8377 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9668,6 +9668,7 @@ multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatter
let WantsRoot = true in {
def am_sve_indexed_s4 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-8, 7>">;
def am_sve_indexed_s6 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-32, 31>">;
+ def am_sve_indexed_s9 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<-256, 255>">;
}
def am_sve_regreg_lsl0 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<0>", []>;
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
index 7244ac949ab88..3a808f5a02f0d 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-array.ll
@@ -13,13 +13,12 @@ define void @array_1D(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -37,8 +36,7 @@ define %my_subtype @array_1D_extract(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -56,12 +54,11 @@ define void @array_1D_insert(ptr %addr, %my_subtype %elt) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: str z0, [sp, #1, mul vl]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -80,19 +77,18 @@ define void @array_2D(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-6
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x30, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 48 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #5, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #4, mul vl]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #5, mul vl]
-; CHECK-NEXT: st1d { z3.d }, p0, [sp, #4, mul vl]
-; CHECK-NEXT: st1d { z5.d }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: st1d { z4.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #5, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0, #4, mul vl]
+; CHECK-NEXT: ldr z4, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z5, [x0, #3, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #5, mul vl]
+; CHECK-NEXT: str z3, [sp, #4, mul vl]
+; CHECK-NEXT: str z5, [sp, #3, mul vl]
+; CHECK-NEXT: str z4, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #6
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
index f03a6f018d34d..e7d8f4ff39cee 100644
--- a/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
+++ b/llvm/test/CodeGen/AArch64/alloca-load-store-scalable-struct.ll
@@ -12,13 +12,12 @@ define void @test(ptr %addr) #0 {
; CHECK-NEXT: addvl sp, sp, #-3
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG
; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: st1d { z1.d }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ldr z1, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #3
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 8e26ef6b87ecc..668dc18df6a0b 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -25,11 +25,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
@@ -114,11 +114,11 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
; CHECK-NEXT: .LBB1_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z2, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
+; CHECK-NEXT: ldr z4, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x1]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: add x0, x0, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
@@ -196,16 +196,16 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: mov z3.d, z0.d
; CHECK-NEXT: .LBB2_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
+; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z5, [x0]
; CHECK-NEXT: subs x9, x9, x8
-; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl]
-; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl]
-; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1]
-; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ldr z6, [x0, #3, mul vl]
+; CHECK-NEXT: ldr z7, [x1, #1, mul vl]
+; CHECK-NEXT: ldr z16, [x1]
+; CHECK-NEXT: ldr z17, [x0, #2, mul vl]
; CHECK-NEXT: add x0, x0, x10
-; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl]
-; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl]
+; CHECK-NEXT: ldr z18, [x1, #3, mul vl]
+; CHECK-NEXT: ldr z19, [x1, #2, mul vl]
; CHECK-NEXT: add x1, x1, x10
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
@@ -321,8 +321,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia
; CHECK-NEXT: zip1 z1.d, z2.d, z2.d
; CHECK-NEXT: .LBB3_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
-; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ldr z4, [x0, #1, mul vl]
; CHECK-NEXT: add x0, x0, x11
; CHECK-NEXT: ld1w { z5.d }, p0/z, [x3, x8, lsl #2]
; CHECK-NEXT: add x8, x8, x9
diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
index e6d5a2ac0fd79..820bc2c8a417f 100644
--- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
+++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll
@@ -97,8 +97,7 @@ define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 {
; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov z0.s, #1.00000000
-; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-NEXT: str z0, [x0]
; CHECK-NEXT: ret
entry:
%0 = shufflevector <vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
index 542b2e90ffc15..d5b9d17a98d55 100644
--- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll
@@ -103,9 +103,9 @@ define <vscale x 8 x i8> @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr %
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #1
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -147,9 +147,9 @@ define <vscale x 4 x i16> @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #2
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -191,9 +191,9 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsl x8, x8, #3
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: str q1, [x9, x8]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -211,10 +211,10 @@ define <vscale x 2 x i32> @vec_scalable_subvec_fixed_idx_nonzero_large_i32(ptr %
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ptrue p1.d, vl8
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
-; CHECK-NEXT: st1d { z0.d }, p0, [sp]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: ld1w { z0.d }, p1/z, [x1]
; CHECK-NEXT: st1d { z0.d }, p0, [sp]
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
index d1171bc312473..69e805d9ca2ee 100644
--- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
@@ -328,15 +328,14 @@ define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8
; CHECK: // %bb.0:
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-4
-; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov x8, sp
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: orr x8, x8, #0x8
-; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
-; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
-; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8, #1, mul vl]
+; CHECK-NEXT: str z3, [sp, #3, mul vl]
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [x8]
+; CHECK-NEXT: ldr z1, [x8, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -354,22 +353,22 @@ define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vsca
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: sub x8, x8, #1
+; CHECK-NEXT: str z3, [sp, #3, mul vl]
; CHECK-NEXT: cmp x8, #16
-; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl]
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
; CHECK-NEXT: csel x8, x8, x9, lo
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: add x10, x9, x8, lsl #2
-; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: st1w { z0.s }, p0, [sp]
-; CHECK-NEXT: st1w { z7.s }, p0, [sp, #7, mul vl]
-; CHECK-NEXT: st1w { z4.s }, p0, [sp, #4, mul vl]
-; CHECK-NEXT: st1w { z5.s }, p0, [sp, #5, mul vl]
-; CHECK-NEXT: st1w { z6.s }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: str z7, [sp, #7, mul vl]
+; CHECK-NEXT: str z4, [sp, #4, mul vl]
+; CHECK-NEXT: str z5, [sp, #5, mul vl]
+; CHECK-NEXT: str z6, [sp, #6, mul vl]
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
-; CHECK-NEXT: ld1w { z1.s }, p0/z, [x10, #1, mul vl]
-; CHECK-NEXT: ld1w { z2.s }, p0/z, [x10, #2, mul vl]
-; CHECK-NEXT: ld1w { z3.s }, p0/z, [x10, #3, mul vl]
+; CHECK-NEXT: ldr z1, [x10, #1, mul vl]
+; CHECK-NEXT: ldr z2, [x10, #2, mul vl]
+; CHECK-NEXT: ldr z3, [x10, #3, mul vl]
; CHECK-NEXT: addvl sp, sp, #8
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -453,16 +452,15 @@ define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: mov w9, #17 // =0x11
-; CHECK-NEXT: cmp x8, #17
; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #17
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: add x8, x10, x8
-; CHECK-NEXT: st1b { z0.b }, p0, [sp]
-; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: sub x8, x8, x9
-; CHECK-NEXT: ld1b { z0.b }, p0/z, [x8]
+; CHECK-NEXT: ldr z0, [x8]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -498,16 +496,15 @@ define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov w9, #18 // =0x12
-; CHECK-NEXT: cmp x8, #18
; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #18
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: add x8, x10, x8
-; CHECK-NEXT: st1h { z0.h }, p0, [sp]
-; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
; CHECK-NEXT: sub x8, x8, x9
-; CHECK-NEXT: ld1h { z0.h }, p0/z, [x8]
+; CHECK-NEXT: ldr z0, [x8]
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -609,16 +606,15 @@ define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale
; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: addvl sp, sp, #-2
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: mov w9, #18 // =0x12
-; CHECK-NEXT: cmp x8, #18
; CHECK-NEXT: mov x10, sp
+; CHECK-NEXT: cmp x8, #18
+; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: csel x9, x8, x9, lo
; CHECK-NEXT: add x8, x10, x8
-; CHECK-NEXT: st1...
[truncated]
``````````
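For illustration, the new `am_sve_indexed_s9` addressing mode above accepts VL-scaled immediate offsets in [-256, 255], the immediate range of the vector LDR/STR instructions, so offset forms benefit as well. A sketch of the offset case (hypothetical function, not from the patch), using the `svld1_vnum` intrinsic:
```cpp
#include <arm_sve.h>

// Hypothetical example: load the vector one VL past the base pointer.
svfloat64_t baz(float64_t *x) {
  return svld1_vnum(svptrue_b64(), x, 1);
}
```
On little-endian, this should now lower to a single immediate-offset LDR:
```gas
baz:
ldr z0, [x0, #1, mul vl] // was: ptrue p0.d + ld1d { z0.d }, p0/z, [x0, #1, mul vl]
ret
```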
https://github.com/llvm/llvm-project/pull/127837
More information about the llvm-commits mailing list