[llvm] [AArch64][SVE] Lower unpredicated loads/stores as fixed LDR/STR with -msve-vector-bits=128. (PR #127500)
Ricardo Jesus via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 10:23:04 PDT 2025
https://github.com/rj-jesus updated https://github.com/llvm/llvm-project/pull/127500
From c69b267eff13a7984e73d77a6b148aeef44d7811 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 14 Feb 2025 05:37:21 -0800
Subject: [PATCH 1/3] Precommit unpredicated loads/stores tests
---
.../AArch64/sve-unpred-loads-stores.ll | 441 ++++++++++++++++++
1 file changed, 441 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
diff --git a/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
new file mode 100644
index 0000000000000..d1b8edaf6b9dc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
@@ -0,0 +1,441 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -aarch64-sve-vector-bits-max=0 < %s | FileCheck %s --check-prefix=CHECK-VLA
+; RUN: llc -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv16i8:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv16i8:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 16 x i8>, ptr %0, align 16
+ ret <vscale x 16 x i8> %2
+}
+
+define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv16i8:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv16i8:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 16 x i8> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8i16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8i16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 8 x i16>, ptr %0, align 16
+ ret <vscale x 8 x i16> %2
+}
+
+define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8i16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8i16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 8 x i16> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4i32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4i32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 4 x i32>, ptr %0, align 16
+ ret <vscale x 4 x i32> %2
+}
+
+define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4i32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4i32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 4 x i32> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2i64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2i64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 2 x i64>, ptr %0, align 16
+ ret <vscale x 2 x i64> %2
+}
+
+define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2i64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2i64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 2 x i64> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 8 x half> @ld_nxv8f16(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8f16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8f16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 8 x half>, ptr %0, align 16
+ ret <vscale x 8 x half> %2
+}
+
+define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8f16:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8f16:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 8 x half> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 4 x float> @ld_nxv4f32(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4f32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4f32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 4 x float>, ptr %0, align 16
+ ret <vscale x 4 x float> %2
+}
+
+define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4f32:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4f32:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 4 x float> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 2 x double> @ld_nxv2f64(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2f64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2f64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ret
+ %2 = load <vscale x 2 x double>, ptr %0, align 16
+ ret <vscale x 2 x double> %2
+}
+
+define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2f64:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2f64:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: ret
+ store <vscale x 2 x double> %1, ptr %0, align 16
+ ret void
+}
+
+define <vscale x 16 x i8> @ld_nxv16i8_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv16i8_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv16i8_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 16 x i8>, ptr %4, align 16
+ ret <vscale x 16 x i8> %5
+}
+
+define void @st_nxv16i8_offset(ptr %0, <vscale x 16 x i8> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv16i8_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv16i8_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 16 x i8> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 8 x i16> @ld_nxv8i16_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8i16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8i16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 8 x i16>, ptr %4, align 16
+ ret <vscale x 8 x i16> %5
+}
+
+define void @st_nxv8i16_offset(ptr %0, <vscale x 8 x i16> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8i16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8i16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 8 x i16> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 4 x i32> @ld_nxv4i32_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4i32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4i32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 4 x i32>, ptr %4, align 16
+ ret <vscale x 4 x i32> %5
+}
+
+define void @st_nxv4i32_offset(ptr %0, <vscale x 4 x i32> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4i32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4i32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 4 x i32> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 2 x i64> @ld_nxv2i64_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2i64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2i64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 2 x i64>, ptr %4, align 16
+ ret <vscale x 2 x i64> %5
+}
+
+define void @st_nxv2i64_offset(ptr %0, <vscale x 2 x i64> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2i64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2i64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 2 x i64> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 8 x half> @ld_nxv8f16_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv8f16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv8f16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 8 x half>, ptr %4, align 16
+ ret <vscale x 8 x half> %5
+}
+
+define void @st_nxv8f16_offset(ptr %0, <vscale x 8 x half> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv8f16_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv8f16_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 8 x half> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 4 x float> @ld_nxv4f32_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv4f32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv4f32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 4 x float>, ptr %4, align 16
+ ret <vscale x 4 x float> %5
+}
+
+define void @st_nxv4f32_offset(ptr %0, <vscale x 4 x float> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv4f32_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv4f32_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 4 x float> %1, ptr %5, align 16
+ ret void
+}
+
+define <vscale x 2 x double> @ld_nxv2f64_offset(ptr %0) #0 {
+; CHECK-VLA-LABEL: ld_nxv2f64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: ld_nxv2f64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %2 = tail call i64 @llvm.vscale.i64()
+ %3 = shl nuw nsw i64 %2, 4
+ %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
+ %5 = load <vscale x 2 x double>, ptr %4, align 16
+ ret <vscale x 2 x double> %5
+}
+
+define void @st_nxv2f64_offset(ptr %0, <vscale x 2 x double> %1) #0 {
+; CHECK-VLA-LABEL: st_nxv2f64_offset:
+; CHECK-VLA: // %bb.0:
+; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-VLA-NEXT: ret
+;
+; CHECK-128-LABEL: st_nxv2f64_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: ret
+ %3 = tail call i64 @llvm.vscale.i64()
+ %4 = shl nuw nsw i64 %3, 4
+ %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
+ store <vscale x 2 x double> %1, ptr %5, align 16
+ ret void
+}
+
+attributes #0 = { "target-features"="+sve" }
From 4ddc20077ad9c0d4a10d75316e7ce474663efc57 Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Fri, 14 Feb 2025 05:37:32 -0800
Subject: [PATCH 2/3] [AArch64][SVE] Lower unpredicated loads/stores as LDR/STR
with sve-vector-bits=128.
Given the code below:
```cpp
svuint8_t foo(uint8_t *x) {
return svld1(svptrue_b8(), x);
}
```
When compiled with -msve-vector-bits=128 (or vscale_range(1, 1)), we
currently generate:
```gas
foo:
ptrue p0.b
ld1b { z0.b }, p0/z, [x0]
ret
```
Whereas (on little-endian) we could instead use LDR, as follows:
```gas
foo:
ldr q0, [x0]
ret
```
Besides avoiding the predicate dependency, the above form enables
further optimisations such as LDP folds. Likewise for stores.
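As a concrete illustration of the LDP benefit (a hedged sketch, not code from this patch; the function name and the use of svcntb() to form the adjacent address are assumptions made for the example), two adjacent unpredicated loads such as the ones below can now be selected as a single LDP on little-endian targets, as exercised by the ldp_stp_nxv16i8_offset test added later in this PR:
```cpp
#include <arm_sve.h>
#include <stdint.h>

// Illustrative only; compile with e.g. -O2 -msve-vector-bits=128 on a
// little-endian AArch64 target. With this patch each svld1 lowers to a
// fixed 128-bit LDR (Q-form), so the two adjacent loads are expected to
// fold into a single LDP.
svuint8_t add_adjacent(const uint8_t *x) {
  svuint8_t lo = svld1(svptrue_b8(), x);            // previously: ld1b { z0.b }, p0/z, [x0]
  svuint8_t hi = svld1(svptrue_b8(), x + svcntb()); // adjacent 16-byte chunk
  return svadd_x(svptrue_b8(), lo, hi);
}
```
With the previous lowering, both loads required a governing predicate and LD1B, which blocks the LDP fold.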
---
.../Target/AArch64/AArch64ISelLowering.cpp | 51 +++
.../AArch64/sve-unpred-loads-stores.ll | 400 ++----------------
2 files changed, 95 insertions(+), 356 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 0db6c614684d7..600225175e138 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23654,6 +23654,28 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
return DAG.getMergeValues({Extract, TokenFactor}, DL);
}
+// Replace packed scalable loads with fixed loads when vscale_range(1, 1).
+// This enables further optimisations such as LDP folds.
+static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ EVT MemVT = LD->getMemoryVT();
+ if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
+ !MemVT.isScalableVector() || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ MemVT.getSizeInBits().getKnownMinValue() != 128 ||
+ Subtarget->getMaxSVEVectorSizeInBits() != 128)
+ return SDValue();
+
+ SDLoc DL(LD);
+ MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+ MemVT.getVectorMinNumElements());
+ SDValue NewLoad = DAG.getLoad(
+ NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
+ return DAG.getMergeValues({Insert, SDValue(cast<SDNode>(NewLoad), 1)}, DL);
+}
+
// Perform TBI simplification if supported by the target and try to break up
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
// load instructions can be selected.
@@ -23691,6 +23713,9 @@ static SDValue performLOADCombine(SDNode *N,
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
return Res;
+ if (SDValue Res = combineVScale1Load(LD, DAG, DCI, Subtarget))
+ return Res;
+
if (!LD->isNonTemporal())
return SDValue(N, 0);
@@ -23949,6 +23974,29 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
return Chain;
}
+// Replace packed scalable stores with fixed stores when vscale_range(1, 1).
+static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const AArch64Subtarget *Subtarget) {
+ SDValue Value = ST->getValue();
+ EVT ValueVT = Value.getValueType();
+ if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+ !DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
+ !ValueVT.isScalableVector() || ST->isTruncatingStore() ||
+ ValueVT.getSizeInBits().getKnownMinValue() != 128 ||
+ Subtarget->getMaxSVEVectorSizeInBits() != 128)
+ return SDValue();
+
+ SDLoc DL(ST);
+ MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
+ ValueVT.getVectorMinNumElements());
+ SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
+ SDValue NewStore = DAG.getStore(
+ ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
+ ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ return NewStore;
+}
+
static unsigned getFPSubregForVT(EVT VT) {
assert(VT.isSimple() && "Expected simple VT");
switch (VT.getSimpleVT().SimpleTy) {
@@ -23997,6 +24045,9 @@ static SDValue performSTORECombine(SDNode *N,
if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
return Res;
+ if (SDValue Res = combineVScale1Store(ST, DAG, DCI, Subtarget))
+ return Res;
+
// If this is an FP_ROUND followed by a store, fold this into a truncating
// store. We can do this even if this is already a truncstore.
// We purposefully don't care about legality of the nodes here as we know
diff --git a/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
index d1b8edaf6b9dc..94e23cc2fe6ec 100644
--- a/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
@@ -1,441 +1,129 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -aarch64-sve-vector-bits-max=0 < %s | FileCheck %s --check-prefix=CHECK-VLA
-; RUN: llc -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
+; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | not grep -e ldr -e str
-target triple = "aarch64-unknown-linux-gnu"
-
-define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv16i8:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) {
; CHECK-128-LABEL: ld_nxv16i8:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
- %2 = load <vscale x 16 x i8>, ptr %0, align 16
+ %2 = load <vscale x 16 x i8>, ptr %0, align 1
ret <vscale x 16 x i8> %2
}
-define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv16i8:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) {
; CHECK-128-LABEL: st_nxv16i8:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
- store <vscale x 16 x i8> %1, ptr %0, align 16
+ store <vscale x 16 x i8> %1, ptr %0, align 1
ret void
}
-define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv8i16:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) {
; CHECK-128-LABEL: ld_nxv8i16:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
- %2 = load <vscale x 8 x i16>, ptr %0, align 16
+ %2 = load <vscale x 8 x i16>, ptr %0, align 2
ret <vscale x 8 x i16> %2
}
-define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv8i16:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) {
; CHECK-128-LABEL: st_nxv8i16:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
- store <vscale x 8 x i16> %1, ptr %0, align 16
+ store <vscale x 8 x i16> %1, ptr %0, align 2
ret void
}
-define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv4i32:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) {
; CHECK-128-LABEL: ld_nxv4i32:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
- %2 = load <vscale x 4 x i32>, ptr %0, align 16
+ %2 = load <vscale x 4 x i32>, ptr %0, align 4
ret <vscale x 4 x i32> %2
}
-define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv4i32:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) {
; CHECK-128-LABEL: st_nxv4i32:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
- store <vscale x 4 x i32> %1, ptr %0, align 16
+ store <vscale x 4 x i32> %1, ptr %0, align 4
ret void
}
-define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv2i64:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) {
; CHECK-128-LABEL: ld_nxv2i64:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
- %2 = load <vscale x 2 x i64>, ptr %0, align 16
+ %2 = load <vscale x 2 x i64>, ptr %0, align 8
ret <vscale x 2 x i64> %2
}
-define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv2i64:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) {
; CHECK-128-LABEL: st_nxv2i64:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
- store <vscale x 2 x i64> %1, ptr %0, align 16
+ store <vscale x 2 x i64> %1, ptr %0, align 8
ret void
}
-define <vscale x 8 x half> @ld_nxv8f16(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv8f16:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define <vscale x 8 x half> @ld_nxv8f16(ptr %0) {
; CHECK-128-LABEL: ld_nxv8f16:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
- %2 = load <vscale x 8 x half>, ptr %0, align 16
+ %2 = load <vscale x 8 x half>, ptr %0, align 2
ret <vscale x 8 x half> %2
}
-define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv8f16:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) {
; CHECK-128-LABEL: st_nxv8f16:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
- store <vscale x 8 x half> %1, ptr %0, align 16
+ store <vscale x 8 x half> %1, ptr %0, align 2
ret void
}
-define <vscale x 4 x float> @ld_nxv4f32(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv4f32:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define <vscale x 4 x float> @ld_nxv4f32(ptr %0) {
; CHECK-128-LABEL: ld_nxv4f32:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
- %2 = load <vscale x 4 x float>, ptr %0, align 16
+ %2 = load <vscale x 4 x float>, ptr %0, align 4
ret <vscale x 4 x float> %2
}
-define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv4f32:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) {
; CHECK-128-LABEL: st_nxv4f32:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
- store <vscale x 4 x float> %1, ptr %0, align 16
+ store <vscale x 4 x float> %1, ptr %0, align 4
ret void
}
-define <vscale x 2 x double> @ld_nxv2f64(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv2f64:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define <vscale x 2 x double> @ld_nxv2f64(ptr %0) {
; CHECK-128-LABEL: ld_nxv2f64:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
- %2 = load <vscale x 2 x double>, ptr %0, align 16
+ %2 = load <vscale x 2 x double>, ptr %0, align 8
ret <vscale x 2 x double> %2
}
-define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv2f64:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0]
-; CHECK-VLA-NEXT: ret
-;
+define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) {
; CHECK-128-LABEL: st_nxv2f64:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
- store <vscale x 2 x double> %1, ptr %0, align 16
+ store <vscale x 2 x double> %1, ptr %0, align 8
ret void
}
-
-define <vscale x 16 x i8> @ld_nxv16i8_offset(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv16i8_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: ld_nxv16i8_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %2 = tail call i64 @llvm.vscale.i64()
- %3 = shl nuw nsw i64 %2, 4
- %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
- %5 = load <vscale x 16 x i8>, ptr %4, align 16
- ret <vscale x 16 x i8> %5
-}
-
-define void @st_nxv16i8_offset(ptr %0, <vscale x 16 x i8> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv16i8_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: st_nxv16i8_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %3 = tail call i64 @llvm.vscale.i64()
- %4 = shl nuw nsw i64 %3, 4
- %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
- store <vscale x 16 x i8> %1, ptr %5, align 16
- ret void
-}
-
-define <vscale x 8 x i16> @ld_nxv8i16_offset(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv8i16_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: ld_nxv8i16_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %2 = tail call i64 @llvm.vscale.i64()
- %3 = shl nuw nsw i64 %2, 4
- %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
- %5 = load <vscale x 8 x i16>, ptr %4, align 16
- ret <vscale x 8 x i16> %5
-}
-
-define void @st_nxv8i16_offset(ptr %0, <vscale x 8 x i16> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv8i16_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: st_nxv8i16_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %3 = tail call i64 @llvm.vscale.i64()
- %4 = shl nuw nsw i64 %3, 4
- %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
- store <vscale x 8 x i16> %1, ptr %5, align 16
- ret void
-}
-
-define <vscale x 4 x i32> @ld_nxv4i32_offset(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv4i32_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: ld_nxv4i32_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %2 = tail call i64 @llvm.vscale.i64()
- %3 = shl nuw nsw i64 %2, 4
- %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
- %5 = load <vscale x 4 x i32>, ptr %4, align 16
- ret <vscale x 4 x i32> %5
-}
-
-define void @st_nxv4i32_offset(ptr %0, <vscale x 4 x i32> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv4i32_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: st_nxv4i32_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %3 = tail call i64 @llvm.vscale.i64()
- %4 = shl nuw nsw i64 %3, 4
- %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
- store <vscale x 4 x i32> %1, ptr %5, align 16
- ret void
-}
-
-define <vscale x 2 x i64> @ld_nxv2i64_offset(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv2i64_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: ld_nxv2i64_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %2 = tail call i64 @llvm.vscale.i64()
- %3 = shl nuw nsw i64 %2, 4
- %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
- %5 = load <vscale x 2 x i64>, ptr %4, align 16
- ret <vscale x 2 x i64> %5
-}
-
-define void @st_nxv2i64_offset(ptr %0, <vscale x 2 x i64> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv2i64_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: st_nxv2i64_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %3 = tail call i64 @llvm.vscale.i64()
- %4 = shl nuw nsw i64 %3, 4
- %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
- store <vscale x 2 x i64> %1, ptr %5, align 16
- ret void
-}
-
-define <vscale x 8 x half> @ld_nxv8f16_offset(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv8f16_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: ld_nxv8f16_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %2 = tail call i64 @llvm.vscale.i64()
- %3 = shl nuw nsw i64 %2, 4
- %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
- %5 = load <vscale x 8 x half>, ptr %4, align 16
- ret <vscale x 8 x half> %5
-}
-
-define void @st_nxv8f16_offset(ptr %0, <vscale x 8 x half> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv8f16_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: st_nxv8f16_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %3 = tail call i64 @llvm.vscale.i64()
- %4 = shl nuw nsw i64 %3, 4
- %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
- store <vscale x 8 x half> %1, ptr %5, align 16
- ret void
-}
-
-define <vscale x 4 x float> @ld_nxv4f32_offset(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv4f32_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: ld_nxv4f32_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %2 = tail call i64 @llvm.vscale.i64()
- %3 = shl nuw nsw i64 %2, 4
- %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
- %5 = load <vscale x 4 x float>, ptr %4, align 16
- ret <vscale x 4 x float> %5
-}
-
-define void @st_nxv4f32_offset(ptr %0, <vscale x 4 x float> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv4f32_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: st_nxv4f32_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %3 = tail call i64 @llvm.vscale.i64()
- %4 = shl nuw nsw i64 %3, 4
- %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
- store <vscale x 4 x float> %1, ptr %5, align 16
- ret void
-}
-
-define <vscale x 2 x double> @ld_nxv2f64_offset(ptr %0) #0 {
-; CHECK-VLA-LABEL: ld_nxv2f64_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: ld_nxv2f64_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %2 = tail call i64 @llvm.vscale.i64()
- %3 = shl nuw nsw i64 %2, 4
- %4 = getelementptr inbounds nuw i8, ptr %0, i64 %3
- %5 = load <vscale x 2 x double>, ptr %4, align 16
- ret <vscale x 2 x double> %5
-}
-
-define void @st_nxv2f64_offset(ptr %0, <vscale x 2 x double> %1) #0 {
-; CHECK-VLA-LABEL: st_nxv2f64_offset:
-; CHECK-VLA: // %bb.0:
-; CHECK-VLA-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-VLA-NEXT: ret
-;
-; CHECK-128-LABEL: st_nxv2f64_offset:
-; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
-; CHECK-128-NEXT: ret
- %3 = tail call i64 @llvm.vscale.i64()
- %4 = shl nuw nsw i64 %3, 4
- %5 = getelementptr inbounds nuw i8, ptr %0, i64 %4
- store <vscale x 2 x double> %1, ptr %5, align 16
- ret void
-}
-
-attributes #0 = { "target-features"="+sve" }
From 70568c98d3000ea08d354aad8abd31efd29914dd Mon Sep 17 00:00:00 2001
From: Ricardo Jesus <rjj at nvidia.com>
Date: Thu, 6 Mar 2025 08:30:33 -0800
Subject: [PATCH 3/3] Address comments and rebase patch
---
.../Target/AArch64/AArch64ISelLowering.cpp | 27 +++--
.../AArch64/sve-fixed-length-offsets.ll | 16 +--
.../AArch64/sve-unpred-loads-stores.ll | 112 +++++++++++++++++-
3 files changed, 133 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 600225175e138..c7de92f843c7e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23660,20 +23660,21 @@ static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
EVT MemVT = LD->getMemoryVT();
- if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
- !MemVT.isScalableVector() || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ if (!DCI.isBeforeLegalize() || !Subtarget->isLittleEndian() ||
+ !Subtarget->hasNEON() || !MemVT.isScalableVector() ||
+ LD->getExtensionType() != ISD::NON_EXTLOAD ||
MemVT.getSizeInBits().getKnownMinValue() != 128 ||
Subtarget->getMaxSVEVectorSizeInBits() != 128)
return SDValue();
SDLoc DL(LD);
- MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
MemVT.getVectorMinNumElements());
SDValue NewLoad = DAG.getLoad(
NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
- return DAG.getMergeValues({Insert, SDValue(cast<SDNode>(NewLoad), 1)}, DL);
+ return DAG.getMergeValues({Insert, NewLoad.getValue(1)}, DL);
}
// Perform TBI simplification if supported by the target and try to break up
@@ -23980,21 +23981,21 @@ static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
SDValue Value = ST->getValue();
EVT ValueVT = Value.getValueType();
- if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
- !DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
- !ValueVT.isScalableVector() || ST->isTruncatingStore() ||
+ if (!DCI.isBeforeLegalize() || !Subtarget->isLittleEndian() ||
+ !Subtarget->hasNEON() || !ValueVT.isScalableVector() ||
+ ST->isTruncatingStore() ||
ValueVT.getSizeInBits().getKnownMinValue() != 128 ||
Subtarget->getMaxSVEVectorSizeInBits() != 128)
return SDValue();
SDLoc DL(ST);
- MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
- ValueVT.getVectorMinNumElements());
+ EVT NewVT =
+ EVT::getVectorVT(*DAG.getContext(), ValueVT.getVectorElementType(),
+ ValueVT.getVectorMinNumElements());
SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
- SDValue NewStore = DAG.getStore(
- ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
- ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
- return NewStore;
+ return DAG.getStore(ST->getChain(), DL, NewValue, ST->getBasePtr(),
+ ST->getPointerInfo(), ST->getOriginalAlign(),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
}
static unsigned getFPSubregForVT(EVT VT) {
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll
index d7b67d73a671e..8aba77d365d6e 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-offsets.ll
@@ -17,8 +17,8 @@ define void @nxv16i8(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv16i8:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
-; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
+; CHECK-128-NEXT: ldr q0, [x0, #256]
+; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv16i8:
@@ -62,8 +62,8 @@ define void @nxv8i16(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv8i16:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
-; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
+; CHECK-128-NEXT: ldr q0, [x0, #256]
+; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv8i16:
@@ -107,8 +107,8 @@ define void @nxv4i32(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv4i32:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
-; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
+; CHECK-128-NEXT: ldr q0, [x0, #256]
+; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv4i32:
@@ -152,8 +152,8 @@ define void @nxv2i64(ptr %ldptr, ptr %stptr) {
;
; CHECK-128-LABEL: nxv2i64:
; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #16, mul vl]
-; CHECK-128-NEXT: str z0, [x1, #16, mul vl]
+; CHECK-128-NEXT: ldr q0, [x0, #256]
+; CHECK-128-NEXT: str q0, [x1, #256]
; CHECK-128-NEXT: ret
;
; CHECK-256-LABEL: nxv2i64:
diff --git a/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
index 94e23cc2fe6ec..4d6ee892c7f49 100644
--- a/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll
@@ -1,12 +1,18 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-128
-; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | not grep -e ldr -e str
+; RUN: llc -mtriple=aarch64_be-linux-gnu -mattr=+sve -aarch64-sve-vector-bits-max=128 < %s | FileCheck %s --check-prefix=CHECK-BE-128
define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) {
; CHECK-128-LABEL: ld_nxv16i8:
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ld_nxv16i8:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.b
+; CHECK-BE-128-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 16 x i8>, ptr %0, align 1
ret <vscale x 16 x i8> %2
}
@@ -16,6 +22,12 @@ define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: st_nxv16i8:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.b
+; CHECK-BE-128-NEXT: st1b { z0.b }, p0, [x0]
+; CHECK-BE-128-NEXT: ret
store <vscale x 16 x i8> %1, ptr %0, align 1
ret void
}
@@ -25,6 +37,12 @@ define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ld_nxv8i16:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.h
+; CHECK-BE-128-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 8 x i16>, ptr %0, align 2
ret <vscale x 8 x i16> %2
}
@@ -34,6 +52,12 @@ define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: st_nxv8i16:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.h
+; CHECK-BE-128-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-BE-128-NEXT: ret
store <vscale x 8 x i16> %1, ptr %0, align 2
ret void
}
@@ -43,6 +67,12 @@ define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ld_nxv4i32:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.s
+; CHECK-BE-128-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 4 x i32>, ptr %0, align 4
ret <vscale x 4 x i32> %2
}
@@ -52,6 +82,12 @@ define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: st_nxv4i32:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.s
+; CHECK-BE-128-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-BE-128-NEXT: ret
store <vscale x 4 x i32> %1, ptr %0, align 4
ret void
}
@@ -61,6 +97,12 @@ define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ld_nxv2i64:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.d
+; CHECK-BE-128-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 2 x i64>, ptr %0, align 8
ret <vscale x 2 x i64> %2
}
@@ -70,6 +112,12 @@ define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: st_nxv2i64:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.d
+; CHECK-BE-128-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-BE-128-NEXT: ret
store <vscale x 2 x i64> %1, ptr %0, align 8
ret void
}
@@ -79,6 +127,12 @@ define <vscale x 8 x half> @ld_nxv8f16(ptr %0) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ld_nxv8f16:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.h
+; CHECK-BE-128-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 8 x half>, ptr %0, align 2
ret <vscale x 8 x half> %2
}
@@ -88,6 +142,12 @@ define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: st_nxv8f16:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.h
+; CHECK-BE-128-NEXT: st1h { z0.h }, p0, [x0]
+; CHECK-BE-128-NEXT: ret
store <vscale x 8 x half> %1, ptr %0, align 2
ret void
}
@@ -97,6 +157,12 @@ define <vscale x 4 x float> @ld_nxv4f32(ptr %0) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ld_nxv4f32:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.s
+; CHECK-BE-128-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 4 x float>, ptr %0, align 4
ret <vscale x 4 x float> %2
}
@@ -106,6 +172,12 @@ define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: st_nxv4f32:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.s
+; CHECK-BE-128-NEXT: st1w { z0.s }, p0, [x0]
+; CHECK-BE-128-NEXT: ret
store <vscale x 4 x float> %1, ptr %0, align 4
ret void
}
@@ -115,6 +187,12 @@ define <vscale x 2 x double> @ld_nxv2f64(ptr %0) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: ldr q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ld_nxv2f64:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.d
+; CHECK-BE-128-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-BE-128-NEXT: ret
%2 = load <vscale x 2 x double>, ptr %0, align 8
ret <vscale x 2 x double> %2
}
@@ -124,6 +202,38 @@ define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) {
; CHECK-128: // %bb.0:
; CHECK-128-NEXT: str q0, [x0]
; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: st_nxv2f64:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.d
+; CHECK-BE-128-NEXT: st1d { z0.d }, p0, [x0]
+; CHECK-BE-128-NEXT: ret
store <vscale x 2 x double> %1, ptr %0, align 8
ret void
}
+
+; Test LDP/STP fold.
+define void @ldp_stp_nxv16i8_offset(ptr %ldptr, ptr %stptr) {
+; CHECK-128-LABEL: ldp_stp_nxv16i8_offset:
+; CHECK-128: // %bb.0:
+; CHECK-128-NEXT: ldp q0, q1, [x0, #-16]
+; CHECK-128-NEXT: stp q0, q1, [x1, #-16]
+; CHECK-128-NEXT: ret
+;
+; CHECK-BE-128-LABEL: ldp_stp_nxv16i8_offset:
+; CHECK-BE-128: // %bb.0:
+; CHECK-BE-128-NEXT: ptrue p0.b
+; CHECK-BE-128-NEXT: mov x8, #-16 // =0xfffffffffffffff0
+; CHECK-BE-128-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
+; CHECK-BE-128-NEXT: ld1b { z1.b }, p0/z, [x0]
+; CHECK-BE-128-NEXT: st1b { z0.b }, p0, [x1, x8]
+; CHECK-BE-128-NEXT: st1b { z1.b }, p0, [x1]
+; CHECK-BE-128-NEXT: ret
+ %ldptr.1 = getelementptr inbounds i8, ptr %ldptr, i64 -16
+ %ld1 = load <vscale x 16 x i8>, ptr %ldptr.1, align 1
+ %ld2 = load <vscale x 16 x i8>, ptr %ldptr, align 1
+ %stptr.1 = getelementptr inbounds i8, ptr %stptr, i64 -16
+ store <vscale x 16 x i8> %ld1, ptr %stptr.1, align 1
+ store <vscale x 16 x i8> %ld2, ptr %stptr, align 1
+ ret void
+}