[llvm] [AArch64][SVE] Select non-temporal instructions for loads/stores with the nontemporal flag (PR #171261)
Yuta Mukai via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 21:59:23 PST 2025
https://github.com/ytmukai updated https://github.com/llvm/llvm-project/pull/171261
>From 2e0c95dd702bd86473cca348ce74b7380326b098 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Fri, 5 Dec 2025 06:21:22 +0000
Subject: [PATCH 1/2] [AArch64][SVE] Select non-temporal instructions for
loads/stores with the nontemporal flag
Select SVE non-temporal load/store instructions for unpredicated
vector loads/stores with the `nontemporal` flag, for which regular
instructions were previously used.
Also, when the `nontemporal` flag is present, disable the transformation
of predicated loads/stores with an all-true mask into `ldr`/`str`, so
that the non-temporal instructions are selected instead.
Fixes #169034
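For illustration (mirroring store_nxv4i32 in the new test file below;
the function name here is illustrative), a store of a scalable vector
carrying the flag:

  define void @store_nt(<vscale x 4 x i32> %x, ptr %a) {
    store <vscale x 4 x i32> %x, ptr %a, !nontemporal !0
    ret void
  }
  !0 = !{i32 1}

was previously selected as a regular store (`str z0, [x0]`, or `st1w`
under strict alignment) and now becomes:

  ptrue p0.s
  stnt1w { z0.s }, p0, [x0]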
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 ++
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 27 ++-
llvm/test/CodeGen/AArch64/nontemporal-load.ll | 21 +--
.../CodeGen/AArch64/sve-nontemporal-ldst.ll | 164 ++++++++++++++++++
.../AArch64/sve-nontemporal-masked-ldst.ll | 23 +++
5 files changed, 233 insertions(+), 14 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 64017d7cafca3..97287c2d9e389 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -639,6 +639,12 @@ def non_temporal_load :
cast<MaskedLoadSDNode>(N)->isNonTemporal();
}]>;
+def temporal_load :
+ PatFrag<(ops node:$ptr),
+ (load node:$ptr), [{
+ return !cast<LoadSDNode>(N)->isNonTemporal();
+}]>;
+
// non-truncating masked store fragment.
def nontrunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
@@ -684,6 +690,12 @@ def non_temporal_store :
!cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
+def temporal_store :
+ PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return !cast<StoreSDNode>(N)->isNonTemporal();
+}]>;
+
multiclass masked_gather_scatter<PatFrags GatherScatterOp> {
// offsets = (signed)Index << sizeof(elt)
def NAME#_signed_scaled :
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index bfa4ce6da212b..9e00f4b944b38 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3075,6 +3075,15 @@ let Predicates = [HasSVE_or_SME] in {
(RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
}
+ defm : unpred_store<nontemporalstore, nxv16i8, STNT1B_ZRR, STNT1B_ZRI, PTRUE_B ,am_sve_regreg_lsl0>;
+ defm : unpred_store<nontemporalstore, nxv8i16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
+ defm : unpred_store<nontemporalstore, nxv4i32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
+ defm : unpred_store<nontemporalstore, nxv2i64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
+ defm : unpred_store<nontemporalstore, nxv8f16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
+ defm : unpred_store<nontemporalstore, nxv8bf16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
+ defm : unpred_store<nontemporalstore, nxv4f32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
+ defm : unpred_store<nontemporalstore, nxv2f64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
+
defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
@@ -3111,6 +3120,15 @@ let Predicates = [HasSVE_or_SME] in {
(RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
}
+ defm : unpred_load<nontemporalload, nxv16i8, LDNT1B_ZRR, LDNT1B_ZRI, PTRUE_B ,am_sve_regreg_lsl0>;
+ defm : unpred_load<nontemporalload, nxv8i16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
+ defm : unpred_load<nontemporalload, nxv4i32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
+ defm : unpred_load<nontemporalload, nxv2i64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
+ defm : unpred_load<nontemporalload, nxv8f16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
+ defm : unpred_load<nontemporalload, nxv8bf16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
+ defm : unpred_load<nontemporalload, nxv4f32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
+ defm : unpred_load<nontemporalload, nxv2f64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
+
defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
@@ -3164,18 +3182,19 @@ let Predicates = [HasSVE_or_SME] in {
}
// Allow using LDR/STR to avoid the predicate dependence.
+ // Not applied if the nontemporal flag is set.
let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
let AddedComplexity = 2 in {
- def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
+ def : Pat<(Ty (temporal_load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
(LDR_ZXI GPR64sp:$base, simm9:$offset)>;
- def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
+ def : Pat<(temporal_store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
(STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
}
- def : Pat<(Ty (load GPR64sp:$base)),
+ def : Pat<(Ty (temporal_load GPR64sp:$base)),
(LDR_ZXI GPR64sp:$base, (i64 0))>;
- def : Pat<(store Ty:$val, GPR64sp:$base),
+ def : Pat<(temporal_store Ty:$val, GPR64sp:$base),
(STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>;
}
diff --git a/llvm/test/CodeGen/AArch64/nontemporal-load.ll b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
index ffafe69b29266..ad92530eabf08 100644
--- a/llvm/test/CodeGen/AArch64/nontemporal-load.ll
+++ b/llvm/test/CodeGen/AArch64/nontemporal-load.ll
@@ -612,21 +612,22 @@ define <16 x double> @test_ldnp_v16f64(ptr %A) {
define <vscale x 20 x float> @test_ldnp_v20f32_vscale(ptr %A) {
; CHECK-LABEL: test_ldnp_v20f32_vscale:
; CHECK: ; %bb.0:
-; CHECK-NEXT: ldr z0, [x0]
-; CHECK-NEXT: ldr z1, [x0, #1, mul vl]
-; CHECK-NEXT: ldr z2, [x0, #2, mul vl]
-; CHECK-NEXT: ldr z3, [x0, #3, mul vl]
-; CHECK-NEXT: ldr z4, [x0, #4, mul vl]
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
; CHECK-NEXT: ret
;
; CHECK-BE-LABEL: test_ldnp_v20f32_vscale:
; CHECK-BE: // %bb.0:
; CHECK-BE-NEXT: ptrue p0.s
-; CHECK-BE-NEXT: ld1w { z0.s }, p0/z, [x0]
-; CHECK-BE-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
-; CHECK-BE-NEXT: ld1w { z2.s }, p0/z, [x0, #2, mul vl]
-; CHECK-BE-NEXT: ld1w { z3.s }, p0/z, [x0, #3, mul vl]
-; CHECK-BE-NEXT: ld1w { z4.s }, p0/z, [x0, #4, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-BE-NEXT: ldnt1w { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z2.s }, p0/z, [x0, #2, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z3.s }, p0/z, [x0, #3, mul vl]
+; CHECK-BE-NEXT: ldnt1w { z4.s }, p0/z, [x0, #4, mul vl]
; CHECK-BE-NEXT: ret
  %lv = load <vscale x 20 x float>, ptr %A, align 8, !nontemporal !0
ret <vscale x 20 x float> %lv
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
new file mode 100644
index 0000000000000..e631d7cbe711d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 16 x i8> @load_nxv16i8(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 16 x i8>, ptr %a, !nontemporal !0
+ ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 8 x i16> @load_nxv8i16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 8 x i16>, ptr %a, !nontemporal !0
+ ret <vscale x 8 x i16> %load
+}
+
+define <vscale x 4 x i32> @load_nxv4i32(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 4 x i32>, ptr %a, !nontemporal !0
+ ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 2 x i64> @load_nxv2i64(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 2 x i64>, ptr %a, !nontemporal !0
+ ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 8 x half> @load_nxv8f16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 8 x half>, ptr %a, !nontemporal !0
+ ret <vscale x 8 x half> %load
+}
+
+define <vscale x 8 x bfloat> @load_nxv8bf16(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 8 x bfloat>, ptr %a, !nontemporal !0
+ ret <vscale x 8 x bfloat> %load
+}
+
+define <vscale x 4 x float> @load_nxv4f32(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 4 x float>, ptr %a, !nontemporal !0
+ ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x double> @load_nxv2f64(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %load = load <vscale x 2 x double>, ptr %a, !nontemporal !0
+ ret <vscale x 2 x double> %load
+}
+
+define void @store_nxv16i8(<vscale x 16 x i8> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: stnt1b { z0.b }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 16 x i8> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv8i16(<vscale x 8 x i16> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 8 x i16> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 4 x i32> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv2i64(<vscale x 2 x i64> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 2 x i64> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv8f16(<vscale x 8 x half> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 8 x half> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv8bf16(<vscale x 8 x bfloat> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: stnt1h { z0.h }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 8 x bfloat> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv4f32(<vscale x 4 x float> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 4 x float> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv2f64(<vscale x 2 x double> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0]
+; CHECK-NEXT: ret
+ store <vscale x 2 x double> %x, ptr %a, !nontemporal !0
+ ret void
+}
+
+!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
index 36df5e5deadfc..bb016f840411a 100644
--- a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
@@ -66,9 +66,32 @@ define void @masked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a, <vscale x 4 x i
ret void
}
+define <vscale x 4 x i32> @unmasked_load_nxv4i32(ptr %a) nounwind {
+; CHECK-LABEL: unmasked_load_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison), !nontemporal !0
+ ret <vscale x 4 x i32> %load
+}
+
+define void @unmasked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: unmasked_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
+; CHECK-NEXT: ret
+ %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+ call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> %mask), !nontemporal !0
+ ret void
+}
+
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
!0 = !{i32 1}
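The fixup commit that follows drops the temporal_load/temporal_store
PatFrags and instead gives the non-temporal patterns priority through a
new AddedComplexity parameter on the unpred_load/unpred_store
multiclasses. A reduced sketch of the load side (the unindexed pattern
is handled the same way and is elided here; see the full change in the
diff below). The non-temporal instantiations pass 1, so their reg+imm
patterns reach complexity 3 and outrank the LDR_ZXI pattern at
complexity 2, while the regular LD1 instantiations keep the previous
values:

  multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
                         Instruction RegImmInst, Instruction PTrue,
                         ComplexPattern AddrCP, int AddedComplexity = 0> {
    // reg+reg form: complexity 1 for regular loads, 2 for non-temporal.
    let AddedComplexity = !add(1, AddedComplexity) in
    def _reg : Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
                   (RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
    // reg+imm form: complexity 2 for regular loads, 3 for non-temporal,
    // which is what lets LDNT1* win over LDR_ZXI (complexity 2).
    let AddedComplexity = !add(2, AddedComplexity) in
    def _imm : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
                   (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
  }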
>From d44043d6eca800c4850278d5e55ebfa592244283 Mon Sep 17 00:00:00 2001
From: Yuta Mukai <mukai.yuta at fujitsu.com>
Date: Wed, 10 Dec 2025 05:41:04 +0000
Subject: [PATCH 2/2] fixup! [AArch64][SVE] Select non-temporal instructions
for loads/stores with the nontemporal flag
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 ---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 69 ++++++++-------
.../CodeGen/AArch64/sve-nontemporal-ldst.ll | 87 +++++++++++++++++++
.../AArch64/sve-nontemporal-masked-ldst.ll | 15 ++--
4 files changed, 129 insertions(+), 54 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 97287c2d9e389..64017d7cafca3 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -639,12 +639,6 @@ def non_temporal_load :
cast<MaskedLoadSDNode>(N)->isNonTemporal();
}]>;
-def temporal_load :
- PatFrag<(ops node:$ptr),
- (load node:$ptr), [{
- return !cast<LoadSDNode>(N)->isNonTemporal();
-}]>;
-
// non-truncating masked store fragment.
def nontrunc_masked_store :
PatFrag<(ops node:$val, node:$ptr, node:$pred),
@@ -690,12 +684,6 @@ def non_temporal_store :
!cast<MaskedStoreSDNode>(N)->isCompressingStore();
}]>;
-def temporal_store :
- PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return !cast<StoreSDNode>(N)->isNonTemporal();
-}]>;
-
multiclass masked_gather_scatter<PatFrags GatherScatterOp> {
// offsets = (signed)Index << sizeof(elt)
def NAME#_signed_scaled :
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 9e00f4b944b38..657709ef0461d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3061,29 +3061,22 @@ let Predicates = [HasSVE_or_SME] in {
multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegRegInst,
Instruction RegImmInst, Instruction PTrue,
- ComplexPattern AddrCP> {
- let AddedComplexity = 1 in {
+ ComplexPattern AddrCP, int AddedComplexity = 0> {
+ let AddedComplexity = !add(1, AddedComplexity) in {
def _reg : Pat<(Store Ty:$val, (AddrCP GPR64sp:$base, GPR64:$offset)),
(RegRegInst ZPR:$val, (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}
- let AddedComplexity = 2 in {
+ let AddedComplexity = !add(2, AddedComplexity) in {
def _imm : Pat<(Store Ty:$val, (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
(RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
- def : Pat<(Store Ty:$val, GPR64:$base),
- (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
+ let AddedComplexity = AddedComplexity in {
+ def : Pat<(Store Ty:$val, GPR64:$base),
+ (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
+ }
}
- defm : unpred_store<nontemporalstore, nxv16i8, STNT1B_ZRR, STNT1B_ZRI, PTRUE_B ,am_sve_regreg_lsl0>;
- defm : unpred_store<nontemporalstore, nxv8i16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
- defm : unpred_store<nontemporalstore, nxv4i32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
- defm : unpred_store<nontemporalstore, nxv2i64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
- defm : unpred_store<nontemporalstore, nxv8f16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
- defm : unpred_store<nontemporalstore, nxv8bf16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
- defm : unpred_store<nontemporalstore, nxv4f32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
- defm : unpred_store<nontemporalstore, nxv2f64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
-
defm : unpred_store< store, nxv16i8, ST1B, ST1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H, ST1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S, ST1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
@@ -3104,31 +3097,33 @@ let Predicates = [HasSVE_or_SME] in {
defm : unpred_store< store, nxv2f32, ST1W_D, ST1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_store< store, nxv2f64, ST1D, ST1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_store<nontemporalstore, nxv16i8, STNT1B_ZRR, STNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0, 1>;
+ defm : unpred_store<nontemporalstore, nxv8i16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1, 1>;
+ defm : unpred_store<nontemporalstore, nxv4i32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2, 1>;
+ defm : unpred_store<nontemporalstore, nxv2i64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3, 1>;
+ defm : unpred_store<nontemporalstore, nxv8f16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1, 1>;
+ defm : unpred_store<nontemporalstore, nxv8bf16, STNT1H_ZRR, STNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1, 1>;
+ defm : unpred_store<nontemporalstore, nxv4f32, STNT1W_ZRR, STNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2, 1>;
+ defm : unpred_store<nontemporalstore, nxv2f64, STNT1D_ZRR, STNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3, 1>;
+
multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegRegInst,
Instruction RegImmInst, Instruction PTrue,
- ComplexPattern AddrCP> {
- let AddedComplexity = 1 in {
+ ComplexPattern AddrCP, int AddedComplexity = 0> {
+ let AddedComplexity = !add(1, AddedComplexity) in {
def _reg: Pat<(Ty (Load (AddrCP GPR64sp:$base, GPR64:$offset))),
(RegRegInst (PTrue 31), GPR64sp:$base, GPR64:$offset)>;
}
- let AddedComplexity = 2 in {
+ let AddedComplexity = !add(2, AddedComplexity) in {
def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
(RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
}
- def : Pat<(Ty (Load GPR64:$base)),
- (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
+ let AddedComplexity = AddedComplexity in {
+ def : Pat<(Ty (Load GPR64:$base)),
+ (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
+ }
}
- defm : unpred_load<nontemporalload, nxv16i8, LDNT1B_ZRR, LDNT1B_ZRI, PTRUE_B ,am_sve_regreg_lsl0>;
- defm : unpred_load<nontemporalload, nxv8i16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
- defm : unpred_load<nontemporalload, nxv4i32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
- defm : unpred_load<nontemporalload, nxv2i64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
- defm : unpred_load<nontemporalload, nxv8f16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
- defm : unpred_load<nontemporalload, nxv8bf16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H ,am_sve_regreg_lsl1>;
- defm : unpred_load<nontemporalload, nxv4f32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S ,am_sve_regreg_lsl2>;
- defm : unpred_load<nontemporalload, nxv2f64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D ,am_sve_regreg_lsl3>;
-
defm : unpred_load< load, nxv16i8, LD1B, LD1B_IMM, PTRUE_B, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H, LD1B_H_IMM, PTRUE_H, am_sve_regreg_lsl0>;
defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S, LD1B_S_IMM, PTRUE_S, am_sve_regreg_lsl0>;
@@ -3161,6 +3156,15 @@ let Predicates = [HasSVE_or_SME] in {
defm : unpred_load< load, nxv2f32, LD1W_D, LD1W_D_IMM, PTRUE_D, am_sve_regreg_lsl2>;
defm : unpred_load< load, nxv2f64, LD1D, LD1D_IMM, PTRUE_D, am_sve_regreg_lsl3>;
+ defm : unpred_load<nontemporalload, nxv16i8, LDNT1B_ZRR, LDNT1B_ZRI, PTRUE_B, am_sve_regreg_lsl0, 1>;
+ defm : unpred_load<nontemporalload, nxv8i16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1, 1>;
+ defm : unpred_load<nontemporalload, nxv4i32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2, 1>;
+ defm : unpred_load<nontemporalload, nxv2i64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3, 1>;
+ defm : unpred_load<nontemporalload, nxv8f16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1, 1>;
+ defm : unpred_load<nontemporalload, nxv8bf16, LDNT1H_ZRR, LDNT1H_ZRI, PTRUE_H, am_sve_regreg_lsl1, 1>;
+ defm : unpred_load<nontemporalload, nxv4f32, LDNT1W_ZRR, LDNT1W_ZRI, PTRUE_S, am_sve_regreg_lsl2, 1>;
+ defm : unpred_load<nontemporalload, nxv2f64, LDNT1D_ZRR, LDNT1D_ZRI, PTRUE_D, am_sve_regreg_lsl3, 1>;
+
let Predicates = [HasSVE_or_SME, IsLE] in {
// Allow using the reg+reg form of ld1b/st1b for memory accesses with the
// same width as nxv16i8. This saves an add in cases where we would
@@ -3182,19 +3186,18 @@ let Predicates = [HasSVE_or_SME] in {
}
// Allow using LDR/STR to avoid the predicate dependence.
- // Not applied if the nontemporal flag is set.
let Predicates = [HasSVE_or_SME, IsLE, AllowMisalignedMemAccesses] in
foreach Ty = [ nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv4f32, nxv2f64, nxv8bf16 ] in {
let AddedComplexity = 2 in {
- def : Pat<(Ty (temporal_load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
+ def : Pat<(Ty (load (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset))),
(LDR_ZXI GPR64sp:$base, simm9:$offset)>;
- def : Pat<(temporal_store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
+ def : Pat<(store Ty:$val, (am_sve_indexed_s9 GPR64sp:$base, simm9:$offset)),
(STR_ZXI ZPR:$val, GPR64sp:$base, simm9:$offset)>;
}
- def : Pat<(Ty (temporal_load GPR64sp:$base)),
+ def : Pat<(Ty (load GPR64sp:$base)),
(LDR_ZXI GPR64sp:$base, (i64 0))>;
- def : Pat<(temporal_store Ty:$val, GPR64sp:$base),
+ def : Pat<(store Ty:$val, GPR64sp:$base),
(STR_ZXI ZPR:$val, GPR64sp:$base, (i64 0))>;
}
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
index e631d7cbe711d..962e551e4799f 100644
--- a/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-ldst.ll
@@ -81,6 +81,50 @@ define <vscale x 2 x double> @load_nxv2f64(ptr %a) nounwind {
ret <vscale x 2 x double> %load
}
+define <vscale x 16 x i8> @load_nxv16i8_reg(ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: load_nxv16i8_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr i8, ptr %a, i64 %off
+ %load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
+ ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 16 x i8> @load_nxv16i8_imm(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv16i8_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
+ %load = load <vscale x 16 x i8>, ptr %ptr, !nontemporal !0
+ ret <vscale x 16 x i8> %load
+}
+
+define <vscale x 2 x double> @load_nxv2f64_reg(ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: load_nxv2f64_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %ptr = getelementptr double, ptr %a, i64 %off
+ %load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
+ ret <vscale x 2 x double> %load
+}
+
+define <vscale x 2 x double> @load_nxv2f64_imm(ptr %a) nounwind {
+; CHECK-LABEL: load_nxv2f64_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
+ %load = load <vscale x 2 x double>, ptr %ptr, !nontemporal !0
+ ret <vscale x 2 x double> %load
+}
+
define void @store_nxv16i8(<vscale x 16 x i8> %x, ptr %a) nounwind {
; CHECK-LABEL: store_nxv16i8:
; CHECK: // %bb.0:
@@ -161,4 +205,47 @@ define void @store_nxv2f64(<vscale x 2 x double> %x, ptr %a) nounwind {
ret void
}
+define void @store_nxv16i8_reg(<vscale x 16 x i8> %x, ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: store_nxv16i8_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr i8, ptr %a, i64 %off
+ store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv16i8_imm(<vscale x 16 x i8> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv16i8_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: stnt1b { z0.b }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %ptr = getelementptr <vscale x 16 x i8>, ptr %a, i64 1
+ store <vscale x 16 x i8> %x, ptr %ptr, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv2f64_reg(<vscale x 2 x double> %x, ptr %a, i64 %off) nounwind {
+; CHECK-LABEL: store_nxv2f64_reg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, x1, lsl #3]
+; CHECK-NEXT: ret
+ %ptr = getelementptr double, ptr %a, i64 %off
+ store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
+ ret void
+}
+
+define void @store_nxv2f64_imm(<vscale x 2 x double> %x, ptr %a) nounwind {
+; CHECK-LABEL: store_nxv2f64_imm:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: stnt1d { z0.d }, p0, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %ptr = getelementptr <vscale x 2 x double>, ptr %a, i64 1
+ store <vscale x 2 x double> %x, ptr %ptr, !nontemporal !0
+ ret void
+}
!0 = !{i32 1}
diff --git a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
index bb016f840411a..f097d874cd11b 100644
--- a/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
+++ b/llvm/test/CodeGen/AArch64/sve-nontemporal-masked-ldst.ll
@@ -66,25 +66,23 @@ define void @masked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a, <vscale x 4 x i
ret void
}
-define <vscale x 4 x i32> @unmasked_load_nxv4i32(ptr %a) nounwind {
-; CHECK-LABEL: unmasked_load_nxv4i32:
+define <vscale x 4 x i32> @all_active_load_nxv4i32(ptr %a) nounwind {
+; CHECK-LABEL: all_active_load_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
- %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
- %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison), !nontemporal !0
+ %load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> poison), !nontemporal !0
ret <vscale x 4 x i32> %load
}
-define void @unmasked_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
-; CHECK-LABEL: unmasked_store_nxv4i32:
+define void @all_active_store_nxv4i32(<vscale x 4 x i32> %x, ptr %a) nounwind {
+; CHECK-LABEL: all_active_store_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: stnt1w { z0.s }, p0, [x0]
; CHECK-NEXT: ret
- %mask = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
- call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> %mask), !nontemporal !0
+ call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %x, ptr %a, i32 1, <vscale x 4 x i1> splat (i1 true)), !nontemporal !0
ret void
}
@@ -92,6 +90,5 @@ declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32(ptr, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
-declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
!0 = !{i32 1}
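With both commits applied, the expected selection for the same access
with and without the flag looks as follows (a sketch, assuming
little-endian and that unaligned accesses are permitted, which is the
default and is what enables the LDR/STR forms; function names are
illustrative, not from the tests):

  define <vscale x 4 x i32> @plain_load(ptr %p) {
    ; expected: ldr z0, [x0]
    %v = load <vscale x 4 x i32>, ptr %p
    ret <vscale x 4 x i32> %v
  }

  define <vscale x 4 x i32> @nontemporal_load(ptr %p) {
    ; expected: ptrue p0.s
    ;           ldnt1w { z0.s }, p0/z, [x0]
    %v = load <vscale x 4 x i32>, ptr %p, !nontemporal !0
    ret <vscale x 4 x i32> %v
  }

  !0 = !{i32 1}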