[llvm] 6d877e6 - [AArch64][SVE][CodeGen] Prefer ld1r* over indexed-load when consumed by a splat
Peter Waller via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 21 06:24:05 PST 2022
Author: Peter Waller
Date: 2022-12-21T14:23:39Z
New Revision: 6d877e6717ee648ba9371445aa3c9e304105dfa2
URL: https://github.com/llvm/llvm-project/commit/6d877e6717ee648ba9371445aa3c9e304105dfa2
DIFF: https://github.com/llvm/llvm-project/commit/6d877e6717ee648ba9371445aa3c9e304105dfa2.diff
LOG: [AArch64][SVE][CodeGen] Prefer ld1r* over indexed-load when consumed by a splat
If a load's only (non-chain) use is a scalable-vector splat, don't consider indexed addressing for it; a replicating load (ld1r*) is preferable.
This is an alternative implementation to D138581.
Depends on D139637.
Differential Revision: https://reviews.llvm.org/D139850
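For illustration, the codegen difference this aims for, distilled from the tests
added below (a sketch of the expected before/after output, not extra output from
the patch; predicate setup via ptrue omitted):

    Before: pre-indexed scalar load, then a broadcast into the vector register
        ldrsb  x8, [x0, #1]!
        mov    z0.d, x8

    After: replicating load, with the pointer update kept as a separate add
        add    x8, x0, #1
        ld1rsb { z0.d }, p0/z, [x0, #1]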
Added:
Modified:
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/sve-ld1r.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4ee84070b95e..faee3f8c2cb8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21530,14 +21530,38 @@ bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
return CI->isTailCall();
}
-bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
- SDValue &Offset,
- ISD::MemIndexedMode &AM,
- bool &IsInc,
- SelectionDAG &DAG) const {
+bool AArch64TargetLowering::getIndexedAddressParts(
+ SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+ ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const {
if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
return false;
+ // Non-null if there is exactly one user of the loaded value (ignoring chain).
+ SDNode *ValOnlyUser = nullptr;
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
+ ++UI) {
+ if (UI.getUse().getResNo() == 1)
+ continue; // Ignore chain.
+ if (ValOnlyUser == nullptr)
+ ValOnlyUser = *UI;
+ else {
+ ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
+ break;
+ }
+ }
+
+ auto IsUndefOrZero = [](SDValue V) {
+ return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
+ };
+
+ // If the only user of the value is a scalable vector splat, it is
+ // preferable to do a replicating load (ld1r*).
+ if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
+ (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
+ (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
+ IsUndefOrZero(ValOnlyUser->getOperand(2)))))
+ return false;
+
Base = Op->getOperand(0);
// All of the indexed addressing mode instructions take a signed
// 9 bit immediate offset.
@@ -21570,7 +21594,7 @@ bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
return false;
bool IsInc;
- if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
+ if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
return false;
AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
return true;
@@ -21591,7 +21615,7 @@ bool AArch64TargetLowering::getPostIndexedAddressParts(
return false;
bool IsInc;
- if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+ if (!getIndexedAddressParts(N, Op, Base, Offset, AM, IsInc, DAG))
return false;
// Post-indexing updates the base, so it's not a valid transform
// if that's not the same as the load's pointer.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 8547fd4e968e..1eac0afea395 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1159,9 +1159,9 @@ class AArch64TargetLowering : public TargetLowering {
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
- bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
- ISD::MemIndexedMode &AM, bool &IsInc,
- SelectionDAG &DAG) const;
+ bool getIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+ SDValue &Offset, ISD::MemIndexedMode &AM,
+ bool &IsInc, SelectionDAG &DAG) const;
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const override;
diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
index d7a920960156..632641e7042d 100644
--- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll
@@ -1186,6 +1186,102 @@ define <vscale x 2 x double> @negtest_dup_ld1rd_double_passthru_nxv2f64(<vscale
ret <vscale x 2 x double> %res
}
+
+; Check that a load consumed by a scalable splat prefers a replicating load.
+define i8* @avoid_preindex_load(i8* %src, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: avoid_preindex_load:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+ %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; Check that a load consumed by a scalable splat prefers a replicating
+; load over a pre-indexed load.
+define i8* @avoid_preindex_load_dup(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: avoid_preindex_load_dup:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; Same as avoid_preindex_load_dup, but with zero passthru.
+define i8* @avoid_preindex_load_dup_passthru_zero(i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero:
+; CHECK: // %bb.0:
+; CHECK-NEXT: add x8, x0, #1
+; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 %ext)
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; If a dup has a passthru that is neither undef nor zero, keep the pre-indexed load.
+define i8* @preindex_load_dup_passthru(<vscale x 2 x i64> %passthru, i8* %src, <vscale x 2 x i1> %pg, <vscale x 2 x i64>* %out) {
+; CHECK-LABEL: preindex_load_dup_passthru:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsb x8, [x0, #1]!
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: mov z0.d, p0/m, x8
+; CHECK-NEXT: st1d { z0.d }, p1, [x1]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %passthru, <vscale x 2 x i1> %pg, i64 %ext)
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ ret i8* %ptr
+}
+
+; Show that a second user of the loaded value defeats the replicating-load
+; check, so the pre-indexed load is still formed.
+define i8* @preidx8sext64_instead_of_ld1r(i8* %src, <vscale x 2 x i64>* %out, i64* %dst) {
+; CHECK-LABEL: preidx8sext64_instead_of_ld1r:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsb x8, [x0, #1]!
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z0.d, x8
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: str x8, [x2]
+; CHECK-NEXT: ret
+ %ptr = getelementptr inbounds i8, i8* %src, i64 1
+ %tmp = load i8, i8* %ptr, align 4
+ %ext = sext i8 %tmp to i64
+ %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+ %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+ store <vscale x 2 x i64> %dup, <vscale x 2 x i64>* %out
+ store i64 %ext, i64* %dst
+ ret i8* %ptr
+}
+
+
declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)