[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines (PR #142739)
Fabian Ritter via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jun 23 06:37:15 PDT 2025
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/142739
>From f8aa9d23e4a70e3155d19b0ad0e209b47866aab9 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Wed, 4 Jun 2025 03:32:32 -0400
Subject: [PATCH 1/7] [AMDGPU][SDAG] Add ISD::PTRADD DAG combines
This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one
that is closely connected.
The generic DAG combine is based on a part of PR #105669 by @rgwott, which was
adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello
LLVM tree. I added some parts and removed several disjuncts from the
reassociation condition:
- `isNullConstant(X)`, since there are address spaces where 0 is a perfectly
normal value that shouldn't be treated specially,
- `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since
they cause regressions in AMDGPU.
For SWDEV-516125.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 92 ++++++++-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 49 +++++
llvm/lib/Target/AMDGPU/SIISelLowering.h | 1 +
.../AMDGPU/ptradd-sdag-optimizations.ll | 194 ++++++------------
4 files changed, 201 insertions(+), 135 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 56a5643e13442..b203431232035 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -421,6 +421,7 @@ namespace {
SDValue visitADDLike(SDNode *N);
SDValue visitADDLikeCommutative(SDValue N0, SDValue N1,
SDNode *LocReference);
+ SDValue visitPTRADD(SDNode *N);
SDValue visitSUB(SDNode *N);
SDValue visitADDSAT(SDNode *N);
SDValue visitSUBSAT(SDNode *N);
@@ -1140,7 +1141,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
return true;
}
- if (Opc != ISD::ADD)
+ if (Opc != ISD::ADD && Opc != ISD::PTRADD)
return false;
auto *C2 = dyn_cast<ConstantSDNode>(N1);
@@ -1860,6 +1861,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::TokenFactor: return visitTokenFactor(N);
case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
case ISD::ADD: return visitADD(N);
+ case ISD::PTRADD: return visitPTRADD(N);
case ISD::SUB: return visitSUB(N);
case ISD::SADDSAT:
case ISD::UADDSAT: return visitADDSAT(N);
@@ -2630,6 +2632,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) {
return SDValue();
}
+/// Try to fold a pointer arithmetic node.
+/// This needs to be done separately from normal addition, because pointer
+/// addition is not commutative.
+SDValue DAGCombiner::visitPTRADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT PtrVT = N0.getValueType();
+ EVT IntVT = N1.getValueType();
+ SDLoc DL(N);
+
+ // This is already ensured by an assert in SelectionDAG::getNode(). Several
+ // combines here depend on this assumption.
+ assert(PtrVT == IntVT &&
+ "PTRADD with different operand types is not supported");
+
+ // fold (ptradd undef, y) -> undef
+ if (N0.isUndef())
+ return N0;
+
+ // fold (ptradd x, undef) -> undef
+ if (N1.isUndef())
+ return DAG.getUNDEF(PtrVT);
+
+ // fold (ptradd x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+
+ // fold (ptradd 0, x) -> x
+ if (isNullConstant(N0))
+ return N1;
+
+ if (N0.getOpcode() == ISD::PTRADD &&
+ !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) {
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+ SDValue Z = N1;
+ bool N0OneUse = N0.hasOneUse();
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ // (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
+ // * y is a constant and (ptradd x, y) has one use; or
+ // * y and z are both constants.
+ if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
+ SDNodeFlags Flags;
+ // If both additions in the original were NUW, the new ones are as well.
+ if (N->getFlags().hasNoUnsignedWrap() &&
+ N0->getFlags().hasNoUnsignedWrap())
+ Flags |= SDNodeFlags::NoUnsignedWrap;
+ SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
+ AddToWorklist(Add.getNode());
+ return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
+ }
+
+ // TODO: There is another possible fold here that was proven useful.
+ // It would be this:
+ //
+ // (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
+ // * (ptradd x, y) has one use; and
+ // * y is a constant; and
+ // * z is not a constant.
+ //
+ // In some cases, specifically in AArch64's FEAT_CPA, it exposes the
+ // opportunity to select more complex instructions such as SUBPT and
+ // MSUBPT. However, a hypothetical corner case has been found that we could
+ // not avoid. Consider this (pseudo-POSIX C):
+ //
+ // char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
+ // char *p = mmap(LARGE_CONSTANT);
+ // char *q = foo(p, -LARGE_CONSTANT);
+ //
+ // Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
+ // further + z takes it back to the start of the mapping, so valid,
+ // regardless of the address mmap gave back. However, if mmap gives you an
+ // address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
+ // borrow from the high bits (with the subsequent + z carrying back into
+ // the high bits to give you a well-defined pointer) and thus trip
+ // FEAT_CPA's pointer corruption checks.
+ //
+ // We leave this fold as an opportunity for future work, addressing the
+ // corner case for FEAT_CPA, as well as reconciling the solution with the
+ // more general application of pointer arithmetic in other future targets.
+ }
+
+ return SDValue();
+}
+
/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
/// a shift and add with a different constant.
static SDValue foldAddSubOfSignBit(SDNode *N, const SDLoc &DL,
@@ -15053,6 +15142,7 @@ SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
default:
break;
case ISD::ADD:
+ case ISD::PTRADD:
case ISD::SUB: {
unsigned AlignShift = Log2(AL);
SDValue LHS = N0.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 17c7fb7bb1533..9c84104da4ce7 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -945,6 +945,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
setTargetDAGCombine({ISD::ADD,
+ ISD::PTRADD,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
@@ -15095,6 +15096,52 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (N1.getOpcode() == ISD::ADD) {
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+ // y is not, and (add y, z) is used only once.
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+ // z is not, and (add y, z) is used only once.
+ // The goal is to move constant offsets to the outermost ptradd, to create
+ // more opportunities to fold offsets into memory instructions.
+ // Together with the generic combines in DAGCombiner.cpp, this also
+ // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+ //
+ // This transform is here instead of in the general DAGCombiner as it can
+ // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+ // AArch64's CPA.
+ SDValue X = N0;
+ SDValue Y = N1.getOperand(0);
+ SDValue Z = N1.getOperand(1);
+ bool N1OneUse = N1.hasOneUse();
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+ if ((ZIsConstant != YIsConstant) && N1OneUse) {
+ SDNodeFlags Flags;
+ // If both additions in the original were NUW, the new ones are as well.
+ if (N->getFlags().hasNoUnsignedWrap() &&
+ N1->getFlags().hasNoUnsignedWrap())
+ Flags |= SDNodeFlags::NoUnsignedWrap;
+
+ if (YIsConstant)
+ std::swap(Y, Z);
+
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ }
+ }
+
+ return SDValue();
+}
+
SDValue SITargetLowering::performSubCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -15633,6 +15680,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
case ISD::ADD:
return performAddCombine(N, DCI);
+ case ISD::PTRADD:
+ return performPtrAddCombine(N, DCI);
case ISD::SUB:
return performSubCombine(N, DCI);
case ISD::UADDO_CARRY:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 89fb12b52c3e6..85d3533061755 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -220,6 +220,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
DAGCombinerInfo &DCI) const;
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 4a5fa641da038..b78dea1684545 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -8,24 +8,14 @@
; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
-; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 24
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: global_load_ZTwoUses:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
%l = load i64, ptr addrspace(1) %gep1, align 8
@@ -37,9 +27,8 @@ define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 24
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
@@ -60,69 +49,36 @@ define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
; would be folded away in most cases, but the index computation introduced by
; the legalization of wide vector stores can for example introduce them.
define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
-; GFX942_PTRADD-LABEL: store_v16i32:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v4, 0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_add_u32 s2, s0, 32
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s20
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s21
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s22
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s23
-; GFX942_PTRADD-NEXT: s_addc_u32 s3, s1, 0
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s16
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s17
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s18
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s19
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s12
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s13
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s14
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s15
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s8
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s9
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s10
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s11
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: store_v16i32:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v4, 0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s20
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s21
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s22
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s23
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s16
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s17
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s18
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s19
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s12
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s13
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s14
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s15
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s8
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s9
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s11
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: store_v16i32:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, s20
+; GFX942-NEXT: v_mov_b32_e32 v1, s21
+; GFX942-NEXT: v_mov_b32_e32 v2, s22
+; GFX942-NEXT: v_mov_b32_e32 v3, s23
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s12
+; GFX942-NEXT: v_mov_b32_e32 v1, s13
+; GFX942-NEXT: v_mov_b32_e32 v2, s14
+; GFX942-NEXT: v_mov_b32_e32 v3, s15
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: v_mov_b32_e32 v1, s9
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
store <16 x i32> %a, ptr addrspace(1) %out
ret void
@@ -131,20 +87,12 @@ entry:
; Tests the (ptradd 0, x) -> x DAG combine.
define void @baseptr_null(i64 %offset, i8 %v) {
-; GFX942_PTRADD-LABEL: baseptr_null:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], 0, 0, v[0:1]
-; GFX942_PTRADD-NEXT: flat_store_byte v[0:1], v2
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: baseptr_null:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: flat_store_byte v[0:1], v2
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: baseptr_null:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: flat_store_byte v[0:1], v2
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr null, i64 %offset
store i8 %v, ptr %gep, align 1
ret void
@@ -153,40 +101,21 @@ define void @baseptr_null(i64 %offset, i8 %v) {
; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
; assertalign DAG combine.
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
-; GFX942_PTRADD-LABEL: llvm_amdgcn_queue_ptr:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_add_u32 s8, s4, 8
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 0
-; GFX942_PTRADD-NEXT: s_addc_u32 s9, s5, 0
-; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
-; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[8:9] sc0 sc1
-; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
-; GFX942_PTRADD-NEXT: ; kill: killed $sgpr0_sgpr1
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942_PTRADD-NEXT: ; kill: killed $sgpr8 killed $sgpr9
-; GFX942_PTRADD-NEXT: ; kill: killed $sgpr2_sgpr3
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: llvm_amdgcn_queue_ptr:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 0
-; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
-; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
-; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
-; GFX942_LEGACY-NEXT: ; kill: killed $sgpr0_sgpr1
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942_LEGACY-NEXT: ; kill: killed $sgpr2_sgpr3
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: llvm_amdgcn_queue_ptr:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
+; GFX942-NEXT: global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
+; GFX942-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
+; GFX942-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: ; kill: killed $sgpr2_sgpr3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_endpgm
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -197,6 +126,3 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
store volatile i64 %dispatch.id, ptr addrspace(1) %ptr
ret void
}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX942: {{.*}}
>From 255880e9c3abf2885b06d671164042afd20dbee7 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Thu, 5 Jun 2025 04:49:51 -0400
Subject: [PATCH 2/7] Remove undef/poison operand handling from the PTRADD dag
combine
Those are folded earlier already.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 --------
1 file changed, 8 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b203431232035..1ce01e0da5951 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2647,14 +2647,6 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
assert(PtrVT == IntVT &&
"PTRADD with different operand types is not supported");
- // fold (ptradd undef, y) -> undef
- if (N0.isUndef())
- return N0;
-
- // fold (ptradd x, undef) -> undef
- if (N1.isUndef())
- return DAG.getUNDEF(PtrVT);
-
// fold (ptradd x, 0) -> x
if (isNullConstant(N1))
return N0;
>From b905ddf366ea846f7a363f40c913d3cf002416f2 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 6 Jun 2025 03:09:12 -0400
Subject: [PATCH 3/7] Bail out early in the generic combine to reduce
indentation, add a comment referring to the target-specific reassociation
there, and remove a currently unused variable in the target-specific combine.
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 102 +++++++++---------
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1 -
2 files changed, 52 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1ce01e0da5951..9381703ee2a95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2655,59 +2655,61 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
if (isNullConstant(N0))
return N1;
- if (N0.getOpcode() == ISD::PTRADD &&
- !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) {
- SDValue X = N0.getOperand(0);
- SDValue Y = N0.getOperand(1);
- SDValue Z = N1;
- bool N0OneUse = N0.hasOneUse();
- bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
- bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
-
- // (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
- // * y is a constant and (ptradd x, y) has one use; or
- // * y and z are both constants.
- if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
- SDNodeFlags Flags;
- // If both additions in the original were NUW, the new ones are as well.
- if (N->getFlags().hasNoUnsignedWrap() &&
- N0->getFlags().hasNoUnsignedWrap())
- Flags |= SDNodeFlags::NoUnsignedWrap;
- SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
- AddToWorklist(Add.getNode());
- return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
- }
+ if (N0.getOpcode() != ISD::PTRADD ||
+ reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1))
+ return SDValue();
- // TODO: There is another possible fold here that was proven useful.
- // It would be this:
- //
- // (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
- // * (ptradd x, y) has one use; and
- // * y is a constant; and
- // * z is not a constant.
- //
- // In some cases, specifically in AArch64's FEAT_CPA, it exposes the
- // opportunity to select more complex instructions such as SUBPT and
- // MSUBPT. However, a hypothetical corner case has been found that we could
- // not avoid. Consider this (pseudo-POSIX C):
- //
- // char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
- // char *p = mmap(LARGE_CONSTANT);
- // char *q = foo(p, -LARGE_CONSTANT);
- //
- // Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
- // further + z takes it back to the start of the mapping, so valid,
- // regardless of the address mmap gave back. However, if mmap gives you an
- // address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
- // borrow from the high bits (with the subsequent + z carrying back into
- // the high bits to give you a well-defined pointer) and thus trip
- // FEAT_CPA's pointer corruption checks.
- //
- // We leave this fold as an opportunity for future work, addressing the
- // corner case for FEAT_CPA, as well as reconciling the solution with the
- // more general application of pointer arithmetic in other future targets.
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+ SDValue Z = N1;
+ bool N0OneUse = N0.hasOneUse();
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ // (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
+ // * y is a constant and (ptradd x, y) has one use; or
+ // * y and z are both constants.
+ if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
+ SDNodeFlags Flags;
+ // If both additions in the original were NUW, the new ones are as well.
+ if (N->getFlags().hasNoUnsignedWrap() && N0->getFlags().hasNoUnsignedWrap())
+ Flags |= SDNodeFlags::NoUnsignedWrap;
+ SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
+ AddToWorklist(Add.getNode());
+ return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
}
+ // TODO: There is another possible fold here that was proven useful.
+ // It would be this:
+ //
+ // (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
+ // * (ptradd x, y) has one use; and
+ // * y is a constant; and
+ // * z is not a constant.
+ //
+ // In some cases, specifically in AArch64's FEAT_CPA, it exposes the
+ // opportunity to select more complex instructions such as SUBPT and
+ // MSUBPT. However, a hypothetical corner case has been found that we could
+ // not avoid. Consider this (pseudo-POSIX C):
+ //
+ // char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
+ // char *p = mmap(LARGE_CONSTANT);
+ // char *q = foo(p, -LARGE_CONSTANT);
+ //
+ // Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
+ // further + z takes it back to the start of the mapping, so valid,
+ // regardless of the address mmap gave back. However, if mmap gives you an
+ // address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
+ // borrow from the high bits (with the subsequent + z carrying back into
+ // the high bits to give you a well-defined pointer) and thus trip
+ // FEAT_CPA's pointer corruption checks.
+ //
+ // We leave this fold as an opportunity for future work, addressing the
+ // corner case for FEAT_CPA, as well as reconciling the solution with the
+ // more general application of pointer arithmetic in other future targets.
+ // For now each architecture that wants this fold must implement it in the
+ // target-specific code (see e.g. SITargetLowering::performPtrAddCombine)
+
return SDValue();
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9c84104da4ce7..67d36cb99524c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15099,7 +15099,6 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
- EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
>From 267a93211483a3f275dc9ebc9f3c5f1f33f8b265 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Tue, 10 Jun 2025 05:37:23 -0400
Subject: [PATCH 4/7] Add an explicit PtrVT == IntVT to the (ptradd 0, x) -> x
fold
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9381703ee2a95..967f0f2e6c3f5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2652,7 +2652,7 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
return N0;
// fold (ptradd 0, x) -> x
- if (isNullConstant(N0))
+ if (isNullConstant(N0) && PtrVT == IntVT)
return N1;
if (N0.getOpcode() != ISD::PTRADD ||
>From b87fccc1b9e1c25976a79c9324cad93f367b345e Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Tue, 10 Jun 2025 05:40:46 -0400
Subject: [PATCH 5/7] Add a test for poison/undef folds.
---
.../AMDGPU/ptradd-sdag-undef-poison.ll | 49 +++++++++++++++++++
1 file changed, 49 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
new file mode 100644
index 0000000000000..1934ce395e63d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+
+; Tests for undef and poison DAG folds for the ISD::PTRADD SelectionDAG opcode.
+; If any additions are generated for these tests, the folds don't work.
+
+define ptr @poison_offset(ptr %p, i64 %offset) {
+; GFX942-LABEL: poison_offset:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %gep1 = getelementptr i8, ptr %p, i64 poison
+ ret ptr %gep1
+}
+
+define ptr @poison_base(ptr %p, i64 %offset) {
+; GFX942-LABEL: poison_base:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %gep1 = getelementptr i8, ptr poison, i64 %offset
+ ret ptr %gep1
+}
+
+define ptr @undef_offset(ptr %p, i64 %offset) {
+; GFX942-LABEL: undef_offset:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %gep1 = getelementptr i8, ptr %p, i64 undef
+ ret ptr %gep1
+}
+
+define ptr @undef_base(ptr %p, i64 %offset) {
+; GFX942-LABEL: undef_base:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %gep1 = getelementptr i8, ptr undef, i64 %offset
+ ret ptr %gep1
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX942_LEGACY: {{.*}}
+; GFX942_PTRADD: {{.*}}
>From cb71724f6f1f5c68e3a0db4b8510463a5ee2757b Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 13 Jun 2025 08:52:59 -0400
Subject: [PATCH 6/7] Implement reviewer suggestions
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 ++---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 +++++++++----------
2 files changed, 17 insertions(+), 20 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 967f0f2e6c3f5..36a0e7a5099fa 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2652,7 +2652,7 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
return N0;
// fold (ptradd 0, x) -> x
- if (isNullConstant(N0) && PtrVT == IntVT)
+ if (PtrVT == IntVT && isNullConstant(N0))
return N1;
if (N0.getOpcode() != ISD::PTRADD ||
@@ -2670,10 +2670,9 @@ SDValue DAGCombiner::visitPTRADD(SDNode *N) {
// * y is a constant and (ptradd x, y) has one use; or
// * y and z are both constants.
if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
- SDNodeFlags Flags;
// If both additions in the original were NUW, the new ones are as well.
- if (N->getFlags().hasNoUnsignedWrap() && N0->getFlags().hasNoUnsignedWrap())
- Flags |= SDNodeFlags::NoUnsignedWrap;
+ SDNodeFlags Flags =
+ (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap;
SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
AddToWorklist(Add.getNode());
return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 67d36cb99524c..b54c6cbb18de4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15119,22 +15119,20 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
SDValue X = N0;
SDValue Y = N1.getOperand(0);
SDValue Z = N1.getOperand(1);
- bool N1OneUse = N1.hasOneUse();
- bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
- bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
- if ((ZIsConstant != YIsConstant) && N1OneUse) {
- SDNodeFlags Flags;
- // If both additions in the original were NUW, the new ones are as well.
- if (N->getFlags().hasNoUnsignedWrap() &&
- N1->getFlags().hasNoUnsignedWrap())
- Flags |= SDNodeFlags::NoUnsignedWrap;
-
- if (YIsConstant)
- std::swap(Y, Z);
-
- SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
- DCI.AddToWorklist(Inner.getNode());
- return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ if (N1.hasOneUse()) {
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+ if (ZIsConstant != YIsConstant) {
+ // If both additions in the original were NUW, the new ones are as well.
+ SDNodeFlags Flags =
+ (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
+ if (YIsConstant)
+ std::swap(Y, Z);
+
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ }
}
}
>From 00afb5a69752cb1a621376569d7878d3f08ae4cc Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Mon, 23 Jun 2025 07:53:17 -0400
Subject: [PATCH 7/7] Update new AArch64/cpa-selectiondag.ll test
---
llvm/test/CodeGen/AArch64/cpa-selectiondag.ll | 107 ++++++++----------
1 file changed, 46 insertions(+), 61 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/cpa-selectiondag.ll b/llvm/test/CodeGen/AArch64/cpa-selectiondag.ll
index 69fd6c4de78bc..aeeffde01cb6e 100644
--- a/llvm/test/CodeGen/AArch64/cpa-selectiondag.ll
+++ b/llvm/test/CodeGen/AArch64/cpa-selectiondag.ll
@@ -119,23 +119,17 @@ define void @msubpt1(i32 %index, i32 %elem) {
; CHECK-CPA-O0: // %bb.0: // %entry
; CHECK-CPA-O0-NEXT: // implicit-def: $x8
; CHECK-CPA-O0-NEXT: mov w8, w0
-; CHECK-CPA-O0-NEXT: sxtw x9, w8
-; CHECK-CPA-O0-NEXT: mov x8, xzr
-; CHECK-CPA-O0-NEXT: subs x8, x8, x9
-; CHECK-CPA-O0-NEXT: lsl x8, x8, #1
-; CHECK-CPA-O0-NEXT: subs x10, x8, x9
+; CHECK-CPA-O0-NEXT: sxtw x8, w8
+; CHECK-CPA-O0-NEXT: mov w9, #48 // =0x30
+; CHECK-CPA-O0-NEXT: // kill: def $x9 killed $w9
+; CHECK-CPA-O0-NEXT: mneg x8, x8, x9
+; CHECK-CPA-O0-NEXT: add x8, x8, #288
; CHECK-CPA-O0-NEXT: adrp x9, array2
; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array2
-; CHECK-CPA-O0-NEXT: mov w8, #288 // =0x120
-; CHECK-CPA-O0-NEXT: // kill: def $x8 killed $w8
; CHECK-CPA-O0-NEXT: addpt x8, x9, x8
-; CHECK-CPA-O0-NEXT: addpt x8, x8, x10, lsl #4
-; CHECK-CPA-O0-NEXT: mov w10, #96 // =0x60
-; CHECK-CPA-O0-NEXT: // kill: def $x10 killed $w10
-; CHECK-CPA-O0-NEXT: addpt x10, x9, x10
-; CHECK-CPA-O0-NEXT: ldr q1, [x10, #16]
-; CHECK-CPA-O0-NEXT: ldr q2, [x10, #32]
; CHECK-CPA-O0-NEXT: ldr q0, [x9, #96]
+; CHECK-CPA-O0-NEXT: ldr q1, [x9, #112]
+; CHECK-CPA-O0-NEXT: ldr q2, [x9, #128]
; CHECK-CPA-O0-NEXT: str q2, [x8, #32]
; CHECK-CPA-O0-NEXT: str q1, [x8, #16]
; CHECK-CPA-O0-NEXT: str q0, [x8]
@@ -144,21 +138,17 @@ define void @msubpt1(i32 %index, i32 %elem) {
; CHECK-CPA-O3-LABEL: msubpt1:
; CHECK-CPA-O3: // %bb.0: // %entry
; CHECK-CPA-O3-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-CPA-O3-NEXT: sxtw x9, w0
-; CHECK-CPA-O3-NEXT: adrp x8, array2
-; CHECK-CPA-O3-NEXT: add x8, x8, :lo12:array2
-; CHECK-CPA-O3-NEXT: mov w11, #96 // =0x60
-; CHECK-CPA-O3-NEXT: mov w12, #288 // =0x120
-; CHECK-CPA-O3-NEXT: ldr q2, [x8, #96]
-; CHECK-CPA-O3-NEXT: neg x10, x9
-; CHECK-CPA-O3-NEXT: addpt x11, x8, x11
-; CHECK-CPA-O3-NEXT: lsl x10, x10, #1
-; CHECK-CPA-O3-NEXT: ldp q1, q0, [x11, #16]
-; CHECK-CPA-O3-NEXT: sub x9, x10, x9
-; CHECK-CPA-O3-NEXT: addpt x10, x8, x12
-; CHECK-CPA-O3-NEXT: addpt x9, x10, x9, lsl #4
-; CHECK-CPA-O3-NEXT: stp q1, q0, [x9, #16]
-; CHECK-CPA-O3-NEXT: str q2, [x9]
+; CHECK-CPA-O3-NEXT: sxtw x8, w0
+; CHECK-CPA-O3-NEXT: mov w9, #48 // =0x30
+; CHECK-CPA-O3-NEXT: mneg x8, x8, x9
+; CHECK-CPA-O3-NEXT: adrp x9, array2
+; CHECK-CPA-O3-NEXT: add x9, x9, :lo12:array2
+; CHECK-CPA-O3-NEXT: ldp q1, q0, [x9, #112]
+; CHECK-CPA-O3-NEXT: ldr q2, [x9, #96]
+; CHECK-CPA-O3-NEXT: add x8, x8, #288
+; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
+; CHECK-CPA-O3-NEXT: stp q1, q0, [x8, #16]
+; CHECK-CPA-O3-NEXT: str q2, [x8]
; CHECK-CPA-O3-NEXT: ret
;
; CHECK-NOCPA-O0-LABEL: msubpt1:
@@ -205,15 +195,15 @@ entry:
define void @subpt1(i32 %index, i32 %elem) {
; CHECK-CPA-O0-LABEL: subpt1:
; CHECK-CPA-O0: // %bb.0: // %entry
-; CHECK-CPA-O0-NEXT: adrp x9, array
-; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
+; CHECK-CPA-O0-NEXT: // implicit-def: $x8
+; CHECK-CPA-O0-NEXT: mov w8, w0
+; CHECK-CPA-O0-NEXT: sxtw x9, w8
; CHECK-CPA-O0-NEXT: mov w8, #96 // =0x60
; CHECK-CPA-O0-NEXT: // kill: def $x8 killed $w8
+; CHECK-CPA-O0-NEXT: subs x8, x8, x9, lsl #8
+; CHECK-CPA-O0-NEXT: adrp x9, array
+; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O0-NEXT: addpt x8, x9, x8
-; CHECK-CPA-O0-NEXT: // implicit-def: $x10
-; CHECK-CPA-O0-NEXT: mov w10, w0
-; CHECK-CPA-O0-NEXT: sbfiz x10, x10, #8, #32
-; CHECK-CPA-O0-NEXT: subpt x8, x8, x10
; CHECK-CPA-O0-NEXT: ldr q0, [x9, #32]
; CHECK-CPA-O0-NEXT: str q0, [x8]
; CHECK-CPA-O0-NEXT: ret
@@ -221,13 +211,13 @@ define void @subpt1(i32 %index, i32 %elem) {
; CHECK-CPA-O3-LABEL: subpt1:
; CHECK-CPA-O3: // %bb.0: // %entry
; CHECK-CPA-O3-NEXT: // kill: def $w0 killed $w0 def $x0
-; CHECK-CPA-O3-NEXT: adrp x8, array
-; CHECK-CPA-O3-NEXT: add x8, x8, :lo12:array
+; CHECK-CPA-O3-NEXT: sxtw x8, w0
; CHECK-CPA-O3-NEXT: mov w9, #96 // =0x60
-; CHECK-CPA-O3-NEXT: sbfiz x10, x0, #8, #32
-; CHECK-CPA-O3-NEXT: addpt x9, x8, x9
-; CHECK-CPA-O3-NEXT: ldr q0, [x8, #32]
-; CHECK-CPA-O3-NEXT: subpt x8, x9, x10
+; CHECK-CPA-O3-NEXT: sub x8, x9, x8, lsl #8
+; CHECK-CPA-O3-NEXT: adrp x9, array
+; CHECK-CPA-O3-NEXT: add x9, x9, :lo12:array
+; CHECK-CPA-O3-NEXT: ldr q0, [x9, #32]
+; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
; CHECK-CPA-O3-NEXT: str q0, [x8]
; CHECK-CPA-O3-NEXT: ret
;
@@ -264,28 +254,24 @@ entry:
define void @subpt2(i32 %index, i32 %elem) {
; CHECK-CPA-O0-LABEL: subpt2:
; CHECK-CPA-O0: // %bb.0: // %entry
-; CHECK-CPA-O0-NEXT: mov x8, xzr
-; CHECK-CPA-O0-NEXT: subs x10, x8, w0, sxtw
-; CHECK-CPA-O0-NEXT: adrp x9, array
-; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O0-NEXT: mov w8, #96 // =0x60
; CHECK-CPA-O0-NEXT: // kill: def $x8 killed $w8
+; CHECK-CPA-O0-NEXT: subs x8, x8, w0, sxtw #4
+; CHECK-CPA-O0-NEXT: adrp x9, array
+; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O0-NEXT: addpt x8, x9, x8
-; CHECK-CPA-O0-NEXT: addpt x8, x8, x10, lsl #4
; CHECK-CPA-O0-NEXT: ldr q0, [x9, #32]
; CHECK-CPA-O0-NEXT: str q0, [x8]
; CHECK-CPA-O0-NEXT: ret
;
; CHECK-CPA-O3-LABEL: subpt2:
; CHECK-CPA-O3: // %bb.0: // %entry
-; CHECK-CPA-O3-NEXT: mov x8, xzr
-; CHECK-CPA-O3-NEXT: mov w9, #96 // =0x60
-; CHECK-CPA-O3-NEXT: adrp x10, array
-; CHECK-CPA-O3-NEXT: add x10, x10, :lo12:array
-; CHECK-CPA-O3-NEXT: sub x8, x8, w0, sxtw
-; CHECK-CPA-O3-NEXT: addpt x9, x10, x9
-; CHECK-CPA-O3-NEXT: ldr q0, [x10, #32]
-; CHECK-CPA-O3-NEXT: addpt x8, x9, x8, lsl #4
+; CHECK-CPA-O3-NEXT: mov w8, #96 // =0x60
+; CHECK-CPA-O3-NEXT: adrp x9, array
+; CHECK-CPA-O3-NEXT: add x9, x9, :lo12:array
+; CHECK-CPA-O3-NEXT: sub x8, x8, w0, sxtw #4
+; CHECK-CPA-O3-NEXT: ldr q0, [x9, #32]
+; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
; CHECK-CPA-O3-NEXT: str q0, [x8]
; CHECK-CPA-O3-NEXT: ret
;
@@ -670,14 +656,13 @@ define hidden void @multidim() {
; CHECK-CPA-O0-NEXT: .cfi_offset w30, -16
; CHECK-CPA-O0-NEXT: adrp x8, b
; CHECK-CPA-O0-NEXT: ldrh w9, [x8, :lo12:b]
+; CHECK-CPA-O0-NEXT: // implicit-def: $x8
; CHECK-CPA-O0-NEXT: mov w8, w9
-; CHECK-CPA-O0-NEXT: mov w10, w8
+; CHECK-CPA-O0-NEXT: ubfiz x8, x8, #1, #32
+; CHECK-CPA-O0-NEXT: add x10, x8, #2
; CHECK-CPA-O0-NEXT: adrp x8, a
; CHECK-CPA-O0-NEXT: add x8, x8, :lo12:a
-; CHECK-CPA-O0-NEXT: mov w11, #2 // =0x2
-; CHECK-CPA-O0-NEXT: // kill: def $x11 killed $w11
-; CHECK-CPA-O0-NEXT: addpt x8, x8, x11
-; CHECK-CPA-O0-NEXT: addpt x8, x8, x10, lsl #1
+; CHECK-CPA-O0-NEXT: addpt x8, x8, x10
; CHECK-CPA-O0-NEXT: add w9, w9, #1
; CHECK-CPA-O0-NEXT: mov w9, w9
; CHECK-CPA-O0-NEXT: // kill: def $x9 killed $w9
@@ -697,13 +682,13 @@ define hidden void @multidim() {
; CHECK-CPA-O3-LABEL: multidim:
; CHECK-CPA-O3: // %bb.0: // %entry
; CHECK-CPA-O3-NEXT: adrp x8, b
-; CHECK-CPA-O3-NEXT: mov w9, #2 // =0x2
; CHECK-CPA-O3-NEXT: adrp x10, a
; CHECK-CPA-O3-NEXT: add x10, x10, :lo12:a
; CHECK-CPA-O3-NEXT: ldrh w8, [x8, :lo12:b]
-; CHECK-CPA-O3-NEXT: addpt x9, x10, x9
-; CHECK-CPA-O3-NEXT: addpt x9, x9, x8, lsl #1
+; CHECK-CPA-O3-NEXT: lsl x9, x8, #1
; CHECK-CPA-O3-NEXT: add x8, x8, #1
+; CHECK-CPA-O3-NEXT: add x9, x9, #2
+; CHECK-CPA-O3-NEXT: addpt x9, x10, x9
; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
; CHECK-CPA-O3-NEXT: ldrb w8, [x8]
; CHECK-CPA-O3-NEXT: cbz w8, .LBB14_2
More information about the llvm-branch-commits
mailing list