[llvm-branch-commits] [llvm] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines (PR #143673)
Fabian Ritter via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Jun 13 05:07:04 PDT 2025
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/143673
From 50de6e085242ce975af812088f4ef48896444fb6 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter@amd.com>
Date: Wed, 11 Jun 2025 05:48:45 -0400
Subject: [PATCH] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines
This patch adds several AMDGPU-specific DAG combines for ISD::PTRADD nodes
that mirror existing transforms for ISD::ADD nodes. No functional change is
intended for the pre-existing target-specific PTRADD combine.
For SWDEV-516125.
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 151 ++++++++++++++----
.../AMDGPU/ptradd-sdag-optimizations.ll | 151 ++++++------------
3 files changed, 167 insertions(+), 139 deletions(-)
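
Before the diff itself, here is the gist of the main reassociation in the
reworked SITargetLowering::performPtrAddCombine, as a simplified, illustrative
sketch (the helper name is invented and worklist bookkeeping is omitted; only
SelectionDAG APIs that also appear in the diff below are used). The idea is to
move constant offsets to the outermost PTRADD so that they can later be folded
into the immediate offset of memory instructions:

  // Sketch only: rewrite (ptradd x, (add y, c)), with constant c and
  // non-constant y, into (ptradd (ptradd x, y), c) so that c can later be
  // folded into a memory instruction's immediate offset.
  // (Assumes the usual SelectionDAG headers, as in SIISelLowering.cpp.)
  static SDValue reassociateConstantOffsetSketch(SDNode *N, SelectionDAG &DAG) {
    SDLoc DL(N);
    SDValue X = N->getOperand(0);
    SDValue Add = N->getOperand(1);
    if (Add.getOpcode() != ISD::ADD || !Add.hasOneUse())
      return SDValue();

    SDValue Y = Add.getOperand(0);
    SDValue C = Add.getOperand(1);
    // Canonicalize the constant operand (if any) into C.
    if (DAG.isConstantIntBuildVectorOrConstantInt(Y))
      std::swap(Y, C);
    // Bail out unless exactly one operand is constant.
    if (!DAG.isConstantIntBuildVectorOrConstantInt(C) ||
        DAG.isConstantIntBuildVectorOrConstantInt(Y))
      return SDValue();

    // NUW is preserved only if both of the original additions were NUW.
    SDNodeFlags Flags;
    if (N->getFlags().hasNoUnsignedWrap() &&
        Add->getFlags().hasNoUnsignedWrap())
      Flags |= SDNodeFlags::NoUnsignedWrap;

    SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
    return DAG.getMemBasePlusOffset(Inner, C, DL, Flags);
  }

In the patch this logic lives inline in performPtrAddCombine, which also
re-adds the inner node to the combiner worklist and additionally handles the
(ptradd x, shl(0 - v, k)) fold, the MAD64_32 fold, the zero-low-32-bit case,
folding constants into global addresses, and the uniform-over-divergent
reassociation.
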
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 45a37622a531b..1210777428020 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6706,7 +6706,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
return SDValue();
int64_t Offset = C2->getSExtValue();
switch (Opcode) {
- case ISD::ADD: break;
+ case ISD::ADD:
+ case ISD::PTRADD:
+ break;
case ISD::SUB: Offset = -uint64_t(Offset); break;
default: return SDValue();
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 184984abcdf32..fe002b3daed89 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -46,6 +47,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::SDPatternMatch;
#define DEBUG_TYPE "si-lower"
@@ -14329,7 +14331,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
// instead of a tree.
SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
DAGCombinerInfo &DCI) const {
- assert(N->getOpcode() == ISD::ADD);
+ assert(N->isAnyAdd());
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
@@ -14362,7 +14364,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
for (SDNode *User : LHS->users()) {
// There is a use that does not feed into addition, so the multiply can't
// be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
- if (User->getOpcode() != ISD::ADD)
+ if (!User->isAnyAdd())
return SDValue();
// We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
@@ -14474,8 +14476,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
SDValue Hi = getHiHalf64(LHS, DAG);
SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == ISD::PTRADD)
+ Opcode = ISD::ADD;
SDValue AddHi =
- DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags());
+ DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
@@ -14949,44 +14954,120 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
+ EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (N1.getOpcode() == ISD::ADD) {
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
- // y is not, and (add y, z) is used only once.
- // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
- // z is not, and (add y, z) is used only once.
- // The goal is to move constant offsets to the outermost ptradd, to create
- // more opportunities to fold offsets into memory instructions.
- // Together with the generic combines in DAGCombiner.cpp, this also
- // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
- //
- // This transform is here instead of in the general DAGCombiner as it can
- // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
- // AArch64's CPA.
- SDValue X = N0;
- SDValue Y = N1.getOperand(0);
- SDValue Z = N1.getOperand(1);
- bool N1OneUse = N1.hasOneUse();
- bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
- bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
- if ((ZIsConstant != YIsConstant) && N1OneUse) {
- SDNodeFlags Flags;
- // If both additions in the original were NUW, the new ones are as well.
- if (N->getFlags().hasNoUnsignedWrap() &&
- N1->getFlags().hasNoUnsignedWrap())
- Flags |= SDNodeFlags::NoUnsignedWrap;
-
- if (YIsConstant)
- std::swap(Y, Z);
-
- SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
- DCI.AddToWorklist(Inner.getNode());
- return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ // The following folds transform PTRADDs into regular arithmetic in cases
+ // where the PTRADD wouldn't be folded as an immediate offset into memory
+ // instructions anyway. They are target-specific in that other targets might
+ // prefer to not lose information about the pointer arithmetic.
+
+ // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
+ // Adapted from DAGCombiner::visitADDLikeCommutative.
+ SDValue V, K;
+ if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
+ SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
+ }
+
+ // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
+ // performAddCombine.
+ if (N1.getOpcode() == ISD::MUL) {
+ if (Subtarget->hasMad64_32()) {
+ if (SDValue Folded = tryFoldToMad64_32(N, DCI))
+ return Folded;
+ }
+ }
+
+ // If the 32 low bits of the constant are all zero, there is nothing to fold
+ // into an immediate offset, so it's better to eliminate the unnecessary
+ // addition for the lower 32 bits than to preserve the PTRADD.
+ // Analogous to a fold in performAddCombine.
+ if (VT == MVT::i64) {
+ if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
+ return Folded;
+ }
+
+ if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) {
+    // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c), v) with
+    // global address GA and constant c, such that c can be folded into GA.
+ SDValue GAValue = N0.getOperand(0);
+ if (const GlobalAddressSDNode *GA =
+ dyn_cast<GlobalAddressSDNode>(GAValue)) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (DCI.isBeforeLegalizeOps() && TLI.isOffsetFoldingLegal(GA)) {
+ SDNodeFlags Flags;
+ // If both additions in the original were NUW, reassociation preserves
+ // that.
+ if (N->getFlags().hasNoUnsignedWrap() &&
+ N0->getFlags().hasNoUnsignedWrap())
+ Flags |= SDNodeFlags::NoUnsignedWrap;
+ SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags);
+ }
}
}
+ if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
+ return SDValue();
+
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+ // y is not, and (add y, z) is used only once.
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+ // z is not, and (add y, z) is used only once.
+ // The goal is to move constant offsets to the outermost ptradd, to create
+ // more opportunities to fold offsets into memory instructions.
+ // Together with the generic combines in DAGCombiner.cpp, this also
+  // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
+ //
+ // This transform is here instead of in the general DAGCombiner as it can
+ // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+ // AArch64's CPA.
+ SDValue X = N0;
+ SDValue Y = N1.getOperand(0);
+ SDValue Z = N1.getOperand(1);
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ SDNodeFlags ReassocFlags;
+ // If both additions in the original were NUW, reassociation preserves that.
+ if (N->getFlags().hasNoUnsignedWrap() && N1->getFlags().hasNoUnsignedWrap())
+ ReassocFlags |= SDNodeFlags::NoUnsignedWrap;
+  if (ZIsConstant != YIsConstant) {
+ if (YIsConstant)
+ std::swap(Y, Z);
+
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags);
+ }
+
+  // If exactly one of Y and Z is constant, that case has been handled above.
+  // If both were constant, the addition would already have been folded in
+  // SelectionDAG::getNode. Knowing that neither is constant ensures that the
+  // generic DAG combines won't undo the following reassociation.
+ assert(!YIsConstant && !ZIsConstant);
+
+ if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) {
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
+ // y are uniform and z isn't.
+ // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
+ // z are uniform and y isn't.
+ // The goal is to push uniform operands up in the computation, so that they
+ // can be handled with scalar operations. We can't use reassociateScalarOps
+ // for this since it requires two identical commutative operations to
+ // reassociate.
+ if (Y->isDivergent())
+ std::swap(Y, Z);
+ SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
+ DCI.AddToWorklist(UniformInner.getNode());
+ return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
+ }
+
return SDValue();
}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 1ec94162951a6..c00bccdbce6b7 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -145,49 +145,29 @@ entry:
; Test skipping the lower-32-bit addition if it is unnecessary.
define ptr @huge_offset_low_32_unused(ptr %p) {
-; GFX942_PTRADD-LABEL: huge_offset_low_32_unused:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0
-; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: huge_offset_low_32_unused:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: huge_offset_low_32_unused:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_add_u32_e32 v1, 1, v1
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000
ret ptr %gep
}
; Reassociate address computation if it leads to more scalar operations.
define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_r:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_r:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: reassoc_scalar_r:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_u32 s2, s2, s6
+; GFX942-NEXT: s_addc_u32 s3, s3, s7
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
@@ -198,30 +178,18 @@ entry:
}
define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) {
-; GFX942_PTRADD-LABEL: reassoc_scalar_l:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: reassoc_scalar_l:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6
-; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: reassoc_scalar_l:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_add_u32 s2, s2, s6
+; GFX942-NEXT: s_addc_u32 s3, s3, s7
+; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
@@ -233,24 +201,14 @@ entry:
; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold
define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) {
-; GFX942_PTRADD-LABEL: shl_neg_offset:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: shl_neg_offset:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
-; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: shl_neg_offset:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3]
+; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%offset = sub i64 0, %noffset
%x = shl i64 %offset, %shift
%gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x
@@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
-; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4
-; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12
+; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
+; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
; GFX942_LEGACY-LABEL: complextype_global_gep:
@@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
; Tests the tryFoldToMad64_32 PTRADD combine.
define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) {
-; GFX942_PTRADD-LABEL: fold_mad64:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0
-; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: fold_mad64:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
-; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: fold_mad64:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_mov_b32_e32 v2, 1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; GFX942-NEXT: global_store_dword v[0:1], v2, off
+; GFX942-NEXT: s_endpgm
%voffset32 = call i32 @llvm.amdgcn.workitem.id.x()
%voffset = zext i32 %voffset32 to i64
%p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0