[llvm] AMDGPU: Return legal addressmode correctly for flat scratch (PR #71494)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 6 23:30:36 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-llvm-globalisel
Author: Ruiling, Song (ruiling)
Depends on #70634 for test changes.
---
Patch is 145.44 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71494.diff
16 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+54-9)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+3-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+54-8)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+3-1)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+9-1)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.h (+1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll (+88-114)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir (+4-12)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+32-60)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll (+35-45)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+189-223)
- (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+21-30)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+284-308)
- (modified) llvm/test/CodeGen/AMDGPU/memory_clause.ll (+6-12)
- (added) llvm/test/CodeGen/AMDGPU/scratch-pointer-sink.ll (+46)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index cd810f0b43e50db..3ec526b2094c0ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1151,13 +1151,58 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+// Check that the address value of a flat scratch load/store being put into
+// an SGPR/VGPR is legal with respect to the hardware's requirement that an
+// address in an SGPR/VGPR must be unsigned. When \p CheckTwoInstrs is set,
+// we check against the last two instructions that calculate \p FullAddr.
+// When \p CheckTwoOperands is set, we check both operands (in the case of
+// two instructions, these are the operands of the instruction before the last).
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue FullAddr,
+ bool CheckTwoInstrs,
+ bool CheckTwoOperands,
uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return CurDAG->SignBitIsZero(Base);
+
+  // Whether we can prove from the operation that the operands are non-negative.
+ auto HasOnlyNonNegativeOperands = [](SDValue Addr) -> bool {
+ return (Addr.getOpcode() == ISD::ADD &&
+ Addr->getFlags().hasNoUnsignedWrap()) ||
+ Addr->getOpcode() == ISD::OR;
+ };
+
+ if (CheckTwoInstrs) {
+ auto PartAddr = FullAddr.getOperand(0);
+ // Make sure we are doing SGPR + VGPR + Imm.
+ assert(isa<ConstantSDNode>(FullAddr.getOperand(1)));
+ if (HasOnlyNonNegativeOperands(FullAddr) &&
+ HasOnlyNonNegativeOperands(PartAddr))
+ return true;
+
+ auto LHS = PartAddr.getOperand(0);
+ auto RHS = PartAddr.getOperand(1);
+ return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
+ }
+
+ // Single instruction case
+ if (HasOnlyNonNegativeOperands(FullAddr))
+ return true;
+
+ auto LHS = FullAddr.getOperand(0);
+ auto RHS = FullAddr.getOperand(1);
+ if (CheckTwoOperands)
+ return CurDAG->SignBitIsZero(LHS) && CurDAG->SignBitIsZero(RHS);
+
+ // If the immediate offset is negative, the base address cannot also be
+ // negative.
+ ConstantSDNode *ImmOp = nullptr;
+ if (FullAddr.getOpcode() == ISD::ADD &&
+ (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
+ if (ImmOp->getSExtValue() < 0)
+ return true;
+ }
+
+ return CurDAG->SignBitIsZero(LHS);
}
// TODO: If offset is too big, put low 16-bit into offset.
@@ -1554,7 +1599,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
- isFlatScratchBaseLegal(N0, FlatVariant)) {
+ isFlatScratchBaseLegal(Addr, false, false, FlatVariant)) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1786,8 +1831,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
int64_t COffsetVal = 0;
- if (CurDAG->isBaseWithConstantOffset(Addr) &&
- isFlatScratchBaseLegal(Addr.getOperand(0))) {
+ if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
@@ -1844,6 +1888,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
int64_t ImmOffset = 0;
SDValue LHS, RHS;
+ SDValue FullAddr = Addr;
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1865,7 +1910,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
@@ -1891,7 +1936,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(FullAddr, FullAddr != Addr, true))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..08f393ab9fae8d4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ SDValue FullAddr, bool CheckTwoInstrs = false,
+ bool CheckTwoOperands = false,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2cf60f338105b1e..a0808e032d13f90 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -17,6 +17,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
@@ -4103,7 +4104,10 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
int64_t ConstOffset;
std::tie(PtrBase, ConstOffset) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
- if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
+
+ auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
+ if (ConstOffset == 0 ||
+ !isFlatScratchBaseLegal(*AddrDef->MI, nullptr, false, FlatVariant))
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4265,15 +4269,16 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// Match the immediate offset first, which canonically is moved as low as
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ auto AddrDef = getDefSrcRegIgnoringCopies(Root.getReg(), *MRI);
- if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
+ if (ConstOffset != 0 && isFlatScratchBaseLegal(*AddrDef->MI) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
+ AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
}
- auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = AddrDef->MI->getOperand(1).getIndex();
return {{
@@ -4343,6 +4348,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ Register FullAddr = Addr;
if (ConstOffset != 0 &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
Addr = PtrBase;
@@ -4360,7 +4366,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register LHS = AddrDef->MI->getOperand(1).getReg();
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
- if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
+ auto FullAddrDef = getDefSrcRegIgnoringCopies(FullAddr, *MRI);
+ if (!isFlatScratchBaseLegal(*FullAddrDef->MI,
+ FullAddr != Addr ? AddrDef->MI : nullptr, true))
return std::nullopt;
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
@@ -4494,14 +4502,52 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
return KB->signBitIsZero(Base);
}
+// Check that the address value of a flat scratch load/store being put into
+// an SGPR/VGPR is legal with respect to the hardware's requirement that an
+// address in an SGPR/VGPR must be unsigned. When \p PartAddr is set, we
+// check against both instructions to be sure the addresses are non-negative.
+// When \p CheckTwoOperands is set, we check both operands (in the case of
+// two instructions, these are the operands of \p PartAddr).
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant) const {
+ MachineInstr &FullAddr, MachineInstr *PartAddr, bool CheckTwoOperands,
+ uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return KB->signBitIsZero(Base);
+  // Whether we can prove from the operation that the operands are non-negative.
+ auto HasOnlyNonNegativeOperands = [](MachineInstr *Addr) -> bool {
+ return Addr->getOpcode() == TargetOpcode::G_OR ||
+ (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
+ Addr->getFlag(MachineInstr::NoUWrap));
+ };
+
+ if (PartAddr) {
+ if (HasOnlyNonNegativeOperands(&FullAddr) &&
+ HasOnlyNonNegativeOperands(PartAddr))
+ return true;
+ Register LHS = PartAddr->getOperand(1).getReg();
+ Register RHS = PartAddr->getOperand(2).getReg();
+ return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
+ }
+
+ // Single instruction case
+ if (HasOnlyNonNegativeOperands(&FullAddr))
+ return true;
+
+ Register LHS = FullAddr.getOperand(1).getReg();
+ Register RHS = FullAddr.getOperand(2).getReg();
+ if (CheckTwoOperands)
+ return KB->signBitIsZero(LHS) && KB->signBitIsZero(RHS);
+
+ if (FullAddr.getOpcode() == TargetOpcode::G_PTR_ADD) {
+ auto RhsValReg = getIConstantVRegValWithLookThrough(RHS, *MRI);
+ // If the immediate offset is negative, the base address cannot also be
+ // negative.
+ if (RhsValReg && RhsValReg->Value.getSExtValue() < 0)
+ return true;
+ }
+
+ return KB->signBitIsZero(LHS);
}
bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 93e45fcd8682f07..53e5fb995fc041e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -244,7 +244,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ MachineInstr &FullAddr, MachineInstr *PartAddr = nullptr,
+ bool CheckTwoOperands = false,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5c46d81f57af6a9..8be3d0460af4e96 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1357,6 +1357,13 @@ bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
return isLegalMUBUFAddressingMode(AM);
}
+bool SITargetLowering::isLegalFlatScratchAddressingMode(
+ const AddrMode &AM) const {
+ return AM.Scale == 0 &&
+ Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
+}
+
bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
// additionally can do r + r + i with addr64. 32-bit has more addressing
@@ -1448,7 +1455,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return isLegalMUBUFAddressingMode(AM);
+ return Subtarget->enableFlatScratch() ? isLegalFlatScratchAddressingMode(AM)
+ : isLegalMUBUFAddressingMode(AM);
if (AS == AMDGPUAS::LOCAL_ADDRESS ||
(AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 746a88c5ea13a30..90a67853e8011fe 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -222,6 +222,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalFlatScratchAddressingMode(const AddrMode &AM) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index 45df3bc094f351e..ec2cd43e5fb5df3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -89,17 +89,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add_u32_e32 v1, 4, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
-; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 4
-; GFX9-NEXT: scratch_store_dword v1, v3, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: scratch_store_dword v1, v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -111,42 +109,39 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c
-; GFX10-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-NEXT: v_mov_b32_e32 v2, 15
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT: v_add3_u32 v1, 4, v1, v2
-; GFX10-NEXT: scratch_store_dword v0, v3, off
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX10-NEXT: scratch_store_dword v0, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX940-LABEL: store_load_vindex_kernel:
; GFX940: ; %bb.0: ; %bb
; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX940-NEXT: v_mov_b32_e32 v3, 15
; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX940-NEXT: v_mov_b32_e32 v2, 4
-; GFX940-NEXT: scratch_store_dword v1, v3, off offset:4 sc0 sc1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v2, 15
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
+; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
+; GFX940-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
-; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:4 dlc
+; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1
+; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: v_add3_u32 v1, 4, v1, v2
-; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v1
+; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
bb:
@@ -233,34 +228,31 @@ define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
; GFX9-LABEL: private_ptr_foo:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX9-NEXT: scratch_store_dword v0, v1, off
+; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: private_ptr_foo:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX10-NEXT: scratch_store_dword v0, v1, off
+; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: private_ptr_foo:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000
-; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
+; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: private_ptr_foo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
-; GFX11-NEXT: scratch_store_b32 v0, v1, off
+; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4
; GFX11-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
store float 1.000000e+01, ptr addrspace(5) %gep, align 4
@@ -366,16 +358,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
-; GFX9-NEXT: v_mov_b32_e32 v3, 15
; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0x104
-; GFX9-NEXT: scratch_store_dword v1, v3, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, 15
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
-; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: scratch_store_dword v1, v2, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NE...
[truncated]
``````````
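For readers checking the reasoning in the new `isFlatScratchBaseLegal`: the checks lean on a few bit-level facts about 32-bit scratch addresses, under the premise that the fully computed address is itself a valid, non-negative scratch offset. Below is a minimal standalone C++ sketch, not part of the patch and with illustrative helper names only, that verifies those facts on random inputs:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <random>

// Illustrative helper, not an LLVM API: a 32-bit address value is usable in
// an SGPR/VGPR for flat scratch when its sign bit is clear (non-negative).
static bool signBitClear(uint32_t V) { return (V >> 31) == 0; }

int main() {
  std::mt19937_64 Rng(0);
  for (int I = 0; I < 1000000; ++I) {
    uint32_t A = uint32_t(Rng());
    uint32_t B = uint32_t(Rng());

    // Fact 1: if A + B does not wrap unsigned (the nuw flag on ISD::ADD /
    // G_PTR_ADD) and the sum's sign bit is clear, then A <= Sum and
    // B <= Sum, so both operands' sign bits are clear as well.
    uint64_t Wide = uint64_t(A) + uint64_t(B);
    if (Wide <= UINT32_MAX && signBitClear(uint32_t(Wide)))
      assert(signBitClear(A) && signBitClear(B));

    // Fact 2: OR never carries, so if (A | B) has a clear sign bit, both
    // operands do too; this is why a plain ISD::OR / G_OR qualifies in
    // HasOnlyNonNegativeOperands.
    if (signBitClear(A | B))
      assert(signBitClear(A) && signBitClear(B));

    // Fact 3: for Addr = Base + Imm computed without signed wrap, a negative
    // Imm and a valid (non-negative) Addr force Base = Addr - Imm > Addr >= 0,
    // so only Base's sign bit remains to be checked.
    int64_t SBase = int32_t(A), SImm = int32_t(B);
    int64_t SAddr = SBase + SImm; // exact in 64 bits
    if (SImm < 0 && SAddr == int32_t(SAddr) && SAddr >= 0)
      assert(SBase >= 0);
  }
  std::puts("sign-bit facts hold");
  return 0;
}
```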
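The `SIISelLowering.cpp` hunk changes what `isLegalAddressingMode` reports for the private address space once flat scratch is enabled: optimizers that query it (e.g. LoopStrengthReduce and CodeGenPrepare) are now told that flat scratch folds no scaled register operand (`AM.Scale == 0`) and only a constant offset that fits the instruction's signed immediate field, which `SIInstrInfo::isLegalFLATOffset` validates per subtarget. A simplified model of the hook, with a hypothetical `OffsetBits` parameter standing in for that per-target query:

```cpp
#include <cstdint>

// Simplified stand-in for TargetLowering::AddrMode (the real struct carries
// more fields, e.g. a GlobalValue base).
struct AddrMode {
  int64_t BaseOffs = 0; // constant byte offset
  bool HasBaseReg = false;
  int64_t Scale = 0;    // factor for a second, scaled register; 0 = none
};

// Does V fit in a Bits-wide signed immediate field?
static bool fitsSignedBits(int64_t V, unsigned Bits) {
  int64_t Lo = -(int64_t(1) << (Bits - 1));
  int64_t Hi = (int64_t(1) << (Bits - 1)) - 1;
  return V >= Lo && V <= Hi;
}

// Model of the new hook: flat scratch folds at most "base register +
// immediate", so any scaled-register mode is illegal and the immediate must
// fit the offset field. OffsetBits is hypothetical; the patch defers to
// SIInstrInfo::isLegalFLATOffset for the subtarget's real width.
static bool isLegalFlatScratchAddrModeModel(const AddrMode &AM,
                                            unsigned OffsetBits) {
  return AM.Scale == 0 && fitsSignedBits(AM.BaseOffs, OffsetBits);
}

int main() {
  AddrMode Reg;          // plain [reg]: legal, no scale, zero offset
  Reg.HasBaseReg = true;
  AddrMode Scaled = Reg;
  Scaled.Scale = 4;      // reg + 4*reg has no flat-scratch encoding
  return !(isLegalFlatScratchAddrModeModel(Reg, 13) &&
           !isLegalFlatScratchAddrModeModel(Scaled, 13));
}
```

Previously, MUBUF addressing rules were reported even with flat scratch enabled, which could let optimizers sink address computations into shapes that flat-scratch instruction selection cannot fold; the newly added `scratch-pointer-sink.ll` test appears to cover that scenario.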
https://github.com/llvm/llvm-project/pull/71494
More information about the llvm-commits mailing list