[llvm] [AMDGPU] Folding imm offset in more cases for scratch access (PR #70634)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 30 01:07:45 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Ruiling, Song (ruiling)
<details>
<summary>Changes</summary>
For scratch load/store, our hardware only accepts non-negative values in SGPR/VGPR. Besides the cases that we can prove from known bits, we can also prove that the value in `base` will be non-negative in two more cases: 1.) When the ADD for the address calculation has the NoUnsignedWrap (`nuw`) flag. 2.) When the immediate offset is already negative.
---
Patch is 105.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70634.diff
8 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+19-5)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll (+32-60)
- (modified) llvm/test/CodeGen/AMDGPU/flat-scratch.ll (+192-226)
- (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+21-30)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+284-308)
- (modified) llvm/test/CodeGen/AMDGPU/memory_clause.ll (+6-12)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b5ceaaa14b4fd5e..691d644badd24b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1146,10 +1146,23 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr, SDValue Base,
uint64_t FlatVariant) const {
if (FlatVariant != SIInstrFlags::FlatScratch)
return true;
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ // For `nuw` addition, we should not have negative base address.
+ if (Addr->getFlags().hasNoUnsignedWrap())
+ return true;
+
+ auto *RHS = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ // If the immediate offset is negative, we should not have the base being
+ // negative as well.
+ if (RHS && RHS->getSExtValue() < 0)
+ return true;
+ }
+
// When value in 32-bit Base can be negative calculate scratch offset using
// 32-bit add instruction, otherwise use Base(unsigned) + offset.
return CurDAG->SignBitIsZero(Base);
@@ -1549,7 +1562,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
- isFlatScratchBaseLegal(N0, FlatVariant)) {
+ isFlatScratchBaseLegal(Addr, N0, FlatVariant)) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1782,7 +1795,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
int64_t COffsetVal = 0;
if (CurDAG->isBaseWithConstantOffset(Addr) &&
- isFlatScratchBaseLegal(Addr.getOperand(0))) {
+ isFlatScratchBaseLegal(Addr, Addr.getOperand(0))) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
@@ -1860,7 +1873,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr, SAddr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
@@ -1886,7 +1899,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr, SAddr) ||
+ !isFlatScratchBaseLegal(Addr, VAddr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index a8a606f60a3faee..8a47757f70bbfbc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -155,7 +155,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
bool isFlatScratchBaseLegal(
- SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ SDValue Addr, SDValue Base,
+ uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
index af023835c529776..329f0a2068cb072 100644
--- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll
@@ -704,11 +704,11 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
; FLATSCR: ; %bb.0: ; %bb
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
-; FLATSCR-NEXT: v_add_u32_e32 v2, 2, v0
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR-NEXT: scratch_load_short_d16 v0, v2, off
+; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
;
; GFX10_DEFAULT-LABEL: chain_hi_to_lo_private_other_dep:
@@ -726,22 +726,22 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(ptr addrspace(5) %ptr) {
; FLATSCR_GFX10: ; %bb.0: ; %bb
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, v0, off
-; FLATSCR_GFX10-NEXT: v_add_nc_u32_e32 v2, 2, v0
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v0, v2, off
+; FLATSCR_GFX10-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR_GFX10-NEXT: scratch_load_short_d16 v1, v0, off offset:2
; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v0, v1
; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: chain_hi_to_lo_private_other_dep:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_d16_hi_b16 v1, v0, off
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_sub_u16 v0, v1, -12 op_sel_hi:[1,0]
-; GFX11-NEXT: scratch_load_d16_b16 v0, v2, off
+; GFX11-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; GFX11-NEXT: scratch_load_d16_b16 v1, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
bb:
%gep_lo = getelementptr inbounds i16, ptr addrspace(5) %ptr, i64 1
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
index c849cf08094e718..ad4d4a4a30fc6d0 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-i8-i16.ll
@@ -11,16 +11,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %ou
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT: scratch_load_ubyte v0, v0, off
+; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT: scratch_load_u8 v0, v0, off
+; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -38,16 +36,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %ou
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX10-NEXT: scratch_load_sbyte v0, v0, off
+; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
-; GFX11-NEXT: scratch_load_i8 v0, v0, off
+; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -65,16 +61,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %o
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT: scratch_load_ushort v0, v0, off
+; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT: scratch_load_u16 v0, v0, off
+; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -92,16 +86,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %o
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX10-NEXT: scratch_load_sshort v0, v0, off
+; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_v:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
-; GFX11-NEXT: scratch_load_i16 v0, v0, off
+; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -359,16 +351,14 @@ define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, p
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 1
-; GFX10-NEXT: scratch_load_ubyte v2, off, s2
+; GFX10-NEXT: scratch_load_ubyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 1
-; GFX11-NEXT: scratch_load_u8 v2, off, s0
+; GFX11-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -386,16 +376,14 @@ define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, p
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 1
-; GFX10-NEXT: scratch_load_sbyte v2, off, s2
+; GFX10-NEXT: scratch_load_sbyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 1
-; GFX11-NEXT: scratch_load_i8 v2, off, s0
+; GFX11-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -413,16 +401,14 @@ define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 2
-; GFX10-NEXT: scratch_load_ushort v2, off, s2
+; GFX10-NEXT: scratch_load_ushort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 2
-; GFX11-NEXT: scratch_load_u16 v2, off, s0
+; GFX11-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -440,16 +426,14 @@ define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: s_add_i32 s2, s2, 2
-; GFX10-NEXT: scratch_load_sshort v2, off, s2
+; GFX10-NEXT: scratch_load_sshort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_s:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s0, s0, 2
-; GFX11-NEXT: scratch_load_i16 v2, off, s0
+; GFX11-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
@@ -713,19 +697,16 @@ define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT: scratch_load_ubyte v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_zext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT: scratch_load_u8 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -745,19 +726,16 @@ define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in,
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
-; GFX10-NEXT: scratch_load_sbyte v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i8_sext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
-; GFX11-NEXT: scratch_load_i8 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -777,19 +755,16 @@ define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT: scratch_load_ushort v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_zext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT: scratch_load_u16 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
@@ -809,19 +784,16 @@ define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
-; GFX10-NEXT: scratch_load_sshort v0, v0, off
+; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_scratch_load_i16_sext_svs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
-; GFX11-NEXT: scratch_load_i16 v0, v0, off
+; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 07b3df2a8520aae..fe984ffa653a5ef 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -576,11 +576,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT: v_add_u32_e32 v1, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 15
-; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
; GFX9-NEXT: scratch_store_dword v1, v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0
-; GFX9-NEXT: scratch_load_dword v0, v0, off glc
+; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
+; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -592,24 +591,22 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 15
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, 4, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
-; GFX10-NEXT: scratch_store_dword v0, v2, off
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, 4, v0
+; GFX10-NEXT: scratch_store_dword v1, v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
+; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: store_load_vindex_kernel:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_nc_u32_e32 v1, 4, v0
-; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
-; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
+; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0
+; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
+; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_endpgm
;
@@ -628,8 +625,7 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0
-; GFX9-PAL-NEXT: scratch_lo...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/70634
More information about the llvm-commits
mailing list