[llvm] 7b9f620 - [AMDGPU] Work around GFX11 flat scratch SVS swizzling bug
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 13 13:03:43 PDT 2022
Author: Jay Foad
Date: 2022-06-13T21:00:42+01:00
New Revision: 7b9f620e78464626fad36e629f5d053892e1cf8c
URL: https://github.com/llvm/llvm-project/commit/7b9f620e78464626fad36e629f5d053892e1cf8c
DIFF: https://github.com/llvm/llvm-project/commit/7b9f620e78464626fad36e629f5d053892e1cf8c.diff
LOG: [AMDGPU] Work around GFX11 flat scratch SVS swizzling bug
Differential Revision: https://reviews.llvm.org/D127635
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
llvm/lib/Target/AMDGPU/GCNSubtarget.h
llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e32d0c861ffb..fc55e29325cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1805,6 +1805,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
return true;
}
+// Check whether the flat scratch SVS swizzle bug affects this access; returns true if it may, false if it cannot.
+bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
+ SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
+ if (!Subtarget->hasFlatScratchSVSSwizzleBug()) // Subtarget without the bug: nothing to check.
+ return false;
+
+ // The bug affects the swizzling of SVS accesses if there is any carry out
+ // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+ // voffset to (soffset + inst_offset). Here VAddr is the voffset, SAddr the
+ // soffset and ImmOffset the inst_offset; the check uses known-bits maxima.
+ KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
+ KnownBits SKnown = KnownBits::computeForAddSub( // Known bits of (soffset + inst_offset).
+ true, false, CurDAG->computeKnownBits(SAddr),
+ KnownBits::makeConstant(APInt(32, ImmOffset))); // ImmOffset is represented as a 32-bit constant.
+ uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+ uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+ return (VMax & 3) + (SMax & 3) >= 4; // True when the low two bits of the two maxima could carry into bit 2.
+}
+
bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
SDValue &VAddr, SDValue &SAddr,
SDValue &Offset) const {
@@ -1832,6 +1850,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
+ if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
+ return false;
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
return true;
}
@@ -1854,6 +1874,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
+ if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
+ return false;
SAddr = SelectSAddrFI(CurDAG, SAddr);
Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 21f97f2e87f2..93d43e17ba79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -188,6 +188,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
SDValue &VOffset, SDValue &Offset) const;
bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &Offset) const;
+ bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
+ uint64_t ImmOffset) const;
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a01582c60897..f20cd8e4a7d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3985,6 +3985,24 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
}};
}
+// Check whether the flat scratch SVS swizzle bug affects this access; returns true if it may, false if it cannot.
+bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
+ Register VAddr, Register SAddr, uint64_t ImmOffset) const {
+ if (!Subtarget->hasFlatScratchSVSSwizzleBug()) // Subtarget without the bug: nothing to check.
+ return false;
+
+ // The bug affects the swizzling of SVS accesses if there is any carry out
+ // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+ // voffset to (soffset + inst_offset). Here VAddr is the voffset, SAddr the
+ // soffset and ImmOffset the inst_offset; the check uses known-bits maxima.
+ auto VKnown = KnownBits->getKnownBits(VAddr); // `KnownBits->` is an object used through a pointer; `KnownBits::` below names the class.
+ auto SKnown = KnownBits::computeForAddSub( // Known bits of (soffset + inst_offset).
+ true, false, KnownBits->getKnownBits(SAddr),
+ KnownBits::makeConstant(APInt(32, ImmOffset))); // ImmOffset is represented as a 32-bit constant.
+ uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+ uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+ return (VMax & 3) + (SMax & 3) >= 4; // True when the low two bits of the two maxima could carry into bit 2.
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register Addr = Root.getReg();
@@ -4013,6 +4031,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register LHS = AddrDef->MI->getOperand(1).getReg();
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
+ if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
+ return None;
+
if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
int FI = LHSDef->MI->getOperand(1).getIndex();
return {{
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 6a101f950abf..dd74a26efdac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -210,6 +210,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectScratchSAddr(MachineOperand &Root) const;
+ bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr,
+ uint64_t ImmOffset) const;
InstructionSelector::ComplexRendererFns
selectScratchSVAddr(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a7102351ae19..4f54e76ccbb6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1044,6 +1044,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasVOPDInsts() const { return HasVOPDInsts; }
+ bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } // Reported only when the generation is exactly GFX11.
+
/// Return true if the target has the S_DELAY_ALU instruction.
bool hasDelayAlu() const { return GFX11Insts; }
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 3dd8dcf0c09f..32297e863a46 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -51,12 +51,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -132,17 +132,17 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
; GFX11-SDAG-LABEL: soff1_voff2:
; GFX11-SDAG: ; %bb.0: ; %bb
; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -311,12 +311,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -395,18 +395,18 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
; GFX11-SDAG-LABEL: soff2_voff2:
; GFX11-SDAG: ; %bb.0: ; %bb
; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2
; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -576,16 +576,17 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
; GFX11-SDAG: ; %bb.0: ; %bb
; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 2
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v2, v3, off offset:2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
@@ -664,18 +665,19 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
; GFX11-SDAG-LABEL: soff4_voff2:
; GFX11-SDAG: ; %bb.0: ; %bb
; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1
; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0
; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4
; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v3, v2, off offset:2 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index a8e97b5f7f49..c49c617bbe8a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4091,11 +4091,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_movk_i32 s0, 0xef7f
-; GFX11-NEXT: scratch_store_b8 v0, v1, s0 dlc
+; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: scratch_load_u8 v0, v0, s0 glc dlc
+; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
@@ -4149,11 +4149,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture
; GFX11-PAL: ; %bb.0: ; %bb
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0
; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-PAL-NEXT: s_movk_i32 s0, 0xef7f
-; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, s0 dlc
+; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc
; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, s0 glc dlc
+; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc
; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
More information about the llvm-commits
mailing list