[llvm] 7b9f620 - [AMDGPU] Work around GFX11 flat scratch SVS swizzling bug

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Mon Jun 13 13:03:43 PDT 2022


Author: Jay Foad
Date: 2022-06-13T21:00:42+01:00
New Revision: 7b9f620e78464626fad36e629f5d053892e1cf8c

URL: https://github.com/llvm/llvm-project/commit/7b9f620e78464626fad36e629f5d053892e1cf8c
DIFF: https://github.com/llvm/llvm-project/commit/7b9f620e78464626fad36e629f5d053892e1cf8c.diff

LOG: [AMDGPU] Work around GFX11 flat scratch SVS swizzling bug

Differential Revision: https://reviews.llvm.org/D127635

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e32d0c861ffb..fc55e29325cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1805,6 +1805,24 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
   return true;
 }
 
+// Return true if the flat scratch SVS swizzle bug may affect this access.
+bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
+    SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
+  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+    return false;
+
+  // The bug affects the swizzling of SVS accesses if adding voffset to
+  // (soffset + inst_offset) carries out of the two low order bits (bit 1 into
+  // bit 2). Use known-bits maxima so any possible carry is treated as a hit.
+  KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
+  KnownBits SKnown = KnownBits::computeForAddSub(
+      true, false, CurDAG->computeKnownBits(SAddr),
+      KnownBits::makeConstant(APInt(32, ImmOffset)));
+  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+  return (VMax & 3) + (SMax & 3) >= 4;
+}
+
 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &SAddr,
                                              SDValue &Offset) const  {
@@ -1832,6 +1850,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
+        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
+          return false;
         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
         return true;
       }
@@ -1854,6 +1874,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
     return false;
   }
 
+  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
+    return false;
   SAddr = SelectSAddrFI(CurDAG, SAddr);
   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
   return true;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 21f97f2e87f2..93d43e17ba79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -188,6 +188,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
                          SDValue &VOffset, SDValue &Offset) const;
   bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                           SDValue &Offset) const;
+  bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
+                                     uint64_t ImmOffset) const;
   bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
                            SDValue &SAddr, SDValue &Offset) const;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index a01582c60897..f20cd8e4a7d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3985,6 +3985,24 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   }};
 }
 
+// Return true if the flat scratch SVS swizzle bug may affect this access.
+bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
+    Register VAddr, Register SAddr, uint64_t ImmOffset) const {
+  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+    return false;
+
+  // The bug affects the swizzling of SVS accesses if adding voffset to
+  // (soffset + inst_offset) carries out of the two low order bits (bit 1 into
+  // bit 2). Use known-bits maxima so any possible carry is treated as a hit.
+  auto VKnown = KnownBits->getKnownBits(VAddr);
+  auto SKnown = KnownBits::computeForAddSub(
+      true, false, KnownBits->getKnownBits(SAddr),
+      KnownBits::makeConstant(APInt(32, ImmOffset)));
+  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+  return (VMax & 3) + (SMax & 3) >= 4;
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   Register Addr = Root.getReg();
@@ -4013,6 +4031,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   Register LHS = AddrDef->MI->getOperand(1).getReg();
   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
 
+  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
+    return None;
+
   if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
     int FI = LHSDef->MI->getOperand(1).getIndex();
     return {{

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 6a101f950abf..dd74a26efdac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -210,6 +210,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
 
   InstructionSelector::ComplexRendererFns
   selectScratchSAddr(MachineOperand &Root) const;
+  bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr,
+                                     uint64_t ImmOffset) const;
   InstructionSelector::ComplexRendererFns
   selectScratchSVAddr(MachineOperand &Root) const;
 

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a7102351ae19..4f54e76ccbb6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1044,6 +1044,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
 
   bool hasVOPDInsts() const { return HasVOPDInsts; }
 
+  bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
+
   /// Return true if the target has the S_DELAY_ALU instruction.
   bool hasDelayAlu() const { return GFX11Insts; }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
index 3dd8dcf0c09f..32297e863a46 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll
@@ -51,12 +51,12 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -132,17 +132,17 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) {
 ; GFX11-SDAG-LABEL: soff1_voff2:
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -311,12 +311,12 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) {
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -395,18 +395,18 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) {
 ; GFX11-SDAG-LABEL: soff2_voff2:
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 2
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT:    v_add3_u32 v0, 4, s0, v0
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, off offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -576,16 +576,17 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) {
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 2
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    v_add3_u32 v2, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v1, off offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v2, v3, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v4, s0 offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -664,18 +665,19 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) {
 ; GFX11-SDAG-LABEL: soff4_voff2:
 ; GFX11-SDAG:       ; %bb.0: ; %bb
 ; GFX11-SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x24
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v1, 1
 ; GFX11-SDAG-NEXT:    v_mov_b32_e32 v2, 2
-; GFX11-SDAG-NEXT:    v_mov_b32_e32 v3, 4
+; GFX11-SDAG-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-SDAG-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX11-SDAG-NEXT:    v_add3_u32 v3, 4, s0, v0
 ; GFX11-SDAG-NEXT:    s_add_i32 s0, s0, 4
 ; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v1, s0 offset:1 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v2, s0 offset:2 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v3, v2, off offset:2 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v3, s0 offset:4 dlc
+; GFX11-SDAG-NEXT:    scratch_store_b8 v0, v4, s0 offset:4 dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index a8e97b5f7f49..c49c617bbe8a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -4091,11 +4091,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-NEXT:    s_movk_i32 s0, 0xef7f
-; GFX11-NEXT:    scratch_store_b8 v0, v1, s0 dlc
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_u8 v0, v0, s0 glc dlc
+; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4149,11 +4149,11 @@ define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
-; GFX11-PAL-NEXT:    s_movk_i32 s0, 0xef7f
-; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, s0 dlc
+; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, s0 glc dlc
+; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
 bb:


        


More information about the llvm-commits mailing list