[llvm] 442de0c - AMDGPU: Correct const_index_stride for wave 32 for PAL ABI

Fri May 7 04:28:53 PDT 2021

Author: David Stuttard
Date: 2021-05-07T12:19:49+01:00
New Revision: 442de0c1adf36bfddb5fb66b442bba8999fa733b

URL: https://github.com/llvm/llvm-project/commit/442de0c1adf36bfddb5fb66b442bba8999fa733b
DIFF: https://github.com/llvm/llvm-project/commit/442de0c1adf36bfddb5fb66b442bba8999fa733b.diff

LOG: AMDGPU: Correct const_index_stride for wave 32 for PAL ABI

Since there is a single scratch resource descriptor for all shaders, if there is
a wave32 and a wave64 shader (for instance for VsFs pairs)
then the const_index_stride will be incorrect for wave32 shaders.

Differential Revision: https://reviews.llvm.org/D101830

Change-Id: Id8de5566b0d1a07a814e2e7db016df9d20bf6d2c

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index c8ac34dc1523..acb25d0bb951 100644

--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -525,6 +525,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
     // The pointer to the GIT is formed from the offset passed in and either
     // the amdgpu-git-ptr-high function attribute or the top part of the PC
     Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
 
     buildGitPtr(MBB, I, DL, TII, Rsrc01);
 
@@ -546,6 +547,20 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
       .addImm(0) // cpol
       .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
       .addMemOperand(MMO);
+
+    // The driver will always set the SRD for wave 64 (bits 118:117 of
+    // descriptor / bits 22:21 of third sub-reg will be 0b11)
+    // If the shader is actually wave32 we have to modify the const_index_stride
+    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
+    // reason the driver does this is that there can be cases where it presents
+    // 2 shaders with 
diff erent wave size (e.g. VsFs).
+    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
+    if (ST.isWave32()) {
+      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
+      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
+          .addImm(21)
+          .addReg(Rsrc03, RegState::ImplicitDefine);
+    }
   } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
     assert(!ST.isAmdHsaOrMesa(Fn));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

diff  --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index d1b826b70145..4a0f31829ce0 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -3,7 +3,8 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features  %s | FileCheck -check-prefix=GCN %s
 
 ; Check that it doesn't crash
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
 
 target datalayout = "A5"
 
@@ -13,8 +14,8 @@ define amdgpu_cs void @test_simple_indirect_call() {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_getpc_b64 s[36:37]
 ; GFX9-NEXT:    s_mov_b32 s36, s0
-; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x10
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_mov_b32 s32, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_add_u32 s36, s36, s0
@@ -23,6 +24,23 @@ define amdgpu_cs void @test_simple_indirect_call() {
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: test_simple_indirect_call:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_getpc_b64 s[36:37]
+; GFX10-NEXT:    s_mov_b32 s36, s0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x10
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_bitset0_b32 s39, 21
+; GFX10-NEXT:    s_mov_b32 s32, 0
+; GFX10-NEXT:    s_add_u32 s36, s36, s0
+; GFX10-NEXT:    s_addc_u32 s37, s37, 0
+; GFX10-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX10-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX10-NEXT:    s_endpgm
+
 
   %pc = call i64 @llvm.amdgcn.s.getpc()
   %fun = inttoptr i64 %pc to void()*