[llvm] 976f3b3 - [AMDGPU] Only allow implicit WQM in pixel shaders

Carl Ritson via llvm-commits <llvm-commits@lists.llvm.org>
Wed Nov 24 03:05:23 PST 2021


Author: Carl Ritson
Date: 2021-11-24T20:04:42+09:00
New Revision: 976f3b3c9eba0835d5ab7d191bd2e88ceda86ebe

URL: https://github.com/llvm/llvm-project/commit/976f3b3c9eba0835d5ab7d191bd2e88ceda86ebe
DIFF: https://github.com/llvm/llvm-project/commit/976f3b3c9eba0835d5ab7d191bd2e88ceda86ebe.diff

LOG: [AMDGPU] Only allow implicit WQM in pixel shaders

Implicit derivatives are only valid in pixel shaders, so whole
quad mode (WQM) is now enabled implicitly for pixel shaders only.
This avoids unintended WQM in other shader types (e.g. compute)
when image sampling instructions are used.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D114414
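
To illustrate the behavioral change, here is a small example constructed
for this note (function and value names are hypothetical, not taken from
the patch). A compute entry point that uses an image sampling intrinsic,
like the one below, previously caused SIWholeQuadMode to save exec and
switch it to whole quad mode around the sample (the s_mov_b64/s_wqm_b64
pair removed from memory_clause.ll in the diff below); with this change,
no WQM is inserted for any calling convention other than AMDGPU_PS:

    ; Hypothetical example: image sampling in a compute shader.
    ; Before this patch the pass wrapped the sample in WQM
    ; (s_wqm_b64 exec, exec); after it, exec is left untouched,
    ; because amdgpu_cs has no implicit derivatives.
    define amdgpu_cs void @sample_in_cs(<8 x i32> inreg %rsrc,
                                        <4 x i32> inreg %samp,
                                        float %s, float %t,
                                        <4 x float> addrspace(1)* %out) {
      %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15,
               float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp,
               i1 false, i32 0, i32 0)
      store <4 x float> %v, <4 x float> addrspace(1)* %out
      ret void
    }

    declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(
        i32 immarg, float, float, <8 x i32>, <4 x i32>,
        i1 immarg, i32 immarg, i32 immarg)

The new no_wqm_in_* tests added to wqm.mir below check the same property
for each non-pixel-shader calling convention.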

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/test/CodeGen/AMDGPU/memory_clause.ll
    llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
    llvm/test/CodeGen/AMDGPU/wqm.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 582f6b272df3b..46012e5d7d978 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -487,6 +487,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
+  bool HasImplicitDerivatives =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -506,6 +508,11 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         // If LOD is not supported WQM is not needed.
         if (!ST->hasExtendedImageInsts())
           continue;
+        // Only generate implicit WQM if implicit derivatives are required.
+        // This avoids inserting unintended WQM if a shader type without
+        // implicit derivatives uses an image sampling instruction.
+        if (!HasImplicitDerivatives)
+          continue;
         // Sampling instructions don't need to produce results for all pixels
         // in a quad, they just require all inputs of a quad to have been
         // computed for derivatives.

diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index ab39fae82d37e..c976d58e2dad6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -391,27 +391,25 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
 ; GCN-NEXT:    s_mov_b32 s16, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT:    s_mov_b32 s17, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT:    s_mov_b32 s18, -1
+; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT:    s_mov_b32 s19, 0xe00000
 ; GCN-NEXT:    s_add_u32 s16, s16, s3
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
-; GCN-NEXT:    s_mov_b64 s[12:13], exec
-; GCN-NEXT:    s_wqm_b64 exec, exec
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x40b00000
-; GCN-NEXT:    s_load_dwordx2 s[14:15], s[0:1], 0x24
-; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT:    buffer_store_dword v0, off, s[16:19], 0 offset:4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    ;;#ASMSTART
-; GCN-NEXT:    ;;#ASMEND
-; GCN-NEXT:    buffer_load_dword v2, off, s[16:19], 0 offset:4
 ; GCN-NEXT:    s_brev_b32 s0, 1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s14
+; GCN-NEXT:    v_mov_b32_e32 v0, s12
 ; GCN-NEXT:    s_mov_b32 s3, 0
 ; GCN-NEXT:    s_mov_b32 s1, s0
 ; GCN-NEXT:    s_mov_b32 s2, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s15
-; GCN-NEXT:    s_and_b64 exec, exec, s[12:13]
+; GCN-NEXT:    v_mov_b32_e32 v1, s13
+; GCN-NEXT:    ;;#ASMSTART
+; GCN-NEXT:    ;;#ASMEND
+; GCN-NEXT:    buffer_load_dword v2, off, s[16:19], 0 offset:4
+; GCN-NEXT:    s_nop 0
 ; GCN-NEXT:    image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_add_f32_e32 v0, v2, v0
@@ -424,25 +422,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
 ; GCN-SCRATCH-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GCN-SCRATCH-NEXT:    s_mov_b32 s9, exec_lo
-; GCN-SCRATCH-NEXT:    s_wqm_b32 exec_lo, exec_lo
 ; GCN-SCRATCH-NEXT:    s_clause 0x1
 ; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x24
 ; GCN-SCRATCH-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x44
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x40b00000
 ; GCN-SCRATCH-NEXT:    s_brev_b32 s8, 1
+; GCN-SCRATCH-NEXT:    s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT:    scratch_store_dword off, v0, off offset:4
 ; GCN-SCRATCH-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GCN-SCRATCH-NEXT:    ;;#ASMSTART
 ; GCN-SCRATCH-NEXT:    ;;#ASMEND
+; GCN-SCRATCH-NEXT:    scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v0, s10
 ; GCN-SCRATCH-NEXT:    v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT:    s_and_b32 exec_lo, exec_lo, s9
 ; GCN-SCRATCH-NEXT:    s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT:    s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT:    s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT:    scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT:    image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GCN-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT:    v_add_f32_e32 v0, v2, v0

diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
index 447dd9a7eeed0..a499159907d2e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll
@@ -9,19 +9,22 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 
 ; GFX9-LABEL: non_preserved_vgpr_tuple8:
 ; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 
 ; GFX9: v_mov_b32_e32 v36, v16
 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
 ; GFX9-NEXT: v_mov_b32_e32 v33, v13
 ; GFX9-NEXT: v_mov_b32_e32 v32, v12
+
+; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX9: ;;#ASMSTART
 ; GFX9-NEXT: ;;#ASMEND
 ; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -39,10 +42,6 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ;
 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 
 ; GFX10: v_mov_b32_e32 v36, v16
 ; GFX10-NEXT: v_mov_b32_e32 v35, v15
@@ -50,10 +49,16 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: v_mov_b32_e32 v33, v13
 ; GFX10-NEXT: v_mov_b32_e32 v32, v12
 
+; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX10: ;;#ASMSTART
 ; GFX10-NEXT: ;;#ASMEND
 
 ; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_addk_i32 s32, 0x400
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
@@ -100,6 +105,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: v_mov_b32_e32 v41, v12
 
 ; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -133,12 +139,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v41, v16
+; GFX10-NEXT: v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v42, v15
-; GFX10-NEXT: v_mov_b32_e32 v43, v14
-; GFX10-NEXT: v_mov_b32_e32 v44, v13
-; GFX10-NEXT: v_mov_b32_e32 v45, v12
+; GFX10-NEXT: v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)

diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index 36e7c21c1f938..4d0b9320f7be4 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -1,5 +1,48 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-wqm -o -  %s | FileCheck %s
 
+--- |
+  define amdgpu_ps void @test_strict_wwm_scc() {
+    ret void
+  }
+  define amdgpu_ps void @test_strict_wwm_scc2() {
+    ret void
+  }
+  define amdgpu_ps void @no_cfg() {
+    ret void
+  }
+  define amdgpu_ps void @copy_exec() {
+    ret void
+  }
+  define amdgpu_ps void @scc_always_live() {
+    ret void
+  }
+  define amdgpu_ps void @test_wwm_set_inactive_propagation() {
+    ret void
+  }
+  define amdgpu_ps void @test_wqm_lr_phi() {
+    ret void
+  }
+  define amdgpu_cs void @no_wqm_in_cs() {
+    ret void
+  }
+  define amdgpu_es void @no_wqm_in_es() {
+    ret void
+  }
+  define amdgpu_gs void @no_wqm_in_gs() {
+    ret void
+  }
+  define amdgpu_hs void @no_wqm_in_hs() {
+    ret void
+  }
+  define amdgpu_ls void @no_wqm_in_ls() {
+    ret void
+  }
+  define amdgpu_vs void @no_wqm_in_vs() {
+    ret void
+  }
+...
+---
+
 ---
 # Check for awareness that s_or_saveexec_b64 clobbers SCC
 #
@@ -298,3 +341,105 @@ body:             |
     $vgpr1 = COPY %4.sub1:vreg_128
     SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
 ...
+
+---
+#CHECK-LABEL: name: no_wqm_in_cs
+#CHECK-NOT: S_WQM
+name:            no_wqm_in_cs
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_es
+#CHECK-NOT: S_WQM
+name:            no_wqm_in_es
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_gs
+#CHECK-NOT: S_WQM
+name:            no_wqm_in_gs
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_hs
+#CHECK-NOT: S_WQM
+name:            no_wqm_in_hs
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_ls
+#CHECK-NOT: S_WQM
+name:            no_wqm_in_ls
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_vs
+#CHECK-NOT: S_WQM
+name:            no_wqm_in_vs
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...

