[PATCH] D124981: [AMDGPU] Enable WQM if demotes and softwqm are combined

Wed May 4 22:56:21 PDT 2022

critson created this revision.
critson added reviewers: foad, ruiling, piotr.
Herald added subscribers: jsilvanus, hsmhsm, kerbowa, hiraditya, t-tye, tpr, dstuttard, yaxunl, nhaehnle, jvesely, kzhuravl, arsenm.
Herald added a project: All.
critson requested review of this revision.
Herald added subscribers: llvm-commits, wdng.
Herald added a project: LLVM.

Demotes may be used to explicitly create helper invocations.
These helper invocations may be intended to have observable effects
in WQM, e.g. in fragment shader subgroup operations.
Facilitate this behaviour by forcing softwqm operations to be run
in WQM when demotes are present in a shader.
Conversely this allows such operations to be marked softwqm so
helper lanes are only enabled if demotes are present.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D124981

Files:
  llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll


Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
===================================================================

--- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
+++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.softwqm.ll
@@ -200,12 +200,30 @@
   ret float %r
 }
 
+; Check that WQM is triggered for softwqm with demote.
+;
+;CHECK-LABEL: {{^}}test_demote_1:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_demote_1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
+  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
+  %c1 = fcmp oge float %src0, 0.0
+  call void @llvm.amdgcn.wqm.demote(i1 %c1)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
+  ret float %out.0
+}
+
 declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
 declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32 immarg) #3
 declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
-declare void @llvm.amdgcn.kill(i1) #1
+declare void @llvm.amdgcn.wqm.demote(i1) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare float @llvm.amdgcn.softwqm.f32(float) #3
 declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
Index: llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -489,6 +489,7 @@
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
   bool HasImplicitDerivatives =
       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
+  bool HasDemotes = false;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -573,6 +574,8 @@
                    Opcode == AMDGPU::SI_DEMOTE_I1) {
           KillInstrs.push_back(&MI);
           BBI.NeedsLowering = true;
+          if (Opcode == AMDGPU::SI_DEMOTE_I1)
+            HasDemotes = true;
         } else if (WQMOutputs) {
           // The function is in machine SSA form, which means that physical
           // VGPRs correspond to shader inputs and outputs. Inputs are
@@ -601,6 +604,12 @@
     }
   }
 
+  // Demotes may be used to intentionally introduce new helper lanes.
+  // Enable WQM to facilitate this effect if there are operations which
+  // would change behaviour when run in WQM, i.e. SOFT_WQM instructions.
+  if (HasDemotes && !SoftWQMInstrs.empty())
+    GlobalFlags |= StateWQM;
+
   // Mark sure that any SET_INACTIVE instructions are computed in WQM if WQM is
   // ever used anywhere in the function. This implements the corresponding
   // semantics of @llvm.amdgcn.set.inactive.


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D124981.427200.patch
Type: text/x-patch
Size: 3235 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20220505/b11b7c1a/attachment.bin>