[PATCH] D20839: AMDGPU: Add amdgpu-ps-wqm-outputs function attributes

Wed Jun 1 03:50:41 PDT 2016

nhaehnle updated this revision to Diff 59195.
nhaehnle marked an inline comment as done.
nhaehnle added a comment.

Instead of looking for copies, look for defs of physical VGPRs. Since this
is before register allocation, the only physical VGPRs should be shader
inputs and outputs, and we never overwrite the inputs, so that should do the
right thing.

Marek, the point of the comment is that even when the main part uses
derivatives, it's probably not very common to use derivatives of gl_Color at
least, and in that case the prolog still wouldn't need WQM (per-sample
interpolants are different).

But as you say, PS prologs aren't very common, the additional number of
instructions is small, and memory accesses are a non-issue - that's why I
didn't bother to do anything more involved.


http://reviews.llvm.org/D20839

Files:
  lib/Target/AMDGPU/SIWholeQuadMode.cpp
  test/CodeGen/AMDGPU/wqm.ll

Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================

--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -332,6 +332,19 @@
   ret <4 x float> %tex
 }
 
+; Check prolog shaders.
+;
+; CHECK-LABEL: {{^}}test_prolog_1:
+; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
+; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_add_f32_e32 v0,
+; CHECK: s_and_b64 exec, exec, [[ORIG]]
+define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
+main_body:
+  %s = fadd float %a, %b
+  ret float %s
+}
+
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 
 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
@@ -345,3 +358,4 @@
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
 attributes #3 = { nounwind readnone }
+attributes #4 = { "amdgpu-ps-wqm-outputs" }
Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp
===================================================================
--- lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -154,14 +154,15 @@
 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
+  bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
 
   for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {
     MachineBasicBlock &MBB = *BI;
 
     for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
       MachineInstr &MI = *II;
       unsigned Opcode = MI.getOpcode();
-      char Flags;
+      char Flags = 0;
 
       if (TII->isWQM(Opcode) || TII->isDS(Opcode)) {
         Flags = StateWQM;
@@ -175,15 +176,39 @@
             ExecExports.push_back(&MI);
         } else if (Opcode == AMDGPU::SI_PS_LIVE) {
           LiveMaskQueries.push_back(&MI);
+        } else if (WQMOutputs) {
+          // The function is in machine SSA form, which means that physical
+          // VGPRs correspond to shader inputs and outputs. Inputs are
+          // only used, outputs are only defined.
+          for (const MachineOperand &MO : MI.defs()) {
+            if (!MO.isReg())
+              continue;
+
+            unsigned Reg = MO.getReg();
+
+            if (!TRI->isVirtualRegister(Reg) &&
+                TRI->hasVGPRs(TRI->getPhysRegClass(Reg))) {
+              Flags = StateWQM;
+              break;
+            }
+          }
         }
 
-        continue;
+        if (!Flags)
+          continue;
       }
 
       Instructions[&MI].Needs = Flags;
       Worklist.push_back(&MI);
       GlobalFlags |= Flags;
     }
+
+    if (WQMOutputs && MBB.succ_empty()) {
+      // This is a prolog shader. Make sure we go back to exact mode at the end.
+      Blocks[&MBB].OutNeeds = StateExact;
+      Worklist.push_back(&MBB);
+      GlobalFlags |= StateExact;
+    }
   }
 
   return GlobalFlags;


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D20839.59195.patch
Type: text/x-patch
Size: 2965 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20160601/ab94b84d/attachment.bin>