[llvm] 0462795 - [AMDGPU] Propagate AGPR RC from PHI to its PHI operands

Fri Apr 3 11:23:36 PDT 2020

Author: Stanislav Mekhanoshin
Date: 2020-04-03T11:23:02-07:00
New Revision: 0462795095e57d9cac3c68ff1f28bc4bef8c4865

URL: https://github.com/llvm/llvm-project/commit/0462795095e57d9cac3c68ff1f28bc4bef8c4865
DIFF: https://github.com/llvm/llvm-project/commit/0462795095e57d9cac3c68ff1f28bc4bef8c4865.diff

LOG: [AMDGPU] Propagate AGPR RC from PHI to its PHI operands

We can fix the register class of a PHI when all of its uses are AGPRs.
However, that leaves behind any PHIs which feed it and were already
processed earlier. Propagate the RC back to those operands of the PHI
that are themselves PHIs.

Differential Revision: https://reviews.llvm.org/D77344

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
    llvm/test/CodeGen/AMDGPU/mfma-loop.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f067e96b8aca..7283c33fe985 100644

--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -766,6 +766,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   bool AllAGPRUses = true;
   SetVector<const MachineInstr *> worklist;
   SmallSet<const MachineInstr *, 4> Visited;
+  SetVector<MachineInstr *> PHIOperands;
   worklist.insert(&MI);
   Visited.insert(&MI);
   while (!worklist.empty()) {
@@ -810,6 +811,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
     LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
     MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+    for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+      MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
+      if (DefMI && DefMI->isPHI())
+        PHIOperands.insert(DefMI);
+    }
   }
 
   bool hasVGPRInput = false;
@@ -845,4 +851,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
     TII->legalizeOperands(MI, MDT);
   }
 
+  // Propagate register class back to PHI operands which are PHI themselves.
+  while (!PHIOperands.empty()) {
+    processPHINode(*PHIOperands.pop_back_val());
+  }
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index b66c9d41446f..9b5438c5e91d 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -487,5 +487,50 @@ exit:
   ret void
 }
 
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+
+; Check that we do not copy agprs to vgprs and back in an outer loop.
+
+; GCN: [[OUTER_LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: [[INNER_LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[INNER_LOOP]]
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
+
+; Final result should be read only once after the loop.
+
+; GCN-COUNT-32: v_accvgpr_read_b32
+
+define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi.0 = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %inner.exit ]
+  %c.0 = phi i32 [ 0, %entry ], [ %inc.0, %inner.exit ]
+  br label %inner.for.cond.preheader
+
+inner.for.cond.preheader:
+  %phi = phi <32 x float> [ %phi.0, %for.cond.preheader ], [ %mai.1, %inner.for.cond.preheader ]
+  %c = phi i32 [ 0, %for.cond.preheader ], [ %inc, %inner.for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %inner.exit, label %inner.for.cond.preheader
+
+inner.exit:
+  %inc.0 = add nuw nsw i32 %c.0, 1
+  %cc.0 = icmp eq i32 %inc.0, 16
+  br i1 %cc.0, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
 declare i32 @llvm.amdgcn.workitem.id.x()