[llvm] d4303b3 - [AMDGPU] Fold AGPR reg_sequence initializers

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 25 11:39:18 PDT 2019


Author: Stanislav Mekhanoshin
Date: 2019-10-25T11:39:02-07:00
New Revision: d4303b38616cada612cda70c2506c4ac70f66f45

URL: https://github.com/llvm/llvm-project/commit/d4303b38616cada612cda70c2506c4ac70f66f45
DIFF: https://github.com/llvm/llvm-project/commit/d4303b38616cada612cda70c2506c4ac70f66f45.diff

LOG: [AMDGPU] Fold AGPR reg_sequence initializers

Differential Revision: https://reviews.llvm.org/D69413
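
In short: v_accvgpr_write_b32 accepts only a VGPR or an inline immediate, so
immediate reg_sequence initializers used to reach AGPRs through intermediate
VGPR moves and a cross-class copy. The fold rebuilds the reg_sequence out of
v_accvgpr_write_b32 results instead, rematerializing the immediates directly
into AGPRs. A rough before/after sketch in pseudo-MIR (illustrative only;
register names and operand details elided) for a splat zero initializer:

  ; before: immediate reaches the AGPR tuple through a VGPR and a copy
  %v:vgpr_32   = V_MOV_B32_e32 0, implicit $exec
  %s:vreg_1024 = REG_SEQUENCE %v, %subreg.sub0, ..., %v, %subreg.sub31
  %a:areg_1024 = COPY %s

  ; after: each inline immediate is rematerialized into an AGPR
  %a0:agpr_32  = V_ACCVGPR_WRITE_B32 0, implicit $exec
  ...
  %a31:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec
  %a:areg_1024 = REG_SEQUENCE %a0, %subreg.sub0, ..., %a31, %subreg.sub31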

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/test/CodeGen/AMDGPU/mfma-loop.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 7cf5f802c099..a141a22c53fe 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -14,6 +14,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -441,6 +442,42 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }
 
+// Find a def of the UseReg, check if it is a reg_sequence and find the
+// initializer for each subreg, tracking it down to a foldable inline
+// immediate if possible. Returns true on success.
+static bool getRegSeqInit(
+    SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
+    Register UseReg, uint8_t OpTy,
+    const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
+  MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return false;
+
+  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+    MachineOperand *Sub = &Def->getOperand(I);
+    assert(Sub->isReg());
+
+    for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
+         SubDef && Sub->isReg() && !Sub->getSubReg() &&
+         TII->isFoldableCopy(*SubDef);
+         SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
+      MachineOperand *Op = &SubDef->getOperand(1);
+      if (Op->isImm()) {
+        if (TII->isInlineConstant(*Op, OpTy))
+          Sub = Op;
+        break;
+      }
+      if (!Op->isReg())
+        break;
+      Sub = Op;
+    }
+
+    Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
+  }
+
+  return true;
+}
+
 static bool tryToFoldACImm(const SIInstrInfo *TII,
                            const MachineOperand &OpToFold,
                            MachineInstr *UseMI,
@@ -474,39 +511,30 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
     return false;
 
   MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
-  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
-  if (!Def || !Def->isRegSequence())
+  SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+  if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
     return false;
 
-  int64_t Imm;
-  MachineOperand *Op;
-  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
-    const MachineOperand &Sub = Def->getOperand(I);
-    if (!Sub.isReg() || Sub.getSubReg())
+  int32_t Imm;
+  for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+    const MachineOperand *Op = Defs[I].first;
+    if (!Op->isImm())
       return false;
-    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
-    while (SubDef && !SubDef->isMoveImmediate() &&
-           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
-      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
-    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
-      return false;
-    Op = &SubDef->getOperand(1);
+
     auto SubImm = Op->getImm();
-    if (I == 1) {
-      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+    if (!I) {
+      Imm = SubImm;
+      if (!TII->isInlineConstant(*Op, OpTy) ||
+          !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
         return false;
 
-      Imm = SubImm;
       continue;
     }
     if (Imm != SubImm)
       return false; // Can only fold splat constants
   }
 
-  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
-    return false;
-
-  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Op);
+  appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
   return true;
 }
 
@@ -645,11 +673,92 @@ void SIFoldOperands::foldOperand(
       LLVM_DEBUG(dbgs() << "Folding " << OpToFold
                         << "\n into " << *UseMI << '\n');
       unsigned Size = TII->getOpSize(*UseMI, 1);
-      UseMI->getOperand(1).setReg(OpToFold.getReg());
+      Register UseReg = OpToFold.getReg();
+      UseMI->getOperand(1).setReg(UseReg);
       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
       UseMI->getOperand(1).setIsKill(false);
       CopiesToReplace.push_back(UseMI);
       OpToFold.setIsKill(false);
+
+      // It is very tricky to store a value into an AGPR. v_accvgpr_write_b32
+      // can only accept VGPR or inline immediate. Recreate a reg_sequence with
+      // its initializers right here, so we will rematerialize immediates and
+      // avoid copies via different reg classes.
+      SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+      if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+          getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
+                        *MRI)) {
+        const DebugLoc &DL = UseMI->getDebugLoc();
+        MachineBasicBlock &MBB = *UseMI->getParent();
+
+        UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+        for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
+          UseMI->RemoveOperand(I);
+
+        MachineInstrBuilder B(*MBB.getParent(), UseMI);
+        DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+        SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+        for (unsigned I = 0; I < Size / 4; ++I) {
+          MachineOperand *Def = Defs[I].first;
+          TargetInstrInfo::RegSubRegPair CopyToVGPR;
+          if (Def->isImm() &&
+              TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+            int64_t Imm = Def->getImm();
+
+            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+            BuildMI(MBB, UseMI, DL,
+                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addImm(Imm);
+            B.addReg(Tmp);
+          } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+            auto Src = getRegSubRegPair(*Def);
+            Def->setIsKill(false);
+            if (!SeenAGPRs.insert(Src)) {
+              // We cannot build a reg_sequence out of the same registers; they
+              // must be copied. Better do it here before copyPhysReg() creates
+              // several reads to do the AGPR->VGPR->AGPR copy.
+              CopyToVGPR = Src;
+            } else {
+              B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
+                       Src.SubReg);
+            }
+          } else {
+            assert(Def->isReg());
+            Def->setIsKill(false);
+            auto Src = getRegSubRegPair(*Def);
+
+            // A direct copy from SGPR to AGPR is not possible. To avoid
+            // creation of exploded SGPR->VGPR->AGPR copies in copyPhysReg()
+            // later, create a copy here and track if we already have one.
+            if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+              CopyToVGPR = Src;
+            } else {
+              auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+              B.addReg(Tmp);
+            }
+          }
+
+          if (CopyToVGPR.Reg) {
+            Register Vgpr;
+            if (VGPRCopies.count(CopyToVGPR)) {
+              Vgpr = VGPRCopies[CopyToVGPR];
+            } else {
+              Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+              BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+              VGPRCopies[CopyToVGPR] = Vgpr;
+            }
+            auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+            BuildMI(MBB, UseMI, DL,
+                    TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addReg(Vgpr);
+            B.addReg(Tmp);
+          }
+
+          B.addImm(Defs[I].second);
+        }
+        LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n');
+        return;
+      }
+
       if (Size != 4)
         return;
       if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&

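When an initializer is not a foldable immediate the write still needs a VGPR
source: a direct SGPR->AGPR copy is illegal, so the code above makes a single
SGPR->VGPR copy, caches it in VGPRCopies, and reuses it for every lane. A
rough pseudo-MIR sketch (illustrative only; %init is a placeholder for the
SGPR initializer):

  %v:vgpr_32     = COPY %init                ; one SGPR->VGPR copy, reused
  %a0:agpr_32    = V_ACCVGPR_WRITE_B32 %v, implicit $exec
  ...
  %a31:agpr_32   = V_ACCVGPR_WRITE_B32 %v, implicit $exec
  %acc:areg_1024 = REG_SEQUENCE %a0, %subreg.sub0, ..., %a31, %subreg.sub31
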
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 90c64763899c..57dd31316e14 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,53 +1,115 @@
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
 
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+
+; Check that we do not copy agprs to vgprs and back inside the loop.
+
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
+
+; Final result should be read only once after the loop.
+
+; GCN-COUNT-32: v_accvgpr_read_b32
+
+define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
+
 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
 ; 3 vgprs are needed to avoid wait states between writes.
+; Check that we do not use 32 temp sgprs either.
+
+; GCN:          v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
+
+; GCN-COUNT-32: v_accvgpr_read_b32
+
+define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
 
-; FIXME: We should not be using and temporary registers at all.
-; At the moment we initialize an sgpr, then copy it via vgprs.
+for.cond.preheader:
+  %phi = phi <32 x float> [ <float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0, float 123.0>, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GCN:          v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+; GCN:          v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
+; GCN-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GCN-COUNT-32: v_accvgpr_read_b32
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+define amdgpu_kernel void @test_mfma_loop_non_splat(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+for.cond.preheader:
+  %phi = phi <32 x float> [ <float 0.0, float 1.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
 
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
+; 3 vgprs are needed to avoid wait states between writes.
 
-; Check that we do not copy agprs to vgprs and back inside the loop.
+; GCN: v_mov_b32_e32 [[TMP1:v[0-9]+]], 0x42f60000
+; GCN: v_mov_b32_e32 [[TMP2:v[0-9]+]], 0x42f80000
+; GCN: v_mov_b32_e32 [[TMP3:v[0-9]+]], 0x42fe0000
+; GCN-COUNT-29: v_mov_b32_e32 v{{[0-9]+}}, 0x4{{[0-9a-f]+}}
+; GCN-COUNT-10: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN-COUNT-11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN-COUNT-11: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
 
 ; GCN: [[LOOP:BB[0-9_]+]]:
 ; GCN-NOT: v_accvgpr
@@ -55,16 +117,179 @@
 ; GCN-NOT: v_accvgpr
 ; GCN: s_cbranch_scc1 [[LOOP]]
 
-; Final result should be read only once after the loop.
+; GCN-COUNT-32: v_accvgpr_read_b32
+
+define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(<32 x float> addrspace(1)* %arg) {
+entry:
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ <float 123.0, float 124.0, float 125.0, float 126.0, float 127.0, float 128.0, float 129.0, float 130.0, float 131.0, float 132.0, float 133.0, float 134.0, float 135.0, float 136.0, float 137.0, float 138.0, float 139.0, float 140.0, float 141.0, float 142.0, float 143.0, float 144.0, float 145.0, float 146.0, float 147.0, float 148.0, float 149.0, float 150.0, float 151.0, float 152.0, float 153.0, float 154.0>, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
+
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
+
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
 
 ; GCN-COUNT-32: v_accvgpr_read_b32
 
-define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
+define amdgpu_kernel void @test_mfma_loop_vgpr_init(<32 x float> addrspace(1)* %arg) {
 entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %init = bitcast i32 %tid to float
+  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
+  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
+  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
+  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
+  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
+  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
+  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
+  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
+  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
+  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
+  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
+  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
+  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
+  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
+  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
+  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
+  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
+  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
+  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
+  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
+  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
+  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
+  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
+  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
+  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
+  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
+  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
+  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
+  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
+  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
+  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
+  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
+
   br label %for.cond.preheader
 
 for.cond.preheader:
-  %phi = phi <32 x float> [ zeroinitializer, %entry ], [ %mai.1, %for.cond.preheader ]
+  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
+
+; GCN:          v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
+
+; GCN-COUNT-32: v_accvgpr_read_b32
+
+define amdgpu_kernel void @test_mfma_loop_sgpr_init(<32 x float> addrspace(1)* %arg, float %init) {
+entry:
+  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
+  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
+  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
+  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
+  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
+  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
+  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
+  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
+  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
+  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
+  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
+  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
+  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
+  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
+  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
+  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
+  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
+  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
+  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
+  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
+  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
+  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
+  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
+  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
+  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
+  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
+  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
+  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
+  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
+  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
+  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
+  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
+
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, v0
+; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
+
+; GCN-COUNT-30: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
+
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
+
+; GCN-COUNT-32: v_accvgpr_read_b32
+
+define amdgpu_kernel void @test_mfma_loop_mixed_init(<32 x float> addrspace(1)* %arg, float %x) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %init = bitcast i32 %tid to float
+  %tmp0 = insertelement <32 x float> zeroinitializer, float %init, i32 0
+  %tmp1 = insertelement <32 x float> %tmp0, float %x, i32 1
+
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ %tmp1, %entry ], [ %mai.1, %for.cond.preheader ]
   %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
   %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
   %inc = add nuw nsw i32 %c, 1
@@ -91,8 +316,6 @@ exit:
 ; GCN-COUNT-32: v_accvgpr_read_b32
 define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(<32 x float> addrspace(1)* %arg) {
 entry:
-  %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %init = bitcast i32 %tid to float
   %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
 
   br label %for.cond.preheader
@@ -110,5 +333,74 @@ exit:
   ret void
 }
 
+; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
+
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
+; GCN: v_mfma_f32_32x32x1f32
+
+; Check that we are using only one tmp VGPR.
+
+; GCN: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
+; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]{{$}}
+
+; GCN: [[LOOP:BB[0-9_]+]]:
+; GCN-NOT: v_accvgpr
+; GCN: v_mfma_f32_32x32x1f32
+; GCN-NOT: v_accvgpr
+; GCN: s_cbranch_scc1 [[LOOP]]
+
+; GCN-COUNT-32: v_accvgpr_read_b32
+define amdgpu_kernel void @test_mfma_loop_agpr_init(<32 x float> addrspace(1)* %arg) {
+entry:
+  %mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  %init = extractelement <32 x float> %mai.0, i32 0
+  %tmp0 = insertelement <32 x float> undef, float %init, i32 0
+  %tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
+  %tmp2 = insertelement <32 x float> %tmp1, float %init, i32 2
+  %tmp3 = insertelement <32 x float> %tmp2, float %init, i32 3
+  %tmp4 = insertelement <32 x float> %tmp3, float %init, i32 4
+  %tmp5 = insertelement <32 x float> %tmp4, float %init, i32 5
+  %tmp6 = insertelement <32 x float> %tmp5, float %init, i32 6
+  %tmp7 = insertelement <32 x float> %tmp6, float %init, i32 7
+  %tmp8 = insertelement <32 x float> %tmp7, float %init, i32 8
+  %tmp9 = insertelement <32 x float> %tmp8, float %init, i32 9
+  %tmp10 = insertelement <32 x float> %tmp9, float %init, i32 10
+  %tmp11 = insertelement <32 x float> %tmp10, float %init, i32 11
+  %tmp12 = insertelement <32 x float> %tmp11, float %init, i32 12
+  %tmp13 = insertelement <32 x float> %tmp12, float %init, i32 13
+  %tmp14 = insertelement <32 x float> %tmp13, float %init, i32 14
+  %tmp15 = insertelement <32 x float> %tmp14, float %init, i32 15
+  %tmp16 = insertelement <32 x float> %tmp15, float %init, i32 16
+  %tmp17 = insertelement <32 x float> %tmp16, float %init, i32 17
+  %tmp18 = insertelement <32 x float> %tmp17, float %init, i32 18
+  %tmp19 = insertelement <32 x float> %tmp18, float %init, i32 19
+  %tmp20 = insertelement <32 x float> %tmp19, float %init, i32 20
+  %tmp21 = insertelement <32 x float> %tmp20, float %init, i32 21
+  %tmp22 = insertelement <32 x float> %tmp21, float %init, i32 22
+  %tmp23 = insertelement <32 x float> %tmp22, float %init, i32 23
+  %tmp24 = insertelement <32 x float> %tmp23, float %init, i32 24
+  %tmp25 = insertelement <32 x float> %tmp24, float %init, i32 25
+  %tmp26 = insertelement <32 x float> %tmp25, float %init, i32 26
+  %tmp27 = insertelement <32 x float> %tmp26, float %init, i32 27
+  %tmp28 = insertelement <32 x float> %tmp27, float %init, i32 28
+  %tmp29 = insertelement <32 x float> %tmp28, float %init, i32 29
+  %tmp30 = insertelement <32 x float> %tmp29, float %init, i32 30
+  %tmp31 = insertelement <32 x float> %tmp30, float %init, i32 31
+
+  br label %for.cond.preheader
+
+for.cond.preheader:
+  %phi = phi <32 x float> [ %tmp31, %entry ], [ %mai.1, %for.cond.preheader ]
+  %c = phi i32 [ 0, %entry ], [ %inc, %for.cond.preheader ]
+  %mai.1 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> %phi, i32 0, i32 0, i32 0)
+  %inc = add nuw nsw i32 %c, 1
+  %cc = icmp eq i32 %inc, 16
+  br i1 %cc, label %exit, label %for.cond.preheader
+
+exit:
+  store <32 x float> %mai.1, <32 x float> addrspace(1)* %arg
+  ret void
+}
+
 declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()