[llvm] ca0c92d - [AMDGPU] Allow to use a whole register file on gfx90a for VGPRs

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 21 18:24:49 PDT 2021


Author: Stanislav Mekhanoshin
Date: 2021-10-21T18:24:34-07:00
New Revision: ca0c92d6a1cc93ea83411511848644cb217c31ce

URL: https://github.com/llvm/llvm-project/commit/ca0c92d6a1cc93ea83411511848644cb217c31ce
DIFF: https://github.com/llvm/llvm-project/commit/ca0c92d6a1cc93ea83411511848644cb217c31ce.diff

LOG: [AMDGPU] Allow to use a whole register file on gfx90a for VGPRs

In a kernel which does not have calls or AGPR usage, we can allocate
the whole vector register budget to VGPRs and reserve no AGPRs, as
long as the VGPRs stay addressable (i.e. below 256).

Differential Revision: https://reviews.llvm.org/D111764

Added: 
    llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll

Modified: 
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 32a447f6e6863..1bfeb6c415b63 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -651,3 +651,35 @@ bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
   }
   return false;
 }
+
+bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
+  if (UsesAGPRs)
+    return *UsesAGPRs;
+
+  if (!AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) ||
+      MF.getFrameInfo().hasCalls()) {
+    UsesAGPRs = true;
+    return true;
+  }
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    const Register Reg = Register::index2VirtReg(I);
+    const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+    if (RC && SIRegisterInfo::isAGPRClass(RC)) {
+      UsesAGPRs = true;
+      return true;
+    }
+  }
+
+  for (MCRegister Reg : AMDGPU::AGPR_32RegClass) {
+    if (MRI.isPhysRegUsed(Reg)) {
+      UsesAGPRs = true;
+      return true;
+    }
+  }
+
+  UsesAGPRs = false;
+  return false;
+}

diff  --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index e848ef8d359c7..c305bc20e40d4 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -433,6 +433,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   // Current recorded maximum possible occupancy.
   unsigned Occupancy;
 
+  mutable Optional<bool> UsesAGPRs;
+
   MCPhysReg getNextUserSGPR() const;
 
   MCPhysReg getNextSystemSGPR() const;
@@ -946,6 +948,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
       Occupancy = Limit;
     limitOccupancy(MF);
   }
+
+  // \returns true if a function needs or may need AGPRs.
+  bool usesAGPRs(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm

diff  --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 2896ec8e4acb1..f5a74fe665e39 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -501,18 +501,36 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     reserveRegisterTuples(Reserved, Reg);
   }
 
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
-  // TODO: In an entry function without calls and AGPRs used it is possible
-  //       to use the whole register budget for VGPRs. Even more it shall
-  //       be possible to estimate maximum AGPR/VGPR pressure and split
-  //       register file accordingly.
-  if (ST.hasGFX90AInsts())
-    MaxNumVGPRs /= 2;
+  unsigned MaxNumAGPRs = MaxNumVGPRs;
   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+
+  if (ST.hasGFX90AInsts()) {
+    // In an entry function without calls and AGPRs used it is possible to use
+    // the whole register budget for VGPRs.
+
+    // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
+    //       split register file accordingly.
+    if (MFI->usesAGPRs(MF)) {
+      MaxNumVGPRs /= 2;
+      MaxNumAGPRs = MaxNumVGPRs;
+    } else {
+      if (MaxNumVGPRs > TotalNumVGPRs) {
+        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
+        MaxNumVGPRs = TotalNumVGPRs;
+      } else
+        MaxNumAGPRs = 0;
+    }
+  }
+
   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
     reserveRegisterTuples(Reserved, Reg);
-    Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+  }
+
+  for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
+    unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
     reserveRegisterTuples(Reserved, Reg);
   }
 
@@ -536,8 +554,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     }
   }
 
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
   Register ScratchRSrcReg = MFI->getScratchRSrcReg();
   if (ScratchRSrcReg != AMDGPU::NoRegister) {
     // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 81b055166dd2a..1ce32129aad33 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -12678,216 +12678,216 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s3, s3, s10
 ; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s13
 ; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
 ; GFX90A-NEXT:    s_sub_u32 s14, 0, s12
 ; GFX90A-NEXT:    s_subb_u32 s15, 0, s13
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    v_mac_f32_e32 v1, s16, v2
+; GFX90A-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v1
+; GFX90A-NEXT:    v_mul_f32_e32 v2, s18, v1
+; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v2
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s14, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s15, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v4, s14, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s15, v1
+; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v5
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v1, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v2, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v2, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s14, v0
-; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s15, v0
-; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s14, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s14, v1
+; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s15, v1
+; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s14, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
 ; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v1, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v1, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
 ; GFX90A-NEXT:    s_mov_b32 s15, s14
 ; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v1
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s12, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s13, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s12, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v5, s5, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v3, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s12, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v4, s12, v1
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s13, v1
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s12, v1
+; GFX90A-NEXT:    v_sub_u32_e32 v4, s5, v3
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s13
-; GFX90A-NEXT:    v_sub_co_u32_e32 v6, vcc, s4, v6
-; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v7, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s12, v6
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v5
+; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
+; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s12, v5
+; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, s[0:1]
+; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
+; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v8, v7, s[0:1]
+; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, s5
-; GFX90A-NEXT:    v_add_co_u32_e64 v5, s[0:1], v0, v5
+; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v1, v4
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v6
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
 ; GFX90A-NEXT:    s_xor_b64 s[0:1], s[14:15], s[10:11]
 ; GFX90A-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
 ; GFX90A-NEXT:    s_add_u32 s8, s8, s4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
 ; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    s_addc_u32 s9, s9, s4
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX90A-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s9
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s9
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX90A-NEXT:    v_xor_b32_e32 v5, s1, v2
+; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v4
+; GFX90A-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v7, s1
+; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v4
+; GFX90A-NEXT:    v_mul_f32_e32 v4, s18, v1
+; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v4
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX90A-NEXT:    s_sub_u32 s10, 0, s8
-; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v5
-; GFX90A-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s17, v3
-; GFX90A-NEXT:    v_mul_f32_e32 v5, s18, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mac_f32_e32 v3, s19, v5
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
 ; GFX90A-NEXT:    s_subb_u32 s11, 0, s9
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s11, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s11, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
-; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s10, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
+; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s10, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v5, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v4, v9
+; GFX90A-NEXT:    v_mul_lo_u32 v9, v4, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v5, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v4, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v5, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v2, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v6
-; GFX90A-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s10, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v5
+; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s10, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s11, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s11, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v10, s10, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v6, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v6, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v14, v3, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v13, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, s10, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v5, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v14, v1, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v10
+; GFX90A-NEXT:    v_mul_hi_u32 v13, v1, v8
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v6, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v5, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v6, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v2, v9, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v5, v8
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v7
 ; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v5
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s6, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v4
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v4
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s7, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v7, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s7, v4
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v4
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v1
 ; GFX90A-NEXT:    v_sub_u32_e32 v6, s7, v5
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, s9
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v7, vcc, s6, v7
@@ -12909,19 +12909,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
-; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v3, v6
+; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v1, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v2, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v4, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v3, s0, v3
-; GFX90A-NEXT:    v_xor_b32_e32 v5, s1, v2
+; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX90A-NEXT:    v_xor_b32_e32 v5, s1, v4
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v4, vcc, s0, v1
+; GFX90A-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX90A-NEXT:    global_store_dwordx4 v0, v[2:5], s[2:3]
 ; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
@@ -14426,222 +14426,222 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
 ; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s13
 ; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
 ; GFX90A-NEXT:    s_sub_u32 s2, 0, s12
 ; GFX90A-NEXT:    s_subb_u32 s3, 0, s13
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
-; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    v_mac_f32_e32 v1, s16, v2
+; GFX90A-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
-; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v1
+; GFX90A-NEXT:    v_mul_f32_e32 v2, s18, v1
+; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v2
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
 ; GFX90A-NEXT:    s_mov_b32 s15, s14
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s3, v0
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s3, v1
+; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v5
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v1, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v2, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v2, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v2, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GFX90A-NEXT:    v_add_u32_e32 v6, v6, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v1
+; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s3, v1
+; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
 ; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v1, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v1, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v5
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
 ; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s4, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s4, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v5, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s5, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v1
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v3, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s12, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v1
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v1
+; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s12, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v0
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v0, s12, v0
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v5, s13
-; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v5, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s12, v0
+; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v4, s13
+; GFX90A-NEXT:    v_sub_co_u32_e32 v1, vcc, s4, v1
+; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v1
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v6
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v5, s[0:1]
+; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
+; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v6
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[0:1]
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
+; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, s5
+; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v1
 ; GFX90A-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
 ; GFX90A-NEXT:    s_add_u32 s2, s10, s0
-; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX90A-NEXT:    s_mov_b32 s1, s0
 ; GFX90A-NEXT:    s_addc_u32 s3, s11, s0
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v5, s5
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s14, v0
-; GFX90A-NEXT:    s_sub_u32 s2, 0, s4
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s5
 ; GFX90A-NEXT:    v_xor_b32_e32 v1, s14, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v5
-; GFX90A-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s14
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
-; GFX90A-NEXT:    v_mul_f32_e32 v3, s17, v3
-; GFX90A-NEXT:    v_mul_f32_e32 v5, s18, v3
-; GFX90A-NEXT:    v_trunc_f32_e32 v5, v5
-; GFX90A-NEXT:    v_mac_f32_e32 v3, s19, v5
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX90A-NEXT:    v_xor_b32_e32 v5, s14, v2
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s14, v1
+; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v4
+; GFX90A-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v7, s14
+; GFX90A-NEXT:    s_sub_u32 s2, 0, s4
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
+; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v4
+; GFX90A-NEXT:    v_mul_f32_e32 v4, s18, v1
+; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v4
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX90A-NEXT:    s_subb_u32 s3, 0, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s3, v3
+; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s3, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
-; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v3, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v6
+; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s2, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v5, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v4, v9
+; GFX90A-NEXT:    v_mul_lo_u32 v9, v4, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v5, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v4, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v5, v6
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v2, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v6
-; GFX90A-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s2, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
+; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v5
+; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1]
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s2, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s3, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s3, v1
 ; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v10, s2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v6, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v6, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v14, v3, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v13, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, s2, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v5, v10
+; GFX90A-NEXT:    v_mul_lo_u32 v14, v1, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v10
+; GFX90A-NEXT:    v_mul_hi_u32 v13, v1, v8
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v6, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v5, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v6, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v2, v9, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v5, v8
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
+; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX90A-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v5
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s6, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v4
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v4
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s7, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v6, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v3
-; GFX90A-NEXT:    v_add_u32_e32 v2, v5, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v3
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v3
-; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v7, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v0, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s7, v4
+; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s4, v4
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v1
+; GFX90A-NEXT:    v_add_u32_e32 v4, v5, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v1
+; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v1, s4, v1
+; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v4
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
-; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
+; GFX90A-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
 ; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v3
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v1
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v8
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
@@ -14655,23 +14655,23 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v2
+; GFX90A-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v4, vcc
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v2
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v3, s10, v3
-; GFX90A-NEXT:    v_xor_b32_e32 v5, s10, v2
+; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s10, v1
+; GFX90A-NEXT:    v_xor_b32_e32 v5, s10, v4
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, s10
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v3
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v4, vcc, s10, v1
+; GFX90A-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX90A-NEXT:    global_store_dwordx4 v0, v[2:5], s[8:9]
 ; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y

diff  --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
index 97ad823dad338..dde186cbd9ca0 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -548,6 +548,7 @@ attributes #256 = { nounwind "amdgpu-flat-work-group-size"="256,256" }
 ; GFX10CU-WAVE32: NumVgprs: 128
 ; GFX10CU-WAVE64: NumVgprs: 128
 define amdgpu_kernel void @f512() #512 {
+  call void @foo()
   call void @use256vgprs()
   ret void
 }
@@ -563,7 +564,11 @@ attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
 ; GFX10CU-WAVE32: NumVgprs: 64
 ; GFX10CU-WAVE64: NumVgprs: 64
 define amdgpu_kernel void @f1024() #1024 {
+  call void @foo()
   call void @use256vgprs()
   ret void
 }
+
 attributes #1024 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }
+
+declare void @foo()

diff  --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
new file mode 100644
index 0000000000000..36d3ad4316d11
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -0,0 +1,700 @@
+; -enable-misched=false makes the register usage more predictable
+; -sgpr-regalloc=fast and -vgpr-regalloc=fast just make the test run faster
+; RUN: llc -march=amdgcn -mcpu=gfx90a -amdgpu-function-calls=false -enable-misched=false -sgpr-regalloc=fast -vgpr-regalloc=fast < %s | FileCheck %s --check-prefixes=GCN,GFX90A
+
+define internal void @use256vgprs() {
+  %v0 = call i32 asm sideeffect "; def $0", "=v"()
+  %v1 = call i32 asm sideeffect "; def $0", "=v"()
+  %v2 = call i32 asm sideeffect "; def $0", "=v"()
+  %v3 = call i32 asm sideeffect "; def $0", "=v"()
+  %v4 = call i32 asm sideeffect "; def $0", "=v"()
+  %v5 = call i32 asm sideeffect "; def $0", "=v"()
+  %v6 = call i32 asm sideeffect "; def $0", "=v"()
+  %v7 = call i32 asm sideeffect "; def $0", "=v"()
+  %v8 = call i32 asm sideeffect "; def $0", "=v"()
+  %v9 = call i32 asm sideeffect "; def $0", "=v"()
+  %v10 = call i32 asm sideeffect "; def $0", "=v"()
+  %v11 = call i32 asm sideeffect "; def $0", "=v"()
+  %v12 = call i32 asm sideeffect "; def $0", "=v"()
+  %v13 = call i32 asm sideeffect "; def $0", "=v"()
+  %v14 = call i32 asm sideeffect "; def $0", "=v"()
+  %v15 = call i32 asm sideeffect "; def $0", "=v"()
+  %v16 = call i32 asm sideeffect "; def $0", "=v"()
+  %v17 = call i32 asm sideeffect "; def $0", "=v"()
+  %v18 = call i32 asm sideeffect "; def $0", "=v"()
+  %v19 = call i32 asm sideeffect "; def $0", "=v"()
+  %v20 = call i32 asm sideeffect "; def $0", "=v"()
+  %v21 = call i32 asm sideeffect "; def $0", "=v"()
+  %v22 = call i32 asm sideeffect "; def $0", "=v"()
+  %v23 = call i32 asm sideeffect "; def $0", "=v"()
+  %v24 = call i32 asm sideeffect "; def $0", "=v"()
+  %v25 = call i32 asm sideeffect "; def $0", "=v"()
+  %v26 = call i32 asm sideeffect "; def $0", "=v"()
+  %v27 = call i32 asm sideeffect "; def $0", "=v"()
+  %v28 = call i32 asm sideeffect "; def $0", "=v"()
+  %v29 = call i32 asm sideeffect "; def $0", "=v"()
+  %v30 = call i32 asm sideeffect "; def $0", "=v"()
+  %v31 = call i32 asm sideeffect "; def $0", "=v"()
+  %v32 = call i32 asm sideeffect "; def $0", "=v"()
+  %v33 = call i32 asm sideeffect "; def $0", "=v"()
+  %v34 = call i32 asm sideeffect "; def $0", "=v"()
+  %v35 = call i32 asm sideeffect "; def $0", "=v"()
+  %v36 = call i32 asm sideeffect "; def $0", "=v"()
+  %v37 = call i32 asm sideeffect "; def $0", "=v"()
+  %v38 = call i32 asm sideeffect "; def $0", "=v"()
+  %v39 = call i32 asm sideeffect "; def $0", "=v"()
+  %v40 = call i32 asm sideeffect "; def $0", "=v"()
+  %v41 = call i32 asm sideeffect "; def $0", "=v"()
+  %v42 = call i32 asm sideeffect "; def $0", "=v"()
+  %v43 = call i32 asm sideeffect "; def $0", "=v"()
+  %v44 = call i32 asm sideeffect "; def $0", "=v"()
+  %v45 = call i32 asm sideeffect "; def $0", "=v"()
+  %v46 = call i32 asm sideeffect "; def $0", "=v"()
+  %v47 = call i32 asm sideeffect "; def $0", "=v"()
+  %v48 = call i32 asm sideeffect "; def $0", "=v"()
+  %v49 = call i32 asm sideeffect "; def $0", "=v"()
+  %v50 = call i32 asm sideeffect "; def $0", "=v"()
+  %v51 = call i32 asm sideeffect "; def $0", "=v"()
+  %v52 = call i32 asm sideeffect "; def $0", "=v"()
+  %v53 = call i32 asm sideeffect "; def $0", "=v"()
+  %v54 = call i32 asm sideeffect "; def $0", "=v"()
+  %v55 = call i32 asm sideeffect "; def $0", "=v"()
+  %v56 = call i32 asm sideeffect "; def $0", "=v"()
+  %v57 = call i32 asm sideeffect "; def $0", "=v"()
+  %v58 = call i32 asm sideeffect "; def $0", "=v"()
+  %v59 = call i32 asm sideeffect "; def $0", "=v"()
+  %v60 = call i32 asm sideeffect "; def $0", "=v"()
+  %v61 = call i32 asm sideeffect "; def $0", "=v"()
+  %v62 = call i32 asm sideeffect "; def $0", "=v"()
+  %v63 = call i32 asm sideeffect "; def $0", "=v"()
+  %v64 = call i32 asm sideeffect "; def $0", "=v"()
+  %v65 = call i32 asm sideeffect "; def $0", "=v"()
+  %v66 = call i32 asm sideeffect "; def $0", "=v"()
+  %v67 = call i32 asm sideeffect "; def $0", "=v"()
+  %v68 = call i32 asm sideeffect "; def $0", "=v"()
+  %v69 = call i32 asm sideeffect "; def $0", "=v"()
+  %v70 = call i32 asm sideeffect "; def $0", "=v"()
+  %v71 = call i32 asm sideeffect "; def $0", "=v"()
+  %v72 = call i32 asm sideeffect "; def $0", "=v"()
+  %v73 = call i32 asm sideeffect "; def $0", "=v"()
+  %v74 = call i32 asm sideeffect "; def $0", "=v"()
+  %v75 = call i32 asm sideeffect "; def $0", "=v"()
+  %v76 = call i32 asm sideeffect "; def $0", "=v"()
+  %v77 = call i32 asm sideeffect "; def $0", "=v"()
+  %v78 = call i32 asm sideeffect "; def $0", "=v"()
+  %v79 = call i32 asm sideeffect "; def $0", "=v"()
+  %v80 = call i32 asm sideeffect "; def $0", "=v"()
+  %v81 = call i32 asm sideeffect "; def $0", "=v"()
+  %v82 = call i32 asm sideeffect "; def $0", "=v"()
+  %v83 = call i32 asm sideeffect "; def $0", "=v"()
+  %v84 = call i32 asm sideeffect "; def $0", "=v"()
+  %v85 = call i32 asm sideeffect "; def $0", "=v"()
+  %v86 = call i32 asm sideeffect "; def $0", "=v"()
+  %v87 = call i32 asm sideeffect "; def $0", "=v"()
+  %v88 = call i32 asm sideeffect "; def $0", "=v"()
+  %v89 = call i32 asm sideeffect "; def $0", "=v"()
+  %v90 = call i32 asm sideeffect "; def $0", "=v"()
+  %v91 = call i32 asm sideeffect "; def $0", "=v"()
+  %v92 = call i32 asm sideeffect "; def $0", "=v"()
+  %v93 = call i32 asm sideeffect "; def $0", "=v"()
+  %v94 = call i32 asm sideeffect "; def $0", "=v"()
+  %v95 = call i32 asm sideeffect "; def $0", "=v"()
+  %v96 = call i32 asm sideeffect "; def $0", "=v"()
+  %v97 = call i32 asm sideeffect "; def $0", "=v"()
+  %v98 = call i32 asm sideeffect "; def $0", "=v"()
+  %v99 = call i32 asm sideeffect "; def $0", "=v"()
+  %v100 = call i32 asm sideeffect "; def $0", "=v"()
+  %v101 = call i32 asm sideeffect "; def $0", "=v"()
+  %v102 = call i32 asm sideeffect "; def $0", "=v"()
+  %v103 = call i32 asm sideeffect "; def $0", "=v"()
+  %v104 = call i32 asm sideeffect "; def $0", "=v"()
+  %v105 = call i32 asm sideeffect "; def $0", "=v"()
+  %v106 = call i32 asm sideeffect "; def $0", "=v"()
+  %v107 = call i32 asm sideeffect "; def $0", "=v"()
+  %v108 = call i32 asm sideeffect "; def $0", "=v"()
+  %v109 = call i32 asm sideeffect "; def $0", "=v"()
+  %v110 = call i32 asm sideeffect "; def $0", "=v"()
+  %v111 = call i32 asm sideeffect "; def $0", "=v"()
+  %v112 = call i32 asm sideeffect "; def $0", "=v"()
+  %v113 = call i32 asm sideeffect "; def $0", "=v"()
+  %v114 = call i32 asm sideeffect "; def $0", "=v"()
+  %v115 = call i32 asm sideeffect "; def $0", "=v"()
+  %v116 = call i32 asm sideeffect "; def $0", "=v"()
+  %v117 = call i32 asm sideeffect "; def $0", "=v"()
+  %v118 = call i32 asm sideeffect "; def $0", "=v"()
+  %v119 = call i32 asm sideeffect "; def $0", "=v"()
+  %v120 = call i32 asm sideeffect "; def $0", "=v"()
+  %v121 = call i32 asm sideeffect "; def $0", "=v"()
+  %v122 = call i32 asm sideeffect "; def $0", "=v"()
+  %v123 = call i32 asm sideeffect "; def $0", "=v"()
+  %v124 = call i32 asm sideeffect "; def $0", "=v"()
+  %v125 = call i32 asm sideeffect "; def $0", "=v"()
+  %v126 = call i32 asm sideeffect "; def $0", "=v"()
+  %v127 = call i32 asm sideeffect "; def $0", "=v"()
+  %v128 = call i32 asm sideeffect "; def $0", "=v"()
+  %v129 = call i32 asm sideeffect "; def $0", "=v"()
+  %v130 = call i32 asm sideeffect "; def $0", "=v"()
+  %v131 = call i32 asm sideeffect "; def $0", "=v"()
+  %v132 = call i32 asm sideeffect "; def $0", "=v"()
+  %v133 = call i32 asm sideeffect "; def $0", "=v"()
+  %v134 = call i32 asm sideeffect "; def $0", "=v"()
+  %v135 = call i32 asm sideeffect "; def $0", "=v"()
+  %v136 = call i32 asm sideeffect "; def $0", "=v"()
+  %v137 = call i32 asm sideeffect "; def $0", "=v"()
+  %v138 = call i32 asm sideeffect "; def $0", "=v"()
+  %v139 = call i32 asm sideeffect "; def $0", "=v"()
+  %v140 = call i32 asm sideeffect "; def $0", "=v"()
+  %v141 = call i32 asm sideeffect "; def $0", "=v"()
+  %v142 = call i32 asm sideeffect "; def $0", "=v"()
+  %v143 = call i32 asm sideeffect "; def $0", "=v"()
+  %v144 = call i32 asm sideeffect "; def $0", "=v"()
+  %v145 = call i32 asm sideeffect "; def $0", "=v"()
+  %v146 = call i32 asm sideeffect "; def $0", "=v"()
+  %v147 = call i32 asm sideeffect "; def $0", "=v"()
+  %v148 = call i32 asm sideeffect "; def $0", "=v"()
+  %v149 = call i32 asm sideeffect "; def $0", "=v"()
+  %v150 = call i32 asm sideeffect "; def $0", "=v"()
+  %v151 = call i32 asm sideeffect "; def $0", "=v"()
+  %v152 = call i32 asm sideeffect "; def $0", "=v"()
+  %v153 = call i32 asm sideeffect "; def $0", "=v"()
+  %v154 = call i32 asm sideeffect "; def $0", "=v"()
+  %v155 = call i32 asm sideeffect "; def $0", "=v"()
+  %v156 = call i32 asm sideeffect "; def $0", "=v"()
+  %v157 = call i32 asm sideeffect "; def $0", "=v"()
+  %v158 = call i32 asm sideeffect "; def $0", "=v"()
+  %v159 = call i32 asm sideeffect "; def $0", "=v"()
+  %v160 = call i32 asm sideeffect "; def $0", "=v"()
+  %v161 = call i32 asm sideeffect "; def $0", "=v"()
+  %v162 = call i32 asm sideeffect "; def $0", "=v"()
+  %v163 = call i32 asm sideeffect "; def $0", "=v"()
+  %v164 = call i32 asm sideeffect "; def $0", "=v"()
+  %v165 = call i32 asm sideeffect "; def $0", "=v"()
+  %v166 = call i32 asm sideeffect "; def $0", "=v"()
+  %v167 = call i32 asm sideeffect "; def $0", "=v"()
+  %v168 = call i32 asm sideeffect "; def $0", "=v"()
+  %v169 = call i32 asm sideeffect "; def $0", "=v"()
+  %v170 = call i32 asm sideeffect "; def $0", "=v"()
+  %v171 = call i32 asm sideeffect "; def $0", "=v"()
+  %v172 = call i32 asm sideeffect "; def $0", "=v"()
+  %v173 = call i32 asm sideeffect "; def $0", "=v"()
+  %v174 = call i32 asm sideeffect "; def $0", "=v"()
+  %v175 = call i32 asm sideeffect "; def $0", "=v"()
+  %v176 = call i32 asm sideeffect "; def $0", "=v"()
+  %v177 = call i32 asm sideeffect "; def $0", "=v"()
+  %v178 = call i32 asm sideeffect "; def $0", "=v"()
+  %v179 = call i32 asm sideeffect "; def $0", "=v"()
+  %v180 = call i32 asm sideeffect "; def $0", "=v"()
+  %v181 = call i32 asm sideeffect "; def $0", "=v"()
+  %v182 = call i32 asm sideeffect "; def $0", "=v"()
+  %v183 = call i32 asm sideeffect "; def $0", "=v"()
+  %v184 = call i32 asm sideeffect "; def $0", "=v"()
+  %v185 = call i32 asm sideeffect "; def $0", "=v"()
+  %v186 = call i32 asm sideeffect "; def $0", "=v"()
+  %v187 = call i32 asm sideeffect "; def $0", "=v"()
+  %v188 = call i32 asm sideeffect "; def $0", "=v"()
+  %v189 = call i32 asm sideeffect "; def $0", "=v"()
+  %v190 = call i32 asm sideeffect "; def $0", "=v"()
+  %v191 = call i32 asm sideeffect "; def $0", "=v"()
+  %v192 = call i32 asm sideeffect "; def $0", "=v"()
+  %v193 = call i32 asm sideeffect "; def $0", "=v"()
+  %v194 = call i32 asm sideeffect "; def $0", "=v"()
+  %v195 = call i32 asm sideeffect "; def $0", "=v"()
+  %v196 = call i32 asm sideeffect "; def $0", "=v"()
+  %v197 = call i32 asm sideeffect "; def $0", "=v"()
+  %v198 = call i32 asm sideeffect "; def $0", "=v"()
+  %v199 = call i32 asm sideeffect "; def $0", "=v"()
+  %v200 = call i32 asm sideeffect "; def $0", "=v"()
+  %v201 = call i32 asm sideeffect "; def $0", "=v"()
+  %v202 = call i32 asm sideeffect "; def $0", "=v"()
+  %v203 = call i32 asm sideeffect "; def $0", "=v"()
+  %v204 = call i32 asm sideeffect "; def $0", "=v"()
+  %v205 = call i32 asm sideeffect "; def $0", "=v"()
+  %v206 = call i32 asm sideeffect "; def $0", "=v"()
+  %v207 = call i32 asm sideeffect "; def $0", "=v"()
+  %v208 = call i32 asm sideeffect "; def $0", "=v"()
+  %v209 = call i32 asm sideeffect "; def $0", "=v"()
+  %v210 = call i32 asm sideeffect "; def $0", "=v"()
+  %v211 = call i32 asm sideeffect "; def $0", "=v"()
+  %v212 = call i32 asm sideeffect "; def $0", "=v"()
+  %v213 = call i32 asm sideeffect "; def $0", "=v"()
+  %v214 = call i32 asm sideeffect "; def $0", "=v"()
+  %v215 = call i32 asm sideeffect "; def $0", "=v"()
+  %v216 = call i32 asm sideeffect "; def $0", "=v"()
+  %v217 = call i32 asm sideeffect "; def $0", "=v"()
+  %v218 = call i32 asm sideeffect "; def $0", "=v"()
+  %v219 = call i32 asm sideeffect "; def $0", "=v"()
+  %v220 = call i32 asm sideeffect "; def $0", "=v"()
+  %v221 = call i32 asm sideeffect "; def $0", "=v"()
+  %v222 = call i32 asm sideeffect "; def $0", "=v"()
+  %v223 = call i32 asm sideeffect "; def $0", "=v"()
+  %v224 = call i32 asm sideeffect "; def $0", "=v"()
+  %v225 = call i32 asm sideeffect "; def $0", "=v"()
+  %v226 = call i32 asm sideeffect "; def $0", "=v"()
+  %v227 = call i32 asm sideeffect "; def $0", "=v"()
+  %v228 = call i32 asm sideeffect "; def $0", "=v"()
+  %v229 = call i32 asm sideeffect "; def $0", "=v"()
+  %v230 = call i32 asm sideeffect "; def $0", "=v"()
+  %v231 = call i32 asm sideeffect "; def $0", "=v"()
+  %v232 = call i32 asm sideeffect "; def $0", "=v"()
+  %v233 = call i32 asm sideeffect "; def $0", "=v"()
+  %v234 = call i32 asm sideeffect "; def $0", "=v"()
+  %v235 = call i32 asm sideeffect "; def $0", "=v"()
+  %v236 = call i32 asm sideeffect "; def $0", "=v"()
+  %v237 = call i32 asm sideeffect "; def $0", "=v"()
+  %v238 = call i32 asm sideeffect "; def $0", "=v"()
+  %v239 = call i32 asm sideeffect "; def $0", "=v"()
+  %v240 = call i32 asm sideeffect "; def $0", "=v"()
+  %v241 = call i32 asm sideeffect "; def $0", "=v"()
+  %v242 = call i32 asm sideeffect "; def $0", "=v"()
+  %v243 = call i32 asm sideeffect "; def $0", "=v"()
+  %v244 = call i32 asm sideeffect "; def $0", "=v"()
+  %v245 = call i32 asm sideeffect "; def $0", "=v"()
+  %v246 = call i32 asm sideeffect "; def $0", "=v"()
+  %v247 = call i32 asm sideeffect "; def $0", "=v"()
+  %v248 = call i32 asm sideeffect "; def $0", "=v"()
+  %v249 = call i32 asm sideeffect "; def $0", "=v"()
+  %v250 = call i32 asm sideeffect "; def $0", "=v"()
+  %v251 = call i32 asm sideeffect "; def $0", "=v"()
+  %v252 = call i32 asm sideeffect "; def $0", "=v"()
+  %v253 = call i32 asm sideeffect "; def $0", "=v"()
+  %v254 = call i32 asm sideeffect "; def $0", "=v"()
+  %v255 = call i32 asm sideeffect "; def $0", "=v"()
+  call void asm sideeffect "; use $0", "v"(i32 %v0)
+  call void asm sideeffect "; use $0", "v"(i32 %v1)
+  call void asm sideeffect "; use $0", "v"(i32 %v2)
+  call void asm sideeffect "; use $0", "v"(i32 %v3)
+  call void asm sideeffect "; use $0", "v"(i32 %v4)
+  call void asm sideeffect "; use $0", "v"(i32 %v5)
+  call void asm sideeffect "; use $0", "v"(i32 %v6)
+  call void asm sideeffect "; use $0", "v"(i32 %v7)
+  call void asm sideeffect "; use $0", "v"(i32 %v8)
+  call void asm sideeffect "; use $0", "v"(i32 %v9)
+  call void asm sideeffect "; use $0", "v"(i32 %v10)
+  call void asm sideeffect "; use $0", "v"(i32 %v11)
+  call void asm sideeffect "; use $0", "v"(i32 %v12)
+  call void asm sideeffect "; use $0", "v"(i32 %v13)
+  call void asm sideeffect "; use $0", "v"(i32 %v14)
+  call void asm sideeffect "; use $0", "v"(i32 %v15)
+  call void asm sideeffect "; use $0", "v"(i32 %v16)
+  call void asm sideeffect "; use $0", "v"(i32 %v17)
+  call void asm sideeffect "; use $0", "v"(i32 %v18)
+  call void asm sideeffect "; use $0", "v"(i32 %v19)
+  call void asm sideeffect "; use $0", "v"(i32 %v20)
+  call void asm sideeffect "; use $0", "v"(i32 %v21)
+  call void asm sideeffect "; use $0", "v"(i32 %v22)
+  call void asm sideeffect "; use $0", "v"(i32 %v23)
+  call void asm sideeffect "; use $0", "v"(i32 %v24)
+  call void asm sideeffect "; use $0", "v"(i32 %v25)
+  call void asm sideeffect "; use $0", "v"(i32 %v26)
+  call void asm sideeffect "; use $0", "v"(i32 %v27)
+  call void asm sideeffect "; use $0", "v"(i32 %v28)
+  call void asm sideeffect "; use $0", "v"(i32 %v29)
+  call void asm sideeffect "; use $0", "v"(i32 %v30)
+  call void asm sideeffect "; use $0", "v"(i32 %v31)
+  call void asm sideeffect "; use $0", "v"(i32 %v32)
+  call void asm sideeffect "; use $0", "v"(i32 %v33)
+  call void asm sideeffect "; use $0", "v"(i32 %v34)
+  call void asm sideeffect "; use $0", "v"(i32 %v35)
+  call void asm sideeffect "; use $0", "v"(i32 %v36)
+  call void asm sideeffect "; use $0", "v"(i32 %v37)
+  call void asm sideeffect "; use $0", "v"(i32 %v38)
+  call void asm sideeffect "; use $0", "v"(i32 %v39)
+  call void asm sideeffect "; use $0", "v"(i32 %v40)
+  call void asm sideeffect "; use $0", "v"(i32 %v41)
+  call void asm sideeffect "; use $0", "v"(i32 %v42)
+  call void asm sideeffect "; use $0", "v"(i32 %v43)
+  call void asm sideeffect "; use $0", "v"(i32 %v44)
+  call void asm sideeffect "; use $0", "v"(i32 %v45)
+  call void asm sideeffect "; use $0", "v"(i32 %v46)
+  call void asm sideeffect "; use $0", "v"(i32 %v47)
+  call void asm sideeffect "; use $0", "v"(i32 %v48)
+  call void asm sideeffect "; use $0", "v"(i32 %v49)
+  call void asm sideeffect "; use $0", "v"(i32 %v50)
+  call void asm sideeffect "; use $0", "v"(i32 %v51)
+  call void asm sideeffect "; use $0", "v"(i32 %v52)
+  call void asm sideeffect "; use $0", "v"(i32 %v53)
+  call void asm sideeffect "; use $0", "v"(i32 %v54)
+  call void asm sideeffect "; use $0", "v"(i32 %v55)
+  call void asm sideeffect "; use $0", "v"(i32 %v56)
+  call void asm sideeffect "; use $0", "v"(i32 %v57)
+  call void asm sideeffect "; use $0", "v"(i32 %v58)
+  call void asm sideeffect "; use $0", "v"(i32 %v59)
+  call void asm sideeffect "; use $0", "v"(i32 %v60)
+  call void asm sideeffect "; use $0", "v"(i32 %v61)
+  call void asm sideeffect "; use $0", "v"(i32 %v62)
+  call void asm sideeffect "; use $0", "v"(i32 %v63)
+  call void asm sideeffect "; use $0", "v"(i32 %v64)
+  call void asm sideeffect "; use $0", "v"(i32 %v65)
+  call void asm sideeffect "; use $0", "v"(i32 %v66)
+  call void asm sideeffect "; use $0", "v"(i32 %v67)
+  call void asm sideeffect "; use $0", "v"(i32 %v68)
+  call void asm sideeffect "; use $0", "v"(i32 %v69)
+  call void asm sideeffect "; use $0", "v"(i32 %v70)
+  call void asm sideeffect "; use $0", "v"(i32 %v71)
+  call void asm sideeffect "; use $0", "v"(i32 %v72)
+  call void asm sideeffect "; use $0", "v"(i32 %v73)
+  call void asm sideeffect "; use $0", "v"(i32 %v74)
+  call void asm sideeffect "; use $0", "v"(i32 %v75)
+  call void asm sideeffect "; use $0", "v"(i32 %v76)
+  call void asm sideeffect "; use $0", "v"(i32 %v77)
+  call void asm sideeffect "; use $0", "v"(i32 %v78)
+  call void asm sideeffect "; use $0", "v"(i32 %v79)
+  call void asm sideeffect "; use $0", "v"(i32 %v80)
+  call void asm sideeffect "; use $0", "v"(i32 %v81)
+  call void asm sideeffect "; use $0", "v"(i32 %v82)
+  call void asm sideeffect "; use $0", "v"(i32 %v83)
+  call void asm sideeffect "; use $0", "v"(i32 %v84)
+  call void asm sideeffect "; use $0", "v"(i32 %v85)
+  call void asm sideeffect "; use $0", "v"(i32 %v86)
+  call void asm sideeffect "; use $0", "v"(i32 %v87)
+  call void asm sideeffect "; use $0", "v"(i32 %v88)
+  call void asm sideeffect "; use $0", "v"(i32 %v89)
+  call void asm sideeffect "; use $0", "v"(i32 %v90)
+  call void asm sideeffect "; use $0", "v"(i32 %v91)
+  call void asm sideeffect "; use $0", "v"(i32 %v92)
+  call void asm sideeffect "; use $0", "v"(i32 %v93)
+  call void asm sideeffect "; use $0", "v"(i32 %v94)
+  call void asm sideeffect "; use $0", "v"(i32 %v95)
+  call void asm sideeffect "; use $0", "v"(i32 %v96)
+  call void asm sideeffect "; use $0", "v"(i32 %v97)
+  call void asm sideeffect "; use $0", "v"(i32 %v98)
+  call void asm sideeffect "; use $0", "v"(i32 %v99)
+  call void asm sideeffect "; use $0", "v"(i32 %v100)
+  call void asm sideeffect "; use $0", "v"(i32 %v101)
+  call void asm sideeffect "; use $0", "v"(i32 %v102)
+  call void asm sideeffect "; use $0", "v"(i32 %v103)
+  call void asm sideeffect "; use $0", "v"(i32 %v104)
+  call void asm sideeffect "; use $0", "v"(i32 %v105)
+  call void asm sideeffect "; use $0", "v"(i32 %v106)
+  call void asm sideeffect "; use $0", "v"(i32 %v107)
+  call void asm sideeffect "; use $0", "v"(i32 %v108)
+  call void asm sideeffect "; use $0", "v"(i32 %v109)
+  call void asm sideeffect "; use $0", "v"(i32 %v110)
+  call void asm sideeffect "; use $0", "v"(i32 %v111)
+  call void asm sideeffect "; use $0", "v"(i32 %v112)
+  call void asm sideeffect "; use $0", "v"(i32 %v113)
+  call void asm sideeffect "; use $0", "v"(i32 %v114)
+  call void asm sideeffect "; use $0", "v"(i32 %v115)
+  call void asm sideeffect "; use $0", "v"(i32 %v116)
+  call void asm sideeffect "; use $0", "v"(i32 %v117)
+  call void asm sideeffect "; use $0", "v"(i32 %v118)
+  call void asm sideeffect "; use $0", "v"(i32 %v119)
+  call void asm sideeffect "; use $0", "v"(i32 %v120)
+  call void asm sideeffect "; use $0", "v"(i32 %v121)
+  call void asm sideeffect "; use $0", "v"(i32 %v122)
+  call void asm sideeffect "; use $0", "v"(i32 %v123)
+  call void asm sideeffect "; use $0", "v"(i32 %v124)
+  call void asm sideeffect "; use $0", "v"(i32 %v125)
+  call void asm sideeffect "; use $0", "v"(i32 %v126)
+  call void asm sideeffect "; use $0", "v"(i32 %v127)
+  call void asm sideeffect "; use $0", "v"(i32 %v128)
+  call void asm sideeffect "; use $0", "v"(i32 %v129)
+  call void asm sideeffect "; use $0", "v"(i32 %v130)
+  call void asm sideeffect "; use $0", "v"(i32 %v131)
+  call void asm sideeffect "; use $0", "v"(i32 %v132)
+  call void asm sideeffect "; use $0", "v"(i32 %v133)
+  call void asm sideeffect "; use $0", "v"(i32 %v134)
+  call void asm sideeffect "; use $0", "v"(i32 %v135)
+  call void asm sideeffect "; use $0", "v"(i32 %v136)
+  call void asm sideeffect "; use $0", "v"(i32 %v137)
+  call void asm sideeffect "; use $0", "v"(i32 %v138)
+  call void asm sideeffect "; use $0", "v"(i32 %v139)
+  call void asm sideeffect "; use $0", "v"(i32 %v140)
+  call void asm sideeffect "; use $0", "v"(i32 %v141)
+  call void asm sideeffect "; use $0", "v"(i32 %v142)
+  call void asm sideeffect "; use $0", "v"(i32 %v143)
+  call void asm sideeffect "; use $0", "v"(i32 %v144)
+  call void asm sideeffect "; use $0", "v"(i32 %v145)
+  call void asm sideeffect "; use $0", "v"(i32 %v146)
+  call void asm sideeffect "; use $0", "v"(i32 %v147)
+  call void asm sideeffect "; use $0", "v"(i32 %v148)
+  call void asm sideeffect "; use $0", "v"(i32 %v149)
+  call void asm sideeffect "; use $0", "v"(i32 %v150)
+  call void asm sideeffect "; use $0", "v"(i32 %v151)
+  call void asm sideeffect "; use $0", "v"(i32 %v152)
+  call void asm sideeffect "; use $0", "v"(i32 %v153)
+  call void asm sideeffect "; use $0", "v"(i32 %v154)
+  call void asm sideeffect "; use $0", "v"(i32 %v155)
+  call void asm sideeffect "; use $0", "v"(i32 %v156)
+  call void asm sideeffect "; use $0", "v"(i32 %v157)
+  call void asm sideeffect "; use $0", "v"(i32 %v158)
+  call void asm sideeffect "; use $0", "v"(i32 %v159)
+  call void asm sideeffect "; use $0", "v"(i32 %v160)
+  call void asm sideeffect "; use $0", "v"(i32 %v161)
+  call void asm sideeffect "; use $0", "v"(i32 %v162)
+  call void asm sideeffect "; use $0", "v"(i32 %v163)
+  call void asm sideeffect "; use $0", "v"(i32 %v164)
+  call void asm sideeffect "; use $0", "v"(i32 %v165)
+  call void asm sideeffect "; use $0", "v"(i32 %v166)
+  call void asm sideeffect "; use $0", "v"(i32 %v167)
+  call void asm sideeffect "; use $0", "v"(i32 %v168)
+  call void asm sideeffect "; use $0", "v"(i32 %v169)
+  call void asm sideeffect "; use $0", "v"(i32 %v170)
+  call void asm sideeffect "; use $0", "v"(i32 %v171)
+  call void asm sideeffect "; use $0", "v"(i32 %v172)
+  call void asm sideeffect "; use $0", "v"(i32 %v173)
+  call void asm sideeffect "; use $0", "v"(i32 %v174)
+  call void asm sideeffect "; use $0", "v"(i32 %v175)
+  call void asm sideeffect "; use $0", "v"(i32 %v176)
+  call void asm sideeffect "; use $0", "v"(i32 %v177)
+  call void asm sideeffect "; use $0", "v"(i32 %v178)
+  call void asm sideeffect "; use $0", "v"(i32 %v179)
+  call void asm sideeffect "; use $0", "v"(i32 %v180)
+  call void asm sideeffect "; use $0", "v"(i32 %v181)
+  call void asm sideeffect "; use $0", "v"(i32 %v182)
+  call void asm sideeffect "; use $0", "v"(i32 %v183)
+  call void asm sideeffect "; use $0", "v"(i32 %v184)
+  call void asm sideeffect "; use $0", "v"(i32 %v185)
+  call void asm sideeffect "; use $0", "v"(i32 %v186)
+  call void asm sideeffect "; use $0", "v"(i32 %v187)
+  call void asm sideeffect "; use $0", "v"(i32 %v188)
+  call void asm sideeffect "; use $0", "v"(i32 %v189)
+  call void asm sideeffect "; use $0", "v"(i32 %v190)
+  call void asm sideeffect "; use $0", "v"(i32 %v191)
+  call void asm sideeffect "; use $0", "v"(i32 %v192)
+  call void asm sideeffect "; use $0", "v"(i32 %v193)
+  call void asm sideeffect "; use $0", "v"(i32 %v194)
+  call void asm sideeffect "; use $0", "v"(i32 %v195)
+  call void asm sideeffect "; use $0", "v"(i32 %v196)
+  call void asm sideeffect "; use $0", "v"(i32 %v197)
+  call void asm sideeffect "; use $0", "v"(i32 %v198)
+  call void asm sideeffect "; use $0", "v"(i32 %v199)
+  call void asm sideeffect "; use $0", "v"(i32 %v200)
+  call void asm sideeffect "; use $0", "v"(i32 %v201)
+  call void asm sideeffect "; use $0", "v"(i32 %v202)
+  call void asm sideeffect "; use $0", "v"(i32 %v203)
+  call void asm sideeffect "; use $0", "v"(i32 %v204)
+  call void asm sideeffect "; use $0", "v"(i32 %v205)
+  call void asm sideeffect "; use $0", "v"(i32 %v206)
+  call void asm sideeffect "; use $0", "v"(i32 %v207)
+  call void asm sideeffect "; use $0", "v"(i32 %v208)
+  call void asm sideeffect "; use $0", "v"(i32 %v209)
+  call void asm sideeffect "; use $0", "v"(i32 %v210)
+  call void asm sideeffect "; use $0", "v"(i32 %v211)
+  call void asm sideeffect "; use $0", "v"(i32 %v212)
+  call void asm sideeffect "; use $0", "v"(i32 %v213)
+  call void asm sideeffect "; use $0", "v"(i32 %v214)
+  call void asm sideeffect "; use $0", "v"(i32 %v215)
+  call void asm sideeffect "; use $0", "v"(i32 %v216)
+  call void asm sideeffect "; use $0", "v"(i32 %v217)
+  call void asm sideeffect "; use $0", "v"(i32 %v218)
+  call void asm sideeffect "; use $0", "v"(i32 %v219)
+  call void asm sideeffect "; use $0", "v"(i32 %v220)
+  call void asm sideeffect "; use $0", "v"(i32 %v221)
+  call void asm sideeffect "; use $0", "v"(i32 %v222)
+  call void asm sideeffect "; use $0", "v"(i32 %v223)
+  call void asm sideeffect "; use $0", "v"(i32 %v224)
+  call void asm sideeffect "; use $0", "v"(i32 %v225)
+  call void asm sideeffect "; use $0", "v"(i32 %v226)
+  call void asm sideeffect "; use $0", "v"(i32 %v227)
+  call void asm sideeffect "; use $0", "v"(i32 %v228)
+  call void asm sideeffect "; use $0", "v"(i32 %v229)
+  call void asm sideeffect "; use $0", "v"(i32 %v230)
+  call void asm sideeffect "; use $0", "v"(i32 %v231)
+  call void asm sideeffect "; use $0", "v"(i32 %v232)
+  call void asm sideeffect "; use $0", "v"(i32 %v233)
+  call void asm sideeffect "; use $0", "v"(i32 %v234)
+  call void asm sideeffect "; use $0", "v"(i32 %v235)
+  call void asm sideeffect "; use $0", "v"(i32 %v236)
+  call void asm sideeffect "; use $0", "v"(i32 %v237)
+  call void asm sideeffect "; use $0", "v"(i32 %v238)
+  call void asm sideeffect "; use $0", "v"(i32 %v239)
+  call void asm sideeffect "; use $0", "v"(i32 %v240)
+  call void asm sideeffect "; use $0", "v"(i32 %v241)
+  call void asm sideeffect "; use $0", "v"(i32 %v242)
+  call void asm sideeffect "; use $0", "v"(i32 %v243)
+  call void asm sideeffect "; use $0", "v"(i32 %v244)
+  call void asm sideeffect "; use $0", "v"(i32 %v245)
+  call void asm sideeffect "; use $0", "v"(i32 %v246)
+  call void asm sideeffect "; use $0", "v"(i32 %v247)
+  call void asm sideeffect "; use $0", "v"(i32 %v248)
+  call void asm sideeffect "; use $0", "v"(i32 %v249)
+  call void asm sideeffect "; use $0", "v"(i32 %v250)
+  call void asm sideeffect "; use $0", "v"(i32 %v251)
+  call void asm sideeffect "; use $0", "v"(i32 %v252)
+  call void asm sideeffect "; use $0", "v"(i32 %v253)
+  call void asm sideeffect "; use $0", "v"(i32 %v254)
+  call void asm sideeffect "; use $0", "v"(i32 %v255)
+  ret void
+}
+
+define internal void @use512vgprs() {
+  %v0 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  %v1 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  %v2 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  %v3 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  %v4 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  %v5 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  %v6 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  %v7 = call <32 x i32> asm sideeffect "; def $0", "=v"()
+  call void @use256vgprs()
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v0)
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v1)
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v2)
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v3)
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v4)
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v5)
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v6)
+  call void asm sideeffect "; use $0", "v"(<32 x i32> %v7)
+  ret void
+}
+
+define void @foo() #0 {
+  ret void
+}
+
+attributes #0 = { noinline }
+
+; GCN-LABEL: {{^}}k256_w8:
+; GFX90A: NumVgprs: 32
+; GFX90A: NumAgprs: 32
+; GFX90A: TotalNumVgprs: 64
+define amdgpu_kernel void @k256_w8() #2568 {
+  call void @foo()
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k256_w8_no_agprs:
+; GFX90A: NumVgprs: 64
+; GFX90A: NumAgprs: 0
+; GFX90A: TotalNumVgprs: 64
+define amdgpu_kernel void @k256_w8_no_agprs() #2568 {
+  call void @use256vgprs()
+  ret void
+}
+
+attributes #2568 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="8" }
+
+; GCN-LABEL: {{^}}k256_w4:
+; GFX90A: NumVgprs: 64
+; GFX90A: NumAgprs: 64
+; GFX90A: TotalNumVgprs: 128
+define amdgpu_kernel void @k256_w4() #2564 {
+  call void @foo()
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k256_w4_no_agprs:
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 0
+; GFX90A: TotalNumVgprs: 128
+define amdgpu_kernel void @k256_w4_no_agprs() #2564 {
+  call void @use256vgprs()
+  ret void
+}
+
+attributes #2564 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="4" }
+
+; GCN-LABEL: {{^}}k256_w2:
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 128
+; GFX90A: TotalNumVgprs: 256
+define amdgpu_kernel void @k256_w2() #2562 {
+  call void @foo()
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k256_w2_no_agprs:
+; GFX90A: NumVgprs: 256
+; GFX90A: NumAgprs: 0
+; GFX90A: TotalNumVgprs: 256
+define amdgpu_kernel void @k256_w2_no_agprs() #2562 {
+  call void @use256vgprs()
+  ret void
+}
+
+attributes #2562 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="2" }
+
+; GCN-LABEL: {{^}}k256_w1:
+; GFX90A: NumVgprs: 256
+; GFX90A: NumAgprs: 256
+; GFX90A: TotalNumVgprs: 512
+define amdgpu_kernel void @k256_w1() #2561 {
+  call void @foo()
+  call void @use512vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k256_w1_no_agprs:
+; GFX90A: NumVgprs: 256
+; GFX90A: NumAgprs: 256
+; GFX90A: TotalNumVgprs: 512
+define amdgpu_kernel void @k256_w1_no_agprs() #2561 {
+  call void @use512vgprs()
+  ret void
+}
+
+attributes #2561 = { nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-waves-per-eu"="1" }
+
+; GCN-LABEL: {{^}}k512_no_agprs:
+; GFX90A: NumVgprs: 256
+; GFX90A: NumAgprs: 0
+; GFX90A: TotalNumVgprs: 256
+define amdgpu_kernel void @k512_no_agprs() #512 {
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k512_call:
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 128
+; GFX90A: TotalNumVgprs: 256
+define amdgpu_kernel void @k512_call() #512 {
+  call void @foo()
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k512_virtual_agpr:
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 128
+; GFX90A: TotalNumVgprs: 256
+define amdgpu_kernel void @k512_virtual_agpr() #512 {
+  %a0 = call i32 asm sideeffect "; def $0", "=a"()
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k512_physical_agpr:
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 128
+; GFX90A: TotalNumVgprs: 256
+define amdgpu_kernel void @k512_physical_agpr() #512 {
+  call void asm sideeffect "", "~{a8}" ()
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}f512:
+; GFX90A: NumVgprs: 12{{[0-9]}}
+; GFX90A: NumAgprs: {{[1-9]}}
+define void @f512() #512 {
+  call void @use256vgprs()
+  ret void
+}
+
+attributes #512 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
+
+; GCN-LABEL: {{^}}k1024:
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 0
+; GFX90A: TotalNumVgprs: 128
+define amdgpu_kernel void @k1024() #1024 {
+  call void @use256vgprs()
+  ret void
+}
+
+; GCN-LABEL: {{^}}k1024_call:
+; GFX90A: NumVgprs: 64
+; GFX90A: NumAgprs: 64
+; GFX90A: TotalNumVgprs: 128
+define amdgpu_kernel void @k1024_call() #1024 {
+  call void @foo()
+  call void @use256vgprs()
+  ret void
+}
+
+attributes #1024 = { nounwind "amdgpu-flat-work-group-size"="1024,1024" }


        


More information about the llvm-commits mailing list