[llvm-branch-commits] [llvm] AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3 (PR #179226)

Fri Feb 20 07:59:38 PST 2026

https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179226

>From 08fd28025a4510bf489c6d5d5f8a0df3222fa99d Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 12 Feb 2026 18:02:57 +0100
Subject: [PATCH] AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3

Codegen for v_dual_dot2acc_f32_f16/bf16 for targets that only have the VOP3
version of the instruction.
Since there is no VOP2 version, introduce a temporary MIR DOT2ACC pseudo
that is selected when there are no src_modifiers. This DOT2ACC pseudo
has src2 tied to dst (like the VOP2 version); PostRA pseudo expansion will
restore the pseudo to the VOP3 version of the instruction.
CreateVOPD will recognize such a VOP3 pseudo and generate v_dual_dot2acc.
---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   3 +
 llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp      |   5 +-
 llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp       |   2 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.h          |  16 ++
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   8 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  35 ++-
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |   4 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll      | 186 +++++++------
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 256 ++++++++++--------
 10 files changed, 315 insertions(+), 210 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 07fb32173c2a3..1f4f1fbc15622 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -2645,6 +2645,9 @@ def isWave32Strict : Predicate<"Subtarget->isWave32()">,
 def isWave64Strict : Predicate<"Subtarget->isWave64()">,
   AssemblerPredicate <(all_of FeatureWavefrontSize64)>;
 
+def HasOnlyDualDot2AccF32F16 : Predicate<"Subtarget->hasVOPDInsts() && Subtarget->hasDot10Insts() && !Subtarget->hasDot5Insts()">;
+def HasOnlyDualDot2AccF32BF16 : Predicate<"Subtarget->hasVOPDInsts() && Subtarget->hasDot12Insts() && !Subtarget->hasDot13Insts()">;
+
 //===----------------------------------------------------------------------===//
 // HwModes
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index 72805aa9165b6..0118c2436d7a4 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -94,14 +94,15 @@ class GCNCreateVOPD {
     for (auto CompIdx : VOPD::COMPONENTS) {
       auto CompSrcOprNum = InstInfo[CompIdx].getCompSrcOperandsNum();
       bool IsVOP3 = SII->isVOP3(*MI[CompIdx]);
+      bool IsVOP3Dot = IsVOP3 && SII->isDOT(*MI[CompIdx]);
       for (unsigned CompSrcIdx = 0; CompSrcIdx < CompSrcOprNum; ++CompSrcIdx) {
         if (AMDGPU::hasNamedOperand(VOPDOpc, Mods[CompIdx][CompSrcIdx])) {
           const MachineOperand *Mod =
               SII->getNamedOperand(*MI[CompIdx], SrcMods[CompSrcIdx]);
           VOPDInst.addImm(Mod ? Mod->getImm() : 0);
         }
-        auto MCOprIdx =
-            InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx, IsVOP3);
+        auto MCOprIdx = InstInfo[CompIdx].getIndexOfSrcInMCOperands(
+            CompSrcIdx, IsVOP3, IsVOP3Dot);
         VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx));
       }
       if (MI[CompIdx]->getOpcode() == AMDGPU::V_CNDMASK_B32_e32 && CI.IsVOPD3)
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 663f53889ac74..4300d5a3a8dd2 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -44,7 +44,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
 
   if (IsVOPD3 && !ST.hasVOPD3())
     return false;
-  if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
+  if (!IsVOPD3 && (TII.isVOP3WithoutVOPD(MIX) || TII.isVOP3WithoutVOPD(MIY)))
     return false;
   if (TII.isDPP(MIX) || TII.isDPP(MIY))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index b051f790118ef..9dabafe1a0a12 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2069,6 +2069,16 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
   const AMDGPU::LaneMaskConstants &LMC = AMDGPU::LaneMaskConstants::get(ST);
   switch (MI.getOpcode()) {
   default: return TargetInstrInfo::expandPostRAPseudo(MI);
+  case AMDGPU::V_DOT2ACC_F32_F16_PSEUDO:
+    MI.setDesc(get(AMDGPU::V_DOT2_F32_F16));
+    MI.untieRegOperand(6);
+    break;
+
+  case AMDGPU::V_DOT2ACC_F32_BF16_PSEUDO:
+    MI.setDesc(get(AMDGPU::V_DOT2_F32_BF16));
+    MI.untieRegOperand(6);
+    break;
+
   case AMDGPU::S_MOV_B64_term:
     // This is only a terminator to get the correct spill code placement during
     // register allocation.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c945533f0f2ab..5db057de4b298 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -566,6 +566,22 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isVOP3(uint32_t Opcode) const { return isVOP3(get(Opcode)); }
 
+  static bool isVOP3WithoutVOPD(const MachineInstr &MI) {
+    if (MI.getOpcode() == AMDGPU::V_DOT2_F32_F16 ||
+        MI.getOpcode() == AMDGPU::V_DOT2_F32_BF16) {
+      // VOPD if no src_mods, no clamp, no inline const and src2 same as dst.
+      return MI.getOperand(1).getImm() != SISrcMods::OP_SEL_1 ||
+             !MI.getOperand(2).isReg() ||
+             MI.getOperand(3).getImm() != SISrcMods::OP_SEL_1 ||
+             !MI.getOperand(4).isReg() ||
+             MI.getOperand(5).getImm() != SISrcMods::OP_SEL_1 ||
+             !MI.getOperand(6).isReg() ||
+             MI.getOperand(0).getReg() != MI.getOperand(6).getReg() ||
+             MI.getOperand(7).getImm() != 0;
+    }
+    return isVOP3(MI.getDesc());
+  }
+
   static bool isSDWA(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & SIInstrFlags::SDWA;
   }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index fa24383c90fa6..5bc7cd3a55ada 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -843,12 +843,18 @@ class ComponentLayout {
   unsigned getIndexOfDstInMCOperands() const { return MC_DST_IDX[Kind]; }
 
   // Return the index of the specified src operand in MCInst operands.
-  unsigned getIndexOfSrcInMCOperands(unsigned CompSrcIdx, bool VOPD3) const {
+  unsigned getIndexOfSrcInMCOperands(unsigned CompSrcIdx, bool VOPD3,
+                                     bool VOP3Dot = false) const {
     assert(CompSrcIdx < Component::MAX_SRC_NUM);
 
     if (Kind == SINGLE && CompSrcIdx == 2 && BitOp3Idx != -1)
       return BitOp3Idx;
 
+    if (VOP3Dot) {
+      return SINGLE_MC_SRC_IDX[3][CompSrcIdx] + getPrevCompSrcNum() +
+             (Kind != SINGLE ? 1 : 0);
+    }
+
     if (VOPD3) {
       return SINGLE_MC_SRC_IDX[VOPD3ModsNum][CompSrcIdx] + getPrevCompSrcNum() +
              getPrevCompVOPD3ModsNum() + (Kind != SINGLE ? 1 : 0);
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 89e59c1734104..c93e01ad4f693 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -87,11 +87,13 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
 }
 
 multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
-                                SDPatternOperator node = null_frag> {
+                                SDPatternOperator node = null_frag,
+                                bits<6> VOPDOp, string VOPDName> {
   def NAME : VOP3P_Pseudo<OpName, P,
                           getVOP3PModPat<P, node,
                                          1 /*HasExplicitClamp*/, 1/*IsDOT*/,
-                                         VOP3PModsDOT, VOP3PModsF32>.ret>;
+                                         VOP3PModsDOT, VOP3PModsF32>.ret>,
+             VOPD_Component<VOPDOp, VOPDName>;
   let SubtargetPredicate = isGFX11Plus in {
   if P.HasExtVOP3DPP then
     def _dpp : VOP3_DPP_Pseudo<OpName, P> {
@@ -616,7 +618,7 @@ defm V_DOT2_F32_F16 :
   VOP3PInstDotWithDual<"v_dot2_f32_f16",
                        VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
                                      /*HasDPP*/ 1>,
-                       AMDGPUfdot2>;
+                       AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
 
 let OtherPredicates = [HasDot7Insts] in {
 defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
@@ -641,12 +643,37 @@ let SubtargetPredicate = HasDot12Insts  in {
 
 defm V_DOT2_F32_BF16 :
   VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
-                       int_amdgcn_fdot2_f32_bf16>;
+                       int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
 
 } // End SubtargetPredicate = HasDot12Insts
 
 } // End let IsDOT = 1
 
+let IsDOT = 1, Constraints = "$vdst = $src2" in {
+let OtherPredicates = [HasOnlyDualDot2AccF32F16] in
+def V_DOT2ACC_F32_F16_PSEUDO
+  : VOP3P_Pseudo<"", VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR>>;
+
+let OtherPredicates = [HasOnlyDualDot2AccF32BF16] in
+def V_DOT2ACC_F32_BF16_PSEUDO :
+  VOP3P_Pseudo<"", VOP3P_Profile<VOP_F32_V2BF16_V2BF16_F32, VOP3_REGULAR>>;
+}
+
+class Dot2AccPseudo_Pat <SDPatternOperator node, Instruction inst, ValueType ty>
+  : GCNPat <
+    (f32 (node (ty (VOP3PNoModsDOT ty:$src0)), (ty (VOP3PNoModsDOT ty:$src1)),
+               (f32 (VOP3PNoModsF32 f32:$src2)), (i1 DSTCLAMP.NONE))),
+    (f32 (inst (i32 SRCMODS.OP_SEL_1), $src0, (i32 SRCMODS.OP_SEL_1), $src1,
+               (i32 SRCMODS.OP_SEL_1), $src2))
+>;
+
+let SubtargetPredicate = HasOnlyDualDot2AccF32F16 in
+def : Dot2AccPseudo_Pat<AMDGPUfdot2, V_DOT2ACC_F32_F16_PSEUDO, v2f16>;
+
+let SubtargetPredicate = HasOnlyDualDot2AccF32BF16 in
+def : Dot2AccPseudo_Pat<int_amdgcn_fdot2_f32_bf16, V_DOT2ACC_F32_BF16_PSEUDO,
+                        v2bf16>;
+
 multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
   let IsDOT = 1 in
   defm NAME : VOP3PInst<OpName, VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED_NO_OPSEL>,
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 48ef0f92e9f2d..da5a46588f0f2 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -34,8 +34,8 @@ class VOP <string opName> {
   string OpName = opName;
 }
 
-// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
-defvar VOPDX_Max_Index = 12;
+// First 14 insts from VOPDY (indices 0-13) are also VOPDX.
+defvar VOPDX_Max_Index = 13;
 defvar VOPD3X_Max_Index = 36;
 
 class VOPD_Component<bits<6> OpIn, string vOPDName> {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index a16971843c247..44de014be30f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -13,7 +13,8 @@ define float @v_fdot2_f32_bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11PLUS:    v_mov_b32_e32 v0, v2
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
   ret float %r
 }
@@ -39,13 +40,15 @@ define float @v_fdot2_f32_bf16_neg_a_lo(<2 x bfloat> %a, <2 x bfloat> %b, float
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_lo:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_lo:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v0
 ; GFX12:    v_bfi_b32 v0, 0xffff, v3, v0
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %a_lo = extractelement <2 x bfloat> %a, i32 0
   %neg.a_lo = fneg bfloat %a_lo
   %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
@@ -66,14 +69,16 @@ define float @v_fdot2_f32_bf16_neg_a_hi(<2 x bfloat> %a, <2 x bfloat> %b, float
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_hi:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_hi:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX12:    v_perm_b32 v0, v3, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %a_hi = extractelement <2 x bfloat> %a, i32 1
   %neg.a_hi = fneg bfloat %a_hi
   %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
@@ -102,13 +107,15 @@ define float @v_fdot2_f32_bf16_neg_b_lo(<2 x bfloat> %a, <2 x bfloat> %b, float
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_lo:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_lo:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v1
 ; GFX12:    v_bfi_b32 v1, 0xffff, v3, v1
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %b_lo = extractelement <2 x bfloat> %b, i32 0
   %neg.b_lo = fneg bfloat %b_lo
   %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
@@ -129,14 +136,16 @@ define float @v_fdot2_f32_bf16_neg_b_hi(<2 x bfloat> %a, <2 x bfloat> %b, float
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_hi:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_hi:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX12:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %b_hi = extractelement <2 x bfloat> %b, i32 1
   %neg.b_hi = fneg bfloat %b_hi
   %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
@@ -185,12 +194,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_a(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   ret float %r
@@ -207,12 +218,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_a(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   ret float %r
@@ -229,12 +242,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_b(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   ret float %r
@@ -251,12 +266,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_b(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   ret float %r
@@ -270,7 +287,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a(<2 x bfloat> %b, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x3f003f00, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x3f003f00, v0, v1
+; GFX11PLUS:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 0.5, bfloat 0.5>, <2 x bfloat> %b, float %c, i1 false)
   ret float %ret
 }
@@ -283,7 +301,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v0, 0x40004000, v1
+; GFX11PLUS:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -297,7 +316,10 @@ define float @v_fdot2_f32_bf16_inline_literal_c(<2 x bfloat> %a, <2 x bfloat> %b
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, 2.0
+; GFX11PLUS:    s_mov_b32 s0, 2.0
+; GFX11PLUS:    v_mov_b32_e32 v2, s0
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX11PLUS:    v_mov_b32_e32 v0, v2
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false)
   ret float %ret
 }
@@ -483,9 +505,8 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -502,8 +523,8 @@ define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %neg.a = fneg <2 x bfloat> %a
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -524,17 +545,16 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v0
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_bfi_b32 v0, 0xffff, v6, v0
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %a_lo = extractelement <2 x bfloat> %a, i32 0
   %neg.a_lo = fneg bfloat %a_lo
   %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
@@ -558,18 +578,17 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
 ; GFX12:    v_perm_b32 v0, v6, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %a_hi = extractelement <2 x bfloat> %a, i32 1
   %neg.a_hi = fneg bfloat %a_hi
   %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
@@ -589,8 +608,8 @@ define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %neg.b = fneg <2 x bfloat> %b
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -611,17 +630,16 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v1
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_bfi_b32 v1, 0xffff, v6, v1
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %b_lo = extractelement <2 x bfloat> %b, i32 0
   %neg.b_lo = fneg bfloat %b_lo
   %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
@@ -645,18 +663,17 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
 ; GFX12:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %b_hi = extractelement <2 x bfloat> %b, i32 1
   %neg.b_hi = fneg bfloat %b_hi
   %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
@@ -677,8 +694,8 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -697,8 +714,8 @@ define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -718,16 +735,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -747,16 +762,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -776,16 +789,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -805,16 +816,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -831,9 +840,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_dual(<2 x bfloat> %b, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x40004000, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -849,9 +858,9 @@ define float @v_fdot2_f32_bf16_inline_literal_b_dual(<2 x bfloat> %a, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v0, 0x40004000, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -868,9 +877,10 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, 2.0
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    s_mov_b32 s0, 2.0
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v4, v2, v3 :: v_dual_mov_b32 v5, s0
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v0, v1, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v5, v4
   %r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 2e6a03fec38e9..7aefe45ff7e40 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -30,11 +30,13 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
 ;
 ; GFX1170-LABEL: v_fdot2:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
   ret float %r
 }
@@ -74,13 +76,15 @@ define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_neg_a_lo:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_neg_a_lo:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v0
 ; GFX12:    v_bfi_b32 v0, 0xffff, v3, v0
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %a_lo = extractelement <2 x half> %a, i32 0
   %neg.a_lo = fneg half %a_lo
   %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
@@ -115,14 +119,16 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_neg_a_hi:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_neg_a_hi:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX12:    v_perm_b32 v0, v3, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %a_hi = extractelement <2 x half> %a, i32 1
   %neg.a_hi = fneg half %a_hi
   %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
@@ -165,13 +171,15 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_neg_b_lo:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_neg_b_lo:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v1
 ; GFX12:    v_bfi_b32 v1, 0xffff, v3, v1
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %b_lo = extractelement <2 x half> %b, i32 0
   %neg.b_lo = fneg half %b_lo
   %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
@@ -206,14 +214,16 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_neg_b_hi:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_neg_b_hi:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX12:    v_xor_b32_e32 v3, 0x8000, v3
 ; GFX12:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %b_hi = extractelement <2 x half> %b, i32 1
   %neg.b_hi = fneg half %b_hi
   %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
@@ -298,12 +308,14 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_opsel_lo_a:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v0.l, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_opsel_lo_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   ret float %r
@@ -334,12 +346,14 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_opsel_hi_a:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v0.h, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_opsel_hi_a:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   ret float %r
@@ -370,12 +384,14 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_opsel_lo_b:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v1.l, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_opsel_lo_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   ret float %r
@@ -406,12 +422,14 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX1170-LABEL: v_fdot2_opsel_hi_b:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v1.h, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_opsel_hi_b:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   ret float %r
@@ -439,11 +457,13 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
 ;
 ; GFX1170-LABEL: v_fdot2_inline_literal_a:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
+; GFX1170:    v_dot2_f32_f16 v1, 0x40004000, v0, v1
+; GFX1170:    v_mov_b32_e32 v0, v1
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_a:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
+; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v0, v1
+; GFX12:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
   ret float %ret
 }
@@ -470,11 +490,13 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
 ;
 ; GFX1170-LABEL: v_fdot2_inline_literal_b:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
+; GFX1170:    v_dot2_f32_f16 v1, v0, 0x40004000, v1
+; GFX1170:    v_mov_b32_e32 v0, v1
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_b:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
+; GFX12:    v_dot2_f32_f16 v1, v0, 0x40004000, v1
+; GFX12:    v_mov_b32_e32 v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -504,11 +526,17 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
 ;
 ; GFX1170-LABEL: v_fdot2_inline_literal_c:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX1170:    s_mov_b32 s0, 2.0
+; GFX1170:    v_mov_b32_e32 v2, s0
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_mov_b32_e32 v0, v2
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_c:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX12:    s_mov_b32 s0, 2.0
+; GFX12:    v_mov_b32_e32 v2, s0
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_mov_b32_e32 v0, v2
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false)
   ret float %ret
 }
@@ -800,15 +828,15 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d
 ;
 ; GFX1170-LABEL: v_fdot2_dual:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_dual:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -843,14 +871,14 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 ; GFX1170-LABEL: v_fdot2_neg_a_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX12-LABEL: v_fdot2_neg_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %neg.a = fneg <2 x half> %a
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -890,17 +918,17 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ; GFX1170-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v0
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
 ; GFX12:    v_bfi_b32 v0, 0xffff, v6, v0
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %a_lo = extractelement <2 x half> %a, i32 0
   %neg.a_lo = fneg half %a_lo
   %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
@@ -943,18 +971,18 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ; GFX1170-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
 ; GFX12:    v_perm_b32 v0, v6, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %a_hi = extractelement <2 x half> %a, i32 1
   %neg.a_hi = fneg half %a_hi
   %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
@@ -992,14 +1020,14 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 ; GFX1170-LABEL: v_fdot2_neg_b_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX12-LABEL: v_fdot2_neg_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %neg.b = fneg <2 x half> %b
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1039,17 +1067,17 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ; GFX1170-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
 ; GFX12:    v_bfi_b32 v1, 0xffff, v6, v1
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %b_lo = extractelement <2 x half> %b, i32 0
   %neg.b_lo = fneg half %b_lo
   %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
@@ -1092,18 +1120,18 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ; GFX1170-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
 ; GFX12:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %b_hi = extractelement <2 x half> %b, i32 1
   %neg.b_hi = fneg half %b_hi
   %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
@@ -1144,14 +1172,14 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 ; GFX1170-LABEL: v_fdot2_neg_c_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX12-LABEL: v_fdot2_neg_c_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1190,14 +1218,14 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 ; GFX1170-LABEL: v_fdot2_abs_c_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX12-LABEL: v_fdot2_abs_c_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_add_f32_e32 v0, v0, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1236,16 +1264,16 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX1170-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v0.l, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1284,16 +1312,16 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX1170-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v0.h, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1332,16 +1360,16 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX1170-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v1.l, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1380,16 +1408,16 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ; GFX1170-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX1170:  ; %bb.0:
 ; GFX1170:    v_mov_b16_e32 v1.h, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX1170:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX12:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1424,15 +1452,15 @@ define float @v_fdot2_inline_literal_a_dual(<2 x half> %b, float %c, <2 x half>
 ;
 ; GFX1170-LABEL: v_fdot2_inline_literal_a_dual:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
-; GFX1170:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v1, 0x40004000, v0, v1
+; GFX1170:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX1170:    v_add_f32_e32 v0, v1, v4
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_a_dual:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v0, v1
+; GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX12:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1466,15 +1494,15 @@ define float @v_fdot2_inline_literal_b_dual(<2 x half> %a, float %c, <2 x half>
 ;
 ; GFX1170-LABEL: v_fdot2_inline_literal_b_dual:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
-; GFX1170:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    v_dot2_f32_f16 v1, v0, 0x40004000, v1
+; GFX1170:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX1170:    v_add_f32_e32 v0, v1, v4
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_b_dual:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_f16 v1, v0, 0x40004000, v1
+; GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX12:    v_add_f32_e32 v0, v1, v4
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1511,15 +1539,19 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h
 ;
 ; GFX1170-LABEL: v_fdot2_inline_literal_c_dual:
 ; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX1170:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX1170:    v_add_f32_e32 v0, v0, v1
+; GFX1170:    s_mov_b32 s0, 2.0
+; GFX1170:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX1170:    v_mov_b32_e32 v5, s0
+; GFX1170:    v_dot2_f32_f16 v5, v0, v1, v5
+; GFX1170:    v_add_f32_e32 v0, v5, v4
 ;
 ; GFX12-LABEL: v_fdot2_inline_literal_c_dual:
 ; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    s_mov_b32 s0, 2.0
+; GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX12:    v_mov_b32_e32 v5, s0
+; GFX12:    v_dot2_f32_f16 v5, v0, v1, v5
+; GFX12:    v_add_f32_e32 v0, v5, v4
   %r0 = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1