[llvm] 2f38a8f - AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3 (#179226)

Wed Mar 25 03:47:13 PDT 2026

Author: Petar Avramovic
Date: 2026-03-25T11:47:07+01:00
New Revision: 2f38a8fc57b7b64ea26ff667524d4850f8853cae

URL: https://github.com/llvm/llvm-project/commit/2f38a8fc57b7b64ea26ff667524d4850f8853cae
DIFF: https://github.com/llvm/llvm-project/commit/2f38a8fc57b7b64ea26ff667524d4850f8853cae.diff

LOG: AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3 (#179226)

For V_DOT2_F32_F16 and V_DOT2_F32_BF16, add their VOPDName and mark
them with usesCustomInserter, which is used to add register allocation
hints before RA so that dst and src2 are preferably assigned the same
physical register. When the hint is satisfied, canMapVOP3PToVOPD
recognises the instruction as eligible for VOPD pairing by checking that
it is VOP2-like: dst==src2, no source modifiers, no clamp, and src1 is a
register. Mark both instructions as commutable to allow a literal in src1
to be moved to src0, since VOPD only permits a literal in src0.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/VOP3PInstructions.td
    llvm/lib/Target/AMDGPU/VOPInstructions.td
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index b17cabf37d53f..4659dcd1a78cb 100644

--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -34,6 +34,36 @@ using namespace llvm;
 
 #define DEBUG_TYPE "gcn-vopd-utils"
 
+// Return true if MI is a V_DOT2_F32_F16/BF16 VOP3P instruction that is VOP2
+// like and so can map to a VOPD opcode: every src modifier is just OP_SEL_1,
+// no clamp, src1 and src2 are registers, and src2 is the same register as dst.
+static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
+    return false;
+  // src0 itself is not checked: it can be a register or a literal
+  int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+  if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
+    return false;
+  int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+  if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
+    return false;
+  int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+  if (!MI.getOperand(Src1Idx).isReg())
+    return false;
+  int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
+  if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
+    return false;
+  int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+  if (!MI.getOperand(Src2Idx).isReg())
+    return false;
+  int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
+  if (MI.getOperand(ClampIdx).getImm() != 0)
+    return false;
+  int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+  return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
+}
+
 bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
                                    const MachineInstr &MIX,
                                    const MachineInstr &MIY, bool IsVOPD3) {
@@ -44,7 +74,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
 
   if (IsVOPD3 && !ST.hasVOPD3())
     return false;
-  if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
+  if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
+                   (TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
     return false;
   if (TII.isDPP(MIX) || TII.isDPP(MIY))
     return false;

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d0fceb6717c38..79b6c239a6d65 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7011,6 +7011,14 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.getOperand(0).setReg(OriginalExec);
     return BB;
   }
+  case AMDGPU::V_DOT2_F32_F16:
+  case AMDGPU::V_DOT2_F32_BF16: {
+    // Hint RA to assign dst and src2 the same physical register.
+    // On targets that have a VOPD, but no VOP2, variant of the instruction,
+    // this is one of the conditions for converting VOP3P to VOPD.
+    MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg());
+    return BB;
+  }
   default:
     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
       if (!MI.mayStore())

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 749cead8a20fb..2b617e54bdfe9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -928,6 +928,12 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) {
     NumVOPD3Mods = 2;
     if (IsVOP3)
       SrcOperandsNum = 3;
+  } else if (Opcode == AMDGPU::V_DOT2_F32_F16 ||
+             Opcode == AMDGPU::V_DOT2_F32_BF16) {
+    // VOP3P opcodes that have a VOPD but no VOP2 version. Use the VOPD3 path
+    // in getIndexOfSrcInMCOperands to get the correct src operand indexes,
+    // while still generating VOPD, not VOPD3.
+    NumVOPD3Mods = SrcOperandsNum;
   } else if (isSISrcFPOperand(OpDesc,
                               getNamedOperandIdx(Opcode, OpName::src0))) {
     // All FP VOPD instructions have Neg modifiers for all operands except

diff  --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 9bde8634e2ee2..54832dd76c754 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -87,11 +87,13 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
 }
 
 multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
-                                SDPatternOperator node = null_frag> {
+                                SDPatternOperator node = null_frag,
+                                bits<6> VOPDOp, string VOPDName> {
   def NAME : VOP3P_Pseudo<OpName, P,
                           getVOP3PModPat<P, node,
                                          1 /*HasExplicitClamp*/, 1/*IsDOT*/,
-                                         VOP3PModsDOT, VOP3PModsF32>.ret>;
+                                         VOP3PModsDOT, VOP3PModsF32>.ret>,
+             VOPD_Component<VOPDOp, VOPDName>;
   let SubtargetPredicate = isGFX11Plus in {
   if P.HasExtVOP3DPP then
     def _dpp : VOP3_DPP_Pseudo<OpName, P> {
@@ -612,12 +614,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
   VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
 } // End OtherPredicates = [HasDot2Insts]
 
-let OtherPredicates = [HasDot10Insts] in
+let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 1 in
 defm V_DOT2_F32_F16 :
   VOP3PInstDotWithDual<"v_dot2_f32_f16",
                        VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
                                      /*HasDPP*/ 1>,
-                       AMDGPUfdot2>;
+                       AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
 
 let OtherPredicates = [HasDot7Insts] in {
 defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
@@ -640,9 +642,10 @@ def DOT2_BF16_Profile
 
 let SubtargetPredicate = HasDot12Insts  in {
 
+let isCommutable = 1, usesCustomInserter = 1 in
 defm V_DOT2_F32_BF16 :
   VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
-                       int_amdgcn_fdot2_f32_bf16>;
+                       int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
 
 } // End SubtargetPredicate = HasDot12Insts
 

diff  --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 82545a472cf17..40b0476a84d25 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -34,8 +34,8 @@ class VOP <string opName> {
   string OpName = opName;
 }
 
-// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
-defvar VOPDX_Max_Index = 12;
+// First 14 insts from VOPDY (VOPDOp 0 through 13) are also VOPDX.
+defvar VOPDX_Max_Index = 13;
 defvar VOPD3X_Max_Index = 36;
 
 class VOPD_Component<bits<6> OpIn, string vOPDName> {

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index e03b57cac6ab2..7cb0a50dc8b9d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -271,7 +271,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -439,7 +439,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c)
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp
+; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true)
   ret float %ret
 }
@@ -461,9 +461,8 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -473,15 +472,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
 define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %neg.a = fneg <2 x bfloat> %a
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -502,17 +501,16 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v0
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_bfi_b32 v0, 0xffff, v6, v0
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %a_lo = extractelement <2 x bfloat> %a, i32 0
   %neg.a_lo = fneg bfloat %a_lo
   %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
@@ -536,18 +534,17 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
 ; GFX12:    v_perm_b32 v0, v6, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %a_hi = extractelement <2 x bfloat> %a, i32 1
   %neg.a_hi = fneg bfloat %a_hi
   %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
@@ -560,15 +557,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %neg.b = fneg <2 x bfloat> %b
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -589,17 +586,16 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v1
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_bfi_b32 v1, 0xffff, v6, v1
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %b_lo = extractelement <2 x bfloat> %b, i32 0
   %neg.b_lo = fneg bfloat %b_lo
   %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
@@ -623,18 +619,17 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX12:    v_dot2_f32_bf16 v5, v3, v4, v5
 ; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
 ; GFX12:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dot2_f32_bf16 v2, v0, v1, v2
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %b_hi = extractelement <2 x bfloat> %b, i32 1
   %neg.b_hi = fneg bfloat %b_hi
   %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
@@ -647,15 +642,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -666,15 +661,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
 define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -694,16 +689,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -723,16 +716,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -752,16 +743,14 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -781,16 +770,14 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX11:  ; %bb.0:
 ; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX11:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX12-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX12:  ; %bb.0:
 ; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_bf16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX12:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
+; GFX12:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -810,9 +797,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x40004000, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -828,9 +814,8 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v1, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x40004000, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -846,9 +831,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_x(<2 x bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_x:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v4, v3, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, v4, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %e, <2 x bfloat> %d, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -864,9 +848,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_y(<2 x bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_y:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v1, v0, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, 0x40004000, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v1, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %b, <2 x bfloat> %a, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -882,9 +865,8 @@ define float @v_fdot2_f32_bf16_inline_literal_b_xy(<2 x bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_xy:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v2
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, 0x40004000, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -902,8 +884,8 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_c_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, 2.0
-; GFX11PLUS:    v_dot2_f32_bf16 v1, v2, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v4, v2, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v4
   %r0 = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float 2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -913,9 +895,9 @@ define float @v_fdot2_f32_bf16_inline_literal_c_dual(<2 x bfloat> %a, <2 x bfloa
 define float @v_fdot2_f32_bf16_clamp_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GCN-LABEL: v_fdot2_f32_bf16_clamp_dual:
 ; GCN:  ; %bb.0:
-; GCN:    v_dot2_f32_bf16 v0, v0, v1, v2 clamp
-; GCN:    v_dot2_f32_bf16 v1, v3, v4, v5 clamp
-; GCN:    v_add_f32_e32 v0, v0, v1
+; GCN:    v_dot2_f32_bf16 v2, v0, v1, v2 clamp
+; GCN:    v_dot2_f32_bf16 v5, v3, v4, v5 clamp
+; GCN:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 true)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 true)
   %r = fadd float %r0, %r1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 319d5b3e22760..73410d8f32ff9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -3,8 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck %s --check-prefixes=GCN,GFX950
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck %s --check-prefixes=GCN,GFX10
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX11
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1170 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck %s --check-prefixes=GCN,GFX11PLUS,GFX1170-GFX12
 
 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp)
 
@@ -28,13 +28,9 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
   ret float %r
 }
@@ -71,16 +67,10 @@ define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_lo:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_a_lo:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b32_e32 v3, 0x8000, v0
-; GFX12:    v_bfi_b32 v0, 0xffff, v3, v0
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_a_lo:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %a_lo = extractelement <2 x half> %a, i32 0
   %neg.a_lo = fneg half %a_lo
   %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
@@ -112,17 +102,10 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_hi:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_a_hi:
-; GFX12:  ; %bb.0:
-; GFX12:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX12:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX12:    v_perm_b32 v0, v3, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_a_hi:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v0.h, 0x8000, v0.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %a_hi = extractelement <2 x half> %a, i32 1
   %neg.a_hi = fneg half %a_hi
   %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
@@ -162,16 +145,10 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_lo:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_b_lo:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b32_e32 v3, 0x8000, v1
-; GFX12:    v_bfi_b32 v1, 0xffff, v3, v1
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_b_lo:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v1.l, 0x8000, v1.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %b_lo = extractelement <2 x half> %b, i32 0
   %neg.b_lo = fneg half %b_lo
   %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
@@ -203,17 +180,10 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_hi:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_neg_b_hi:
-; GFX12:  ; %bb.0:
-; GFX12:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX12:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX12:    v_perm_b32 v1, v3, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_neg_b_hi:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_xor_b16 v1.h, 0x8000, v1.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %b_hi = extractelement <2 x half> %b, i32 1
   %neg.b_hi = fneg half %b_hi
   %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
@@ -261,15 +231,10 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_a:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.l, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_a:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_a:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v0.l, v0.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   ret float %r
@@ -297,15 +262,10 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_a:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.h, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_a:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_a:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v0.h, v0.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   ret float %r
@@ -333,15 +293,10 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_b:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.l, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_b:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_lo_b:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v1.l, v1.h
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   ret float %r
@@ -369,15 +324,10 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_b:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.h, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_b:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170-GFX12-LABEL: v_fdot2_opsel_hi_b:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_mov_b16_e32 v1.h, v1.l
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   ret float %r
@@ -403,13 +353,9 @@ define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v1, 0x40004000, v0
 ; GFX11:    v_mov_b32_e32 v0, v1
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_a:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_a:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
   ret float %ret
 }
@@ -434,13 +380,9 @@ define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
 ; GFX11:    v_dot2acc_f32_f16 v1, 0x40004000, v0
 ; GFX11:    v_mov_b32_e32 v0, v1
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_b:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v1
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_b:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, 0x40004000, v0, v1
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -468,13 +410,9 @@ define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
 ; GFX11:    v_dot2acc_f32_f16 v2, v0, v1
 ; GFX11:    v_mov_b32_e32 v0, v2
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_c:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, 2.0
-;
-; GFX12-LABEL: v_fdot2_inline_literal_c:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false)
   ret float %ret
 }
@@ -538,20 +476,10 @@ define float @v_fdot2_opsel_lo_a_clamp(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX10:  ; %bb.0:
 ; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0] clamp
 ;
-; GFX11-LABEL: v_fdot2_opsel_lo_a_clamp:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX1170-LABEL: v_fdot2_opsel_lo_a_clamp:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.l, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_a_clamp:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX11PLUS-LABEL: v_fdot2_opsel_lo_a_clamp:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v0.l, v0.h
+; GFX11PLUS:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 true)
   ret float %r
@@ -572,20 +500,10 @@ define float @v_fdot2_opsel_hi_a_clamp(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX10:  ; %bb.0:
 ; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1] clamp
 ;
-; GFX11-LABEL: v_fdot2_opsel_hi_a_clamp:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX1170-LABEL: v_fdot2_opsel_hi_a_clamp:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.h, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_a_clamp:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX11PLUS-LABEL: v_fdot2_opsel_hi_a_clamp:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v0.h, v0.l
+; GFX11PLUS:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 true)
   ret float %r
@@ -606,20 +524,10 @@ define float @v_fdot2_opsel_lo_b_clamp(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX10:  ; %bb.0:
 ; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0] clamp
 ;
-; GFX11-LABEL: v_fdot2_opsel_lo_b_clamp:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX1170-LABEL: v_fdot2_opsel_lo_b_clamp:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.l, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_b_clamp:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX11PLUS-LABEL: v_fdot2_opsel_lo_b_clamp:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v1.l, v1.h
+; GFX11PLUS:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 true)
   ret float %r
@@ -640,20 +548,10 @@ define float @v_fdot2_opsel_hi_b_clamp(<2 x half> %a, <2 x half> %b, float %c) {
 ; GFX10:  ; %bb.0:
 ; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1] clamp
 ;
-; GFX11-LABEL: v_fdot2_opsel_hi_b_clamp:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX1170-LABEL: v_fdot2_opsel_hi_b_clamp:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.h, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_b_clamp:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GFX11PLUS-LABEL: v_fdot2_opsel_hi_b_clamp:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v1.h, v1.l
+; GFX11PLUS:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 true)
   ret float %r
@@ -696,7 +594,7 @@ define float @v_fdot2_inline_literal_b_clamp(<2 x half> %a, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_clamp:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_f16 v0, v0, 0x40004000, v1 clamp
+; GFX11PLUS:    v_dot2_f32_f16 v0, 0x40004000, v0, v1 clamp
   %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 true)
   ret float %ret
 }
@@ -712,9 +610,9 @@ define float @v_fdot2_inline_literal_c_clamp(<2 x half> %a, <2 x half> %b) {
 define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_dual:
 ; GFX950:  ; %bb.0:
@@ -728,22 +626,10 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -753,39 +639,33 @@ define float @v_fdot2_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d
 define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_a_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_a_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_neg_a_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_neg_a_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_neg_a_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_a_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_neg_a_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %neg.a = fneg <2 x half> %a
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -796,9 +676,9 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX950:  ; %bb.0:
@@ -811,30 +691,15 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_a_lo_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_a_lo_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_lo_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_a_lo_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b32_e32 v6, 0x8000, v0
-; GFX12:    v_bfi_b32 v0, 0xffff, v6, v0
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_a_lo_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %a_lo = extractelement <2 x half> %a, i32 0
   %neg.a_lo = fneg half %a_lo
   %neg_lo.a = insertelement <2 x half> %a, half %neg.a_lo, i32 0
@@ -847,9 +712,9 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX950:  ; %bb.0:
@@ -863,31 +728,15 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_a_hi_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_a_hi_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_a_hi_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_a_hi_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX12:    v_perm_b32 v0, v6, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_a_hi_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v0.h, 0x8000, v0.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %a_hi = extractelement <2 x half> %a, i32 1
   %neg.a_hi = fneg half %a_hi
   %neg_hi.a = insertelement <2 x half> %a, half %neg.a_hi, i32 1
@@ -900,39 +749,33 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_b_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_b_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_neg_b_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_neg_b_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_neg_b_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_b_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_neg_b_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %neg.b = fneg <2 x half> %b
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -943,9 +786,9 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX950:  ; %bb.0:
@@ -958,30 +801,15 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_b_lo_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_b_lo_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_lo_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_b_lo_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_xor_b32_e32 v6, 0x8000, v1
-; GFX12:    v_bfi_b32 v1, 0xffff, v6, v1
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_b_lo_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %b_lo = extractelement <2 x half> %b, i32 0
   %neg.b_lo = fneg half %b_lo
   %neg_lo.b = insertelement <2 x half> %b, half %neg.b_lo, i32 0
@@ -994,9 +822,9 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX950:  ; %bb.0:
@@ -1010,31 +838,15 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 ;
 ; GFX10-LABEL: v_fdot2_neg_b_hi_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_neg_b_hi_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_neg_b_hi_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_b_hi_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX12:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX12:    v_perm_b32 v1, v6, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_neg_b_hi_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_xor_b16 v1.h, 0x8000, v1.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %b_hi = extractelement <2 x half> %b, i32 1
   %neg.b_hi = fneg half %b_hi
   %neg_hi.b = insertelement <2 x half> %b, half %neg.b_hi, i32 1
@@ -1047,39 +859,33 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
 define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_neg_c_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_neg_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_neg_c_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_neg_c_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_neg_c_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_neg_c_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_neg_c_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1090,39 +896,33 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_abs_c_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_abs_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX950:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX950:    v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v0, v5
+; GFX950:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX10-LABEL: v_fdot2_abs_c_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX11-LABEL: v_fdot2_abs_c_dual:
 ; GFX11:  ; %bb.0:
-; GFX11:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX11:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX11:    v_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v0, v5
-;
-; GFX1170-LABEL: v_fdot2_abs_c_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_abs_c_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11:    v_add_f32_e32 v0, v2, v5
+;
+; GFX1170-GFX12-LABEL: v_fdot2_abs_c_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX1170-GFX12:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX1170-GFX12:    v_add_f32_e32 v0, v2, v5
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1133,9 +933,9 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
 define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX950:  ; %bb.0:
@@ -1147,29 +947,15 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_lo_a_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[1,0,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_lo_a_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v0.l, v0.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_a_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.l, v0.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_a_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v0, v0, v0, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_lo_a_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v0.l, v0.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1180,9 +966,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX950:  ; %bb.0:
@@ -1194,29 +980,15 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_hi_a_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[0,1,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_hi_a_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v0.h, v0.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_a_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v0.h, v0.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_a_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v0, v0, v0, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_hi_a_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v0.h, v0.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %a, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %shuf, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1227,9 +999,9 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX950:  ; %bb.0:
@@ -1241,29 +1013,15 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_lo_b_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel:[0,1,0]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_lo_b_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v1.l, v1.h
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_lo_b_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.l, v1.h
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_lo_b_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v1, v1, v1, 0x7060302
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_lo_b_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v1.l, v1.h
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1274,9 +1032,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX950:  ; %bb.0:
@@ -1288,29 +1046,15 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 ;
 ; GFX10-LABEL: v_fdot2_opsel_hi_b_dual:
 ; GFX10:  ; %bb.0:
-; GFX10:    v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
+; GFX10:    v_dot2_f32_f16 v2, v0, v1, v2 op_sel_hi:[1,0,1]
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
-; GFX10:    v_add_f32_e32 v0, v0, v5
-;
-; GFX11-LABEL: v_fdot2_opsel_hi_b_dual:
-; GFX11:  ; %bb.0:
-; GFX11:    v_mov_b16_e32 v1.h, v1.l
-; GFX11:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
-; GFX11:    v_add_f32_e32 v0, v2, v5
+; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX1170-LABEL: v_fdot2_opsel_hi_b_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_mov_b16_e32 v1.h, v1.l
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_opsel_hi_b_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_perm_b32 v1, v1, v1, 0x5040100
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_opsel_hi_b_dual:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_mov_b16_e32 v1.h, v1.l
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v5, v3, v4 :: v_dual_dot2acc_f32_f16 v2, v0, v1
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %shuf = shufflevector <2 x half> %b, <2 x half> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
@@ -1324,9 +1068,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
 define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_a_x:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, v4, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_a_x:
 ; GFX950:  ; %bb.0:
@@ -1340,22 +1084,10 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, v3, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_a_x:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_a_x:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a_x:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, v3, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1365,9 +1097,9 @@ define float @v_fdot2_inline_literal_a_x(<2 x half> %a, <2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_a_y:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX906:    v_dot2_f32_f16 v1, 2.0, v4, v5 op_sel_hi:[0,1,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, v1, v2
+; GFX906:    v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_a_y:
 ; GFX950:  ; %bb.0:
@@ -1381,22 +1113,10 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_a_y:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_a_y:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a_y:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v0, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1406,9 +1126,9 @@ define float @v_fdot2_inline_literal_a_y(<2 x half> %a, <2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_a_xy:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, 2.0, v1, v2 op_sel_hi:[0,1,1]
-; GFX906:    v_dot2_f32_f16 v1, 2.0, v4, v5 op_sel_hi:[0,1,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, 2.0, v1, v2 op_sel_hi:[0,1,1]
+; GFX906:    v_dot2_f32_f16 v5, 2.0, v4, v5 op_sel_hi:[0,1,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_a_xy:
 ; GFX950:  ; %bb.0:
@@ -1422,22 +1142,10 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v4
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_a_xy:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_a_xy:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX1170:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_a_xy:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, 0x40004000, v1, v2
-; GFX12:    v_dot2_f32_f16 v1, 0x40004000, v4, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_a_xy:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v4
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1447,9 +1155,9 @@ define float @v_fdot2_inline_literal_a_xy(<2 x half> %a, <2 x half> %b, float %c
 define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_b_x:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v4, v3, v5
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v4, v3, v5
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_b_x:
 ; GFX950:  ; %bb.0:
@@ -1463,22 +1171,10 @@ define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, v4, v3
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_b_x:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, v4, v3
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_b_x:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX1170:    v_dot2_f32_f16 v1, v4, v3, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b_x:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX12:    v_dot2_f32_f16 v1, v4, v3, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_x:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, v4, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %e, <2 x half> %d, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1488,9 +1184,9 @@ define float @v_fdot2_inline_literal_b_x(<2 x half> %a, <2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_b_y:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v1, v0, v2
-; GFX906:    v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v1, v0, v2
+; GFX906:    v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_b_y:
 ; GFX950:  ; %bb.0:
@@ -1504,22 +1200,10 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c,
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v3
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_b_y:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_b_y:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v1, v0, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b_y:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v1, v0, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_y:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, v1, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %b, <2 x half> %a, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, half 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1529,9 +1213,9 @@ define float @v_fdot2_inline_literal_b_y(<2 x half> %a, <2 x half> %b, float %c,
 define float @v_fdot2_inline_literal_b_xy(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GFX906-LABEL: v_fdot2_inline_literal_b_xy:
 ; GFX906:  ; %bb.0:
-; GFX906:    v_dot2_f32_f16 v0, v0, 2.0, v2 op_sel_hi:[1,0,1]
-; GFX906:    v_dot2_f32_f16 v1, v3, 2.0, v5 op_sel_hi:[1,0,1]
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v2, v0, 2.0, v2 op_sel_hi:[1,0,1]
+; GFX906:    v_dot2_f32_f16 v5, v3, 2.0, v5 op_sel_hi:[1,0,1]
+; GFX906:    v_add_f32_e32 v0, v2, v5
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_b_xy:
 ; GFX950:  ; %bb.0:
@@ -1545,22 +1229,10 @@ define float @v_fdot2_inline_literal_b_xy(<2 x half> %a, <2 x half> %b, float %c
 ; GFX10:    v_dot2c_f32_f16 v5, 0x40004000, v3
 ; GFX10:    v_add_f32_e32 v0, v2, v5
 ;
-; GFX11-LABEL: v_fdot2_inline_literal_b_xy:
-; GFX11:  ; %bb.0:
-; GFX11:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3
-; GFX11:    v_add_f32_e32 v0, v2, v5
-;
-; GFX1170-LABEL: v_fdot2_inline_literal_b_xy:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX1170:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_b_xy:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, 0x40004000, v2
-; GFX12:    v_dot2_f32_f16 v1, v3, 0x40004000, v5
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX11PLUS-LABEL: v_fdot2_inline_literal_b_xy:
+; GFX11PLUS:  ; %bb.0:
+; GFX11PLUS:    v_dual_dot2acc_f32_f16 v2, 0x40004000, v0 :: v_dual_dot2acc_f32_f16 v5, 0x40004000, v3
+; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> <half 2.0, half 2.0>, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1571,8 +1243,8 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h
 ; GFX906-LABEL: v_fdot2_inline_literal_c_dual:
 ; GFX906:  ; %bb.0:
 ; GFX906:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX906:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX906:    v_add_f32_e32 v0, v0, v1
+; GFX906:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX906:    v_add_f32_e32 v0, v0, v4
 ;
 ; GFX950-LABEL: v_fdot2_inline_literal_c_dual:
 ; GFX950:  ; %bb.0:
@@ -1594,17 +1266,11 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h
 ; GFX11:    v_dot2acc_f32_f16 v5, v0, v1
 ; GFX11:    v_add_f32_e32 v0, v5, v4
 ;
-; GFX1170-LABEL: v_fdot2_inline_literal_c_dual:
-; GFX1170:  ; %bb.0:
-; GFX1170:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX1170:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX1170:    v_add_f32_e32 v0, v0, v1
-;
-; GFX12-LABEL: v_fdot2_inline_literal_c_dual:
-; GFX12:  ; %bb.0:
-; GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
-; GFX12:    v_dot2_f32_f16 v1, v2, v3, v4
-; GFX12:    v_add_f32_e32 v0, v0, v1
+; GFX1170-GFX12-LABEL: v_fdot2_inline_literal_c_dual:
+; GFX1170-GFX12:  ; %bb.0:
+; GFX1170-GFX12:    v_dot2_f32_f16 v0, v0, v1, 2.0
+; GFX1170-GFX12:    v_dot2_f32_f16 v4, v2, v3, v4
+; GFX1170-GFX12:    v_add_f32_e32 v0, v0, v4
   %r0 = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 2.0, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -1614,9 +1280,9 @@ define float @v_fdot2_inline_literal_c_dual(<2 x half> %a, <2 x half> %b, <2 x h
 define float @v_fdot2_clamp_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
 ; GCN-LABEL: v_fdot2_clamp_dual:
 ; GCN:  ; %bb.0:
-; GCN:    v_dot2_f32_f16 v0, v0, v1, v2 clamp
-; GCN:    v_dot2_f32_f16 v1, v3, v4, v5 clamp
-; GCN:    v_add_f32_e32 v0, v0, v1
+; GCN:    v_dot2_f32_f16 v2, v0, v1, v2 clamp
+; GCN:    v_dot2_f32_f16 v5, v3, v4, v5 clamp
+; GCN:    v_add_f32_e32 v0, v2, v5
   %r0 = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
   %r1 = call float @llvm.amdgcn.fdot2(<2 x half> %d, <2 x half> %e, float %f, i1 true)
   %r = fadd float %r0, %r1