[llvm] Revert "AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3" (PR #190159)

Thu Apr 2 05:20:00 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Petar Avramovic (petar-avramovic)

<details>
<summary>Changes</summary>

This reverts commit 47f6a19181b426baa03182ab6a7a41e16b35301d.
Breaks MIOpen; we don't have a proper fix yet.

---

Patch is 75.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190159.diff


7 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp (+1-32) 
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (-8) 
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (-6) 
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+5-8) 
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll (+69-55) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll (+528-246) 


``````````diff

diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 4659dcd1a78cb..b17cabf37d53f 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -34,36 +34,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "gcn-vopd-utils"
 
-// Check if MI is a VOP3P instruction with operands that satisfy the constraints
-// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2
-// are registers (src0 can be register or literal), and src2 is same as dst.
-static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
-  unsigned Opc = MI.getOpcode();
-  if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
-    return false;
-  // src0 can be register or literal
-  int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
-  if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
-    return false;
-  int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
-  if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
-    return false;
-  int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
-  if (!MI.getOperand(Src1Idx).isReg())
-    return false;
-  int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
-  if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
-    return false;
-  int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
-  if (!MI.getOperand(Src2Idx).isReg())
-    return false;
-  int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
-  if (MI.getOperand(ClampIdx).getImm() != 0)
-    return false;
-  int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
-  return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
-}
-
 bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
                                    const MachineInstr &MIX,
                                    const MachineInstr &MIY, bool IsVOPD3) {
@@ -74,8 +44,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
 
   if (IsVOPD3 && !ST.hasVOPD3())
     return false;
-  if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
-                   (TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
+  if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
     return false;
   if (TII.isDPP(MIX) || TII.isDPP(MIY))
     return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0bc509c4a6b29..2e631d2f4a55e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7012,14 +7012,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.getOperand(0).setReg(OriginalExec);
     return BB;
   }
-  case AMDGPU::V_DOT2_F32_F16:
-  case AMDGPU::V_DOT2_F32_BF16: {
-    // Hint RA to assign dst and src2 the same physical register.
-    // For targets without VOP2, but with VOPD, variant of the instruction this
-    // is one of the conditions to attempt converting VOP3P to VOPD.
-    MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg());
-    return BB;
-  }
   default:
     if (TII->isImage(MI) || TII->isMUBUF(MI)) {
       if (!MI.mayStore())
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2b617e54bdfe9..749cead8a20fb 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -928,12 +928,6 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) {
     NumVOPD3Mods = 2;
     if (IsVOP3)
       SrcOperandsNum = 3;
-  } else if (Opcode == AMDGPU::V_DOT2_F32_F16 ||
-             Opcode == AMDGPU::V_DOT2_F32_BF16) {
-    // VOP3P opcodes that have VOPD but don't have VOP2 version. Using VOPD3
-    // path in getIndexOfSrcInMCOperands to get correct src operand indexes,
-    // but generating VOPD, not VOPD3.
-    NumVOPD3Mods = SrcOperandsNum;
   } else if (isSISrcFPOperand(OpDesc,
                               getNamedOperandIdx(Opcode, OpName::src0))) {
     // All FP VOPD instructions have Neg modifiers for all operands except
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 554273675077b..d8665739e1501 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -87,13 +87,11 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
 }
 
 multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
-                                SDPatternOperator node = null_frag,
-                                bits<6> VOPDOp, string VOPDName> {
+                                SDPatternOperator node = null_frag> {
   def NAME : VOP3P_Pseudo<OpName, P,
                           getVOP3PModPat<P, node,
                                          1 /*HasExplicitClamp*/, 1/*IsDOT*/,
-                                         VOP3PModsDOT, VOP3PModsF32>.ret>,
-             VOPD_Component<VOPDOp, VOPDName>;
+                                         VOP3PModsDOT, VOP3PModsF32>.ret>;
   let SubtargetPredicate = isGFX11Plus in {
   if P.HasExtVOP3DPP then
     def _dpp : VOP3_DPP_Pseudo<OpName, P> {
@@ -614,12 +612,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
   VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
 } // End OtherPredicates = [HasDot2Insts]
 
-let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 1 in
+let OtherPredicates = [HasDot10Insts] in
 defm V_DOT2_F32_F16 :
   VOP3PInstDotWithDual<"v_dot2_f32_f16",
                        VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
                                      /*HasDPP*/ 1>,
-                       AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
+                       AMDGPUfdot2>;
 
 let OtherPredicates = [HasDot7Insts] in {
 defm V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8",
@@ -642,10 +640,9 @@ def DOT2_BF16_Profile
 
 let SubtargetPredicate = HasDot12Insts  in {
 
-let isCommutable = 1, usesCustomInserter = 1 in
 defm V_DOT2_F32_BF16 :
   VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
-                       int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
+                       int_amdgcn_fdot2_f32_bf16>;
 
 } // End SubtargetPredicate = HasDot12Insts
 
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 40b0476a84d25..82545a472cf17 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -34,8 +34,8 @@ class VOP <string opName> {
   string OpName = opName;
 }
 
-// First 13 insts from VOPDY are also VOPDX.
-defvar VOPDX_Max_Index = 13;
+// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
+defvar VOPDX_Max_Index = 12;
 defvar VOPD3X_Max_Index = 36;
 
 class VOPD_Component<bits<6> OpIn, string vOPDName> {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 9e5a8c672deb3..1dff54ac35427 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -225,7 +225,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) {
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
   ret float %ret
 }
@@ -373,7 +373,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c)
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp
   %ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true)
   ret float %ret
 }
@@ -395,8 +395,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -406,15 +407,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
 define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v2, v5
+; GFX950:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %neg.a = fneg <2 x bfloat> %a
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -435,8 +436,9 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %a_lo = extractelement <2 x bfloat> %a, i32 0
   %neg.a_lo = fneg bfloat %a_lo
   %neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
@@ -460,8 +462,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %a_hi = extractelement <2 x bfloat> %a, i32 1
   %neg.a_hi = fneg bfloat %a_hi
   %neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
@@ -474,15 +477,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v2, v5
+; GFX950:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %neg.b = fneg <2 x bfloat> %b
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -503,8 +506,9 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %b_lo = extractelement <2 x bfloat> %b, i32 0
   %neg.b_lo = fneg bfloat %b_lo
   %neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
@@ -528,8 +532,9 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %b_hi = extractelement <2 x bfloat> %b, i32 1
   %neg.b_hi = fneg bfloat %b_hi
   %neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
@@ -542,15 +547,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
 define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v2, v5
+; GFX950:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %neg.c = fneg float %c
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -561,15 +566,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
 define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
 ; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX950:  ; %bb.0:
-; GFX950:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX950:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
 ; GFX950:    v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950:    v_add_f32_e32 v0, v2, v5
+; GFX950:    v_add_f32_e32 v0, v0, v5
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
-; GFX11PLUS:    v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %abs.c = call float @llvm.fabs.f32(float %c)
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -589,8 +594,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v0.l, v0.h
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -610,8 +616,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v0.h, v0.l
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -631,8 +638,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v1.l, v1.h
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -652,8 +660,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
 ; GFX11PLUS:  ; %bb.0:
 ; GFX11PLUS:    v_mov_b16_e32 v1.h, v1.l
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -673,8 +682,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS:    v_dot2_f32_bf16 v1, 0x40004000, v4, v5
+; GFX11PLUS:    v_add_f32_e32 v0, v0, v1
   %r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
   %r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
   %r = fadd float %r0, %r1
@@ -690,8 +700,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x bfloat> %a, <2 x bfloat>
 ;
 ; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy:
 ; GFX11PLUS:  ; %bb.0:
-; GFX11PLUS:    v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
-; GFX11PLUS:    v_add_f32_e32 v0, v2, v5
+; GFX11PLUS:    v_dot2_f32_bf16 ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/190159