[llvm] Revert "AMDGPU: Codegen for v_dual_dot2acc_f32_f16/bf16 from VOP3" (PR #190159)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 05:20:00 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Petar Avramovic (petar-avramovic)
<details>
<summary>Changes</summary>
This reverts commit 47f6a19181b426baa03182ab6a7a41e16b35301d.
Breaks MIOpen; we don't have a proper fix yet.
---
Patch is 75.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/190159.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp (+1-32)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (-8)
- (modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (-6)
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+5-8)
- (modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll (+69-55)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll (+528-246)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 4659dcd1a78cb..b17cabf37d53f 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -34,36 +34,6 @@ using namespace llvm;
#define DEBUG_TYPE "gcn-vopd-utils"
-// Check if MI is a VOP3P instruction with operands that satisfy the constraints
-// for mapping it to a VOP2/VOPD opcode: no modifiers, no clamp, src1 and src2
-// are registers (src0 can be register or literal), and src2 is same as dst.
-static bool canMapVOP3PToVOPD(const MachineInstr &MI) {
- unsigned Opc = MI.getOpcode();
- if (Opc != AMDGPU::V_DOT2_F32_F16 && Opc != AMDGPU::V_DOT2_F32_BF16)
- return false;
- // src0 can be register or literal
- int16_t Src0ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
- if (MI.getOperand(Src0ModsIdx).getImm() != SISrcMods::OP_SEL_1)
- return false;
- int16_t Src1ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
- if (MI.getOperand(Src1ModsIdx).getImm() != SISrcMods::OP_SEL_1)
- return false;
- int16_t Src1Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
- if (!MI.getOperand(Src1Idx).isReg())
- return false;
- int16_t Src2ModsIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers);
- if (MI.getOperand(Src2ModsIdx).getImm() != SISrcMods::OP_SEL_1)
- return false;
- int16_t Src2Idx = getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
- if (!MI.getOperand(Src2Idx).isReg())
- return false;
- int16_t ClampIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::clamp);
- if (MI.getOperand(ClampIdx).getImm() != 0)
- return false;
- int16_t VdstIdx = getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
- return MI.getOperand(VdstIdx).getReg() == MI.getOperand(Src2Idx).getReg();
-}
-
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
const MachineInstr &MIX,
const MachineInstr &MIY, bool IsVOPD3) {
@@ -74,8 +44,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
if (IsVOPD3 && !ST.hasVOPD3())
return false;
- if (!IsVOPD3 && ((TII.isVOP3(MIX) && !canMapVOP3PToVOPD(MIX)) ||
- (TII.isVOP3(MIY) && !canMapVOP3PToVOPD(MIY))))
+ if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
return false;
if (TII.isDPP(MIX) || TII.isDPP(MIY))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0bc509c4a6b29..2e631d2f4a55e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -7012,14 +7012,6 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOperand(0).setReg(OriginalExec);
return BB;
}
- case AMDGPU::V_DOT2_F32_F16:
- case AMDGPU::V_DOT2_F32_BF16: {
- // Hint RA to assign dst and src2 the same physical register.
- // For targets without VOP2, but with VOPD, variant of the instruction this
- // is one of the conditions to attempt converting VOP3P to VOPD.
- MRI.setSimpleHint(MI.getOperand(0).getReg(), MI.getOperand(6).getReg());
- return BB;
- }
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2b617e54bdfe9..749cead8a20fb 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -928,12 +928,6 @@ ComponentProps::ComponentProps(const MCInstrDesc &OpDesc, bool VOP3Layout) {
NumVOPD3Mods = 2;
if (IsVOP3)
SrcOperandsNum = 3;
- } else if (Opcode == AMDGPU::V_DOT2_F32_F16 ||
- Opcode == AMDGPU::V_DOT2_F32_BF16) {
- // VOP3P opcodes that have VOPD but don't have VOP2 version. Using VOPD3
- // path in getIndexOfSrcInMCOperands to get correct src operand indexes,
- // but generating VOPD, not VOPD3.
- NumVOPD3Mods = SrcOperandsNum;
} else if (isSISrcFPOperand(OpDesc,
getNamedOperandIdx(Opcode, OpName::src0))) {
// All FP VOPD instructions have Neg modifiers for all operands except
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 554273675077b..d8665739e1501 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -87,13 +87,11 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
}
multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
- SDPatternOperator node = null_frag,
- bits<6> VOPDOp, string VOPDName> {
+ SDPatternOperator node = null_frag> {
def NAME : VOP3P_Pseudo<OpName, P,
getVOP3PModPat<P, node,
1 /*HasExplicitClamp*/, 1/*IsDOT*/,
- VOP3PModsDOT, VOP3PModsF32>.ret>,
- VOPD_Component<VOPDOp, VOPDName>;
+ VOP3PModsDOT, VOP3PModsF32>.ret>;
let SubtargetPredicate = isGFX11Plus in {
if P.HasExtVOP3DPP then
def _dpp : VOP3_DPP_Pseudo<OpName, P> {
@@ -614,12 +612,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
} // End OtherPredicates = [HasDot2Insts]
-let OtherPredicates = [HasDot10Insts], isCommutable = 1, usesCustomInserter = 1 in
+let OtherPredicates = [HasDot10Insts] in
defm V_DOT2_F32_F16 :
VOP3PInstDotWithDual<"v_dot2_f32_f16",
VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
/*HasDPP*/ 1>,
- AMDGPUfdot2, 0xC, "v_dot2acc_f32_f16">;
+ AMDGPUfdot2>;
let OtherPredicates = [HasDot7Insts] in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
@@ -642,10 +640,9 @@ def DOT2_BF16_Profile
let SubtargetPredicate = HasDot12Insts in {
-let isCommutable = 1, usesCustomInserter = 1 in
defm V_DOT2_F32_BF16 :
VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
- int_amdgcn_fdot2_f32_bf16, 0xD, "v_dot2acc_f32_bf16">;
+ int_amdgcn_fdot2_f32_bf16>;
} // End SubtargetPredicate = HasDot12Insts
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 40b0476a84d25..82545a472cf17 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -34,8 +34,8 @@ class VOP <string opName> {
string OpName = opName;
}
-// First 13 insts from VOPDY are also VOPDX.
-defvar VOPDX_Max_Index = 13;
+// First 13 insts from VOPDY are also VOPDX. DOT2ACC_F32_BF16 is omitted
+defvar VOPDX_Max_Index = 12;
defvar VOPD3X_Max_Index = 36;
class VOPD_Component<bits<6> OpIn, string vOPDName> {
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 9e5a8c672deb3..1dff54ac35427 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -225,7 +225,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b(<2 x bfloat> %a, float %c) {
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1
%ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 false)
ret float %ret
}
@@ -373,7 +373,7 @@ define float @v_fdot2_f32_bf16_inline_literal_b_clamp(<2 x bfloat> %a, float %c)
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_b_clamp:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v0, 0x40004000, v0, v1 clamp
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, 0x40004000, v1 clamp
%ret = tail call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> <bfloat 2.0, bfloat 2.0>, float %c, i1 true)
ret float %ret
}
@@ -395,8 +395,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_dual:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, v3, v4
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
%r = fadd float %r0, %r1
@@ -406,15 +407,15 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
; GFX950: ; %bb.0:
-; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%neg.a = fneg <2 x bfloat> %a
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -435,8 +436,9 @@ define float @v_fdot2_f32_bf16_neg_a_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_lo_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v0.l, 0x8000, v0.l
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%a_lo = extractelement <2 x bfloat> %a, i32 0
%neg.a_lo = fneg bfloat %a_lo
%neg_lo.a = insertelement <2 x bfloat> %a, bfloat %neg.a_lo, i32 0
@@ -460,8 +462,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_hi_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v0.h, 0x8000, v0.h
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%a_hi = extractelement <2 x bfloat> %a, i32 1
%neg.a_hi = fneg bfloat %a_hi
%neg_hi.a = insertelement <2 x bfloat> %a, bfloat %neg.a_hi, i32 1
@@ -474,15 +477,15 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
; GFX950: ; %bb.0:
-; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%neg.b = fneg <2 x bfloat> %b
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -503,8 +506,9 @@ define float @v_fdot2_f32_bf16_neg_b_lo_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_lo_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v1.l, 0x8000, v1.l
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%b_lo = extractelement <2 x bfloat> %b, i32 0
%neg.b_lo = fneg bfloat %b_lo
%neg_lo.b = insertelement <2 x bfloat> %b, bfloat %neg.b_lo, i32 0
@@ -528,8 +532,9 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_hi_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_xor_b16 v1.h, 0x8000, v1.h
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%b_hi = extractelement <2 x bfloat> %b, i32 1
%neg.b_hi = fneg bfloat %b_hi
%neg_hi.b = insertelement <2 x bfloat> %b, bfloat %neg.b_hi, i32 1
@@ -542,15 +547,15 @@ define float @v_fdot2_f32_bf16_neg_b_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_c_dual:
; GFX950: ; %bb.0:
-; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_lo:[0,0,1]
-; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%neg.c = fneg float %c
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -561,15 +566,15 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_abs_c_dual:
; GFX950: ; %bb.0:
-; GFX950: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v2, v0, v1, v2 neg_hi:[0,0,1]
-; GFX11PLUS: v_dot2_f32_bf16 v5, v3, v4, v5
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%abs.c = call float @llvm.fabs.f32(float %c)
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -589,8 +594,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_a_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v0.l, v0.h
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -610,8 +616,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_a_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_a_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v0.h, v0.l
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %a, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %shuf, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -631,8 +638,9 @@ define float @v_fdot2_f32_bf16_opsel_lo_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_lo_b_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v1.l, v1.h
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 1, i32 1>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -652,8 +660,9 @@ define float @v_fdot2_f32_bf16_opsel_hi_b_dual(<2 x bfloat> %a, <2 x bfloat> %b,
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_opsel_hi_b_dual:
; GFX11PLUS: ; %bb.0:
; GFX11PLUS: v_mov_b16_e32 v1.h, v1.l
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v5, v3, v4 :: v_dual_dot2acc_f32_bf16 v2, v0, v1
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%shuf = shufflevector <2 x bfloat> %b, <2 x bfloat> poison, <2 x i32> <i32 0, i32 0>
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %shuf, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %d, <2 x bfloat> %e, float %f, i1 false)
@@ -673,8 +682,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_y(<2 x bfloat> %a, <2 x bfloat>
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_y:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, v0, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v1, 0x40004000, v4, v5
+; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%r0 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 false)
%r1 = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> <bfloat 2.0, bfloat 2.0>, <2 x bfloat> %e, float %f, i1 false)
%r = fadd float %r0, %r1
@@ -690,8 +700,9 @@ define float @v_fdot2_f32_bf16_inline_literal_a_xy(<2 x bfloat> %a, <2 x bfloat>
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_inline_literal_a_xy:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dual_dot2acc_f32_bf16 v2, 0x40004000, v1 :: v_dual_dot2acc_f32_bf16 v5, 0x40004000, v4
-; GFX11PLUS: v_add_f32_e32 v0, v2, v5
+; GFX11PLUS: v_dot2_f32_bf16 ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/190159
More information about the llvm-commits
mailing list