[llvm-branch-commits] [llvm] AMDGPU: Fix src2_modifiers for v_dot2_f32_f16/bf16 (PR #179224)
Petar Avramovic via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Mar 19 04:59:36 PDT 2026
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179224
From 0525c50558827eaccaeaa2dbeeae39f6672f8dab Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 19 Mar 2026 12:47:21 +0100
Subject: [PATCH] AMDGPU: Fix src2_modifiers for v_dot2_f32_f16/bf16
---
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 9 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 13 ++
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 2 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 28 ++-
llvm/lib/Target/AMDGPU/VOPInstructions.td | 6 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll | 163 ++++++++++++------
.../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 15 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 35 ++--
11 files changed, 192 insertions(+), 86 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 84c0348c1d6114..de8722841d3fe2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -51,6 +51,10 @@ def gi_vop3pmodsdot :
GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
GIComplexPatternEquiv<VOP3PModsDOT>;
+def gi_vop3pmodsf32 :
+ GIComplexOperandMatcher<s32, "selectVOP3PModsF32">,
+ GIComplexPatternEquiv<VOP3PModsF32>;
+
def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 749450aaf03446..613dcfeb646a2d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3691,6 +3691,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ SelectVOP3Mods(In, Src, SrcMods);
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ Mods |= cast<ConstantSDNode>(SrcMods)->getZExtValue();
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index ffeb6dfdb3f903..8b12d1d2a800f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -233,6 +233,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods,
bool IsDOT = false) const;
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index eb0b05a45d47de..80b30b98ab5906 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5269,6 +5269,19 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
return selectVOP3PRetHelper(Root, true);
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ Mods |= SISrcMods::OP_SEL_1;
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
MachineOperand &Root) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index cc121632e101d8..2c9ecc207d8bd1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -200,6 +200,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVOP3PModsDOT(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PModsF32(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d9b40beaf73182..229cac30d41650 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1706,6 +1706,8 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
+def VOP3PModsF32 : ComplexPattern<untyped, 2, "SelectVOP3PModsF32">;
+
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
def WMMAModsF32NegAbs : ComplexPattern<untyped, 2, "SelectWMMAModsF32NegAbs">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 992c375069e775..9bde8634e2ee2a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -86,6 +86,21 @@ multiclass VOP3PInst<string OpName, VOPProfile P,
} // end SubtargetPredicate = isGFX11Plus
}
+multiclass VOP3PInstDotWithDual<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag> {
+ def NAME : VOP3P_Pseudo<OpName, P,
+ getVOP3PModPat<P, node,
+ 1 /*HasExplicitClamp*/, 1/*IsDOT*/,
+ VOP3PModsDOT, VOP3PModsF32>.ret>;
+ let SubtargetPredicate = isGFX11Plus in {
+ if P.HasExtVOP3DPP then
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
+ let VOP3P = 1;
+ let PseudoInstr = OpName #"_dpp";
+ }
+ } // end SubtargetPredicate = isGFX11Plus
+}
+
// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
@@ -598,9 +613,11 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
} // End OtherPredicates = [HasDot2Insts]
let OtherPredicates = [HasDot10Insts] in
-defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
- VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
- AMDGPUfdot2, 1/*ExplicitClamp*/>;
+defm V_DOT2_F32_F16 :
+ VOP3PInstDotWithDual<"v_dot2_f32_f16",
+ VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR,
+ /*HasDPP*/ 1>,
+ AMDGPUfdot2>;
let OtherPredicates = [HasDot7Insts] in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
@@ -623,8 +640,9 @@ def DOT2_BF16_Profile
let SubtargetPredicate = HasDot12Insts in {
-defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
- int_amdgcn_fdot2_f32_bf16, 1>;
+defm V_DOT2_F32_BF16 :
+ VOP3PInstDotWithDual<"v_dot2_f32_bf16", DOT2_BF16_Profile,
+ int_amdgcn_fdot2_f32_bf16>;
} // End SubtargetPredicate = HasDot12Insts
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 9d56aa4ad5cb09..82545a472cf17b 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1380,10 +1380,12 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp,
bit IsDOT = 0,
- ComplexPattern SrcPat = !if(IsDOT, VOP3PModsDOT, VOP3PMods)> {
+ ComplexPattern SrcPat = !if(IsDOT, VOP3PModsDOT,
+ VOP3PMods),
+ ComplexPattern Src2Pat = SrcPat> {
dag src0_dag = (P.Src0VT (SrcPat P.Src0VT:$src0, i32:$src0_modifiers));
dag src1_dag = (P.Src1VT (SrcPat P.Src1VT:$src1, i32:$src1_modifiers));
- dag src2_dag = (P.Src2VT (SrcPat P.Src2VT:$src2, i32:$src2_modifiers));
+ dag src2_dag = (P.Src2VT (Src2Pat P.Src2VT:$src2, i32:$src2_modifiers));
dag clamp_dag = (i1 timm:$clamp);
list<dag> ret3 = [(set P.DstVT:$vdst,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index cd8ce7a408370c..0d93cfe52af54a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -1,41 +1,51 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefix=GFX906 %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefix=GFX10PLUS %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX10PLUS %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "s_wait" --filter-out "s_nop" --filter-out "s_delay_alu" --filter-out "s_setpc_b64"
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GCN,GFX906 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GCN,GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GCN,GFX11 %s
define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2
+;
+; GFX10-LABEL: v_fdot2:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2:
+; GFX11: ; %bb.0:
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 false)
ret float %r
}
define float @v_fdot2_clamp(<2 x half> %a, <2 x half> %b, float %c) {
-; GFX906-LABEL: v_fdot2_clamp:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
-; GFX906-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10PLUS-LABEL: v_fdot2_clamp:
-; GFX10PLUS: ; %bb.0:
-; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 clamp
-; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_fdot2_clamp:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 true)
ret float %r
}
define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_a:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+;
+; GFX10-LABEL: v_fdot2_neg_a:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_a:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
%neg.a = fneg <2 x half> %a
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
ret float %r
@@ -43,10 +53,20 @@ define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_b:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+;
+; GFX10-LABEL: v_fdot2_neg_b:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_b:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
%neg.b = fneg <2 x half> %b
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
ret float %r
@@ -54,10 +74,20 @@ define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_a_neg_b:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v1, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0]
+;
+; GFX10-LABEL: v_fdot2_neg_a_neg_b:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX10: v_dot2c_f32_f16 v0, v1, v1
+;
+; GFX11-LABEL: v_fdot2_neg_a_neg_b:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b32_e32 v0, v2
+; GFX11: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11: v_dot2acc_f32_f16 v0, v1, v1
%neg.a = fneg <2 x half> %b
%neg.b = fneg <2 x half> %b
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %neg.b, float %c, i1 false)
@@ -66,11 +96,20 @@ define float @v_fdot2_neg_a_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_c:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+;
+; GFX10-LABEL: v_fdot2_neg_c:
+; GFX10: ; %bb.0:
+; GFX10: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_neg_c:
+; GFX11: ; %bb.0:
+; GFX11: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
ret float %r
@@ -78,30 +117,56 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_inline_literal_a(<2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_inline_literal_a:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, 2.0, v0, v1 op_sel_hi:[0,1,1]
+;
+; GFX10-LABEL: v_fdot2_inline_literal_a:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0
+; GFX10: v_mov_b32_e32 v0, v1
+;
+; GFX11-LABEL: v_fdot2_inline_literal_a:
+; GFX11: ; %bb.0:
+; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0
+; GFX11: v_mov_b32_e32 v0, v1
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> <half 2.0, half 2.0>, <2 x half> %b, float %c, i1 false)
ret float %ret
}
define float @v_fdot2_inline_literal_b(<2 x half> %a, float %c) {
; GFX906-LABEL: v_fdot2_inline_literal_b:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, 2.0, v1 op_sel_hi:[1,0,1]
+;
+; GFX10-LABEL: v_fdot2_inline_literal_b:
+; GFX10: ; %bb.0:
+; GFX10: v_dot2c_f32_f16 v1, 0x40004000, v0
+; GFX10: v_mov_b32_e32 v0, v1
+;
+; GFX11-LABEL: v_fdot2_inline_literal_b:
+; GFX11: ; %bb.0:
+; GFX11: v_dot2acc_f32_f16 v1, 0x40004000, v0
+; GFX11: v_mov_b32_e32 v0, v1
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> <half 2.0, half 2.0>, float %c, i1 false)
ret float %ret
}
define float @v_fdot2_inline_literal_c(<2 x half> %a, <2 x half> %b) {
; GFX906-LABEL: v_fdot2_inline_literal_c:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, 1.0
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; GFX906: ; %bb.0:
+; GFX906: v_dot2_f32_f16 v0, v0, v1, 1.0
+;
+; GFX10-LABEL: v_fdot2_inline_literal_c:
+; GFX10: ; %bb.0:
+; GFX10: v_mov_b32_e32 v2, 1.0
+; GFX10: v_dot2c_f32_f16 v2, v0, v1
+; GFX10: v_mov_b32_e32 v0, v2
+;
+; GFX11-LABEL: v_fdot2_inline_literal_c:
+; GFX11: ; %bb.0:
+; GFX11: v_mov_b32_e32 v2, 1.0
+; GFX11: v_dot2acc_f32_f16 v2, v0, v1
+; GFX11: v_mov_b32_e32 v0, v2
%ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false)
ret float %ret
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index ce5de941172108..6cfa02501adc58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -165,7 +165,7 @@ define float @v_fdot2_f32_bf16_neg_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c)
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 false)
ret float %r
@@ -180,8 +180,7 @@ define float @v_fdot2_f32_bf16_abs_c(<2 x bfloat> %a, <2 x bfloat> %b, float %c)
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
%abs.c = call float @llvm.fabs.f32(float %c)
%r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 false)
ret float %r
@@ -344,7 +343,7 @@ define float @v_fdot2_f32_bf16_neg_b_clamp(<2 x bfloat> %a, <2 x bfloat> %b, flo
define float @v_fdot2_f32_bf16_neg_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
; GCN-LABEL: v_fdot2_f32_bf16_neg_c_clamp:
; GCN: ; %bb.0:
-; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] clamp
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] clamp
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %neg.c, i1 true)
ret float %r
@@ -353,8 +352,7 @@ define float @v_fdot2_f32_bf16_neg_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, flo
define float @v_fdot2_f32_bf16_abs_c_clamp(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
; GCN-LABEL: v_fdot2_f32_bf16_abs_c_clamp:
; GCN: ; %bb.0:
-; GCN: v_and_b32_e32 v2, 0x7fffffff, v2
-; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 clamp
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1] clamp
%abs.c = call float @llvm.fabs.f32(float %c)
%r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %abs.c, i1 true)
ret float %r
@@ -682,7 +680,7 @@ define float @v_fdot2_f32_bf16_neg_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_c_dual:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%neg.c = fneg float %c
@@ -702,8 +700,7 @@ define float @v_fdot2_f32_bf16_abs_c_dual(<2 x bfloat> %a, <2 x bfloat> %b, floa
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_abs_c_dual:
; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2
+; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX11PLUS: v_dot2_f32_bf16 v1, v3, v4, v5
; GFX11PLUS: v_add_f32_e32 v0, v0, v1
%abs.c = call float @llvm.fabs.f32(float %c)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 3312f29470066c..c0f1240e4ef058 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -290,7 +290,7 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_c:
; GFX906: ; %bb.0:
-; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
;
; GFX950-LABEL: v_fdot2_neg_c:
; GFX950: ; %bb.0:
@@ -312,11 +312,11 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX1170-LABEL: v_fdot2_neg_c:
; GFX1170: ; %bb.0:
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
;
; GFX12-LABEL: v_fdot2_neg_c:
; GFX12: ; %bb.0:
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
ret float %r
@@ -325,8 +325,7 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_abs_c:
; GFX906: ; %bb.0:
-; GFX906: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX906: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
;
; GFX950-LABEL: v_fdot2_abs_c:
; GFX950: ; %bb.0:
@@ -348,13 +347,11 @@ define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX1170-LABEL: v_fdot2_abs_c:
; GFX1170: ; %bb.0:
-; GFX1170: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
;
; GFX12-LABEL: v_fdot2_abs_c:
; GFX12: ; %bb.0:
-; GFX12: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
%abs.c = call float @llvm.fabs.f32(float %c)
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false)
ret float %r
@@ -637,7 +634,7 @@ define float @v_fdot2_neg_b_clamp(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_neg_c_clamp(<2 x half> %a, <2 x half> %b, float %c) {
; GCN-LABEL: v_fdot2_neg_c_clamp:
; GCN: ; %bb.0:
-; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1] clamp
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] clamp
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 true)
ret float %r
@@ -646,8 +643,7 @@ define float @v_fdot2_neg_c_clamp(<2 x half> %a, <2 x half> %b, float %c) {
define float @v_fdot2_abs_c_clamp(<2 x half> %a, <2 x half> %b, float %c) {
; GCN-LABEL: v_fdot2_abs_c_clamp:
; GCN: ; %bb.0:
-; GCN: v_and_b32_e32 v2, 0x7fffffff, v2
-; GCN: v_dot2_f32_f16 v0, v0, v1, v2 clamp
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1] clamp
%abs.c = call float @llvm.fabs.f32(float %c)
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 true)
ret float %r
@@ -1191,7 +1187,7 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
; GFX906-LABEL: v_fdot2_neg_c_dual:
; GFX906: ; %bb.0:
-; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
; GFX906: v_add_f32_e32 v0, v0, v1
;
@@ -1217,13 +1213,13 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
;
; GFX1170-LABEL: v_fdot2_neg_c_dual:
; GFX1170: ; %bb.0:
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5
; GFX1170: v_add_f32_e32 v0, v0, v1
;
; GFX12-LABEL: v_fdot2_neg_c_dual:
; GFX12: ; %bb.0:
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
; GFX12: v_add_f32_e32 v0, v0, v1
%neg.c = fneg float %c
@@ -1236,8 +1232,7 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x half> %d, <2 x half> %e, float %f) {
; GFX906-LABEL: v_fdot2_abs_c_dual:
; GFX906: ; %bb.0:
-; GFX906: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX906: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX906: v_dot2_f32_f16 v1, v3, v4, v5
; GFX906: v_add_f32_e32 v0, v0, v1
;
@@ -1263,15 +1258,13 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
;
; GFX1170-LABEL: v_fdot2_abs_c_dual:
; GFX1170: ; %bb.0:
-; GFX1170: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5
; GFX1170: v_add_f32_e32 v0, v0, v1
;
; GFX12-LABEL: v_fdot2_abs_c_dual:
; GFX12: ; %bb.0:
-; GFX12: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX12: v_dot2_f32_f16 v1, v3, v4, v5
; GFX12: v_add_f32_e32 v0, v0, v1
%abs.c = call float @llvm.fabs.f32(float %c)
More information about the llvm-branch-commits mailing list