[llvm-branch-commits] [llvm] AMDGPU: Improve codegen for VOP2 v_dot2c_f32_f16/bf16 (PR #179225)
Petar Avramovic via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Feb 20 07:59:36 PST 2026
https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/179225
>From 4d02482bdf260b028a45a8c9f56659404adb12c4 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Thu, 12 Feb 2026 17:56:30 +0100
Subject: [PATCH] AMDGPU: Improve codegen for VOP2 v_dot2c_f32_f16/bf16
Select VOP2 version when there are no src_modifiers, otherwise VOP3.
---
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 8 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 22 ++
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 48 +++-
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 5 +
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +
llvm/lib/Target/AMDGPU/VOP2Instructions.td | 26 +-
.../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 34 +--
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 226 +++++-------------
9 files changed, 157 insertions(+), 216 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 8aba9752e3185..cd75545f76f72 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -51,10 +51,18 @@ def gi_vop3pmodsdot :
GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
GIComplexPatternEquiv<VOP3PModsDOT>;
+def gi_vop3pnomodsdot :
+ GIComplexOperandMatcher<s32, "selectVOP3PNoModsDOT">,
+ GIComplexPatternEquiv<VOP3PNoModsDOT>;
+
def gi_vop3pmodsf32 :
GIComplexOperandMatcher<s32, "selectVOP3PModsF32">,
GIComplexPatternEquiv<VOP3PModsF32>;
+def gi_vop3pnomodsf32 :
+ GIComplexOperandMatcher<s32, "selectVOP3PNoModsF32">,
+ GIComplexPatternEquiv<VOP3PNoModsF32>;
+
def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index b31134882cffe..c8d7212ac5b56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3658,6 +3658,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const {
+ SDValue SrcTmp, SrcModsTmp;
+ SelectVOP3PMods(In, SrcTmp, SrcModsTmp, true);
+ if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
+ Src = SrcTmp;
+ return true;
+ }
+
+ return false;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = SISrcMods::OP_SEL_1;
@@ -3670,6 +3681,17 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsF32(SDValue In, SDValue &Src,
return true;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const {
+ SDValue SrcTmp, SrcModsTmp;
+ SelectVOP3PModsF32(In, SrcTmp, SrcModsTmp);
+ if (cast<ConstantSDNode>(SrcModsTmp)->getZExtValue() == SISrcMods::OP_SEL_1) {
+ Src = SrcTmp;
+ return true;
+ }
+
+ return false;
+}
+
bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 43550c7ab53f8..5c13072005a3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -233,7 +233,9 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods,
bool IsDOT = false) const;
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PNoModsDOT(SDValue In, SDValue &Src) const;
bool SelectVOP3PModsF32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3PNoModsF32(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d1d43841bca39..ac5ce562e4723 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4519,6 +4519,17 @@ std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
return std::pair(Src, Mods);
}
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsF32Impl(Register Src) const {
+ unsigned Mods = SISrcMods::OP_SEL_1;
+ if (Subtarget->isGFX11Plus()) {
+ unsigned ModsImpl;
+ std::tie(Src, ModsImpl) = selectVOP3ModsImpl(Src);
+ Mods |= ModsImpl;
+ }
+ return std::pair(Src, Mods);
+}
+
Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
bool ForceVGPR) const {
@@ -5225,22 +5236,43 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
return selectVOP3PRetHelper(Root, true);
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PNoModsDOT(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true /*IsDOT*/);
+ if (Mods != SISrcMods::OP_SEL_1)
+ return {};
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PModsF32(MachineOperand &Root) const {
- Register Src = Root.getReg();
- unsigned Mods = SISrcMods::OP_SEL_1;
- if (Subtarget->isGFX11Plus()) {
- unsigned ModsImpl;
- std::tie(Src, ModsImpl) = selectVOP3ModsImpl(Root.getReg());
- Mods |= ModsImpl;
- }
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+ Register Reg;
+ unsigned Mods;
+ std::tie(Reg, Mods) = selectVOP3PModsF32Impl(Root.getReg());
+ Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PNoModsF32(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3PModsF32Impl(Root.getReg());
+ if (Mods != SISrcMods::OP_SEL_1)
+ return {};
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }}};
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
MachineOperand &Root) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index a67c5314eb6a5..6c71715975a11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -162,6 +162,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool IsCanonicalizing = true,
bool AllowAbs = true,
bool OpSel = false) const;
+ std::pair<Register, unsigned> selectVOP3PModsF32Impl(Register Src) const;
Register copyToVGPRIfSrcFolded(Register Src, unsigned Mods,
MachineOperand Root, MachineInstr *InsertPt,
@@ -200,7 +201,11 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVOP3PModsDOT(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3PNoModsDOT(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectVOP3PModsF32(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3PNoModsF32(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 8198da4cc7f92..3b0c43a9fb758 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1701,7 +1701,9 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
+def VOP3PNoModsDOT : ComplexPattern<untyped, 1, "SelectVOP3PNoModsDOT">;
def VOP3PModsF32 : ComplexPattern<untyped, 2, "SelectVOP3PModsF32">;
+def VOP3PNoModsF32 : ComplexPattern<untyped, 1, "SelectVOP3PNoModsF32">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 2ccf39224a278..9c7bf355a1ec6 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1300,19 +1300,21 @@ let Constraints = "$vdst = $src2",
defm V_DOT2C_F32_BF16 : VOP2Inst_VOPD<"v_dot2c_f32_bf16", VOP_DOT_ACC_F32_V2BF16, 0xd, "v_dot2acc_f32_bf16">;
}
+class Dot2F32NoModsPat <SDPatternOperator node, Instruction inst, ValueType ty>
+ : GCNPat <
+ (f32 (node (ty (VOP3PNoModsDOT ty:$src0)), (ty (VOP3PNoModsDOT ty:$src1)),
+ (f32 (VOP3PNoModsF32 f32:$src2)), (i1 DSTCLAMP.NONE))),
+ (f32 (inst $src0, $src1, $src2))
+>;
+
let AddedComplexity = 30 in {
- def : GCNPat<
- (f32 (AMDGPUfdot2 v2f16:$src0, v2f16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))),
- (f32 (V_DOT2C_F32_F16_e32 $src0, $src1, $src2))
- > {
- let SubtargetPredicate = HasDot5Insts;
- }
- def : GCNPat<
- (f32 (int_amdgcn_fdot2_f32_bf16 v2bf16:$src0, v2bf16:$src1, f32:$src2, (i1 DSTCLAMP.NONE))),
- (f32 (V_DOT2C_F32_BF16_e32 $src0, $src1, $src2))
- > {
- let SubtargetPredicate = HasDot13Insts;
- }
+ let SubtargetPredicate = HasDot5Insts in
+ def : Dot2F32NoModsPat<AMDGPUfdot2, V_DOT2C_F32_F16_e32, v2f16>;
+
+ let SubtargetPredicate = HasDot13Insts in
+ def : Dot2F32NoModsPat<int_amdgcn_fdot2_f32_bf16, V_DOT2C_F32_BF16_e32,
+ v2bf16>;
+
def : GCNPat<
(i32 (int_amdgcn_sdot4 i32:$src0, i32:$src1, i32:$src2, (i1 DSTCLAMP.NONE))),
(i32 (V_DOT4C_I32_I8_e32 $src0, $src1, $src2))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 600000144887d..a16971843c247 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -19,15 +19,9 @@ define float @v_fdot2_f32_bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
}
define float @v_fdot2_f32_bf16_neg_a(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
-; GFX950-LABEL: v_fdot2_f32_bf16_neg_a:
-; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
-; GFX950: v_mov_b32_e32 v0, v2
-;
-; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a:
-; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-LABEL: v_fdot2_f32_bf16_neg_a:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
%neg.a = fneg <2 x bfloat> %a
%r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %neg.a, <2 x bfloat> %b, float %c, i1 false)
ret float %r
@@ -88,15 +82,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi(<2 x bfloat> %a, <2 x bfloat> %b, float
}
define float @v_fdot2_f32_bf16_neg_b(<2 x bfloat> %a, <2 x bfloat> %b, float %c) {
-; GFX950-LABEL: v_fdot2_f32_bf16_neg_b:
-; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
-; GFX950: v_mov_b32_e32 v0, v2
-;
-; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b:
-; GFX11PLUS: ; %bb.0:
-; GFX11PLUS: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-LABEL: v_fdot2_f32_bf16_neg_b:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
%neg.b = fneg <2 x bfloat> %b
%r = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %neg.b, float %c, i1 false)
ret float %r
@@ -507,10 +495,9 @@ define float @v_fdot2_f32_bf16_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c,
define float @v_fdot2_f32_bf16_neg_a_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_a_dual:
; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_a_dual:
; GFX11PLUS: ; %bb.0:
@@ -595,10 +582,9 @@ define float @v_fdot2_f32_bf16_neg_a_hi_dual(<2 x bfloat> %a, <2 x bfloat> %b, f
define float @v_fdot2_f32_bf16_neg_b_dual(<2 x bfloat> %a, <2 x bfloat> %b, float %c, <2 x bfloat> %d, <2 x bfloat> %e, float %f) {
; GFX950-LABEL: v_fdot2_f32_bf16_neg_b_dual:
; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX950: v_dot2c_f32_bf16_e32 v2, v0, v1
+; GFX950: v_dot2_f32_bf16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX950: v_dot2c_f32_bf16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX11PLUS-LABEL: v_fdot2_f32_bf16_neg_b_dual:
; GFX11PLUS: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 237f3cfe3031d..2e6a03fec38e9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -40,35 +40,9 @@ define float @v_fdot2(<2 x half> %a, <2 x half> %b, float %c) {
}
define float @v_fdot2_neg_a(<2 x half> %a, <2 x half> %b, float %c) {
-; GFX906-LABEL: v_fdot2_neg_a:
-; GFX906: ; %bb.0:
-; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-;
-; GFX950-LABEL: v_fdot2_neg_a:
-; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
-; GFX950: v_mov_b32_e32 v0, v2
-;
-; GFX10-LABEL: v_fdot2_neg_a:
-; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
-;
-; GFX11-LABEL: v_fdot2_neg_a:
-; GFX11: ; %bb.0:
-; GFX11: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_mov_b32_e32 v0, v2
-;
-; GFX1170-LABEL: v_fdot2_neg_a:
-; GFX1170: ; %bb.0:
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
-;
-; GFX12-LABEL: v_fdot2_neg_a:
-; GFX12: ; %bb.0:
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GCN-LABEL: v_fdot2_neg_a:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
%neg.a = fneg <2 x half> %a
%r = call float @llvm.amdgcn.fdot2(<2 x half> %neg.a, <2 x half> %b, float %c, i1 false)
ret float %r
@@ -89,10 +63,7 @@ define float @v_fdot2_neg_a_lo(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_neg_a_lo:
; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v3, 0x8000, v0
-; GFX10: v_bfi_b32 v0, 0xffff, v3, v0
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
;
; GFX11-LABEL: v_fdot2_neg_a_lo:
; GFX11: ; %bb.0:
@@ -133,11 +104,7 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_neg_a_hi:
; GFX10: ; %bb.0:
-; GFX10: v_mov_b32_e32 v3, 0x8000
-; GFX10: v_xor_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10: v_perm_b32 v0, v3, v0, 0x5040100
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
;
; GFX11-LABEL: v_fdot2_neg_a_hi:
; GFX11: ; %bb.0:
@@ -164,35 +131,9 @@ define float @v_fdot2_neg_a_hi(<2 x half> %a, <2 x half> %b, float %c) {
}
define float @v_fdot2_neg_b(<2 x half> %a, <2 x half> %b, float %c) {
-; GFX906-LABEL: v_fdot2_neg_b:
-; GFX906: ; %bb.0:
-; GFX906: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-;
-; GFX950-LABEL: v_fdot2_neg_b:
-; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
-; GFX950: v_mov_b32_e32 v0, v2
-;
-; GFX10-LABEL: v_fdot2_neg_b:
-; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
-;
-; GFX11-LABEL: v_fdot2_neg_b:
-; GFX11: ; %bb.0:
-; GFX11: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_mov_b32_e32 v0, v2
-;
-; GFX1170-LABEL: v_fdot2_neg_b:
-; GFX1170: ; %bb.0:
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
-;
-; GFX12-LABEL: v_fdot2_neg_b:
-; GFX12: ; %bb.0:
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GCN-LABEL: v_fdot2_neg_b:
+; GCN: ; %bb.0:
+; GCN: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
%neg.b = fneg <2 x half> %b
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %neg.b, float %c, i1 false)
ret float %r
@@ -213,10 +154,7 @@ define float @v_fdot2_neg_b_lo(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_neg_b_lo:
; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v3, 0x8000, v1
-; GFX10: v_bfi_b32 v1, 0xffff, v3, v1
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
;
; GFX11-LABEL: v_fdot2_neg_b_lo:
; GFX11: ; %bb.0:
@@ -257,11 +195,7 @@ define float @v_fdot2_neg_b_hi(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_neg_b_hi:
; GFX10: ; %bb.0:
-; GFX10: v_mov_b32_e32 v3, 0x8000
-; GFX10: v_xor_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10: v_perm_b32 v1, v3, v1, 0x5040100
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
;
; GFX11-LABEL: v_fdot2_neg_b_hi:
; GFX11: ; %bb.0:
@@ -305,19 +239,9 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX10: v_dot2c_f32_f16 v2, v0, v1
; GFX10: v_mov_b32_e32 v0, v2
;
-; GFX11-LABEL: v_fdot2_neg_c:
-; GFX11: ; %bb.0:
-; GFX11: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_mov_b32_e32 v0, v2
-;
-; GFX1170-LABEL: v_fdot2_neg_c:
-; GFX1170: ; %bb.0:
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
-;
-; GFX12-LABEL: v_fdot2_neg_c:
-; GFX12: ; %bb.0:
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
+; GFX11PLUS-LABEL: v_fdot2_neg_c:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
ret float %r
@@ -341,20 +265,9 @@ define float @v_fdot2_abs_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX10: v_dot2c_f32_f16 v2, v0, v1
; GFX10: v_mov_b32_e32 v0, v2
;
-; GFX11-LABEL: v_fdot2_abs_c:
-; GFX11: ; %bb.0:
-; GFX11: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_mov_b32_e32 v0, v2
-;
-; GFX1170-LABEL: v_fdot2_abs_c:
-; GFX1170: ; %bb.0:
-; GFX1170: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2
-;
-; GFX12-LABEL: v_fdot2_abs_c:
-; GFX12: ; %bb.0:
-; GFX12: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
+; GFX11PLUS-LABEL: v_fdot2_abs_c:
+; GFX11PLUS: ; %bb.0:
+; GFX11PLUS: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
%abs.c = call float @llvm.fabs.f32(float %c)
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %abs.c, i1 false)
ret float %r
@@ -374,9 +287,7 @@ define float @v_fdot2_opsel_lo_a(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_opsel_lo_a:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v0, v0, v0, 0x7060302
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
;
; GFX11-LABEL: v_fdot2_opsel_lo_a:
; GFX11: ; %bb.0:
@@ -412,9 +323,7 @@ define float @v_fdot2_opsel_hi_a(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_opsel_hi_a:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v0, v0, v0, 0x5040100
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
;
; GFX11-LABEL: v_fdot2_opsel_hi_a:
; GFX11: ; %bb.0:
@@ -450,9 +359,7 @@ define float @v_fdot2_opsel_lo_b(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_opsel_lo_b:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v1, v1, v1, 0x7060302
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
;
; GFX11-LABEL: v_fdot2_opsel_lo_b:
; GFX11: ; %bb.0:
@@ -488,9 +395,7 @@ define float @v_fdot2_opsel_hi_b(<2 x half> %a, <2 x half> %b, float %c) {
;
; GFX10-LABEL: v_fdot2_opsel_hi_b:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v1, v1, v1, 0x5040100
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_mov_b32_e32 v0, v2
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
;
; GFX11-LABEL: v_fdot2_opsel_hi_b:
; GFX11: ; %bb.0:
@@ -919,24 +824,21 @@ define float @v_fdot2_neg_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
;
; GFX950-LABEL: v_fdot2_neg_a_dual:
; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v0, 0x80008000, v0
-; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX10-LABEL: v_fdot2_neg_a_dual:
; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_neg_a_dual:
; GFX11: ; %bb.0:
-; GFX11: v_xor_b32_e32 v0, 0x80008000, v0
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0]
; GFX11: v_dot2acc_f32_f16 v5, v3, v4
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_add_f32_e32 v0, v2, v5
+; GFX11: v_add_f32_e32 v0, v0, v5
;
; GFX1170-LABEL: v_fdot2_neg_a_dual:
; GFX1170: ; %bb.0:
@@ -974,11 +876,9 @@ define float @v_fdot2_neg_a_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
;
; GFX10-LABEL: v_fdot2_neg_a_lo_dual:
; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v6, 0x8000, v0
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[1,0,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_bfi_b32 v0, 0xffff, v6, v0
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_neg_a_lo_dual:
; GFX11: ; %bb.0:
@@ -1029,12 +929,9 @@ define float @v_fdot2_neg_a_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
;
; GFX10-LABEL: v_fdot2_neg_a_hi_dual:
; GFX10: ; %bb.0:
-; GFX10: v_mov_b32_e32 v6, 0x8000
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[1,0,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_xor_b32_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10: v_perm_b32 v0, v6, v0, 0x5040100
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_neg_a_hi_dual:
; GFX11: ; %bb.0:
@@ -1076,24 +973,21 @@ define float @v_fdot2_neg_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
;
; GFX950-LABEL: v_fdot2_neg_b_dual:
; GFX950: ; %bb.0:
-; GFX950: v_xor_b32_e32 v1, 0x80008000, v1
-; GFX950: v_dot2c_f32_f16_e32 v2, v0, v1
+; GFX950: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX950: v_dot2c_f32_f16_e32 v5, v3, v4
-; GFX950: v_add_f32_e32 v0, v2, v5
+; GFX950: v_add_f32_e32 v0, v0, v5
;
; GFX10-LABEL: v_fdot2_neg_b_dual:
; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_neg_b_dual:
; GFX11: ; %bb.0:
-; GFX11: v_xor_b32_e32 v1, 0x80008000, v1
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0]
; GFX11: v_dot2acc_f32_f16 v5, v3, v4
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_add_f32_e32 v0, v2, v5
+; GFX11: v_add_f32_e32 v0, v0, v5
;
; GFX1170-LABEL: v_fdot2_neg_b_dual:
; GFX1170: ; %bb.0:
@@ -1131,11 +1025,9 @@ define float @v_fdot2_neg_b_lo_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
;
; GFX10-LABEL: v_fdot2_neg_b_lo_dual:
; GFX10: ; %bb.0:
-; GFX10: v_xor_b32_e32 v6, 0x8000, v1
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,1,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_bfi_b32 v1, 0xffff, v6, v1
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_neg_b_lo_dual:
; GFX11: ; %bb.0:
@@ -1186,12 +1078,9 @@ define float @v_fdot2_neg_b_hi_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x
;
; GFX10-LABEL: v_fdot2_neg_b_hi_dual:
; GFX10: ; %bb.0:
-; GFX10: v_mov_b32_e32 v6, 0x8000
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,1,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_xor_b32_sdwa v6, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10: v_perm_b32 v1, v6, v1, 0x5040100
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_neg_b_hi_dual:
; GFX11: ; %bb.0:
@@ -1248,14 +1137,13 @@ define float @v_fdot2_neg_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
;
; GFX11-LABEL: v_fdot2_neg_c_dual:
; GFX11: ; %bb.0:
-; GFX11: v_xor_b32_e32 v2, 0x80000000, v2
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX11: v_dot2acc_f32_f16 v5, v3, v4
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_add_f32_e32 v0, v2, v5
+; GFX11: v_add_f32_e32 v0, v0, v5
;
; GFX1170-LABEL: v_fdot2_neg_c_dual:
; GFX1170: ; %bb.0:
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1]
; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5
; GFX1170: v_add_f32_e32 v0, v0, v1
;
@@ -1295,15 +1183,13 @@ define float @v_fdot2_abs_c_dual(<2 x half> %a, <2 x half> %b, float %c, <2 x ha
;
; GFX11-LABEL: v_fdot2_abs_c_dual:
; GFX11: ; %bb.0:
-; GFX11: v_and_b32_e32 v2, 0x7fffffff, v2
+; GFX11: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX11: v_dot2acc_f32_f16 v5, v3, v4
-; GFX11: v_dot2acc_f32_f16 v2, v0, v1
-; GFX11: v_add_f32_e32 v0, v2, v5
+; GFX11: v_add_f32_e32 v0, v0, v5
;
; GFX1170-LABEL: v_fdot2_abs_c_dual:
; GFX1170: ; %bb.0:
-; GFX1170: v_and_b32_e32 v2, 0x7fffffff, v2
-; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX1170: v_dot2_f32_f16 v0, v0, v1, v2 neg_hi:[0,0,1]
; GFX1170: v_dot2_f32_f16 v1, v3, v4, v5
; GFX1170: v_add_f32_e32 v0, v0, v1
;
@@ -1336,10 +1222,9 @@ define float @v_fdot2_opsel_lo_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
;
; GFX10-LABEL: v_fdot2_opsel_lo_a_dual:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v0, v0, v0, 0x7060302
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[1,0,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_opsel_lo_a_dual:
; GFX11: ; %bb.0:
@@ -1385,10 +1270,9 @@ define float @v_fdot2_opsel_hi_a_dual(<2 x half> %a, <2 x half> %b, float %c, <2
;
; GFX10-LABEL: v_fdot2_opsel_hi_a_dual:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v0, v0, v0, 0x5040100
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[0,1,1]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_opsel_hi_a_dual:
; GFX11: ; %bb.0:
@@ -1434,10 +1318,9 @@ define float @v_fdot2_opsel_lo_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
;
; GFX10-LABEL: v_fdot2_opsel_lo_b_dual:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v1, v1, v1, 0x7060302
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel:[0,1,0]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_opsel_lo_b_dual:
; GFX11: ; %bb.0:
@@ -1483,10 +1366,9 @@ define float @v_fdot2_opsel_hi_b_dual(<2 x half> %a, <2 x half> %b, float %c, <2
;
; GFX10-LABEL: v_fdot2_opsel_hi_b_dual:
; GFX10: ; %bb.0:
-; GFX10: v_perm_b32 v1, v1, v1, 0x5040100
+; GFX10: v_dot2_f32_f16 v0, v0, v1, v2 op_sel_hi:[1,0,1]
; GFX10: v_dot2c_f32_f16 v5, v3, v4
-; GFX10: v_dot2c_f32_f16 v2, v0, v1
-; GFX10: v_add_f32_e32 v0, v2, v5
+; GFX10: v_add_f32_e32 v0, v0, v5
;
; GFX11-LABEL: v_fdot2_opsel_hi_b_dual:
; GFX11: ; %bb.0:
More information about the llvm-branch-commits mailing list