[llvm-branch-commits] [llvm] [AMDGPU] si-peephole-sdwa: Handle V_PACK_B32_F16_e64 (WIP) (PR #176383)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jan 19 10:04:33 PST 2026
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Frederik Harwath (frederik-h)
Changes:
Change si-peephole-sdwa to eliminate V_PACK_B32_F16_e64 instructions by
converting the instruction that defines the pack's second source operand
into an SDWA instruction that writes the upper word of the pack
destination directly, preserving the lower word.
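For illustration, this is the shape of the rewrite as it appears in the
updated GFX9/GFX10 checks below; the register numbers are illustrative,
while the instruction pattern itself is taken from the test updates in
this patch:

```
; Before: both halves are computed into separate registers and then packed.
v_sub_f16_e32  v0, v2, v4
v_sub_f16_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
v_pack_b32_f16 v0, v0, v1

; After: the def of the pack's second operand writes WORD_1 of the result
; directly and preserves the low half, so the v_pack_b32_f16 is no longer needed.
v_sub_f16_e32  v0, v2, v4
v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
```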
---
Patch is 254.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/176383.diff
36 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+46-12)
- (modified) llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (+35)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll (+48-64)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll (+24-32)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll (+40-40)
- (modified) llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll (+5-6)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+18-20)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+28-43)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+19-31)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.f16.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/fmaximum3.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/fminimum3.ll (+3-4)
- (modified) llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll (+126-140)
- (modified) llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll (+29-34)
- (modified) llvm/test/CodeGen/AMDGPU/fpow.ll (+92-104)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll (+7-12)
- (modified) llvm/test/CodeGen/AMDGPU/fract-match.ll (+4-5)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll (+29-42)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+68-60)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp10.ll (+87-82)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+13-22)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+138-98)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+138-98)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log2.ll (+68-127)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll (+29-42)
- (modified) llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll (+74-118)
- (modified) llvm/test/CodeGen/AMDGPU/repeated-divisor.ll (+16-19)
- (modified) llvm/test/CodeGen/AMDGPU/roundeven.ll (+42-44)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-commute.ll (+2-3)
- (modified) llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll (+25-30)
- (modified) llvm/test/CodeGen/AMDGPU/v_pack.ll (+18-20)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0788cbb18269b..5b2e41ef5f7e8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -655,6 +655,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
break;
case ISD::EXTRACT_SUBVECTOR:
case ISD::CONCAT_VECTORS:
+ case ISD::FSIN:
+ case ISD::FCOS:
setOperationAction(Op, VT, Custom);
break;
default:
@@ -9876,6 +9878,35 @@ SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
DAG.getValueType(SmallVT));
}
+/// Helper function for LowerINTRINSIC_WO_CHAIN. Replace a scalar-typed
+/// \p Op with a new \p NewISD node whose single argument is the operand
+/// at index \p OperandIndex of \p Op. Scalarizes for vector types.
+///
+// FIXME The manual scalarization seems to be necessary because the
+// Expand fallback is not supported for ISD::INTRINSIC_WO_CHAIN, so
+// the lowering function must not fail for v2f16; see the comment
+// in SelectionDAGLegalize::ExpandNode.
+static SDValue BuildScalarizedUnaryOp(SDValue Op, unsigned NewISD,
+ unsigned OperandIndex,
+ SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue Operand = Op.getOperand(OperandIndex);
+ if (!VT.isVector())
+ return DAG.getNode(NewISD, DL, VT, Operand);
+
+ EVT ScalarVT = VT.getScalarType();
+ unsigned NElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Args;
+
+ DAG.ExtractVectorElements(Operand, Args, 0, NElts);
+ for (unsigned I = 0; I < NElts; ++I)
+ Args[I] = DAG.getNode(NewISD, DL, ScalarVT, Args[I]);
+
+ return DAG.getBuildVector(VT, DL, Args);
+}
+
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
@@ -10098,10 +10129,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
case Intrinsic::amdgcn_sin:
- return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
+ return BuildScalarizedUnaryOp(Op, AMDGPUISD::SIN_HW, 1, DAG);
case Intrinsic::amdgcn_cos:
- return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
+ return BuildScalarizedUnaryOp(Op, AMDGPUISD::COS_HW, 1, DAG);
case Intrinsic::amdgcn_mul_u24:
return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
@@ -10117,7 +10148,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitRemovedIntrinsicError(DAG, DL, VT);
}
case Intrinsic::amdgcn_fract:
- return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
+ return BuildScalarizedUnaryOp(Op, AMDGPUISD::FRACT, 1, DAG);
case Intrinsic::amdgcn_class:
return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
@@ -12965,6 +12996,9 @@ SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
}
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+ unsigned OpC = Op.getOpcode();
+ assert((OpC == ISD::FCOS || OpC == ISD::FSIN) && "Wrong trig opcode");
+
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue Arg = Op.getOperand(0);
@@ -12978,19 +13012,19 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->hasTrigReducedRange()) {
SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
- TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
+ SDValue FractId =
+ DAG.getTargetConstant(Intrinsic::amdgcn_fract, DL, MVT::i32);
+ TrigVal =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, FractId, MulVal, Flags);
} else {
TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
}
- switch (Op.getOpcode()) {
- case ISD::FCOS:
- return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
- case ISD::FSIN:
- return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
- default:
- llvm_unreachable("Wrong trig opcode");
- }
+ Intrinsic::AMDGCNIntrinsics Intrinsic =
+ OpC == ISD::FSIN ? Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
+ SDValue TrigId = DAG.getTargetConstant(Intrinsic, DL, MVT::i32);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(Op), VT, TrigId, TrigVal,
+ Flags);
}
SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index acc4b3f0a68b4..232d975c3fc4e 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -455,6 +455,23 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// writing WORD_1. Modifiers don't matter because all the bits that
// would be impacted are being overwritten by the dst.
// Any other case will not work.
+ //
+ // FIXME Is this really true for f16 operands? That is, this
+ // change introduced by the v_pack_b32_f16 conversion looks wrong:
+ // @@ -2394,17 +2394,17 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
+ // ; GFX9-LABEL: v_neg_rsq_v2f16:
+ // ; GFX9: ; %bb.0:
+ // ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ // -; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+ // -; GFX9-NEXT: v_rsq_f16_e32 v0, v0
+ // -; GFX9-NEXT: v_pack_b32_f16 v0, -v0, -v1
+ // +; GFX9-NEXT: v_rsq_f16_e32 v1, v0
+ // +; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1
+ // +; GFX9-NEXT: v_mov_b32_e32 v0, v1
+ // ; GFX9-NEXT: s_setpc_b64 s[30:31]
SdwaSel DstSel = static_cast<SdwaSel>(
TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
@@ -961,7 +978,25 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::make_unique<SDWADstPreserveOperand>(
OrDst, OrSDWADef, OrOtherDef, DstSel);
+ }
+ case AMDGPU::V_PACK_B32_F16_e64: {
+ MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+
+ bool InvalidOp = false;
+ for (auto *Op : {Dst, Src1, Src2})
+ if (!Op || !Op->isReg() || Op->getReg().isPhysical())
+ InvalidOp = true;
+
+ if (InvalidOp)
+ break;
+
+ if (isSameReg(*Src1, *Src2))
+ break;
+ // FIXME Figure out necessary restrictions on Src1 and Src2
+ return std::make_unique<SDWADstPreserveOperand>(Dst, Src1, Src2, WORD_1);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
index d046b854fb0d8..9b4b14e6ca105 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-mul.ll
@@ -543,14 +543,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-LABEL: test_v4f16_sub_mul:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -563,27 +561,23 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX9-DENORM-LABEL: test_v4f16_sub_mul:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX10-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX10-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -596,14 +590,12 @@ define <4 x half> @test_v4f16_sub_mul(<4 x half> %x, <4 x half> %y, <4 x half> %
; GFX10-DENORM-LABEL: test_v4f16_sub_mul:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul:
@@ -642,14 +634,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-LABEL: test_v4f16_sub_mul_rhs:
; GFX9: ; %bb.0: ; %.entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX9-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX9-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -662,27 +652,23 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX9-DENORM-LABEL: test_v4f16_sub_mul_rhs:
; GFX9-DENORM: ; %bb.0: ; %.entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_mul_rhs:
; GFX10: ; %bb.0: ; %.entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX10-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX10-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
@@ -695,14 +681,12 @@ define <4 x half> @test_v4f16_sub_mul_rhs(<4 x half> %x, <4 x half> %y, <4 x hal
; GFX10-DENORM-LABEL: test_v4f16_sub_mul_rhs:
; GFX10-DENORM: ; %bb.0: ; %.entry
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2
-; GFX10-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v2, v4, v0
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_sub_f16_e32 v3, v5, v1
-; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2
+; GFX10-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v0, v4, v2
+; GFX10-DENORM-NEXT: v_sub_f16_e32 v1, v5, v3
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v0, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX10-DENORM-NEXT: v_sub_f16_sdwa v1, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-CONTRACT-LABEL: test_v4f16_sub_mul_rhs:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
index c0a828ecacbae..6143e91f037df 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-sub-neg-mul.ll
@@ -219,14 +219,12 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-LABEL: test_v4f16_sub_ext_neg_mul:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-CONTRACT-LABEL: test_v4f16_sub_ext_neg_mul:
@@ -239,27 +237,23 @@ define <4 x half> @test_v4f16_sub_ext_neg_mul(<4 x half> %x, <4 x half> %y, <4 x
; GFX9-DENORM-LABEL: test_v4f16_sub_ext_neg_mul:
; GFX9-DENORM: ; %bb.0: ; %entry
; GFX9-DENORM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT: v_pk_mul_f16 v1, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v2, v0, v4
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_sub_f16_e32 v3, v1, v5
-; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-DENORM-NEXT: v_pack_b32_f16 v1, v3, v1
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v2, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-DENORM-NEXT: v_pk_mul_f16 v3, v1, v3 neg_lo:[0,1] neg_hi:[0,1]
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v0, v2, v4
+; GFX9-DENORM-NEXT: v_sub_f16_e32 v1, v3, v5
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v0, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
+; GFX9-DENORM-NEXT: v_sub_f16_sdwa v1, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1
; GFX9-DENORM-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_v4f16_sub_ext_neg_mul:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_mul_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1]
-; GFX10-NEXT: v_pk_mul_f16 v1, v1, v3...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/176383
More information about the llvm-branch-commits mailing list