[llvm-branch-commits] [llvm] [AMDGPU] Create V_FMAAK_F16/V_FMAMK_F16 in true16 with imm folding (PR #173317)
Stanislav Mekhanoshin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jan 13 13:01:30 PST 2026
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/173317
>From 47a890e3881617c703e25c1587439d510b062c4f Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 22 Dec 2025 14:31:48 -0800
Subject: [PATCH] [AMDGPU] Create V_FMAAK_F16/V_FMAMK_F16 in true16 with imm
folding
This patch does not add test coverage for real true16; the next patch will.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 39 +++++++++++++-------
llvm/test/CodeGen/AMDGPU/fma.f16.ll | 23 ++++++------
llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 8 ++--
3 files changed, 40 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index bd6c58d0f8945..f49a5d32c6604 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3776,13 +3776,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
- // takes VGPR_32_Lo128 operands, so the rewrite would also require
- // restricting their register classes. For now just bail out.
- if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
- NewOpc == AMDGPU::V_FMAMK_F16_fake16)
- return false;
-
const std::optional<int64_t> SubRegImm = extractSubregFromImm(
Imm, RegSrc == Src1 ? Src0->getSubReg() : Src1->getSubReg());
@@ -3807,6 +3800,18 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
removeModOperands(UseMI);
UseMI.setDesc(get(NewOpc));
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAMK_F16_fake16) {
+ auto Tmp = MRI->createVirtualRegister(
+ NewOpc == AMDGPU::V_FMAMK_F16_t16 ? &AMDGPU::VGPR_16_Lo128RegClass
+ : &AMDGPU::VGPR_32_Lo128RegClass);
+ BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
+ UseMI.getDebugLoc(), get(AMDGPU::COPY),
+ UseMI.getOperand(0).getReg())
+ .addReg(Tmp, RegState::Kill);
+ UseMI.getOperand(0).setReg(Tmp);
+ }
+
bool DeleteDef = MRI->use_nodbg_empty(Reg);
if (DeleteDef)
DefMI.eraseFromParent();
@@ -3854,13 +3859,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
- // takes VGPR_32_Lo128 operands, so the rewrite would also require
- // restricting their register classes. For now just bail out.
- if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
- NewOpc == AMDGPU::V_FMAAK_F16_fake16)
- return false;
-
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
@@ -3880,6 +3878,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// These come before src2.
removeModOperands(UseMI);
UseMI.setDesc(get(NewOpc));
+
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
+ NewOpc == AMDGPU::V_FMAAK_F16_fake16) {
+ auto Tmp = MRI->createVirtualRegister(
+ NewOpc == AMDGPU::V_FMAAK_F16_t16 ? &AMDGPU::VGPR_16_Lo128RegClass
+ : &AMDGPU::VGPR_32_Lo128RegClass);
+ BuildMI(*UseMI.getParent(), std::next(UseMI.getIterator()),
+ UseMI.getDebugLoc(), get(AMDGPU::COPY),
+ UseMI.getOperand(0).getReg())
+ .addReg(Tmp, RegState::Kill);
+ UseMI.getOperand(0).setReg(Tmp);
+ }
+
// It might happen that UseMI was commuted
// and we now have SGPR as SRC1. If so 2 inlined
// constant and SGPR are illegal.
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index bbd493f668847..20db029aad27f 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -428,12 +428,11 @@ define i32 @test_D139469_f16(half %arg) {
; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16:
; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb
; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1
-; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x291e
+; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, 0x291e, v0
+; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e
+; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v1, v0
; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
@@ -491,12 +490,12 @@ define i32 @test_D139469_f16(half %arg) {
; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e
-; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0
-; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v2, v1
-; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x291e
+; GFX12-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, 0x291e, v0
+; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_sa_sdst(0)
+; GFX12-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, s0, v0, 0x211e
+; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-FAKE16-NEXT: v_min_num_f16_e32 v0, v1, v0
; GFX12-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0
; GFX12-SDAG-FAKE16-NEXT: s_wait_alu depctr_va_vcc(0)
; GFX12-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index efb55db486489..da0524c2bd93b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -509,8 +509,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s9, s1
-; GFX11-DENORM-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1
+; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-DENORM-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %b,
@@ -741,8 +741,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s8, s0
; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s9, s1
-; GFX11-DENORM-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v1
+; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-DENORM-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a,
More information about the llvm-branch-commits
mailing list