[llvm] AMDGPU][True16][CodeGen] fold clamp update for true16 (PR #128919)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 8 18:40:12 PST 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/128919
>From d17eb59740bb42a38f0bf538658e9c25f574f803 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 26 Feb 2025 12:31:48 -0500
Subject: [PATCH 1/2] true16 for fold clamp
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 ++++-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 29 +++++---------------
llvm/test/CodeGen/AMDGPU/true16-fold.mir | 33 +++++++++++++++++++++++
3 files changed, 45 insertions(+), 23 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/true16-fold.mir
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..afc1ce1db4e75 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1820,6 +1820,10 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
return false;
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+ MachineInstr *OrigDef = Def;
+ // Look through COPY. COPY only observed with True16.
+ if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
+ Def = MRI->getVRegDef(Def->getOperand(1).getReg());
// The type of clamp must be compatible.
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1837,7 +1841,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
- Register DefReg = Def->getOperand(0).getReg();
+ Register DefReg = OrigDef->getOperand(0).getReg();
Register MIDstReg = MI.getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, DefReg)) {
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 9949b823dfec1..51965acec9573 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
-; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-TRUE16: ; %bb.0:
-; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-FAKE16: ; %bb.0:
-; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GFX900: ; %bb.0:
@@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; GISEL-GFX1100: ; %bb.0:
-; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1546,10 +1532,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
new file mode 100644
index 0000000000000..b5c49fdfa5edf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr="+wavefrontsize32",+real-true16 -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: fold_16bit_madmix_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_madmix_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %10:vgpr_32 = COPY $vgpr2
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:sreg_32 = IMPLICIT_DEF
+ %13:vgpr_32 = COPY %12:sreg_32
+ %11:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %8:vgpr_32, 8, %9:vgpr_32, 0, %10:vgpr_32, 0, %13:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_16 = COPY %11:vgpr_32
+ %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %14:vgpr_16
+ S_ENDPGM 0, implicit $vgpr0
+...
>From 0789fcf6150c5af82f6c1b0a0616f1be9220f65e Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Sat, 8 Mar 2025 19:40:57 -0500
Subject: [PATCH 2/2] address comment
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 12 +--
llvm/test/CodeGen/AMDGPU/true16-fold.mir | 90 ++++++++++++++++++++++-
2 files changed, 96 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index afc1ce1db4e75..db9305cdea5a9 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1819,11 +1819,13 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
return false;
- MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
- MachineInstr *OrigDef = Def;
// Look through COPY. COPY only observed with True16.
- if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
- Def = MRI->getVRegDef(Def->getOperand(1).getReg());
+ MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, ClampSrc->getReg());
+ MachineInstr *Def = nullptr;
+ if (DefSrc && DefSrc->isReg() && !DefSrc->isImm())
+ Def = MRI->getVRegDef(DefSrc->getReg());
+ else
+ Def = MRI->getVRegDef(ClampSrc->getReg());
// The type of clamp must be compatible.
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1841,7 +1843,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
- Register DefReg = OrigDef->getOperand(0).getReg();
+ Register DefReg = Def->getOperand(0).getReg();
Register MIDstReg = MI.getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, DefReg)) {
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index b5c49fdfa5edf..ea8615082ff5d 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -18,7 +18,7 @@ body: |
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%10:vgpr_32 = COPY $vgpr2
%9:vgpr_32 = COPY $vgpr1
@@ -31,3 +31,91 @@ body: |
$vgpr0 = COPY %14:vgpr_16
S_ENDPGM 0, implicit $vgpr0
...
+
+---
+name: fold_16bit_subreg_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_subreg_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY3]].lo16, 0, [[COPY3]].lo16, -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %10:vgpr_32 = COPY $vgpr2
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:sreg_32 = IMPLICIT_DEF
+ %13:vgpr_32 = COPY %12:sreg_32
+ %15:vgpr_16 = COPY %13.lo16:vgpr_32
+ %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %14:vgpr_16
+ S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_phyreg_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_phyreg_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: $vgpr10_lo16 = COPY [[DEF]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_16 = COPY $vgpr10_lo16
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY3]], 0, [[COPY3]], -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %10:vgpr_32 = COPY $vgpr2
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:sreg_32 = IMPLICIT_DEF
+ $vgpr10_lo16 = COPY %12:sreg_32
+ %15:vgpr_16 = COPY $vgpr10_lo16
+ %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %14:vgpr_16
+ S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_undef_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_undef_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY3]].lo16, 0, [[COPY3]].lo16, -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %10:vgpr_32 = COPY $vgpr2
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:sreg_32 = IMPLICIT_DEF
+ %13:vgpr_32 = COPY %12:sreg_32
+ %15:vgpr_16 = COPY %13.lo16:vgpr_32
+ %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %14:vgpr_16
+ S_ENDPGM 0, implicit $vgpr0
+...
More information about the llvm-commits
mailing list