[llvm] [AMDGPU][True16][CodeGen] fold clamp update for true16 (PR #128919)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 24 07:45:11 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/128919
>From 56a2c731f64e7945c29f8f4b3e08955aed7c6a09 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Wed, 26 Feb 2025 12:31:48 -0500
Subject: [PATCH 1/3] true16 for fold clamp
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 ++++-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 29 +++++---------------
llvm/test/CodeGen/AMDGPU/true16-fold.mir | 33 +++++++++++++++++++++++
3 files changed, 45 insertions(+), 23 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/true16-fold.mir
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index cc15dd7cb495c..d970e4d0580f3 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1820,6 +1820,10 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
return false;
MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+ MachineInstr *OrigDef = Def;
+ // Look through COPY. COPY only observed with True16.
+ if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
+ Def = MRI->getVRegDef(Def->getOperand(1).getReg());
// The type of clamp must be compatible.
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1837,7 +1841,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
- Register DefReg = Def->getOperand(0).getReg();
+ Register DefReg = OrigDef->getOperand(0).getReg();
Register MIDstReg = MI.getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, DefReg)) {
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 15cb404a3840a..beac41e42e0c6 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -269,19 +269,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
}
define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %src1, float %src2) #0 {
-; SDAG-GFX1100-TRUE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-TRUE16: ; %bb.0:
-; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v0.l, v0.l, v0.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; SDAG-GFX1100-FAKE16-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; SDAG-GFX1100-FAKE16: ; %bb.0:
-; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GFX900: ; %bb.0:
@@ -312,12 +304,6 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
-; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
-; GISEL-GFX1100: ; %bb.0:
-; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp
-; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
-;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1524,10 +1510,9 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-TRUE16-NEXT: v_max_f16_e64 v3.l, v3.l, v3.l clamp
-; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v3, v3, v4, v5 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixhi_f16 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX1100-TRUE16-NEXT: v_mov_b32_e32 v0, v3
; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
new file mode 100644
index 0000000000000..b5c49fdfa5edf
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -0,0 +1,33 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr="+wavefrontsize32",+real-true16 -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: fold_16bit_madmix_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0.entry:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_madmix_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %10:vgpr_32 = COPY $vgpr2
+ %9:vgpr_32 = COPY $vgpr1
+ %8:vgpr_32 = COPY $vgpr0
+ %12:sreg_32 = IMPLICIT_DEF
+ %13:vgpr_32 = COPY %12:sreg_32
+ %11:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %8:vgpr_32, 8, %9:vgpr_32, 0, %10:vgpr_32, 0, %13:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %15:vgpr_16 = COPY %11:vgpr_32
+ %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %14:vgpr_16
+ S_ENDPGM 0, implicit $vgpr0
+...
>From e772194ce5bea98648323a50995a51bab4a59235 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Sat, 8 Mar 2025 19:40:57 -0500
Subject: [PATCH 2/3] address comment
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 8 +-
llvm/test/CodeGen/AMDGPU/true16-fold.mir | 147 ++++++++++++++++++++--
2 files changed, 137 insertions(+), 18 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index d970e4d0580f3..c9a4e44c195ce 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1819,11 +1819,9 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
if (!ClampSrc || !MRI->hasOneNonDBGUser(ClampSrc->getReg()))
return false;
- MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
- MachineInstr *OrigDef = Def;
// Look through COPY. COPY only observed with True16.
- if (Def->isCopy() && Def->getOperand(1).getReg().isVirtual())
- Def = MRI->getVRegDef(Def->getOperand(1).getReg());
+ MachineOperand *DefSrc = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
+ MachineInstr *Def = MRI->getVRegDef(DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
// The type of clamp must be compatible.
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
@@ -1841,7 +1839,7 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
// Clamp is applied after omod, so it is OK if omod is set.
DefClamp->setImm(1);
- Register DefReg = OrigDef->getOperand(0).getReg();
+ Register DefReg = Def->getOperand(0).getReg();
Register MIDstReg = MI.getOperand(0).getReg();
if (TRI->isSGPRReg(*MRI, DefReg)) {
// Pseudo scalar instructions have a SGPR for dst and clamp is a v_max*
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index b5c49fdfa5edf..2f024cdaa2517 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -1,12 +1,12 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr="+wavefrontsize32",+real-true16 -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass si-fold-operands -mattr=+real-true16 -o - %s | FileCheck %s
---
name: fold_16bit_madmix_clamp
tracksRegLiveness: true
registers:
body: |
- bb.0.entry:
+ bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-LABEL: name: fold_16bit_madmix_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
@@ -16,18 +16,139 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
- ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
- ; CHECK-NEXT: $vgpr0 = COPY [[COPY4]]
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
- %10:vgpr_32 = COPY $vgpr2
- %9:vgpr_32 = COPY $vgpr1
- %8:vgpr_32 = COPY $vgpr0
- %12:sreg_32 = IMPLICIT_DEF
- %13:vgpr_32 = COPY %12:sreg_32
- %11:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %8:vgpr_32, 8, %9:vgpr_32, 0, %10:vgpr_32, 0, %13:vgpr_32, 0, 0, implicit $mode, implicit $exec
- %15:vgpr_16 = COPY %11:vgpr_32
- %14:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %15:vgpr_16, 0, %15:vgpr_16, -1, 0, 0, implicit $mode, implicit $exec
- $vgpr0 = COPY %14:vgpr_16
+ %0:vgpr_32 = COPY $vgpr2
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr0
+ %3:sreg_32 = IMPLICIT_DEF
+ %4:vgpr_32 = COPY %3
+ %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+ %6:vgpr_16 = COPY %5
+ %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %7
+ S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_subreg_folded_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_madmix_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = COPY $vgpr2
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr0
+ %3:sreg_32 = IMPLICIT_DEF
+ %4:vgpr_32 = COPY %3
+ %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+ %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %5.lo16, 0, %5.lo16, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %6
+ S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_subreg_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_subreg_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = COPY $vgpr2
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr0
+ %3:sreg_32 = IMPLICIT_DEF
+ %4:vgpr_32 = COPY %3
+ %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+ %6:vgpr_16 = COPY %5.lo16
+ %7:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %6, 0, %6, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %7
+ S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_phyreg_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_phyreg_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = COPY $vgpr2
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr0
+ %3:sreg_32 = IMPLICIT_DEF
+ %4:vgpr_32 = COPY %3
+ %5:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, %2, 8, %1, 0, %0, 0, %4, 0, 0, implicit $mode, implicit $exec
+ $vgpr10_lo16 = COPY %5
+ %6:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %6
+ S_ENDPGM 0, implicit $vgpr0
+...
+
+---
+name: fold_16bit_undef_clamp
+tracksRegLiveness: true
+registers:
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-LABEL: name: fold_16bit_undef_clamp
+ ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[DEF]], 0, [[DEF]], -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
+ %0:vgpr_32 = COPY $vgpr2
+ %1:vgpr_32 = COPY $vgpr1
+ %2:vgpr_32 = COPY $vgpr0
+ %3:vgpr_16 = IMPLICIT_DEF
+ %4:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, %3, 0, %3, -1, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = COPY %4
S_ENDPGM 0, implicit $vgpr0
...
>From 3dad0c3f3dadaef4962f6782f627a2016baba0cb Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 13 Mar 2025 17:22:43 -0400
Subject: [PATCH 3/3] address comment
---
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 5 +++--
llvm/test/CodeGen/AMDGPU/true16-fold.mir | 21 +++++++++------------
2 files changed, 12 insertions(+), 14 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index c9a4e44c195ce..160577b3a3ecf 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1820,8 +1820,9 @@ bool SIFoldOperandsImpl::tryFoldClamp(MachineInstr &MI) {
return false;
// Look through COPY. COPY only observed with True16.
- MachineOperand *DefSrc = TRI->lookThruCopyLike(ClampSrc->getReg(), MRI);
- MachineInstr *Def = MRI->getVRegDef(DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
+ MachineOperand *DefSrc = lookUpCopyChain(*TII, *MRI, ClampSrc->getReg());
+ MachineInstr *Def = MRI->getVRegDef(
+ DefSrc && DefSrc->isReg() ? DefSrc->getReg() : ClampSrc->getReg());
// The type of clamp must be compatible.
if (TII->getClampMask(*Def) != TII->getClampMask(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/true16-fold.mir b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
index 2f024cdaa2517..02acdcbaf88b1 100644
--- a/llvm/test/CodeGen/AMDGPU/true16-fold.mir
+++ b/llvm/test/CodeGen/AMDGPU/true16-fold.mir
@@ -16,10 +16,9 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
- ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
- ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
@@ -34,13 +33,13 @@ body: |
...
---
-name: fold_16bit_subreg_folded_clamp
+name: fold_16bit_subreg_1_clamp
tracksRegLiveness: true
registers:
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: fold_16bit_madmix_clamp
+ ; CHECK-LABEL: name: fold_16bit_subreg_1_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
@@ -49,8 +48,7 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_16 = COPY [[V_FMA_MIXLO_F16_]]
- ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY4]], 0, [[COPY4]], -1, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[V_FMA_MIXLO_F16_]].lo16, 0, [[V_FMA_MIXLO_F16_]].lo16, -1, 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
@@ -65,13 +63,13 @@ body: |
...
---
-name: fold_16bit_subreg_clamp
+name: fold_16bit_subreg_2_clamp
tracksRegLiveness: true
registers:
body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
- ; CHECK-LABEL: name: fold_16bit_subreg_clamp
+ ; CHECK-LABEL: name: fold_16bit_subreg_2_clamp
; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
@@ -110,10 +108,9 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
- ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 0, [[COPY3]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[V_FMA_MIXLO_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMA_MIXLO_F16 8, [[COPY2]], 8, [[COPY1]], 0, [[COPY]], 1, [[COPY3]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: $vgpr10_lo16 = COPY [[V_FMA_MIXLO_F16_]]
- ; CHECK-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, $vgpr10_lo16, 0, $vgpr10_lo16, -1, 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY [[V_MAX_F16_t16_e64_]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[V_FMA_MIXLO_F16_]]
; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr0
%0:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
More information about the llvm-commits
mailing list