[llvm] [AMDGPU][GISel] Fold 'min(min(x,y),z)' and 'max(max(x,y),z)' into min3 and max3 (PR #124263)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 24 04:48:28 PST 2025
https://github.com/Ruhung created https://github.com/llvm/llvm-project/pull/124263
Draft PR.
Fold 'min(min(x,y),z)' and 'max(max(x,y),z)' into min3 and max3 in the AMDGPU GlobalISel RegBankCombiner.
Fixes: #123079
>From a308dac4370db8cb9a8b69eedfc8ebca5190d146 Mon Sep 17 00:00:00 2001
From: Ruhung <jhlee at pllab.cs.nthu.edu.tw>
Date: Fri, 24 Jan 2025 11:17:37 +0800
Subject: [PATCH] [AMDGPU][GISel] Fold 'min(min(x,y),z)' and 'max(max(x,y),z)'
into min3 and max3.
---
llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 23 ++++--
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 7 ++
.../Target/AMDGPU/AMDGPURegBankCombiner.cpp | 70 +++++++++++++++++++
llvm/lib/Target/AMDGPU/SIInstructions.td | 36 ++++++++++
llvm/test/CodeGen/AMDGPU/ctlz.ll | 6 +-
llvm/test/CodeGen/AMDGPU/cttz.ll | 6 +-
6 files changed, 133 insertions(+), 15 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index da47aaf8a3b5c9..477ecbfe97b4a2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -71,6 +71,16 @@ def int_minmax_to_med3 : GICombineRule<
[{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
(apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+def minmax3_matchdata : GIDefMatchData<"VOP3MatchInfo">;
+
+def minmax_to_minmax3
+ : GICombineRule<
+ (defs root:$min_or_max, minmax3_matchdata:$matchinfo),
+ (match(wip_match_opcode G_SMAX, G_SMIN, G_UMAX, G_UMIN, G_FMAXNUM,
+ G_FMINNUM, G_FMAXNUM_IEEE, G_FMINNUM_IEEE):$min_or_max,
+ [{ return matchMinMaxToMinMax3(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ applyVOP3(*${min_or_max}, ${matchinfo}); }])>;
+
def fp_minmax_to_med3 : GICombineRule<
(defs root:$min_or_max, med3_matchdata:$matchinfo),
(match (wip_match_opcode G_FMAXNUM,
@@ -175,10 +185,9 @@ def AMDGPUPostLegalizerCombiner: GICombiner<
let CombineAllMethodName = "tryCombineAllImpl";
}
-def AMDGPURegBankCombiner : GICombiner<
- "AMDGPURegBankCombinerImpl",
- [unmerge_merge, unmerge_cst, unmerge_undef,
- zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
- fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
- redundant_and]> {
-}
+def AMDGPURegBankCombiner
+ : GICombiner<"AMDGPURegBankCombinerImpl",
+ [unmerge_merge, unmerge_cst, unmerge_undef, zext_trunc_fold,
+ int_minmax_to_med3, ptr_add_immed_chain, fp_minmax_to_clamp,
+ fp_minmax_to_med3, fmed3_intrinsic_to_clamp,
+ minmax_to_minmax3, redundant_and]> {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 1b909568fc555c..0038d9942a3201 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -244,6 +244,13 @@ def : GINodeEquiv<G_AMDGPU_CVT_PK_I16_I32, AMDGPUpk_i16_i32_impl>;
def : GINodeEquiv<G_AMDGPU_SMED3, AMDGPUsmed3>;
def : GINodeEquiv<G_AMDGPU_UMED3, AMDGPUumed3>;
def : GINodeEquiv<G_AMDGPU_FMED3, AMDGPUfmed3_impl>;
+def : GINodeEquiv<G_AMDGPU_SMAX3, AMDGPUsmax3>;
+def : GINodeEquiv<G_AMDGPU_UMAX3, AMDGPUumax3>;
+def : GINodeEquiv<G_AMDGPU_FMAX3, AMDGPUfmax3>;
+def : GINodeEquiv<G_AMDGPU_SMIN3, AMDGPUsmin3>;
+def : GINodeEquiv<G_AMDGPU_UMIN3, AMDGPUumin3>;
+def : GINodeEquiv<G_AMDGPU_FMIN3, AMDGPUfmin3>;
+
def : GINodeEquiv<G_AMDGPU_CLAMP, AMDGPUclamp>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 98c48f4fe3705b..5af2663397fb75 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -74,17 +74,27 @@ class AMDGPURegBankCombinerImpl : public Combiner {
Register Val0, Val1, Val2;
};
+ struct VOP3MatchInfo {
+ unsigned Opc;
+ Register Val0, Val1, Val2;
+ };
+
MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
+ unsigned getMinMax3(unsigned Opc) const;
template <class m_Cst, typename CstTy>
bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
Register &Val, CstTy &K0, CstTy &K1) const;
+ bool matchVOP3(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned Opc,
+ Register &A, Register &B, Register &C) const;
bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
+ bool matchMinMaxToMinMax3(MachineInstr &MI, VOP3MatchInfo &MatchInfo) const;
bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const;
bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const;
void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
+ void applyVOP3(MachineInstr &MI, VOP3MatchInfo &MatchInfo) const;
void applyClamp(MachineInstr &MI, Register &Reg) const;
private:
@@ -165,6 +175,27 @@ AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const {
}
}
+unsigned AMDGPURegBankCombinerImpl::getMinMax3(unsigned Opc) const {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unsupported opcode");
+ case AMDGPU::G_SMAX:
+ return AMDGPU::G_AMDGPU_SMAX3;
+ case AMDGPU::G_SMIN:
+ return AMDGPU::G_AMDGPU_SMIN3;
+ case AMDGPU::G_UMAX:
+ return AMDGPU::G_AMDGPU_UMAX3;
+ case AMDGPU::G_UMIN:
+ return AMDGPU::G_AMDGPU_UMIN3;
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMAXNUM_IEEE:
+ return AMDGPU::G_AMDGPU_FMAX3;
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ return AMDGPU::G_AMDGPU_FMIN3;
+ }
+}
+
template <class m_Cst, typename CstTy>
bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI,
MachineRegisterInfo &MRI,
@@ -187,6 +218,36 @@ bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI,
m_Cst(K0))));
}
+bool AMDGPURegBankCombinerImpl::matchVOP3(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ unsigned Opc, Register &A,
+ Register &B, Register &C) const {
+ return mi_match(
+ MI, MRI,
+ m_any_of(m_BinOp(Opc, m_OneNonDBGUse(m_BinOp(Opc, m_Reg(A), m_Reg(B))),
+ m_Reg(C)),
+ m_BinOp(Opc, m_Reg(A),
+ m_OneNonDBGUse(m_BinOp(Opc, m_Reg(B), m_Reg(C))))));
+}
+
+bool AMDGPURegBankCombinerImpl::matchMinMaxToMinMax3(
+ MachineInstr &MI, VOP3MatchInfo &MatchInfo) const {
+ Register Dst = MI.getOperand(0).getReg();
+ if (!isVgprRegBank(Dst))
+ return false;
+
+ LLT Ty = MRI.getType(Dst);
+ if ((Ty != LLT::scalar(16) || !STI.hasMin3Max3_16()) && Ty != LLT::scalar(32))
+ return false;
+
+ unsigned Opc = MI.getOpcode();
+ Register R0, R1, R2;
+ if (!matchVOP3(MI, MRI, Opc, R0, R1, R2))
+ return false;
+ MatchInfo = {getMinMax3(Opc), R0, R1, R2};
+ return true;
+}
+
bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
Register Dst = MI.getOperand(0).getReg();
@@ -362,6 +423,15 @@ void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
MI.eraseFromParent();
}
+void AMDGPURegBankCombinerImpl::applyVOP3(MachineInstr &MI,
+ VOP3MatchInfo &MatchInfo) const {
+ B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
+ {getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1),
+ getAsVgpr(MatchInfo.Val2)},
+ MI.getFlags());
+ MI.eraseFromParent();
+}
+
SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4325ab448e5815..b5b4b8cf91db8f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3955,6 +3955,42 @@ def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_SMIN3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_UMIN3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_FMIN3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_SMAX3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_UMAX3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_FMAX3 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
+ let hasSideEffects = 0;
+}
+
def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src);
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 3019d4d298eb45..1b71bfd0c3e297 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -861,9 +861,8 @@ define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspa
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v0, v0, 32 clamp
-; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
+; GFX10-GISEL-NEXT: v_min3_u32 v0, v1, v0, 64
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
@@ -989,8 +988,7 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
-; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v2, v1
-; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1
+; GFX10-GISEL-NEXT: v_min3_u32 v1, v2, v1, 64
; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index f0c278a67c8bcc..3ebaf864683da6 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -749,9 +749,8 @@ define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspa
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v0, v0
; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v1, v1, 32 clamp
-; GFX10-GISEL-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX10-GISEL-NEXT: v_min3_u32 v0, v0, v1, 64
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-GISEL-NEXT: v_min_u32_e32 v0, 64, v0
; GFX10-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -859,8 +858,7 @@ define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr a
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
; GFX10-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
; GFX10-GISEL-NEXT: v_add_nc_u32_e64 v2, v2, 32 clamp
-; GFX10-GISEL-NEXT: v_min_u32_e32 v1, v1, v2
-; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 64, v1
+; GFX10-GISEL-NEXT: v_min3_u32 v1, v1, v2, 64
; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits mailing list