[llvm] [AMDGPU] Try to reuse the register holding a constant from the compare in v_cndmask. (PR #131146)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 13 07:06:47 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Daniil Fukalov (dfukalov)
For some targets, the optimization `X == Const ? X : Y` → `X == Const ? Const : Y` can force an extra register to be used just to materialize the constant for `v_cndmask`. This patch detects such cases and, instead of materializing the constant again, reuses the register the compare instruction checked against it, since that register is guaranteed to hold the constant whenever the cndmask selects it.
For SWDEV-506659.
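
To illustrate (a minimal sketch mirroring the `i32_eq_v_i` test in the diff below; the function name `@sketch` is made up, and the assembly annotations are copied from the GFX9 checks):

```llvm
define i32 @sketch(i32 %x, i32 %y) {
  ; Wherever the compare's mask selects the "true" value, %x is known to
  ; equal 424242 (0x67932), so v_cndmask can read %x's register instead of
  ; a second materialization of the constant. GFX9 codegen from the tests:
  ;   s_mov_b32 s4, 0x67932
  ;   v_cmp_ne_u32_e32 vcc, s4, v0
  ;   v_cndmask_b32_e32 v0, v0, v1, vcc   ; v0 (%x) reused for the constant
  %cmp = icmp eq i32 %x, 424242
  %sel = select i1 %cmp, i32 424242, i32 %y
  ret i32 %sel
}
```

Note that the `icmp eq` is emitted as `v_cmp_ne` with the cndmask operands swapped, which is why the fold matches both the equal and the not-equal compare opcodes.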
---
Full diff: https://github.com/llvm/llvm-project/pull/131146.diff
2 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+69-4)
- (added) llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll (+171)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 91df516b80857..f2857cd381c7e 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1411,15 +1411,80 @@ bool SIFoldOperandsImpl::tryFoldCndMask(MachineInstr &MI) const {
       Opc != AMDGPU::V_CNDMASK_B64_PSEUDO)
     return false;
+  // Try to match Y == Const ? Const : Z (or its not-equal form). If Const
+  // can't be directly encoded in the cndmask, reuse the register the
+  // comparison checked against Const instead of materializing Const again.
+  auto tryFoldCndMaskCmp =
+      [&](MachineOperand *SrcOp, std::optional<int64_t> SrcImm,
+          unsigned CmpOpcodes[4]) -> bool {
+    // Only handle register operands whose immediate value is known.
+    if (!SrcImm || !SrcOp->isReg())
+      return false;
+
+    // Find the predicate of the cndmask instruction.
+    MachineOperand *PredOp = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    if (!PredOp || !PredOp->isReg())
+      return false;
+
+    MachineInstr *PredI = MRI->getVRegDef(PredOp->getReg());
+    if (!PredI || !PredI->isCompare())
+      return false;
+
+    unsigned CmpOpc = PredI->getOpcode();
+
+    if (CmpOpc != CmpOpcodes[0] && CmpOpc != CmpOpcodes[1] &&
+        CmpOpc != CmpOpcodes[2] && CmpOpc != CmpOpcodes[3])
+      return false;
+
+    // Check if the immediate value of the source operand matches the
+    // immediate value of either the first or second operand of the
+    // comparison instruction.
+    MachineOperand *SubstOp = nullptr;
+    std::optional<int64_t> CmpValImm = getImmOrMaterializedImm(
+        *TII->getNamedOperand(*PredI, AMDGPU::OpName::src0));
+    if (CmpValImm && *CmpValImm == *SrcImm) {
+      SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src1);
+    } else {
+      CmpValImm = getImmOrMaterializedImm(
+          *TII->getNamedOperand(*PredI, AMDGPU::OpName::src1));
+      if (CmpValImm && *CmpValImm == *SrcImm) {
+        SubstOp = TII->getNamedOperand(*PredI, AMDGPU::OpName::src0);
+      } else {
+        return false;
+      }
+    }
+
+    if (!SubstOp || !SubstOp->isReg())
+      return false;
+
+    LLVM_DEBUG(dbgs() << "Folded " << MI << " into ");
+    SrcOp->setReg(SubstOp->getReg());
+    LLVM_DEBUG(dbgs() << MI);
+    return true;
+  };
+
   MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
   MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
   if (!Src1->isIdenticalTo(*Src0)) {
-    std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
-    if (!Src1Imm)
-      return false;
+    // Try to fold with not-equal comparisons.
+    unsigned NECmpOpcodes[4] = {
+        AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_LG_F32_e64,
+        AMDGPU::V_CMP_NE_I32_e64, AMDGPU::V_CMP_NE_U32_e64};
     std::optional<int64_t> Src0Imm = getImmOrMaterializedImm(*Src0);
-    if (!Src0Imm || *Src0Imm != *Src1Imm)
+    if (tryFoldCndMaskCmp(Src0, Src0Imm, NECmpOpcodes))
+      return true;
+
+    // Try to fold with equal comparisons.
+    unsigned EQCmpOpcodes[4] = {
+        AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64,
+        AMDGPU::V_CMP_EQ_I32_e64, AMDGPU::V_CMP_EQ_U32_e64};
+
+    std::optional<int64_t> Src1Imm = getImmOrMaterializedImm(*Src1);
+    if (tryFoldCndMaskCmp(Src1, Src1Imm, EQCmpOpcodes))
+      return true;
+
+    if (!Src0Imm || !Src1Imm || *Src0Imm != *Src1Imm)
       return false;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
new file mode 100644
index 0000000000000..0b7106448bb7c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask-select.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
+
+define float @f32_oeq_v_i(float %arg, float %arg1) {
+; GFX9-LABEL: f32_oeq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_oeq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq float %arg, 0x3FCF5C2900000000
+ %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
+ ret float %select
+}
+
+define float @f32_oeq_i_v(float %arg, float %arg1) {
+; GFX9-LABEL: f32_oeq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_oeq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq float 0x3FCF5C2900000000, %arg
+ %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
+ ret float %select
+}
+
+define float @f32_one_v_i(float %arg, float %arg1) {
+; GFX9-LABEL: f32_one_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_one_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one float %arg, 0x3FCF5C2900000000
+ %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+ ret float %select
+}
+
+define float @f32_one_i_v(float %arg, float %arg1) {
+; GFX9-LABEL: f32_one_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_one_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one float 0x3FCF5C2900000000, %arg
+ %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+ ret float %select
+}
+
+define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_eq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_eq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i32 %arg, 424242
+ %select = select i1 %icmp, i32 424242, i32 %arg1
+ ret i32 %select
+}
+
+define i32 @i32_eq_i_v(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_eq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_eq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i32 424242, %arg
+ %select = select i1 %icmp, i32 424242, i32 %arg1
+ ret i32 %select
+}
+
+define i32 @i32_ne_v_i(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_ne_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_ne_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i32 %arg, 424242
+ %select = select i1 %icmp, i32 %arg1, i32 424242
+ ret i32 %select
+}
+
+define i32 @i32_ne_i_v(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_ne_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_ne_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i32 424242, %arg
+ %select = select i1 %icmp, i32 %arg1, i32 424242
+ ret i32 %select
+}
``````````
https://github.com/llvm/llvm-project/pull/131146