[llvm] [AMDGPU] Remove redundant s_cmp_* after add X, 1 (PR #176962)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jan 20 08:36:45 PST 2026
https://github.com/LU-JOHN created https://github.com/llvm/llvm-project/pull/176962
Convert:
```
s_add_u32 X, Y, 1
s_cmp_lg_i32 X, 0
```
to:
```
s_add_u32 X, Y, 1
<invert scc uses>
```
Also delete the compare when it is s_cmp_eq_i32 X, 0; in that case inverting the scc uses is not necessary, since s_add_u32 already sets SCC exactly when the result is zero.
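For reference, a minimal standalone C++ sketch (not part of the patch; the function name is illustrative) of the arithmetic invariant the fold relies on: the carry-out of an unsigned `X + 1`, which `s_add_u32` records in SCC, is set exactly when the 32-bit result wraps to zero.
```cpp
#include <cassert>
#include <cstdint>

// Carry-out of Y + 1 (what s_add_u32 records in SCC) is set exactly when
// the 32-bit result wraps to 0, so SCC already holds (result == 0), the
// inverse of what s_cmp_lg_i32 X, 0 would compute.
static bool carryOutOfAddOne(uint32_t Y) {
  uint32_t Result = Y + 1;           // wraps modulo 2^32
  bool CarryOut = Result < Y;        // unsigned carry-out of the add
  assert(CarryOut == (Result == 0)); // carry-out iff result is zero
  return CarryOut;
}

int main() {
  assert(!carryOutOfAddOne(0));          // 0 + 1 = 1, no carry
  assert(carryOutOfAddOne(0xFFFFFFFFu)); // wraps to 0, carry set
}
```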
From 2473f1ad584250b11af61d6a56be755c7d30e214 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Tue, 20 Jan 2026 10:15:10 -0600
Subject: [PATCH] Remove redundant s_cmp_* after add X, 1
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 54 ++++++++++++++++---
.../AMDGPU/GlobalISel/insertelement.ll | 49 +++++++++--------
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 28 ++++++++--
3 files changed, 94 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 6392022368785..503ef92afccc7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10963,6 +10963,34 @@ static bool foldableSelect(const MachineInstr &Def) {
return true;
}
+static bool setsSCCifResultIsZero(const MachineInstr &Def, bool &NeedInversion,
+ unsigned &NewDefOpc) {
+ // S_ADD_U32 X, 1 sets SCC on carry-out, which can only happen if result==0.
+ // S_ADD_I32 X, 1 can be converted to S_ADD_U32 X, 1 if SCC is dead.
+ if (Def.getOpcode() != AMDGPU::S_ADD_I32 &&
+ Def.getOpcode() != AMDGPU::S_ADD_U32)
+ return false;
+ const MachineOperand &AddSrc1 = Def.getOperand(1);
+ const MachineOperand &AddSrc2 = Def.getOperand(2);
+ int64_t Addend;
+ if (!(AddSrc1.isImm() && AddSrc1.getImm() == 1) &&
+ !(AddSrc2.isImm() && AddSrc2.getImm() == 1) &&
+ !(getFoldableImm(&AddSrc1, Addend) && Addend == 1) &&
+ !(getFoldableImm(&AddSrc2, Addend) && Addend == 1))
+ return false;
+
+ if (Def.getOpcode() == AMDGPU::S_ADD_I32) {
+ const MachineOperand *SccDef =
+ Def.findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
+ if (!SccDef->isDead())
+ return false;
+ NewDefOpc = AMDGPU::S_ADD_U32;
+ }
+
+ NeedInversion = !NeedInversion;
+ return true;
+}
+
bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Register SrcReg2, int64_t CmpMask,
int64_t CmpValue,
@@ -10984,20 +11012,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// For S_OP that set SCC = DST!=0, do the transformation
//
- // s_cmp_lg_* (S_OP ...), 0 => (S_OP ...)
-
+ // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
+ //
+ // For (S_OP ...) that set SCC = DST==0, invert NeedInversion and
+ // do the transformation:
+ //
+ // s_cmp_[lg|eq]_* (S_OP ...), 0 => (S_OP ...)
+ //
// If foldableSelect, s_cmp_lg_* is redundant because the SCC input value
// for S_CSELECT* already has the same value that will be calculated by
// s_cmp_lg_*
//
- // s_cmp_lg_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT* (non-zero
- // imm), 0)
- if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
+ // s_cmp_[lg|eq]_* (S_CSELECT* (non-zero imm), 0), 0 => (S_CSELECT*
+ // (non-zero imm), 0)
+
+ unsigned NewDefOpc = Def->getOpcode();
+ if (!setsSCCifResultIsNonZero(*Def) &&
+ !setsSCCifResultIsZero(*Def, NeedInversion, NewDefOpc) &&
+ !foldableSelect(*Def))
return false;
if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
return false;
+ if (NewDefOpc != Def->getOpcode())
+ Def->setDesc(get(NewDefOpc));
+
// If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
// s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
// 64-bit foldableSelect then delete s_or_b32 in the sequence:
@@ -11020,7 +11060,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
if (Select && foldableSelect(*Select))
- optimizeSCC(Select, Def, false);
+ optimizeSCC(Select, Def, /*NeedInversion=*/false);
}
}
}
@@ -11101,7 +11141,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- if (!optimizeSCC(Def, &CmpInstr, false))
+ if (!optimizeSCC(Def, &CmpInstr, /*NeedInversion=*/false))
return false;
if (!MRI->use_nodbg_empty(DefReg)) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
index 533b25ef1a0c0..6738c5e224267 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll
@@ -2024,31 +2024,30 @@ entry:
define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_s_add_1(<8 x float> inreg %vec, float inreg %val, i32 inreg %idx) {
; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_s_add_1:
; GPRIDX: ; %bb.0: ; %entry
-; GPRIDX-NEXT: s_add_i32 s11, s11, 1
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 0
-; GPRIDX-NEXT: s_cselect_b32 s0, s10, s2
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 1
-; GPRIDX-NEXT: s_cselect_b32 s1, s10, s3
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 2
-; GPRIDX-NEXT: s_cselect_b32 s2, s10, s4
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 3
-; GPRIDX-NEXT: s_cselect_b32 s3, s10, s5
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 4
-; GPRIDX-NEXT: s_cselect_b32 s4, s10, s6
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 5
-; GPRIDX-NEXT: s_cselect_b32 s5, s10, s7
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 6
-; GPRIDX-NEXT: s_cselect_b32 s6, s10, s8
-; GPRIDX-NEXT: s_cmp_eq_u32 s11, 7
-; GPRIDX-NEXT: s_cselect_b32 s7, s10, s9
-; GPRIDX-NEXT: v_mov_b32_e32 v0, s0
-; GPRIDX-NEXT: v_mov_b32_e32 v1, s1
-; GPRIDX-NEXT: v_mov_b32_e32 v2, s2
-; GPRIDX-NEXT: v_mov_b32_e32 v3, s3
-; GPRIDX-NEXT: v_mov_b32_e32 v4, s4
-; GPRIDX-NEXT: v_mov_b32_e32 v5, s5
-; GPRIDX-NEXT: v_mov_b32_e32 v6, s6
-; GPRIDX-NEXT: v_mov_b32_e32 v7, s7
+; GPRIDX-NEXT: s_add_u32 s0, s11, 1
+; GPRIDX-NEXT: s_cselect_b32 s1, s10, s2
+; GPRIDX-NEXT: s_cmp_eq_u32 s0, 1
+; GPRIDX-NEXT: s_cselect_b32 s2, s10, s3
+; GPRIDX-NEXT: s_cmp_eq_u32 s0, 2
+; GPRIDX-NEXT: s_cselect_b32 s3, s10, s4
+; GPRIDX-NEXT: s_cmp_eq_u32 s0, 3
+; GPRIDX-NEXT: s_cselect_b32 s4, s10, s5
+; GPRIDX-NEXT: s_cmp_eq_u32 s0, 4
+; GPRIDX-NEXT: s_cselect_b32 s5, s10, s6
+; GPRIDX-NEXT: s_cmp_eq_u32 s0, 5
+; GPRIDX-NEXT: s_cselect_b32 s6, s10, s7
+; GPRIDX-NEXT: s_cmp_eq_u32 s0, 6
+; GPRIDX-NEXT: s_cselect_b32 s7, s10, s8
+; GPRIDX-NEXT: s_cmp_eq_u32 s0, 7
+; GPRIDX-NEXT: s_cselect_b32 s0, s10, s9
+; GPRIDX-NEXT: v_mov_b32_e32 v0, s1
+; GPRIDX-NEXT: v_mov_b32_e32 v1, s2
+; GPRIDX-NEXT: v_mov_b32_e32 v2, s3
+; GPRIDX-NEXT: v_mov_b32_e32 v3, s4
+; GPRIDX-NEXT: v_mov_b32_e32 v4, s5
+; GPRIDX-NEXT: v_mov_b32_e32 v5, s6
+; GPRIDX-NEXT: v_mov_b32_e32 v6, s7
+; GPRIDX-NEXT: v_mov_b32_e32 v7, s0
; GPRIDX-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: dyn_insertelement_v8f32_s_s_s_add_1:
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 6f4212b13433b..d66b9029dbf99 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -7,6 +7,24 @@ declare i64 @llvm.ctpop.i64(i64)
declare i32 @llvm.amdgcn.s.quadmask.i32(i32)
declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
+define amdgpu_ps i32 @add32(i32 inreg %val0) {
+; CHECK-LABEL: add32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_u32 s0, s0, 1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s0
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_cselect_b64 s[0:1], 0, -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = add i32 %val0, 1
+ call void asm "; use $0", "s"(i32 %result)
+ %cmp = icmp ne i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32:
; CHECK: ; %bb.0:
@@ -691,14 +709,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1 at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1 at rel32@hi+12
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB40_2
+; CHECK-NEXT: s_cbranch_scc0 .LBB41_2
; CHECK-NEXT: ; %bb.1: ; %endif
; CHECK-NEXT: s_mov_b32 s0, 1
-; CHECK-NEXT: s_branch .LBB40_3
-; CHECK-NEXT: .LBB40_2: ; %if
+; CHECK-NEXT: s_branch .LBB41_3
+; CHECK-NEXT: .LBB41_2: ; %if
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_branch .LBB40_3
-; CHECK-NEXT: .LBB40_3:
+; CHECK-NEXT: s_branch .LBB41_3
+; CHECK-NEXT: .LBB41_3:
%cmp = icmp ne ptr addrspace(4) @1, null
br i1 %cmp, label %endif, label %if