[llvm] [AMDGPU] Invert scc uses to delete s_cmp_eq* (PR #167382)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 19 07:40:55 PST 2025
https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/167382
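The series teaches SIInstrInfo::optimizeCompareInstr to delete an s_cmp_eq* /
s_cmpk_eq* compare against zero whenever the compared value already defined
SCC, compensating by inverting every surviving SCC user (swapping s_cselect
operands, flipping s_cbranch_scc0/s_cbranch_scc1). Excerpted from the fshl.ll
test updated below, the net effect is:

  ; before the patch                 ; after the patch
  s_or_b32      s6, s5, s6           s_or_b32      s6, s5, s6
  s_cmp_eq_u32  s6, 0                s_cselect_b32 s4, s5, s4
  s_cselect_b32 s4, s4, s5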
>From fa8aded09f316b69dfdf29791309015b01bb32dc Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Fri, 7 Nov 2025 13:50:47 -0600
Subject: [PATCH 01/10] Invert uses to delete s_cmp_eq*
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 72 ++++++++++++++++---
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 ++
.../branch-folding-implicit-def-subreg.ll | 9 ++-
llvm/test/CodeGen/AMDGPU/fshl.ll | 18 ++---
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 33 +++++++++
.../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 10 +--
6 files changed, 114 insertions(+), 33 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 7cb7f47ddb220..8fc551e7b9650 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10737,12 +10737,64 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
return false;
}
+bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
+ MachineBasicBlock *MBB = SCCDef->getParent();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ SmallVector<MachineInstr *, 2> InvertInstr;
+ bool SCCIsDead = false;
+
+ // Scan instructions for SCC uses that need to be inverted until SCC is dead.
+ for (MachineInstr &MI :
+ make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, TRI, false) != -1) {
+ if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
+ MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
+ MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1)
+ InvertInstr.push_back(&MI);
+ else
+ return false;
+ }
+ if (MI.modifiesRegister(AMDGPU::SCC, TRI)) {
+ SCCIsDead = true;
+ break;
+ }
+ }
+
+ const MachineRegisterInfo &MRI =
+ SCCDef->getParent()->getParent()->getRegInfo();
+ // If SCC is still live, verify that it is not live past the end of this
+ // block.
+ if (!SCCIsDead && MRI.tracksLiveness())
+ SCCIsDead = MBB->computeRegisterLiveness(TRI, AMDGPU::SCC, MBB->end(), 0) ==
+ MachineBasicBlock::LQR_Dead;
+
+ // Invert uses
+ if (SCCIsDead) {
+ for (auto &MI : InvertInstr) {
+ if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI->getOpcode() == AMDGPU::S_CSELECT_B64)
+ swapOperands(*MI);
+ else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
+ MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1)
+ MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
+ ? AMDGPU::S_CBRANCH_SCC1
+ : AMDGPU::S_CBRANCH_SCC0));
+ else
+ llvm_unreachable("SCC used but no inversion handling");
+ }
+ return true;
+ }
+ return false;
+}
+
// SCC is already valid after SCCValid.
// SCCRedefine will redefine SCC to the same value already available after
// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
// update kill/dead flags if necessary.
-static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
- const SIRegisterInfo &RI) {
+bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+ const SIRegisterInfo &RI,
+ bool NeedInversion) const {
MachineInstr *KillsSCC = nullptr;
if (SCCValid->getParent() != SCCRedefine->getParent())
return false;
@@ -10753,6 +10805,8 @@ static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
if (MI.killsRegister(AMDGPU::SCC, &RI))
KillsSCC = &MI;
}
+ if (NeedInversion && !invertSCCUse(SCCRedefine))
+ return false;
if (MachineOperand *SccDef =
SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr))
SccDef->setIsDead(false);
@@ -10786,7 +10840,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
return false;
const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI,
- this]() -> bool {
+ this](bool NeedInversion) -> bool {
if (CmpValue != 0)
return false;
@@ -10807,7 +10861,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
return false;
- if (!optimizeSCC(Def, &CmpInstr, RI))
+ if (!optimizeSCC(Def, &CmpInstr, RI, NeedInversion))
return false;
// If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
@@ -10832,7 +10886,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
if (Select && foldableSelect(*Select))
- optimizeSCC(Select, Def, RI);
+ optimizeSCC(Select, Def, RI, false);
}
}
}
@@ -10913,7 +10967,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- if (!optimizeSCC(Def, &CmpInstr, RI))
+ if (!optimizeSCC(Def, &CmpInstr, RI, false))
return false;
if (!MRI->use_nodbg_empty(DefReg)) {
@@ -10944,7 +10998,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_EQ_I32:
case AMDGPU::S_CMPK_EQ_U32:
case AMDGPU::S_CMPK_EQ_I32:
- return optimizeCmpAnd(1, 32, true, false);
+ return optimizeCmpAnd(1, 32, true, false) || optimizeCmpSelect(true);
case AMDGPU::S_CMP_GE_U32:
case AMDGPU::S_CMPK_GE_U32:
return optimizeCmpAnd(1, 32, false, false);
@@ -10957,7 +11011,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMP_LG_I32:
case AMDGPU::S_CMPK_LG_U32:
case AMDGPU::S_CMPK_LG_I32:
- return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect(false);
case AMDGPU::S_CMP_GT_U32:
case AMDGPU::S_CMPK_GT_U32:
return optimizeCmpAnd(0, 32, false, false);
@@ -10965,7 +11019,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
case AMDGPU::S_CMPK_GT_I32:
return optimizeCmpAnd(0, 32, false, true);
case AMDGPU::S_CMP_LG_U64:
- return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect();
+ return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect(false);
}
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index c66985a19685b..95f367ea0c154 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -125,6 +125,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
unsigned SubIdx, const TargetRegisterClass *SubRC) const;
private:
+ bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
+ const SIRegisterInfo &RI, bool NeedInversion) const;
+
+ bool invertSCCUse(MachineInstr *SCCDef) const;
+
void swapOperands(MachineInstr &Inst) const;
std::pair<bool, MachineBasicBlock *>
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 026b8ba2759f0..274a88c930130 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -701,11 +701,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
- ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
- ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc
+ ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc
+ ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+ ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.53:
; GFX90A-NEXT: successors: %bb.61(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 72c2003058a01..1233c1fe12f72 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -879,8 +879,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: s_lshl_b32 s6, s4, 7
; SI-NEXT: s_or_b32 s6, s5, s6
-; SI-NEXT: s_cmp_eq_u32 s6, 0
-; SI-NEXT: s_cselect_b32 s4, s4, s5
+; SI-NEXT: s_cselect_b32 s4, s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
@@ -893,8 +892,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s2, 7
; VI-NEXT: s_or_b32 s4, s3, s4
-; VI-NEXT: s_cmp_eq_u32 s4, 0
-; VI-NEXT: s_cselect_b32 s2, s2, s3
+; VI-NEXT: s_cselect_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -908,8 +906,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s4, s2, 7
; GFX9-NEXT: s_or_b32 s4, s3, s4
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -935,8 +932,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl_b32 s4, s2, 7
; GFX10-NEXT: s_or_b32 s4, s3, s4
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s2, s2, s3
+; GFX10-NEXT: s_cselect_b32 s2, s3, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -946,11 +942,9 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s4, s2, 7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s4, s3, s4
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s2, s2, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_cselect_b32 s2, s3, s2
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index b5228e3054f0a..6fb1d49e53c45 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; Test deletion of redundant s_cmp* sX, 0 instructions.
declare i32 @llvm.ctpop.i32(i32)
declare i64 @llvm.ctpop.i64(i64)
@@ -20,6 +21,38 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
ret i32 %zext
}
+; s_lshl_b32 sets SCC if result is non-zero.
+; Deletion of equal to zero comparison will require inversion of use.
+define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: shl32_eq:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, s1
+; CHECK-NEXT: s_cselect_b64 s[0:1], 0, -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = shl i32 %val0, %val1
+ %cmp = icmp eq i32 %result, 0
+ %zext = zext i1 %cmp to i32
+ ret i32 %zext
+}
+
+; 64-bit selection will generate two 32-bit selects. Inversion of multiple
+; uses is required.
+define amdgpu_ps i64 @shl32_eq_multi_use(i32 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: shl32_eq_multi_use:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b32 s2, 0, s2
+; CHECK-NEXT: s_cselect_b32 s0, 0, s1
+; CHECK-NEXT: s_mov_b32 s1, s2
+; CHECK-NEXT: ; return to shader part epilog
+ %result = shl i32 %val0, 1
+ %cmp = icmp eq i32 %result, 0
+ %val64 = select i1 %cmp, i64 %val1, i64 0
+ ret i64 %val64
+}
+
define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
; CHECK-LABEL: shl64:
; CHECK: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 4445383bd0ace..4ef2ac1f59e07 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -180,8 +180,7 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
+; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], 0, -1
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -190,8 +189,7 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT: s_cmp_eq_u32 s0, 0
-; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
+; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], 0, -1
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -208,9 +206,7 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; DAGISEL-GFX12-NEXT: s_cmp_eq_u32 s0, 0
-; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
+; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, 0, -1
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; DAGISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
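The inversion in patch 01 rests on a simple SCC identity. A minimal
standalone check (plain C++, not the LLVM API), under the assumption stated
in the s_cmp_0.ll comments that s_lshl_b32 sets SCC iff its result is
non-zero:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t Result : {0u, 1u, 0x80000000u}) {
      bool SCCFromDef = (Result != 0); // SCC as left by s_lshl_b32 and friends
      bool SCCFromCmp = (Result == 0); // SCC as s_cmp_eq_u32 Result, 0 sets it
      assert(SCCFromCmp == !SCCFromDef); // so: delete the cmp, invert the users
    }
    return 0;
  }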
>From 7cac738128d2a48cfe4513a64b0274df838e9578 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Tue, 11 Nov 2025 09:17:37 -0600
Subject: [PATCH 02/10] Streamline code
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 34 +++++++++++++-------------
1 file changed, 17 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 8fc551e7b9650..0540461f33859 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10746,7 +10746,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
// Scan instructions for SCC uses that need to be inverted until SCC is dead.
for (MachineInstr &MI :
make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, TRI, false) != -1) {
+ if (MI.readsRegister(AMDGPU::SCC, TRI)) {
if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
@@ -10755,7 +10755,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
else
return false;
}
- if (MI.modifiesRegister(AMDGPU::SCC, TRI)) {
+ if (MI.definesRegister(AMDGPU::SCC, TRI)) {
SCCIsDead = true;
break;
}
@@ -10769,23 +10769,23 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
SCCIsDead = MBB->computeRegisterLiveness(TRI, AMDGPU::SCC, MBB->end(), 0) ==
MachineBasicBlock::LQR_Dead;
+ if (!SCCIsDead)
+ return false;
+
// Invert uses
- if (SCCIsDead) {
- for (auto &MI : InvertInstr) {
- if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
- MI->getOpcode() == AMDGPU::S_CSELECT_B64)
- swapOperands(*MI);
- else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
- MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1)
- MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
- ? AMDGPU::S_CBRANCH_SCC1
- : AMDGPU::S_CBRANCH_SCC0));
- else
- llvm_unreachable("SCC used but no inversion handling");
- }
- return true;
+ for (MachineInstr *MI : InvertInstr) {
+ if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI->getOpcode() == AMDGPU::S_CSELECT_B64)
+ swapOperands(*MI);
+ else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
+ MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1)
+ MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
+ ? AMDGPU::S_CBRANCH_SCC1
+ : AMDGPU::S_CBRANCH_SCC0));
+ else
+ llvm_unreachable("SCC used but no inversion handling");
}
- return false;
+ return true;
}
// SCC is already valid after SCCValid.
>From 781e6114b448170acc69c554df5f55eac66d0119 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 13 Nov 2025 15:33:40 -0600
Subject: [PATCH 03/10] Remove unnecessary param, add comment, address feedback
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 32 ++++++++++++++------------
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +-
2 files changed, 18 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0540461f33859..9a5fb178d5ae2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10737,16 +10737,18 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
return false;
}
+// Invert all uses of SCC following SCCDef because SCCDef may be deleted and
+// (incoming SCC) = !(SCC defined by SCCDef).
+// Return true if all uses can be re-written, false otherwise.
bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
MachineBasicBlock *MBB = SCCDef->getParent();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- SmallVector<MachineInstr *, 2> InvertInstr;
+ SmallVector<MachineInstr *> InvertInstr;
bool SCCIsDead = false;
// Scan instructions for SCC uses that need to be inverted until SCC is dead.
for (MachineInstr &MI :
make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) {
- if (MI.readsRegister(AMDGPU::SCC, TRI)) {
+ if (MI.readsRegister(AMDGPU::SCC, &RI)) {
if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
MI.getOpcode() == AMDGPU::S_CSELECT_B64 ||
MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
@@ -10755,18 +10757,18 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
else
return false;
}
- if (MI.definesRegister(AMDGPU::SCC, TRI)) {
+ if (MI.definesRegister(AMDGPU::SCC, &RI) ||
+ MI.killsRegister(AMDGPU::SCC, &RI)) {
SCCIsDead = true;
break;
}
}
- const MachineRegisterInfo &MRI =
- SCCDef->getParent()->getParent()->getRegInfo();
+ const MachineRegisterInfo &MRI = SCCDef->getMF()->getRegInfo();
// If SCC is still live, verify that it is not live past the end of this
// block.
if (!SCCIsDead && MRI.tracksLiveness())
- SCCIsDead = MBB->computeRegisterLiveness(TRI, AMDGPU::SCC, MBB->end(), 0) ==
+ SCCIsDead = MBB->computeRegisterLiveness(&RI, AMDGPU::SCC, MBB->end(), 0) ==
MachineBasicBlock::LQR_Dead;
if (!SCCIsDead)
@@ -10775,15 +10777,16 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
// Invert uses
for (MachineInstr *MI : InvertInstr) {
if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 ||
- MI->getOpcode() == AMDGPU::S_CSELECT_B64)
+ MI->getOpcode() == AMDGPU::S_CSELECT_B64) {
swapOperands(*MI);
- else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
- MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1)
+ } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ||
+ MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) {
MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0
? AMDGPU::S_CBRANCH_SCC1
: AMDGPU::S_CBRANCH_SCC0));
- else
+ } else {
llvm_unreachable("SCC used but no inversion handling");
+ }
}
return true;
}
@@ -10793,7 +10796,6 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
// SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and
// update kill/dead flags if necessary.
bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
- const SIRegisterInfo &RI,
bool NeedInversion) const {
MachineInstr *KillsSCC = nullptr;
if (SCCValid->getParent() != SCCRedefine->getParent())
@@ -10861,7 +10863,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def))
return false;
- if (!optimizeSCC(Def, &CmpInstr, RI, NeedInversion))
+ if (!optimizeSCC(Def, &CmpInstr, NeedInversion))
return false;
// If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
@@ -10886,7 +10888,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
if (Select && foldableSelect(*Select))
- optimizeSCC(Select, Def, RI, false);
+ optimizeSCC(Select, Def, false);
}
}
}
@@ -10967,7 +10969,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
return false;
- if (!optimizeSCC(Def, &CmpInstr, RI, false))
+ if (!optimizeSCC(Def, &CmpInstr, false))
return false;
if (!MRI->use_nodbg_empty(DefReg)) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 95f367ea0c154..3fffe85eb55d7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -126,7 +126,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
- const SIRegisterInfo &RI, bool NeedInversion) const;
+ bool NeedInversion) const;
bool invertSCCUse(MachineInstr *SCCDef) const;
>From 139b622009b99df08d640c067fdb48ca7bf593f8 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 13 Nov 2025 15:52:42 -0600
Subject: [PATCH 04/10] Use new isLiveOut method
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/include/llvm/CodeGen/MachineBasicBlock.h | 5 +++++
llvm/lib/CodeGen/MachineBasicBlock.cpp | 7 +++++++
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 +------
llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 5 ++---
4 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index fcf7bab09fcff..8f76d2ad5ef53 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -514,6 +514,11 @@ class MachineBasicBlock
LLVM_ABI bool isLiveIn(MCRegister Reg,
LaneBitmask LaneMask = LaneBitmask::getAll()) const;
+ /// Return true if the specified register is live out (i.e. in the live in set
+ /// of a successor)
+ LLVM_ABI bool isLiveOut(MCRegister Reg,
+ LaneBitmask LaneMask = LaneBitmask::getAll()) const;
+
// Iteration support for live in sets. These sets are kept in sorted
// order by their register number.
using livein_iterator = LiveInVector::const_iterator;
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index ba0b025167307..35a9da95b3b9f 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -639,6 +639,13 @@ bool MachineBasicBlock::isLiveIn(MCRegister Reg, LaneBitmask LaneMask) const {
return I != livein_end() && (I->LaneMask & LaneMask).any();
}
+bool MachineBasicBlock::isLiveOut(MCRegister Reg, LaneBitmask LaneMask) const {
+ for (MachineBasicBlock *S : successors())
+ if (S->isLiveIn(Reg, LaneMask))
+ return true;
+ return false;
+}
+
void MachineBasicBlock::sortUniqueLiveIns() {
llvm::sort(LiveIns,
[](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9a5fb178d5ae2..99a07876a0189 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10764,14 +10764,9 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
}
}
- const MachineRegisterInfo &MRI = SCCDef->getMF()->getRegInfo();
// If SCC is still live, verify that it is not live past the end of this
// block.
- if (!SCCIsDead && MRI.tracksLiveness())
- SCCIsDead = MBB->computeRegisterLiveness(&RI, AMDGPU::SCC, MBB->end(), 0) ==
- MachineBasicBlock::LQR_Dead;
-
- if (!SCCIsDead)
+ if (!SCCIsDead && MBB->isLiveOut(AMDGPU::SCC))
return false;
// Invert uses
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index fa452f3717f0e..442bf4f7e6d64 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -1476,9 +1476,8 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def dead $scc
- ; GCN-NEXT: S_CMP_EQ_U32 [[S_AND_B32_]], 0, implicit-def $scc
- ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
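For reference, a self-contained sketch (plain C++ with mock types, not the
real MachineBasicBlock API) of the successor scan that the isLiveOut() added
in patch 04 performs; a register escapes the block only if some successor
lists it live-in:

  #include <vector>

  struct Block {
    std::vector<const Block *> Successors;
    std::vector<unsigned> LiveIns; // registers live on entry to this block

    bool isLiveIn(unsigned Reg) const {
      for (unsigned R : LiveIns)
        if (R == Reg)
          return true;
      return false;
    }

    // Mirrors MachineBasicBlock::isLiveOut() from this patch: live out means
    // live into at least one successor.
    bool isLiveOut(unsigned Reg) const {
      for (const Block *S : Successors)
        if (S->isLiveIn(Reg))
          return true;
      return false;
    }
  };

Patch 06 below removes this helper again in favor of SCC defines and kill
flags, so the sketch documents the intermediate approach only.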
>From 3bf05ff7c467d79c7f4f990a115543d1d5b39ac8 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 13 Nov 2025 16:16:16 -0600
Subject: [PATCH 05/10] Update test check
Signed-off-by: John Lu <John.Lu at amd.com>
---
.../test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 274a88c930130..c6cc3922d3952 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -702,8 +702,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc
- ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
- ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
+ ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.53:
>From 65ccbf46710f1aae58236b3b285003c809bcd129 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Sun, 16 Nov 2025 01:29:17 -0600
Subject: [PATCH 06/10] Don't rely on isLiveOut. Use defines and kill flags
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/include/llvm/CodeGen/MachineBasicBlock.h | 5 --
llvm/lib/CodeGen/MachineBasicBlock.cpp | 7 ---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
.../branch-folding-implicit-def-subreg.ll | 5 +-
llvm/test/CodeGen/AMDGPU/fshl.ll | 18 ++++--
llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 5 +-
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 57 ++++++++++++-------
.../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 10 +++-
8 files changed, 64 insertions(+), 45 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 8f76d2ad5ef53..fcf7bab09fcff 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -514,11 +514,6 @@ class MachineBasicBlock
LLVM_ABI bool isLiveIn(MCRegister Reg,
LaneBitmask LaneMask = LaneBitmask::getAll()) const;
- /// Return true if the specified register is live out (i.e. in the live in set
- /// of a successor)
- LLVM_ABI bool isLiveOut(MCRegister Reg,
- LaneBitmask LaneMask = LaneBitmask::getAll()) const;
-
// Iteration support for live in sets. These sets are kept in sorted
// order by their register number.
using livein_iterator = LiveInVector::const_iterator;
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 35a9da95b3b9f..ba0b025167307 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -639,13 +639,6 @@ bool MachineBasicBlock::isLiveIn(MCRegister Reg, LaneBitmask LaneMask) const {
return I != livein_end() && (I->LaneMask & LaneMask).any();
}
-bool MachineBasicBlock::isLiveOut(MCRegister Reg, LaneBitmask LaneMask) const {
- for (MachineBasicBlock *S : successors())
- if (S->isLiveIn(Reg, LaneMask))
- return true;
- return false;
-}
-
void MachineBasicBlock::sortUniqueLiveIns() {
llvm::sort(LiveIns,
[](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 99a07876a0189..61ffa484db93d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10766,7 +10766,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
// If SCC is still live, verify that it is not live past the end of this
// block.
- if (!SCCIsDead && MBB->isLiveOut(AMDGPU::SCC))
+ if (!SCCIsDead)
return false;
// Invert uses
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index c6cc3922d3952..026b8ba2759f0 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -701,10 +701,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc
+ ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
+ ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc
+ ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.53:
; GFX90A-NEXT: successors: %bb.61(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 1233c1fe12f72..72c2003058a01 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -879,7 +879,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: s_lshl_b32 s6, s4, 7
; SI-NEXT: s_or_b32 s6, s5, s6
-; SI-NEXT: s_cselect_b32 s4, s5, s4
+; SI-NEXT: s_cmp_eq_u32 s6, 0
+; SI-NEXT: s_cselect_b32 s4, s4, s5
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
@@ -892,7 +893,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s2, 7
; VI-NEXT: s_or_b32 s4, s3, s4
-; VI-NEXT: s_cselect_b32 s2, s3, s2
+; VI-NEXT: s_cmp_eq_u32 s4, 0
+; VI-NEXT: s_cselect_b32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -906,7 +908,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s4, s2, 7
; GFX9-NEXT: s_or_b32 s4, s3, s4
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
+; GFX9-NEXT: s_cmp_eq_u32 s4, 0
+; GFX9-NEXT: s_cselect_b32 s2, s2, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -932,7 +935,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl_b32 s4, s2, 7
; GFX10-NEXT: s_or_b32 s4, s3, s4
-; GFX10-NEXT: s_cselect_b32 s2, s3, s2
+; GFX10-NEXT: s_cmp_eq_u32 s4, 0
+; GFX10-NEXT: s_cselect_b32 s2, s2, s3
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -942,9 +946,11 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s4, s2, 7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s4, s3, s4
-; GFX11-NEXT: s_cselect_b32 s2, s3, s2
+; GFX11-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-NEXT: s_cselect_b32 s2, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index 442bf4f7e6d64..fa452f3717f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -1476,8 +1476,9 @@ body: |
; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc
- ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def dead $scc
+ ; GCN-NEXT: S_CMP_EQ_U32 [[S_AND_B32_]], 0, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
; GCN-NEXT: S_BRANCH %bb.1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 6fb1d49e53c45..b01447abe3e9b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -23,34 +23,53 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; s_lshl_b32 sets SCC if result is non-zero.
; Deletion of equal to zero comparison will require inversion of use.
+; FIXME: Can't invert because kill flag not set on last use.
define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32_eq:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_lshl_b32 s0, s0, s1
-; CHECK-NEXT: s_cselect_b64 s[0:1], 0, -1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_lshl_b32 s0, s0, 1
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_cselect_b32 s0, s1, 0
; CHECK-NEXT: ; return to shader part epilog
- %result = shl i32 %val0, %val1
+ %result = shl i32 %val0, 1
%cmp = icmp eq i32 %result, 0
- %zext = zext i1 %cmp to i32
- ret i32 %zext
+ %select = select i1 %cmp, i32 %val1, i32 0
+ ret i32 %select
+}
+
+; s_lshl_b32 sets SCC if result is non-zero.
+; Deletion of equal to zero comparison will require inversion of use.
+define amdgpu_ps i32 @shl32_eq_with_scc_clobber(i32 inreg %val0, i32 inreg %val1) {
+; CHECK-LABEL: shl32_eq_with_scc_clobber:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_lshl_b32 s0, s0, 1
+; CHECK-NEXT: s_cselect_b32 s0, 0, s1
+; CHECK-NEXT: s_xor_b32 s0, s0, s1
+; CHECK-NEXT: ; return to shader part epilog
+ %result = shl i32 %val0, 1
+ %cmp = icmp eq i32 %result, 0
+ %select = select i1 %cmp, i32 %val1, i32 0
+ %xor = xor i32 %select, %val1
+ ret i32 %xor
}
; 64-bit selection will generate two 32-bit selects. Inversion of multiple
; uses is required.
-define amdgpu_ps i64 @shl32_eq_multi_use(i32 inreg %val0, i64 inreg %val1) {
-; CHECK-LABEL: shl32_eq_multi_use:
+define amdgpu_ps i64 @shl32_eq_multi_use_with_scc_clobber(i32 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: shl32_eq_multi_use_with_scc_clobber:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s3, s2
+; CHECK-NEXT: s_mov_b32 s2, s1
; CHECK-NEXT: s_lshl_b32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b32 s2, 0, s2
-; CHECK-NEXT: s_cselect_b32 s0, 0, s1
-; CHECK-NEXT: s_mov_b32 s1, s2
+; CHECK-NEXT: s_cselect_b32 s1, 0, s3
+; CHECK-NEXT: s_cselect_b32 s0, 0, s2
+; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; CHECK-NEXT: ; return to shader part epilog
%result = shl i32 %val0, 1
%cmp = icmp eq i32 %result, 0
- %val64 = select i1 %cmp, i64 %val1, i64 0
- ret i64 %val64
+ %select = select i1 %cmp, i64 %val1, i64 0
+ %xor = xor i64 %select, %val1
+ ret i64 %xor
}
define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
@@ -693,14 +712,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1 at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1 at rel32@hi+12
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB38_2
+; CHECK-NEXT: s_cbranch_scc0 .LBB41_2
; CHECK-NEXT: ; %bb.1: ; %endif
; CHECK-NEXT: s_mov_b32 s0, 1
-; CHECK-NEXT: s_branch .LBB38_3
-; CHECK-NEXT: .LBB38_2: ; %if
+; CHECK-NEXT: s_branch .LBB41_3
+; CHECK-NEXT: .LBB41_2: ; %if
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_branch .LBB38_3
-; CHECK-NEXT: .LBB38_3:
+; CHECK-NEXT: s_branch .LBB41_3
+; CHECK-NEXT: .LBB41_3:
%cmp = icmp ne ptr addrspace(4) @1, null
br i1 %cmp, label %endif, label %if
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 4ef2ac1f59e07..4445383bd0ace 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -180,7 +180,8 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], 0, -1
+; DAGISEL-GFX8-NEXT: s_cmp_eq_u32 s4, 0
+; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -189,7 +190,8 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], 0, -1
+; DAGISEL-GFX942-NEXT: s_cmp_eq_u32 s0, 0
+; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -206,7 +208,9 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, 0, -1
+; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
+; DAGISEL-GFX12-NEXT: s_cmp_eq_u32 s0, 0
+; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; DAGISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
>From 35c3de8d0ffaafb220559f5edd82fe89e5a89bc0 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 17 Nov 2025 11:37:57 -0600
Subject: [PATCH 07/10] SCC is dead on return
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/fshl.ll | 18 ++++++------------
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 3 +--
.../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 10 +++-------
4 files changed, 11 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 61ffa484db93d..ace5150d8437f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10758,7 +10758,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
return false;
}
if (MI.definesRegister(AMDGPU::SCC, &RI) ||
- MI.killsRegister(AMDGPU::SCC, &RI)) {
+ MI.killsRegister(AMDGPU::SCC, &RI) || MI.isReturn()) {
SCCIsDead = true;
break;
}
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index 72c2003058a01..1233c1fe12f72 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -879,8 +879,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: s_lshl_b32 s6, s4, 7
; SI-NEXT: s_or_b32 s6, s5, s6
-; SI-NEXT: s_cmp_eq_u32 s6, 0
-; SI-NEXT: s_cselect_b32 s4, s4, s5
+; SI-NEXT: s_cselect_b32 s4, s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
@@ -893,8 +892,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s4, s2, 7
; VI-NEXT: s_or_b32 s4, s3, s4
-; VI-NEXT: s_cmp_eq_u32 s4, 0
-; VI-NEXT: s_cselect_b32 s2, s2, s3
+; VI-NEXT: s_cselect_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -908,8 +906,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s4, s2, 7
; GFX9-NEXT: s_or_b32 s4, s3, s4
-; GFX9-NEXT: s_cmp_eq_u32 s4, 0
-; GFX9-NEXT: s_cselect_b32 s2, s2, s3
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -935,8 +932,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_lshl_b32 s4, s2, 7
; GFX10-NEXT: s_or_b32 s4, s3, s4
-; GFX10-NEXT: s_cmp_eq_u32 s4, 0
-; GFX10-NEXT: s_cselect_b32 s2, s2, s3
+; GFX10-NEXT: s_cselect_b32 s2, s3, s2
; GFX10-NEXT: v_mov_b32_e32 v1, s2
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
; GFX10-NEXT: s_endpgm
@@ -946,11 +942,9 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s4, s2, 7
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s4, s3, s4
-; GFX11-NEXT: s_cmp_eq_u32 s4, 0
-; GFX11-NEXT: s_cselect_b32 s2, s2, s3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_cselect_b32 s2, s3, s2
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index b01447abe3e9b..14404b5adfd6c 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -28,8 +28,7 @@ define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32_eq:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_lshl_b32 s0, s0, 1
-; CHECK-NEXT: s_cmp_eq_u32 s0, 0
-; CHECK-NEXT: s_cselect_b32 s0, s1, 0
+; CHECK-NEXT: s_cselect_b32 s0, 0, s1
; CHECK-NEXT: ; return to shader part epilog
%result = shl i32 %val0, 1
%cmp = icmp eq i32 %result, 0
diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
index 4445383bd0ace..4ef2ac1f59e07 100644
--- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll
@@ -180,8 +180,7 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13
; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14
-; DAGISEL-GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
+; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], 0, -1
; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -190,8 +189,7 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13
; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14
-; DAGISEL-GFX942-NEXT: s_cmp_eq_u32 s0, 0
-; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
+; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], 0, -1
; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31]
;
@@ -208,9 +206,7 @@ define i1 @workgroup_zero() {
; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1
-; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
-; DAGISEL-GFX12-NEXT: s_cmp_eq_u32 s0, 0
-; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0
+; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, 0, -1
; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe
; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; DAGISEL-GFX12-NEXT: s_setpc_b64 s[30:31]
>From adc2b32653c53a5d03e113173dc6312018b9c6e6 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 17 Nov 2025 12:07:33 -0600
Subject: [PATCH 08/10] Add scc killed during instruction insertion for
s_cbranch_scc?
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++++++++++
llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +-
.../AMDGPU/branch-folding-implicit-def-subreg.ll | 5 ++---
3 files changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e37d739fc25df..b213f9c91d507 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6487,6 +6487,17 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MI.getOperand(0).setReg(OriginalExec);
return BB;
}
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1: {
+ MachineBasicBlock *TBB = nullptr;
+ MachineBasicBlock *FBB = nullptr;
+ SmallVector<MachineOperand, 1> Cond;
+ TII->analyzeBranch(*BB, TBB, FBB, Cond);
+ if (TBB && !TBB->isLiveIn(AMDGPU::SCC) && FBB &&
+ !FBB->isLiveIn(AMDGPU::SCC))
+ MI.addRegisterKilled(AMDGPU::SCC, TRI);
+ }
+ return BB;
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1931e0be15152..694f39d7b0881 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1548,7 +1548,7 @@ defm S_BRANCH : SOPP_With_Relaxation<
[(br bb:$simm16)]>;
}
-let Uses = [SCC] in {
+let usesCustomInserter = 1, Uses = [SCC] in {
defm S_CBRANCH_SCC0 : SOPP_With_Relaxation<
"s_cbranch_scc0" , (ins SOPPBrTarget:$simm16),
"$simm16"
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 026b8ba2759f0..c6cc3922d3952 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -701,11 +701,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000)
; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
; GFX90A-NEXT: {{ $}}
- ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc
- ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc
+ ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc
; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec
; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec
- ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc
+ ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.53:
; GFX90A-NEXT: successors: %bb.61(0x80000000)
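A condensed sketch (plain C++, hypothetical names) of the rule patch 08
installs in the custom inserter: an S_CBRANCH_SCC0/1 may take a kill flag on
SCC only when analyzeBranch recovered both targets and neither lists SCC
live-in; an unknown target keeps SCC conservatively live:

  struct BlockInfo {
    bool SCCLiveIn; // does this successor require SCC on entry?
  };

  // TBB/FBB are the true/false branch targets; nullptr means analyzeBranch
  // could not determine the target, so we must not claim SCC is dead.
  bool canKillSCCOnBranch(const BlockInfo *TBB, const BlockInfo *FBB) {
    return TBB && !TBB->SCCLiveIn && FBB && !FBB->SCCLiveIn;
  }

This kill flag is what later lets invertSCCUse() prove SCC dead without
relying on liveness tracking.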
>From d01205f661549d259b71c6d650169be7b6253fb6 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Mon, 17 Nov 2025 13:20:48 -0600
Subject: [PATCH 09/10] Inversion test does not require scc clobber
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 42 ++++++++---------------------
1 file changed, 11 insertions(+), 31 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
index 14404b5adfd6c..6f4212b13433b 100644
--- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll
@@ -23,7 +23,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) {
; s_lshl_b32 sets SCC if result is non-zero.
; Deletion of equal to zero comparison will require inversion of use.
-; FIXME: Can't invert because kill flag not set on last use.
define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) {
; CHECK-LABEL: shl32_eq:
; CHECK: ; %bb.0:
@@ -36,39 +35,20 @@ define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) {
ret i32 %select
}
-; s_lshl_b32 sets SCC if result is non-zero.
-; Deletion of equal to zero comparison will require inversion of use.
-define amdgpu_ps i32 @shl32_eq_with_scc_clobber(i32 inreg %val0, i32 inreg %val1) {
-; CHECK-LABEL: shl32_eq_with_scc_clobber:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_lshl_b32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b32 s0, 0, s1
-; CHECK-NEXT: s_xor_b32 s0, s0, s1
-; CHECK-NEXT: ; return to shader part epilog
- %result = shl i32 %val0, 1
- %cmp = icmp eq i32 %result, 0
- %select = select i1 %cmp, i32 %val1, i32 0
- %xor = xor i32 %select, %val1
- ret i32 %xor
-}
-
; 64-bit selection will generate two 32-bit selects. Inversion of multiple
; uses is required.
-define amdgpu_ps i64 @shl32_eq_multi_use_with_scc_clobber(i32 inreg %val0, i64 inreg %val1) {
-; CHECK-LABEL: shl32_eq_multi_use_with_scc_clobber:
+define amdgpu_ps i64 @shl32_eq_multi_use(i32 inreg %val0, i64 inreg %val1) {
+; CHECK-LABEL: shl32_eq_multi_use:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_mov_b32 s3, s2
-; CHECK-NEXT: s_mov_b32 s2, s1
; CHECK-NEXT: s_lshl_b32 s0, s0, 1
-; CHECK-NEXT: s_cselect_b32 s1, 0, s3
-; CHECK-NEXT: s_cselect_b32 s0, 0, s2
-; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
+; CHECK-NEXT: s_cselect_b32 s2, 0, s2
+; CHECK-NEXT: s_cselect_b32 s0, 0, s1
+; CHECK-NEXT: s_mov_b32 s1, s2
; CHECK-NEXT: ; return to shader part epilog
%result = shl i32 %val0, 1
%cmp = icmp eq i32 %result, 0
%select = select i1 %cmp, i64 %val1, i64 0
- %xor = xor i64 %select, %val1
- ret i64 %xor
+ ret i64 %select
}
define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) {
@@ -711,14 +691,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() {
; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1 at rel32@lo+4
; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1 at rel32@hi+12
; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0
-; CHECK-NEXT: s_cbranch_scc0 .LBB41_2
+; CHECK-NEXT: s_cbranch_scc0 .LBB40_2
; CHECK-NEXT: ; %bb.1: ; %endif
; CHECK-NEXT: s_mov_b32 s0, 1
-; CHECK-NEXT: s_branch .LBB41_3
-; CHECK-NEXT: .LBB41_2: ; %if
+; CHECK-NEXT: s_branch .LBB40_3
+; CHECK-NEXT: .LBB40_2: ; %if
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_branch .LBB41_3
-; CHECK-NEXT: .LBB41_3:
+; CHECK-NEXT: s_branch .LBB40_3
+; CHECK-NEXT: .LBB40_3:
%cmp = icmp ne ptr addrspace(4) @1, null
br i1 %cmp, label %endif, label %if
>From 729b232f6b6273c7734bc014b561a619b5aaead6 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 19 Nov 2025 09:40:39 -0600
Subject: [PATCH 10/10] Update comment. Cleaner return check
Signed-off-by: John Lu <John.Lu at amd.com>
---
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 ++++---
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b213f9c91d507..865eb61e481f5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6496,8 +6496,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
if (TBB && !TBB->isLiveIn(AMDGPU::SCC) && FBB &&
!FBB->isLiveIn(AMDGPU::SCC))
MI.addRegisterKilled(AMDGPU::SCC, TRI);
- }
return BB;
+ }
default:
if (TII->isImage(MI) || TII->isMUBUF(MI)) {
if (!MI.mayStore())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ace5150d8437f..c2786b49c36bd 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -10758,14 +10758,15 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const {
return false;
}
if (MI.definesRegister(AMDGPU::SCC, &RI) ||
- MI.killsRegister(AMDGPU::SCC, &RI) || MI.isReturn()) {
+ MI.killsRegister(AMDGPU::SCC, &RI)) {
SCCIsDead = true;
break;
}
}
+ if (MBB->succ_empty())
+ SCCIsDead = true;
- // If SCC is still live, verify that it is not live past the end of this
- // block.
+ // SCC may have more uses. Can't invert all of them.
if (!SCCIsDead)
return false;