[llvm] [AMDGPU] Use s_cmovk_i32 instead of s_cselect_b32 when applicable (PR #135232)
Ryan Buchner via llvm-commits
llvm-commits at lists.llvm.org
Thu May 1 12:05:25 PDT 2025
https://github.com/bababuck updated https://github.com/llvm/llvm-project/pull/135232
>From 2b6f45bba810ac210d4722d4cecdbbec6eef5586 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Wed, 30 Apr 2025 13:08:39 -0700
Subject: [PATCH 1/3] [AMDGPU] Add new MIR test that will demonstrate the
changes needed to address #129984.
---
llvm/test/CodeGen/AMDGPU/shrink-select.mir | 75 ++++++++++++++++++++++
1 file changed, 75 insertions(+)
create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-select.mir
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-select.mir b/llvm/test/CodeGen/AMDGPU/shrink-select.mir
new file mode 100644
index 0000000000000..8e8931c9fa58d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-select.mir
@@ -0,0 +1,75 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -start-before=si-shrink-instructions -stop-before=si-post-ra-bundler -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: shrink-select-hint
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: shrink-select-hint
+ ; GCN: renamable $sgpr0 = S_MOV_B32 0
+ ; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 0
+ ; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 0
+ ; GCN-NEXT: renamable $sgpr0 = S_ADD_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc
+ ; GCN-NEXT: renamable $sgpr1 = S_CSELECT_B32 killed renamable $sgpr2, 31744, implicit killed $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr1, implicit killed renamable $sgpr0
+ %0:sgpr_32 = S_MOV_B32 0
+ %1:sgpr_32 = S_MOV_B32 0
+ %2:sgpr_32 = S_MOV_B32 0
+ %3:sgpr_32 = S_ADD_U32 killed %0, %1, implicit-def $scc
+ %4:sgpr_32 = S_CSELECT_B32 %2, 31744, implicit $scc
+ S_ENDPGM 0, implicit %4, implicit %3
+...
+---
+name: shrink-select-b32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: shrink-select-b32
+ ; GCN: renamable $sgpr0 = S_CSELECT_B32 undef renamable $sgpr0, 31744, implicit undef $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0
+ %0:sgpr_32 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %1:sgpr_32 = S_CSELECT_B32 %0, 31744, implicit $scc
+ S_ENDPGM 0, implicit %1
+...
+---
+name: shrink-select-b64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: shrink-select-b64
+ ; GCN: renamable $sgpr0_sgpr1 = S_CSELECT_B64 undef renamable $sgpr0_sgpr1, 31744, implicit undef $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1
+ %0:sgpr_64 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %1:sgpr_64 = S_CSELECT_B64 %0, 31744, implicit $scc
+ S_ENDPGM 0, implicit %1
+...
+---
+name: shrink-select-non-kimm
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: shrink-select-non-kimm
+ ; GCN: renamable $sgpr0 = S_CSELECT_B32 undef renamable $sgpr0, 16, implicit undef $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0
+ %0:sgpr_32 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %1:sgpr_32 = S_CSELECT_B32 %0, 16, implicit $scc
+ S_ENDPGM 0, implicit %1
+...
+---
+name: shrink-select-reg
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GCN-LABEL: name: shrink-select-reg
+ ; GCN: renamable $sgpr0 = S_CSELECT_B32 undef renamable $sgpr0, undef renamable $sgpr0, implicit undef $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0
+ %0:sgpr_32 = IMPLICIT_DEF
+ %1:sgpr_32 = IMPLICIT_DEF
+ $scc = IMPLICIT_DEF
+ %2:sgpr_32 = S_CSELECT_B32 %0, %1, implicit $scc
+ S_ENDPGM 0, implicit %2
+...
>From 65e130bc96f7765f15d0426a14bd58509e964849 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Wed, 9 Apr 2025 14:23:32 -0700
Subject: [PATCH 2/3] [AMDGPU] Use s_cmovk_i32 instead of s_cselect_b32 when
applicable
Partially addresses #129984.
---
.../Target/AMDGPU/SIShrinkInstructions.cpp | 35 +++++++++++++++++++
.../AMDGPU/32-bit-local-address-space.ll | 4 +--
...amdgpu-codegenprepare-fold-binop-select.ll | 2 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 14 ++++----
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 16 ++++-----
llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll | 16 ++++-----
llvm/test/CodeGen/AMDGPU/shrink-select.mir | 2 +-
7 files changed, 62 insertions(+), 27 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 73343e1c80f33..b123e2e0f182c 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -883,6 +883,41 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
}
}
+ // Try to use S_CMOVK_I32 in place of S_CSELECT_B32
+ if (MI.getOpcode() == AMDGPU::S_CSELECT_B32) {
+ const MachineOperand *Dest = &MI.getOperand(0);
+ MachineOperand *Src0 = &MI.getOperand(1);
+ MachineOperand *Src1 = &MI.getOperand(2);
+ // Must be exactly one Immediate
+ if (!(Src0->isReg() ^ Src1->isReg()))
+ continue;
+
+ bool Swapped = false;
+ // Don't actually swap the MachineOperands yet
+ // Could do it now, but don't want to since will modify generated
+ // program even in cases where we don't insert a S_CMOVK_I32
+ if (!Src0->isReg() && Src1->isReg()) {
+ Swapped = true;
+ std::swap(Src0, Src1);
+ }
+
+ if (!Src1->isImm() || !isKImmOperand(*Src1))
+ continue;
+
+ if (Src0->getReg() != Dest->getReg())
+ continue;
+
+ // Actually swap the operands in the MachineInst now that we know we
+ // are going through with the shrink
+ if (Swapped) {
+ if (!TII->commuteInstruction(MI, false, 1, 2))
+ continue;
+ }
+
+ MI.setDesc(TII->get(AMDGPU::S_CMOVK_I32));
+ MI.removeOperand(1);
+ }
+
// Try to use S_ADDK_I32 and S_MULK_I32.
if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
MI.getOpcode() == AMDGPU::S_MUL_I32) {
diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 2c2855c860ebb..27ab9b88e23bd 100644
--- a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -58,10 +58,10 @@ entry:
; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
; GFX7 v_cmp_ne_u32
-; GFX7: s_cselect_b32
+; GFX7: s_cmovk_i32
; GFX8: s_cmp_lg_u32
; GFX8-NOT: v_cmp_ne_u32
-; GFX8: s_cselect_b32
+; GFX8: s_cmovk_i32
define amdgpu_kernel void @null_32bit_lds_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds) nounwind {
%cmp = icmp ne ptr addrspace(3) %lds, null
%x = select i1 %cmp, i32 123, i32 456
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index e71bf15384727..2d03aeb2753ac 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -399,7 +399,7 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_movk_i32 s0, 0x80
-; GCN-NEXT: s_cselect_b32 s0, s0, 0x83
+; GCN-NEXT: s_cmovk_i32 s0, 0x83
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: flat_store_short v[0:1], v0
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index b4b9c2d3e0135..7dce469b254c6 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1786,10 +1786,10 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; SI-NEXT: s_lshr_b32 s4, s4, 2
; SI-NEXT: s_add_i32 s4, s4, s6
; SI-NEXT: s_cmp_lt_i32 s5, 31
-; SI-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; SI-NEXT: s_cmovk_i32 s4, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_movk_i32 s2, 0x7e00
-; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; SI-NEXT: s_cmovk_i32 s2, 0x7c00
; SI-NEXT: s_cmpk_eq_i32 s5, 0x40f
; SI-NEXT: s_cselect_b32 s2, s2, s4
; SI-NEXT: s_lshr_b32 s3, s3, 16
@@ -1844,10 +1844,10 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; VI-NEXT: s_lshr_b32 s1, s1, 2
; VI-NEXT: s_add_i32 s1, s1, s3
; VI-NEXT: s_cmp_lt_i32 s2, 31
-; VI-NEXT: s_cselect_b32 s1, s1, 0x7c00
+; VI-NEXT: s_cmovk_i32 s1, 0x7c00
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_movk_i32 s0, 0x7e00
-; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00
+; VI-NEXT: s_cmovk_i32 s0, 0x7c00
; VI-NEXT: s_cmpk_eq_i32 s2, 0x40f
; VI-NEXT: s_cselect_b32 s0, s0, s1
; VI-NEXT: s_movk_i32 s1, 0x7fff
@@ -1896,10 +1896,10 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX9-NEXT: s_lshr_b32 s4, s4, 2
; GFX9-NEXT: s_add_i32 s4, s4, s5
; GFX9-NEXT: s_cmp_lt_i32 s3, 31
-; GFX9-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; GFX9-NEXT: s_cmovk_i32 s4, 0x7c00
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_movk_i32 s2, 0x7e00
-; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX9-NEXT: s_cmovk_i32 s2, 0x7c00
; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x40f
; GFX9-NEXT: s_cselect_b32 s2, s2, s4
; GFX9-NEXT: s_movk_i32 s3, 0x7fff
@@ -1958,7 +1958,7 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: s_add_i32 s5, s5, s6
; GFX11-NEXT: s_cmp_lt_i32 s2, 31
; GFX11-NEXT: s_movk_i32 s6, 0x7e00
-; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-NEXT: s_cmovk_i32 s5, 0x7c00
; GFX11-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 41cbbe57d7a36..534790ae43312 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -132,7 +132,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_or_b32 s8, s8, s9
; SI-NEXT: s_add_i32 s6, s6, s8
; SI-NEXT: s_cmp_lt_i32 s0, 31
-; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00
+; SI-NEXT: s_cmovk_i32 s6, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s1, 0
; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00
; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f
@@ -188,10 +188,10 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
; VI-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s8
; VI-SAFE-SDAG-NEXT: s_cmp_lt_i32 s6, 31
-; VI-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmovk_i32 s5, 0x7c00
; VI-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; VI-SAFE-SDAG-NEXT: s_movk_i32 s4, 0x7e00
-; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, 0x7c00
+; VI-SAFE-SDAG-NEXT: s_cmovk_i32 s4, 0x7c00
; VI-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s6, 0x40f
; VI-SAFE-SDAG-NEXT: s_cselect_b32 s4, s4, s5
; VI-SAFE-SDAG-NEXT: s_lshr_b32 s5, s7, 16
@@ -240,7 +240,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; VI-SAFE-GISEL-NEXT: s_cmovk_i32 s2, 0x7c00
; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
@@ -312,7 +312,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmovk_i32 s5, 0x7c00
; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
@@ -365,7 +365,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX10-SAFE-GISEL-NEXT: s_cmovk_i32 s2, 0x7c00
; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
@@ -444,7 +444,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: s_add_i32 s5, s5, s6
; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmovk_i32 s5, 0x7c00
; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
@@ -501,7 +501,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
+; GFX11-SAFE-GISEL-NEXT: s_cmovk_i32 s2, 0x7c00
; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
index 0c8dbe865a872..f75982caab7f5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll
@@ -1371,7 +1371,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
; GFX678-NEXT: s_movk_i32 s34, 0xa5
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX678-NEXT: s_cmovk_i32 s34, 0xa50
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1380,7 +1380,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_movk_i32 s34, 0xa5
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX9-NEXT: s_cmovk_i32 s34, 0xa50
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1389,7 +1389,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_movk_i32 s34, 0xa5
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa50
+; GFX10-NEXT: s_cmovk_i32 s34, 0xa50
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1398,7 +1398,7 @@ define amdgpu_gfx void @s_set_rounding_select_2_1(i32 inreg %cond) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_movk_i32 s0, 0xa5
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa50
+; GFX11-NEXT: s_cmovk_i32 s0, 0xa50
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
@@ -1413,7 +1413,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX678-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX678-NEXT: s_cmp_eq_u32 s4, 0
; GFX678-NEXT: s_movk_i32 s34, 0xa50
-; GFX678-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX678-NEXT: s_cmovk_i32 s34, 0xa5
; GFX678-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX678-NEXT: s_setpc_b64 s[30:31]
;
@@ -1422,7 +1422,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_cmp_eq_u32 s4, 0
; GFX9-NEXT: s_movk_i32 s34, 0xa50
-; GFX9-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX9-NEXT: s_cmovk_i32 s34, 0xa5
; GFX9-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@@ -1431,7 +1431,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_cmp_eq_u32 s4, 0
; GFX10-NEXT: s_movk_i32 s34, 0xa50
-; GFX10-NEXT: s_cselect_b32 s34, s34, 0xa5
+; GFX10-NEXT: s_cmovk_i32 s34, 0xa5
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s34
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1440,7 +1440,7 @@ define amdgpu_gfx void @s_set_rounding_select_1_2(i32 inreg %cond) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_movk_i32 s0, 0xa50
-; GFX11-NEXT: s_cselect_b32 s0, s0, 0xa5
+; GFX11-NEXT: s_cmovk_i32 s0, 0xa5
; GFX11-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 0, 4), s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-select.mir b/llvm/test/CodeGen/AMDGPU/shrink-select.mir
index 8e8931c9fa58d..332c7ddb76c5e 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-select.mir
@@ -26,7 +26,7 @@ tracksRegLiveness: true
body: |
bb.0:
; GCN-LABEL: name: shrink-select-b32
- ; GCN: renamable $sgpr0 = S_CSELECT_B32 undef renamable $sgpr0, 31744, implicit undef $scc
+ ; GCN: renamable $sgpr0 = S_CMOVK_I32 31744, implicit undef $scc
; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0
%0:sgpr_32 = IMPLICIT_DEF
$scc = IMPLICIT_DEF
>From e9c507dd2e87a1cc2957b93c8b9318b08b43e067 Mon Sep 17 00:00:00 2001
From: bababuck <buchner.ryan at gmail.com>
Date: Wed, 9 Apr 2025 14:23:32 -0700
Subject: [PATCH 3/3] [AMDGPU] Hint that s_cselect_b32 source-0 and destination
ideally are assigned the same physical register.
Addresses #129984.
---
llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 9 +++++++++
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 4 ++--
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 12 ++++++------
llvm/test/CodeGen/AMDGPU/shrink-select.mir | 4 ++--
4 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index b123e2e0f182c..482faca21fb7d 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -904,6 +904,15 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
if (!Src1->isImm() || !isKImmOperand(*Src1))
continue;
+ // Hint that the source and destination register should be allocated
+ // as the same register so that we can shrink to S_CMOVK_I32 on the
+ // post-allocation SIShrinkInstructions pass.
+ if (Dest->getReg().isVirtual()) {
+ MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
+ MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
+ continue;
+ }
+
if (Src0->getReg() != Dest->getReg())
continue;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 7dce469b254c6..489ff4a3af416 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1960,9 +1960,9 @@ define amdgpu_kernel void @s_copysign_out_f16_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-NEXT: s_movk_i32 s6, 0x7e00
; GFX11-NEXT: s_cmovk_i32 s5, 0x7c00
; GFX11-NEXT: s_cmp_lg_u32 s3, 0
-; GFX11-NEXT: s_cselect_b32 s3, s6, 0x7c00
+; GFX11-NEXT: s_cmovk_i32 s6, 0x7c00
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-NEXT: s_cselect_b32 s2, s3, s5
+; GFX11-NEXT: s_cselect_b32 s2, s6, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 534790ae43312..d1f1b95f30395 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -134,9 +134,9 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; SI-NEXT: s_cmp_lt_i32 s0, 31
; SI-NEXT: s_cmovk_i32 s6, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s1, 0
-; SI-NEXT: s_cselect_b32 s1, s2, 0x7c00
+; SI-NEXT: s_cmovk_i32 s2, 0x7c00
; SI-NEXT: s_cmpk_eq_i32 s0, 0x40f
-; SI-NEXT: s_cselect_b32 s0, s1, s6
+; SI-NEXT: s_cselect_b32 s0, s2, s6
; SI-NEXT: s_lshr_b32 s1, s7, 16
; SI-NEXT: s_and_b32 s1, s1, 0x8000
; SI-NEXT: s_or_b32 s6, s1, s0
@@ -314,9 +314,9 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX10-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
; GFX10-SAFE-SDAG-NEXT: s_cmovk_i32 s5, 0x7c00
; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmovk_i32 s6, 0x7c00
; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s6, s5
; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
@@ -446,9 +446,9 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
; GFX11-SAFE-SDAG-NEXT: s_movk_i32 s6, 0x7e00
; GFX11-SAFE-SDAG-NEXT: s_cmovk_i32 s5, 0x7c00
; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, s6, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmovk_i32 s6, 0x7c00
; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s4, s5
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s6, s5
; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s3, 16
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s3, 0x8000
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-select.mir b/llvm/test/CodeGen/AMDGPU/shrink-select.mir
index 332c7ddb76c5e..34ee54133ea01 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/shrink-select.mir
@@ -11,8 +11,8 @@ body: |
; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 0
; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 0
; GCN-NEXT: renamable $sgpr0 = S_ADD_U32 killed renamable $sgpr0, killed renamable $sgpr1, implicit-def $scc
- ; GCN-NEXT: renamable $sgpr1 = S_CSELECT_B32 killed renamable $sgpr2, 31744, implicit killed $scc
- ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr1, implicit killed renamable $sgpr0
+ ; GCN-NEXT: renamable $sgpr2 = S_CMOVK_I32 31744, implicit killed $scc
+ ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2, implicit killed renamable $sgpr0
%0:sgpr_32 = S_MOV_B32 0
%1:sgpr_32 = S_MOV_B32 0
%2:sgpr_32 = S_MOV_B32 0
More information about the llvm-commits
mailing list