[llvm] [AMDGPU] Try to reuse in v_cndmask register with constant from compare. (PR #131146)
Daniil Fukalov via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 17 14:21:06 PDT 2025
================
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX9
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
+
+; fcmp oeq with the variable on the LHS and a float literal on the RHS; the
+; select returns that same literal when the compare is true and %arg1 otherwise.
+; In the gfx900 checks the compare is emitted as v_cmp_neq and v_cndmask takes
+; v0 (the register that was compared) where the literal would go; in the
+; gfx1030 checks the literal stays an operand of both instructions.
+define float @f32_oeq_v_i(float %arg, float %arg1) {
+; GFX9-LABEL: f32_oeq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_oeq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq float %arg, 0x3FCF5C2900000000
+ %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
+ ret float %select
+}
+
+; Commuted form of the oeq test: the float literal is the LHS of the fcmp and
+; the variable is the RHS. The select still returns the literal when the
+; compare is true and %arg1 otherwise. The check lines match the ones for the
+; variable-on-the-LHS form.
+define float @f32_oeq_i_v(float %arg, float %arg1) {
+; GFX9-LABEL: f32_oeq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_oeq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp oeq float 0x3FCF5C2900000000, %arg
+ %select = select i1 %fcmp, float 0x3FCF5C2900000000, float %arg1
+ ret float %select
+}
+
+; fcmp one with the variable on the LHS and a float literal on the RHS; the
+; select returns %arg1 when the compare is true and the literal otherwise.
+; In the gfx900 checks v_cndmask takes v0 (the compared register) where the
+; literal would go.
+; NOTE(review): `one` is an ordered predicate, so it is also false when %arg is
+; NaN. The IR then selects the literal, while the gfx900 sequence below would
+; return v0, i.e. the NaN input. Confirm that reusing the compared register is
+; valid for `one` (as opposed to `une`).
+define float @f32_one_v_i(float %arg, float %arg1) {
+; GFX9-LABEL: f32_one_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_one_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one float %arg, 0x3FCF5C2900000000
+ %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+ ret float %select
+}
+
+; Commuted form of the `one` test: the float literal is the LHS of the fcmp and
+; the variable is the RHS, mirroring f32_oeq_i_v. The select returns %arg1 when
+; the compare is true and the literal otherwise.
+; The check lines below are carried over from the variable-on-the-LHS form;
+; `one` is symmetric, so they are expected to stay the same, but re-run
+; utils/update_llc_test_checks.py to confirm.
+define float @f32_one_i_v(float %arg, float %arg1) {
+; GFX9-LABEL: f32_one_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x3e7ae148
+; GFX9-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: f32_one_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x3e7ae148, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x3e7ae148, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %fcmp = fcmp one float 0x3FCF5C2900000000, %arg
+ %select = select i1 %fcmp, float %arg1, float 0x3FCF5C2900000000
+ ret float %select
+}
+
+; icmp eq with the variable on the LHS and the literal 424242 (0x67932) on the
+; RHS; the select returns that same literal when the compare is true and %arg1
+; otherwise. In the gfx900 checks the compare is emitted as v_cmp_ne and
+; v_cndmask takes v0 (the compared register) where the literal would go; in
+; the gfx1030 checks the literal stays an operand of both instructions.
+define i32 @i32_eq_v_i(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_eq_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_eq_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i32 %arg, 424242
+ %select = select i1 %icmp, i32 424242, i32 %arg1
+ ret i32 %select
+}
+
+; Commuted form of the icmp eq test: the literal 424242 is the LHS of the icmp
+; and the variable is the RHS. The select returns the literal when the compare
+; is true and %arg1 otherwise. The check lines match the ones for the
+; variable-on-the-LHS form.
+define i32 @i32_eq_i_v(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_eq_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_eq_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp eq i32 424242, %arg
+ %select = select i1 %icmp, i32 424242, i32 %arg1
+ ret i32 %select
+}
+
+; icmp ne with the variable on the LHS and the literal 424242 (0x67932) on the
+; RHS; the select returns %arg1 when the compare is true and the literal
+; otherwise. In the gfx900 checks v_cndmask takes v0 (the compared register)
+; where the literal would go; in the gfx1030 checks the literal stays an
+; operand of both instructions.
+define i32 @i32_ne_v_i(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_ne_v_i:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_ne_v_i:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i32 %arg, 424242
+ %select = select i1 %icmp, i32 %arg1, i32 424242
+ ret i32 %select
+}
+
+; Commuted form of the icmp ne test: the literal 424242 is the LHS of the icmp
+; and the variable is the RHS. The select returns %arg1 when the compare is
+; true and the literal otherwise. The check lines match the ones for the
+; variable-on-the-LHS form.
+define i32 @i32_ne_i_v(i32 %arg, i32 %arg1) {
+; GFX9-LABEL: i32_ne_i_v:
+; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s4, 0x67932
+; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: i32_ne_i_v:
+; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x67932, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x67932, v1, vcc_lo
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+bb:
+ %icmp = icmp ne i32 424242, %arg
+ %select = select i1 %icmp, i32 %arg1, i32 424242
+ ret i32 %select
+}
----------------
dfukalov wrote:
> Test half, i16, and bfloat cases. Plus 64-bit
I added half and i16 types. For the bfloat case we get no advantage here, since the immediate is stored in two registers in any case: one (shifted left) for the compare and a second (the original) for the cndmask, like:
```
v_lshlrev_b32_e32 v2, 16, v0
s_mov_b32 s4, 0x42420000
v_cmp_eq_f32_e32 vcc, s4, v2
v_cndmask_b32_e32 v0, v1, v0, vcc
```
For the 64-bit types this folding doesn't work yet, since they are lowered into a different cmp/cndmask pattern that uses pairs of registers (with REG_SEQUENCEs). I would implement it incrementally in an additional patch.
https://github.com/llvm/llvm-project/pull/131146
More information about the llvm-commits
mailing list