[llvm] [AMDGPU] Use `S_BFE_U64` for uniform i1 ext (PR #69703)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 20 03:06:10 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Pierre van Houtryve (Pierre-vh)
<details>
<summary>Changes</summary>
Solves #59869
---
Full diff: https://github.com/llvm/llvm-project/pull/69703.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+26-9)
- (modified) llvm/test/CodeGen/AMDGPU/saddo.ll (+33-27)
- (modified) llvm/test/CodeGen/AMDGPU/uaddo.ll (+23-20)
- (modified) llvm/test/CodeGen/AMDGPU/usubo.ll (+23-20)
- (modified) llvm/test/CodeGen/AMDGPU/zero_extend.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 567f1b812c1808c..700b10fdd7a3d5d 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2278,17 +2278,34 @@ def : GCNPat <
(REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;
-class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
- (i64 (ext i1:$src)),
- (REG_SEQUENCE VReg_64,
- (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
- /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
- sub0, (S_MOV_B32 (i32 0)), sub1)
->;
+multiclass ZExt_i64_i1_Pat <SDNode ext> {
+ def: GCNPat <
+ (i64 (ext i1:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
+ sub0, (S_MOV_B32 (i32 0)), sub1)
+ >;
+
+ let WaveSizePredicate = isWave32 in
+ def : GCNPat <
+ (i64 (UniformUnaryFrag<ext> SReg_1:$src)),
+ (S_BFE_U64 (REG_SEQUENCE SReg_64,
+ SReg_32:$src, sub0,
+ (i32 (IMPLICIT_DEF)), sub1),
+ (i32 0x10000))
+ >;
+
+ let WaveSizePredicate = isWave64 in
+ def : GCNPat <
+ (i64 (UniformUnaryFrag<ext> SReg_1:$src)),
+ (S_BFE_U64 SReg_64:$src, (i32 0x10000))
+ >;
+}
-def : ZExt_i64_i1_Pat<zext>;
-def : ZExt_i64_i1_Pat<anyext>;
+defm : ZExt_i64_i1_Pat<zext>;
+defm : ZExt_i64_i1_Pat<anyext>;
// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
// REG_SEQUENCE patterns don't support instructions with multiple outputs.
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index cb3166d7a20d309..4a5914c82a2529e 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -29,10 +29,11 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; SI-NEXT: v_mov_b32_e32 v1, s11
-; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0
-; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; SI-NEXT: s_bfe_u64 s[4:5], s[4:5], 0x10000
+; SI-NEXT: s_add_u32 s4, s10, s4
+; SI-NEXT: s_addc_u32 s5, s11, s5
+; SI-NEXT: v_mov_b32_e32 v0, s4
+; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -45,15 +46,16 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-NEXT: s_add_u32 s2, s6, s0
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: s_addc_u32 s3, s7, s1
-; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; VI-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x10000
+; VI-NEXT: s_add_u32 s0, s2, s0
+; VI-NEXT: s_addc_u32 s1, s3, s1
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -67,13 +69,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX9-NEXT: s_add_u32 s0, s6, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s1, s7, s3
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
+; GFX9-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX9-NEXT: s_add_u32 s0, s0, s2
+; GFX9-NEXT: s_addc_u32 s1, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
@@ -87,11 +90,13 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX10-NEXT: s_add_u32 s0, s6, s2
; GFX10-NEXT: s_addc_u32 s1, s7, s3
; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
-; GFX10-NEXT: s_xor_b32 s2, s2, s3
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7]
+; GFX10-NEXT: s_xor_b32 s2, s2, s6
+; GFX10-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x10000
+; GFX10-NEXT: s_add_u32 s0, s0, s2
+; GFX10-NEXT: s_addc_u32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
@@ -100,18 +105,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s2, s6, s0
; GFX11-NEXT: s_addc_u32 s3, s7, s1
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, s0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: s_xor_b32 s0, s0, s6
+; GFX11-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x10000
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_u32 s0, s2, s0
+; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 4363db2351e7a66..c81e13bbc027396 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -7,21 +7,22 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-LABEL: s_uaddo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_add_u32 s0, s6, s0
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: s_addc_u32 s1, s7, s1
+; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; SI-NEXT: s_bfe_u64 s[6:7], vcc, 0x10000
+; SI-NEXT: s_add_u32 s6, s0, s6
+; SI-NEXT: s_addc_u32 s7, s1, s7
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_add_u32 s4, s6, s8
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
-; SI-NEXT: s_addc_u32 s5, s7, s9
-; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -30,17 +31,18 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_add_u32 s0, s6, s0
-; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_addc_u32 s1, s7, s1
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[1:2]
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
+; VI-NEXT: s_add_u32 s0, s0, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -52,13 +54,14 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_add_u32 s0, s6, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s1, s7, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
+; GFX9-NEXT: s_add_u32 s0, s0, s2
+; GFX9-NEXT: s_addc_u32 s1, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 37b5be3b672f2fa..802fff942efe1cf 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -8,21 +8,22 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-LABEL: s_usubo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
-; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_sub_u32 s0, s6, s0
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
+; SI-NEXT: s_subb_u32 s1, s7, s1
+; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; SI-NEXT: s_bfe_u64 s[6:7], vcc, 0x10000
+; SI-NEXT: s_add_u32 s6, s0, s6
+; SI-NEXT: s_addc_u32 s7, s1, s7
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
-; SI-NEXT: s_sub_u32 s4, s6, s8
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
-; SI-NEXT: s_subb_u32 s5, s7, s9
-; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_mov_b32_e32 v1, s5
-; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
-; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -31,17 +32,18 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: s_sub_u32 s0, s6, s0
-; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_subb_u32 s1, s7, s1
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[1:2]
; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
+; VI-NEXT: s_add_u32 s0, s0, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -53,13 +55,14 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_sub_u32 s0, s6, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_subb_u32 s1, s7, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000
+; GFX9-NEXT: s_add_u32 s0, s0, s2
+; GFX9-NEXT: s_addc_u32 s1, s1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index 1f532f2706de765..584cb811ed66073 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -38,7 +38,7 @@ define amdgpu_kernel void @s_arg_zext_i1_to_i64(ptr addrspace(1) %out, i1 zeroex
; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0
; GCN-DAG: s_cmp_eq_u32
-; GCN: v_cndmask_b32
+; GCN: s_bfe_u64
define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
%cmp = icmp eq i32 %a, %b
%ext = zext i1 %cmp to i64
``````````
</details>
https://github.com/llvm/llvm-project/pull/69703
More information about the llvm-commits mailing list