[llvm] [AMDGPU][NFC] Enable gfx942 for more tests (PR #154363)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 19 08:32:26 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Janek van Oirschot (JanekvO)
<details>
<summary>Changes</summary>
Enable gfx942 for tests that are affected by an AMDGPU bitcast constant combine (#<!-- -->154115)
Expect more tests to be affected in the aforementioned PR once it is rebased on top of this PR
---
Patch is 2.25 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154363.diff
37 Files Affected:
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+3519)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+1211)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+2519)
- (modified) llvm/test/CodeGen/AMDGPU/bypass-div.ll (+1270)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll (+907)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-select.ll (+148-72)
- (modified) llvm/test/CodeGen/AMDGPU/div_i128.ll (+4154)
- (modified) llvm/test/CodeGen/AMDGPU/div_v2i128.ll (+3548)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+1034)
- (modified) llvm/test/CodeGen/AMDGPU/fcanonicalize.ll (+605)
- (modified) llvm/test/CodeGen/AMDGPU/fceil64.ll (+88)
- (modified) llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll (+48-23)
- (modified) llvm/test/CodeGen/AMDGPU/fptoi.i128.ll (+2236)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+1675)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+1213)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+1213)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+1984)
- (modified) llvm/test/CodeGen/AMDGPU/imm.ll (+761)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (+1089)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll (+200)
- (modified) llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll (+44-21)
- (modified) llvm/test/CodeGen/AMDGPU/lround.ll (+448)
- (modified) llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll (+33)
- (modified) llvm/test/CodeGen/AMDGPU/rem_i128.ll (+2686-2)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+1931)
- (modified) llvm/test/CodeGen/AMDGPU/shift-i128.ll (+550)
- (modified) llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll (+38)
- (modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+559)
- (modified) llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll (+155)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll ()
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+2087)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+1523)
- (modified) llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll (+222)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+1471)
- (modified) llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll (+134-1)
- (modified) llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll (+47)
- (modified) llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll (+53)
``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index c7385e4324e2c..c6ad5c93fb7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2,6 +2,7 @@
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
@@ -98,6 +99,37 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_sub_i32 s4, 0, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v0
+; GFX942-NEXT: s_mul_i32 s4, s4, s5
+; GFX942-NEXT: s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT: s_add_i32 s5, s5, s4
+; GFX942-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT: s_mul_i32 s5, s4, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s5
+; GFX942-NEXT: s_add_i32 s6, s4, 1
+; GFX942-NEXT: s_sub_i32 s5, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s4, s6, s4
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_add_i32 s5, s4, 1
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s5, s4
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = udiv i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -191,6 +223,35 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: urem_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_sub_i32 s4, 0, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v0
+; GFX942-NEXT: s_mul_i32 s4, s4, s5
+; GFX942-NEXT: s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT: s_add_i32 s5, s5, s4
+; GFX942-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT: s_mul_i32 s4, s4, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s4
+; GFX942-NEXT: s_sub_i32 s4, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s4, s2
+; GFX942-NEXT: s_sub_i32 s4, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s4, s2
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = urem i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -312,6 +373,42 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: sdiv_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_abs_i32 s4, s3
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX942-NEXT: s_sub_i32 s5, 0, s4
+; GFX942-NEXT: s_xor_b32 s3, s2, s3
+; GFX942-NEXT: s_abs_i32 s2, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_ashr_i32 s3, s3, 31
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s6, v0
+; GFX942-NEXT: s_mul_i32 s5, s5, s6
+; GFX942-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT: s_add_i32 s6, s6, s5
+; GFX942-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT: s_mul_i32 s6, s5, s4
+; GFX942-NEXT: s_sub_i32 s2, s2, s6
+; GFX942-NEXT: s_add_i32 s7, s5, 1
+; GFX942-NEXT: s_sub_i32 s6, s2, s4
+; GFX942-NEXT: s_cmp_ge_u32 s2, s4
+; GFX942-NEXT: s_cselect_b32 s5, s7, s5
+; GFX942-NEXT: s_cselect_b32 s2, s6, s2
+; GFX942-NEXT: s_add_i32 s6, s5, 1
+; GFX942-NEXT: s_cmp_ge_u32 s2, s4
+; GFX942-NEXT: s_cselect_b32 s2, s6, s5
+; GFX942-NEXT: s_xor_b32 s2, s2, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s3
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = sdiv i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -423,6 +520,40 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: srem_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_abs_i32 s3, s3
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_sub_i32 s5, 0, s3
+; GFX942-NEXT: s_ashr_i32 s4, s2, 31
+; GFX942-NEXT: s_abs_i32 s2, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s6, v0
+; GFX942-NEXT: s_mul_i32 s5, s5, s6
+; GFX942-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT: s_add_i32 s6, s6, s5
+; GFX942-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT: s_mul_i32 s5, s5, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s5
+; GFX942-NEXT: s_sub_i32 s5, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_sub_i32 s5, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s4
+; GFX942-NEXT: s_sub_i32 s2, s2, s4
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = srem i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -492,6 +623,29 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_short v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_lshr_b32 s1, s0, 16
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s1
+; GFX942-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT: v_trunc_f32_e32 v2, v2
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT: v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_short v3, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = udiv i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -567,6 +721,31 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: urem_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_lshr_b32 s3, s2, 16
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT: v_trunc_f32_e32 v2, v2
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT: v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_short v3, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = urem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -648,6 +827,31 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: sdiv_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_ashr_i32 s3, s2, 16
+; GFX942-NEXT: v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT: s_sext_i32_i16 s2, s2
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_or_b32 s4, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s4, 0
+; GFX942-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT: global_store_short v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = sdiv i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -735,6 +939,33 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: srem_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_ashr_i32 s4, s6, 16
+; GFX942-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT: s_sext_i32_i16 s2, s6
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s4
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_or_b32 s5, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s5, 0
+; GFX942-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT: v_mul_lo_u32 v0, v0, s4
+; GFX942-NEXT: v_sub_u32_e32 v0, s6, v0
+; GFX942-NEXT: global_store_short v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = srem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -798,6 +1029,25 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT: v_trunc_f32_e32 v1, v1
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT: v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = udiv i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -869,6 +1119,28 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: urem_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT: s_lshr_b32 s3, s2, 8
+; GFX942-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT: v_trunc_f32_e32 v1, v1
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT: v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = urem i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -950,6 +1222,31 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: sdiv_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bfe_i32 s3, s2, 0x80008
+; GFX942-NEXT: v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT: s_sext_i32_i8 s2, s2
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_or_b32 s4, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s4, 0
+; GFX942-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = sdiv i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -1039,6 +1336,34 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: srem_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bfe_i32 s2, s6, 0x80008
+; GFX942-NEXT: v_cvt_f32_i32_e32 v1, s2
+; GFX942-NEXT: s_sext_i32_i8 s3, s6
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s3
+; GFX942-NEXT: s_xor_b32 s2, s3, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v1
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_lshr_b32 s4, s6, 8
+; GFX942-NEXT: s_or_b32 s5, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v1, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v1|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s5, 0
+; GFX942-NEXT: v_add_u32_e32 v1, s2, v3
+; GFX942-NEXT: v_mul_lo_u32 v1, v1, s4
+; GFX942-NEXT: v_sub_u32_e32 v1, s6, v1
+; GFX942-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = srem i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -1367,6 +1692,99 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_v4i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s12
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s13
+; GFX942-NEXT: s_sub_i32 s2, 0, s12
+; GFX942-NEXT: v_cvt_f32_u32_e32 v3, s14
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT: v_readfirstlane_b32 s3, v0
+; GFX942-NEXT: s_mul_i32 s2, s2, s3
+; GFX942-NEXT: s_mul_hi_u32 s2, s3, s2
+; GFX942-NEXT: s_add_i32 s3, s3, s2
+; GFX942-NEXT: s_mul_hi_u32 s2, s8, s3
+; GFX942-NEXT: s_mul_i32 s3, s2, s12
+; GFX942-NEXT: s_sub_i32 s3, s8, s3
+; GFX942-NEXT: s_add_i32 s5, s2, 1
+; GFX942-NEXT: s_sub_i32 s6, s3, s12
+; GFX942-NEXT: s_cmp_ge_u32 s3, s12
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_cselect_b32 s3, s6, s3
+; GFX942-NEXT: s_add_i32 s5, s2, 1
+; GFX942-NEXT: s_cmp_ge_u32 s3, s12
+; GFX942-NEXT: v_readfirstlane_b32 s4, v1
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_sub_i32 s3, 0, s13
+; GFX942-NEXT: s_mul_i32 s3, s3, s4
+; GFX942-NEXT: s_mul_hi_u32 s3, s4, s3
+; GFX942-NEXT: s_add_i32 s4, s4, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v3
+; GFX942-NEXT: s_mul_hi_u32 s3, s9, s4
+; GFX942-NEXT: s_mul_i32 s4, s3, s13
+; GFX942-NEXT: s_sub_i32 s4, s9, s4
+; GFX942-NEXT: s_add_i32 s5, s3, 1
+; GFX942-NEXT: s_sub_i32 s6, s4, s13
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: s_cmp_ge_u32 s4, s13
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_cselect_b32 s3, s5, s3
+; GFX942-NEXT: s_cselect_b32 s4, s6, s4
+; GFX942-NEXT: s_add_i32 s5, s3, 1
+; GFX942-NEXT: s_cmp_ge_u32 s4, s13
+; GFX942-NEXT: s_cselect_b32 s3, s5, s3
+; GFX942-NEXT: v_readfirstlane_b32 s5, v0
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX942-NEXT: s_sub_i32 s4, 0, s14
+; GFX942-NEXT: s_mul_i32 s4, s4, s5
+; GFX942-NEXT: s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT: s_add_i32 s5, s5, s4
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_mul_hi_u32 s4, s10, s5
+; GFX942-NEXT: s_mul_i32 s5, s4, s14
+; GFX942-NEXT: s_sub_i32 s5, s10, s5
+; GFX942-NEXT: s_add_i32 s6, s4, 1
+; GFX942-NEXT: s_sub_i32 s7, s5, s14
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: s_cmp_ge_u32 s5, s14
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_cselect_b32 s4, s6, s4
+; GFX942-NEXT: s_cselect_b32 s5, s7, s5
+; GFX942-NEXT: s_add_i32 s6, s4, 1
+; GFX942-NEXT: s_cmp_ge_u32 s5, s14
+; GFX942-NEXT: s_cselect_b...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154363
More information about the llvm-commits
mailing list