[llvm] [AMDGPU] Fix passing CodeGen/AMDGPU/frem.ll on gfx1150. (PR #67425)

via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 26 06:07:06 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

<details>
<summary>Changes</summary>

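Compiling CodeGen/AMDGPU/frem.ll for gfx1150 currently crashes because `SIInstrInfo::getVALUOp` maps the scalar F16 opcodes (S_ADD_F16, S_SUB_F16, S_MIN_F16, S_MAX_F16, S_MUL_F16) to the t16 VALU instructions instead of the fake16 ones. This patch switches those mappings to the fake16 variants and adds a gfx1150 RUN line to the test.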

---

Patch is 37.58 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67425.diff


2 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+5-5) 
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+577) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 30e3179f8eb7d83..27d62188cee88c2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5045,11 +5045,11 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
   case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
   case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
-  case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_t16_e64;
-  case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_t16_e64;
-  case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_t16_e64;
-  case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_t16_e64;
-  case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_t16_e64;
+  case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
+  case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
+  case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
+  case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
+  case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
   case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
   case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
   case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index d258e060ba3e113..7d4393b653a756c 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -5,6 +5,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
 
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
@@ -179,6 +180,32 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1150-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -325,6 +352,29 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: fast_frem_f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -471,6 +521,29 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: unsafe_frem_f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -679,6 +752,44 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
+; GFX1150-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v6, v5
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
+; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -817,6 +928,29 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: fast_frem_f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -955,6 +1089,29 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: unsafe_frem_f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1176,6 +1333,41 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_f64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v12, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v12, s[0:1]
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1150-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
+; GFX1150-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX1150-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1370,6 +1562,37 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: fast_frem_f64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1564,6 +1787,37 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: unsafe_frem_f64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1832,6 +2086,47 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_v2f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v3, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f16_e32 v5, v4, v3
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v5
+; GFX1150-NEXT:    global_store_b32 v0, v1...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/67425


More information about the llvm-commits mailing list