[llvm] [AMDGPU] Fix passing CodeGen/AMDGPU/frem.ll on gfx1150. (PR #67425)

Ivan Kosarev via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 26 06:06:26 PDT 2023


https://github.com/kosarev created https://github.com/llvm/llvm-project/pull/67425

Without this change, llc crashes on this test for gfx1150 because SIInstrInfo::getVALUOp() maps the scalar F16 instructions to t16 VALU opcodes instead of the fake16 ones.

>From 437ef120a8e23a7c5f423396cda6f55382f96c9a Mon Sep 17 00:00:00 2001
From: Ivan Kosarev <ivan.kosarev at amd.com>
Date: Tue, 26 Sep 2023 13:39:13 +0100
Subject: [PATCH] [AMDGPU] Fix passing CodeGen/AMDGPU/frem.ll on gfx1150.

Without this change we crash on this test because getVALUOp() maps the
scalar F16 instructions to t16 VALU opcodes instead of fake16 ones.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp |  10 +-
 llvm/test/CodeGen/AMDGPU/frem.ll       | 577 +++++++++++++++++++++++++
 2 files changed, 582 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 30e3179f8eb7d83..27d62188cee88c2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5045,11 +5045,11 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
   case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
   case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
   case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
-  case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_t16_e64;
-  case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_t16_e64;
-  case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_t16_e64;
-  case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_t16_e64;
-  case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_t16_e64;
+  case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
+  case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
+  case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
+  case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
+  case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
   case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
   case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
   case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index d258e060ba3e113..7d4393b653a756c 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -5,6 +5,7 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
 
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
@@ -179,6 +180,32 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1150-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -325,6 +352,29 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: fast_frem_f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -471,6 +521,29 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: unsafe_frem_f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_u16 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -679,6 +752,44 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
+; GFX1150-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v6, v5
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
+; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -817,6 +928,29 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: fast_frem_f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -955,6 +1089,29 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: unsafe_frem_f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1176,6 +1333,41 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_f64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v12, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v12, s[0:1]
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1150-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
+; GFX1150-NEXT:    v_mul_f64 v[10:11], v[8:9], v[6:7]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX1150-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    global_store_b64 v12, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1370,6 +1562,37 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: fast_frem_f64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1564,6 +1787,37 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: unsafe_frem_f64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v10, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[6:7], v[0:1], v[4:5]
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1150-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1832,6 +2086,47 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_v2f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
+; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v3, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f16_e32 v5, v4, v3
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v5
+; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -2253,6 +2548,71 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_v4f16:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v5, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_fmac_f16_e32 v7, v6, v5
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_fma_f16 v0, v5, v2, v0
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX1150-NEXT:    v_pack_b32_f16 v0, v0, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v2, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f16_e32 v6, v5, v2
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX1150-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_div_fixup_f16 v2, v2, v3, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v2, v2
+; GFX1150-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v2, v3
+; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v6
+; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
@@ -2557,6 +2917,66 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_v2f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
+; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_div_scale_f32 v6, null, v3, v3, v1
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX1150-NEXT:    v_mul_f32_e32 v8, v5, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v9, -v6, v8, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
+; GFX1150-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fma_f32 v1, v5, v3, v1
+; GFX1150-NEXT:    v_div_scale_f32 v5, null, v2, v2, v0
+; GFX1150-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v6, v7, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v7, v3, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v8, -v5, v7, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v8, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
+; GFX1150-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v0, v3, v2
+; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
@@ -3053,6 +3473,110 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_v4f32:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v8, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b128 v[0:3], v8, s[6:7]
+; GFX1150-NEXT:    global_load_b128 v[4:7], v8, s[0:1] offset:64
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_div_scale_f32 v10, null, v7, v7, v3
+; GFX1150-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v11
+; GFX1150-NEXT:    v_mul_f32_e32 v12, v9, v11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v13, -v10, v12, v9
+; GFX1150-NEXT:    v_fmac_f32_e32 v12, v13, v11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v9, -v10, v12, v9
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
+; GFX1150-NEXT:    v_trunc_f32_e32 v9, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
+; GFX1150-NEXT:    v_fma_f32 v3, v9, v7, v3
+; GFX1150-NEXT:    v_div_scale_f32 v9, null, v6, v6, v2
+; GFX1150-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX1150-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX1150-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
+; GFX1150-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1150-NEXT:    v_fma_f32 v2, v7, v6, v2
+; GFX1150-NEXT:    v_div_scale_f32 v7, null, v5, v5, v1
+; GFX1150-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v9, v7
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v9
+; GFX1150-NEXT:    v_mul_f32_e32 v10, v6, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v11, -v7, v10, v6
+; GFX1150-NEXT:    v_fmac_f32_e32 v10, v11, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v6, -v7, v10, v6
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
+; GFX1150-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1150-NEXT:    v_fma_f32 v1, v6, v5, v1
+; GFX1150-NEXT:    v_div_scale_f32 v6, null, v4, v4, v0
+; GFX1150-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1150-NEXT:    s_denorm_mode 15
+; GFX1150-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v7, v9, v7
+; GFX1150-NEXT:    v_mul_f32_e32 v9, v5, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v10, -v6, v9, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v9, v10, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f32 v5, -v6, v9, v5
+; GFX1150-NEXT:    s_denorm_mode 12
+; GFX1150-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
+; GFX1150-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1150-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX1150-NEXT:    global_store_b128 v8, v[0:3], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
    %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
@@ -3384,6 +3908,59 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1150-LABEL: frem_v2f64:
+; GFX1150:       ; %bb.0:
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1150-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT:    v_mov_b32_e32 v16, 0
+; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-NEXT:    s_clause 0x1
+; GFX1150-NEXT:    global_load_b128 v[0:3], v16, s[6:7]
+; GFX1150-NEXT:    global_load_b128 v[4:7], v16, s[0:1] offset:64
+; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1150-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1150-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
+; GFX1150-NEXT:    v_mul_f64 v[14:15], v[12:13], v[10:11]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
+; GFX1150-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
+; GFX1150-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
+; GFX1150-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
+; GFX1150-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
+; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1150-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1150-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f64 v[12:13], v[10:11], v[8:9]
+; GFX1150-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
+; GFX1150-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
+; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
+; GFX1150-NEXT:    global_store_b128 v16, v[0:3], s[4:5]
+; GFX1150-NEXT:    s_nop 0
+; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX1150-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16



More information about the llvm-commits mailing list