[llvm] e48731b - [AMDGPU][True16][CodeGen] v_s_xxx_f16 t16 mode handling in movetoVALU process (#141152)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 10 12:36:48 PDT 2025
Author: Brox Chen
Date: 2025-06-10T15:36:44-04:00
New Revision: e48731bc03419f133a85b50571a368a889c6dab2
URL: https://github.com/llvm/llvm-project/commit/e48731bc03419f133a85b50571a368a889c6dab2
DIFF: https://github.com/llvm/llvm-project/commit/e48731bc03419f133a85b50571a368a889c6dab2.diff
LOG: [AMDGPU][True16][CodeGen] v_s_xxx_f16 t16 mode handling in movetoVALU process (#141152)
Add op_sel for v_s_xxx_f16 when moving them to VALU.
Update a few related codegen tests for gfx12 in true16 mode. A toy sketch of the operand-layout difference follows.
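For context: in true16 (+real-true16) mode the 16-bit result of these transcendental ops lands in a VGPR_16 half-register, and the VOP3 encoding carries a trailing op_sel immediate to pick the low/high half; the fake16 form writes a full VGPR_32 and has no such operand. The moveToVALUImpl change in the diff below appends that operand only when ST.useRealTrue16Insts() is set. The following standalone C++ toy model (not LLVM API; Operand and buildValuOperands are hypothetical names made up for illustration) sketches why the two modes build different operand lists:

#include <cstdio>
#include <vector>

// Toy stand-in for a machine-instruction operand (name + immediate value).
struct Operand { const char *name; int value; };

// Models the operand list built for the VALU replacement instruction.
// In true16 mode one extra trailing op_sel immediate is appended,
// mirroring the if (ST.useRealTrue16Insts()) NewInstr.addImm(0) in the patch.
std::vector<Operand> buildValuOperands(bool realTrue16) {
  std::vector<Operand> ops = {
      {"src0_modifiers", 0},  // no abs/neg modifiers on the source
      {"src0", 1},            // placeholder for the moved scalar source
      {"clamp", 0},
      {"omod", 0},
  };
  if (realTrue16)
    ops.push_back({"op_sel", 0});  // select the low 16-bit half
  return ops;
}

int main() {
  for (bool t16 : {false, true}) {
    std::printf("%s:", t16 ? "true16" : "fake16");
    for (const Operand &op : buildValuOperands(t16))
      std::printf(" %s=%d", op.name, op.value);
    std::printf("\n");
  }
}

Running it prints one operand list per mode, with op_sel appearing only in the true16 line; the real lowering differs only in that it builds a MachineInstr against VGPR_16 vs. VGPR_32 register classes rather than a vector of names.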
Added:
llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
Modified:
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/test/CodeGen/AMDGPU/frem.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 805f8e9acdca7..2ebf8b99e9d7b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7734,6 +7734,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
}
+ case AMDGPU::V_S_EXP_F16_e64:
+ case AMDGPU::V_S_LOG_F16_e64:
+ case AMDGPU::V_S_RCP_F16_e64:
+ case AMDGPU::V_S_RSQ_F16_e64:
+ case AMDGPU::V_S_SQRT_F16_e64: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
+ ? &AMDGPU::VGPR_16RegClass
+ : &AMDGPU::VGPR_32RegClass);
+ auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(2))
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ if (ST.useRealTrue16Insts())
+ NewInstr.addImm(0); // opsel0
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ legalizeOperandsVALUt16(*NewInstr, MRI);
+ legalizeOperands(*NewInstr, MDT);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
+ return;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7a1351174733b..d3432daedadf8 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -8,6 +8,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
@@ -331,6 +333,82 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -537,6 +615,48 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: fast_frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: fast_frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -743,6 +863,48 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -985,6 +1147,42 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
+; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v5, v4
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v5, v6, v5
+; GFX1200-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v5, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v1
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1142,6 +1340,27 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: fast_frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1299,6 +1518,27 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: unsafe_frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1551,6 +1791,39 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v12, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v12, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v12, s[4:5]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
+; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[8:9], v[6:7]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX1200-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1772,6 +2045,35 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: fast_frem_f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v10, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1993,6 +2295,35 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: unsafe_frem_f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v10, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
@@ -2514,6 +2845,131 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_v2f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1200-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_v2f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX1200-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3398,6 +3854,227 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_v4f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_v4f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1200-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX1200-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
@@ -3758,6 +4435,65 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2
; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v4, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
+; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v7
+; GFX1200-NEXT: v_mul_f32_e32 v8, v5, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v9, -v6, v8, v5
+; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v5, -v6, v8, v5
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, v1
+; GFX1200-NEXT: v_trunc_f32_e32 v5, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1200-NEXT: v_fma_f32 v1, v5, v3, v1
+; GFX1200-NEXT: v_div_scale_f32 v5, null, v2, v2, v0
+; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v6, v5
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v7, -v5, v6, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v6
+; GFX1200-NEXT: v_mul_f32_e32 v7, v3, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v8, -v5, v7, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v3, -v5, v7, v3
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v6, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v0
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v0, v3, v2
+; GFX1200-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
@@ -4354,6 +5090,111 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v4f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v8, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX1200-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
+; GFX1200-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v11, v10
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v12, -v10, v11, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v11
+; GFX1200-NEXT: v_mul_f32_e32 v12, v9, v11
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v13, -v10, v12, v9
+; GFX1200-NEXT: v_fmac_f32_e32 v12, v13, v11
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v9, -v10, v12, v9
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: v_div_fmas_f32 v9, v9, v11, v12
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v9, v9, v7, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v9, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v9, 0x80000000, v9
+; GFX1200-NEXT: v_fma_f32 v3, v9, v7, v3
+; GFX1200-NEXT: v_div_scale_f32 v9, null, v6, v6, v2
+; GFX1200-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v10, v9
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v11, -v9, v10, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v10
+; GFX1200-NEXT: v_mul_f32_e32 v11, v7, v10
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v12, -v9, v11, v7
+; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v10
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v7, -v9, v11, v7
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v7, v7, v10, v11
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v7, v7, v6, v2
+; GFX1200-NEXT: v_trunc_f32_e32 v7, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1200-NEXT: v_fma_f32 v2, v7, v6, v2
+; GFX1200-NEXT: v_div_scale_f32 v7, null, v5, v5, v1
+; GFX1200-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v9, v7
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v10, -v7, v9, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v9
+; GFX1200-NEXT: v_mul_f32_e32 v10, v6, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v11, -v7, v10, v6
+; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v6, -v7, v10, v6
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v6, v6, v9, v10
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v5, v1
+; GFX1200-NEXT: v_trunc_f32_e32 v6, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1200-NEXT: v_fma_f32 v1, v6, v5, v1
+; GFX1200-NEXT: v_div_scale_f32 v6, null, v4, v4, v0
+; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v9, -v6, v7, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v7, v9, v7
+; GFX1200-NEXT: v_mul_f32_e32 v9, v5, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v10, -v6, v9, v5
+; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v5, -v6, v9, v5
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v4, v0
+; GFX1200-NEXT: v_trunc_f32_e32 v5, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1200-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1200-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
@@ -4734,6 +5575,58 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v16, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b128 v[0:3], v16, s[2:3]
+; GFX1200-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX1200-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1200-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1200-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
+; GFX1200-NEXT: v_mul_f64_e32 v[14:15], v[12:13], v[10:11]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
+; GFX1200-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
+; GFX1200-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
+; GFX1200-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
+; GFX1200-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1200-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f64_e32 v[12:13], v[10:11], v[8:9]
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
+; GFX1200-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
+; GFX1200-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index c6ea12dd61651..a2be749bb3c20 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -1,17 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
declare half @llvm.amdgcn.rcp.f16(half %a)
-; GCN-LABEL: {{^}}rcp_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
-; GFX11-TRUE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
-; GFX11-FAKE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @rcp_f16(
+; GCN-LABEL: rcp_f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_rcp_f16_e32 v0, v0
+; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: rcp_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: rcp_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rcp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: rcp_f16:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_rcp_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: rcp_f16:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_rcp_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -20,3 +105,5 @@ entry:
store half %r.val, ptr addrspace(1) %r
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 0924e9a5c2314..bf371478cc1dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -1,17 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
declare half @llvm.amdgcn.rsq.f16(half %a)
-; GCN-LABEL: {{^}}rsq_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
-; GFX11-TRUE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
-; GFX11-FAKE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
define amdgpu_kernel void @rsq_f16(
+; GCN-LABEL: rsq_f16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s10, s6
+; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s8, s2
+; GCN-NEXT: s_mov_b32 s9, s3
+; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
+; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_rsq_f16_e32 v0, v0
+; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT: s_endpgm
+;
+; GFX11-TRUE16-LABEL: rsq_f16:
+; GFX11-TRUE16: ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT: s_endpgm
+;
+; GFX11-FAKE16-LABEL: rsq_f16:
+; GFX11-FAKE16: ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: rsq_f16:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_rsq_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: rsq_f16:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_rsq_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -20,3 +105,5 @@ entry:
store half %r.val, ptr addrspace(1) %r
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 2996a4e22a3ef..8604feb3b492f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
declare half @llvm.sqrt.f16(half %a)
declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -81,6 +83,42 @@ define amdgpu_kernel void @sqrt_f16(
; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: sqrt_f16:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sqrt_f16:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -189,6 +227,50 @@ define amdgpu_kernel void @sqrt_v2f16(
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-NEXT: s_endpgm
+;
+; GFX12-TRUE16-LABEL: sqrt_v2f16:
+; GFX12-TRUE16: ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-TRUE16-NEXT: v_sqrt_f16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT: s_endpgm
+;
+; GFX12-FAKE16-LABEL: sqrt_v2f16:
+; GFX12-FAKE16: ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX12-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
entry:
@@ -197,5 +279,3 @@ entry:
store <2 x half> %r.val, ptr addrspace(1) %r
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
new file mode 100644
index 0000000000000..a6819359561b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: exp_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.exp2.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: log_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.log.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rcp_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.rcp.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rsq_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.rsq.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: sqrt_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.sqrt.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+declare half @llvm.amdgcn.exp2.f16(half)
+declare half @llvm.amdgcn.log.f16(half)
+declare half @llvm.amdgcn.rcp.f16(half)
+declare half @llvm.amdgcn.rsq.f16(half)
+declare half @llvm.amdgcn.sqrt.f16(half)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
new file mode 100644
index 0000000000000..b1b5b6b62296d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: exp_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.exp2.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: log_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.log.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rcp_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.rcp.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: rsq_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.rsq.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
+ ; CHECK-LABEL: name: sqrt_f16
+ ; CHECK: bb.0 (%ir-block.0):
+ ; CHECK-NEXT: liveins: $sgpr4_sgpr5
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+ ; CHECK-NEXT: [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: S_ENDPGM 0
+ %val = load volatile half, ptr addrspace(1) %ptr
+ %res = call half @llvm.amdgcn.sqrt.f16(half %val)
+ store half %res, ptr addrspace(1) %ptr
+ ret void
+}
+
+declare half @llvm.amdgcn.exp2.f16(half)
+declare half @llvm.amdgcn.log.f16(half)
+declare half @llvm.amdgcn.rcp.f16(half)
+declare half @llvm.amdgcn.rsq.f16(half)
+declare half @llvm.amdgcn.sqrt.f16(half)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
index 9407c8a9ee436..56848eabf8607 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
@@ -21,27 +21,6 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
ret void
}
-define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
- ; CHECK-LABEL: name: exp_f16
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr4_sgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %val = load volatile half, ptr addrspace(1) %ptr
- %res = call half @llvm.amdgcn.exp2.f16(half %val)
- store half %res, ptr addrspace(1) %ptr
- ret void
-}
-
define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: log_f32
; CHECK: bb.0 (%ir-block.0):
@@ -62,27 +41,6 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
ret void
}
-define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
- ; CHECK-LABEL: name: log_f16
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr4_sgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %val = load volatile half, ptr addrspace(1) %ptr
- %res = call half @llvm.amdgcn.log.f16(half %val)
- store half %res, ptr addrspace(1) %ptr
- ret void
-}
-
define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: rcp_f32
; CHECK: bb.0 (%ir-block.0):
@@ -103,27 +61,6 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
ret void
}
-define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
- ; CHECK-LABEL: name: rcp_f16
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr4_sgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %val = load volatile half, ptr addrspace(1) %ptr
- %res = call half @llvm.amdgcn.rcp.f16(half %val)
- store half %res, ptr addrspace(1) %ptr
- ret void
-}
-
define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: rsq_f32
; CHECK: bb.0 (%ir-block.0):
@@ -144,27 +81,6 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
ret void
}
-define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
- ; CHECK-LABEL: name: rsq_f16
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr4_sgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %val = load volatile half, ptr addrspace(1) %ptr
- %res = call half @llvm.amdgcn.rsq.f16(half %val)
- store half %res, ptr addrspace(1) %ptr
- ret void
-}
-
define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
; CHECK-LABEL: name: sqrt_f32
; CHECK: bb.0 (%ir-block.0):
@@ -185,34 +101,8 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
ret void
}
-define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
- ; CHECK-LABEL: name: sqrt_f16
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $sgpr4_sgpr5
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
- ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: S_ENDPGM 0
- %val = load volatile half, ptr addrspace(1) %ptr
- %res = call half @llvm.amdgcn.sqrt.f16(half %val)
- store half %res, ptr addrspace(1) %ptr
- ret void
-}
-
declare float @llvm.amdgcn.exp2.f32(float)
-declare half @llvm.amdgcn.exp2.f16(half)
declare float @llvm.amdgcn.log.f32(float)
-declare half @llvm.amdgcn.log.f16(half)
declare float @llvm.amdgcn.rcp.f32(float)
-declare half @llvm.amdgcn.rcp.f16(half)
declare float @llvm.amdgcn.rsq.f32(float)
-declare half @llvm.amdgcn.rsq.f16(half)
declare float @llvm.amdgcn.sqrt.f32(float)
-declare half @llvm.amdgcn.sqrt.f16(half)