[llvm] Main true16 gfx12 patch 1 (PR #141152)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Thu May 22 15:23:47 PDT 2025
https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/141152
None
>From 66b2a63eccf61be612b033d3ea6e3b893da44da6 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 6 May 2025 14:30:12 -0400
Subject: [PATCH 1/4] check for vgpr16 putting into vgpr32 case in v2s lowering
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53 +-
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 51 ++
llvm/test/CodeGen/AMDGPU/frem.ll | 751 ++++++++++++------
3 files changed, 596 insertions(+), 259 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 85276bd24bcf4..ba832e52892d3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7221,24 +7221,44 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
return DeferredList.contains(MI);
}
-// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
-// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
-// subreg access properly. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
+// legalize operand between 16bit and 32bit registers in v2s copy
+// lowering (change sgpr to vgpr).
+// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
+// size. Need to legalize the size of the operands during the vgpr lowering
+// chain. This can be removed after we have sgpr16 in place
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
MachineRegisterInfo &MRI) const {
- unsigned Opcode = Inst.getOpcode();
- if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
+ if (!ST.useRealTrue16Insts())
return;
- for (MachineOperand &Op : Inst.explicit_operands()) {
+ unsigned Opcode = MI.getOpcode();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // legalize operands and check for size mismatch
+ for (MachineOperand &Op : MI.explicit_operands()) {
unsigned OpIdx = Op.getOperandNo();
if (!OpIdx)
continue;
- if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
+ if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) {
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- const TargetRegisterClass *RC = RI.getRegClass(RCID);
- if (RI.getRegSizeInBits(*RC) == 16) {
+ const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+ const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
+ if (32 == RI.getRegSizeInBits(*RC) &&
+ 16 == RI.getRegSizeInBits(*ExpectedRC)) {
Op.setSubReg(AMDGPU::lo16);
+ } else if (16 == RI.getRegSizeInBits(*RC) &&
+ 32 == RI.getRegSizeInBits(*ExpectedRC)) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register NewDstReg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+ .addReg(Op.getReg())
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ Op.setReg(NewDstReg);
}
}
}
@@ -7783,8 +7803,19 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.addReg(Undef)
.addImm(AMDGPU::hi16);
Inst.eraseFromParent();
-
MRI.replaceRegWith(DstReg, NewDstReg);
+ // legalize useMI with mismatched size
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+ E = MRI.use_end();
+ I != E; ++I) {
+ MachineInstr &UseMI = *I->getParent();
+ unsigned UseMIOpcode = UseMI.getOpcode();
+ if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
+ (16 ==
+ RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
+ I->setSubReg(AMDGPU::lo16);
+ }
+ }
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
}
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index f9db082a2e912..9b6a2f3a1aa1e 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -57,6 +57,57 @@ body: |
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...
+---
+name: salu16_usedby_salu32
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu16_usedby_salu32
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[DEF]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_TRUNC_F16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[REG_SEQUENCE]], [[DEF]], implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_TRUNC_F16 %1:sreg_32, implicit $mode
+ %3:sreg_32 = S_XOR_B32 %2:sreg_32, %1:sreg_32, implicit-def $scc
+...
+
+---
+name: salu32_usedby_salu16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: salu32_usedby_salu16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[DEF]], [[DEF]], implicit $exec
+ ; GCN-NEXT: [[V_TRUNC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_TRUNC_F16_t16_e64 0, [[V_XOR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = COPY %0:vgpr_32
+ %2:sreg_32 = S_XOR_B32 %1:sreg_32, %1:sreg_32, implicit-def $scc
+ %3:sreg_32 = S_TRUNC_F16 %2:sreg_32, implicit $mode
+...
+
+---
+name: S_FMAC_F16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: S_FMAC_F16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sgpr_lo16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF2]], %subreg.hi16
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[DEF]], %subreg.lo16, [[DEF3]], %subreg.hi16
+ ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE1]].lo16, 0, [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_16 = IMPLICIT_DEF
+ %1:sgpr_lo16 = COPY %0:vgpr_16
+ %2:sreg_32 = COPY %0:vgpr_16
+ %3:sreg_32 = COPY %1:sgpr_lo16
+ %4:sreg_32 = S_FMAC_F16 %3:sreg_32, %3:sreg_32, %2:sreg_32, implicit $mode
+...
+
---
name: vgpr16_to_spgr32
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 125d009429cbf..7a1351174733b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -6,7 +6,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
@@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: fast_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: fast_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: fast_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: unsafe_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v2f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5
-; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v2f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1150-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v2f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v4f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v4, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v8, v8
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8
-; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0
-; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v4f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v4f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
>From ee27d3056ba71cd543aa8d4a4533c2a5caa856a6 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Mon, 12 May 2025 14:22:58 -0400
Subject: [PATCH 2/4] address comment
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ba832e52892d3..d81fcef59e530 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7239,15 +7239,16 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
unsigned OpIdx = Op.getOperandNo();
if (!OpIdx)
continue;
- if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) {
- unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+ if (Op.isReg() && Op.getReg().isVirtual()) {
const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
- if (32 == RI.getRegSizeInBits(*RC) &&
- 16 == RI.getRegSizeInBits(*ExpectedRC)) {
+ if (!RI.isVGPRClass(RC))
+ continue;
+ unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
+ unsigned expectedSize = RI.getRegSizeInBits(*RI.getRegClass(RCID));
+ unsigned currSize = RI.getRegSizeInBits(*RC);
+ if (expectedSize == 16 && currSize == 32) {
Op.setSubReg(AMDGPU::lo16);
- } else if (16 == RI.getRegSizeInBits(*RC) &&
- 32 == RI.getRegSizeInBits(*ExpectedRC)) {
+ } else if (expectedSize == 32 && currSize == 16) {
const DebugLoc &DL = MI.getDebugLoc();
Register NewDstReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
>From 77ce1b4f258f7906b39b08e49f55ea3cafa98dbf Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 22 May 2025 11:59:22 -0400
Subject: [PATCH 3/4] address comment
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d81fcef59e530..0ecd3a8034adc 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7240,15 +7240,14 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
if (!OpIdx)
continue;
if (Op.isReg() && Op.getReg().isVirtual()) {
- const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
- if (!RI.isVGPRClass(RC))
+ const TargetRegisterClass *DefRC = MRI.getRegClass(Op.getReg());
+ if (!RI.isVGPRClass(DefRC))
continue;
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- unsigned expectedSize = RI.getRegSizeInBits(*RI.getRegClass(RCID));
- unsigned currSize = RI.getRegSizeInBits(*RC);
- if (expectedSize == 16 && currSize == 32) {
+ const TargetRegisterClass *UseRC = RI.getRegClass(RCID);
+ if (RI.getMatchingSuperRegClass(DefRC, UseRC, AMDGPU::lo16)) {
Op.setSubReg(AMDGPU::lo16);
- } else if (expectedSize == 32 && currSize == 16) {
+ } else if (RI.getMatchingSuperRegClass(UseRC, DefRC, AMDGPU::lo16)) {
const DebugLoc &DL = MI.getDebugLoc();
Register NewDstReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
>From 392b57fe2865d9b6fd5cb97028e0c53059e4a5f9 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Thu, 22 May 2025 18:22:52 -0400
Subject: [PATCH 4/4] v_s_xx_f16 support in moveToVALU
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 23 +
llvm/test/CodeGen/AMDGPU/frem.ll | 897 +++++++++++++++++++++++++
2 files changed, 920 insertions(+)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0ecd3a8034adc..a8a95f3e898bb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7725,6 +7725,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
}
+ case AMDGPU::V_S_EXP_F16_e64: // f16 pseudo-scalar ops handled here are
+ case AMDGPU::V_S_LOG_F16_e64: // re-emitted with the VALU opcode held in
+ case AMDGPU::V_S_RCP_F16_e64: // NewOpcode, writing a fresh VGPR result.
+ case AMDGPU::V_S_RSQ_F16_e64:
+ case AMDGPU::V_S_SQRT_F16_e64: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
+ ? &AMDGPU::VGPR_16RegClass
+ : &AMDGPU::VGPR_32RegClass);
+ auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .add(Inst.getOperand(1)) // src0_modifiers (operand 1 is NOT src0)
+ .add(Inst.getOperand(2)) // src0: the actual source value
+ .add(Inst.getOperand(3)) // clamp, forwarded from the original
+ .add(Inst.getOperand(4)); // omod, forwarded from the original
+ if (ST.useRealTrue16Insts())
+ NewInstr.addImm(0); // opsel0
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ legalizeOperandsVALUt16(*NewInstr, MRI); // fix 16/32-bit VGPR size mismatches
+ legalizeOperands(*NewInstr, MDT);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
+ return;
+ }
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7a1351174733b..8613691c09517 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -8,6 +8,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
@@ -331,6 +333,82 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -537,6 +615,50 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: fast_frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: fast_frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, 0
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -743,6 +865,50 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT: v_rcp_f16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT: v_rcp_f16_e32 v3, 0
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -985,6 +1151,42 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
+; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v5, v4
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v6, -v4, v5, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v5, v6, v5
+; GFX1200-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v5, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v1
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1142,6 +1344,27 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: fast_frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1299,6 +1522,27 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-NEXT: v_fmac_f32_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: unsafe_frem_f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
%r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1551,6 +1795,39 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1150-NEXT: global_store_b64 v12, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v12, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v12, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v12, s[4:5]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT: v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
+; GFX1200-NEXT: v_mul_f64_e32 v[10:11], v[8:9], v[6:7]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX1200-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: global_store_b64 v12, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1772,6 +2049,35 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: fast_frem_f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v10, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1993,6 +2299,35 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; GFX1150-NEXT: global_store_b64 v10, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: unsafe_frem_f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v10, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v10, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v10, s[4:5]
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f64_e32 v[6:7], v[0:1], v[4:5]
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1200-NEXT: v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT: global_store_b64 v10, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%r0 = load double, ptr addrspace(1) %in1, align 8
%r1 = load double, ptr addrspace(1) %in2, align 8
@@ -2514,6 +2849,131 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_v2f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1200-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_v2f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX1200-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3398,6 +3858,227 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1150-FAKE16-NEXT: s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_v4f16:
+; GFX1200-TRUE16: ; %bb.0:
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, 0
+; GFX1200-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT: s_clause 0x1
+; GFX1200-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3]
+; GFX1200-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h
+; GFX1200-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX1200-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l
+; GFX1200-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1200-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1200-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1200-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1200-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1200-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1]
+; GFX1200-TRUE16-NEXT: s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_v4f16:
+; GFX1200-FAKE16: ; %bb.0:
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1200-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT: s_clause 0x1
+; GFX1200-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1200-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX1200-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1200-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1200-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1200-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1200-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1200-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX1200-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1200-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
@@ -3758,6 +4439,65 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_fmac_f32_e32 v0, v3, v2
; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v4, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1200-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
+; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v8, -v6, v7, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v7
+; GFX1200-NEXT: v_mul_f32_e32 v8, v5, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v9, -v6, v8, v5
+; GFX1200-NEXT: v_fmac_f32_e32 v8, v9, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v5, -v6, v8, v5
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v8
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, v1
+; GFX1200-NEXT: v_trunc_f32_e32 v5, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1200-NEXT: v_fma_f32 v1, v5, v3, v1
+; GFX1200-NEXT: v_div_scale_f32 v5, null, v2, v2, v0
+; GFX1200-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v6, v5
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v7, -v5, v6, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v6, v7, v6
+; GFX1200-NEXT: v_mul_f32_e32 v7, v3, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v8, -v5, v7, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v7, v8, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v3, -v5, v7, v3
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v3, v3, v6, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v2, v0
+; GFX1200-NEXT: v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT: v_fmac_f32_e32 v0, v3, v2
+; GFX1200-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
@@ -4354,6 +5094,111 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_fmac_f32_e32 v0, v5, v4
; GFX1150-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v4f32:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v8, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b128 v[0:3], v8, s[2:3]
+; GFX1200-NEXT: global_load_b128 v[4:7], v8, s[4:5] offset:64
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
+; GFX1200-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v11, v10
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v12, -v10, v11, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v11
+; GFX1200-NEXT: v_mul_f32_e32 v12, v9, v11
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v13, -v10, v12, v9
+; GFX1200-NEXT: v_fmac_f32_e32 v12, v13, v11
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v9, -v10, v12, v9
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: v_div_fmas_f32 v9, v9, v11, v12
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v9, v9, v7, v3
+; GFX1200-NEXT: v_trunc_f32_e32 v9, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v9, 0x80000000, v9
+; GFX1200-NEXT: v_fma_f32 v3, v9, v7, v3
+; GFX1200-NEXT: v_div_scale_f32 v9, null, v6, v6, v2
+; GFX1200-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v10, v9
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v11, -v9, v10, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v10
+; GFX1200-NEXT: v_mul_f32_e32 v11, v7, v10
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v12, -v9, v11, v7
+; GFX1200-NEXT: v_fmac_f32_e32 v11, v12, v10
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v7, -v9, v11, v7
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v7, v7, v10, v11
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v7, v7, v6, v2
+; GFX1200-NEXT: v_trunc_f32_e32 v7, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1200-NEXT: v_fma_f32 v2, v7, v6, v2
+; GFX1200-NEXT: v_div_scale_f32 v7, null, v5, v5, v1
+; GFX1200-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v9, v7
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v10, -v7, v9, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v9
+; GFX1200-NEXT: v_mul_f32_e32 v10, v6, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v11, -v7, v10, v6
+; GFX1200-NEXT: v_fmac_f32_e32 v10, v11, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v6, -v7, v10, v6
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v6, v6, v9, v10
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v5, v1
+; GFX1200-NEXT: v_trunc_f32_e32 v6, v6
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1200-NEXT: v_fma_f32 v1, v6, v5, v1
+; GFX1200-NEXT: v_div_scale_f32 v6, null, v4, v4, v0
+; GFX1200-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT: s_denorm_mode 15
+; GFX1200-NEXT: v_fma_f32 v9, -v6, v7, 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fmac_f32_e32 v7, v9, v7
+; GFX1200-NEXT: v_mul_f32_e32 v9, v5, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v10, -v6, v9, v5
+; GFX1200-NEXT: v_fmac_f32_e32 v9, v10, v7
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f32 v5, -v6, v9, v5
+; GFX1200-NEXT: s_denorm_mode 12
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: v_div_fmas_f32 v5, v5, v7, v9
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v4, v0
+; GFX1200-NEXT: v_trunc_f32_e32 v5, v5
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1200-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1200-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
@@ -4734,6 +5579,58 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; GFX1150-NEXT: global_store_b128 v16, v[0:3], s[0:1]
; GFX1150-NEXT: s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64:
+; GFX1200: ; %bb.0:
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT: v_mov_b32_e32 v16, 0
+; GFX1200-NEXT: s_wait_kmcnt 0x0
+; GFX1200-NEXT: s_clause 0x1
+; GFX1200-NEXT: global_load_b128 v[0:3], v16, s[2:3]
+; GFX1200-NEXT: global_load_b128 v[4:7], v16, s[4:5] offset:64
+; GFX1200-NEXT: s_wait_loadcnt 0x0
+; GFX1200-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT: v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX1200-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1200-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1200-NEXT: v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
+; GFX1200-NEXT: v_mul_f64_e32 v[14:15], v[12:13], v[10:11]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
+; GFX1200-NEXT: v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
+; GFX1200-NEXT: v_trunc_f64_e32 v[8:9], v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
+; GFX1200-NEXT: v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
+; GFX1200-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
+; GFX1200-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1200-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1200-NEXT: v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_mul_f64_e32 v[12:13], v[10:11], v[8:9]
+; GFX1200-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; GFX1200-NEXT: s_wait_alu 0xfffd
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
+; GFX1200-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
+; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT: v_trunc_f64_e32 v[6:7], v[6:7]
+; GFX1200-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
+; GFX1200-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX1200-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
More information about the llvm-commits
mailing list