[llvm] check for vgpr16 putting into vgpr32 case in v2s lowering (PR #138734)
Brox Chen via llvm-commits
llvm-commits at lists.llvm.org
Tue May 6 14:02:08 PDT 2025
https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/138734
>From 1135e8b6831ab1248b79f1cecc6e1385a56e483e Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 6 May 2025 14:30:12 -0400
Subject: [PATCH] check for vgpr16 putting into vgpr32 case in v2s lowering
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 53 +-
.../AMDGPU/fix-sgpr-copies-f16-true16.mir | 17 +
llvm/test/CodeGen/AMDGPU/frem.ll | 751 ++++++++++++------
3 files changed, 562 insertions(+), 259 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6d54860df221..0cabf09ec7f21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7235,24 +7235,44 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
return DeferredList.contains(MI);
}
-// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
-// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
-// subreg access properly. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
+// legalize operand between 16bit and 32bit registers in v2s copy
+// lowering (change spgr to vgpr).
+// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
+// size. Need to legalize the size of the operands during the vgpr lowering
+// chain. This can be removed after we have sgpr16 in place
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
MachineRegisterInfo &MRI) const {
- unsigned Opcode = Inst.getOpcode();
- if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
+ if (!ST.useRealTrue16Insts())
return;
- for (MachineOperand &Op : Inst.explicit_operands()) {
+ unsigned Opcode = MI.getOpcode();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ // legalize operands and check for size mismatch
+ for (MachineOperand &Op : MI.explicit_operands()) {
unsigned OpIdx = Op.getOperandNo();
if (!OpIdx)
continue;
- if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
+ if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) {
unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
- const TargetRegisterClass *RC = RI.getRegClass(RCID);
- if (RI.getRegSizeInBits(*RC) == 16) {
+ const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+ const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
+ if (32 == RI.getRegSizeInBits(*RC) &&
+ 16 == RI.getRegSizeInBits(*ExpectedRC)) {
Op.setSubReg(AMDGPU::lo16);
+ } else if (16 == RI.getRegSizeInBits(*RC) &&
+ 32 == RI.getRegSizeInBits(*ExpectedRC)) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register NewDstReg =
+ MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+ BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+ .addReg(Op.getReg())
+ .addImm(AMDGPU::lo16)
+ .addReg(Undef)
+ .addImm(AMDGPU::hi16);
+ Op.setReg(NewDstReg);
}
}
}
@@ -7793,8 +7813,19 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
.add(Inst.getOperand(1))
.add(MachineOperand::CreateImm(AMDGPU::lo16));
Inst.eraseFromParent();
-
MRI.replaceRegWith(DstReg, NewDstReg);
+ // legalize useMI with mismatched size
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+ E = MRI.use_end();
+ I != E; ++I) {
+ MachineInstr &UseMI = *I->getParent();
+ unsigned UseMIOpcode = UseMI.getOpcode();
+ if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
+ (16 ==
+ RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
+ I->setSubReg(AMDGPU::lo16);
+ }
+ }
addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
return;
}
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 6e24d9afa2bbc..93686dc44c5db 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -54,6 +54,23 @@ body: |
%4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
...
+---
+name: S_FMAC_F16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: S_FMAC_F16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+ ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
+ ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[SUBREG_TO_REG]].lo16, 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_16 = IMPLICIT_DEF
+ %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+ %2:sreg_32 = COPY %1:vgpr_16
+ %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
+ %4:sreg_32 = S_FMAC_F16 killed %2:sreg_32, %2:sreg_32, %3:sreg_32, implicit $mode
+...
+
---
name: vgpr16_to_spgr32
body: |
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 125d009429cbf..7a1351174733b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -6,7 +6,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; SI-LABEL: frem_f16:
@@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v3
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v3.l, v0.h
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: fast_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: fast_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: fast_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: unsafe_frem_f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #1 {
%gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
%r0 = load half, ptr addrspace(1) %in1, align 4
@@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v2f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v0, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5
-; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5
-; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3
-; GFX1150-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v2f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v3.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v5, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v4, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v5
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v5, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1150-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v2f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v5, v3
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v3
+; GFX1150-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
; GFX11-FAKE16-NEXT: s_endpgm
;
-; GFX1150-LABEL: frem_v4f16:
-; GFX1150: ; %bb.0:
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT: v_mov_b32_e32 v4, 0
-; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT: s_clause 0x1
-; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v8, v8
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8
-; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0
-; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0
-; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3
-; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7
-; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[0:1]
-; GFX1150-NEXT: s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v4f16:
+; GFX1150-TRUE16: ; %bb.0:
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, 0
+; GFX1150-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT: s_clause 0x1
+; GFX1150-TRUE16-NEXT: global_load_b64 v[1:2], v5, s[2:3]
+; GFX1150-TRUE16-NEXT: global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.h
+; GFX1150-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v3.h
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l
+; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v3.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v1.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v10, v7
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v7, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v4.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v6, v3
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v3, v0
+; GFX1150-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT: v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v4.l
+; GFX1150-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT: v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT: v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1150-TRUE16-NEXT: global_store_b64 v5, v[1:2], s[0:1]
+; GFX1150-TRUE16-NEXT: s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v4f16:
+; GFX1150-FAKE16: ; %bb.0:
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT: s_clause 0x1
+; GFX1150-FAKE16-NEXT: global_load_b64 v[0:1], v4, s[2:3]
+; GFX1150-FAKE16-NEXT: global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX1150-FAKE16-NEXT: s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v7, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v6, v2
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1150-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT: v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1150-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
+; GFX1150-FAKE16-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %in2) #0 {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
%r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
More information about the llvm-commits
mailing list