[llvm] check for vgpr16 putting into vgpr32 case in v2s lowering (PR #138734)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Tue May 6 14:02:08 PDT 2025


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/138734

>From 1135e8b6831ab1248b79f1cecc6e1385a56e483e Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 6 May 2025 14:30:12 -0400
Subject: [PATCH] check for vgpr16 putting into vgpr32 case in v2s lowering

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  53 +-
 .../AMDGPU/fix-sgpr-copies-f16-true16.mir     |  17 +
 llvm/test/CodeGen/AMDGPU/frem.ll              | 751 ++++++++++++------
 3 files changed, 562 insertions(+), 259 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6d54860df221..0cabf09ec7f21 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7235,24 +7235,44 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
   return DeferredList.contains(MI);
 }
 
-// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
-// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
-// subreg access properly. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
+// Legalize operands between 16-bit and 32-bit registers in v2s copy
+// lowering (changing sgpr to vgpr).
+// This is mainly caused by 16-bit SALU and 16-bit VALU using registers of
+// different sizes. Need to legalize the size of the operands during the vgpr
+// lowering chain. This can be removed after we have sgpr16 in place.
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
                                           MachineRegisterInfo &MRI) const {
-  unsigned Opcode = Inst.getOpcode();
-  if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
+  if (!ST.useRealTrue16Insts())
     return;
 
-  for (MachineOperand &Op : Inst.explicit_operands()) {
+  unsigned Opcode = MI.getOpcode();
+  MachineBasicBlock *MBB = MI.getParent();
+
+  // legalize operands and check for size mismatch
+  for (MachineOperand &Op : MI.explicit_operands()) {
     unsigned OpIdx = Op.getOperandNo();
     if (!OpIdx)
       continue;
-    if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
+    if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) {
       unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
-      const TargetRegisterClass *RC = RI.getRegClass(RCID);
-      if (RI.getRegSizeInBits(*RC) == 16) {
+      const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+      const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
+      if (32 == RI.getRegSizeInBits(*RC) &&
+          16 == RI.getRegSizeInBits(*ExpectedRC)) {
         Op.setSubReg(AMDGPU::lo16);
+      } else if (16 == RI.getRegSizeInBits(*RC) &&
+                 32 == RI.getRegSizeInBits(*ExpectedRC)) {
+        const DebugLoc &DL = MI.getDebugLoc();
+        Register NewDstReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+        BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+        BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+            .addReg(Op.getReg())
+            .addImm(AMDGPU::lo16)
+            .addReg(Undef)
+            .addImm(AMDGPU::hi16);
+        Op.setReg(NewDstReg);
       }
     }
   }
@@ -7793,8 +7813,19 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
             .add(Inst.getOperand(1))
             .add(MachineOperand::CreateImm(AMDGPU::lo16));
         Inst.eraseFromParent();
-
         MRI.replaceRegWith(DstReg, NewDstReg);
+        // legalize useMI with mismatched size
+        for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+                                               E = MRI.use_end();
+             I != E; ++I) {
+          MachineInstr &UseMI = *I->getParent();
+          unsigned UseMIOpcode = UseMI.getOpcode();
+          if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
+              (16 ==
+               RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
+            I->setSubReg(AMDGPU::lo16);
+          }
+        }
         addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
         return;
       }
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 6e24d9afa2bbc..93686dc44c5db 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -54,6 +54,23 @@ body:             |
     %4:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %3:sreg_32, 0, 0, 0, implicit $mode, implicit $exec
 ...
 
+---
+name:            S_FMAC_F16
+body:             |
+  bb.0:
+    ; GCN-LABEL: name: S_FMAC_F16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
+    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
+    ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_FMAC_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[SUBREG_TO_REG]].lo16, 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+    %0:vgpr_16 = IMPLICIT_DEF
+    %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+    %2:sreg_32 = COPY %1:vgpr_16
+    %3:sreg_32 = S_OR_B32 %2:sreg_32, %2:sreg_32, implicit-def $scc
+    %4:sreg_32 = S_FMAC_F16 killed %2:sreg_32, %2:sreg_32, %3:sreg_32, implicit $mode
+...
+
 ---
 name:            vgpr16_to_spgr32
 body:             |
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 125d009429cbf..7a1351174733b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -6,7 +6,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
 
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
@@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: frem_f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v3, v5, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: frem_f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: fast_frem_f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: fast_frem_f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: fast_frem_f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: unsafe_frem_f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: frem_v2f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
-; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v4, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT:    v_add_f32_e32 v4, v6, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fmac_f16_e32 v3, v4, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v1
-; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v5
-; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v4, v6, v5
-; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v6, v5
-; GFX1150-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
-; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v4, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v2f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_b32 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v5, v3.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v5
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v6, v5
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v5, v0
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1150-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v2f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v4, v4, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v4, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v4, v6, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v3, v4, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v4, v4, v5
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v4, v6, v5
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v4, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1150-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: frem_v4f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v8, v8
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v8
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v9, v8
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
-; GFX1150-NEXT:    v_add_f32_e32 v6, v8, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fmac_f16_e32 v5, v6, v7
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
-; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT:    v_fma_f16 v0, v6, v2, v0
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v5, v8, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v5, v7, v5
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
-; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT:    v_fmac_f16_e32 v2, v5, v6
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v5, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT:    v_add_f32_e32 v5, v6, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v5, v3
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v4f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_b64 v[1:2], v5, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.h
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v3.h
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v3.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v7
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v10, v7
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v7, v0
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v4.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v6, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v6, v3
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v3, v0
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v4.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1150-TRUE16-NEXT:    global_store_b64 v5, v[1:2], s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v4f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v6, v8, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v5, v6, v7
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v6, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT:    v_fma_f16 v0, v6, v2, v0
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v5, v8, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v5, v7, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v2, v5, v6
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v5, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v5, v3
+; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX1150-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16



More information about the llvm-commits mailing list