[llvm] check for vgpr16 putting into vgpr32 case in v2s lowering (PR #138734)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Tue May 6 11:36:05 PDT 2025


https://github.com/broxigarchen created https://github.com/llvm/llvm-project/pull/138734

None

>From 80d793e63ebcd7b6f91946f48efee714f8360e03 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 6 May 2025 14:30:12 -0400
Subject: [PATCH] check for vgpr16 putting into vgpr32 case in v2s lowering

---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  69 +-
 .../AMDGPU/fix-sgpr-copies-f16-true16.mir     |  15 +-
 llvm/test/CodeGen/AMDGPU/frem.ll              | 751 ++++++++++++------
 3 files changed, 564 insertions(+), 271 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e6d54860df221..ac9995d7e6c0a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7235,24 +7235,44 @@ bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
   return DeferredList.contains(MI);
 }
 
-// 16bit SALU use sgpr32. If a 16bit SALU get lowered to VALU in true16 mode,
-// sgpr32 is replaced to vgpr32 which is illegal in t16 inst. Need to add
-// subreg access properly. This can be removed after we have sgpr16 in place
-void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &Inst,
+// legalize operand between 16bit and 32bit registers in v2s copy
+// lowering (change spgr to vgpr).
+// This is mainly caused by 16bit SALU and 16bit VALU using reg with different
+// size. Need to legalize the size of the operands during the vgpr lowering
+// chain. This can be removed after we have sgpr16 in place
+void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
                                           MachineRegisterInfo &MRI) const {
-  unsigned Opcode = Inst.getOpcode();
-  if (!AMDGPU::isTrue16Inst(Opcode) || !ST.useRealTrue16Insts())
+  if (!ST.useRealTrue16Insts())
     return;
 
-  for (MachineOperand &Op : Inst.explicit_operands()) {
+  unsigned Opcode = MI.getOpcode();
+  MachineBasicBlock *MBB = MI.getParent();
+
+  // legalize operands and check for size mismatch
+  for (MachineOperand &Op : MI.explicit_operands()) {
     unsigned OpIdx = Op.getOperandNo();
     if (!OpIdx)
       continue;
-    if (Op.isReg() && RI.isVGPR(MRI, Op.getReg())) {
+    if (Op.isReg() && Op.getReg().isVirtual() && RI.isVGPR(MRI, Op.getReg())) {
       unsigned RCID = get(Opcode).operands()[OpIdx].RegClass;
-      const TargetRegisterClass *RC = RI.getRegClass(RCID);
-      if (RI.getRegSizeInBits(*RC) == 16) {
+      const TargetRegisterClass *ExpectedRC = RI.getRegClass(RCID);
+      const TargetRegisterClass *RC = MRI.getRegClass(Op.getReg());
+      if (32 == RI.getRegSizeInBits(*RC) &&
+          16 == RI.getRegSizeInBits(*ExpectedRC)) {
         Op.setSubReg(AMDGPU::lo16);
+      } else if (16 == RI.getRegSizeInBits(*RC) &&
+                 32 == RI.getRegSizeInBits(*ExpectedRC)) {
+        const DebugLoc &DL = MI.getDebugLoc();
+        Register NewDstReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
+        BuildMI(*MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
+        BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDstReg)
+            .addReg(Op.getReg())
+            .addImm(AMDGPU::lo16)
+            .addReg(Undef)
+            .addImm(AMDGPU::hi16);
+        Op.setReg(NewDstReg);
       }
     }
   }
@@ -7777,8 +7797,8 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       return;
     }
 
-    // If this is a v2s copy src from vgpr16 to sgpr32,
-    // replace vgpr copy to subreg_to_reg
+    // If this is a copy src from 16bit reg to 32bit reg,
+    // replace copy to a reg_sequence
     // This can be remove after we have sgpr16 in place
     if (ST.useRealTrue16Insts() && Inst.isCopy() &&
         Inst.getOperand(1).getReg().isVirtual() &&
@@ -7787,14 +7807,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
       if (16 == RI.getRegSizeInBits(*SrcRegRC) &&
           32 == RI.getRegSizeInBits(*NewDstRC)) {
         Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+        Register Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_16RegClass);
         BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
-                get(TargetOpcode::SUBREG_TO_REG), NewDstReg)
-            .add(MachineOperand::CreateImm(0))
-            .add(Inst.getOperand(1))
-            .add(MachineOperand::CreateImm(AMDGPU::lo16));
+                get(AMDGPU::IMPLICIT_DEF), Undef);
+        BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
+                get(AMDGPU::REG_SEQUENCE), NewDstReg)
+            .addReg(Inst.getOperand(1).getReg())
+            .addImm(AMDGPU::lo16)
+            .addReg(Undef)
+            .addImm(AMDGPU::hi16);
         Inst.eraseFromParent();
-
         MRI.replaceRegWith(DstReg, NewDstReg);
+        // legalize useMI with mismatched size
+        for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+                                               E = MRI.use_end();
+             I != E; ++I) {
+          MachineInstr &UseMI = *I->getParent();
+          unsigned UseMIOpcode = UseMI.getOpcode();
+          if (AMDGPU::isTrue16Inst(UseMIOpcode) &&
+              (16 ==
+               RI.getRegSizeInBits(*getOpRegClass(UseMI, I.getOperandNo())))) {
+            I->setSubReg(AMDGPU::lo16);
+          }
+        }
         addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
         return;
       }
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 6e24d9afa2bbc..f9db082a2e912 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -9,8 +9,9 @@ body:             |
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
-    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[SUBREG_TO_REG]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16, [[DEF2]], %subreg.hi16
+    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, [[DEF1]], 0, 0, implicit $mode, implicit $exec
     ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:sreg_32 = IMPLICIT_DEF
@@ -28,8 +29,9 @@ body:             |
     ; GCN-LABEL: name: cvt_hi_f32_f16
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
-    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[SUBREG_TO_REG]]
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]]
     ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY]].hi16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
@@ -44,8 +46,9 @@ body:             |
     ; GCN-LABEL: name: s_or_b32
     ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:vgpr_32 = SUBREG_TO_REG 0, [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16
-    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[SUBREG_TO_REG]], [[SUBREG_TO_REG]], implicit $exec
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_U16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+    ; GCN-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[REG_SEQUENCE]], [[REG_SEQUENCE]], implicit $exec
     ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_1:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[V_OR_B32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
     %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 125d009429cbf..7a1351174733b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -6,7 +6,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX1150 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
 
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
@@ -255,42 +256,81 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: frem_f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT:    v_mul_f32_e32 v3, v3, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v3, v5, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v5, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
-; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: frem_f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -456,26 +496,47 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: fast_frem_f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: fast_frem_f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: fast_frem_f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -641,26 +702,47 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX11-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: unsafe_frem_f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_rcp_f16_e32 v3, v2
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f16_e32 v3, v1, v3
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1150-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -2308,68 +2390,130 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: frem_v2f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX1150-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
-; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v4, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT:    v_add_f32_e32 v4, v6, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fmac_f16_e32 v3, v4, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v1
-; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v5
-; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v4, v6, v5
-; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v6, v5
-; GFX1150-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
-; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v4, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v2f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_b32 v2, v1, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v5, v3.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v5
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v6, v5
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v5, v0
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1150-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v2f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v4, v4, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v4, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v4, v6, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v3, v4, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v4, v4, v5
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v4, v6, v5
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v4, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1150-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3034,115 +3178,226 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
 ;
-; GFX1150-LABEL: frem_v4f16:
-; GFX1150:       ; %bb.0:
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    s_clause 0x1
-; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
-; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
-; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v8, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v8, v8
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v8
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v9, v8
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v8, v9, v8
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
-; GFX1150-NEXT:    v_add_f32_e32 v6, v8, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fmac_f16_e32 v5, v6, v7
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v6, v8, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
-; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
-; GFX1150-NEXT:    v_fma_f16 v0, v6, v2, v0
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1150-NEXT:    v_pack_b32_f16 v0, v0, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f32_e32 v5, v8, v7
-; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
-; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_add_f32_e32 v5, v7, v5
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
-; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT:    v_fmac_f16_e32 v2, v5, v6
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_fmac_f32_e32 v5, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
-; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
-; GFX1150-NEXT:    v_add_f32_e32 v5, v6, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v5, v3
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v2
-; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
-; GFX1150-NEXT:    s_endpgm
+; GFX1150-TRUE16-LABEL: frem_v4f16:
+; GFX1150-TRUE16:       ; %bb.0:
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX1150-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-TRUE16-NEXT:    s_clause 0x1
+; GFX1150-TRUE16-NEXT:    global_load_b64 v[1:2], v5, s[2:3]
+; GFX1150-TRUE16-NEXT:    global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.h
+; GFX1150-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v3.h
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v3.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v7
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v10, v7
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v7, v0
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v4.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.h
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v6, v3
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v3, v6, v3
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v3, v0
+; GFX1150-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-TRUE16-NEXT:    v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v4.l
+; GFX1150-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX1150-TRUE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v6
+; GFX1150-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_add_f32_e32 v0, v6, v0
+; GFX1150-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1150-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1150-TRUE16-NEXT:    global_store_b64 v5, v[1:2], s[0:1]
+; GFX1150-TRUE16-NEXT:    s_endpgm
+;
+; GFX1150-FAKE16-LABEL: frem_v4f16:
+; GFX1150-FAKE16:       ; %bb.0:
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1150-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1150-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1150-FAKE16-NEXT:    s_clause 0x1
+; GFX1150-FAKE16-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
+; GFX1150-FAKE16-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX1150-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v6, v8, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v5, v6, v7
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v6, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-FAKE16-NEXT:    v_fma_f16 v0, v6, v2, v0
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v5, v8, v7
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v5, v7, v5
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v2, v5, v6
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX1150-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_fmac_f32_e32 v5, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-FAKE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-FAKE16-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-FAKE16-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v5, v3
+; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX1150-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX1150-FAKE16-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16



More information about the llvm-commits mailing list