[llvm] 2e58f60 - [AMDGPU][True16] t16 pseudo for mubuffer d16 load/store (#178822)

via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 4 07:54:16 PST 2026


Author: Brox Chen
Date: 2026-02-04T10:54:11-05:00
New Revision: 2e58f6024a9257f5e506a033f48c9239212dc21d

URL: https://github.com/llvm/llvm-project/commit/2e58f6024a9257f5e506a033f48c9239212dc21d
DIFF: https://github.com/llvm/llvm-project/commit/2e58f6024a9257f5e506a033f48c9239212dc21d.diff

LOG: [AMDGPU][True16] t16 pseudo for mubuffer d16 load/store (#178822)

Create t16 pseudos for the mubuffer d16 load/store instructions, with
vgpr16 in vdst/vdata, and use these t16 pseudos in the isel patterns.
They are lowered back to the d16 machine instructions at the MC level.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/BUFInstructions.td
    llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
    llvm/test/CodeGen/AMDGPU/fadd.f16.ll
    llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
    llvm/test/CodeGen/AMDGPU/fmax3.ll
    llvm/test/CodeGen/AMDGPU/fmin3.ll
    llvm/test/CodeGen/AMDGPU/fmul.f16.ll
    llvm/test/CodeGen/AMDGPU/fpext.f16.ll
    llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
    llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.ll
    llvm/test/CodeGen/AMDGPU/fsub.f16.ll
    llvm/test/CodeGen/AMDGPU/function-args.ll
    llvm/test/CodeGen/AMDGPU/function-returns.ll
    llvm/test/CodeGen/AMDGPU/imm16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
    llvm/test/CodeGen/AMDGPU/select.f16.ll
    llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
    llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
    llvm/test/CodeGen/AMDGPU/v_madak_f16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index ca8099736c187..48904582adba6 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -393,7 +393,7 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
   let sccb_value  = 0;
 }
 
-class getBUFVDataRegisterOperand<int Size, bit isTFE> {
+class getBUFVDataRegisterOperand<int Size, bit isTFE, bit isTrue16 = false> {
   defvar tfeVDataOp =
     !cond(!eq(Size, 16)  : AVLdSt_64,
           !eq(Size, 32)  : AVLdSt_64,
@@ -402,7 +402,7 @@ class getBUFVDataRegisterOperand<int Size, bit isTFE> {
           !eq(Size, 128) : AVLdSt_160);
 
   defvar VDataOp =
-    !cond(!eq(Size, 16)   : AVLdSt_32,
+    !cond(!eq(Size, 16)   : !if(isTrue16, VGPROp_16, AVLdSt_32),
           !eq(Size, 32)   : AVLdSt_32,
           !eq(Size, 64)   : AVLdSt_64,
           !eq(Size, 96)   : AVLdSt_96,
@@ -417,10 +417,10 @@ class getBUFVDataRegisterOperandForOp<RegisterOperand Op, bit isTFE> {
 }
 
 class getMUBUFInsDA<list<RegisterOperand> vdataList,
-                    list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset> {
+                    list<RegisterClassLike> vaddrList, bit isTFE, bit hasRestrictedSOffset, bit isTrue16 = false> {
   RegisterOperand vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
   RegisterClassLike vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
-  RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE>.ret;
+  RegisterOperand vdata_op = getBUFVDataRegisterOperand<!cast<SIRegisterClassLike>(vdataClass.RegClass).Size, isTFE, isTrue16>.ret;
 
   dag SOffset = !if(hasRestrictedSOffset, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
   dag NonVaddrInputs = !con((ins SReg_128_XNULL:$srsrc), SOffset, (ins Offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
@@ -448,13 +448,13 @@ class getMUBUFElements<ValueType vt> {
     );
 }
 
-class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset> {
+class getMUBUFIns<int addrKind, list<RegisterOperand> vdataList, bit isTFE, bit hasRestrictedSOffset, bit isTrue16 = false> {
   dag ret =
-    !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset>.ret,
-    !if(!eq(addrKind, BUFAddrKind.OffEn),  getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
-    !if(!eq(addrKind, BUFAddrKind.IdxEn),  getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset>.ret,
-    !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
-    !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset>.ret,
+    !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasRestrictedSOffset, isTrue16>.ret,
+    !if(!eq(addrKind, BUFAddrKind.OffEn),  getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset, isTrue16>.ret,
+    !if(!eq(addrKind, BUFAddrKind.IdxEn),  getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasRestrictedSOffset, isTrue16>.ret,
+    !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset, isTrue16>.ret,
+    !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64_AlignTarget], isTFE, hasRestrictedSOffset, isTrue16>.ret,
     (ins))))));
 }
 
@@ -566,6 +566,33 @@ multiclass MUBUF_Pseudo_Loads_Helper<string opName, ValueType load_vt,
   }
 }
 
+multiclass MUBUF_Pseudo_Loads_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType load_vt,
+                                     bit TiedDest, bit isLds, bit isTFE, bit hasRestrictedSOffset> {
+  def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+    MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">;
+
+  def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+    MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">;
+
+  def _OFFEN  : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+    True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">;
+  def _IDXEN  : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+    True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">;
+  def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+    True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">;
+
+  let DisableWQM = 1 in {
+    def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+      True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">;
+    def _OFFEN_exact  : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+      True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">;
+    def _IDXEN_exact  : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+      True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">;
+    def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds, 0, isTFE, hasRestrictedSOffset, [], VGPROp_16>,
+      True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">;
+  }
+}
+
 multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
                               bit TiedDest = 0, bit isLds = 0> {
   defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>;
@@ -577,6 +604,23 @@ multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
   }
 }
 
+multiclass MUBUF_Pseudo_Loads_t16<string opName, ValueType load_vt = i32,
+                                  bit TiedDest = 0, bit isLds = 0, string hiOpName = NAME#"_HI"> {
+  let True16Predicate = NotUseRealTrue16Insts in {
+    defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>;
+    defm _VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 1>;
+  }
+  let True16Predicate = UseRealTrue16Insts in {
+    defvar NAME16 = opName#"_t16";
+    defm _t16 : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName, NAME, i16, 0, isLds, 0, 0>;
+    defm _t16_VBUFFER : MUBUF_Pseudo_Loads_Helper_t16<NAME16, hiOpName#"_VBUFFER", NAME#"_VBUFFER", i16, 0, isLds, 0, 1>;
+  }
+  if !not(isLds) then {
+    defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 0>;
+    defm _TFE_VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 1>;
+  }
+}
+
 multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32, Predicate LDSPred = TruePredicate> {
   defm NAME : MUBUF_Pseudo_Loads<opName, load_vt>;
 
@@ -595,10 +639,11 @@ class MUBUF_Store_Pseudo <string opName,
                           ValueType store_vt,
                           bit isTFE = 0,
                           bit hasRestrictedSOffset = 0,
-                          list<dag> pattern=[]>
+                          list<dag> pattern=[],
+                          bit isTrue16 = false>
   : MUBUF_Pseudo<opName,
                  (outs),
-                 getMUBUFIns<addrKind, [getVregSrcForVT<store_vt>.ret], isTFE, hasRestrictedSOffset>.ret,
+                 getMUBUFIns<addrKind, [getVregSrcForVT<store_vt, isTrue16, 0>.ret], isTFE, hasRestrictedSOffset, isTrue16>.ret,
                  getMUBUFAsmOps<addrKind, 0, 0, isTFE>.ret,
                  pattern>,
     MUBUF_SetupAddr<addrKind> {
@@ -650,6 +695,33 @@ multiclass MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt,
   }
 }
 
+multiclass MUBUF_Pseudo_Stores_Helper_t16<string opName, string Hi16Name, string Lo16Name, ValueType store_vt,
+                                          bit isTFE, bit hasRestrictedSOffset> {
+  def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+    MUBUFAddr64Table<0, NAME>, True16D16Table<Hi16Name#"_OFFSET", Lo16Name#"_OFFSET">;
+
+  def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+    MUBUFAddr64Table<1, NAME>, True16D16Table<Hi16Name#"_ADDR64", Lo16Name#"_ADDR64">;
+
+  def _OFFEN  : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+    True16D16Table<Hi16Name#"_OFFEN", Lo16Name#"_OFFEN">;
+  def _IDXEN  : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+    True16D16Table<Hi16Name#"_IDXEN", Lo16Name#"_IDXEN">;
+  def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+    True16D16Table<Hi16Name#"_BOTHEN", Lo16Name#"_BOTHEN">;
+
+  let DisableWQM = 1 in {
+    def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+      True16D16Table<Hi16Name#"_OFFSET_exact", Lo16Name#"_OFFSET_exact">;
+    def _OFFEN_exact  : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+      True16D16Table<Hi16Name#"_OFFEN_exact", Lo16Name#"_OFFEN_exact">;
+    def _IDXEN_exact  : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+      True16D16Table<Hi16Name#"_IDXEN_exact", Lo16Name#"_IDXEN_exact">;
+    def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt, isTFE, hasRestrictedSOffset, [], 1/*isTrue16*/>,
+      True16D16Table<Hi16Name#"_BOTHEN_exact", Lo16Name#"_BOTHEN_exact">;
+  }
+}
+
 multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> {
   defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>;
   defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>;
@@ -658,6 +730,22 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> {
   defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>;
 }
 
+multiclass MUBUF_Pseudo_Stores_t16<string opName, ValueType store_vt = i32> {
+  defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>;
+  defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>;
+
+  let True16Predicate = NotUseRealTrue16Insts in {
+    defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>;
+
+    defm _VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 1>;
+  }
+  let True16Predicate = UseRealTrue16Insts, SubtargetPredicate = HasD16LoadStore in {
+    defvar NAME16 = opName#"_t16";
+    defm _t16 : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI", NAME, i16, 0, 0>;
+    defm _t16_VBUFFER : MUBUF_Pseudo_Stores_Helper_t16<NAME16, NAME#"_D16_HI_VBUFFER", NAME#"_VBUFFER", i16, 0, 1>;
+  }
+}
+
 class MUBUF_Pseudo_Store_Lds<string opName>
   : MUBUF_Pseudo<opName,
                  (outs),
@@ -871,10 +959,16 @@ let TiedSourceNotRead = 1 in {
   >;
 } // End OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1.
 
+
+let TiedSourceNotRead = 1, SubtargetPredicate = HasD16LoadStore, OtherPredicates = [HasFormattedMUBUFInsts] in
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
+  "buffer_load_format_d16_hi_x", i32
+>;
+
 let OtherPredicates = [HasPackedD16VMem], D16Buf = 1 in {
 let TiedSourceNotRead = 1 in {
-  defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
-    "buffer_load_format_d16_x", f16
+  defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads_t16 <
+    "buffer_load_format_d16_x", f16, 0, 0, "BUFFER_LOAD_FORMAT_D16_HI_X"
   >;
   defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
     "buffer_load_format_d16_xy", v2f16
@@ -930,9 +1024,6 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_aext_8_globa
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, atomic_load_zext_8_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_aext_16_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, atomic_load_zext_16_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, zextloadi8_global>;
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
@@ -959,12 +1050,23 @@ foreach vt = VReg_128.RegTypes in {
 defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>;
 }
 
-defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
+let SubtargetPredicate = HasD16LoadStore in {
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
+  "buffer_store_byte_d16_hi", i32
+>;
+
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
+  "buffer_store_short_d16_hi", i32
+>;
+}
+
+defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores_t16 <
   "buffer_store_byte", i32
 >;
-defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
+defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores_t16 <
   "buffer_store_short", i32
 >;
+
 defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
   "buffer_store_dword", i32
 >;
@@ -1121,43 +1223,30 @@ defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
 let SubtargetPredicate = HasD16LoadStore in {
 let TiedSourceNotRead = 1 in {
 
-defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
-  "buffer_load_ubyte_d16", i32, 1
->;
-
 defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
   "buffer_load_ubyte_d16_hi", i32, 1
 >;
 
-defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
-  "buffer_load_sbyte_d16", i32, 1
->;
-
 defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
   "buffer_load_sbyte_d16_hi", i32, 1
 >;
 
-defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
-  "buffer_load_short_d16", i32, 1
->;
-
 defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
   "buffer_load_short_d16_hi", i32, 1
 >;
 
-let OtherPredicates = [HasFormattedMUBUFInsts] in
-defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
-  "buffer_load_format_d16_hi_x", i32
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads_t16 <
+  "buffer_load_ubyte_d16", i32, 1
 >;
-} // End TiedSourceNotRead
 
-defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
-  "buffer_store_byte_d16_hi", i32
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads_t16 <
+  "buffer_load_sbyte_d16", i32, 1
 >;
 
-defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
-  "buffer_store_short_d16_hi", i32
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads_t16 <
+  "buffer_load_short_d16", i32, 1
 >;
+} // End TiedSourceNotRead
 
 let OtherPredicates = [HasFormattedMUBUFInsts] in
 defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
@@ -1166,6 +1255,18 @@ defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
 
 } // End HasD16LoadStore
 
+let True16Predicate = NotUseRealTrue16Insts in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_aext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i16, atomic_load_zext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i16, atomic_load_nonext_16_global>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_aext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE_D16_t16", i16, atomic_load_zext_8_global>;
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SHORT_D16_t16", i16, atomic_load_nonext_16_global>;
+}
+
 let SubtargetPredicate = isNotGFX940Plus in
 def BUFFER_WBINVL1 : MUBUF_Invalidate <
   "buffer_wbinvl1", int_amdgcn_buffer_wbinvl1
@@ -1367,8 +1468,14 @@ let OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts] in {
 } // End OtherPredicates = [HasUnpackedD16VMem, HasFormattedMUBUFInsts].
 
 let OtherPredicates = [HasPackedD16VMem, HasFormattedMUBUFInsts] in {
+let True16Predicate = NotUseRealTrue16Insts in {
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">;
+}
+let True16Predicate = UseRealTrue16Insts in {
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_t16">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_t16">;
+}
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
@@ -1934,15 +2041,26 @@ multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt,
 }
 
 let OtherPredicates = [Has16BitInsts] in {
-
+let True16Predicate = NotUseRealTrue16Insts in {
 defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_constant>;
 defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_constant>;
 defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_constant>;
 defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_global>;
 defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_global>;
 defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_global>;
-
 defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_USHORT", i16, load_global>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SHORT_D16_t16", i16, load_global>;
+}
+
 
 } // End OtherPredicates = [Has16BitInsts]
 
@@ -1982,6 +2100,19 @@ multiclass MUBUFScratchLoadPat_D16_Common <string Instr,
   >;
 }
 
+multiclass MUBUFScratchLoadPat_D16_Common_t16 <string Instr, ValueType vt, PatFrag ld_frag> {
+  def : GCNPat <
+    (vt (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset))),
+    (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset)
+  >;
+
+  def : GCNPat <
+    (vt (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
+    (!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset)
+  >;
+}
+
+
 multiclass MUBUFScratchLoadPat_D16 <string Instr,
                                 ValueType vt, PatFrag ld_frag> {
   let SubtargetPredicate = HasUnrestrictedSOffset in {
@@ -1990,17 +2121,35 @@ multiclass MUBUFScratchLoadPat_D16 <string Instr,
   defm : MUBUFScratchLoadPat_D16_Common<Instr # "_VBUFFER", vt, ld_frag>;
 }
 
+multiclass MUBUFScratchLoadPat_D16_t16 <string Instr,
+                                ValueType vt, PatFrag ld_frag> {
+  let SubtargetPredicate = HasUnrestrictedSOffset in {
+    defm : MUBUFScratchLoadPat_D16_Common_t16<Instr, vt, ld_frag>;
+  }
+  defm : MUBUFScratchLoadPat_D16_Common_t16<Instr # "_VBUFFER", vt, ld_frag>;
+}
+
 let OtherPredicates = [NotHasFlatScratchEnabled] in {
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i32, sextloadi8_private>;
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, extloadi8_private>;
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, zextloadi8_private>;
-defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>;
-defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>;
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SSHORT", i32, sextloadi16_private>;
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, extloadi16_private>;
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, zextloadi16_private>;
+
+let True16Predicate = NotUseRealTrue16Insts in {
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>;
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i16, load_private>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SBYTE_D16_t16", i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_UBYTE_D16_t16", i16, zextloadi8_private>;
+defm : MUBUFScratchLoadPat_D16_t16 <"BUFFER_LOAD_SHORT_D16_t16", i16, load_private>;
+}
 
 foreach vt = Reg32Types.types in {
 defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORD", vt, load_private>;
@@ -2066,8 +2215,15 @@ multiclass MUBUFStore_PatternOffset <string Instr, ValueType vt,
   defm : MUBUFStore_PatternOffset_Common<Instr # "_VBUFFER", vt, st>;
 }
 
+let True16Predicate = NotUseRealTrue16Insts in {
 defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE", i16, truncstorei8_global>;
 defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_global>;
+defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT_t16", i16, store_global>;
+}
 
 multiclass MUBUFScratchStorePat_Common <string Instr,
                                  ValueType vt, PatFrag st,
@@ -2097,8 +2253,16 @@ multiclass MUBUFScratchStorePat <string Instr,
 let OtherPredicates = [NotHasFlatScratchEnabled] in {
 defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i32, truncstorei8_private>;
 defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i32, truncstorei16_private>;
+
+let True16Predicate = NotUseRealTrue16Insts in {
 defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i16, truncstorei8_private>;
 defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i16, store_private>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_t16", i16, truncstorei8_private, VGPR_16>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_t16", i16, store_private, VGPR_16>;
+}
 
 foreach vt = Reg32Types.types in {
 defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORD", vt, store_private>;

diff  --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 02d7b50e23b5d..f8b35e54e3bc0 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -71,21 +71,20 @@ define amdgpu_kernel void @br_cc_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[8:11], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
-; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.h, v1.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v2.l, v2.h
+; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB0_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %one
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ; GFX11-TRUE16-NEXT:  .LBB0_2: ; %two
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -180,22 +179,24 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3800
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, 0.5, v1.l
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB1_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %one
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x3800
-; GFX11-TRUE16-NEXT:  .LBB1_2: ; %two
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
+; GFX11-TRUE16-NEXT:  .LBB1_2: ; %two
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-FAKE16-LABEL: br_cc_f16_imm_a:
 ; GFX11-FAKE16:       ; %bb.0: ; %entry
@@ -283,18 +284,20 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3800
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, 0.5, v1.l
-; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB2_2
-; GFX11-TRUE16-NEXT:  ; %bb.1: ; %two
-; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v0, 0x3800
-; GFX11-TRUE16-NEXT:  .LBB2_2: ; %one
+; GFX11-TRUE16-NEXT:    s_cbranch_vccnz .LBB2_2
+; GFX11-TRUE16-NEXT:  ; %bb.1: ; %one
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
+; GFX11-TRUE16-NEXT:  .LBB2_2: ; %two
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, s7
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index d3881660bb846..293e24f2d1b9d 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5799,22 +5799,39 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
 ; SDAG-NEXT:    s_swappc_b64 s[30:31], s[4:5]
 ; SDAG-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_getpc_b64 s[2:3]
-; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4
-; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s32, 0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_load_u8 v0, off, s[4:7], 0
-; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0 offset:4
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
-; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-TRUE16-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4
+; GFX11-TRUE16-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12
+; GFX11-TRUE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    buffer_load_d16_u8 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, off, s[4:7], 0 offset:4
+; GFX11-TRUE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-TRUE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-FAKE16-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4
+; GFX11-FAKE16-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12
+; GFX11-FAKE16-NEXT:    s_mov_b32 s32, 0
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    buffer_load_u8 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, off, s[4:7], 0 offset:4
+; GFX11-FAKE16-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-FAKE16-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-FAKE16-NEXT:    s_endpgm
 ;
 ; HSA-LABEL: test_call_external_void_func_struct_i8_i32:
 ; HSA:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index edae2c393e5f0..a36be284a4bc3 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -216,19 +216,34 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: extract_vector_elt_v3f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s6, -1
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    s_mov_b32 s5, s1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
-; GFX11-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 offset:2
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: extract_vector_elt_v3f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v1, s3
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 offset:2
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: extract_vector_elt_v3f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 offset:2
+; GFX11-FAKE16-NEXT:    s_endpgm
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
   %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -269,20 +284,35 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %ou
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x34
-; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    s_lshl_b32 s4, s6, 4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_endpgm
+; GFX11-TRUE16-LABEL: dynamic_extract_vector_elt_v3f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_lshl_b32 s4, s6, 4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-TRUE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: dynamic_extract_vector_elt_v3f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_lshl_b32 s4, s6, 4
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-FAKE16-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
   %p0 = extractelement <3 x half> %foo, i32 %idx
   %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
   store half %p0, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 0b7533e2ecced..e8c96787db66c 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -72,11 +72,11 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s3
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, s10
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, s11
-; GFX11-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0 glc dlc
+; GFX11-SDAG-NEXT:    buffer_load_d16_b16 v0, off, s[0:3], 0 glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-SDAG-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
@@ -92,11 +92,11 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_mov_b64 s[8:9], s[2:3]
 ; GFX11-GISEL-NEXT:    s_mov_b64 s[2:3], s[10:11]
-; GFX11-GISEL-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-GISEL-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0 glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-GISEL-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-GISEL-NEXT:    s_endpgm
 ;
@@ -232,7 +232,7 @@ define amdgpu_kernel void @fadd_f16_imm_a(
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s3
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, s6
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT:    buffer_load_d16_b16 v0, off, s[0:3], 0
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
@@ -246,7 +246,7 @@ define amdgpu_kernel void @fadd_f16_imm_a(
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX11-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT:    buffer_load_d16_b16 v0, off, s[4:7], 0
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
@@ -364,7 +364,7 @@ define amdgpu_kernel void @fadd_f16_imm_b(
 ; GFX11-SDAG-NEXT:    s_mov_b32 s1, s3
 ; GFX11-SDAG-NEXT:    s_mov_b32 s2, s6
 ; GFX11-SDAG-NEXT:    s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT:    buffer_load_d16_b16 v0, off, s[0:3], 0
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-SDAG-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
@@ -378,7 +378,7 @@ define amdgpu_kernel void @fadd_f16_imm_b(
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX11-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT:    buffer_load_d16_b16 v0, off, s[4:7], 0
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, 2.0, v0.l
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 5d3f69c84b902..0a6619810ad0e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -71,13 +71,13 @@ define amdgpu_kernel void @fcmp_f16_lt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -121,13 +121,13 @@ define amdgpu_kernel void @fcmp_f16_lt(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -233,13 +233,13 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e64 s2, |v0.l|, |v1.l|
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e64 s2, |v0.l|, |v0.h|
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s2
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
@@ -285,9 +285,9 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
@@ -405,13 +405,13 @@ define amdgpu_kernel void @fcmp_f16_eq(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -455,13 +455,13 @@ define amdgpu_kernel void @fcmp_f16_eq(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_eq_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -567,13 +567,13 @@ define amdgpu_kernel void @fcmp_f16_le(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -617,13 +617,13 @@ define amdgpu_kernel void @fcmp_f16_le(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_le_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -729,13 +729,13 @@ define amdgpu_kernel void @fcmp_f16_gt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -779,13 +779,13 @@ define amdgpu_kernel void @fcmp_f16_gt(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -891,13 +891,13 @@ define amdgpu_kernel void @fcmp_f16_lg(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -941,13 +941,13 @@ define amdgpu_kernel void @fcmp_f16_lg(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -1053,13 +1053,13 @@ define amdgpu_kernel void @fcmp_f16_ge(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1103,13 +1103,13 @@ define amdgpu_kernel void @fcmp_f16_ge(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_ge_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -1215,13 +1215,13 @@ define amdgpu_kernel void @fcmp_f16_o(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1265,13 +1265,13 @@ define amdgpu_kernel void @fcmp_f16_o(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -1377,13 +1377,13 @@ define amdgpu_kernel void @fcmp_f16_u(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1427,13 +1427,13 @@ define amdgpu_kernel void @fcmp_f16_u(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_u_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -1539,13 +1539,13 @@ define amdgpu_kernel void @fcmp_f16_nge(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1589,13 +1589,13 @@ define amdgpu_kernel void @fcmp_f16_nge(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_nge_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -1701,13 +1701,13 @@ define amdgpu_kernel void @fcmp_f16_nlg(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1751,13 +1751,13 @@ define amdgpu_kernel void @fcmp_f16_nlg(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_nlg_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -1863,13 +1863,13 @@ define amdgpu_kernel void @fcmp_f16_ngt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -1913,13 +1913,13 @@ define amdgpu_kernel void @fcmp_f16_ngt(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_ngt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -2025,13 +2025,13 @@ define amdgpu_kernel void @fcmp_f16_nle(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -2075,13 +2075,13 @@ define amdgpu_kernel void @fcmp_f16_nle(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_nle_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -2187,13 +2187,13 @@ define amdgpu_kernel void @fcmp_f16_neq(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -2237,13 +2237,13 @@ define amdgpu_kernel void @fcmp_f16_neq(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_neq_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
@@ -2349,13 +2349,13 @@ define amdgpu_kernel void @fcmp_f16_nlt(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -2399,13 +2399,13 @@ define amdgpu_kernel void @fcmp_f16_nlt(
 ; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
+; GFX12-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc_lo
 ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 94f7eee4a6efb..4368f2a5de3b1 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -510,15 +510,15 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v0.h, v1.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -570,15 +570,15 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v0.h, v1.l
 ; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
@@ -631,15 +631,15 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s17, s13
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s20, s14
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s21, s15
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s0, s8
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s1, s9
-; GFX1250-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v0.h, v1.l
 ; GFX1250-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-TRUE16-NEXT:    s_endpgm
 ;
@@ -803,15 +803,15 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_max3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_max3_f16 v0.l, v1.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -863,15 +863,15 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    v_max3_num_f16 v0.l, v1.l, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
@@ -924,15 +924,15 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s17, s13
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s20, s14
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s21, s15
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s0, s8
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s1, s9
-; GFX1250-TRUE16-NEXT:    v_max3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT:    v_max3_num_f16 v0.l, v1.l, v0.l, v0.h
 ; GFX1250-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-TRUE16-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 6be2eb93ee25c..142bdd42b2c00 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -510,15 +510,15 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v0.h, v1.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -570,15 +570,15 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v0.h, v1.l
 ; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
@@ -631,15 +631,15 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s17, s13
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s20, s14
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s21, s15
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v0, off, s[4:7], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s0, s8
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s1, s9
-; GFX1250-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v0.h, v1.l
 ; GFX1250-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-TRUE16-NEXT:    s_endpgm
 ;
@@ -803,15 +803,15 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_min3_f16 v0.l, v0.l, v1.l, v2.l
+; GFX11-TRUE16-NEXT:    v_min3_f16 v0.l, v1.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -863,15 +863,15 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v1, off, s[12:15], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX12-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX12-TRUE16-NEXT:    v_min3_num_f16 v0.l, v1.l, v0.l, v0.h
 ; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
@@ -924,15 +924,15 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s17, s13
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s20, s14
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s21, s15
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[4:7], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
-; GFX1250-TRUE16-NEXT:    buffer_load_u16 v0, off, s[20:23], null scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], null scope:SCOPE_SYS
 ; GFX1250-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s0, s8
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s1, s9
-; GFX1250-TRUE16-NEXT:    v_min3_num_f16 v0.l, v0.l, v1.l, v2.l
+; GFX1250-TRUE16-NEXT:    v_min3_num_f16 v0.l, v1.l, v0.l, v0.h
 ; GFX1250-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-TRUE16-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 082006898b436..3bf4f5aafbe12 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -69,13 +69,13 @@ define amdgpu_kernel void @fmul_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -163,7 +163,7 @@ define amdgpu_kernel void @fmul_f16_imm_a(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x4200, v0.l
@@ -246,7 +246,7 @@ define amdgpu_kernel void @fmul_f16_imm_b(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 4.0, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 94c2d3364a769..fc7d87e92e1b3 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -136,7 +136,7 @@ define amdgpu_kernel void @fpext_f16_to_f64(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -465,7 +465,7 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
@@ -547,7 +547,7 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
@@ -629,7 +629,7 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0.l|
@@ -722,16 +722,16 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX11-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, -v0.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v2, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -825,14 +825,14 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v1.l, -v0.l, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -v0.l
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -v0.l
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, -v0.l, v0.l
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -925,16 +925,16 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, |v0.l|
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v2, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1028,14 +1028,14 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v1.l, |v0.l|, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, |v0.l|
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, |v0.l|
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, |v0.l|, v0.l
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1128,16 +1128,16 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, 0x8000, v1
+; GFX11-TRUE16-NEXT:    v_or_b32_e32 v1, 0x8000, v0
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, -|v0.l|
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0.l|
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v2, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -1232,14 +1232,14 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v1.l, -|v0.l|, v0.l
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v0, -|v0.l|
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v1, -|v0.l|
+; GFX11-TRUE16-NEXT:    v_mul_f16_e64 v0.l, -|v0.l|, v0.l
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index ac269ee0d5abe..db483562c46ab 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -54,7 +54,7 @@ define amdgpu_kernel void @fptosi_f16_to_i16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_i16_f16_e32 v0.l, v0.l
@@ -137,7 +137,7 @@ define amdgpu_kernel void @fptosi_f16_to_i32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -229,7 +229,7 @@ define amdgpu_kernel void @fptosi_f16_to_i64(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index af1ab37e48474..5b231eabad706 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -54,7 +54,7 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_u16_f16_e32 v0.l, v0.l
@@ -137,7 +137,7 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -230,7 +230,7 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 77faf363ca412..9dd8d8f230217 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -249,7 +249,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-GISEL-TRUE16-NEXT:    s_endpgm
@@ -512,7 +512,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
 ; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-GISEL-TRUE16-NEXT:    s_endpgm
@@ -1170,7 +1170,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX11-GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 0x8000
 ; GFX11-GISEL-TRUE16-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX11-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-GISEL-TRUE16-NEXT:    s_endpgm
@@ -1417,7 +1417,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 0x8000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_or_b32 s2, s3, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-GISEL-TRUE16-NEXT:    s_endpgm
@@ -4264,7 +4264,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-GISEL-TRUE16-NEXT:    s_endpgm
@@ -4534,7 +4534,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-GISEL-TRUE16-NEXT:    s_endpgm
@@ -4804,7 +4804,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
-; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
 ; GFX1250-GISEL-TRUE16-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index 0a1e3bb0979ac..0a2c59668714d 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -490,61 +490,173 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
 ; GFX11-SDAG-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_bfe_u32 s4, s3, 0xb0014
-; GFX11-GISEL-NEXT:    s_lshr_b32 s5, s3, 8
-; GFX11-GISEL-NEXT:    s_and_b32 s6, s3, 0x1ff
-; GFX11-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
-; GFX11-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
-; GFX11-GISEL-NEXT:    s_or_b32 s2, s6, s2
-; GFX11-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_or_b32 s2, s5, s2
-; GFX11-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
-; GFX11-GISEL-NEXT:    s_sub_i32 s6, 1, s4
-; GFX11-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
-; GFX11-GISEL-NEXT:    s_max_i32 s6, s6, 0
-; GFX11-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
-; GFX11-GISEL-NEXT:    s_min_i32 s6, s6, 13
-; GFX11-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
-; GFX11-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
-; GFX11-GISEL-NEXT:    s_or_b32 s2, s2, s7
-; GFX11-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
-; GFX11-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
-; GFX11-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_or_b32 s6, s9, s6
-; GFX11-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
-; GFX11-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
-; GFX11-GISEL-NEXT:    s_and_b32 s6, s2, 7
-; GFX11-GISEL-NEXT:    s_lshr_b32 s2, s2, 2
-; GFX11-GISEL-NEXT:    s_cmp_eq_u32 s6, 3
-; GFX11-GISEL-NEXT:    s_cselect_b32 s7, 1, 0
-; GFX11-GISEL-NEXT:    s_cmp_gt_i32 s6, 5
-; GFX11-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_or_b32 s6, s7, s6
-; GFX11-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX11-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_add_i32 s2, s2, s6
-; GFX11-GISEL-NEXT:    s_cmp_gt_i32 s4, 30
-; GFX11-GISEL-NEXT:    s_cselect_b32 s2, 0x7c00, s2
-; GFX11-GISEL-NEXT:    s_cmpk_eq_i32 s4, 0x40f
-; GFX11-GISEL-NEXT:    s_cselect_b32 s2, s5, s2
-; GFX11-GISEL-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT:    s_and_b32 s3, s3, 0x8000
-; GFX11-GISEL-NEXT:    s_or_b32 s2, s3, s2
-; GFX11-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
-; GFX11-GISEL-NEXT:    s_endpgm
+; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
+; GFX11-SAFE-GISEL:       ; %bb.0:
+; GFX11-SAFE-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SAFE-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SAFE-GISEL-NEXT:    s_bfe_u32 s4, s3, 0xb0014
+; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s5, s3, 8
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s6, s3, 0x1ff
+; GFX11-SAFE-GISEL-NEXT:    s_addk_i32 s4, 0xfc10
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s5, s5, 0xffe
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s6, s2
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s5, s2
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_sub_i32 s6, 1, s4
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s8, s2, 0x1000
+; GFX11-SAFE-GISEL-NEXT:    s_max_i32 s6, s6, 0
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s7, s4, 12
+; GFX11-SAFE-GISEL-NEXT:    s_min_i32 s6, s6, 13
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s5, s5, 9
+; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s9, s8, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s2, s7
+; GFX11-SAFE-GISEL-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s5, s5, 0x7c00
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, s8
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s9, s6
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lt_i32 s4, 1
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s6, s2, 7
+; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_add_i32 s2, s2, s6
+; GFX11-SAFE-GISEL-NEXT:    s_cmp_gt_i32 s4, 30
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, 0x7c00, s2
+; GFX11-SAFE-GISEL-NEXT:    s_cmpk_eq_i32 s4, 0x40f
+; GFX11-SAFE-GISEL-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX11-SAFE-GISEL-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX11-SAFE-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT:    s_and_b32 s3, s3, 0x8000
+; GFX11-SAFE-GISEL-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-SAFE-GISEL-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-SAFE-GISEL-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX11-SAFE-GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX11-SAFE-GISEL-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-GISEL-NEXT:    s_endpgm
+;
+; GFX11-UNSAFE-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-TRUE16:       ; %bb.0:
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_bfe_u32 s4, s3, 0xb0014
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_lshr_b32 s5, s3, 8
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_and_b32 s6, s3, 0x1ff
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_addk_i32 s4, 0xfc10
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_and_b32 s5, s5, 0xffe
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s2, s6, s2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s2, s5, s2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_sub_i32 s6, 1, s4
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s8, s2, 0x1000
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_max_i32 s6, s6, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_lshl_b32 s7, s4, 12
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_min_i32 s6, s6, 13
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_lshl_b32 s5, s5, 9
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_lshr_b32 s9, s8, s6
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s2, s2, s7
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s5, s5, 0x7c00
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s6, s8
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s6, s9, s6
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cmp_lt_i32 s4, 1
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_and_b32 s6, s2, 7
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_add_i32 s2, s2, s6
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cmp_gt_i32 s4, 30
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, 0x7c00, s2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cmpk_eq_i32 s4, 0x40f
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 0x8000
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-UNSAFE-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
+; GFX11-UNSAFE-GISEL-FAKE16:       ; %bb.0:
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_bfe_u32 s4, s3, 0xb0014
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_lshr_b32 s5, s3, 8
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_and_b32 s6, s3, 0x1ff
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_addk_i32 s4, 0xfc10
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_and_b32 s5, s5, 0xffe
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s2, s6, s2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s2, s5, s2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s5, 1, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_sub_i32 s6, 1, s4
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s8, s2, 0x1000
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_max_i32 s6, s6, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_lshl_b32 s7, s4, 12
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_min_i32 s6, s6, 13
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_lshl_b32 s5, s5, 9
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_lshr_b32 s9, s8, s6
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s2, s2, s7
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_lshl_b32 s6, s9, s6
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s5, s5, 0x7c00
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s6, s8
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s6, s9, s6
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cmp_lt_i32 s4, 1
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_and_b32 s6, s2, 7
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s7, 1, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cmp_gt_i32 s6, 5
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s6, s7, s6
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_add_i32 s2, s2, s6
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cmp_gt_i32 s4, 30
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, 0x7c00, s2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cmpk_eq_i32 s4, 0x40f
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_lshr_b32 s3, s3, 16
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_and_b32 s3, s3, 0x8000
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-GISEL-FAKE16-NEXT:    s_endpgm
   %result = fptrunc double %in to half
   %result_i16 = bitcast half %result to i16
   store i16 %result_i16, ptr addrspace(1) %out

diff  --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index b8b339964cf0e..5c4bb6f91a50d 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -69,13 +69,13 @@ define amdgpu_kernel void @fsub_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -163,7 +163,7 @@ define amdgpu_kernel void @fsub_f16_imm_a(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    v_sub_f16_e32 v0.l, 1.0, v0.l
@@ -246,7 +246,7 @@ define amdgpu_kernel void @fsub_f16_imm_b(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, -2.0, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 7d2a48779bef6..3928a44595761 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -2291,13 +2291,13 @@ define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x1
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v1, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v0, off, s32
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
 ; GFX11-TRUE16-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2402,21 +2402,21 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v1, off, s32 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v2, off, s32 offset:4 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v2, off, s32 offset:8 glc dlc
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v1, off, s32 offset:8 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
+; GFX11-TRUE16-NEXT:    scratch_load_b32 v3, off, s32 offset:12 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v2, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v1, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    ds_store_b32 v0, v0
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2652,11 +2652,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_clause 0x5
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-TRUE16-NEXT:    scratch_load_u8 v36, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_u8 v34, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v32, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v34, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v35, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v32, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_b16 v33, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_b16 v33, off, s32 offset:20
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
@@ -2668,6 +2668,8 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v16, 1, v34
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
@@ -2676,21 +2678,17 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v36
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v33, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b16 v32, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v33, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b16 v33, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4134,21 +4132,21 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX11-TRUE16-NEXT:    s_clause 0x10
 ; GFX11-TRUE16-NEXT:    scratch_load_b32 v31, off, s32
 ; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v32, off, s32 offset:64
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v33, off, s32 offset:60
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v34, off, s32 offset:56
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v35, off, s32 offset:52
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v36, off, s32 offset:48
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v37, off, s32 offset:44
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v38, off, s32 offset:40
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v39, off, s32 offset:36
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v48, off, s32 offset:32
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v49, off, s32 offset:28
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v50, off, s32 offset:24
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v51, off, s32 offset:20
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v52, off, s32 offset:16
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v53, off, s32 offset:12
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v54, off, s32 offset:8
-; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v55, off, s32 offset:4
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v32, off, s32 offset:60
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v33, off, s32 offset:56
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v33, off, s32 offset:52
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v34, off, s32 offset:48
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v34, off, s32 offset:44
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v35, off, s32 offset:40
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v35, off, s32 offset:36
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v36, off, s32 offset:32
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v36, off, s32 offset:28
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v37, off, s32 offset:24
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v37, off, s32 offset:20
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v38, off, s32 offset:16
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v38, off, s32 offset:12
+; GFX11-TRUE16-NEXT:    scratch_load_d16_u8 v39, off, s32 offset:8
+; GFX11-TRUE16-NEXT:    scratch_load_d16_hi_u8 v39, off, s32 offset:4
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(16)
@@ -4168,53 +4166,45 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(15)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
-; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(13)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v32, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(11)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v33, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v34, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(7)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v35, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(5)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v36, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v37, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v38, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b8 v39, off, s[0:3], 0 dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 4add34dd7d956..a7665ee31e608 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -84,14 +84,23 @@ define i8 @i8_func_void() #0 {
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: i8_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: i8_func_void:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    buffer_load_d16_u8 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: i8_func_void:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = load i8, ptr addrspace(1) poison
   ret i8 %val
 }
@@ -150,14 +159,23 @@ define i16 @i16_func_void() #0 {
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: i16_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: i16_func_void:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: i16_func_void:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) poison
   ret i16 %val
 }
@@ -1164,16 +1182,27 @@ define {i8, i32} @struct_i8_i32_func_void() #0 {
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: struct_i8_i32_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
-; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: struct_i8_i32_func_void:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    buffer_load_d16_u8 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: struct_i8_i32_func_void:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = load { i8, i32 }, ptr addrspace(1) poison
   ret { i8, i32 } %val
 }
@@ -1193,19 +1222,33 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: void_func_sret_struct_i8_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_load_u8 v1, off, s[0:3], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_load_b32 v2, off, s[0:3], 0 glc dlc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    scratch_store_b8 v0, v1, off
-; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: void_func_sret_struct_i8_i32:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    buffer_load_d16_u8 v1, off, s[0:3], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    buffer_load_b32 v2, off, s[0:3], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_clause 0x1
+; GFX11-TRUE16-NEXT:    scratch_store_b8 v0, v1, off
+; GFX11-TRUE16-NEXT:    scratch_store_b32 v0, v2, off offset:4
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: void_func_sret_struct_i8_i32:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    buffer_load_u8 v1, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    buffer_load_b32 v2, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_clause 0x1
+; GFX11-FAKE16-NEXT:    scratch_store_b8 v0, v1, off
+; GFX11-FAKE16-NEXT:    scratch_store_b32 v0, v2, off offset:4
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile i8, ptr addrspace(1) poison
   %val1 = load volatile i32, ptr addrspace(1) poison
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
@@ -2268,14 +2311,23 @@ define bfloat @bf16_func_void() #0 {
 ; GFX789-NEXT:    s_waitcnt vmcnt(0)
 ; GFX789-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: bf16_func_void:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
-; GFX11-NEXT:    s_mov_b32 s2, -1
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-TRUE16-LABEL: bf16_func_void:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[0:3], 0
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-FAKE16-LABEL: bf16_func_void:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %val = load bfloat, ptr addrspace(1) poison
   ret bfloat %val
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index ff40b1db5fe2f..0b2e5d540d463 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -19,16 +19,27 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0xfd,0xbb]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_neg_0.0_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_neg_0.0_i16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x8000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_neg_0.0_i16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 dlc ; encoding: [0x00,0x20,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_waitcnt_vscnt null, 0x0 ; encoding: [0x00,0x00,0x7c,0xbc]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_neg_0.0_i16:
 ; VI:       ; %bb.0:
@@ -66,15 +77,25 @@ define amdgpu_kernel void @store_inline_imm_0.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_0.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_0.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0 ; encoding: [0x80,0x38,0x00,0x7e]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_0.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0 ; encoding: [0x80,0x02,0x00,0x7e]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_0.0_f16:
 ; VI:       ; %bb.0:
@@ -110,15 +131,25 @@ define amdgpu_kernel void @store_imm_neg_0.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_imm_neg_0.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_imm_neg_0.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x8000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_imm_neg_0.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0xffff8000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x80,0xff,0xff]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_imm_neg_0.0_f16:
 ; VI:       ; %bb.0:
@@ -154,15 +185,25 @@ define amdgpu_kernel void @store_inline_imm_0.5_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_0.5_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_0.5_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3800 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x38,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_0.5_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x38,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_0.5_f16:
 ; VI:       ; %bb.0:
@@ -198,15 +239,25 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_m_0.5_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_0.5_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0xb800 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xb8,0xff,0xff]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_0.5_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0xffffb800 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xb8,0xff,0xff]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_0.5_f16:
 ; VI:       ; %bb.0:
@@ -242,15 +293,25 @@ define amdgpu_kernel void @store_inline_imm_1.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_1.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_1.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3c00 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x3c,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_1.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x3c,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_1.0_f16:
 ; VI:       ; %bb.0:
@@ -286,15 +347,25 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_m_1.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_1.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0xbc00 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xbc,0xff,0xff]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_1.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0xffffbc00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xbc,0xff,0xff]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_1.0_f16:
 ; VI:       ; %bb.0:
@@ -330,15 +401,25 @@ define amdgpu_kernel void @store_inline_imm_2.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_2.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_2.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x4000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x40,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_2.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x4000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x40,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_2.0_f16:
 ; VI:       ; %bb.0:
@@ -374,15 +455,25 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_m_2.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_2.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0xc000 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xc0,0xff,0xff]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_2.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0xffffc000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc0,0xff,0xff]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_2.0_f16:
 ; VI:       ; %bb.0:
@@ -418,15 +509,25 @@ define amdgpu_kernel void @store_inline_imm_4.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_4.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_4.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x4400 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x44,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_4.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x4400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x44,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_4.0_f16:
 ; VI:       ; %bb.0:
@@ -462,15 +563,25 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_m_4.0_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_4.0_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0xc400 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0xc4,0xff,0xff]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_4.0_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0xffffc400 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0xc4,0xff,0xff]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_4.0_f16:
 ; VI:       ; %bb.0:
@@ -506,15 +617,25 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_inv_2pi_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_inv_2pi_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3118 ; encoding: [0xff,0x38,0x00,0x7e,0x18,0x31,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_inv_2pi_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0x31,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_inv_2pi_f16:
 ; VI:       ; %bb.0:
@@ -550,15 +671,25 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(ptr addrspace(1) %out)
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_inline_imm_m_inv_2pi_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_inline_imm_m_inv_2pi_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0xb118 ; encoding: [0xff,0x38,0x00,0x7e,0x18,0xb1,0xff,0xff]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_inline_imm_m_inv_2pi_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0xffffb118 ; encoding: [0xff,0x02,0x00,0x7e,0x18,0xb1,0xff,0xff]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_inline_imm_m_inv_2pi_f16:
 ; VI:       ; %bb.0:
@@ -594,15 +725,25 @@ define amdgpu_kernel void @store_literal_imm_f16(ptr addrspace(1) %out) {
 ; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x80]
 ; GFX10-NEXT:    s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
 ;
-; GFX11-LABEL: store_literal_imm_f16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00]
-; GFX11-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
-; GFX11-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
-; GFX11-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+; GFX11-TRUE16-LABEL: store_literal_imm_f16:
+; GFX11-TRUE16:       ; %bb.0:
+; GFX11-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x6c00 ; encoding: [0xff,0x38,0x00,0x7e,0x00,0x6c,0x00,0x00]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-TRUE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-TRUE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
+;
+; GFX11-FAKE16-LABEL: store_literal_imm_f16:
+; GFX11-FAKE16:       ; %bb.0:
+; GFX11-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0 ; encoding: [0x02,0x00,0x04,0xf4,0x00,0x00,0x00,0xf8]
+; GFX11-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x6c00 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x6c,0x00,0x00]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 ; encoding: [0xff,0x00,0x83,0xbe,0x00,0x60,0x01,0x31]
+; GFX11-FAKE16-NEXT:    s_mov_b32 s2, -1 ; encoding: [0xc1,0x00,0x82,0xbe]
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 ; encoding: [0x00,0x00,0x64,0xe0,0x00,0x00,0x00,0x80]
+; GFX11-FAKE16-NEXT:    s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
 ;
 ; VI-LABEL: store_literal_imm_f16:
 ; VI:       ; %bb.0:
@@ -1251,7 +1392,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(ptr addrspace(1) %out,
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x00,0x02,0x80]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 0.5, v0.l ; encoding: [0xf0,0x00,0x00,0x64]
@@ -1349,7 +1490,7 @@ define amdgpu_kernel void @commute_add_literal_f16(ptr addrspace(1) %out, ptr ad
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2 ; encoding: [0x02,0x00,0x88,0xbe]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3 ; encoding: [0x03,0x00,0x89,0xbe]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0 ; encoding: [0x00,0x00,0x84,0xbe]
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x48,0xe0,0x00,0x00,0x02,0x80]
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0 ; encoding: [0x00,0x00,0x80,0xe0,0x00,0x00,0x02,0x80]
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1 ; encoding: [0x01,0x00,0x85,0xbe]
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 0x6400, v0.l ; encoding: [0xff,0x00,0x00,0x64,0x00,0x64,0x00,0x00]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index 9983c09538201..38f7ea12251ee 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -37,7 +37,7 @@ define amdgpu_kernel void @rcp_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
@@ -73,7 +73,7 @@ define amdgpu_kernel void @rcp_f16(
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 9f26745d83a64..efb6b20d6e730 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -37,7 +37,7 @@ define amdgpu_kernel void @rsq_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l
@@ -73,7 +73,7 @@ define amdgpu_kernel void @rsq_f16(
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 117a169069782..4769f0f62fdf7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -57,7 +57,7 @@ define amdgpu_kernel void @ceil_f16(
 ; GFX11-NEXT:    s_mov_b32 s8, s2
 ; GFX11-NEXT:    s_mov_b32 s9, s3
 ; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_ceil_f16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 868e69dfbd352..f32d8af2ce11a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -57,7 +57,7 @@ define amdgpu_kernel void @floor_f16(
 ; GFX11-NEXT:    s_mov_b32 s8, s2
 ; GFX11-NEXT:    s_mov_b32 s9, s3
 ; GFX11-NEXT:    s_mov_b32 s4, s0
-; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-NEXT:    s_mov_b32 s5, s1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_floor_f16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index a32a456c102dd..146956c5908ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -177,17 +177,18 @@ define amdgpu_kernel void @fmuladd_f16(
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s13, s3
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s16, s4
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s17, s5
-; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
-; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s12, s6
-; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s13, s7
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s4, s6
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s5, s7
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s7, s11
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s8, s0
-; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v2, off, s[12:15], 0
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], 0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-FLUSH-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
@@ -242,14 +243,14 @@ define amdgpu_kernel void @fmuladd_f16(
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s17, s5
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s20, s6
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX11-DENORM-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-DENORM-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0
-; GFX11-DENORM-TRUE16-NEXT:    buffer_load_u16 v2, off, s[20:23], 0
+; GFX11-DENORM-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0
+; GFX11-DENORM-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0
+; GFX11-DENORM-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[20:23], 0
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v1.l
-; GFX11-DENORM-TRUE16-NEXT:    buffer_store_b16 v2, off, s[8:11], 0
+; GFX11-DENORM-TRUE16-NEXT:    v_fmac_f16_e32 v1.l, v0.l, v0.h
+; GFX11-DENORM-TRUE16-NEXT:    buffer_store_b16 v1, off, s[8:11], 0
 ; GFX11-DENORM-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DENORM-FAKE16-LABEL: fmuladd_f16:
@@ -432,14 +433,14 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s13, s3
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s8, s0
-; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x4200, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-FLUSH-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -483,13 +484,13 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-DENORM-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-DENORM-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-DENORM-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-DENORM-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0x4200, v1.l
+; GFX11-DENORM-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h
 ; GFX11-DENORM-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-DENORM-TRUE16-NEXT:    s_endpgm
 ;
@@ -667,14 +668,14 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s13, s3
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s8, s0
-; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-FLUSH-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-FLUSH-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-FLUSH-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-FLUSH-TRUE16-NEXT:    v_mul_f16_e32 v0.l, 0x4200, v0.l
 ; GFX11-FLUSH-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
+; GFX11-FLUSH-TRUE16-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-FLUSH-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-FLUSH-TRUE16-NEXT:    s_endpgm
 ;
@@ -718,13 +719,13 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
 ; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-DENORM-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-DENORM-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DENORM-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-DENORM-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-DENORM-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-DENORM-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-DENORM-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0x4200, v1.l
+; GFX11-DENORM-TRUE16-NEXT:    v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h
 ; GFX11-DENORM-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-DENORM-TRUE16-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 69f17ed072425..83f3a7571f9c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -128,14 +128,14 @@ define amdgpu_kernel void @maxnum_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
@@ -267,7 +267,7 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
@@ -393,7 +393,7 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 0e91d905d5585..56554ed17f177 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -128,14 +128,14 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
-; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v1.l, v1.l
+; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.h, v0.h, v0.h
 ; GFX11-TRUE16-NEXT:    v_min_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
@@ -302,7 +302,7 @@ define amdgpu_kernel void @minnum_f16_imm_a(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l
@@ -426,7 +426,7 @@ define amdgpu_kernel void @minnum_f16_imm_b(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_max_f16_e32 v0.l, v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 57ce028b1fc4a..80a7d0127fec5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -60,7 +60,7 @@ define amdgpu_kernel void @rint_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l
@@ -96,7 +96,7 @@ define amdgpu_kernel void @rint_f16(
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_rndne_f16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
index bbbf4b66cf55b..0965dd126a8e8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
@@ -18,7 +18,7 @@ define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_sqrt_bf16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 6fbfcf3451430..961a881cae20e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -59,7 +59,7 @@ define amdgpu_kernel void @sqrt_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
@@ -95,7 +95,7 @@ define amdgpu_kernel void @sqrt_f16(
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 047c170ab527f..e4d82271884c1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -59,7 +59,7 @@ define amdgpu_kernel void @trunc_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
@@ -95,7 +95,7 @@ define amdgpu_kernel void @trunc_f16(
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], null
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 195d222408139..36f073054031a 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -99,22 +99,20 @@ define amdgpu_kernel void @select_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s11
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s12
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s13
-; GFX11-TRUE16-NEXT:    s_mov_b32 s24, s14
-; GFX11-TRUE16-NEXT:    s_mov_b32 s25, s15
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[20:23], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v3, off, s[24:27], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_mov_b32 s24, s14
+; GFX11-TRUE16-NEXT:    s_mov_b32 s25, s15
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[24:27], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[0:3], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v1, off, s[0:3], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s9
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v3.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.h, v1.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -244,27 +242,25 @@ define amdgpu_kernel void @select_f16_imm_a(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s11
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s11
-; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s10
-; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s11
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
-; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
-; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -385,27 +381,25 @@ define amdgpu_kernel void @select_f16_imm_b(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s15, s11
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s18, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s19, s11
-; GFX11-TRUE16-NEXT:    s_mov_b32 s22, s10
-; GFX11-TRUE16-NEXT:    s_mov_b32 s23, s11
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
-; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s6
-; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s7
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s11
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-TRUE16-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, v1.l, v0.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -534,20 +528,20 @@ define amdgpu_kernel void @select_f16_imm_c(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s6
-; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s11
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3800, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_nlt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3800, v1.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -676,20 +670,20 @@ define amdgpu_kernel void @select_f16_imm_d(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s16, s4
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s5
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s6
-; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s7
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, s10
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, s11
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[12:15], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[4:7], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
-; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3800, v0.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_cmp_lt_f16_e32 vcc_lo, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x3800, v1.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;

diff  --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index 792bd03ad15aa..3b61a5fa45604 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @sitofp_i16_to_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 8fe5cbeaab34d..901361ff48cee 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @uitofp_i16_to_f16(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[8:11], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_u16_e32 v0.l, v0.l

diff  --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index b675e0ffe9eed..de44175f7c5bd 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -70,12 +70,12 @@ define amdgpu_kernel void @madak_f16(
 ; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s12, s2
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s13, s3
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[12:15], 0
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s0
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s1
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v0.h
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 0x4900, v0.l
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
@@ -216,22 +216,22 @@ define amdgpu_kernel void @madak_f16_use_2(
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s17, s13
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s20, s14
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s21, s15
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[16:19], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v0, off, s[16:19], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v1, off, s[20:23], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_hi_b16 v0, off, s[20:23], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT:    buffer_load_u16 v2, off, s[0:3], 0 glc dlc
+; GFX11-TRUE16-NEXT:    buffer_load_d16_b16 v1, off, s[0:3], 0 glc dlc
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s8
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s9
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s0, s10
 ; GFX11-TRUE16-NEXT:    s_mov_b32 s1, s11
-; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v0.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.h, v0.l, v0.h
+; GFX11-TRUE16-NEXT:    v_mul_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_add_f16_e32 v1.l, 0x4900, v0.h
+; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.h, 0x4900, v0.h
 ; GFX11-TRUE16-NEXT:    v_add_f16_e32 v0.l, 0x4900, v0.l
-; GFX11-TRUE16-NEXT:    buffer_store_b16 v1, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    buffer_store_d16_hi_b16 v0, off, s[4:7], 0
 ; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;


        


More information about the llvm-commits mailing list