[llvm] turn on true16 on gfx12 (PR #143652)

Brox Chen via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 11 09:46:54 PDT 2025


https://github.com/broxigarchen updated https://github.com/llvm/llvm-project/pull/143652

>From e9ef035ba2e3b58b72bfe6604b621fb562cf4e31 Mon Sep 17 00:00:00 2001
From: guochen2 <guochen2 at amd.com>
Date: Tue, 10 Jun 2025 23:48:32 -0400
Subject: [PATCH] turn on true16 on gfx12

---
 llvm/lib/Target/AMDGPU/AMDGPU.td              |    3 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll |   43 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll     |    2 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  | 1446 +++++++++++------
 4 files changed, 1001 insertions(+), 493 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index a9b39eebbcdcf..18590c5a5d4c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1879,7 +1879,8 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureMaxHardClauseLength32,
    Feature1_5xVGPRs,
    FeatureMemoryAtomicFAddF32DenormalSupport,
-   FeatureBVHDualAndBVH8Insts
+   FeatureBVHDualAndBVH8Insts,
+   FeatureRealTrue16Insts
    ]>;
 
 def FeatureISAVersion12_Generic: FeatureSet<
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index 7f1c01a7a0008..2fefd92bd8446 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
 ; GFX12-LABEL: test_cvt_f32_bf8_byte0:
@@ -42,11 +43,19 @@ define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
 }
 
 define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word0:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cvt_pk_bf8_f32 v2.l, v0, v1
+; GFX12-TRUE16-NEXT:    global_store_b32 v[3:4], v2, off
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word0:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX12-FAKE16-NEXT:    global_store_b32 v[3:4], v2, off
+; GFX12-FAKE16-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %tmp1 = bitcast i32 %tmp0 to float
   %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false)
@@ -55,13 +64,21 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr
 }
 
 define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
-; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
-; GFX12-NEXT:    global_store_b32 v[3:4], v2, off
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-TRUE16-NEXT:    v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1]
+; GFX12-TRUE16-NEXT:    global_store_b32 v[3:4], v2, off
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
+; GFX12-FAKE16-NEXT:    global_store_b32 v[3:4], v2, off
+; GFX12-FAKE16-NEXT:    s_endpgm
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
   %tmp1 = bitcast i32 %tmp0 to float
   %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
index 16d32b73b9b0d..b600f0773e915 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
 
 declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
 declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 67c2ee6403558..d2f465f12004b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
 ; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
 
 define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
 ; GFX6-LABEL: constant_load_i1:
@@ -136,15 +137,25 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_load_v2i1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_load_v2i1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_load_v2i1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   store <2 x i1> %load, ptr addrspace(1) %out
   ret void
@@ -206,15 +217,25 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_load_v3i1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_load_v3i1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_load_v3i1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   store <3 x i1> %load, ptr addrspace(1) %out
   ret void
@@ -277,15 +298,25 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_load_v4i1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_load_v4i1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_load_v4i1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   store <4 x i1> %load, ptr addrspace(1) %out
   ret void
@@ -348,15 +379,25 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_load_v8i1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_load_v8i1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b8 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_load_v8i1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b8 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   store <8 x i1> %load, ptr addrspace(1) %out
   ret void
@@ -419,15 +460,25 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_load_v16i1:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v0, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v1, v0, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v0, v1, s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_load_v16i1:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_load_v16i1:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   store <16 x i1> %load, ptr addrspace(1) %out
   ret void
@@ -858,19 +909,33 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v2i1_to_v2i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v2, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v2, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
-; GFX12-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v2i1_to_v2i32:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v2, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX12-TRUE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v2i1_to_v2i32:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v2, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
+; GFX12-FAKE16-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, ptr addrspace(1) %out
@@ -1012,21 +1077,37 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v3i1_to_v3i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v3, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v3, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX12-NEXT:    v_bfe_u32 v1, v1, 1, 1
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v3i1_to_v3i32:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v1, v3, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v1, v1, 1, 1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v3i1_to_v3i32:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v3, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v1, v1, 1, 1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT:    global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = zext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, ptr addrspace(1) %out
@@ -1177,21 +1258,37 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v4i1_to_v4i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v1, v4, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v3, 0xffff, v1
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX12-NEXT:    v_bfe_u32 v2, v1, 2, 1
-; GFX12-NEXT:    v_bfe_u32 v1, v1, 1, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 3, v3
-; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v4i1_to_v4i32:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v1, v4, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v1, 2, 1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v1, v1, 1, 1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 3, v3
+; GFX12-TRUE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v4i1_to_v4i32:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v1, v4, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v1, 2, 1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v1, v1, 1, 1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 3, v3
+; GFX12-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1377,32 +1474,59 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T8.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v8i1_to_v8i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10001
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10003
-; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10005
-; GFX12-NEXT:    s_and_b32 s6, s2, 1
-; GFX12-NEXT:    s_bfe_u32 s7, s2, 0x10002
-; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x10004
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 7, v0
-; GFX12-NEXT:    v_bfe_u32 v2, v0, 6, 1
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v4, s6
-; GFX12-NEXT:    v_mov_b32_e32 v6, s7
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v8i1_to_v8i32:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v8, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s5, s2, 0x10005
+; GFX12-TRUE16-NEXT:    s_and_b32 s6, s2, 1
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s7, s2, 0x10002
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s2, s2, 0x10004
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 7, v0
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v0, 6, 1
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v4, s6
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, s7
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v8i1_to_v8i32:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v8, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v5, s4 :: v_dual_and_b32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s5, s2, 0x10005
+; GFX12-FAKE16-NEXT:    s_and_b32 s6, s2, 1
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s7, s2, 0x10002
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s2, s2, 0x10004
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 7, v0
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v0, 6, 1
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s3
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v4, s6
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, s7
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1682,46 +1806,88 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ; EG-NEXT:     LSHR * T14.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v16i1_to_v16i32:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v16, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v16, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-NEXT:    s_and_b32 s6, 0xffff, s2
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10003
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10001
-; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10007
-; GFX12-NEXT:    s_bfe_u32 s7, s2, 0x10009
-; GFX12-NEXT:    s_bfe_u32 s8, s2, 0x1000d
-; GFX12-NEXT:    s_and_b32 s9, s2, 1
-; GFX12-NEXT:    s_bfe_u32 s10, s2, 0x1000a
-; GFX12-NEXT:    s_bfe_u32 s2, s2, 0x1000c
-; GFX12-NEXT:    s_bfe_u32 s11, s6, 0x10005
-; GFX12-NEXT:    s_bfe_u32 s12, s6, 0x1000b
-; GFX12-NEXT:    s_lshr_b32 s13, s6, 15
-; GFX12-NEXT:    s_bfe_u32 s14, s6, 0x10002
-; GFX12-NEXT:    s_bfe_u32 s15, s6, 0x10006
-; GFX12-NEXT:    s_bfe_u32 s16, s6, 0x10004
-; GFX12-NEXT:    s_bfe_u32 s17, s6, 0x10008
-; GFX12-NEXT:    s_bfe_u32 s6, s6, 0x1000e
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
-; GFX12-NEXT:    v_mov_b32_e32 v1, s8
-; GFX12-NEXT:    v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT:    v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v4, s17
-; GFX12-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v11, s5
-; GFX12-NEXT:    v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s16
-; GFX12-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v10, s15
-; GFX12-NEXT:    v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s4
-; GFX12-NEXT:    v_mov_b32_e32 v14, s14
-; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v16i1_to_v16i32:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v16, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v16, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-TRUE16-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s4, s2, 0x10003
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s5, s2, 0x10001
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s6, s2, 0x10007
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s7, s2, 0x10009
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s8, s2, 0x1000d
+; GFX12-TRUE16-NEXT:    s_and_b32 s9, s2, 1
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s10, s2, 0x1000a
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s2, s2, 0x1000c
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s11, s3, 0x10005
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s12, s3, 0x1000b
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s13, s3, 15
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s14, s3, 0x10002
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s15, s3, 0x10006
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s16, s3, 0x10004
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s17, s3, 0x10008
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s3, 0x1000e
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, s8
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v15, s4 :: v_dual_mov_b32 v2, s3
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v4, s17
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v11, s6
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s16
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v10, s15
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s5
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v14, s14
+; GFX12-TRUE16-NEXT:    s_clause 0x3
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v16i1_to_v16i32:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v16, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v16, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-FAKE16-NEXT:    s_and_b32 s6, 0xffff, s2
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x10003
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s4, s2, 0x10001
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s5, s2, 0x10007
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s7, s2, 0x10009
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s8, s2, 0x1000d
+; GFX12-FAKE16-NEXT:    s_and_b32 s9, s2, 1
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s10, s2, 0x1000a
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s2, s2, 0x1000c
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s11, s6, 0x10005
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s12, s6, 0x1000b
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s13, s6, 15
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s14, s6, 0x10002
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s15, s6, 0x10006
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s16, s6, 0x10004
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s17, s6, 0x10008
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s6, s6, 0x1000e
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, s8
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v15, s3 :: v_dual_mov_b32 v2, s6
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, s13 :: v_dual_mov_b32 v4, s17
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v6, s10 :: v_dual_mov_b32 v11, s5
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v7, s12 :: v_dual_mov_b32 v8, s16
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v10, s15
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v12, s9 :: v_dual_mov_b32 v13, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v14, s14
+; GFX12-FAKE16-NEXT:    s_clause 0x3
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, ptr addrspace(1) %out
@@ -4492,22 +4658,39 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v2i1_to_v2i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v2i1_to_v2i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v2i1_to_v2i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 1, v2
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4576,22 +4759,39 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV * T1.W, T1.Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v2i1_to_v2i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v4, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT:    v_bfe_i32 v2, v1, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v2i1_to_v2i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v4, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v2, v1, 0, 1
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-TRUE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v2i1_to_v2i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v4, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 1, v0
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v2, v1, 0, 1
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-FAKE16-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <2 x i1>, ptr addrspace(4) %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4674,27 +4874,49 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v3i1_to_v3i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v5, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v5, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX12-NEXT:    v_bfe_u32 v2, v0, 1, 1
-; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
-; GFX12-NEXT:    v_mov_b32_e32 v3, v5
-; GFX12-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v2, 0xffff, v2
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b64 v5, v[4:5], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v3i1_to_v3i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v5, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v0, 1, 1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, v5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    global_store_b64 v5, v[4:5], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v3i1_to_v3i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v5, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v0, 1, 1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 2, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v3, v5
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, v5 :: v_dual_and_b32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    global_store_b64 v5, v[4:5], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v5, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = zext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4783,28 +5005,51 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV * T1.W, T1.Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v3i1_to_v3i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v6, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v6, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_bfe_i32 v4, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v2, v2, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v3i1_to_v3i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v6, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v4, v1, 0, 1
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v3i1_to_v3i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v6, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 2, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v4, v1, 0, 1
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v2, v2, 0, 1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    global_store_b64 v6, v[4:5], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <3 x i1>, ptr addrspace(4) %in
   %ext = sext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4894,33 +5139,61 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v4i1_to_v4i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v1, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10002
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 3, v0
-; GFX12-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10001
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT:    s_and_b32 s2, s2, 1
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, s3
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v4i1_to_v4i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s2, 0x10002
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 3, v0
+; GFX12-TRUE16-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s2, 0x10001
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-TRUE16-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX12-TRUE16-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v4i1_to_v4i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x10002
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 3, v0
+; GFX12-FAKE16-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x10001
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX12-FAKE16-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX12-FAKE16-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5018,32 +5291,57 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV * T2.W, T2.Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v4i1_to_v4i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v8, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v8, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 3, v0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
-; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 1, v0
-; GFX12-NEXT:    v_bfe_i32 v0, v0, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_bfe_i32 v6, v1, 0, 1
-; GFX12-NEXT:    v_bfe_i32 v4, v2, 0, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_bfe_i32 v2, v3, 0, 1
-; GFX12-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v4i1_to_v4i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v8, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s2, s3, 2
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s4, s3, 3
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s6, s3, 1
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v7, s5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v6, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v3, s7
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s6
+; GFX12-TRUE16-NEXT:    s_clause 0x1
+; GFX12-TRUE16-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v4i1_to_v4i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v8, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v8, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 3, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 2, v0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 1, v0
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v6, v1, 0, 1
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v4, v2, 0, 1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v2, v3, 0, 1
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <4 x i1>, ptr addrspace(4) %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5179,35 +5477,65 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; EG-NEXT:     LSHR * T12.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v8i1_to_v8i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v12, v1, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v12
-; GFX12-NEXT:    v_mov_b32_e32 v5, v1
-; GFX12-NEXT:    v_mov_b32_e32 v7, v1
-; GFX12-NEXT:    v_bfe_u32 v6, v12, 5, 1
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX12-NEXT:    v_lshrrev_b32_e32 v2, 7, v0
-; GFX12-NEXT:    v_bfe_u32 v0, v0, 6, 1
-; GFX12-NEXT:    v_bfe_u32 v4, v12, 4, 1
-; GFX12-NEXT:    v_mov_b32_e32 v9, v1
-; GFX12-NEXT:    v_mov_b32_e32 v11, v1
-; GFX12-NEXT:    v_bfe_u32 v10, v12, 3, 1
-; GFX12-NEXT:    v_bfe_u32 v8, v12, 2, 1
-; GFX12-NEXT:    v_mov_b32_e32 v13, v1
-; GFX12-NEXT:    v_mov_b32_e32 v15, v1
-; GFX12-NEXT:    v_bfe_u32 v14, v12, 1, 1
-; GFX12-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v1, v[12:15], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v8i1_to_v8i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v12, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v12
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v7, v1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v6, v12, 5, 1
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 7, v0
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v0, v0, 6, 1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v4, v12, 4, 1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v11, v1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v10, v12, 3, 1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v12, 2, 1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v13, v1
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v15, v1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v14, v12, 1, 1
+; GFX12-TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX12-TRUE16-NEXT:    s_clause 0x3
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:32
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v8i1_to_v8i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v12, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v12
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v7, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v6, v12, 5, 1
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 7, v0
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v0, v0, 6, 1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v4, v12, 4, 1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v9, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v11, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v10, v12, 3, 1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v12, 2, 1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v13, v1
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v15, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v14, v12, 1, 1
+; GFX12-FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX12-FAKE16-NEXT:    s_clause 0x3
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:32
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5372,47 +5700,86 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; EG-NEXT:     MOV * T8.W, T8.Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v8i1_to_v8i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v16, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u8 v0, v16, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v9, s3
-; GFX12-NEXT:    s_lshr_b32 s2, s3, 6
-; GFX12-NEXT:    s_lshr_b32 s4, s3, 7
-; GFX12-NEXT:    s_lshr_b32 s6, s3, 4
-; GFX12-NEXT:    s_lshr_b32 s8, s3, 5
-; GFX12-NEXT:    s_lshr_b32 s10, s3, 2
-; GFX12-NEXT:    s_lshr_b32 s12, s3, 3
-; GFX12-NEXT:    s_lshr_b32 s14, s3, 1
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX12-NEXT:    v_bfe_i32 v12, v9, 0, 1
-; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
-; GFX12-NEXT:    v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9
-; GFX12-NEXT:    v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11
-; GFX12-NEXT:    v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
-; GFX12-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
-; GFX12-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
-; GFX12-NEXT:    s_clause 0x3
-; GFX12-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v8i1_to_v8i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v16, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_u8 v0, v16, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s2, s3, 6
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s4, s3, 7
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s6, s3, 4
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s8, s3, 5
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s10, s3, 2
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s12, s3, 3
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s14, s3, 1
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v7, s5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v6, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, s7
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s11
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v3, s15
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s14
+; GFX12-TRUE16-NEXT:    s_clause 0x3
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:48
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:32
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[12:15], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v8i1_to_v8i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v16, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u8 v0, v16, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v9, s3
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s2, s3, 6
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s4, s3, 7
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s6, s3, 4
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s8, s3, 5
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s10, s3, 2
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s12, s3, 3
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s14, s3, 1
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v12, v9, 0, 1
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v13, 31, v12
+; GFX12-FAKE16-NEXT:    s_clause 0x3
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <8 x i1>, ptr addrspace(4) %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5646,61 +6013,117 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:     LSHR * T22.X, PV.W, literal.x,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_zextload_v16i1_to_v16i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v1, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v1, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
-; GFX12-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v4, 0xffff, v0
-; GFX12-NEXT:    v_mov_b32_e32 v11, v1
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000a
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT:    v_bfe_u32 v2, v4, 11, 1
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x1000d
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x1000c
-; GFX12-NEXT:    v_mov_b32_e32 v5, v1
-; GFX12-NEXT:    v_bfe_u32 v6, v4, 5, 1
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:80
-; GFX12-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_mov_b32_e32 v2, s3
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10007
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10006
-; GFX12-NEXT:    v_mov_b32_e32 v9, v1
-; GFX12-NEXT:    s_bfe_u32 s6, s2, 0x10002
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-NEXT:    v_mov_b32_e32 v2, s3
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10004
-; GFX12-NEXT:    s_bfe_u32 s3, s2, 0x10009
-; GFX12-NEXT:    s_bfe_u32 s5, s2, 0x10001
-; GFX12-NEXT:    v_lshrrev_b32_e32 v10, 15, v4
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_mov_b32_e32 v0, s4
-; GFX12-NEXT:    v_mov_b32_e32 v2, v6
-; GFX12-NEXT:    s_bfe_u32 s4, s2, 0x10003
-; GFX12-NEXT:    s_and_b32 s2, s2, 1
-; GFX12-NEXT:    v_bfe_u32 v8, v4, 14, 1
-; GFX12-NEXT:    v_bfe_u32 v4, v4, 8, 1
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT:    v_mov_b32_e32 v0, s6
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_mov_b32_e32 v2, s4
-; GFX12-NEXT:    v_mov_b32_e32 v6, s3
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT:    v_mov_b32_e32 v0, s2
-; GFX12-NEXT:    v_mov_b32_e32 v2, s5
-; GFX12-NEXT:    s_clause 0x2
-; GFX12-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_zextload_v16i1_to_v16i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v1, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v4, 0xffff, v0
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v11, v1
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s2, 0x1000a
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v2, v4, 11, 1
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s2, 0x1000d
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s4, s2, 0x1000c
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v6, v4, 5, 1
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s2, 0x10007
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s4, s2, 0x10006
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, v1
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s6, s2, 0x10002
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s4, s2, 0x10004
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s3, s2, 0x10009
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s5, s2, 0x10001
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 15, v4
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, v6
+; GFX12-TRUE16-NEXT:    s_bfe_u32 s4, s2, 0x10003
+; GFX12-TRUE16-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v8, v4, 14, 1
+; GFX12-TRUE16-NEXT:    v_bfe_u32 v4, v4, 8, 1
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-TRUE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v6, s3
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s5
+; GFX12-TRUE16-NEXT:    s_clause 0x2
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:112
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:64
+; GFX12-TRUE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_zextload_v16i1_to_v16i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v1, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v4, 0xffff, v0
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v11, v1
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x1000a
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v2, v4, 11, 1
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x1000d
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s4, s2, 0x1000c
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v6, v4, 5, 1
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x10007
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s4, s2, 0x10006
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v9, v1
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s6, s2, 0x10002
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s3
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s4, s2, 0x10004
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s3, s2, 0x10009
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s5, s2, 0x10001
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 15, v4
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, v6
+; GFX12-FAKE16-NEXT:    s_bfe_u32 s4, s2, 0x10003
+; GFX12-FAKE16-NEXT:    s_and_b32 s2, s2, 1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v8, v4, 14, 1
+; GFX12-FAKE16-NEXT:    v_bfe_u32 v4, v4, 8, 1
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s6
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s4
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v6, s3
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v2, s5
+; GFX12-FAKE16-NEXT:    s_clause 0x2
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[8:11], s[0:1] offset:112
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[4:7], s[0:1] offset:64
+; GFX12-FAKE16-NEXT:    global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out
@@ -5993,76 +6416,143 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; EG-NEXT:     MOV * T14.W, T14.Z,
 ; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
 ;
-; GFX12-LABEL: constant_sextload_v16i1_to_v16i64:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX12-NEXT:    v_mov_b32_e32 v32, 0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_load_u16 v0, v32, s[2:3]
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    v_readfirstlane_b32 s3, v0
-; GFX12-NEXT:    s_lshr_b32 s4, s3, 15
-; GFX12-NEXT:    s_lshr_b32 s2, s3, 14
-; GFX12-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
-; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT:    v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT:    s_lshr_b32 s6, s3, 12
-; GFX12-NEXT:    s_lshr_b32 s8, s3, 13
-; GFX12-NEXT:    s_lshr_b32 s10, s3, 10
-; GFX12-NEXT:    s_lshr_b32 s12, s3, 11
-; GFX12-NEXT:    s_lshr_b32 s14, s3, 8
-; GFX12-NEXT:    s_lshr_b32 s16, s3, 9
-; GFX12-NEXT:    s_lshr_b32 s18, s3, 6
-; GFX12-NEXT:    s_lshr_b32 s20, s3, 7
-; GFX12-NEXT:    s_lshr_b32 s22, s3, 4
-; GFX12-NEXT:    s_lshr_b32 s24, s3, 5
-; GFX12-NEXT:    s_lshr_b32 s26, s3, 2
-; GFX12-NEXT:    s_lshr_b32 s28, s3, 3
-; GFX12-NEXT:    s_lshr_b32 s30, s3, 1
-; GFX12-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
-; GFX12-NEXT:    s_wait_alu 0xfffe
-; GFX12-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
-; GFX12-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4
-; GFX12-NEXT:    v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v4, s6
-; GFX12-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v6, s8
-; GFX12-NEXT:    v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v8, s10
-; GFX12-NEXT:    v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v10, s12
-; GFX12-NEXT:    v_mov_b32_e32 v15, s17
-; GFX12-NEXT:    v_bfe_i32 v28, v28, 0, 1
-; GFX12-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v17, s19
-; GFX12-NEXT:    v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v19, s21
-; GFX12-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX12-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v21, s23
-; GFX12-NEXT:    v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v23, s25
-; GFX12-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
-; GFX12-NEXT:    v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v25, s27
-; GFX12-NEXT:    v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v27, s29
-; GFX12-NEXT:    v_dual_mov_b32 v24, s26 :: v_dual_mov_b32 v31, s31
-; GFX12-NEXT:    v_mov_b32_e32 v26, s28
-; GFX12-NEXT:    v_mov_b32_e32 v30, s30
-; GFX12-NEXT:    s_clause 0x1
-; GFX12-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT:    v_ashrrev_i32_e32 v29, 31, v28
-; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:64
-; GFX12-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:48
-; GFX12-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:16
-; GFX12-NEXT:    global_store_b128 v32, v[28:31], s[0:1]
-; GFX12-NEXT:    s_endpgm
+; GFX12-TRUE16-LABEL: constant_sextload_v16i1_to_v16i64:
+; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v32, 0
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    global_load_d16_b16 v0, v32, s[2:3]
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s2, s3, 14
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s4, s3, 15
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s6, s3, 12
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s8, s3, 13
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s10, s3, 10
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s12, s3, 11
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s16, s3, 9
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s18, s3, 6
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s20, s3, 7
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s22, s3, 4
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s24, s3, 5
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s26, s3, 2
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s28, s3, 3
+; GFX12-TRUE16-NEXT:    s_lshr_b32 s30, s3, 1
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v7, s5
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v6, s4
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v9, s7
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s9
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s11
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v17, s15
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX12-TRUE16-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v16, s14 :: v_dual_mov_b32 v19, s17
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v18, s16 :: v_dual_mov_b32 v21, s19
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v23, s21
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v22, s20 :: v_dual_mov_b32 v25, s23
+; GFX12-TRUE16-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v24, s22 :: v_dual_mov_b32 v27, s25
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v26, s24 :: v_dual_mov_b32 v29, s27
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v28, s26 :: v_dual_mov_b32 v31, s29
+; GFX12-TRUE16-NEXT:    v_dual_mov_b32 v30, s28 :: v_dual_mov_b32 v3, s31
+; GFX12-TRUE16-NEXT:    v_mov_b32_e32 v2, s30
+; GFX12-TRUE16-NEXT:    s_clause 0x7
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:112
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:96
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:80
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:64
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:48
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:32
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[28:31], s[0:1] offset:16
+; GFX12-TRUE16-NEXT:    global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: constant_sextload_v16i1_to_v16i64:
+; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v32, 0
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    global_load_u16 v0, v32, s[2:3]
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s4, s3, 15
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s2, s3, 14
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v28, s3 :: v_dual_mov_b32 v3, s5
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s6, s3, 12
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s8, s3, 13
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s10, s3, 10
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s12, s3, 11
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s14, s3, 8
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s16, s3, 9
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s18, s3, 6
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s20, s3, 7
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s22, s3, 4
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s24, s3, 5
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s26, s3, 2
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s28, s3, 3
+; GFX12-FAKE16-NEXT:    s_lshr_b32 s30, s3, 1
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX12-FAKE16-NEXT:    s_wait_alu 0xfffe
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s4
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v4, s6
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v9, s11 :: v_dual_mov_b32 v6, s8
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v8, s10
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v10, s12
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v15, s17
+; GFX12-FAKE16-NEXT:    v_bfe_i32 v28, v28, 0, 1
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v17, s19
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v19, s21
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v21, s23
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v23, s25
+; GFX12-FAKE16-NEXT:    s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v25, s27
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v27, s29
+; GFX12-FAKE16-NEXT:    v_dual_mov_b32 v24, s26 :: v_dual_mov_b32 v31, s31
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v26, s28
+; GFX12-FAKE16-NEXT:    v_mov_b32_e32 v30, s30
+; GFX12-FAKE16-NEXT:    s_clause 0x1
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[0:3], s[0:1] offset:112
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[4:7], s[0:1] offset:96
+; GFX12-FAKE16-NEXT:    v_ashrrev_i32_e32 v29, 31, v28
+; GFX12-FAKE16-NEXT:    s_clause 0x5
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[8:11], s[0:1] offset:80
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[12:15], s[0:1] offset:64
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[16:19], s[0:1] offset:48
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[24:27], s[0:1] offset:16
+; GFX12-FAKE16-NEXT:    global_store_b128 v32, v[28:31], s[0:1]
+; GFX12-FAKE16-NEXT:    s_endpgm
   %load = load <16 x i1>, ptr addrspace(4) %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, ptr addrspace(1) %out



More information about the llvm-commits mailing list