[llvm] f68cc2a - AMDGPU: Use 128-bit DS operations by default

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 2 14:42:23 PDT 2020


Author: Matt Arsenault
Date: 2020-04-02T17:17:47-04:00
New Revision: f68cc2a7ed766965028b8b0f0d9300a0460c3cf1

URL: https://github.com/llvm/llvm-project/commit/f68cc2a7ed766965028b8b0f0d9300a0460c3cf1
DIFF: https://github.com/llvm/llvm-project/commit/f68cc2a7ed766965028b8b0f0d9300a0460c3cf1.diff

LOG: AMDGPU: Use 128-bit DS operations by default

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
    llvm/test/CodeGen/AMDGPU/concat_vectors.ll
    llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
    llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
    llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
    llvm/test/CodeGen/AMDGPU/load-local-f32.ll
    llvm/test/CodeGen/AMDGPU/load-local-f64.ll
    llvm/test/CodeGen/AMDGPU/load-local-i16.ll
    llvm/test/CodeGen/AMDGPU/load-local-i32.ll
    llvm/test/CodeGen/AMDGPU/load-local-i64.ll
    llvm/test/CodeGen/AMDGPU/load-local-i8.ll
    llvm/test/CodeGen/AMDGPU/local-64.ll
    llvm/test/CodeGen/AMDGPU/reorder-stores.ll
    llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
    llvm/test/CodeGen/AMDGPU/store-local.ll
    llvm/test/CodeGen/AMDGPU/store-v3i64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 5d58b8239ca0..df4c6308fee3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -78,7 +78,7 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
   // unset everything else if it is disabled
 
   // Assuming ECC is enabled is the conservative default.
-  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
+  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
 
   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
     FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
index 7e0607dc6ddc..2fb12235ee31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-local.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -O0 -run-pass=legalizer -global-isel-abort=0  %s -o - | FileCheck -check-prefix=SI %s
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer -global-isel-abort=0  %s -o - | FileCheck -check-prefix=CI %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -mattr=-enable-ds128 -O0 -run-pass=legalizer -global-isel-abort=0  %s -o - | FileCheck -check-prefix=CI %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d  -mcpu=bonaire -mattr=+enable-ds128 -O0 -run-pass=legalizer -global-isel-abort=0  %s -o - | FileCheck -check-prefix=CI-DS128 %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefix=VI %s
 # RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -O0 -run-pass=legalizer  -global-isel-abort=0 %s -o - | FileCheck -check-prefix=GFX9 %s
@@ -1759,24 +1759,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     ; VI-LABEL: name: test_load_local_s96_align8
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, align 8, addrspace 3)
-    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
-    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
+    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     ; GFX9-LABEL: name: test_load_local_s96_align8
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, align 8, addrspace 3)
-    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
-    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 8, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     %0:_(p3) = COPY $vgpr0
     %1:_(s96) = G_LOAD %0 :: (load 12, align 8, addrspace 3)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -1814,24 +1802,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     ; VI-LABEL: name: test_load_local_s96_align4
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, addrspace 3)
-    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
-    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
+    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     ; GFX9-LABEL: name: test_load_local_s96_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, addrspace 3)
-    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD]](s64), 0
-    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](s96)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](s96)
     %0:_(p3) = COPY $vgpr0
     %1:_(s96) = G_LOAD %0 :: (load 12, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -2969,94 +2945,99 @@ body: |
     ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
     ; VI: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
     ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1, addrspace 3)
-    ; VI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
+    ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; VI: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
+    ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; VI: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C9]](s32)
+    ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; VI: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
+    ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C11]](s32)
+    ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; VI: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C12]](s32)
+    ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; VI: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C13]](s32)
+    ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; VI: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C14]](s32)
+    ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]]
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C15]]
     ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]]
-    ; VI: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16)
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C15]]
+    ; VI: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C16]](s16)
     ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
     ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]]
+    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C15]]
     ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]]
-    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16)
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C15]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C16]](s16)
     ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
     ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32)
-    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]]
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C15]]
     ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32)
-    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]]
-    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16)
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C15]]
+    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C16]](s16)
     ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]]
     ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32)
-    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]]
+    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C15]]
     ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32)
-    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]]
-    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16)
+    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C15]]
+    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C16]](s16)
     ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]]
-    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
-    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32)
-    ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32)
-    ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
-    ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32)
-    ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
-    ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32)
-    ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32)
-    ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32)
-    ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
-    ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
     ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32)
-    ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]]
+    ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C15]]
     ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32)
-    ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]]
-    ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16)
-    ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]]
+    ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C15]]
+    ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C16]](s16)
+    ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]]
     ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32)
-    ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]]
+    ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C15]]
     ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32)
-    ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]]
-    ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16)
-    ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]]
+    ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C15]]
+    ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C16]](s16)
+    ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]]
     ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32)
-    ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]]
+    ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C15]]
     ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32)
-    ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]]
-    ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16)
-    ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]]
+    ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C15]]
+    ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C16]](s16)
+    ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]]
     ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32)
-    ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]]
+    ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C15]]
     ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32)
-    ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]]
-    ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16)
-    ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]]
-    ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
-    ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
-    ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32)
+    ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]]
+    ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16)
+    ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]]
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; VI: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32)
+    ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]]
+    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32)
+    ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]]
+    ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16)
+    ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32)
     ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]]
-    ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16)
-    ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
-    ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32)
+    ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
+    ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
+    ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32)
     ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]]
-    ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32)
-    ; VI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128)
+    ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
     ; GFX9-LABEL: name: test_load_local_s128_align16
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3)
@@ -3081,94 +3062,99 @@ body: |
     ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
     ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
     ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
+    ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
+    ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; GFX9: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C9]](s32)
+    ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
+    ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C11]](s32)
+    ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C12]](s32)
+    ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C13]](s32)
+    ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C14]](s32)
+    ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C15]]
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]]
-    ; GFX9: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16)
+    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C15]]
+    ; GFX9: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C16]](s16)
     ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX9: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C15]]
     ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]]
-    ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C15]]
+    ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C16]](s16)
     ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
     ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32)
-    ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]]
+    ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C15]]
     ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32)
-    ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]]
-    ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16)
+    ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C15]]
+    ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C16]](s16)
     ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]]
     ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32)
-    ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]]
+    ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C15]]
     ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32)
-    ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]]
-    ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16)
+    ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C15]]
+    ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C16]](s16)
     ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]]
-    ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
-    ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32)
-    ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32)
-    ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
-    ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32)
-    ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
-    ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32)
-    ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32)
-    ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32)
-    ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
-    ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
     ; GFX9: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32)
-    ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]]
+    ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C15]]
     ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32)
-    ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]]
-    ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16)
-    ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]]
+    ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C15]]
+    ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C16]](s16)
+    ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]]
     ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32)
-    ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]]
+    ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C15]]
     ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32)
-    ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]]
-    ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16)
-    ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]]
+    ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C15]]
+    ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C16]](s16)
+    ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]]
     ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32)
-    ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]]
+    ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C15]]
     ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32)
-    ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]]
-    ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16)
-    ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]]
+    ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C15]]
+    ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C16]](s16)
+    ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]]
     ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32)
-    ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]]
+    ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C15]]
     ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32)
-    ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]]
-    ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16)
-    ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]]
-    ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
-    ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
-    ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32)
+    ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]]
+    ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16)
+    ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]]
+    ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX9: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32)
+    ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]]
+    ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; GFX9: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32)
+    ; GFX9: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]]
+    ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16)
+    ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32)
     ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]]
-    ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16)
-    ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
-    ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32)
+    ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
+    ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
+    ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32)
     ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]]
-    ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32)
-    ; GFX9: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128)
+    ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
     %0:_(p3) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -3202,20 +3188,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     ; VI-LABEL: name: test_load_local_s128_align8
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[LOAD1]](s64)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
+    ; VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     ; GFX9-LABEL: name: test_load_local_s128_align8
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[LOAD1]](s64)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     %0:_(p3) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -3249,20 +3227,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     ; VI-LABEL: name: test_load_local_s128_align4
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[LOAD1]](s64)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
+    ; VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     ; GFX9-LABEL: name: test_load_local_s128_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[LOAD1]](s64)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](s128)
     %0:_(p3) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -3440,45 +3410,46 @@ body: |
     ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
     ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
     ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2, addrspace 3)
-    ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; VI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2, addrspace 3)
+    ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; VI: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2, addrspace 3)
+    ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2, addrspace 3)
+    ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; VI: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
+    ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2, addrspace 3)
+    ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32)
-    ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]]
+    ; VI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C7]]
     ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32)
-    ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
-    ; VI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32)
+    ; VI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C7]]
+    ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C8]](s32)
     ; VI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32)
-    ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; VI: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C7]]
     ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32)
-    ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
-    ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; VI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C7]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C8]](s32)
     ; VI: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
-    ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
-    ; VI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; VI: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2, addrspace 3)
-    ; VI: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; VI: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2, addrspace 3)
-    ; VI: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32)
-    ; VI: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2, addrspace 3)
-    ; VI: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2, addrspace 3)
     ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32)
-    ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; VI: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C7]]
     ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32)
-    ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]]
-    ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32)
+    ; VI: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C7]]
+    ; VI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C8]](s32)
     ; VI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]]
     ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32)
-    ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]]
+    ; VI: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C7]]
     ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32)
-    ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]]
-    ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32)
+    ; VI: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C7]]
+    ; VI: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32)
     ; VI: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]]
-    ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32)
-    ; VI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128)
+    ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
     ; GFX9-LABEL: name: test_load_local_s128_align2
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 2, addrspace 3)
@@ -3491,45 +3462,46 @@ body: |
     ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
     ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
     ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 2, addrspace 3)
-    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+    ; GFX9: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C3]](s32)
+    ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2, addrspace 3)
+    ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C4]](s32)
+    ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2, addrspace 3)
+    ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32)
+    ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2, addrspace 3)
+    ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
+    ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2, addrspace 3)
+    ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32)
-    ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C3]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C7]]
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32)
-    ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]]
-    ; GFX9: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C4]](s32)
+    ; GFX9: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C7]]
+    ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C8]](s32)
     ; GFX9: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
     ; GFX9: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32)
-    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C3]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C7]]
     ; GFX9: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32)
-    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C3]]
-    ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C4]](s32)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C7]]
+    ; GFX9: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C8]](s32)
     ; GFX9: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[SHL1]]
-    ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32)
-    ; GFX9: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD3:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C5]](s32)
-    ; GFX9: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p3) :: (load 2, addrspace 3)
-    ; GFX9: [[PTR_ADD4:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32)
-    ; GFX9: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p3) :: (load 2, addrspace 3)
-    ; GFX9: [[PTR_ADD5:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C1]](s32)
-    ; GFX9: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD5]](p3) :: (load 2, addrspace 3)
-    ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32)
-    ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 2, addrspace 3)
     ; GFX9: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32)
-    ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C3]]
+    ; GFX9: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C7]]
     ; GFX9: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32)
-    ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C3]]
-    ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C4]](s32)
+    ; GFX9: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C7]]
+    ; GFX9: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[AND5]], [[C8]](s32)
     ; GFX9: [[OR2:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SHL2]]
     ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32)
-    ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C3]]
+    ; GFX9: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C7]]
     ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32)
-    ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C3]]
-    ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C4]](s32)
+    ; GFX9: [[AND7:%[0-9]+]]:_(s32) = G_AND [[COPY8]], [[C7]]
+    ; GFX9: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[AND7]], [[C8]](s32)
     ; GFX9: [[OR3:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[SHL3]]
-    ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR3]](s32)
-    ; GFX9: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128)
+    ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
     %0:_(p3) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load 16, align 2, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -3954,94 +3926,99 @@ body: |
     ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
     ; VI: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
     ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1, addrspace 3)
-    ; VI: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
+    ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; VI: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
+    ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; VI: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C9]](s32)
+    ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; VI: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
+    ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C11]](s32)
+    ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; VI: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C12]](s32)
+    ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; VI: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C13]](s32)
+    ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; VI: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C14]](s32)
+    ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
+    ; VI: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]]
+    ; VI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C15]]
     ; VI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]]
-    ; VI: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16)
+    ; VI: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C15]]
+    ; VI: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; VI: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C16]](s16)
     ; VI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
     ; VI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]]
+    ; VI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C15]]
     ; VI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]]
-    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16)
+    ; VI: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C15]]
+    ; VI: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C16]](s16)
     ; VI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
     ; VI: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32)
-    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]]
+    ; VI: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C15]]
     ; VI: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32)
-    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]]
-    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16)
+    ; VI: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C15]]
+    ; VI: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C16]](s16)
     ; VI: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]]
     ; VI: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32)
-    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]]
+    ; VI: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C15]]
     ; VI: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32)
-    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]]
-    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16)
-    ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]]
-    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
-    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; VI: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32)
-    ; VI: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32)
-    ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
-    ; VI: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32)
-    ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
-    ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32)
-    ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32)
-    ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32)
-    ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
-    ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
+    ; VI: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C15]]
+    ; VI: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C16]](s16)
+    ; VI: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]]
     ; VI: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32)
-    ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]]
+    ; VI: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C15]]
     ; VI: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32)
-    ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]]
-    ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16)
-    ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]]
+    ; VI: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C15]]
+    ; VI: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C16]](s16)
+    ; VI: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]]
     ; VI: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32)
-    ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]]
+    ; VI: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C15]]
     ; VI: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32)
-    ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]]
-    ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16)
-    ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]]
+    ; VI: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C15]]
+    ; VI: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C16]](s16)
+    ; VI: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]]
     ; VI: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32)
-    ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]]
+    ; VI: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C15]]
     ; VI: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32)
-    ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]]
-    ; VI: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16)
-    ; VI: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]]
+    ; VI: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C15]]
+    ; VI: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C16]](s16)
+    ; VI: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]]
     ; VI: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32)
-    ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]]
+    ; VI: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C15]]
     ; VI: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32)
-    ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]]
-    ; VI: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16)
-    ; VI: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]]
-    ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
-    ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
-    ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32)
+    ; VI: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]]
+    ; VI: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16)
+    ; VI: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]]
+    ; VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; VI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; VI: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32)
+    ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]]
+    ; VI: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; VI: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32)
+    ; VI: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]]
+    ; VI: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16)
+    ; VI: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32)
     ; VI: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]]
-    ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16)
-    ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
-    ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32)
+    ; VI: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
+    ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
+    ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32)
     ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]]
-    ; VI: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32)
-    ; VI: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128)
+    ; VI: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
     ; GFX9-LABEL: name: test_load_local_s128_align1
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3)
@@ -4066,94 +4043,99 @@ body: |
     ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
     ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
     ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[C7:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
+    ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
+    ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
+    ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; GFX9: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C9]](s32)
+    ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
+    ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C11]](s32)
+    ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C12]](s32)
+    ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C13]](s32)
+    ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C14]](s32)
+    ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C15:%[0-9]+]]:_(s16) = G_CONSTANT i16 255
     ; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32)
-    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C7]]
+    ; GFX9: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C15]]
     ; GFX9: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32)
-    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C7]]
-    ; GFX9: [[C8:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
-    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C8]](s16)
+    ; GFX9: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C15]]
+    ; GFX9: [[C16:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; GFX9: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C16]](s16)
     ; GFX9: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL]]
     ; GFX9: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32)
-    ; GFX9: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C7]]
+    ; GFX9: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C15]]
     ; GFX9: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32)
-    ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C7]]
-    ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C8]](s16)
+    ; GFX9: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C15]]
+    ; GFX9: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C16]](s16)
     ; GFX9: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL1]]
     ; GFX9: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD4]](s32)
-    ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C7]]
+    ; GFX9: [[AND4:%[0-9]+]]:_(s16) = G_AND [[TRUNC4]], [[C15]]
     ; GFX9: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD5]](s32)
-    ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C7]]
-    ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C8]](s16)
+    ; GFX9: [[AND5:%[0-9]+]]:_(s16) = G_AND [[TRUNC5]], [[C15]]
+    ; GFX9: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND5]], [[C16]](s16)
     ; GFX9: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND4]], [[SHL2]]
     ; GFX9: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD6]](s32)
-    ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C7]]
+    ; GFX9: [[AND6:%[0-9]+]]:_(s16) = G_AND [[TRUNC6]], [[C15]]
     ; GFX9: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD7]](s32)
-    ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C7]]
-    ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C8]](s16)
+    ; GFX9: [[AND7:%[0-9]+]]:_(s16) = G_AND [[TRUNC7]], [[C15]]
+    ; GFX9: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND7]], [[C16]](s16)
     ; GFX9: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND6]], [[SHL3]]
-    ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
-    ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
-    ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C9]](s32)
-    ; GFX9: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]]
-    ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
-    ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
-    ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C9]](s32)
-    ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
-    ; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR4]](s32), [[OR5]](s32)
-    ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
-    ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32)
-    ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32)
-    ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32)
-    ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
-    ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
     ; GFX9: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD8]](s32)
-    ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C7]]
+    ; GFX9: [[AND8:%[0-9]+]]:_(s16) = G_AND [[TRUNC8]], [[C15]]
     ; GFX9: [[TRUNC9:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD9]](s32)
-    ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C7]]
-    ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C8]](s16)
-    ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL6]]
+    ; GFX9: [[AND9:%[0-9]+]]:_(s16) = G_AND [[TRUNC9]], [[C15]]
+    ; GFX9: [[SHL4:%[0-9]+]]:_(s16) = G_SHL [[AND9]], [[C16]](s16)
+    ; GFX9: [[OR4:%[0-9]+]]:_(s16) = G_OR [[AND8]], [[SHL4]]
     ; GFX9: [[TRUNC10:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD10]](s32)
-    ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C7]]
+    ; GFX9: [[AND10:%[0-9]+]]:_(s16) = G_AND [[TRUNC10]], [[C15]]
     ; GFX9: [[TRUNC11:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD11]](s32)
-    ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C7]]
-    ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C8]](s16)
-    ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL7]]
+    ; GFX9: [[AND11:%[0-9]+]]:_(s16) = G_AND [[TRUNC11]], [[C15]]
+    ; GFX9: [[SHL5:%[0-9]+]]:_(s16) = G_SHL [[AND11]], [[C16]](s16)
+    ; GFX9: [[OR5:%[0-9]+]]:_(s16) = G_OR [[AND10]], [[SHL5]]
     ; GFX9: [[TRUNC12:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD12]](s32)
-    ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C7]]
+    ; GFX9: [[AND12:%[0-9]+]]:_(s16) = G_AND [[TRUNC12]], [[C15]]
     ; GFX9: [[TRUNC13:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD13]](s32)
-    ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C7]]
-    ; GFX9: [[SHL8:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C8]](s16)
-    ; GFX9: [[OR8:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL8]]
+    ; GFX9: [[AND13:%[0-9]+]]:_(s16) = G_AND [[TRUNC13]], [[C15]]
+    ; GFX9: [[SHL6:%[0-9]+]]:_(s16) = G_SHL [[AND13]], [[C16]](s16)
+    ; GFX9: [[OR6:%[0-9]+]]:_(s16) = G_OR [[AND12]], [[SHL6]]
     ; GFX9: [[TRUNC14:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD14]](s32)
-    ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C7]]
+    ; GFX9: [[AND14:%[0-9]+]]:_(s16) = G_AND [[TRUNC14]], [[C15]]
     ; GFX9: [[TRUNC15:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD15]](s32)
-    ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C7]]
-    ; GFX9: [[SHL9:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C8]](s16)
-    ; GFX9: [[OR9:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL9]]
-    ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
-    ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
-    ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C9]](s32)
+    ; GFX9: [[AND15:%[0-9]+]]:_(s16) = G_AND [[TRUNC15]], [[C15]]
+    ; GFX9: [[SHL7:%[0-9]+]]:_(s16) = G_SHL [[AND15]], [[C16]](s16)
+    ; GFX9: [[OR7:%[0-9]+]]:_(s16) = G_OR [[AND14]], [[SHL7]]
+    ; GFX9: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
+    ; GFX9: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16)
+    ; GFX9: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C17]](s32)
+    ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL8]]
+    ; GFX9: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16)
+    ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16)
+    ; GFX9: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C17]](s32)
+    ; GFX9: [[OR9:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL9]]
+    ; GFX9: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
+    ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR5]](s16)
+    ; GFX9: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C17]](s32)
     ; GFX9: [[OR10:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL10]]
-    ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR8]](s16)
-    ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR9]](s16)
-    ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C9]](s32)
+    ; GFX9: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[OR6]](s16)
+    ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
+    ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C17]](s32)
     ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]]
-    ; GFX9: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR10]](s32), [[OR11]](s32)
-    ; GFX9: [[MV2:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[MV]](s64), [[MV1]](s64)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV2]](s128)
+    ; GFX9: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[OR8]](s32), [[OR9]](s32), [[OR10]](s32), [[OR11]](s32)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128)
     %0:_(p3) = COPY $vgpr0
     %1:_(s128) = G_LOAD %0 :: (load 16, align 1, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -5846,33 +5828,38 @@ body: |
     ; VI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
     ; VI: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
     ; VI: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1, addrspace 3)
-    ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32)
-    ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32)
-    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32)
-    ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32)
-    ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32)
-    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32)
-    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32)
-    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32)
-    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32)
-    ; VI: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR]](<8 x s32>)
     ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; VI: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
     ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
+    ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; VI: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
     ; VI: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32)
+    ; VI: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; VI: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C9]](s32)
     ; VI: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
+    ; VI: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; VI: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
     ; VI: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32)
+    ; VI: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C11]](s32)
     ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
+    ; VI: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; VI: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C12]](s32)
     ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32)
+    ; VI: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; VI: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C13]](s32)
     ; VI: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
-    ; VI: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
+    ; VI: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; VI: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C14]](s32)
     ; VI: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
+    ; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32)
+    ; VI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32)
+    ; VI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD2]](s32)
+    ; VI: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32)
+    ; VI: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LOAD4]](s32)
+    ; VI: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LOAD5]](s32)
+    ; VI: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32)
+    ; VI: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32)
     ; VI: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32)
     ; VI: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32)
     ; VI: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LOAD10]](s32)
@@ -5881,10 +5868,9 @@ body: |
     ; VI: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LOAD13]](s32)
     ; VI: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32)
     ; VI: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32)
-    ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32)
-    ; VI: [[TRUNC1:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[BUILD_VECTOR1]](<8 x s32>)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<8 x s8>), [[TRUNC1]](<8 x s8>)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<16 x s8>)
+    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32)
+    ; VI: [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[BUILD_VECTOR]](<16 x s32>)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[TRUNC]](<16 x s8>)
     ; GFX9-LABEL: name: test_load_local_v16s8_align16
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3)
@@ -5909,6 +5895,30 @@ body: |
     ; GFX9: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
     ; GFX9: [[PTR_ADD6:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C6]](s32)
     ; GFX9: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
+    ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+    ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
+    ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+    ; GFX9: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C9]](s32)
+    ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 11
+    ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C10]](s32)
+    ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C11]](s32)
+    ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
+    ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C12]](s32)
+    ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 14
+    ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C13]](s32)
+    ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
+    ; GFX9: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 15
+    ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C14]](s32)
+    ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
     ; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32)
     ; GFX9: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY1]](s32), [[COPY2]](s32)
@@ -5921,25 +5931,6 @@ body: |
     ; GFX9: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LOAD6]](s32)
     ; GFX9: [[COPY8:%[0-9]+]]:_(s32) = COPY [[LOAD7]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC3:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY7]](s32), [[COPY8]](s32)
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>)
-    ; GFX9: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<8 x s16>)
-    ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
-    ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD8:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32)
-    ; GFX9: [[LOAD9:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD9:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C1]](s32)
-    ; GFX9: [[LOAD10:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD9]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD10:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32)
-    ; GFX9: [[LOAD11:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C3]](s32)
-    ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32)
-    ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD13:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C5]](s32)
-    ; GFX9: [[LOAD14:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD13]](p3) :: (load 1, addrspace 3)
-    ; GFX9: [[PTR_ADD14:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
-    ; GFX9: [[LOAD15:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p3) :: (load 1, addrspace 3)
     ; GFX9: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LOAD8]](s32)
     ; GFX9: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LOAD9]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC4:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY9]](s32), [[COPY10]](s32)
@@ -5952,10 +5943,9 @@ body: |
     ; GFX9: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LOAD14]](s32)
     ; GFX9: [[COPY16:%[0-9]+]]:_(s32) = COPY [[LOAD15]](s32)
     ; GFX9: [[BUILD_VECTOR_TRUNC7:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC [[COPY15]](s32), [[COPY16]](s32)
-    ; GFX9: [[CONCAT_VECTORS1:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>)
-    ; GFX9: [[TRUNC1:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[CONCAT_VECTORS1]](<8 x s16>)
-    ; GFX9: [[CONCAT_VECTORS2:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS [[TRUNC]](<8 x s8>), [[TRUNC1]](<8 x s8>)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS2]](<16 x s8>)
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR_TRUNC]](<2 x s16>), [[BUILD_VECTOR_TRUNC1]](<2 x s16>), [[BUILD_VECTOR_TRUNC2]](<2 x s16>), [[BUILD_VECTOR_TRUNC3]](<2 x s16>), [[BUILD_VECTOR_TRUNC4]](<2 x s16>), [[BUILD_VECTOR_TRUNC5]](<2 x s16>), [[BUILD_VECTOR_TRUNC6]](<2 x s16>), [[BUILD_VECTOR_TRUNC7]](<2 x s16>)
+    ; GFX9: [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[CONCAT_VECTORS]](<16 x s16>)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[TRUNC]](<16 x s8>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<16 x s8>) = G_LOAD %0 :: (load 16, align 1, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -8187,24 +8177,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
     ; VI-LABEL: name: test_load_local_v3s32_align4
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, addrspace 3)
-    ; VI: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
-    ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
+    ; VI: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
     ; GFX9-LABEL: name: test_load_local_v3s32_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, addrspace 3)
-    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s32>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[DEF]], [[LOAD]](<2 x s32>), 0
-    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s32>) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[INSERT1]](<3 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<3 x s32>) = G_LOAD %0 :: (load 12, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2 = COPY %1
@@ -8238,20 +8216,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ; VI-LABEL: name: test_load_local_v4s32_align16
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 16, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ; GFX9-LABEL: name: test_load_local_v4s32_align16
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 16, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 16, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -8285,20 +8255,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ; VI-LABEL: name: test_load_local_v4s32_align8
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ; GFX9-LABEL: name: test_load_local_v4s32_align8
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 8, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 8, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -8332,20 +8294,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ; VI-LABEL: name: test_load_local_v4s32_align4
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     ; GFX9-LABEL: name: test_load_local_v4s32_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -9048,7 +9002,6 @@ body: |
     ; VI: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
     ; VI: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32)
     ; VI: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
-    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
     ; VI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; VI: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
     ; VI: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
@@ -9074,7 +9027,8 @@ body: |
     ; VI: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
     ; VI: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32)
     ; VI: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]]
-    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
+    ; VI: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; VI: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
     ; VI: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
     ; VI: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
     ; VI: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
@@ -9098,9 +9052,8 @@ body: |
     ; VI: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
     ; VI: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32)
     ; VI: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]]
-    ; VI: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR8]](s32), [[OR11]](s32)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     ; GFX9-LABEL: name: test_load_local_v4s32_align1
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
     ; GFX9: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p3) :: (load 1, addrspace 3)
@@ -9157,7 +9110,6 @@ body: |
     ; GFX9: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[OR4]](s16)
     ; GFX9: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C5]](s32)
     ; GFX9: [[OR5:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL5]]
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32)
     ; GFX9: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
     ; GFX9: [[PTR_ADD7:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C7]](s32)
     ; GFX9: [[LOAD8:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD7]](p3) :: (load 1, addrspace 3)
@@ -9183,7 +9135,8 @@ body: |
     ; GFX9: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[OR7]](s16)
     ; GFX9: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[ZEXT5]], [[C5]](s32)
     ; GFX9: [[OR8:%[0-9]+]]:_(s32) = G_OR [[ZEXT4]], [[SHL8]]
-    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD7]], [[C6]](s32)
+    ; GFX9: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
+    ; GFX9: [[PTR_ADD11:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C8]](s32)
     ; GFX9: [[LOAD12:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD11]](p3) :: (load 1, addrspace 3)
     ; GFX9: [[PTR_ADD12:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32)
     ; GFX9: [[LOAD13:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD12]](p3) :: (load 1, addrspace 3)
@@ -9207,9 +9160,8 @@ body: |
     ; GFX9: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[OR10]](s16)
     ; GFX9: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[ZEXT7]], [[C5]](s32)
     ; GFX9: [[OR11:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[SHL11]]
-    ; GFX9: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR8]](s32), [[OR11]](s32)
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s32>), [[BUILD_VECTOR1]](<2 x s32>)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[CONCAT_VECTORS]](<4 x s32>)
+    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<4 x s32>) = G_LOAD %0 :: (load 16, align 1, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -9259,31 +9211,19 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     ; VI-LABEL: name: test_load_local_v8s32_align32
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 32, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; VI: [[LOAD2:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 16, addrspace 3)
-    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; VI: [[LOAD3:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD2]](p3) :: (load 8, addrspace 3)
-    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>), [[LOAD2]](<2 x s32>), [[LOAD3]](<2 x s32>)
+    ; VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 16, addrspace 3)
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
     ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     ; GFX9-LABEL: name: test_load_local_v8s32_align32
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 32, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9: [[LOAD2:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 16, addrspace 3)
-    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9: [[LOAD3:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[PTR_ADD2]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<2 x s32>), [[LOAD1]](<2 x s32>), [[LOAD2]](<2 x s32>), [[LOAD3]](<2 x s32>)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[PTR_ADD]](p3) :: (load 16, addrspace 3)
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
     ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<8 x s32>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, align 32, addrspace 3)
@@ -9349,20 +9289,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ; VI-LABEL: name: test_load_local_v2s64_align4
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     ; GFX9-LABEL: name: test_load_local_v2s64_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<2 x s64>) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -10033,30 +9965,28 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT2]](<4 x s64>)
     ; VI-LABEL: name: test_load_local_v3s64_align32
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 32, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; VI: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 16, addrspace 3)
-    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64), [[LOAD2]](s64)
-    ; VI: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; VI: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 16, addrspace 3)
+    ; VI: [[DEF:%[0-9]+]]:_(<3 x s64>) = G_IMPLICIT_DEF
+    ; VI: [[INSERT:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[DEF]], [[LOAD]](<2 x s64>), 0
+    ; VI: [[INSERT1:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[INSERT]], [[LOAD1]](s64), 128
+    ; VI: [[DEF1:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; VI: [[INSERT2:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF1]], [[INSERT1]](<3 x s64>), 0
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT2]](<4 x s64>)
     ; GFX9-LABEL: name: test_load_local_v3s64_align32
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 32, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 16, addrspace 3)
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64), [[LOAD2]](s64)
-    ; GFX9: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
-    ; GFX9: [[INSERT:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF]], [[BUILD_VECTOR]](<3 x s64>), 0
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT]](<4 x s64>)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 16, addrspace 3)
+    ; GFX9: [[DEF:%[0-9]+]]:_(<3 x s64>) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[DEF]], [[LOAD]](<2 x s64>), 0
+    ; GFX9: [[INSERT1:%[0-9]+]]:_(<3 x s64>) = G_INSERT [[INSERT]], [[LOAD1]](s64), 128
+    ; GFX9: [[DEF1:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF
+    ; GFX9: [[INSERT2:%[0-9]+]]:_(<4 x s64>) = G_INSERT [[DEF1]], [[INSERT1]](<3 x s64>), 0
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[INSERT2]](<4 x s64>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<3 x s64>) = G_LOAD %0 :: (load 24, align 32, addrspace 3)
     %2:_(<4 x s64>) = G_IMPLICIT_DEF
@@ -10108,32 +10038,20 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ; VI-LABEL: name: test_load_local_v4s64_align32
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 32, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; VI: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 16, addrspace 3)
-    ; VI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; VI: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD2]](p3) :: (load 8, addrspace 3)
-    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64), [[LOAD2]](s64), [[LOAD3]](s64)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ; VI: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p3) :: (load 16, addrspace 3)
+    ; VI: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     ; GFX9-LABEL: name: test_load_local_v4s64_align32
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 32, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p3) :: (load 16, align 32, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
     ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 16, addrspace 3)
-    ; GFX9: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
-    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C2]](s32)
-    ; GFX9: [[LOAD3:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD2]](p3) :: (load 8, addrspace 3)
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64), [[LOAD2]](s64), [[LOAD3]](s64)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p3) :: (load 16, addrspace 3)
+    ; GFX9: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, align 32, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1
@@ -10167,20 +10085,12 @@ body: |
     ; CI-DS128: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
     ; VI-LABEL: name: test_load_local_v2p1_align4
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1)
-    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>)
+    ; VI: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; VI: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
     ; GFX9-LABEL: name: test_load_local_v2p1_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-    ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(p1) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[LOAD]](p1), [[LOAD1]](p1)
-    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x p1>)
+    ; GFX9: [[LOAD:%[0-9]+]]:_(<2 x p1>) = G_LOAD [[COPY]](p3) :: (load 16, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x p1>)
     %0:_(p3) = COPY $vgpr0
     %1:_(<2 x p1>) = G_LOAD %0 :: (load 16, align 4, addrspace 3)
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1
@@ -12026,46 +11936,24 @@ body: |
     ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
     ; VI-LABEL: name: test_extload_local_v2s96_from_24_align4
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, addrspace 3)
-    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
-    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
-    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; VI: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4, addrspace 3)
-    ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
-    ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
-    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
-    ; VI: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
-    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12, align 4, addrspace 3)
+    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
+    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
+    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
     ; GFX9-LABEL: name: test_extload_local_v2s96_from_24_align4
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 4, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, addrspace 3)
-    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
-    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
-    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4, addrspace 3)
-    ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
-    ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
-    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12, align 4, addrspace 3)
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
+    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
     %0:_(p3) = COPY $vgpr0
     %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 4, addrspace 3)
     %2:_(s96) = G_EXTRACT %1, 0
@@ -12134,46 +12022,24 @@ body: |
     ; CI-DS128: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
     ; VI-LABEL: name: test_extload_local_v2s96_from_24_align16
     ; VI: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; VI: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 16, addrspace 3)
-    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 16, addrspace 3)
+    ; VI: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; VI: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, align 8, addrspace 3)
-    ; VI: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
-    ; VI: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
-    ; VI: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; VI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; VI: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; VI: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 4, addrspace 3)
-    ; VI: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; VI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4, addrspace 3)
-    ; VI: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
-    ; VI: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
-    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
-    ; VI: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
-    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
-    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
+    ; VI: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12, align 4, addrspace 3)
+    ; VI: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
+    ; VI: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
+    ; VI: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
+    ; VI: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
     ; GFX9-LABEL: name: test_extload_local_v2s96_from_24_align16
     ; GFX9: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 16, addrspace 3)
-    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; GFX9: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[COPY]](p3) :: (load 12, align 16, addrspace 3)
+    ; GFX9: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
     ; GFX9: [[PTR_ADD:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
-    ; GFX9: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p3) :: (load 4, align 8, addrspace 3)
-    ; GFX9: [[DEF:%[0-9]+]]:_(s96) = G_IMPLICIT_DEF
-    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[DEF]](s96)
-    ; GFX9: [[INSERT:%[0-9]+]]:_(s96) = G_INSERT [[COPY1]], [[LOAD]](s64), 0
-    ; GFX9: [[INSERT1:%[0-9]+]]:_(s96) = G_INSERT [[INSERT]], [[LOAD1]](s32), 64
-    ; GFX9: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
-    ; GFX9: [[PTR_ADD1:%[0-9]+]]:_(p3) = G_PTR_ADD [[COPY]], [[C1]](s32)
-    ; GFX9: [[LOAD2:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD1]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: [[PTR_ADD2:%[0-9]+]]:_(p3) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32)
-    ; GFX9: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p3) :: (load 4, addrspace 3)
-    ; GFX9: [[INSERT2:%[0-9]+]]:_(s96) = G_INSERT [[DEF]], [[LOAD2]](s64), 0
-    ; GFX9: [[INSERT3:%[0-9]+]]:_(s96) = G_INSERT [[INSERT2]], [[LOAD3]](s32), 64
-    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[INSERT1]](s96)
-    ; GFX9: [[COPY3:%[0-9]+]]:_(s96) = COPY [[INSERT3]](s96)
-    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY2]](s96)
-    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY3]](s96)
+    ; GFX9: [[LOAD1:%[0-9]+]]:_(s96) = G_LOAD [[PTR_ADD]](p3) :: (load 12, align 4, addrspace 3)
+    ; GFX9: [[COPY1:%[0-9]+]]:_(s96) = COPY [[LOAD]](s96)
+    ; GFX9: [[COPY2:%[0-9]+]]:_(s96) = COPY [[LOAD1]](s96)
+    ; GFX9: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96)
+    ; GFX9: $vgpr3_vgpr4_vgpr5 = COPY [[COPY2]](s96)
     %0:_(p3) = COPY $vgpr0
     %1:_(<2 x s96>) = G_LOAD %0 :: (load 24, align 16, addrspace 3)
     %2:_(s96) = G_EXTRACT %1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
index e50173b7caf8..4fe3008011a2 100644
--- a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -309,8 +309,8 @@ define amdgpu_kernel void @concat_vector_crash2(<8 x i8> addrspace(1)* %out, i32
 
 ; GCN-LABEL: {{^}}build_vector_splat_concat_v8i16:
 ; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI: ds_write_b64
-; VI: ds_write2_b64
+; VI: ds_write_b128
+; VI: ds_write_b128
 define amdgpu_kernel void @build_vector_splat_concat_v8i16() {
 entry:
   store <8 x i16> zeroinitializer, <8 x i16> addrspace(3)* undef, align 16

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index ef4efc6336ce..5e2fc937a86c 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt,-enable-ds128 < %s | FileCheck -check-prefixes=CI,NODS128 %s
 
 @lds = addrspace(3) global [512 x float] undef, align 4
 @lds.v2 = addrspace(3) global [512 x <2 x float>] undef, align 4

diff  --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
index 699f2060bf26..39afd17298ed 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -49,8 +49,8 @@ define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalia
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
 ; SI-PROMOTE: ds_read_b64
-; CI-PROMOTE: ds_write2_b64
-; CI-PROMOTE: ds_read2_b64
+; CI-PROMOTE: ds_write_b128
+; CI-PROMOTE: ds_read_b128
 define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
   %array = alloca [4 x <2 x double>], align 16, addrspace(5)
@@ -107,8 +107,8 @@ define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
 ; SI-PROMOTE: ds_read_b64
-; CI-PROMOTE: ds_write2_b64
-; CI-PROMOTE: ds_read2_b64
+; CI-PROMOTE: ds_write_b128
+; CI-PROMOTE: ds_read_b128
 define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %array = alloca [4 x <2 x i64>], align 16, addrspace(5)

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
index 9a23df97d9a5..c4599ee2b353 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-subvector-unused-scratch.ll
@@ -18,9 +18,9 @@ define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %
 
 ; GCN-LABEL: store_v5i32:
 ; GCN:        ds_read_b32
-; GCN:        ds_read2_b64
+; GCN:        ds_read_b128
 ; GCN:        ds_write_b32
-; GCN:        ds_write2_b64
+; GCN:        ds_write_b128
 ; GCN: ScratchSize: 0
 define amdgpu_kernel void @store_v5i32(<5 x i32> addrspace(3)* %out, <5 x i32> %a) nounwind {
   %val = load <5 x i32>, <5 x i32> addrspace(3)* %out
@@ -28,5 +28,3 @@ define amdgpu_kernel void @store_v5i32(<5 x i32> addrspace(3)* %out, <5 x i32> %
   store <5 x i32> %val.1, <5 x i32> addrspace(3)* %out, align 16
   ret void
 }
-
-

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
index c33c00073156..a0559c17a161 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
 
 ; Testing for ds_read/write_128

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
index e313b38b73d7..7495860107f5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
 
 ; Testing for ds_read_b128

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index d8d7d98e3088..357141b36a85 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
 
 ; Testing for ds_read/write_b128

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
index 10887885ef67..0063bb0b6d21 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -mattr=-flat-for-global,-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; Testing for ds_read/write_128

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
index d91c3b5be928..8a07640e4f4d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
 
 ; Testing for ds_read/write_b128

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
index 02477eff677b..8137ded1454d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
 
 ; Testing for ds_read/write_b128

diff  --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll
index f0dca0780aae..3e85dd5905fc 100644
--- a/llvm/test/CodeGen/AMDGPU/local-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,SICIVI,CIPLUS %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,CIPLUS %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,CIPLUS %s
 
 ; GCN-LABEL: {{^}}local_i32_load
 ; SICIVI: s_mov_b32 m0
@@ -165,7 +165,8 @@ define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) n
 ; GFX9-NOT: m0
 
 ; GCN-NOT: add
-; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
 ; GCN: s_endpgm
 define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
@@ -178,7 +179,10 @@ define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounw
 ; GFX9-NOT: m0
 
 ; GCN-NOT: add
-; GCN: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
+
+; SI: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1{{$}}
+; CIPLUS: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}}
+
 ; GCN: s_endpgm
 define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
@@ -190,8 +194,12 @@ define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %o
 ; GFX9-NOT: m0
 
 ; GCN-NOT: add
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
+; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
+; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
+
+; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224{{$}}
+; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240{{$}}
+
 ; GCN: s_endpgm
 define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
@@ -204,8 +212,12 @@ define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounw
 ; GFX9-NOT: m0
 
 ; GCN-NOT: add
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
-; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
+; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; SI-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
+
+; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]$}}
+; CIPLUS-DAG: ds_write_b128 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16{{$}}
+
 ; GCN: s_endpgm
 define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16

diff  --git a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
index 260b32ed3406..a379a646bee9 100644
--- a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -1,12 +1,12 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn < %s | FileCheck -check-prefix=SI %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn < %s | FileCheck -check-prefixes=GCN,SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,VI %s
 
-; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
+; GCN-LABEL: {{^}}no_reorder_v2f64_global_load_store:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
 define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16
   %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16
@@ -15,10 +15,14 @@ define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrs
   ret void
 }
 
-; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
+; GCN-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
 ; SI: ds_read2_b64
 ; SI: ds_write2_b64
-; SI: s_endpgm
+
+; VI: ds_read_b128
+; VI: ds_write_b128
+
+; GCN: s_endpgm
 define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
   %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16
@@ -27,18 +31,18 @@ define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x dou
   ret void
 }
 
-; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
-; SI: buffer_load_dwordx4
+; GCN-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
+; GCN: buffer_load_dwordx4
 
 
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: buffer_store_dwordx4
-; SI: s_endpgm
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: buffer_store_dwordx4
+; GCN: s_endpgm
 define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
   %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32
@@ -47,13 +51,13 @@ define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> ad
   ret void
 }
 
-; SI-LABEL: {{^}}no_reorder_extload_64:
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_write_b64
-; SI-NOT: ds_read
-; SI: ds_write_b64
-; SI: s_endpgm
+; GCN-LABEL: {{^}}no_reorder_extload_64:
+; GCN: ds_read_b64
+; GCN: ds_read_b64
+; GCN: ds_write_b64
+; GCN-NOT: ds_read
+; GCN: ds_write_b64
+; GCN: s_endpgm
 define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8
   %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8

diff  --git a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index c4db4c5734a5..6cecc23a8810 100644
--- a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt,-enable-ds128 < %s | FileCheck -check-prefix=GCN %s
 
 @sPrivateStorage = internal addrspace(3) global [256 x [8 x <4 x i64>]] undef
 

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.ll b/llvm/test/CodeGen/AMDGPU/store-local.ll
index f0e76ac59510..f302ea099b75 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,FUNC %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SICIVI,VI,FUNC %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,FUNC %s
 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck -check-prefixes=EG,FUNC %s
 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cayman < %s | FileCheck -check-prefixes=CM,FUNC %s
@@ -156,7 +156,9 @@ entry:
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
 
-; GCN: ds_write2_b64
+; SI: ds_write2_b32
+; VI: ds_write_b128
+; GFX9: ds_write_b128
 define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out

diff  --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
index 7af1736b3207..b2e69deb73cf 100644
--- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -46,8 +46,14 @@ define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}local_store_v3i64:
-; GCN: ds_write2_b64
-; GCN: ds_write_b64
+; SI: ds_write2_b64
+; SI: ds_write_b64
+
+; CI: ds_write_b64
+; CI: ds_write_b128
+
+; VI: ds_write_b64
+; VI: ds_write_b128
 define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
   ret void


        


More information about the llvm-commits mailing list