[llvm] e0c2cc7 - [AMDGPU][GlobalISel] Add buffer store byte/short RegBankLegalize rules (#179367)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 5 07:18:44 PST 2026


Author: vangthao95
Date: 2026-02-05T07:18:39-08:00
New Revision: e0c2cc7ed0d4b0c1a567aa7b69d84f8792f8a703

URL: https://github.com/llvm/llvm-project/commit/e0c2cc7ed0d4b0c1a567aa7b69d84f8792f8a703
DIFF: https://github.com/llvm/llvm-project/commit/e0c2cc7ed0d4b0c1a567aa7b69d84f8792f8a703.diff

LOG: [AMDGPU][GlobalISel] Add buffer store byte/short RegBankLegalize rules (#179367)

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 7844d19ada723..26e409b176915 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -269,7 +269,8 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) {
     return B64;
   if (Ty == LLT::fixed_vector(3, 32))
     return B96;
-  if (Ty == LLT::fixed_vector(4, 32) || isAnyPtr(Ty, 128))
+  if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
+      Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
     return B128;
   return _;
 }
@@ -1022,7 +1023,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
       .Any(
           {{DivB128}, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
 
-  addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_FORMAT,
+  addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
+                    G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
                     G_AMDGPU_BUFFER_STORE_FORMAT_D16,
                     G_AMDGPU_TBUFFER_STORE_FORMAT,
                     G_AMDGPU_TBUFFER_STORE_FORMAT_D16})

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
index c365d5711f6ce..373b120c566a9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s
 ; FIXME: Test with SI when argument lowering not broken for f16
 
 ; Natural mapping

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
index a15b34dbb8c21..1c667e287f630 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s
 ; FIXME: Test with SI when argument lowering not broken for f16
 
 ; Natural mapping

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
index c9771b5aca0db..6fb35ad5ce1a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s
 
 ; Natural mapping
 define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll
index f331e29176740..3a8e2e6e5f6c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck %s
 
 ; Natural mapping
 define amdgpu_ps void @struct_ptr_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {

diff  --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 867ec0488d199..5967d17c351ea 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -new-reg-bank-select -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL %s
 
 ; Note: if you're adding tests here, also add them to
 ; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by
@@ -173,6 +173,14 @@ define i128 @load_i128(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load i128, ptr addrspace(7) %p
@@ -439,6 +447,14 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <8 x i16>, ptr addrspace(7) %p
@@ -477,6 +493,14 @@ define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <2 x i64>, ptr addrspace(7) %p
@@ -667,6 +691,14 @@ define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <8 x half>, ptr addrspace(7) %p
@@ -1161,6 +1193,14 @@ define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <2 x ptr addrspace(1)>, ptr addrspace(7) %p
@@ -1199,6 +1239,10 @@ define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <2 x ptr addrspace(5)>, ptr addrspace(7) %p
@@ -1237,6 +1281,12 @@ define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <3 x ptr addrspace(5)>, ptr addrspace(7) %p
@@ -1275,6 +1325,14 @@ define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <4 x ptr addrspace(5)>, ptr addrspace(7) %p
@@ -1315,6 +1373,12 @@ define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <6 x half>, ptr addrspace(7) %p
@@ -1612,7 +1676,24 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v4
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v5
+; GISEL-NEXT:    v_readfirstlane_b32 s10, v6
+; GISEL-NEXT:    v_readfirstlane_b32 s11, v7
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    v_mov_b32_e32 v4, s8
+; GISEL-NEXT:    v_mov_b32_e32 v5, s9
+; GISEL-NEXT:    v_mov_b32_e32 v6, s10
+; GISEL-NEXT:    v_mov_b32_e32 v7, s11
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <4 x ptr addrspace(1)>, ptr addrspace(7) %p
@@ -1655,6 +1736,8 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <1 x i16>, ptr addrspace(7) %p
@@ -1692,8 +1775,11 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v3i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v1, off, s[16:19], 0 offset:4
+; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v1
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1734,8 +1820,11 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-LABEL: load_v5i16:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v2, off, s[16:19], 0 offset:8
+; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v2
+; GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -1777,6 +1866,12 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <6 x i16>, ptr addrspace(7) %p
@@ -1816,7 +1911,16 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v3, off, s[16:19], 0 offset:12
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <7 x i16>, ptr addrspace(7) %p
@@ -1858,7 +1962,18 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:16
+; GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v4
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    v_mov_b32_e32 v4, s8
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <9 x i16>, ptr addrspace(7) %p
@@ -1942,7 +2057,9 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <2 x i8>, ptr addrspace(7) %p
@@ -1990,7 +2107,9 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ubyte v2, off, s[16:19], 0 offset:2
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2040,9 +2159,13 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s5, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <4 x i8>, ptr addrspace(7) %p
@@ -2100,9 +2223,13 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ubyte v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s5, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2167,11 +2294,17 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s7, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s5, s5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <6 x i8>, ptr addrspace(7) %p
@@ -2238,11 +2371,17 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4
 ; GISEL-NEXT:    buffer_load_ubyte v6, off, s[16:19], 0 offset:6
 ; GISEL-NEXT:    s_waitcnt vmcnt(2)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s7, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s5, s5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -2311,14 +2450,21 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx2 v[0:1], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v8, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s7, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s8, s5, 8
+; GISEL-NEXT:    s_lshr_b32 s9, s5, 16
+; GISEL-NEXT:    s_lshr_b32 s5, s5, 24
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v1
-; GISEL-NEXT:    v_mov_b32_e32 v1, v8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s8
+; GISEL-NEXT:    v_mov_b32_e32 v6, s9
+; GISEL-NEXT:    v_mov_b32_e32 v7, s5
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <8 x i8>, ptr addrspace(7) %p
@@ -2393,19 +2539,29 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    s_lshr_b32 s7, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s8, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s9, s5, 8
+; GISEL-NEXT:    s_lshr_b32 s10, s5, 16
+; GISEL-NEXT:    s_lshr_b32 s5, s5, 24
+; GISEL-NEXT:    s_lshr_b32 s11, s6, 8
+; GISEL-NEXT:    s_lshr_b32 s12, s6, 16
+; GISEL-NEXT:    s_lshr_b32 s6, s6, 24
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v8, v2
-; GISEL-NEXT:    v_mov_b32_e32 v1, v13
-; GISEL-NEXT:    v_mov_b32_e32 v2, v12
+; GISEL-NEXT:    v_mov_b32_e32 v1, s7
+; GISEL-NEXT:    v_mov_b32_e32 v2, s8
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s9
+; GISEL-NEXT:    v_mov_b32_e32 v6, s10
+; GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GISEL-NEXT:    v_mov_b32_e32 v9, s11
+; GISEL-NEXT:    v_mov_b32_e32 v10, s12
+; GISEL-NEXT:    v_mov_b32_e32 v11, s6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <12 x i8>, ptr addrspace(7) %p
@@ -2495,24 +2651,37 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    s_lshr_b32 s8, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s9, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s10, s5, 8
+; GISEL-NEXT:    s_lshr_b32 s11, s5, 16
+; GISEL-NEXT:    s_lshr_b32 s5, s5, 24
+; GISEL-NEXT:    s_lshr_b32 s12, s6, 8
+; GISEL-NEXT:    s_lshr_b32 s13, s6, 16
+; GISEL-NEXT:    s_lshr_b32 s6, s6, 24
+; GISEL-NEXT:    s_lshr_b32 s14, s7, 8
+; GISEL-NEXT:    s_lshr_b32 s15, s7, 16
+; GISEL-NEXT:    s_lshr_b32 s7, s7, 24
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v8, v2
 ; GISEL-NEXT:    v_mov_b32_e32 v12, v3
-; GISEL-NEXT:    v_mov_b32_e32 v1, v16
-; GISEL-NEXT:    v_mov_b32_e32 v2, v17
-; GISEL-NEXT:    v_mov_b32_e32 v3, v18
+; GISEL-NEXT:    v_mov_b32_e32 v1, s8
+; GISEL-NEXT:    v_mov_b32_e32 v2, s9
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s10
+; GISEL-NEXT:    v_mov_b32_e32 v6, s11
+; GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GISEL-NEXT:    v_mov_b32_e32 v9, s12
+; GISEL-NEXT:    v_mov_b32_e32 v10, s13
+; GISEL-NEXT:    v_mov_b32_e32 v11, s6
+; GISEL-NEXT:    v_mov_b32_e32 v13, s14
+; GISEL-NEXT:    v_mov_b32_e32 v14, s15
+; GISEL-NEXT:    v_mov_b32_e32 v15, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <16 x i8>, ptr addrspace(7) %p
@@ -2629,43 +2798,69 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_dwordx4 v[16:19], off, s[16:19], 0 offset:16
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v35, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v36, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v37, 24, v0
-; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v32, 8, v16
-; GISEL-NEXT:    v_lshrrev_b32_e32 v33, 16, v16
-; GISEL-NEXT:    v_lshrrev_b32_e32 v34, 24, v16
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
-; GISEL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
-; GISEL-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
-; GISEL-NEXT:    v_lshrrev_b32_e32 v21, 8, v17
-; GISEL-NEXT:    v_lshrrev_b32_e32 v22, 16, v17
-; GISEL-NEXT:    v_lshrrev_b32_e32 v23, 24, v17
-; GISEL-NEXT:    v_lshrrev_b32_e32 v25, 8, v18
-; GISEL-NEXT:    v_lshrrev_b32_e32 v26, 16, v18
-; GISEL-NEXT:    v_lshrrev_b32_e32 v27, 24, v18
-; GISEL-NEXT:    v_lshrrev_b32_e32 v29, 8, v19
-; GISEL-NEXT:    v_lshrrev_b32_e32 v30, 16, v19
-; GISEL-NEXT:    v_lshrrev_b32_e32 v31, 24, v19
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s8, v16
+; GISEL-NEXT:    v_readfirstlane_b32 s9, v17
+; GISEL-NEXT:    v_readfirstlane_b32 s10, v18
+; GISEL-NEXT:    v_readfirstlane_b32 s11, v19
+; GISEL-NEXT:    s_lshr_b32 s12, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s13, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s14, s5, 8
+; GISEL-NEXT:    s_lshr_b32 s15, s5, 16
+; GISEL-NEXT:    s_lshr_b32 s5, s5, 24
+; GISEL-NEXT:    s_lshr_b32 s16, s6, 8
+; GISEL-NEXT:    s_lshr_b32 s17, s6, 16
+; GISEL-NEXT:    s_lshr_b32 s6, s6, 24
+; GISEL-NEXT:    s_lshr_b32 s18, s7, 8
+; GISEL-NEXT:    s_lshr_b32 s19, s7, 16
+; GISEL-NEXT:    s_lshr_b32 s7, s7, 24
+; GISEL-NEXT:    s_lshr_b32 s20, s8, 8
+; GISEL-NEXT:    s_lshr_b32 s21, s8, 16
+; GISEL-NEXT:    s_lshr_b32 s8, s8, 24
+; GISEL-NEXT:    s_lshr_b32 s22, s9, 8
+; GISEL-NEXT:    s_lshr_b32 s23, s9, 16
+; GISEL-NEXT:    s_lshr_b32 s9, s9, 24
+; GISEL-NEXT:    s_lshr_b32 s24, s10, 8
+; GISEL-NEXT:    s_lshr_b32 s25, s10, 16
+; GISEL-NEXT:    s_lshr_b32 s10, s10, 24
+; GISEL-NEXT:    s_lshr_b32 s26, s11, 8
+; GISEL-NEXT:    s_lshr_b32 s27, s11, 16
+; GISEL-NEXT:    s_lshr_b32 s11, s11, 24
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v8, v2
 ; GISEL-NEXT:    v_mov_b32_e32 v12, v3
 ; GISEL-NEXT:    v_mov_b32_e32 v20, v17
 ; GISEL-NEXT:    v_mov_b32_e32 v24, v18
 ; GISEL-NEXT:    v_mov_b32_e32 v28, v19
-; GISEL-NEXT:    v_mov_b32_e32 v1, v35
-; GISEL-NEXT:    v_mov_b32_e32 v2, v36
-; GISEL-NEXT:    v_mov_b32_e32 v3, v37
-; GISEL-NEXT:    v_mov_b32_e32 v17, v32
-; GISEL-NEXT:    v_mov_b32_e32 v18, v33
-; GISEL-NEXT:    v_mov_b32_e32 v19, v34
+; GISEL-NEXT:    v_mov_b32_e32 v1, s12
+; GISEL-NEXT:    v_mov_b32_e32 v2, s13
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s14
+; GISEL-NEXT:    v_mov_b32_e32 v6, s15
+; GISEL-NEXT:    v_mov_b32_e32 v7, s5
+; GISEL-NEXT:    v_mov_b32_e32 v9, s16
+; GISEL-NEXT:    v_mov_b32_e32 v10, s17
+; GISEL-NEXT:    v_mov_b32_e32 v11, s6
+; GISEL-NEXT:    v_mov_b32_e32 v13, s18
+; GISEL-NEXT:    v_mov_b32_e32 v14, s19
+; GISEL-NEXT:    v_mov_b32_e32 v15, s7
+; GISEL-NEXT:    v_mov_b32_e32 v17, s20
+; GISEL-NEXT:    v_mov_b32_e32 v18, s21
+; GISEL-NEXT:    v_mov_b32_e32 v19, s8
+; GISEL-NEXT:    v_mov_b32_e32 v21, s22
+; GISEL-NEXT:    v_mov_b32_e32 v22, s23
+; GISEL-NEXT:    v_mov_b32_e32 v23, s9
+; GISEL-NEXT:    v_mov_b32_e32 v25, s24
+; GISEL-NEXT:    v_mov_b32_e32 v26, s25
+; GISEL-NEXT:    v_mov_b32_e32 v27, s10
+; GISEL-NEXT:    v_mov_b32_e32 v29, s26
+; GISEL-NEXT:    v_mov_b32_e32 v30, s27
+; GISEL-NEXT:    v_mov_b32_e32 v31, s11
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <32 x i8>, ptr addrspace(7) %p
@@ -2871,7 +3066,9 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 16
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load [2 x half], ptr addrspace(7) %p
@@ -2914,6 +3111,14 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx4 v[0:3], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_readfirstlane_b32 s7, v3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load [2 x ptr addrspace(1)], ptr addrspace(7) %p
@@ -2955,19 +3160,23 @@ define i40 @load_i40(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    buffer_load_ubyte v1, off, s[16:19], 0 offset:4
-; GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-NEXT:    s_waitcnt vmcnt(1)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
-; GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
-; GISEL-NEXT:    v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GISEL-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
-; GISEL-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
-; GISEL-NEXT:    v_or_b32_e32 v2, v2, v4
-; GISEL-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s5, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s7, s4, 24
+; GISEL-NEXT:    s_and_b32 s5, s5, 0xff
+; GISEL-NEXT:    s_and_b32 s6, s6, 0xff
+; GISEL-NEXT:    s_lshl_b32 s7, s7, 8
+; GISEL-NEXT:    s_and_b32 s4, s4, 0xff
+; GISEL-NEXT:    s_lshl_b32 s5, s5, 8
+; GISEL-NEXT:    s_or_b32 s6, s6, s7
+; GISEL-NEXT:    s_or_b32 s4, s4, s5
+; GISEL-NEXT:    s_and_b32 s5, 0xffff, s6
+; GISEL-NEXT:    s_and_b32 s4, 0xffff, s4
+; GISEL-NEXT:    s_lshl_b32 s5, s5, 16
+; GISEL-NEXT:    s_or_b32 s4, s4, s5
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
@@ -3009,6 +3218,12 @@ define i96 @load_i96(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dwordx3 v[0:2], off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v1
+; GISEL-NEXT:    v_readfirstlane_b32 s6, v2
+; GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load i96, ptr addrspace(7) %p
@@ -3221,7 +3436,9 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_ubyte v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 4
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <2 x i4>, ptr addrspace(7) %p
@@ -3279,9 +3496,13 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 12, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s5, s4, 4
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 12
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <4 x i4>, ptr addrspace(7) %p
@@ -3347,13 +3568,21 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 12, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 20, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v7, 28, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s5, s4, 4
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s7, s4, 12
+; GISEL-NEXT:    s_lshr_b32 s8, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s9, s4, 20
+; GISEL-NEXT:    s_lshr_b32 s10, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 28
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GISEL-NEXT:    v_mov_b32_e32 v4, s8
+; GISEL-NEXT:    v_mov_b32_e32 v5, s9
+; GISEL-NEXT:    v_mov_b32_e32 v6, s10
+; GISEL-NEXT:    v_mov_b32_e32 v7, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <8 x i4>, ptr addrspace(7) %p
@@ -3429,7 +3658,10 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_ushort v0, off, s[16:19], 0
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b16_e32 v1, 6, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_and_b32 s4, 0xffff, s4
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 6
+; GISEL-NEXT:    v_mov_b32_e32 v1, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load <2 x i6>, ptr addrspace(7) %p
@@ -3528,9 +3760,13 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    buffer_load_dword v0, off, s[16:19], 0 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    s_lshr_b32 s5, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load volatile <4 x i8>, ptr addrspace(7) %p
@@ -3593,10 +3829,16 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) {
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GISEL-NEXT:    buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc
 ; GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GISEL-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
-; GISEL-NEXT:    v_lshrrev_b32_e32 v5, 8, v4
+; GISEL-NEXT:    v_readfirstlane_b32 s4, v0
+; GISEL-NEXT:    v_readfirstlane_b32 s5, v4
+; GISEL-NEXT:    s_lshr_b32 s6, s4, 8
+; GISEL-NEXT:    s_lshr_b32 s7, s4, 16
+; GISEL-NEXT:    s_lshr_b32 s4, s4, 24
+; GISEL-NEXT:    s_lshr_b32 s5, s5, 8
+; GISEL-NEXT:    v_mov_b32_e32 v1, s6
+; GISEL-NEXT:    v_mov_b32_e32 v2, s7
+; GISEL-NEXT:    v_mov_b32_e32 v3, s4
+; GISEL-NEXT:    v_mov_b32_e32 v5, s5
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
   %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
   %ret = load volatile <6 x i8>, ptr addrspace(7) %p


        


More information about the llvm-commits mailing list