[llvm] e66f0ed - [AMDGPU] Split unaligned LDS access instead of scalarizing

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Apr 7 14:27:50 PDT 2022


Author: Stanislav Mekhanoshin
Date: 2022-04-07T14:27:37-07:00
New Revision: e66f0edb4031dc1742d5f074822e9e8be58d63fc

URL: https://github.com/llvm/llvm-project/commit/e66f0edb4031dc1742d5f074822e9e8be58d63fc
DIFF: https://github.com/llvm/llvm-project/commit/e66f0edb4031dc1742d5f074822e9e8be58d63fc.diff

LOG: [AMDGPU] Split unaligned LDS access instead of scalarizing

There is no need to fully scalarize an unaligned operation in
some cases; it is enough to split it down to the alignment.

Differential Revision: https://reviews.llvm.org/D123330
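
For illustration, a minimal IR reproducer in the spirit of the modified
load-local.96.ll test below. The test body itself is not reproduced in this
diff, so the load here is an assumed reconstruction matching the function
signature shown in the diff context; with this change the align-8 <3 x i32>
LDS load is split to alignment (ds_read_b64 plus ds_read_b32 in the updated
GFX9 checks) instead of being scalarized into per-element 32-bit reads.

; Assumed minimal body for @load_lds_v3i32_align8 from load-local.96.ll.
define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
  ; performLoadCombine now calls SplitVectorLoad instead of
  ; scalarizeVectorLoad for this misaligned vector load.
  %v = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
  ret <3 x i32> %v
}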

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/ds-alignment.ll
    llvm/test/CodeGen/AMDGPU/load-local.96.ll
    llvm/test/CodeGen/AMDGPU/store-local.128.ll
    llvm/test/CodeGen/AMDGPU/store-local.96.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e1d73e2dcfae2..9f21fd942e288 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2994,12 +2994,11 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
     // the bytes again are not eliminated in the case of an unaligned copy.
     if (!allowsMisalignedMemoryAccesses(
             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
-      SDValue Ops[2];
-
       if (VT.isVector())
-        std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
-      else
-        std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
+        return SplitVectorLoad(SDValue(LN, 0), DAG);
+
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
 
       return DAG.getMergeValues(Ops, SDLoc(N));
     }
@@ -3050,7 +3049,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
     if (!allowsMisalignedMemoryAccesses(
             VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
       if (VT.isVector())
-        return scalarizeVectorStore(SN, DAG);
+        return SplitVectorStore(SDValue(SN, 0), DAG);
 
       return expandUnalignedStore(SN, DAG);
     }

diff  --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index d3a62e3c1cb27..00e550a29c6be 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -395,22 +395,22 @@ define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
 ; ALIGNED-SDAG-NEXT:    s_endpgm
 ;
 ; ALIGNED-GISEL-LABEL: ds12align1:
@@ -492,24 +492,23 @@ define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:8
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0 offset:2
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:4
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:10
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:8
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v6, s1
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1 offset:8
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:2
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:4
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:6
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:4
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:10
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:8
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2 offset:2
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v0 offset:10
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    s_endpgm
 ;
 ; ALIGNED-GISEL-LABEL: ds12align2:
@@ -589,14 +588,14 @@ define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> add
 ; ALIGNED-SDAG:       ; %bb.0:
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v0 offset:8
-; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; ALIGNED-SDAG-NEXT:    ds_read_b64 v[0:1], v2
+; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v2 offset:8
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
-; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
 ; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
+; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
 ; ALIGNED-SDAG-NEXT:    s_endpgm
 ;
 ; ALIGNED-GISEL-LABEL: ds12align8:
@@ -667,25 +666,30 @@ define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> add
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
 ; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
-; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
 ; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
+; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
 ; ALIGNED-SDAG-NEXT:    s_endpgm
 ;
 ; ALIGNED-GISEL-LABEL: ds16align1:
@@ -783,29 +787,27 @@ define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> add
 ; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:12
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
-; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v8, s1
+; ALIGNED-SDAG-NEXT:    ds_read_u16 v8, v0 offset:12
 ; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v1 offset:12
+; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v3 offset:2
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v2
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v4 offset:4
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
 ; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v6 offset:8
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v5 offset:6
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v7 offset:10
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
-; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v0 offset:14
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:6
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v7 offset:10
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:8
+; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v0 offset:14
+; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v8 offset:12
 ; ALIGNED-SDAG-NEXT:    s_endpgm
 ;
 ; ALIGNED-GISEL-LABEL: ds16align2:
@@ -856,33 +858,19 @@ define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> add
 }
 
 define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
-; ALIGNED-SDAG-LABEL: ds16align4:
-; ALIGNED-SDAG:       ; %bb.0:
-; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
-; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
-; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
-; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
-; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
-; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
-; ALIGNED-SDAG-NEXT:    s_endpgm
-;
-; ALIGNED-GISEL-LABEL: ds16align4:
-; ALIGNED-GISEL:       ; %bb.0:
-; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
-; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
-; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
-; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
-; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
-; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
-; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
-; ALIGNED-GISEL-NEXT:    s_endpgm
+; ALIGNED-LABEL: ds16align4:
+; ALIGNED:       ; %bb.0:
+; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
+; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
+; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
+; ALIGNED-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
+; ALIGNED-NEXT:    v_mov_b32_e32 v4, s1
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
+; ALIGNED-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
+; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
+; ALIGNED-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
+; ALIGNED-NEXT:    s_endpgm
 ;
 ; UNALIGNED-LABEL: ds16align4:
 ; UNALIGNED:       ; %bb.0:

diff  --git a/llvm/test/CodeGen/AMDGPU/load-local.96.ll b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
index e29f221d17b7e..b25d767a1346a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local.96.ll
@@ -352,7 +352,7 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX9-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
@@ -362,7 +362,7 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX7-NEXT:    s_mov_b32 m0, -1
-; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX7-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -370,12 +370,11 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
 ; GFX6-LABEL: load_lds_v3i32_align8:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
+; GFX6-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX6-NEXT:    s_mov_b32 m0, -1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
+; GFX6-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX6-NEXT:    ds_read_b32 v2, v2
-; GFX6-NEXT:    ds_read_b32 v0, v0
-; GFX6-NEXT:    ds_read_b32 v1, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -384,7 +383,7 @@ define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, v0
-; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
+; GFX10-NEXT:    ds_read_b64 v[0:1], v0
 ; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
index be489bae434e0..5514e775d5336 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll
@@ -70,42 +70,42 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:12
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
-; GFX9-NEXT:    ds_write_b8 v0, v2 offset:8
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:8
+; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
+; GFX9-NEXT:    ds_write_b8 v0, v2 offset:12
+; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:14
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    s_lshr_b32 s0, s7, 8
 ; GFX9-NEXT:    ds_write_b8 v0, v1
 ; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s7, 24
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_lshr_b32 s0, s6, 8
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
+; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s6, 24
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX9-NEXT:    s_lshr_b32 s0, s7, 8
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s5, 24
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX9-NEXT:    s_lshr_b32 s0, s7, 24
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:13
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 24
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 24
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align1:
@@ -115,50 +115,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:12
-; GFX7-NEXT:    ds_write_b8 v0, v2 offset:8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mov_b32_e32 v2, s3
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:8
+; GFX7-NEXT:    ds_write_b8 v0, v2 offset:12
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    s_lshr_b32 s4, s3, 8
 ; GFX7-NEXT:    ds_write_b8 v0, v1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    s_lshr_b32 s4, s2, 8
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
-; GFX7-NEXT:    v_mov_b32_e32 v1, s4
-; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
+; GFX7-NEXT:    s_lshr_b32 s4, s2, 24
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
+; GFX7-NEXT:    s_lshr_b32 s2, s3, 8
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX7-NEXT:    s_lshr_b32 s2, s3, 24
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:13
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX7-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:15
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 24
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v4i32_align1:
@@ -168,50 +168,50 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
-; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:12
-; GFX6-NEXT:    ds_write_b8 v0, v2 offset:8
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:8
+; GFX6-NEXT:    ds_write_b8 v0, v2 offset:12
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    s_lshr_b32 s4, s3, 8
 ; GFX6-NEXT:    ds_write_b8 v0, v1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    s_lshr_b32 s4, s2, 8
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    s_lshr_b32 s4, s3, 24
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
-; GFX6-NEXT:    v_mov_b32_e32 v1, s4
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
-; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
-; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
+; GFX6-NEXT:    s_lshr_b32 s4, s2, 24
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s2, s3, 8
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX6-NEXT:    s_lshr_b32 s2, s3, 24
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:13
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX6-NEXT:    s_lshr_b32 s2, s3, 16
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:15
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:14
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 8
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 24
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v4i32_align1:
@@ -221,42 +221,42 @@ define amdgpu_kernel void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    s_lshr_b32 s3, s6, 24
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    s_lshr_b32 s0, s7, 8
-; GFX10-NEXT:    s_lshr_b32 s2, s6, 8
-; GFX10-NEXT:    s_lshr_b32 s6, s5, 8
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    s_lshr_b32 s1, s7, 24
-; GFX10-NEXT:    s_lshr_b32 s5, s5, 24
+; GFX10-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-NEXT:    s_lshr_b32 s3, s7, 24
+; GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-NEXT:    s_lshr_b32 s0, s6, 8
+; GFX10-NEXT:    s_lshr_b32 s1, s6, 24
+; GFX10-NEXT:    s_lshr_b32 s6, s4, 8
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_lshr_b32 s2, s7, 8
+; GFX10-NEXT:    s_lshr_b32 s4, s4, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s3
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v9, s6
-; GFX10-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    s_lshr_b32 s0, s5, 8
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s2
-; GFX10-NEXT:    ds_write_b8 v0, v1 offset:12
-; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:14
-; GFX10-NEXT:    ds_write_b8 v0, v2 offset:8
-; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:10
-; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
-; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
-; GFX10-NEXT:    ds_write_b8 v0, v4
-; GFX10-NEXT:    ds_write_b8_d16_hi v0, v4 offset:2
-; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
-; GFX10-NEXT:    ds_write_b8 v0, v6 offset:15
-; GFX10-NEXT:    ds_write_b8 v0, v7 offset:9
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    s_lshr_b32 s1, s4, 24
+; GFX10-NEXT:    ds_write_b8 v0, v1 offset:8
+; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
+; GFX10-NEXT:    ds_write_b8 v0, v2 offset:12
+; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:14
+; GFX10-NEXT:    ds_write_b8 v0, v3
+; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:2
+; GFX10-NEXT:    ds_write_b8 v0, v4 offset:4
+; GFX10-NEXT:    ds_write_b8_d16_hi v0, v4 offset:6
+; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
+; GFX10-NEXT:    ds_write_b8 v0, v6 offset:11
+; GFX10-NEXT:    ds_write_b8 v0, v7 offset:13
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    s_lshr_b32 s1, s5, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s1
-; GFX10-NEXT:    ds_write_b8 v0, v8 offset:11
-; GFX10-NEXT:    ds_write_b8 v0, v9 offset:5
-; GFX10-NEXT:    ds_write_b8 v0, v1 offset:7
-; GFX10-NEXT:    ds_write_b8 v0, v2 offset:1
-; GFX10-NEXT:    ds_write_b8 v0, v3 offset:3
+; GFX10-NEXT:    ds_write_b8 v0, v8 offset:15
+; GFX10-NEXT:    ds_write_b8 v0, v9 offset:1
+; GFX10-NEXT:    ds_write_b8 v0, v1 offset:3
+; GFX10-NEXT:    ds_write_b8 v0, v2 offset:5
+; GFX10-NEXT:    ds_write_b8 v0, v3 offset:7
 ; GFX10-NEXT:    s_endpgm
   store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
   ret void
@@ -269,18 +269,18 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
-; GFX9-NEXT:    v_mov_b32_e32 v1, s7
-; GFX9-NEXT:    v_mov_b32_e32 v2, s6
-; GFX9-NEXT:    ds_write_b16 v0, v1 offset:12
-; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
-; GFX9-NEXT:    ds_write_b16 v0, v2 offset:8
-; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
-; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
+; GFX9-NEXT:    v_mov_b32_e32 v1, s6
+; GFX9-NEXT:    v_mov_b32_e32 v2, s7
+; GFX9-NEXT:    ds_write_b16 v0, v1 offset:8
+; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
+; GFX9-NEXT:    ds_write_b16 v0, v2 offset:12
+; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:14
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    ds_write_b16 v0, v1
 ; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v4i32_align2:
@@ -290,26 +290,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_mov_b32 m0, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
-; GFX7-NEXT:    v_mov_b32_e32 v2, s2
-; GFX7-NEXT:    ds_write_b16 v0, v1 offset:12
-; GFX7-NEXT:    ds_write_b16 v0, v2 offset:8
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
+; GFX7-NEXT:    v_mov_b32_e32 v2, s3
+; GFX7-NEXT:    ds_write_b16 v0, v1 offset:8
+; GFX7-NEXT:    ds_write_b16 v0, v2 offset:12
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
-; GFX7-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX7-NEXT:    ds_write_b16 v0, v1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s3
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
+; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX7-NEXT:    s_lshr_b32 s2, s3, 16
 ; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
+; GFX7-NEXT:    ds_write_b16 v0, v1 offset:14
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v4i32_align2:
@@ -319,26 +319,26 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
-; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    ds_write_b16 v0, v1 offset:12
-; GFX6-NEXT:    ds_write_b16 v0, v2 offset:8
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NEXT:    ds_write_b16 v0, v1 offset:8
+; GFX6-NEXT:    ds_write_b16 v0, v2 offset:12
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    s_lshr_b32 s3, s3, 16
 ; GFX6-NEXT:    ds_write_b16 v0, v1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
+; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshr_b32 s2, s3, 16
 ; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
+; GFX6-NEXT:    ds_write_b16 v0, v1 offset:14
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v4i32_align2:
@@ -348,18 +348,18 @@ define amdgpu_kernel void @store_lds_v4i32_align2(<4 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    v_mov_b32_e32 v1, s7
-; GFX10-NEXT:    v_mov_b32_e32 v2, s6
-; GFX10-NEXT:    v_mov_b32_e32 v3, s5
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-NEXT:    ds_write_b16 v0, v1 offset:12
-; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:14
-; GFX10-NEXT:    ds_write_b16 v0, v2 offset:8
-; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:10
-; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
-; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
-; GFX10-NEXT:    ds_write_b16 v0, v4
-; GFX10-NEXT:    ds_write_b16_d16_hi v0, v4 offset:2
+; GFX10-NEXT:    v_mov_b32_e32 v1, s6
+; GFX10-NEXT:    v_mov_b32_e32 v2, s7
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
+; GFX10-NEXT:    ds_write_b16 v0, v1 offset:8
+; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
+; GFX10-NEXT:    ds_write_b16 v0, v2 offset:12
+; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:14
+; GFX10-NEXT:    ds_write_b16 v0, v3
+; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:2
+; GFX10-NEXT:    ds_write_b16 v0, v4 offset:4
+; GFX10-NEXT:    ds_write_b16_d16_hi v0, v4 offset:6
 ; GFX10-NEXT:    s_endpgm
   store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 2
   ret void
@@ -402,12 +402,12 @@ define amdgpu_kernel void @store_lds_v4i32_align4(<4 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset0:2 offset1:3
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v4i32_align4:

diff  --git a/llvm/test/CodeGen/AMDGPU/store-local.96.ll b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
index 592e2aaa8e589..7f57a1150ed5e 100644
--- a/llvm/test/CodeGen/AMDGPU/store-local.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-local.96.ll
@@ -68,32 +68,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:8
 ; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
-; GFX9-NEXT:    ds_write_b8 v0, v2 offset:4
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    ds_write_b8 v0, v2
+; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_lshr_b32 s0, s6, 8
-; GFX9-NEXT:    ds_write_b8 v0, v1
-; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:2
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
+; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s6, 24
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
-; GFX9-NEXT:    s_lshr_b32 s0, s5, 24
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
-; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 8
-; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    s_lshr_b32 s0, s4, 24
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
 ; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    s_lshr_b32 s0, s5, 24
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s0
+; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align1:
@@ -104,12 +104,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:8
-; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    ds_write_b8 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
-; GFX7-NEXT:    ds_write_b8 v0, v1
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
@@ -117,25 +117,25 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s2, s1, 8
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s2, s1, 24
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
-; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 8
-; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
-; GFX7-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
+; GFX7-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
 ; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 24
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v3i32_align1:
@@ -146,12 +146,12 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:8
-; GFX6-NEXT:    ds_write_b8 v0, v2 offset:4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    ds_write_b8 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
-; GFX6-NEXT:    ds_write_b8 v0, v1
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
@@ -159,25 +159,25 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s2, s1, 8
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s2, s1, 24
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
-; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
-; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
-; GFX6-NEXT:    s_lshr_b32 s1, s0, 24
+; GFX6-NEXT:    s_lshr_b32 s2, s0, 24
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 8
 ; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 24
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v3i32_align1:
@@ -188,32 +188,32 @@ define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    s_lshr_b32 s0, s6, 8
 ; GFX10-NEXT:    s_lshr_b32 s1, s6, 24
-; GFX10-NEXT:    s_lshr_b32 s2, s5, 8
-; GFX10-NEXT:    s_lshr_b32 s3, s5, 24
-; GFX10-NEXT:    s_lshr_b32 s5, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s4, s4, 24
+; GFX10-NEXT:    s_lshr_b32 s2, s4, 8
+; GFX10-NEXT:    s_lshr_b32 s3, s4, 24
+; GFX10-NEXT:    s_lshr_b32 s4, s5, 8
+; GFX10-NEXT:    s_lshr_b32 s5, s5, 24
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v7, s3
-; GFX10-NEXT:    v_mov_b32_e32 v8, s5
-; GFX10-NEXT:    v_mov_b32_e32 v9, s4
+; GFX10-NEXT:    v_mov_b32_e32 v8, s4
+; GFX10-NEXT:    v_mov_b32_e32 v9, s5
 ; GFX10-NEXT:    ds_write_b8 v0, v1 offset:8
 ; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
-; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
-; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:6
-; GFX10-NEXT:    ds_write_b8 v0, v3
-; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:2
+; GFX10-NEXT:    ds_write_b8 v0, v2
+; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:2
+; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
+; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
 ; GFX10-NEXT:    ds_write_b8 v0, v4 offset:9
 ; GFX10-NEXT:    ds_write_b8 v0, v5 offset:11
-; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
-; GFX10-NEXT:    ds_write_b8 v0, v7 offset:7
-; GFX10-NEXT:    ds_write_b8 v0, v8 offset:1
-; GFX10-NEXT:    ds_write_b8 v0, v9 offset:3
+; GFX10-NEXT:    ds_write_b8 v0, v6 offset:1
+; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
+; GFX10-NEXT:    ds_write_b8 v0, v8 offset:5
+; GFX10-NEXT:    ds_write_b8 v0, v9 offset:7
 ; GFX10-NEXT:    s_endpgm
   store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
   ret void
@@ -227,14 +227,14 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9-NEXT:    v_mov_b32_e32 v2, s5
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-NEXT:    ds_write_b16 v0, v1 offset:8
 ; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
-; GFX9-NEXT:    ds_write_b16 v0, v2 offset:4
-; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:6
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    ds_write_b16 v0, v1
-; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:2
+; GFX9-NEXT:    ds_write_b16 v0, v2
+; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
+; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: store_lds_v3i32_align2:
@@ -245,20 +245,20 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    v_mov_b32_e32 v2, s1
+; GFX7-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX7-NEXT:    ds_write_b16 v0, v1 offset:8
-; GFX7-NEXT:    ds_write_b16 v0, v2 offset:4
-; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    ds_write_b16 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX7-NEXT:    ds_write_b16 v0, v1
+; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s2
-; GFX7-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
-; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
+; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
+; GFX7-NEXT:    v_mov_b32_e32 v1, s0
+; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
 ; GFX7-NEXT:    s_endpgm
 ;
 ; GFX6-LABEL: store_lds_v3i32_align2:
@@ -269,20 +269,20 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    ds_write_b16 v0, v1 offset:8
-; GFX6-NEXT:    ds_write_b16 v0, v2 offset:4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    ds_write_b16 v0, v2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
-; GFX6-NEXT:    ds_write_b16 v0, v1
+; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
-; GFX6-NEXT:    s_lshr_b32 s1, s1, 16
-; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
-; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
+; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
 ; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
+; GFX6-NEXT:    v_mov_b32_e32 v1, s0
+; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v3i32_align2:
@@ -293,14 +293,14 @@ define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out,
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s6
-; GFX10-NEXT:    v_mov_b32_e32 v2, s5
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    v_mov_b32_e32 v2, s4
+; GFX10-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX10-NEXT:    ds_write_b16 v0, v1 offset:8
 ; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
-; GFX10-NEXT:    ds_write_b16 v0, v2 offset:4
-; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:6
-; GFX10-NEXT:    ds_write_b16 v0, v3
-; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:2
+; GFX10-NEXT:    ds_write_b16 v0, v2
+; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:2
+; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
+; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
 ; GFX10-NEXT:    s_endpgm
   store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
   ret void
@@ -341,11 +341,11 @@ define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out,
 ; GFX6-NEXT:    s_mov_b32 m0, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s4
-; GFX6-NEXT:    v_mov_b32_e32 v1, s0
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s2
+; GFX6-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX6-NEXT:    ds_write_b32 v0, v1 offset:8
+; GFX6-NEXT:    v_mov_b32_e32 v1, s1
+; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_lds_v3i32_align4:


        

