[llvm] 2605adb - [AMDGPU][GlobalISel] Select 8-byte LDS Ops with 4-byte alignment

Austin Kerbow via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 29 10:43:07 PST 2020


Author: Austin Kerbow
Date: 2020-01-29T10:42:12-08:00
New Revision: 2605adb69c6f1f95c709c21560add8230e30e60b

URL: https://github.com/llvm/llvm-project/commit/2605adb69c6f1f95c709c21560add8230e30e60b
DIFF: https://github.com/llvm/llvm-project/commit/2605adb69c6f1f95c709c21560add8230e30e60b.diff

LOG: [AMDGPU][GlobalISel] Select 8-byte LDS Ops with 4-byte alignment

Reviewers: arsenm

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, rovka, dstuttard, tpr, t-tye, hiraditya, Petar.Avramovic, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73585

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUGISel.td
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/lib/Target/AMDGPU/DSInstructions.td
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index ce65717f2b29..21208aaf67ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -88,6 +88,10 @@ def gi_ds_1addr_1offset :
     GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
     GIComplexPatternEquiv<DS1Addr1Offset>;
 
+def gi_ds_64bit_4byte_aligned : // GlobalISel counterpart of the DS64Bit4ByteAligned complex pattern; matching is done by "selectDS64Bit4ByteAligned".
+    GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
+    GIComplexPatternEquiv<DS64Bit4ByteAligned>;
+
 def gi_mubuf_addr64 :
     GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
     GIComplexPatternEquiv<MUBUFAddr64>;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f50817f669f9..4596889d7429 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -2399,6 +2399,50 @@ AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
     }};
 }
 
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const { // Renders three operands for \p Root: an address operand followed by two immediate offsets.
+  const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
+  if (!RootDef) { // No defining instruction found: use Root unchanged with offsets 0 and 1.
+    return {{
+        [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+        [=](MachineInstrBuilder &MIB) { MIB.addImm(1); }
+      }};
+  }
+
+  int64_t ConstAddr = 0; // Only written by the m_ICst match in the final else-if below; not otherwise used yet.
+  Register PtrBase;
+  int64_t Offset;
+
+  std::tie(PtrBase, Offset) =
+    getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+
+  if (Offset) { // A non-zero constant offset was split off from the base pointer.
+    int64_t DWordOffset0 = Offset / 4; // NOTE(review): integer division discards any remainder, so a byte offset that is not a multiple of 4 would be rounded toward zero -- confirm such offsets cannot reach this point.
+    int64_t DWordOffset1 = DWordOffset0 + 1;
+    if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) { // NOTE(review): only the second (larger) offset is checked; presumably the last argument is the encodable offset width in bits -- confirm against isDSOffsetLegal.
+      // (add n0, c0): keep the base register and fold the constant into the two offset immediates.
+      return {{
+          [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },
+          [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset0); },
+          [=](MachineInstrBuilder &MIB) { MIB.addImm(DWordOffset1); }
+        }};
+    }
+  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+    // TODO: G_SUB-defined addresses are not folded yet; control falls through to the default rendering below.
+
+  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+    // TODO: constant addresses are not folded yet; control falls through to the default rendering below.
+
+  }
+
+  return {{ // Default: use Root unchanged as the address, with offsets 0 and 1.
+      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
+      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); }
+    }};
+}
+
 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
 /// the base value with the constant offset. There may be intervening copies
 /// between \p Root and the identified constant. Returns \p Root, 0 if this does

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index f4d9defd33f8..d7bf1885dd51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -179,6 +179,8 @@ class AMDGPUInstructionSelector : public InstructionSelector {
 
   InstructionSelector::ComplexRendererFns
   selectDS1Addr1Offset(MachineOperand &Root) const;
+  InstructionSelector::ComplexRendererFns
+  selectDS64Bit4ByteAligned(MachineOperand &Root) const; // Complex-operand matcher taking the root address operand; returns the operand renderer functions.
 
   std::pair<Register, int64_t>
   getPtrBaseWithConstantOffset(Register Root,

diff  --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index fe7faca8b157..ab069c681c97 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -737,31 +737,35 @@ def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
 def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
 }
 
-
-class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
-  (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+  (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
   (inst $ptr, $offset0, $offset1, (i1 0))
 >;
 
-class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
-  (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
-  (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
-              (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
+  (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+  (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
+              (i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1,
               (i1 0))
 >;
 
-// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
-// related to bounds checking.
-let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
-def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
-def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
-}
+multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
+  let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
+    def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
+    def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, vt, store_local_m0>;
+  }
 
-let OtherPredicates = [NotLDSRequiresM0Init] in {
-def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
-def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
+  let OtherPredicates = [NotLDSRequiresM0Init] in {
+    def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, vt, load_local>;
+    def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, vt, store_local>;
+  }
 }
 
+// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
+// related to bounds checking.
+foreach vt = VReg_64.RegTypes in {
+defm : DS64Bit4ByteAlignedPat_mc<vt>;
+}
 
 let AddedComplexity = 100 in {
 

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
index a118a873de59..08fdd0f30a16 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir
@@ -28,12 +28,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
-    ; GFX7-DS128-LABEL: name: load_local_s32_from_4
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX9-LABEL: name: load_local_s32_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -68,12 +62,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_U16_]]
-    ; GFX7-DS128-LABEL: name: load_local_s32_from_2
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U16_]]
     ; GFX9-LABEL: name: load_local_s32_from_2
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -112,12 +100,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
-    ; GFX7-DS128-LABEL: name: load_local_s32_from_1
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX9-LABEL: name: load_local_s32_from_1
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -152,12 +134,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
     ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
-    ; GFX7-DS128-LABEL: name: load_local_v2s32
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX9-LABEL: name: load_local_v2s32
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -188,21 +164,15 @@ body: |
     ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
     ; GFX7-LABEL: name: load_local_v2s32_align4
     ; GFX7: liveins: $vgpr0
-    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
-    ; GFX7-DS128-LABEL: name: load_local_v2s32_align4
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
     ; GFX9-LABEL: name: load_local_v2s32_align4
     ; GFX9: liveins: $vgpr0
-    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>)
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -232,12 +202,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
     ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
-    ; GFX7-DS128-LABEL: name: load_local_s64
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX9-LABEL: name: load_local_s64
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -268,21 +232,15 @@ body: |
     ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
     ; GFX7-LABEL: name: load_local_s64_align4
     ; GFX7: liveins: $vgpr0
-    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
-    ; GFX7-DS128-LABEL: name: load_local_s64_align4
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
     ; GFX9-LABEL: name: load_local_s64_align4
     ; GFX9: liveins: $vgpr0
-    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -312,12 +270,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
-    ; GFX7-DS128-LABEL: name: load_local_p3_from_4
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX9-LABEL: name: load_local_p3_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -352,12 +304,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
-    ; GFX7-DS128-LABEL: name: load_local_p5_from_4
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX9-LABEL: name: load_local_p5_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -392,12 +338,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
     ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
-    ; GFX7-DS128-LABEL: name: load_local_p1_align8
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX9-LABEL: name: load_local_p1_align8
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -428,21 +368,15 @@ body: |
     ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
     ; GFX7-LABEL: name: load_local_p1_align4
     ; GFX7: liveins: $vgpr0
-    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
-    ; GFX7-DS128-LABEL: name: load_local_p1_align4
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
     ; GFX9-LABEL: name: load_local_p1_align4
     ; GFX9: liveins: $vgpr0
-    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3)
-    ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p1)
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 4, addrspace 3)
     $vgpr0_vgpr1 = COPY %1
@@ -472,12 +406,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
-    ; GFX7-DS128-LABEL: name: load_local_p999_from_8
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](p999)
     ; GFX9-LABEL: name: load_local_p999_from_8
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -512,12 +440,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
     ; GFX7: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
-    ; GFX7-DS128-LABEL: name: load_local_v2p3
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>)
     ; GFX9-LABEL: name: load_local_v2p3
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
@@ -552,12 +474,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_B32_]]
-    ; GFX7-DS128-LABEL: name: load_local_v2s16
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_B32_]]
     ; GFX9-LABEL: name: load_local_v2s16
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -592,12 +508,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
     ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
-    ; GFX7-DS128-LABEL: name: load_local_v4s16
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3)
-    ; GFX7-DS128: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]]
     ; GFX9-LABEL: name: load_local_v4s16
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -659,12 +569,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
-    ; GFX7-DS128-LABEL: name: load_local_s32_from_1_gep_65535
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX9-LABEL: name: load_local_s32_from_1_gep_65535
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -750,14 +654,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
-    ; GFX7-DS128-LABEL: name: load_local_s32_from_1_gep_65536
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec
-    ; GFX7-DS128: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX9-LABEL: name: load_local_s32_from_1_gep_65536
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -800,14 +696,6 @@ body: |
     ; GFX7: $m0 = S_MOV_B32 -1
     ; GFX7: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
     ; GFX7: $vgpr0 = COPY [[DS_READ_U8_]]
-    ; GFX7-DS128-LABEL: name: load_local_s32_from_1_gep_m1
-    ; GFX7-DS128: liveins: $vgpr0
-    ; GFX7-DS128: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX7-DS128: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec
-    ; GFX7-DS128: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX7-DS128: $m0 = S_MOV_B32 -1
-    ; GFX7-DS128: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3)
-    ; GFX7-DS128: $vgpr0 = COPY [[DS_READ_U8_]]
     ; GFX9-LABEL: name: load_local_s32_from_1_gep_m1
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
@@ -822,3 +710,83 @@ body: |
     $vgpr0 = COPY %3
 
 ...
+
+---
+
+name: load_local_s64_align4_from_1_gep_1016
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1016
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7: $m0 = S_MOV_B32 -1
+    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 [[COPY]], 254, 255, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
+    ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1016
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %1:vgpr(s32) = G_CONSTANT i32 1016
+    %2:vgpr(p3) = G_PTR_ADD %0, %1
+    %3:vgpr(s64) = G_LOAD %2 :: (load 8, align 4, addrspace 3)
+    $vgpr0_vgpr1 = COPY %3
+
+...
+
+---
+
+name: load_local_s64_align4_from_1_gep_1020
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1
+
+    ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020
+    ; GFX6: liveins: $vgpr0_vgpr1
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3)
+    ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64)
+    ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020
+    ; GFX7: liveins: $vgpr0_vgpr1
+    ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec
+    ; GFX7: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX7: $m0 = S_MOV_B32 -1
+    ; GFX7: [[DS_READ2_B32_:%[0-9]+]]:vreg_64 = DS_READ2_B32 %2, 0, 1, 0, implicit $m0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX7: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_]]
+    ; GFX9-LABEL: name: load_local_s64_align4_from_1_gep_1020
+    ; GFX9: liveins: $vgpr0_vgpr1
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec
+    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3)
+    ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]]
+    %0:vgpr(p3) = COPY $vgpr0
+    %1:vgpr(s32) = G_CONSTANT i32 1020
+    %2:vgpr(p3) = G_PTR_ADD %0, %1
+    %3:vgpr(s64) = G_LOAD %2 :: (load 8, align 4, addrspace 3)
+    $vgpr0_vgpr1 = COPY %3
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
index 449a3e5f725f..60cc05c7da5c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir
@@ -284,15 +284,19 @@ body: |
     ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_s64_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX7: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-    ; GFX7: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3)
     ; GFX9-LABEL: name: store_local_s64_align4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
-    ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX9: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
     %0:vgpr(s64) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -322,15 +326,19 @@ body: |
     ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_p1_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX7: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX7: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3)
     ; GFX9-LABEL: name: store_local_p1_align4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1
-    ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX9: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
     %0:vgpr(p1) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -360,15 +368,19 @@ body: |
     ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_v2s32_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX7: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; GFX7: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3)
     ; GFX9-LABEL: name: store_local_v2s32_align4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
-    ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX9: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
     %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -398,15 +410,19 @@ body: |
     ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
     ; GFX7-LABEL: name: store_local_v4s16_align4
     ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX7: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
-    ; GFX7: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
     ; GFX7: $m0 = S_MOV_B32 -1
-    ; GFX7: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3)
     ; GFX9-LABEL: name: store_local_v4s16_align4
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
-    ; GFX9: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
-    ; GFX9: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
-    ; GFX9: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
     %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:vgpr(p3) = COPY $vgpr2
     G_STORE %0, %1 :: (store 8, align 4, addrspace 3)
@@ -564,3 +580,99 @@ body: |
     G_STORE %0, %1 :: (store 8, align 8, addrspace 3)
 
 ...
+
+---
+# Selects an 8-byte, 4-byte-aligned s64 store to addrspace 3 whose address is the p3 pointer in $vgpr2 plus the constant 1016.
+name: store_local_s64_align4_from_1_gep_1016
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  scratchWaveOffsetReg: $sgpr4
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+    ; Expectation: GFX6 leaves G_PTR_ADD and G_STORE unselected; GFX7 and GFX9 drop the add and emit DS_WRITE2_B32 on the base pointer with immediates 254 and 255 (presumably 1016/4 and 1020/4, i.e. 4-byte units - confirm).
+    ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1016
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7: $m0 = S_MOV_B32 -1
+    ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7: DS_WRITE2_B32 [[COPY1]], [[COPY3]], [[COPY2]], 254, 255, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX9-LABEL: name: store_local_s64_align4_from_1_gep_1016
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 254, 255, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    %0:vgpr(s64) = COPY $vgpr0_vgpr1
+    %1:vgpr(p3) = COPY $vgpr2
+    %2:vgpr(s32) = G_CONSTANT i32 1016
+    %3:vgpr(p3) = G_PTR_ADD %1, %2
+    G_STORE %0, %3 :: (store 8, align 4, addrspace 3)
+
+...
+
+---
+# Selects an 8-byte, 4-byte-aligned s64 store to addrspace 3 whose address is the p3 pointer in $vgpr2 plus the constant 1020.
+name: store_local_s64_align4_from_1_gep_1020
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+  scratchWaveOffsetReg: $sgpr4
+  stackPtrOffsetReg: $sgpr32
+
+body: |
+  bb.0:
+    liveins:  $vgpr0_vgpr1, $vgpr2
+    ; Expectation: GFX6 leaves G_PTR_ADD and G_STORE unselected; GFX7 and GFX9 keep the add of 1020 as a V_MOV_B32 plus vector add and emit DS_WRITE2_B32 on its result with immediates 0 and 1 (presumably because 1020/4 + 1 = 256 no longer fits the immediate field - confirm against selectDS64Bit4ByteAligned).
+    ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1020
+    ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2
+    ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020
+    ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32)
+    ; GFX6: $m0 = S_MOV_B32 -1
+    ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3)
+    ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1020
+    ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX7: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec
+    ; GFX7: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY1]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX7: $m0 = S_MOV_B32 -1
+    ; GFX7: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX7: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX7: DS_WRITE2_B32 %3, [[COPY3]], [[COPY2]], 0, 1, 0, implicit $m0, implicit $exec :: (store 8, align 4, addrspace 3)
+    ; GFX9-LABEL: name: store_local_s64_align4_from_1_gep_1020
+    ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1020, implicit $exec
+    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY1]], [[V_MOV_B32_e32_]], 0, implicit $exec
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0
+    ; GFX9: DS_WRITE2_B32_gfx9 [[V_ADD_U32_e64_]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3)
+    %0:vgpr(s64) = COPY $vgpr0_vgpr1
+    %1:vgpr(p3) = COPY $vgpr2
+    %2:vgpr(s32) = G_CONSTANT i32 1020
+    %3:vgpr(p3) = G_PTR_ADD %1, %2
+    G_STORE %0, %3 :: (store 8, align 4, addrspace 3)
+
+...


        


More information about the llvm-commits mailing list