[llvm] r333558 - AMDGPU: Use better alignment for kernarg lowering

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed May 30 09:17:51 PDT 2018


Author: arsenm
Date: Wed May 30 09:17:51 2018
New Revision: 333558

URL: http://llvm.org/viewvc/llvm-project?rev=333558&view=rev
Log:
AMDGPU: Use better alignment for kernarg lowering

Previously this emitted loads with only the ABI alignment
for the raw type. The true alignment is often better,
especially when an illegal vector type is scalarized.
The better alignment allows using a scalar load
more often.
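
For reference, a minimal sketch of the alignment derivation this patch
introduces (illustrative only: kernargAlign is a made-up helper, but
MinAlign is LLVM's real helper from llvm/Support/MathExtras.h, which
returns the largest power of two dividing both operands):

    #include "llvm/Support/MathExtras.h"
    #include <cstdint>

    // Per-argument alignment: the largest power of two that divides both
    // the 16-byte kernarg segment base alignment and the argument offset.
    unsigned kernargAlign(uint64_t Offset) {
      const unsigned KernelArgBaseAlign = 16;
      return llvm::MinAlign(KernelArgBaseAlign, Offset);
    }

    // kernargAlign(0) == 16, kernargAlign(4) == 4,
    // kernargAlign(8) == 8,  kernargAlign(36) == 4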

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
    llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
    llvm/trunk/test/CodeGen/AMDGPU/half.ll
    llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll
    llvm/trunk/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=333558&r1=333557&r2=333558&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed May 30 09:17:51 2018
@@ -1068,15 +1068,12 @@ SDValue SITargetLowering::convertArgType
 SDValue SITargetLowering::lowerKernargMemParameter(
   SelectionDAG &DAG, EVT VT, EVT MemVT,
   const SDLoc &SL, SDValue Chain,
-  uint64_t Offset, bool Signed,
+  uint64_t Offset, unsigned Align, bool Signed,
   const ISD::InputArg *Arg) const {
-  const DataLayout &DL = DAG.getDataLayout();
   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
-  unsigned Align = DL.getABITypeAlignment(Ty);
-
   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
                              MachineMemOperand::MODereferenceable |
@@ -1663,7 +1660,15 @@ SDValue SITargetLowering::LowerFormalArg
 
   SmallVector<SDValue, 16> Chains;
 
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+  // FIXME: This is the minimum kernel argument alignment. We should improve
+  // this to the maximum alignment of the arguments.
+  //
+  // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
+  // kern arg offset.
+  const unsigned KernelArgBaseAlign = 16;
+  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
     if (Skipped[i]) {
       InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -1677,14 +1682,14 @@ SDValue SITargetLowering::LowerFormalArg
       VT = Ins[i].VT;
       EVT MemVT = VA.getLocVT();
 
-      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(Fn) +
-        VA.getLocMemOffset();
+      const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
       Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+      unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
 
       // The first 36 bytes of the input buffer contains information about
-      // thread group and global sizes.
+      // thread group and global sizes for clover.
       SDValue Arg = lowerKernargMemParameter(
-        DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+        DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
       Chains.push_back(Arg.getValue(1));
 
       auto *ParamTy =
@@ -4303,7 +4308,7 @@ SDValue SITargetLowering::lowerImplicitZ
                                                  unsigned Offset) const {
   SDLoc SL(Op);
   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
-                                           DAG.getEntryNode(), Offset, false);
+                                           DAG.getEntryNode(), Offset, 4, false);
   // The local size values will have the hi 16-bits as zero.
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                      DAG.getValueType(VT));
@@ -4404,37 +4409,37 @@ SDValue SITargetLowering::LowerINTRINSIC
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_X, false);
+                                    SI::KernelInputOffsets::NGROUPS_X, 4, false);
   case Intrinsic::r600_read_ngroups_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Y, false);
+                                    SI::KernelInputOffsets::NGROUPS_Y, 4, false);
   case Intrinsic::r600_read_ngroups_z:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::NGROUPS_Z, false);
+                                    SI::KernelInputOffsets::NGROUPS_Z, 4, false);
   case Intrinsic::r600_read_global_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
   case Intrinsic::r600_read_global_size_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
   case Intrinsic::r600_read_global_size_z:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
   case Intrinsic::r600_read_local_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
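
For illustration (editorial note, not part of the commit), this is the
arithmetic behind the test updates below: with the 36-byte explicit
kernarg offset used off HSA, a <2 x i16> argument that follows an 8-byte
output pointer sits at offset 36 + 8 = 44, so it now gets alignment
MinAlign(16, 44) = 4 rather than the 2-byte ABI alignment of a
scalarized i16 piece. Scalar SMEM loads require dword alignment, which
is why kernel-args.ll now expects s_load_dword at offset 0xb dwords
(44 bytes) on SI instead of two buffer_load_ushort instructions.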

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=333558&r1=333557&r2=333558&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Wed May 30 09:17:51 2018
@@ -27,7 +27,7 @@ class SITargetLowering final : public AM
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
   SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                    const SDLoc &SL, SDValue Chain,
-                                   uint64_t Offset, bool Signed,
+                                   uint64_t Offset, unsigned Align, bool Signed,
                                    const ISD::InputArg *Arg = nullptr) const;
 
   SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,

Modified: llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll?rev=333558&r1=333557&r2=333558&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll Wed May 30 09:17:51 2018
@@ -88,11 +88,9 @@ define amdgpu_kernel void @s_fneg_fabs_v
 ; Combine turns this into integer op when bitcast source (from load)
 
 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16_bc_src:
-; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
 
 ; FIXME: Random commute
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
 define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(<2 x half> addrspace(1)* %out, <2 x half> %in) {
@@ -103,16 +101,12 @@ define amdgpu_kernel void @s_fneg_fabs_v
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
-; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
-; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
-; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
 
 ; FIXME: Random commute
-; GFX89: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
+; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
 ; VI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]

Modified: llvm/trunk/test/CodeGen/AMDGPU/half.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/half.ll?rev=333558&r1=333557&r2=333558&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/half.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/half.ll Wed May 30 09:17:51 2018
@@ -13,17 +13,10 @@ define amdgpu_kernel void @load_f16_arg(
   ret void
 }
 
-; FIXME: Should always be the same
 ; GCN-LABEL: {{^}}load_v2f16_arg:
-; SI-DAG: buffer_load_ushort [[V0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
-; SI-DAG: buffer_load_ushort [[V1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
-; SI: v_lshlrev_b32_e32 [[HI:v[0-9]+]], 16, [[V1]]
-; SI: v_or_b32_e32 [[PACKED:v[0-9]+]],  [[V0]], [[HI]]
-; SI: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-
-; VI: s_load_dword [[ARG:s[0-9]+]]
-; VI: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
-; VI: buffer_store_dword [[V_ARG]]
+; GCN: s_load_dword [[ARG:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[V_ARG:v[0-9]+]], [[ARG]]
+; GCN: buffer_store_dword [[V_ARG]]
 define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
   store <2 x half> %arg, <2 x half> addrspace(1)* %out
   ret void
@@ -31,8 +24,8 @@ define amdgpu_kernel void @load_v2f16_ar
 
 ; GCN-LABEL: {{^}}load_v3f16_arg:
 ; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+
 ; GCN-NOT: buffer_load
 ; GCN-DAG: buffer_store_dword
 ; GCN-DAG: buffer_store_short
@@ -43,19 +36,14 @@ define amdgpu_kernel void @load_v3f16_ar
   ret void
 }
 
-; GCN-LABEL: {{^}}load_v4f16_arg:
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_store_dwordx2
 
 ; FIXME: Why not one load?
-; VI-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
-; VI-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
-; VI-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
-; VI: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
+; GCN-LABEL: {{^}}load_v4f16_arg:
+; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
 define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
   store <4 x half> %arg, <4 x half> addrspace(1)* %out
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll?rev=333558&r1=333557&r2=333558&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll Wed May 30 09:17:51 2018
@@ -162,10 +162,11 @@ entry:
 
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; HSA: flat_load_ushort
 define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
@@ -179,10 +180,9 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-
-; VI: s_load_dword s
+; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
+; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
+; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
 define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
@@ -226,11 +226,14 @@ entry:
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
 ; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; MESA-VI: buffer_load_ushort
+; MESA-VI: buffer_load_ubyte
+
+; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ubyte
 define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
@@ -245,12 +248,9 @@ entry:
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
 ; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; MESA-GCN: buffer_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
-; HSA-VI: flat_load_ushort
+
+; GCN-DAG: s_load_dword s
+; GCN-DAG: {{buffer|flat}}_load_ushort
 define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
@@ -293,14 +293,13 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dword s
 define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@@ -315,13 +314,14 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
+; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
 
-; VI: s_load_dword s
-; VI: s_load_dword s
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
+
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
+; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -372,21 +372,17 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
-; HSA-GCN: float_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
 define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@@ -405,15 +401,11 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
 
+; VI: s_load_dwordx2
 ; VI: s_load_dword s
 ; VI: s_load_dword s
 ; VI: s_load_dword s
@@ -481,38 +473,27 @@ entry:
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
 ; EG: VTX_READ_8
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; MESA-GCN: buffer_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
-; HSA-VI: flat_load_ubyte
+
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
 define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@@ -539,22 +520,13 @@ entry:
 ; EG: VTX_READ_16
 ; EG: VTX_READ_16
 
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
-; SI: buffer_load_ushort
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
+; SI: s_load_dwordx2
 
 ; VI: s_load_dword s
 ; VI: s_load_dword s

Modified: llvm/trunk/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll?rev=333558&r1=333557&r2=333558&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll Wed May 30 09:17:51 2018
@@ -39,10 +39,8 @@ define amdgpu_kernel void @store_v4i32_a
 }
 
 ; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: s_load_dword s
+; GCN: s_load_dwordx2 s
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
 define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
   %x.bc = bitcast <4 x i16> %x to <2 x i32>
