[llvm] r364297 - AMDGPU: Write LDS objects out as global symbols in code generation

Nicolai Haehnle via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 25 04:52:30 PDT 2019


Author: nha
Date: Tue Jun 25 04:52:30 2019
New Revision: 364297

URL: http://llvm.org/viewvc/llvm-project?rev=364297&view=rev
Log:
AMDGPU: Write LDS objects out as global symbols in code generation

Summary:
The symbols use the processor-specific SHN_AMDGPU_LDS section index
introduced with a previous change. The linker is then expected to resolve
relocations, which are also emitted.

Initially disabled for HSA and PAL environments until they have caught up
in terms of linker and runtime loader.

Some notes:

- The llvm.amdgcn.groupstaticsize intrinsics can no longer be lowered
  to a constant at compile time, which means some tests can no longer
  be applied.

  The current "solution" is a terrible hack, but the intrinsic isn't
  used by Mesa, so we can keep it for now.

- We no longer know the full LDS size per kernel at compile time, which
  means that we can no longer generate a relevant error message at
  compile time. It would be possible to add a check for the size of
  individual variables, but ultimately the linker will have to perform
  the final check.

Change-Id: If66dbf33fccfbf3609aefefa2558ac0850d42275

Reviewers: arsenm, rampitec, t-tye, b-sumner, jsjodin

Subscribers: qcolombet, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61494

Added:
    llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll
Removed:
    llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
    llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp
    llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
    llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll
    llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
    llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll
    llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll
    llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll
    llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
    llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll
    llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp Tue Jun 25 04:52:30 2019
@@ -298,10 +298,37 @@ void AMDGPUAsmPrinter::EmitBasicBlockSta
 }
 
 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
+      OutContext.reportError({},
+                             Twine(GV->getName()) +
+                                 ": unsupported initializer for address space");
+      return;
+    }
 
-  // Group segment variables aren't emitted in HSA.
-  if (AMDGPU::isGroupSegment(GV))
+    // LDS variables aren't emitted in HSA or PAL yet.
+    const Triple::OSType OS = TM.getTargetTriple().getOS();
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return;
+
+    MCSymbol *GVSym = getSymbol(GV);
+
+    GVSym->redefineIfPossible();
+    if (GVSym->isDefined() || GVSym->isVariable())
+      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+                         "' is already defined");
+
+    const DataLayout &DL = GV->getParent()->getDataLayout();
+    uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
+    unsigned Align = GV->getAlignment();
+    if (!Align)
+      Align = 4;
+
+    EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+    EmitLinkage(GV, GVSym);
+    getTargetStreamer()->emitAMDGPULDS(GVSym, Size, Align);
     return;
+  }
 
   AsmPrinter::EmitGlobalVariable(GV);
 }

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Tue Jun 25 04:52:30 2019
@@ -4357,6 +4357,7 @@ const char* AMDGPUTargetLowering::getTar
   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
+  NODE_NAME_CASE(LDS)
   NODE_NAME_CASE(KILL)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
@@ -4571,6 +4572,15 @@ void AMDGPUTargetLowering::computeKnownB
     Known.Zero.setHighBits(16);
     break;
   }
+  case AMDGPUISD::LDS: {
+    auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
+    unsigned Align = GA->getGlobal()->getAlignment();
+
+    Known.Zero.setHighBits(16);
+    if (Align)
+      Known.Zero.setLowBits(Log2_32(Align));
+    break;
+  }
   case ISD::INTRINSIC_WO_CHAIN: {
     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IID) {

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Tue Jun 25 04:52:30 2019
@@ -485,6 +485,7 @@ enum NodeType : unsigned {
   INTERP_P1LV_F16,
   INTERP_P2_F16,
   PC_ADD_REL_OFFSET,
+  LDS,
   KILL,
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,

Modified: llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFoldOperands.cpp Tue Jun 25 04:52:30 2019
@@ -50,7 +50,7 @@ struct FoldCandidate {
     } else if (FoldOp->isFI()) {
       FrameIndexToFold = FoldOp->getIndex();
     } else {
-      assert(FoldOp->isReg());
+      assert(FoldOp->isReg() || FoldOp->isGlobal());
       OpToFold = FoldOp;
     }
   }
@@ -67,6 +67,8 @@ struct FoldCandidate {
     return Kind == MachineOperand::MO_Register;
   }
 
+  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
   bool isCommuted() const {
     return Commuted;
   }
@@ -230,7 +232,7 @@ static bool updateOperand(FoldCandidate
     }
   }
 
-  if ((Fold.isImm() || Fold.isFI()) && Fold.needsShrink()) {
+  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
     MachineBasicBlock *MBB = MI->getParent();
     auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
     if (Liveness != MachineBasicBlock::LQR_Dead)
@@ -277,6 +279,12 @@ static bool updateOperand(FoldCandidate
     return true;
   }
 
+  if (Fold.isGlobal()) {
+    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+                   Fold.OpToFold->getTargetFlags());
+    return true;
+  }
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -368,7 +376,7 @@ static bool tryAddToFoldList(SmallVector
       if ((Opc == AMDGPU::V_ADD_I32_e64 ||
            Opc == AMDGPU::V_SUB_I32_e64 ||
            Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
-          (OpToFold->isImm() || OpToFold->isFI())) {
+          (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
         MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
 
         // Verify the other operand is a VGPR, otherwise we would violate the
@@ -483,7 +491,8 @@ void SIFoldOperands::foldOperand(
     return;
   }
 
-  bool FoldingImmLike = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImmLike =
+      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
 
   if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
@@ -884,7 +893,7 @@ void SIFoldOperands::foldInstOperand(Mac
   SmallVector<FoldCandidate, 4> FoldList;
   MachineOperand &Dst = MI.getOperand(0);
 
-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
   if (FoldingImm) {
     unsigned NumLiteralUses = 0;
     MachineOperand *NonInlineUse = nullptr;
@@ -1232,7 +1241,8 @@ bool SIFoldOperands::runOnMachineFunctio
       }
 
       MachineOperand &OpToFold = MI.getOperand(1);
-      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+      bool FoldingImm =
+          OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
 
       // FIXME: We could also be folding things like TargetIndexes.
       if (!FoldingImm && !OpToFold.isReg())

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Tue Jun 25 04:52:30 2019
@@ -3588,6 +3588,8 @@ MachineBasicBlock *SITargetLowering::Emi
   }
 
   case AMDGPU::GET_GROUPSTATICSIZE: {
+    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
     DebugLoc DL = MI.getDebugLoc();
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
         .add(MI.getOperand(0))
@@ -4776,7 +4778,10 @@ SDValue SITargetLowering::LowerGlobalAdd
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GSD->getGlobal();
-  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+       (!GV->hasExternalLinkage() ||
+        getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -4784,7 +4789,12 @@ SDValue SITargetLowering::LowerGlobalAdd
   SDLoc DL(GSD);
   EVT PtrVT = Op.getValueType();
 
-  // FIXME: Should not make address space based decisions here.
+  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+                                            SIInstrInfo::MO_ABS32_LO);
+    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
+  }
+
   if (shouldEmitFixup(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
   else if (shouldEmitPCReloc(GV))
@@ -5773,6 +5783,18 @@ SDValue SITargetLowering::LowerINTRINSIC
     return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
                                       Op->getOperand(1), Op->getOperand(2)), 0);
 
+  case Intrinsic::amdgcn_groupstaticsize: {
+    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return Op;
+
+    const Module *M = MF.getFunction().getParent();
+    const GlobalValue *GV =
+        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+                                            SIInstrInfo::MO_ABS32_LO);
+    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Tue Jun 25 04:52:30 2019
@@ -2703,7 +2703,7 @@ bool SIInstrInfo::isImmOperandLegal(cons
   const MCInstrDesc &InstDesc = MI.getDesc();
   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
 
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
 
   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
     return true;
@@ -3012,7 +3012,7 @@ bool SIInstrInfo::verifyInstruction(cons
 
     switch (Desc.OpInfo[i].OperandType) {
     case MCOI::OPERAND_REGISTER:
-      if (MI.getOperand(i).isImm()) {
+      if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
         ErrInfo = "Illegal immediate value for operand.";
         return false;
       }
@@ -3682,7 +3682,7 @@ bool SIInstrInfo::isLegalVSrcOperand(con
     return isLegalRegOperand(MRI, OpInfo, MO);
 
   // Handle non-register types that are treated like immediates.
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
   return true;
 }
 
@@ -3739,7 +3739,7 @@ bool SIInstrInfo::isOperandLegal(const M
   }
 
   // Handle non-register types that are treated like immediates.
-  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
 
   if (!DefinedRC) {
     // This operand expects an immediate.

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Tue Jun 25 04:52:30 2019
@@ -205,6 +205,10 @@ def SIpc_add_rel_offset : SDNode<"AMDGPU
   SDTypeProfile<1, 2, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>
 >;
 
+def SIlds : SDNode<"AMDGPUISD::LDS",
+  SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisSameAs<0,1>]>
+>;
+
 def SIload_d16_lo : SDNode<"AMDGPUISD::LOAD_D16_LO",
   SIload_d16,
   [SDNPMayLoad, SDNPMemOperand, SDNPHasChain]

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Tue Jun 25 04:52:30 2019
@@ -1142,6 +1142,16 @@ def : GCNPat <
   (S_MOV_B32 imm:$imm)
 >;
 
+def : GCNPat <
+  (VGPRImm<(SIlds tglobaladdr:$ga)>),
+  (V_MOV_B32_e32 $ga)
+>;
+
+def : GCNPat <
+  (SIlds tglobaladdr:$ga),
+  (S_MOV_B32 $ga)
+>;
+
 // FIXME: Workaround for ordering issue with peephole optimizer where
 // a register class copy interferes with immediate folding.  Should
 // use s_mov_b32, which can be shrunk to s_movk_i32

Modified: llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIShrinkInstructions.cpp Tue Jun 25 04:52:30 2019
@@ -95,6 +95,10 @@ static bool foldImmediates(MachineInstr
           Src0.setSubReg(0);
           Src0.ChangeToFrameIndex(MovSrc.getIndex());
           ConstantFolded = true;
+        } else if (MovSrc.isGlobal()) {
+          Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
+                          MovSrc.getTargetFlags());
+          ConstantFolded = true;
         }
 
         if (ConstantFolded) {

Modified: llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/32-bit-local-address-space.ll Tue Jun 25 04:52:30 2019
@@ -81,8 +81,8 @@ define amdgpu_kernel void @mul_32bit_ptr
 @g_lds = addrspace(3) global float undef, align 4
 
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
-; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
+; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds at abs32@lo
+; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
 define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float, float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds-sub-offset.ll Tue Jun 25 04:52:30 2019
@@ -7,8 +7,7 @@ declare i32 @llvm.amdgcn.workitem.id.x()
 
 ; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0
-; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
-; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]]
+; GCN: v_sub_{{[iu]}}32_e32 [[BASEPTR:v[0-9]+]], {{(vcc, )?}}lds.obj@abs32@lo, [[SHL]]
 ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
 ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_read2.ll Tue Jun 25 04:52:30 2019
@@ -355,7 +355,8 @@ define amdgpu_kernel void @misaligned_2_
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]]
 ; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 
@@ -441,8 +442,8 @@ define amdgpu_kernel void @misaligned_re
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -455,8 +456,8 @@ define amdgpu_kernel void @load_constant
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2
 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -471,9 +472,9 @@ define amdgpu_kernel void @load_constant
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@@ -488,10 +489,13 @@ define amdgpu_kernel void @load_misalign
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ds_write2.ll Tue Jun 25 04:52:30 2019
@@ -103,10 +103,17 @@ define amdgpu_kernel void @simple_write2
 ; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 ; CI-DAG: s_mov_b32 m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+;
+; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
+;       early legalization of the constant bus constraint on the v_lshl_add_u32,
+;       and then SIFoldOperands folds in an unlucky order.
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]
 
-; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
@@ -131,7 +138,12 @@ define amdgpu_kernel void @simple_write2
 ; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
@@ -153,7 +165,12 @@ define amdgpu_kernel void @simple_write2
 ; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
@@ -389,8 +406,8 @@ define amdgpu_kernel void @simple_write2
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -402,8 +419,8 @@ define amdgpu_kernel void @store_constan
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
 define amdgpu_kernel void @store_constant_disjoint_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -416,9 +433,9 @@ define amdgpu_kernel void @store_constan
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
@@ -432,10 +449,13 @@ define amdgpu_kernel void @store_misalig
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
+; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4

Modified: llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/lds-initializer.ll Tue Jun 25 04:52:30 2019
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
-; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
 
 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
 

Added: llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll?rev=364297&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/lds-relocs.ll Tue Jun 25 04:52:30 2019
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t | FileCheck -check-prefixes=ELF %s
+
+ at lds.external = external unnamed_addr addrspace(3) global [0 x i32]
+ at lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
+
+; ELF:      Relocations [
+; ELF-NEXT:   Section (3) .rel.text {
+; ELF-NEXT:     0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0
+; ELF-NEXT:     0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0
+; ELF-NEXT:   }
+; ELF-NEXT: ]
+
+; ELF:      Symbol {
+; ELF:        Name: lds.defined
+; ELF-NEXT:   Value: 0x8
+; ELF-NEXT:   Size: 32
+; ELF-NEXT:   Binding: Global (0x1)
+; ELF-NEXT:   Type: Object (0x1)
+; ELF-NEXT:   Other: 0
+; ELF-NEXT:   Section: Processor Specific (0xFF00)
+; ELF-NEXT: }
+
+; ELF:      Symbol {
+; ELF:        Name: lds.external
+; ELF-NEXT:   Value: 0x4
+; ELF-NEXT:   Size: 0
+; ELF-NEXT:   Binding: Global (0x1)
+; ELF-NEXT:   Type: Object (0x1)
+; ELF-NEXT:   Other: 0
+; ELF-NEXT:   Section: Processor Specific (0xFF00)
+; ELF-NEXT: }
+
+; GCN-LABEL: {{^}}test_basic:
+; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
+; GCN-NEXT:              ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A]
+; GCN-NEXT:          ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: .globl lds.external
+; GCN: .amdgpu_lds lds.external, 0, 4
+; GCN: .globl lds.defined
+; GCN: .amdgpu_lds lds.defined, 32, 8
+define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
+main_body:
+  %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
+  %tmp = load i32, i32 addrspace(3)* %gep0
+
+  %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
+  %mask.32 = trunc i64 %mask to i32
+  %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
+  store i32 %mask.32, i32 addrspace(3)* %gep1
+
+  %r = bitcast i32 %tmp to float
+  ret float %r
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4
+
+attributes #0 = { "no-signed-zeros-fp-math"="true" }
+attributes #4 = { convergent nounwind readnone }

Modified: llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/lds-size.ll Tue Jun 25 04:52:30 2019
@@ -1,4 +1,3 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/lds-zero-initializer.ll Tue Jun 25 04:52:30 2019
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
-; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
 
 @lds = addrspace(3) global [256 x i32] zeroinitializer
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll Tue Jun 25 04:52:30 2019
@@ -268,7 +268,11 @@ define amdgpu_kernel void @flat_atomic_d
 ; CIVI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
+
 ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -412,7 +416,11 @@ define amdgpu_kernel void @global_atomic
 ; CIVI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
+
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll Tue Jun 25 04:52:30 2019
@@ -131,7 +131,10 @@ define amdgpu_kernel void @global_atomic
 @lds0 = addrspace(3) global [512 x i32] undef, align 4
 
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
 ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -325,7 +328,10 @@ define amdgpu_kernel void @flat_atomic_i
 @lds1 = addrspace(3) global [512 x i64] undef, align 8
 
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll Tue Jun 25 04:52:30 2019
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
 
 @lds0 = addrspace(3) global [512 x float] undef, align 4
 @lds1 = addrspace(3) global [256 x float] undef, align 4
@@ -8,7 +8,8 @@
 @large = addrspace(3) global [4096 x i32] undef, align 4
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test0:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
 define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
@@ -22,7 +23,8 @@ define amdgpu_kernel void @groupstaticsi
 }
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test1:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
 define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
 entry:
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -50,7 +52,8 @@ endif:
 
 ; Exceeds 16-bit simm limit of s_movk_i32
 ; CHECK-LABEL: {{^}}large_groupstaticsize:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
 define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
   %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
   store volatile i32 0, i32 addrspace(3)* %gep

Modified: llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/local-memory.amdgcn.ll Tue Jun 25 04:52:30 2019
@@ -3,12 +3,6 @@
 
 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-; Check that the LDS size emitted correctly
-; SI: .long 47180
-; SI-NEXT: .long 65668
-; CI: .long 47180
-; CI-NEXT: .long 32900
-
 ; GCN-LABEL: {{^}}local_memory:
 
 ; GCN-NOT: s_wqm_b64
@@ -57,6 +51,7 @@ entry:
 
 ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
+
 define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x()

Modified: llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/local-memory.ll Tue Jun 25 04:52:30 2019
@@ -10,8 +10,8 @@
 ; not an immediate.
 
 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
-; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
+; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
+; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4
 
 ; R600: LDS_READ_RET
 define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {

Modified: llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/merge-store-crash.ll Tue Jun 25 04:52:30 2019
@@ -7,7 +7,8 @@
 @tess_lds = external addrspace(3) global [8192 x i32]
 
 ; CHECK-LABEL: {{^}}main:
-; CHECK: ds_write2_b32
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
 ; CHECK: v_mov_b32_e32 v1, v0
 ; CHECK: tbuffer_store_format_xyzw v[0:3],
 define amdgpu_vs void @main(i32 inreg %arg) {

Removed: llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll?rev=364296&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/over-max-lds-size.ll (removed)
@@ -1,14 +0,0 @@
-; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-
-; ERROR: error: local memory limit exceeded (400000) in use_huge_lds
-
-@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
-
-define amdgpu_kernel void @use_huge_lds() {
-entry:
-  %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
-  store i32 0, i32 addrspace(3)* %v0
-  ret void
-}

Modified: llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/promote-alloca-globals.ll Tue Jun 25 04:52:30 2019
@@ -8,7 +8,8 @@
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
-; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
+; ASM: .amdgpu_lds global_array0, 30000, 4
+; ASM: .amdgpu_lds global_array1, 30000, 4
 
 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:

Modified: llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/shl_add_ptr.ll Tue Jun 25 04:52:30 2019
@@ -33,7 +33,11 @@ define amdgpu_kernel void @load_shl_base
 ; remaining add use goes through the normal shl + add constant fold.
 
 ; GCN-LABEL: {{^}}load_shl_base_lds_1:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+
+; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+
 ; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
 ; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
 ; GCN-DAG: buffer_store_dword [[RESULT]]
@@ -68,10 +72,18 @@ define amdgpu_kernel void @load_shl_base
 ; The two globals are placed adjacent in memory, so the same base
 ; pointer can be used with an offset into the second one.
 
+; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints
+
 ; GCN-LABEL: {{^}}load_shl_base_lds_2:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
 ; GCN: s_mov_b32 m0, -1
-; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
+; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1

Modified: llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll Tue Jun 25 04:52:30 2019
@@ -4,16 +4,11 @@
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
 
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
 ; GCN-LABEL: {{^}}main:
 ; GCN: s_wqm
 
 ; Make sure not emitting unused scratch resource descriptor setup
 ; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
 
 ; GCN: s_mov_b32 m0
 
@@ -26,6 +21,7 @@
 ; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
+  %lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
   %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
   %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
@@ -203,18 +199,18 @@ main_body:
   %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
   %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
-  %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
+  %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
   %tmp111 = bitcast float %p2.i to i32
   store i32 %tmp111, i32 addrspace(3)* %tmp110
   %tmp112 = bitcast float %p2.i96 to i32
   store i32 %tmp112, i32 addrspace(3)* %tmp110
   %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
-  %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113
+  %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
   %tmp115 = and i32 %tmp113, -4
-  %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
+  %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
   %tmp117 = add i32 %tmp115, 1
-  %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
+  %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
   %tmp119 = bitcast float %p2.i to i32
   store i32 %tmp119, i32 addrspace(3)* %tmp114
   %tmp120 = load i32, i32 addrspace(3)* %tmp116
@@ -241,7 +237,7 @@ main_body:
   %tmp140 = fmul float %tmp59, %p2.i96
   %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
-  %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
+  %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
   %tmp143 = bitcast float %tmp137 to i32
   store i32 %tmp143, i32 addrspace(3)* %tmp142
   %tmp144 = bitcast float %tmp138 to i32
@@ -252,11 +248,11 @@ main_body:
   store i32 %tmp146, i32 addrspace(3)* %tmp142
   %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
-  %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147
+  %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
   %tmp149 = and i32 %tmp147, -4
-  %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149
+  %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
   %tmp151 = add i32 %tmp149, 2
-  %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151
+  %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
   %tmp153 = bitcast float %tmp137 to i32
   store i32 %tmp153, i32 addrspace(3)* %tmp148
   %tmp154 = load i32, i32 addrspace(3)* %tmp150

Modified: llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/target-cpu.ll Tue Jun 25 04:52:30 2019
@@ -78,7 +78,6 @@ define amdgpu_kernel void @target_fiji()
 
 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
 ; CHECK: ds_read_b32
-; CHECK: ; LDSByteSize: 5120
 define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)

Modified: llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll?rev=364297&r1=364296&r2=364297&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll (original)
+++ llvm/trunk/test/CodeGen/MIR/AMDGPU/machine-function-info.ll Tue Jun 25 04:52:30 2019
@@ -10,7 +10,7 @@
 ; CHECK: machineFunctionInfo:
 ; CHECK-NEXT: explicitKernArgSize: 128
 ; CHECK-NEXT: maxKernArgAlign: 64
-; CHECK-NEXT: ldsSize: 2048
+; CHECK-NEXT: ldsSize: 0
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false




More information about the llvm-commits mailing list