[llvm] 96352e0 - AMDGPU/GlobalISel: Handle LDS with relocations case

Wed Jan 29 08:19:04 PST 2020

Author: Matt Arsenault
Date: 2020-01-29T08:18:55-08:00
New Revision: 96352e0a1bda0fc04729ff90d0d576e8f366760f

URL: https://github.com/llvm/llvm-project/commit/96352e0a1bda0fc04729ff90d0d576e8f366760f
DIFF: https://github.com/llvm/llvm-project/commit/96352e0a1bda0fc04729ff90d0d576e8f366760f.diff

LOG: AMDGPU/GlobalISel: Handle LDS with relocations case

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.h
    llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
    llvm/test/CodeGen/AMDGPU/lds-relocs.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 39c8fb22f944..f50817f669f9 100644

--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1627,7 +1627,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
   return true;
 }
 
-bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
+  MachineInstr &I) const {
   Register DstReg = I.getOperand(0).getReg();
   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
@@ -1961,7 +1962,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_BRCOND:
     return selectG_BRCOND(I);
   case TargetOpcode::G_FRAME_INDEX:
-    return selectG_FRAME_INDEX(I);
+  case TargetOpcode::G_GLOBAL_VALUE:
+    return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
   case TargetOpcode::G_PTR_MASK:
     return selectG_PTR_MASK(I);
   case TargetOpcode::G_EXTRACT_VECTOR_ELT:

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 19bd0315b8c0..f4d9defd33f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector : public InstructionSelector {
   bool selectG_STORE(MachineInstr &I) const;
   bool selectG_SELECT(MachineInstr &I) const;
   bool selectG_BRCOND(MachineInstr &I) const;
-  bool selectG_FRAME_INDEX(MachineInstr &I) const;
+  bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const;
   bool selectG_PTR_MASK(MachineInstr &I) const;
   bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
   bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 5ea35bd0da11..d24bc5066f71 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1734,6 +1734,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
 
     // TODO: We could emit code to handle the initialization somewhere.
     if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
+      const SITargetLowering *TLI = ST.getTargetLowering();
+      if (!TLI->shouldUseLDSConstAddress(GV)) {
+        MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
+        return true; // Leave in place;
+      }
+
       B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
       MI.eraseFromParent();
       return true;

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 7541c758207a..dd4a9960f6ee 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4411,6 +4411,14 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
 }
 
+bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
+  if (!GV->hasExternalLinkage())
+    return true;
+
+  const auto OS = getTargetMachine().getTargetTriple().getOS();
+  return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
+}
+
 /// This transforms the control flow intrinsics to get the branch destination as
 /// last parameter, also switches branch target with BR if the need arise
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -5046,9 +5054,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GSD->getGlobal();
   if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
-       (!GV->hasExternalLinkage() ||
-        getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
-        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
+       shouldUseLDSConstAddress(GV)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d59495b052a4..dbdac2722c87 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -199,6 +199,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   /// global value \p GV, false otherwise.
   bool shouldEmitPCReloc(const GlobalValue *GV) const;
 
+  /// \returns true if this should use a literal constant for an LDS address,
+  /// and not emit a relocation for an LDS global.
+  bool shouldUseLDSConstAddress(const GlobalValue *GV) const;
+
 private:
   // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
   // three offsets (voffset, soffset and instoffset) into the SDValue[3] array

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
new file mode 100644
index 000000000000..704cf594a861
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll
@@ -0,0 +1,28 @@
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; FIXME: Merge with DAG test
+
+@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
+@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
+
+; GCN-LABEL: {{^}}test_basic:
+; GCN: s_add_u32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x80,A,A,A,A]
+; GCN: v_add_u32_e32 v0, lds.external@abs32@lo, v0 ; encoding: [0xff,0x00,0x00,0x68,A,A,A,A]
+
+; GCN: .globl lds.external
+; GCN: .amdgpu_lds lds.external, 0, 4
+; GCN: .globl lds.defined
+; GCN: .amdgpu_lds lds.defined, 32, 8
+define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
+main_body:
+  %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
+  %tmp = load i32, i32 addrspace(3)* %gep0
+
+  %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
+  store i32 123, i32 addrspace(3)* %gep1
+
+  %r = bitcast i32 %tmp to float
+  ret float %r
+}
+
+attributes #0 = { "no-signed-zeros-fp-math"="true" }
+attributes #4 = { convergent nounwind readnone }

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
index e18895d1e51c..02f77141b411 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
@@ -1,5 +1,3 @@
 ; RUN: not llc -global-isel -march=amdgcn -mcpu=tonga < %S/../lds-zero-initializer.ll 2>&1 | FileCheck %s
 
-; FIXME: Select should succeed
 ; CHECK: error: <unknown>:0:0: in function load_zeroinit_lds_global void (i32 addrspace(1)*, i1): unsupported initializer for address space
-; CHECK: LLVM ERROR: cannot select: %16:sreg_32(p3) = G_GLOBAL_VALUE @lds (in function: load_zeroinit_lds_global)

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
index 4945f7a338b6..eb82ca55b5d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -1174,7 +1174,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0
   ret void
 }
 
-@lds0 = addrspace(3) global [512 x i32] undef
+@lds0 = internal addrspace(3) global [512 x i32] undef
 
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ; CI-LABEL: atomic_dec_shl_base_lds_0:
@@ -1759,7 +1759,7 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspa
   ret void
 }
 
-@lds1 = addrspace(3) global [512 x i64] undef, align 8
+@lds1 = internal addrspace(3) global [512 x i64] undef, align 8
 
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ; CI-LABEL: atomic_dec_shl_base_lds_0_i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
index c77c0f2e0ff4..916f4e7fc664 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -516,7 +516,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspa
   ret void
 }
 
-@lds0 = addrspace(3) global [512 x i32] undef, align 4
+@lds0 = internal addrspace(3) global [512 x i32] undef, align 4
 
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ; CI-LABEL: atomic_inc_shl_base_lds_0_i32:
@@ -1331,7 +1331,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0
   ret void
 }
 
-@lds1 = addrspace(3) global [512 x i64] undef, align 8
+@lds1 = internal addrspace(3) global [512 x i64] undef, align 8
 
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ; CI-LABEL: atomic_inc_shl_base_lds_0_i64:

diff  --git a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
index 63e3dd880bae..dd6bb1f0db2a 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-relocs.ll
@@ -47,10 +47,8 @@ main_body:
   %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
   %tmp = load i32, i32 addrspace(3)* %gep0
 
-  %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
-  %mask.32 = trunc i64 %mask to i32
   %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
-  store i32 %mask.32, i32 addrspace(3)* %gep1
+  store i32 123, i32 addrspace(3)* %gep1
 
   %r = bitcast i32 %tmp to float
   ret float %r