[llvm] 77ce2e2 - [AMDGPU] Add Relocation Constant Support

Mon Mar 30 10:49:38 PDT 2020

Author: Jakub Kuderski
Date: 2020-03-30T13:49:20-04:00
New Revision: 77ce2e21a87768370b97a027e5053b73e8f438a7

URL: https://github.com/llvm/llvm-project/commit/77ce2e21a87768370b97a027e5053b73e8f438a7
DIFF: https://github.com/llvm/llvm-project/commit/77ce2e21a87768370b97a027e5053b73e8f438a7.diff

LOG: [AMDGPU] Add Relocation Constant Support

Summary:
This change adds amdgcn.reloc.constant intrinsic to the amdgpu backend, which will compile into a relocation entry in the resulting elf.

The intrinsics takes a MetadataNode (String) as its only argument, which specifies the symbol name of the relocation entry.

`SelectionDAGBuilder::getValueImpl` is changed to allow metadata operands passed through to ISel.

Author: csyonghe <yonghe at google.com>

Reviewers: tpr, nhaehnle

Reviewed By: nhaehnle

Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, t-tye, hiraditya, kerbowa, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76440

Added: 
    llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll

Modified: 
    llvm/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index c01db52b1622..1eb504bd6e80 100644

--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1875,4 +1875,10 @@ def int_amdgcn_fdiv_fast : Intrinsic<
   [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
   [IntrNoMem, IntrSpeculatable]
 >;
+
+// Represent a relocation constant.
+def int_amdgcn_reloc_constant : Intrinsic<
+  [llvm_i32_ty], [llvm_metadata_ty],
+  [IntrNoMem, IntrSpeculatable]
+>;
 }

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f2936fe93df4..042907d34907 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1604,6 +1604,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
     return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
   }
 
+  if (const MetadataAsValue *MD = dyn_cast<MetadataAsValue>(V)) {
+    return DAG.getMDNode(cast<MDNode>(MD->getMetadata()));
+  }
   llvm_unreachable("Can't get register for value!");
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 269434d31e21..09e18cc0c2fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1766,11 +1766,23 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
 
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                           SDValue &Offset, bool &Imm) const {
-
-  // FIXME: Handle non-constant offsets.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
-  if (!C)
+  if (!C) {
+    if (ByteOffsetNode.getValueType().isScalarInteger() &&
+        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
+      Offset = ByteOffsetNode;
+      Imm = false;
+      return true;
+    }
+    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
+      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
+        Offset = ByteOffsetNode.getOperand(0);
+        Imm = false;
+        return true;
+      }
+    }
     return false;
+  }
 
   SDLoc SL(ByteOffsetNode);
   GCNSubtarget::Generation Gen = Subtarget->getGeneration();
@@ -1835,7 +1847,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
   // wraparound, because s_load instructions perform the addition in 64 bits.
   if ((Addr.getValueType() != MVT::i32 ||
        Addr->getFlags().hasNoUnsignedWrap()) &&
-      CurDAG->isBaseWithConstantOffset(Addr)) {
+      (CurDAG->isBaseWithConstantOffset(Addr) ||
+       Addr.getOpcode() == ISD::ADD)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
 

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4ff42769b470..f431c8e5256c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6022,6 +6022,16 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_alignbit:
     return DAG.getNode(ISD::FSHR, DL, VT,
                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::amdgcn_reloc_constant: {
+    Module *M = const_cast<Module *>(MF.getFunction().getParent());
+    const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
+    auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
+    auto RelocSymbol = cast<GlobalVariable>(
+        M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
+    SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
+                                            SIInstrInfo::MO_ABS32_LO);
+    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
new file mode 100644
index 000000000000..fde3ab8c6d4a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -0,0 +1,62 @@
+; Test that DAG->DAG ISel is able to pick up the S_LOAD_DWORDX4_SGPR instruction that fetches the offset
+; from a register.
+
+; RUN: llc -march=amdgcn -verify-machineinstrs -stop-after=amdgpu-isel -o - %s | FileCheck -check-prefix=GCN %s
+
+; GCN: %[[OFFSET:[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @DescriptorBuffer
+; GCN: %{{[0-9]+}}:sgpr_128 = S_LOAD_DWORDX4_SGPR killed %{{[0-9]+}}, killed %[[OFFSET]], 0, 0 :: (invariant load 16 from %ir.13, addrspace 4)
+
+define amdgpu_cs void @test_load_zext(i32 inreg %0, i32 inreg %1, i32 inreg %resNode0, i32 inreg %resNode1, <3 x i32> inreg %2, i32 inreg %3, <3 x i32> %4) local_unnamed_addr #2 {
+.entry:
+  %5 = call i64 @llvm.amdgcn.s.getpc() #3
+  %6 = bitcast i64 %5 to <2 x i32>
+  %7 = insertelement <2 x i32> %6, i32 %resNode0, i32 0
+  %8 = bitcast <2 x i32> %7 to i64
+  %9 = inttoptr i64 %8 to [4294967295 x i8] addrspace(4)*
+  %10 = call i32 @llvm.amdgcn.reloc.constant(metadata !4)
+  %11 = zext i32 %10 to i64
+  %12 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %9, i64 0, i64 %11
+  %13 = bitcast i8 addrspace(4)* %12 to <4 x i32> addrspace(4)*, !amdgpu.uniform !5
+  %14 = load <4 x i32>, <4 x i32> addrspace(4)* %13, align 16, !invariant.load !5
+  %15 = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %14, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %15, <4 x i32> %14, i32 0, i32 0, i32 0)
+  ret void
+}
+
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+; Function Attrs: nounwind writeonly
+declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg) #1
+
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.reloc.constant(metadata) #3
+
+; Function Attrs: nounwind readnone speculatable
+declare i64 @llvm.amdgcn.s.getpc() #3
+
+; Function Attrs: nounwind readnone
+declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32 immarg) #1
+
+attributes #0 = { argmemonly nounwind willreturn }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-unroll-threshold"="700" }
+attributes #3 = { nounwind readnone speculatable }
+attributes #4 = { nounwind writeonly }
+
+!llpc.compute.mode = !{!0}
+!llpc.options = !{!1}
+!llpc.options.CS = !{!2}
+!llpc.user.data.nodes = !{!3, !4, !5, !6}
+!amdgpu.pal.metadata.msgpack = !{!7}
+
+!0 = !{i32 2, i32 3, i32 1}
+!1 = !{i32 245227952, i32 996822128, i32 2024708198, i32 497230408}
+!2 = !{i32 1381820427, i32 1742110173, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 64}
+!3 = !{!"DescriptorTableVaPtr", i32 0, i32 1, i32 1}
+!4 = !{!"DescriptorBuffer", i32 4, i32 8, i32 0, i32 0}
+!5 = !{!"DescriptorTableVaPtr", i32 1, i32 1, i32 1}
+!6 = !{!"DescriptorBuffer", i32 4, i32 8, i32 1, i32 0}
+!7 = !{!"\82\B0amdpal.pipelines\91\88\A4.api\A6Vulkan\B0.hardware_stages\81\A3.cs\82\AB.sgpr_limith\AB.vgpr_limit\CD\01\00\B7.internal_pipeline_hash\92\CF;jLp\0E\9D\E1\B0\CF\1D\A3\22Hx\AE\98f\AA.registers\88\CD.\07\02\CD.\08\03\CD.\09\01\CD.\12\CE\00,\00\00\CD.\13\CD\0F\88\CD.@\CE\10\00\00\00\CD.B\00\CD.C\01\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CFg\D6}\DDR\\\E8\0B\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\02\AEamdpal.version\92\02\03"}
+!8 = !{i32 5}
+!9 = !{!"doff_0_0_b"}
+!10 = !{}
+!11 = !{!"doff_1_0_b"}

diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
new file mode 100644
index 000000000000..569f0101e3be
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-reloc-const.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -filetype=obj -o %t.o < %s && llvm-readobj -relocations %t.o | FileCheck --check-prefix=ELF %s
+; GCN-LABEL: {{^}}ps_main:
+; GCN: v_mov_b32_{{.*}} v[[relocreg:[0-9]+]], doff_0_0_b at abs32@lo
+; GCN-NEXT: exp {{.*}} v[[relocreg]], {{.*}}
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: .Lfunc_end
+
+; ELF: Relocations [
+; ELF-NEXT: Section (3) .rel.text {
+; ELF-NEXT: 0x{{[0-9]+}} R_AMDGPU_ABS32 doff_0_0_b {{.*}}
+
+define amdgpu_ps void @ps_main(i32 %arg, i32 inreg %arg1, i32 inreg %arg2) local_unnamed_addr #0 {
+  %rc = call i32 @llvm.amdgcn.reloc.constant(metadata !1)
+  %rcf = bitcast i32 %rc to float
+  call void @llvm.amdgcn.exp.f32(i32 immarg 40, i32 immarg 15, float %rcf, float undef, float undef, float undef, i1 immarg false, i1 immarg false) #0
+  ret void
+}
+
+; Function Attrs: inaccessiblememonly nounwind
+declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #1
+
+; Function Attrs: nounwind readnone speculatable
+declare i32 @llvm.amdgcn.reloc.constant(metadata) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { inaccessiblememonly nounwind }
+attributes #2 = { nounwind readnone speculatable }
+
+!1 = !{!"doff_0_0_b"}