[llvm] r260599 - AMDGPU/SI: Make sure MIMG descriptors and samplers stay in SGPRs

Thu Feb 11 13:45:08 PST 2016

Author: tstellar
Date: Thu Feb 11 15:45:07 2016
New Revision: 260599

URL: http://llvm.org/viewvc/llvm-project?rev=260599&view=rev
Log:
AMDGPU/SI: Make sure MIMG descriptors and samplers stay in SGPRs

Summary:
It's possible to have resource descriptors and samplers stored in
VGPRs, either by a VMEM instruction or in the case of samplers,
floating-point calculations.  When this happens, we need to use
v_readfirstlane to copy these values back to sgprs.

Reviewers: mareko, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D17102

Added:
    llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
    llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td?rev=260599&r1=260598&r2=260599&view=diff
==============================================================================

--- llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td Thu Feb 11 15:45:07 2016
@@ -690,5 +690,6 @@ class MIMG <bits<7> op, dag outs, dag in
   let MIMG = 1;
   let Uses = [EXEC];
 
+  let UseNamedOperandTable = 1;
   let hasSideEffects = 0; // XXX ????
 }

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=260599&r1=260598&r2=260599&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Thu Feb 11 15:45:07 2016
@@ -1949,6 +1949,32 @@ void SIInstrInfo::legalizeOperandsVOP3(
   }
 }
 
+unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
+                                 MachineRegisterInfo &MRI) const {
+  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
+  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
+  unsigned DstReg = MRI.createVirtualRegister(SRC);
+  unsigned SubRegs = VRC->getSize() / 4;
+
+  SmallVector<unsigned, 8> SRegs;
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
+            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+            .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+    SRegs.push_back(SGPR);
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
+                                    UseMI->getDebugLoc(),
+                                    get(AMDGPU::REG_SEQUENCE), DstReg);
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    MIB.addReg(SRegs[i]);
+    MIB.addImm(RI.getSubRegFromChannel(i));
+  }
+  return DstReg;
+}
+
 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
 
@@ -2061,6 +2087,22 @@ void SIInstrInfo::legalizeOperands(Machi
     }
     return;
   }
+
+  // Legalize MIMG
+  if (isMIMG(*MI)) {
+    MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
+    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
+      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
+      SRsrc->setReg(SGPR);
+    }
+
+    MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
+    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
+      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
+      SSamp->setReg(SGPR);
+    }
+    return;
+  }
 
   // Legalize MUBUF* instructions
   // FIXME: If we start using the non-addr64 instructions for compute, we

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=260599&r1=260598&r2=260599&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Thu Feb 11 15:45:07 2016
@@ -396,6 +396,13 @@ public:
   /// \brief Fix operands in \p MI to satisfy constant bus requirements.
   void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr *MI) const;
 
+  /// Copy a value from a VGPR (\p SrcReg) to SGPR.  This function can only
+  /// be used when it is know that the value in SrcReg is same across all
+  /// threads in the wave.
+  /// \returns The SGPR register that \p SrcReg was copied to.
+  unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
+                          MachineRegisterInfo &MRI) const;
+
   /// \brief Legalize all operands in this instruction.  This function may
   /// create new instruction and insert them before \p MI.
   void legalizeOperands(MachineInstr *MI) const;

Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=260599&r1=260598&r2=260599&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Thu Feb 11 15:45:07 2016
@@ -479,6 +479,24 @@ const TargetRegisterClass *SIRegisterInf
   }
 }
 
+const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
+                                         const TargetRegisterClass *VRC) const {
+  switch (VRC->getSize()) {
+  case 4:
+    return &AMDGPU::SGPR_32RegClass;
+  case 8:
+    return &AMDGPU::SReg_64RegClass;
+  case 16:
+    return &AMDGPU::SReg_128RegClass;
+  case 32:
+    return &AMDGPU::SReg_256RegClass;
+  case 64:
+    return &AMDGPU::SReg_512RegClass;
+  default:
+    llvm_unreachable("Invalid register class size");
+  }
+}
+
 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
                          const TargetRegisterClass *RC, unsigned SubIdx) const {
   if (SubIdx == AMDGPU::NoSubRegister)

Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h?rev=260599&r1=260598&r2=260599&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.h Thu Feb 11 15:45:07 2016
@@ -89,6 +89,10 @@ public:
   const TargetRegisterClass *getEquivalentVGPRClass(
                                           const TargetRegisterClass *SRC) const;
 
+  /// \returns A SGPR reg class with the same width as \p SRC
+  const TargetRegisterClass *getEquivalentSGPRClass(
+                                           const TargetRegisterClass *VRC) const;
+
   /// \returns The register class that is used for a sub-register of \p RC for
   /// the given \p SubIdx.  If \p SubIdx equals NoSubRegister, \p RC will
   /// be returned.

Modified: llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll?rev=260599&r1=260598&r2=260599&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sgpr-copy.ll Thu Feb 11 15:45:07 2016
@@ -362,6 +362,38 @@ bb71:
   ret void
 }
 
+; Check the the resource descriptor is stored in an sgpr.
+; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
+; CHECK: image_sample v{{[0-9]+}}, 1, 0, 0, 0, 0, 0, 0, 0, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+define void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
+  %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp10 = extractelement <4 x float> %tmp9, i32 0
+  %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10)
+  %tmp13 = bitcast i32 %tmp12 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+  ret void
+}
+
+; Check the the sampler is stored in an sgpr.
+; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
+; CHECK: image_sample v{{[0-9]+}}, 1, 0, 0, 0, 0, 0, 0, 0, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+define void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
+  %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
+  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp10 = extractelement <4 x float> %tmp9, i32 0
+  %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
+  %tmp13 = bitcast i32 %tmp12 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+
 attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { readonly }

Added: llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll?rev=260599&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/split-smrd.ll Thu Feb 11 15:45:07 2016
@@ -0,0 +1,46 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s
+
+; FIXME: Move this to sgpr-copy.ll when this is fixed on VI.
+; Make sure that when we split an smrd instruction in order to move it to
+; the VALU, we are also moving its users to the VALU.
+; CHECK-LABEL: {{^}}split_smrd_add_worklist:
+; CHECK: image_sample v{{[0-9]+}}, 1, 0, 0, 0, 0, 0, 0, 0, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
+
+define void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+bb:
+  %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+  %tmp1 = bitcast float %tmp to i32
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  unreachable
+
+bb3:                                              ; preds = %bb
+  %tmp4 = bitcast float %tmp to i32
+  %tmp5 = add i32 %tmp4, 4
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6
+  %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
+  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp10 = extractelement <4 x float> %tmp9, i32 0
+  %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
+  %tmp13 = bitcast i32 %tmp12 to float
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef)
+  ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+declare i32 @llvm.SI.packf16(float, float) #1
+
+attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!1, !1, i64 0, i32 1}
+!1 = !{!"const", null}
+!2 = !{!1, !1, i64 0}