[llvm] r326353 - [AMDGPU] added writelane intrinsic

Tim Renouf via llvm-commits llvm-commits at lists.llvm.org
Wed Feb 28 11:10:32 PST 2018


Author: tpr
Date: Wed Feb 28 11:10:32 2018
New Revision: 326353

URL: http://llvm.org/viewvc/llvm-project?rev=326353&view=rev
Log:
[AMDGPU] added writelane intrinsic

Summary:
For use by the LLPC SPV_AMD_shader_ballot extension.

The v_writelane instruction was already implemented for use by SGPR
spilling, but I had to add an extra dummy operand tied to the
destination to represent that all lanes except the selected one keep
the old value of the destination register.

The .ll test changes are due to scheduling changes caused by the new
operand.
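
A minimal IR-level sketch of the intended use (the function name is
illustrative; the form mirrors the added llvm.amdgcn.writelane.ll test):
the intrinsic takes the uniform value to write, the uniform lane select,
and the old value of the destination, and returns the written value in the
selected lane and the old value in all other lanes.

declare i32 @llvm.amdgcn.writelane(i32, i32, i32)

define amdgpu_kernel void @writelane_example(i32 addrspace(1)* %out, i32 %val, i32 %lane) {
  ; Write %val into lane %lane; every other lane keeps the old value %old.
  %old = load i32, i32 addrspace(1)* %out
  %result = call i32 @llvm.amdgcn.writelane(i32 %val, i32 %lane, i32 %old)
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}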

Differential Revision: https://reviews.llvm.org/D42838

Added:
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
Modified:
    llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
    llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
    llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir
    llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll

Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Wed Feb 28 11:10:32 2018
@@ -768,6 +768,19 @@ def int_amdgcn_readlane :
   GCCBuiltin<"__builtin_amdgcn_readlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
+// The value to write and lane select arguments must be uniform across the
+// currently active threads of the current wave. Otherwise, the result is
+// undefined.
+def int_amdgcn_writelane :
+  GCCBuiltin<"__builtin_amdgcn_writelane">,
+  Intrinsic<[llvm_i32_ty], [
+    llvm_i32_ty,    // uniform value to write: returned by the selected lane
+    llvm_i32_ty,    // uniform lane select
+    llvm_i32_ty     // returned by all lanes other than the selected one
+  ],
+  [IntrNoMem, IntrConvergent]
+>;
+
 def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
   [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem, IntrSpeculatable]

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Wed Feb 28 11:10:32 2018
@@ -2711,8 +2711,9 @@ bool SIInstrInfo::verifyInstruction(cons
     }
   }
 
-  // Verify VOP*
-  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
+  // Verify VOP*. Ignore multiple sgpr operands on writelane.
+  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
+      && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
     // Only look at the true operands. Only a real operand can use the constant
     // bus, and we don't want to check pseudo-operands like the source modifier
     // flags.
@@ -3147,6 +3148,29 @@ void SIInstrInfo::legalizeOperandsVOP2(M
       legalizeOpWithMove(MI, Src0Idx);
   }
 
+  // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
+  // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
+  // src0/src1 with V_READFIRSTLANE.
+  if (Opc == AMDGPU::V_WRITELANE_B32) {
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+    MachineOperand &Src0 = MI.getOperand(Src0Idx);
+    const DebugLoc &DL = MI.getDebugLoc();
+    if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+          .add(Src0);
+      Src0.ChangeToRegister(Reg, false);
+    }
+    if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
+      unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+      const DebugLoc &DL = MI.getDebugLoc();
+      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+          .add(Src1);
+      Src1.ChangeToRegister(Reg, false);
+    }
+    return;
+  }
+
   // VOP2 src0 instructions support all operand types, so we don't need to check
   // their legality. If src1 is already legal, we don't need to do anything.
   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))

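A sketch of when this legalization fires (this mirrors the
test_writelane_vreg_lane case in the added test; names are illustrative):
if the lane select is not known to be uniform at selection time, for
example because it is loaded through a per-thread address, it ends up in a
VGPR and the backend inserts v_readfirstlane_b32 to produce the required
SGPR operand.

declare i32 @llvm.amdgcn.writelane(i32, i32, i32)
declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @writelane_vgpr_lane(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
  ; The lane select is loaded per-thread, so it sits in a VGPR before legalization.
  %lane = load i32, i32 addrspace(1)* %gep
  %old = load i32, i32 addrspace(1)* %out
  %result = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %old)
  store i32 %result, i32 addrspace(1)* %out, align 4
  ret void
}
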
Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Wed Feb 28 11:10:32 2018
@@ -636,6 +636,7 @@ bool SIRegisterInfo::spillSGPR(MachineBa
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MBB->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
 
   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
     = MFI->getSGPRToVGPRSpills(Index);
@@ -732,11 +733,21 @@ bool SIRegisterInfo::spillSGPR(MachineBa
     if (SpillToVGPR) {
       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
 
+      // During SGPR spilling to VGPR, determine if the VGPR is defined. The
+      // only circumstance in which we say it is undefined is when it is the
+      // first spill to this VGPR in the first basic block.
+      bool VGPRDefined = true;
+      if (MBB == &MF->front())
+        VGPRDefined = !SGPRSpillVGPRDefinedSet.insert(Spill.VGPR).second;
+
+      // Mark the "old value of vgpr" input undef only if this is the first sgpr
+      // spill to this specific vgpr in the first basic block.
       BuildMI(*MBB, MI, DL,
               TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
               Spill.VGPR)
         .addReg(SubReg, getKillRegState(IsKill))
-        .addImm(Spill.Lane);
+        .addImm(Spill.Lane)
+        .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
 
       // FIXME: Since this spills to another register instead of an actual
       // frame index, we should delete the frame index when all references to

Modified: llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td Wed Feb 28 11:10:32 2018
@@ -323,15 +323,17 @@ def VOP_READLANE : VOPProfile<[i32, i32,
   let HasSDWA9 = 0;
 }
 
-def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
+def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
   let Outs32 = (outs VGPR_32:$vdst);
   let Outs64 = Outs32;
-  let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
+  let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in);
   let Ins64 = Ins32;
   let Asm32 = " $vdst, $src0, $src1";
   let Asm64 = Asm32;
   let HasExt = 0;
   let HasSDWA9 = 0;
+  let HasSrc2 = 0;
+  let HasSrc2Mods = 0;
 }
 
 //===----------------------------------------------------------------------===//
@@ -399,7 +401,10 @@ let isConvergent = 1, Uses = []<Register
 def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
   [(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))], "">;
 
-def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, [], "">;
+let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
+  [(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))], "">;
+} // End $vdst = $vdst_in, DisableEncoding $vdst_in
 } // End isConvergent = 1
 
 defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
@@ -640,7 +645,7 @@ defm V_SUBBREV_U32        : VOP2be_Real_
 
 defm V_READLANE_B32       : VOP2_Real_si <0x01>;
 
-let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
 defm V_WRITELANE_B32      : VOP2_Real_si <0x02>;
 }
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll Wed Feb 28 11:10:32 2018
@@ -33,16 +33,14 @@ entry:
 ; GCN-DAG: buffer_store_dword v32
 ; GCN-DAG: buffer_store_dword v33
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: v_writelane_b32
-
+; GCN-DAG: v_writelane_b32
 ; GCN-DAG: s_add_u32 s32, s32, 0xb00{{$}}
-
 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
+; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
 
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
-; GCN: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
 
 ; GCN: s_swappc_b64
 
@@ -80,10 +78,10 @@ entry:
 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
 
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
 
 ; GCN-NOT: s_add_u32 s32, s32, 0x800
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll Wed Feb 28 11:10:32 2018
@@ -44,7 +44,7 @@ define void @callee_with_stack() #0 {
 ; GCN-DAG: v_writelane_b32 v32, s35,
 ; GCN-DAG: s_add_u32 s32, s32, 0x300{{$}}
 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
 ; GCN-DAG: s_mov_b32 s33, s5
 
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir Wed Feb 28 11:10:32 2018
@@ -308,7 +308,7 @@ body: |
 
   bb.1:
     $vgpr0,$sgpr0_sgpr1 = V_ADD_I32_e64 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-    $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0
+    $vgpr4 = V_WRITELANE_B32 $sgpr0, $sgpr0, $vgpr4
     S_BRANCH %bb.2
 
   bb.2:
@@ -318,7 +318,7 @@ body: |
 
   bb.3:
     $vgpr0,implicit $vcc = V_ADD_I32_e32 $vgpr1, $vgpr2, implicit $vcc, implicit $exec
-    $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo
+    $vgpr4 = V_WRITELANE_B32 $sgpr4, $vcc_lo, $vgpr4
     S_ENDPGM
 
 ...

Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll?rev=326353&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll Wed Feb 28 11:10:32 2018
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0
+
+; CHECK-LABEL: {{^}}test_writelane_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_sreg:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 32, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 32, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_vreg_lane:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_writelane_b32 v{{[0-9]+}}, 12, [[LANE]]
+define amdgpu_kernel void @test_writelane_vreg_lane(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
+  %args = load <2 x i32>, <2 x i32> addrspace(1)* %gep.in
+  %oldval = load i32, i32 addrspace(1)* %out
+  %lane = extractelement <2 x i32> %args, i32 1
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 12, i32 %lane, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; TODO: m0 should be folded.
+; CHECK-LABEL: {{^}}test_writelane_m0_sreg:
+; CHECK: s_mov_b32 m0, -1
+; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
+; CHECK: v_writelane_b32 v{{[0-9]+}}, [[COPY_M0]], s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %m0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm:
+; CHECK: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 32
+define amdgpu_kernel void @test_writelane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
+  %oldval = load i32, i32 addrspace(1)* %out
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 32, i32 %oldval) #0
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_sreg_oldval:
+; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], s{{[0-9]+}}
+; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_sreg_oldval(i32 inreg %oldval, i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 %oldval)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}test_writelane_imm_oldval:
+; CHECK: v_mov_b32_e32 [[OLDVAL:v[0-9]+]], 42
+; CHECK: v_writelane_b32 [[OLDVAL]], s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @test_writelane_imm_oldval(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+  %writelane = call i32 @llvm.amdgcn.writelane(i32 %src0, i32 %src1, i32 42)
+  store i32 %writelane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind readnone }

Modified: llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll?rev=326353&r1=326352&r2=326353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll Wed Feb 28 11:10:32 2018
@@ -216,7 +216,7 @@ entry:
 ; GCN-DAG: v_writelane_b32 v34, s35, 2
 ; GCN-DAG: s_add_u32 s32, s32, 0x400
 
-; GCN: s_getpc_b64
+; GCN-DAG: s_getpc_b64
 ; GCN: s_swappc_b64
 
 ; GCN: s_getpc_b64 s[6:7]



