[PATCH] D32343: AMDGPU: Move v_readlane lane select from VGPR to SGPR

Fri Apr 21 02:21:20 PDT 2017

nhaehnle created this revision.
Herald added subscribers: t-tye, tpr, dstuttard, yaxunl, wdng, kzhuravl.

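Fix a compiler bug that occurs when the lane select operand of
v_readlane_b32 ends up in a VGPR: the instruction only accepts an
immediate or an SGPR there, so insert a v_readfirstlane_b32 to copy
the lane select into an SGPR.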

Clarify the semantics of the corresponding intrinsic so that they match
those of the equivalent GLSL built-in: the lane select must be uniform
across the active threads of a wavefront; otherwise the result is undefined.


https://reviews.llvm.org/D32343

Files:
  include/llvm/IR/IntrinsicsAMDGPU.td
  lib/Target/AMDGPU/SIInstrInfo.cpp
  test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll


Index: test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
===================================================================

--- test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -19,6 +19,18 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}test_readlane_vregs:
+; CHECK: v_readfirstlane_b32 [[LANE:s[0-9]+]], v{{[0-9]+}}
+; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, [[LANE]]
+define amdgpu_kernel void @test_readlane_vregs(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #1 {
+  %args = load <2 x i32>, <2 x i32> addrspace(1)* %in
+  %value = extractelement <2 x i32> %args, i32 0
+  %lane = extractelement <2 x i32> %args, i32 1
+  %readlane = call i32 @llvm.amdgcn.readlane(i32 %value, i32 %lane)
+  store i32 %readlane, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; TODO: m0 should be folded.
 ; CHECK-LABEL: {{^}}test_readlane_m0_sreg:
 ; CHECK: s_mov_b32 m0, -1
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2635,6 +2635,18 @@
   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
     return;
 
+  // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
+  // lane select.
+  if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
+      RI.isVGPR(MRI, Src1.getReg())) {
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    DebugLoc DL = MI.getDebugLoc();
+    BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+        .add(Src1);
+    Src1.ChangeToRegister(Reg, false);
+    return;
+  }
+
   // We do not use commuteInstruction here because it is too aggressive and will
   // commute if it is possible. We only want to commute here if it improves
   // legality. This can be called a fairly large number of times so don't waste
Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -629,6 +629,8 @@
   GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
+// The lane argument must be uniform across the currently active threads of the
+// current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
   GCCBuiltin<"__builtin_amdgcn_readlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D32343.96111.patch
Type: text/x-patch
Size: 2532 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20170421/23315803/attachment.bin>