[llvm] r310088 - [AMDGPU] Implement llvm.amdgcn.set.inactive intrinsic

Connor Abbott via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 4 11:36:54 PDT 2017


Author: cwabbott
Date: Fri Aug  4 11:36:54 2017
New Revision: 310088

URL: http://llvm.org/viewvc/llvm-project?rev=310088&view=rev
Log:
[AMDGPU] Implement llvm.amdgcn.set.inactive intrinsic

Summary:
This intrinsic lets us set inactive lanes to an identity value when
implementing wavefront reductions. In combination with Whole Wavefront
Mode, it lets inactive lanes be skipped over as required by GLSL/Vulkan.
Lowering the intrinsic needs to happen post-RA so that RA knows that the
destination isn't completely overwritten due to the EXEC shenanigans, so
we need another pseudo-instruction to represent the un-lowered
intrinsic.

Reviewers: tstellar, arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye

Differential Revision: https://reviews.llvm.org/D34719

Added:
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
Modified:
    llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/trunk/test/CodeGen/AMDGPU/wqm.ll

Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=310088&r1=310087&r2=310088&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Fri Aug  4 11:36:54 2017
@@ -756,6 +756,16 @@ def int_amdgcn_wwm : Intrinsic<[llvm_any
   [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// Given a value, copies it while setting all the inactive lanes to a given
+// value. Note that OpenGL helper lanes are considered active, so if the
+// program ever uses WQM, then the instruction and the first source will be
+// computed in WQM.
+def int_amdgcn_set_inactive :
+  Intrinsic<[llvm_anyint_ty],
+            [LLVMMatchType<0>, // value to be copied
+             LLVMMatchType<0>], // value for the inactive lanes to take
+            [IntrNoMem, IntrConvergent]>;
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=310088&r1=310087&r2=310088&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri Aug  4 11:36:54 2017
@@ -1099,6 +1099,28 @@ bool SIInstrInfo::expandPostRAPseudo(Mac
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::V_SET_INACTIVE_B32: {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
+      .add(MI.getOperand(2));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MI.eraseFromParent();
+    break;
+  }
+  case AMDGPU::V_SET_INACTIVE_B64: {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
+                                 MI.getOperand(0).getReg())
+      .add(MI.getOperand(2));
+    expandPostRAPseudo(*Copy);
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
+      .addReg(AMDGPU::EXEC);
+    MI.eraseFromParent();
+    break;
+  }
   case AMDGPU::V_MOVRELD_B32_V1:
   case AMDGPU::V_MOVRELD_B32_V2:
   case AMDGPU::V_MOVRELD_B32_V4:

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=310088&r1=310087&r2=310088&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Fri Aug  4 11:36:54 2017
@@ -137,6 +137,20 @@ def EXIT_WWM : SPseudoInstSI <(outs SReg
   let mayStore = 0;
 }
 
+// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
+// restoring the exec mask after we're done.
+def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
+  (ins VGPR_32: $src, VSrc_b32:$inactive),
+  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
+  let Constraints = "$src = $vdst";
+}
+
+def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
+  (ins VReg_64: $src, VSrc_b64:$inactive),
+  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
+  let Constraints = "$src = $vdst";
+}
+
 let usesCustomInserter = 1, SALU = 1 in {
 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;

Modified: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp?rev=310088&r1=310087&r2=310088&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp Fri Aug  4 11:36:54 2017
@@ -303,6 +303,7 @@ char SIWholeQuadMode::scanInstructions(M
                                        std::vector<WorkItem> &Worklist) {
   char GlobalFlags = 0;
   bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
+  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -341,6 +342,23 @@ char SIWholeQuadMode::scanInstructions(M
         GlobalFlags |= StateWWM;
         LowerToCopyInstrs.push_back(&MI);
         continue;
+      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
+                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
+        III.Disabled = StateWWM;
+        MachineOperand &Inactive = MI.getOperand(2);
+        if (Inactive.isReg()) {
+          if (Inactive.isUndef()) {
+            LowerToCopyInstrs.push_back(&MI);
+          } else {
+            unsigned Reg = Inactive.getReg();
+            if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
+                markInstruction(DefMI, StateWWM, Worklist);
+            }
+          }
+        }
+        SetInactiveInstrs.push_back(&MI);
+        continue;
       } else if (TII->isDisableWQM(MI)) {
         BBI.Needs |= StateExact;
         if (!(BBI.InNeeds & StateExact)) {
@@ -380,6 +398,14 @@ char SIWholeQuadMode::scanInstructions(M
     }
   }
 
+  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
+  // ever used anywhere in the function. This implements the corresponding
+  // semantics of @llvm.amdgcn.set.inactive.
+  if (GlobalFlags & StateWQM) {
+    for (MachineInstr *MI : SetInactiveInstrs)
+      markInstruction(*MI, StateWQM, Worklist);
+  }
+
   return GlobalFlags;
 }
 
@@ -799,8 +825,11 @@ void SIWholeQuadMode::lowerLiveMaskQueri
 }
 
 void SIWholeQuadMode::lowerCopyInstrs() {
-  for (MachineInstr *MI : LowerToCopyInstrs)
+  for (MachineInstr *MI : LowerToCopyInstrs) {
+    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
+      MI->RemoveOperand(i);
     MI->setDesc(TII->get(AMDGPU::COPY));
+  }
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {

Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll?rev=310088&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll Fri Aug  4 11:36:54 2017
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+
+
+; GCN-LABEL: {{^}}set_inactive:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 42
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) {
+  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
+  store i32 %tmp, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}set_inactive_64:
+; GCN: s_not_b64 exec, exec
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
+; GCN: s_not_b64 exec, exec
+define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
+  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
+  store i64 %tmp, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
+declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0
+
+attributes #0 = { convergent readnone }

Modified: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/wqm.ll?rev=310088&r1=310087&r2=310088&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll Fri Aug  4 11:36:54 2017
@@ -256,6 +256,47 @@ endif:
   ret float %out.1
 }
 
+; Check that @llvm.amdgcn.set.inactive disables WWM.
+;
+;CHECK-LABEL: {{^}}test_set_inactive1:
+;CHECK: buffer_load_dword
+;CHECK: s_not_b64 exec, exec
+;CHECK: v_mov_b32_e32
+;CHECK: s_not_b64 exec, exec
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: v_add_i32_e32
+define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
+main_body:
+  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %src.0 = bitcast float %src to i32
+  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
+  %out = add i32 %src.1, %src.1
+  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  call void @llvm.amdgcn.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  ret void
+}
+
+; Check that enabling WQM anywhere enables WQM for the set.inactive source.
+;
+;CHECK-LABEL: {{^}}test_set_inactive2:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %src1.0 = bitcast float %src1 to i32
+  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src0.0 = bitcast float %src0 to i32
+  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
+  %out = add i32 %src0.1, %src1.1
+  %out.0 = bitcast i32 %out to float
+  call void @llvm.amdgcn.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  ret void
+}
+
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
 ; exact.
 ;
@@ -513,7 +554,7 @@ main_body:
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: v_add_f32_e32 v0,
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
-define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
+define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
 main_body:
   %s = fadd float %a, %b
   ret float %s
@@ -680,10 +721,12 @@ declare float @llvm.amdgcn.wqm.f32(float
 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
 declare float @llvm.amdgcn.wwm.f32(float) #3
 declare i32 @llvm.amdgcn.wwm.i32(i32) #3
+declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
 attributes #3 = { nounwind readnone }
-attributes #4 = { "amdgpu-ps-wqm-outputs" }
+attributes #4 = { nounwind readnone convergent }
+attributes #5 = { "amdgpu-ps-wqm-outputs" }




More information about the llvm-commits mailing list