[llvm] r310085 - [AMDGPU] Add an llvm.amdgcn.wqm intrinsic for WQM

Connor Abbott via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 4 11:36:49 PDT 2017


Author: cwabbott
Date: Fri Aug  4 11:36:49 2017
New Revision: 310085

URL: http://llvm.org/viewvc/llvm-project?rev=310085&view=rev
Log:
[AMDGPU] Add an llvm.amdgcn.wqm intrinsic for WQM

Summary:
Previously, we assumed that certain types of instructions needed WQM in
pixel shaders, particularly DS instructions and image sampling
instructions. This was ok because with OpenGL, the assumption was
correct. But we want to start using DPP instructions for derivatives as
well as other things, so the assumption that we can infer whether to use
WQM based on the instruction won't continue to hold. This intrinsic lets
frontends like Mesa indicate what things need WQM based on their
knowledge of the API, rather than second-guessing them in the backend.
We need to keep around the old method of enabling WQM, but eventually we
should remove it once Mesa catches up. For now, this will let us use DPP
instructions for computing derivatives correctly.

Reviewers: arsenm, tpr, nhaehnle

Subscribers: kzhuravl, wdng, yaxunl, dstuttard, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D35167

Modified:
    llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
    llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/trunk/test/CodeGen/AMDGPU/wqm.ll

Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=310085&r1=310084&r2=310085&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Fri Aug  4 11:36:49 2017
@@ -740,6 +740,13 @@ def int_amdgcn_alignbyte : Intrinsic<[ll
   [IntrNoMem, IntrSpeculatable]
 >;
 
+
+// Copies the source value to the destination value, with the guarantee that
+// the source value is computed as if the entire program were executed in WQM.
+def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp?rev=310085&r1=310084&r2=310085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp Fri Aug  4 11:36:49 2017
@@ -338,6 +338,9 @@ static bool isSafeToFoldImmIntoCopy(cons
                                     unsigned &SMovOp,
                                     int64_t &Imm) {
 
+  if (Copy->getOpcode() != AMDGPU::COPY)
+    return false;
+
   if (!MoveImm->isMoveImmediate())
     return false;
 
@@ -564,7 +567,8 @@ bool SIFixSGPRCopies::runOnMachineFuncti
       switch (MI.getOpcode()) {
       default:
         continue;
-      case AMDGPU::COPY: {
+      case AMDGPU::COPY:
+      case AMDGPU::WQM: {
         // If the destination register is a physical register there isn't really
         // much we can do to fix this.
         if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=310085&r1=310084&r2=310085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri Aug  4 11:36:49 2017
@@ -3942,6 +3942,11 @@ SDValue SITargetLowering::LowerINTRINSIC
                                Op.getOperand(1), Op.getOperand(2));
     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
   }
+  case Intrinsic::amdgcn_wqm: {
+    SDValue Src = Op.getOperand(1);
+    return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
+                   0);
+  }
   default:
     return Op;
   }

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=310085&r1=310084&r2=310085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri Aug  4 11:36:49 2017
@@ -2666,6 +2666,7 @@ unsigned SIInstrInfo::getVALUOp(const Ma
   case AMDGPU::COPY: return AMDGPU::COPY;
   case AMDGPU::PHI: return AMDGPU::PHI;
   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
+  case AMDGPU::WQM: return AMDGPU::WQM;
   case AMDGPU::S_MOV_B32:
     return MI.getOperand(1).isReg() ?
            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -3970,6 +3971,7 @@ const TargetRegisterClass *SIInstrInfo::
   case AMDGPU::PHI:
   case AMDGPU::REG_SEQUENCE:
   case AMDGPU::INSERT_SUBREG:
+  case AMDGPU::WQM:
     if (RI.hasVGPRs(NewDstRC))
       return nullptr;
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=310085&r1=310084&r2=310085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Fri Aug  4 11:36:49 2017
@@ -116,6 +116,11 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(
 // pass to enable folding of inline immediates.
 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
+
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy
+// after the WQM pass processes them.
+def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
 let usesCustomInserter = 1, SALU = 1 in {

Modified: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp?rev=310085&r1=310084&r2=310085&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp Fri Aug  4 11:36:49 2017
@@ -136,6 +136,7 @@ private:
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
+  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
 
   void printInfo();
 
@@ -162,6 +163,7 @@ private:
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
   void lowerLiveMaskQueries(unsigned LiveMaskReg);
+  void lowerCopyInstrs();
 
 public:
   static char ID;
@@ -294,6 +296,11 @@ char SIWholeQuadMode::scanInstructions(M
         markUsesWQM(MI, Worklist);
         GlobalFlags |= StateWQM;
         continue;
+      } else if (Opcode == AMDGPU::WQM) {
+        // The WQM intrinsic requires its output to have all the helper lanes
+        // correct, so we need it to be in WQM.
+        Flags = StateWQM;
+        LowerToCopyInstrs.push_back(&MI);
       } else if (TII->isDisableWQM(MI)) {
         Flags = StateExact;
       } else {
@@ -666,6 +673,11 @@ void SIWholeQuadMode::lowerLiveMaskQueri
   }
 }
 
+void SIWholeQuadMode::lowerCopyInstrs() {
+  for (MachineInstr *MI : LowerToCopyInstrs)
+    MI->setDesc(TII->get(AMDGPU::COPY));
+}
+
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
     return false;
@@ -673,6 +685,7 @@ bool SIWholeQuadMode::runOnMachineFuncti
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
+  LowerToCopyInstrs.clear();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
 
@@ -708,6 +721,7 @@ bool SIWholeQuadMode::runOnMachineFuncti
           .addReg(AMDGPU::EXEC);
 
       lowerLiveMaskQueries(LiveMaskReg);
+      lowerCopyInstrs();
       // EntryMI may become invalid here
       return true;
     }
@@ -716,6 +730,7 @@ bool SIWholeQuadMode::runOnMachineFuncti
   DEBUG(printInfo());
 
   lowerLiveMaskQueries(LiveMaskReg);
+  lowerCopyInstrs();
 
   // Handle the general case
   for (auto BII : Blocks)

Modified: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/wqm.ll?rev=310085&r1=310084&r2=310085&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll Fri Aug  4 11:36:49 2017
@@ -74,6 +74,40 @@ main_body:
   ret <4 x float> %dtex
 }
 
+; Check that WQM is triggered by the wqm intrinsic.
+;
+;CHECK-LABEL: {{^}}test5:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that the wqm intrinsic works correctly for integers.
+;
+;CHECK-LABEL: {{^}}test6:
+;CHECK: s_wqm_b64 exec, exec
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = bitcast float %out to i32
+  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
+  %out.2 = bitcast i32 %out.1 to float
+  ret float %out.2
+}
+
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
 ; exact.
 ;
@@ -494,6 +528,8 @@ declare <4 x float> @llvm.amdgcn.image.s
 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
 declare void @llvm.AMDGPU.kill(float) #1
+declare float @llvm.amdgcn.wqm.f32(float) #3
+declare i32 @llvm.amdgcn.wqm.i32(i32) #3
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }




More information about the llvm-commits mailing list