[llvm] r310283 - [AMDGPU] Add pseudo "old" source to all DPP instructions

Connor Abbott via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 7 12:10:56 PDT 2017


Author: cwabbott
Date: Mon Aug  7 12:10:56 2017
New Revision: 310283

URL: http://llvm.org/viewvc/llvm-project?rev=310283&view=rev
Log:
[AMDGPU] Add pseudo "old" source to all DPP instructions

Summary:
Any instruction with the DPP modifier might not write to certain lanes of
the output if bound_ctrl=1 is set or any bits in bank_mask or row_mask
aren't set, so the destination register may be both defined and modified.
The right way to handle this is to add a constraint that the destination
register is the same as one of the inputs. We could tie the destination
to the first source, but that would be too restrictive for some use-cases
where we want the destination to be some other value before the
instruction executes. Instead, add a fake "old" source and tie it to the
destination. Effectively, the "old" source defines what value unwritten
lanes will get. We'll expose this functionality to users with a new
intrinsic later.

Also, we want to use DPP instructions for computing derivatives, which
means we need to set WQM for them. We also need to enable the entire
wavefront when using DPP intrinsics to implement nonuniform subgroup
reductions, since otherwise we'll get incorrect results in some cases.
To accommodate this, add a new operand to all DPP instructions which will
be interpreted by the SI WQM pass. This will be exposed with a new
intrinsic later. We'll also add support for Whole Wavefront Mode later.

I also fixed llvm.amdgcn.mov.dpp to overwrite the source and fixed up
the test. However, I could also keep the old behavior (where lanes that
aren't written are undefined) if people want it.

Reviewers: tstellar, arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye

Differential Revision: https://reviews.llvm.org/D34716

Modified:
    llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
    llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td
    llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
    llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
    llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir
    llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll

Modified: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp?rev=310283&r1=310282&r2=310283&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp Mon Aug  7 12:10:56 2017
@@ -4458,6 +4458,11 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Ins
     ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
   }
 
+  // All DPP instructions with at least one source operand have a fake "old"
+  // source at the beginning that's tied to the dst operand. Handle it here.
+  if (Desc.getNumOperands() >= 2)
+    Inst.addOperand(Inst.getOperand(0));
+
   for (unsigned E = Operands.size(); I != E; ++I) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     // Add the register arguments
@@ -4480,16 +4485,6 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Ins
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppRowMask, 0xf);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
-
-  // special case v_mac_{f16, f32}:
-  // it has src2 register operand that is tied to dst operand
-  if (Inst.getOpcode() == AMDGPU::V_MAC_F32_dpp ||
-      Inst.getOpcode() == AMDGPU::V_MAC_F16_dpp) {
-    auto it = Inst.begin();
-    std::advance(
-        it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2));
-    Inst.insert(it, Inst.getOperand(0)); // src2 = dst
-  }
 }
 
 //===----------------------------------------------------------------------===//

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td?rev=310283&r1=310282&r2=310283&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.td Mon Aug  7 12:10:56 2017
@@ -1184,8 +1184,9 @@ class getInsVOP3OpSel <RegisterOperand S
   );
 }
 
-class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
-                 bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
+class getInsDPP <RegisterOperand DstRC, RegisterClass Src0RC, RegisterClass Src1RC,
+                 int NumSrcArgs, bit HasModifiers,
+                 Operand Src0Mod, Operand Src1Mod> {
 
   dag ret = !if (!eq(NumSrcArgs, 0),
                 // VOP1 without input operands (V_NOP)
@@ -1194,26 +1195,29 @@ class getInsDPP <RegisterClass Src0RC, R
             !if (!eq(NumSrcArgs, 1),
               !if (!eq(HasModifiers, 1),
                 // VOP1_DPP with modifiers
-                (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
-                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                (ins DstRC:$old, Src0Mod:$src0_modifiers,
+                     Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                      bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
               /* else */,
                 // VOP1_DPP without modifiers
-                (ins Src0RC:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
-                bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
+                (ins DstRC:$old, Src0RC:$src0,
+                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
               /* endif */)
               /* NumSrcArgs == 2 */,
               !if (!eq(HasModifiers, 1),
                 // VOP2_DPP with modifiers
-                (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+                (ins DstRC:$old,
+                     Src0Mod:$src0_modifiers, Src0RC:$src0,
                      Src1Mod:$src1_modifiers, Src1RC:$src1,
                      dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                      bank_mask:$bank_mask, bound_ctrl:$bound_ctrl)
               /* else */,
                 // VOP2_DPP without modifiers
-                (ins Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
-                row_mask:$row_mask, bank_mask:$bank_mask,
-                bound_ctrl:$bound_ctrl)
+                (ins DstRC:$old,
+                     Src0RC:$src0, Src1RC:$src1, dpp_ctrl:$dpp_ctrl,
+                     row_mask:$row_mask, bank_mask:$bank_mask,
+                     bound_ctrl:$bound_ctrl)
              /* endif */)));
 }
 
@@ -1548,7 +1552,7 @@ class VOPProfile <list<ValueType> _ArgVT
                                            getOpSelMod<Src0VT>.ret,
                                            getOpSelMod<Src1VT>.ret,
                                            getOpSelMod<Src2VT>.ret>.ret;
-  field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
+  field dag InsDPP = getInsDPP<DstRCDPP, Src0DPP, Src1DPP, NumSrcArgs,
                                HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
   field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
                                  HasSDWAOMod, Src0ModSDWA, Src1ModSDWA,

Modified: llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td?rev=310283&r1=310282&r2=310283&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP1Instructions.td Mon Aug  7 12:10:56 2017
@@ -266,7 +266,8 @@ def VOP_MOVRELD : VOPProfile<[untyped, i
   let Outs = (outs);
   let Ins32 = (ins Src0RC32:$vdst, VSrc_b32:$src0);
   let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0);
-  let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+  let InsDPP = (ins DstRC:$vdst, DstRC:$old, Src0RC32:$src0,
+                    dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
 
   let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
@@ -504,8 +505,6 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo
   let Uses = ps.Uses;
   let SchedRW = ps.SchedRW;
   let hasSideEffects = ps.hasSideEffects;
-  let Constraints = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   bits<8> vdst;
   let Inst{8-0}   = 0xfa; // dpp
@@ -659,11 +658,11 @@ let Predicates = [isVI] in {
 def : Pat <
   (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
                       imm:$bound_ctrl)),
-  (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
-                       (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
+  (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
+                       (as_i32imm $row_mask), (as_i32imm $bank_mask),
+                       (as_i1imm $bound_ctrl))
 >;
 
-
 def : Pat<
   (i32 (anyext i16:$src)),
   (COPY $src)

Modified: llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td?rev=310283&r1=310282&r2=310283&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP2Instructions.td Mon Aug  7 12:10:56 2017
@@ -209,9 +209,9 @@ class VOP_MAC <ValueType vt> : VOPProfil
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
   let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
                        HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
-  let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins DstRCDPP:$old,
+                    Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
-                    VGPR_32:$src2, // stub argument
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
 
@@ -282,7 +282,8 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile
                      dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel, src1_sel:$src1_sel);
 
-  let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0,
+  let InsDPP = (ins DstRCDPP:$old,
+                    Src0Mod:$src0_modifiers, Src0DPP:$src0,
                     Src1Mod:$src1_modifiers, Src1DPP:$src1,
                     dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
                     bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
@@ -665,8 +666,6 @@ class VOP2_DPP <bits<6> op, VOP2_Pseudo
   let Uses = ps.Uses;
   let SchedRW = ps.SchedRW;
   let hasSideEffects = ps.hasSideEffects;
-  let Constraints = ps.Constraints;
-  let DisableEncoding = ps.DisableEncoding;
 
   bits<8> vdst;
   bits<8> src1;

Modified: llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td?rev=310283&r1=310282&r2=310283&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOPInstructions.td Mon Aug  7 12:10:56 2017
@@ -510,6 +510,8 @@ class VOP_DPP <string OpName, VOPProfile
   let AssemblerPredicate = !if(P.HasExt, HasDPP, DisableInst);
   let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
                                      AMDGPUAsmVariants.Disable);
+  let Constraints = !if(P.NumSrcArgs, "$old = $vdst", "");
+  let DisableEncoding = !if(P.NumSrcArgs, "$old", "");
   let DecoderNamespace = "DPP";
 }
 

Modified: llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir?rev=310283&r1=310282&r2=310283&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/inserted-wait-states.mir Mon Aug  7 12:10:56 2017
@@ -504,12 +504,12 @@ name: dpp
 body: |
   bb.0:
     %vgpr0 = V_MOV_B32_e32 0, implicit %exec
-    %vgpr1 = V_MOV_B32_dpp %vgpr0, 0, 15, 15, 0, implicit %exec
+    %vgpr1 = V_MOV_B32_dpp %vgpr1, %vgpr0, 0, 15, 15, 0, implicit %exec
     S_BRANCH %bb.1
 
   bb.1:
     implicit %exec, implicit %vcc = V_CMPX_EQ_I32_e32 %vgpr0, %vgpr1, implicit %exec
-    %vgpr3 = V_MOV_B32_dpp %vgpr0, 0, 15, 15, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_dpp %vgpr3, %vgpr0, 0, 15, 15, 0, implicit %exec
     S_ENDPGM
 ...
 ---

Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll?rev=310283&r1=310282&r2=310283&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll Mon Aug  7 12:10:56 2017
@@ -5,8 +5,10 @@
 
 ; VI-LABEL: {{^}}dpp_test:
 ; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
+; VI-NOOPT: v_mov_b32_e32 v1, s{{[0-9]+}}
 ; VI: s_nop 1
-; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI-OPT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
+; VI-NOOPT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x01,0x01,0x08,0x11]
 define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
   store i32 %tmp0, i32 addrspace(1)* %out
@@ -14,11 +16,14 @@ define amdgpu_kernel void @dpp_test(i32
 }
 
 ; VI-LABEL: {{^}}dpp_wait_states:
+; VI-NOOPT: v_mov_b32_e32 [[VGPR1:v[0-9]+]], s{{[0-9]+}}
 ; VI: v_mov_b32_e32 [[VGPR0:v[0-9]+]], s{{[0-9]+}}
 ; VI: s_nop 1
-; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-OPT: v_mov_b32_dpp [[VGPR0]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-NOOPT: v_mov_b32_dpp [[VGPR1]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI: s_nop 1
-; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-OPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
+; VI-NOOPT: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
   %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0




More information about the llvm-commits mailing list