[llvm] r363870 - Reapply "AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics"
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 19 12:55:27 PDT 2019
Author: arsenm
Date: Wed Jun 19 12:55:27 2019
New Revision: 363870
URL: http://llvm.org/viewvc/llvm-project?rev=363870&view=rev
Log:
Reapply "AMDGPU: Add ds_gws_init / ds_gws_barrier intrinsics"
This reapplies r363678, using the correct chain for the CopyToReg for
v0. glueCopyToM0 counterintuitively changes the operands of the
original node.
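Concretely, the chain and glue for the v0 copy must be re-read from N
only after glueCopyToM0 runs, since that call rewrites N's operand list
in place. A minimal sketch of the fixed ordering (condensed from
SelectDS_GWS below; M0Val stands in for the computed m0 value):

    glueCopyToM0(N, M0Val);  // mutates N's operands in place
    SDValue Chain = N->getOperand(0);                       // glued chain
    SDValue Glue = N->getOperand(N->getNumOperands() - 1);  // glue value
    SDValue CopyToV0 = CurDAG->getCopyToReg(Chain, SL, V0, VSrc0, Glue);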
Added:
llvm/trunk/test/CodeGen/AMDGPU/gws-hazards.mir
llvm/trunk/test/CodeGen/AMDGPU/insert-skips-gws.mir
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/trunk/test/CodeGen/AMDGPU/tail-duplication-convergent.ll
Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Wed Jun 19 12:55:27 2019
@@ -1348,6 +1348,28 @@ def int_amdgcn_alignbyte : Intrinsic<[ll
[IntrNoMem, IntrSpeculatable]
>;
+// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
+//
+// bar_val is the total number of waves that will wait on this
+// barrier, minus 1.
+def int_amdgcn_ds_gws_init :
+ GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
+ Intrinsic<[],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrConvergent, IntrWriteMem, IntrInaccessibleMemOnly], "",
+ [SDNPMemOperand]
+>;
+
+// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id)
+//
+// vsrc0 is the total number of waves that will wait on this
+// barrier, minus 1.
+def int_amdgcn_ds_gws_barrier :
+ GCCBuiltin<"__builtin_amdgcn_ds_gws_barrier">,
+ Intrinsic<[],
+ [llvm_i32_ty, llvm_i32_ty],
+ [IntrConvergent, IntrInaccessibleMemOnly], "",
+ [SDNPMemOperand]
+>;
// Copies the source value to the destination value, with the guarantee that
// the source value is computed as if the entire program were executed in WQM.
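For context, a usage sketch of the corresponding builtins declared above
(hypothetical kernel code, not part of this patch; it assumes an AMDGPU
device compiler exposing the builtins, and in practice the init would be
issued by a single wave before the others reach the barrier):

    // Synchronize num_waves waves on GWS barrier resource 0.
    // Both builtins take the wave count minus 1 as the first operand.
    void sync_waves(int num_waves) {
      __builtin_amdgcn_ds_gws_init(num_waves - 1, 0);    // set up counter
      // ... work ...
      __builtin_amdgcn_ds_gws_barrier(num_waves - 1, 0); // wait for all
    }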
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp Wed Jun 19 12:55:27 2019
@@ -218,7 +218,9 @@ private:
void SelectFMAD_FMA(SDNode *N);
void SelectATOMIC_CMP_SWAP(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
+ void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
+ void SelectINTRINSIC_VOID(SDNode *N);
protected:
// Include the pieces autogenerated from the target description.
@@ -832,6 +834,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *
SelectINTRINSIC_W_CHAIN(N);
return;
}
+ case ISD::INTRINSIC_VOID: {
+ SelectINTRINSIC_VOID(N);
+ return;
+ }
}
SelectCode(N);
@@ -2034,6 +2040,72 @@ void AMDGPUDAGToDAGISel::SelectDSAppendC
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
+ SDLoc SL(N);
+ SDValue VSrc0 = N->getOperand(2);
+ SDValue BaseOffset = N->getOperand(3);
+ int ImmOffset = 0;
+ MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+ MachineMemOperand *MMO = M->getMemOperand();
+
+ // Don't worry if the offset ends up in a VGPR. Only one lane will have
+ // an effect, so SIFixSGPRCopies will validly insert a readfirstlane.
+
+ // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+ // offset field) % 64. Some versions of the programming guide omit the m0
+ // part, or claim it's from offset 0.
+ if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
+ // If we have a constant offset, try to use the default value for m0 as a
+ // base to possibly avoid setting it up.
+ glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
+ ImmOffset = ConstOffset->getZExtValue() + 1;
+ } else {
+ if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
+ ImmOffset = BaseOffset.getConstantOperandVal(1);
+ BaseOffset = BaseOffset.getOperand(0);
+ }
+
+ // Prefer to do the shift in an SGPR since it should be possible to use m0
+ // as the result directly. If it's already an SGPR, it will be eliminated
+ // later.
+ SDNode *SGPROffset
+ = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
+ BaseOffset);
+ // Shift to offset in m0
+ SDNode *M0Base
+ = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+ SDValue(SGPROffset, 0),
+ CurDAG->getTargetConstant(16, SL, MVT::i32));
+ glueCopyToM0(N, SDValue(M0Base, 0));
+ }
+
+ // The manual doesn't mention this, but it seems only v0 works.
+ SDValue V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
+
+ SDValue CopyToV0 = CurDAG->getCopyToReg(
+ N->getOperand(0), SL, V0, VSrc0,
+ N->getOperand(N->getNumOperands() - 1));
+
+ SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
+
+ // TODO: Can this just be removed from the instruction?
+ SDValue GDS = CurDAG->getTargetConstant(1, SL, MVT::i1);
+
+ unsigned Opc = IntrID == Intrinsic::amdgcn_ds_gws_init ?
+ AMDGPU::DS_GWS_INIT : AMDGPU::DS_GWS_BARRIER;
+
+ SDValue Ops[] = {
+ V0,
+ OffsetField,
+ GDS,
+ CopyToV0, // Chain
+ CopyToV0.getValue(1) // Glue
+ };
+
+ SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
+}
+
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
@@ -2044,6 +2116,18 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC
SelectDSAppendConsume(N, IntrID);
return;
}
+ }
+
+ SelectCode(N);
+}
+
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ SelectDS_GWS(N, IntrID);
+ return;
default:
break;
}
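The constant-offset path above works because of how the hardware folds
m0 into the resource id. A worked sketch of that formula (restating the
comment in SelectDS_GWS; not code from the patch):

    // resource_id = (<isa opaque base> + M0[21:16] + offset field) % 64
    unsigned gwsResourceId(unsigned IsaBase, unsigned M0,
                           unsigned OffsetField) {
      return (IsaBase + ((M0 >> 16) & 0x3f) + OffsetField) % 64;
    }

With the default m0 value of -1, M0[21:16] is 63, so emitting the user
offset plus 1 in the offset field gives (base + 63 + offset + 1) % 64 ==
(base + offset) % 64; that is why the constant case can leave m0 at its
default and just bias the immediate by 1.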
Modified: llvm/trunk/lib/Target/AMDGPU/DSInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/DSInstructions.td?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/DSInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/DSInstructions.td Wed Jun 19 12:55:27 2019
@@ -467,11 +467,15 @@ defm DS_WRXCHG_RTN_B64 : DS_1A1D_RET_mc<
defm DS_WRXCHG2_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>;
defm DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET_mc<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>;
-def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init">;
+let isConvergent = 1 in {
+def DS_GWS_INIT : DS_GWS_1D<"ds_gws_init"> {
+ let mayLoad = 0;
+}
def DS_GWS_SEMA_V : DS_GWS_0D<"ds_gws_sema_v">;
def DS_GWS_SEMA_BR : DS_GWS_1D<"ds_gws_sema_br">;
def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
+}
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Jun 19 12:55:27 2019
@@ -961,6 +961,24 @@ bool SITargetLowering::getTgtMemIntrinsi
return true;
}
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier: {
+ Info.opc = ISD::INTRINSIC_VOID;
+
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ Info.ptrVal =
+ MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+
+ // This is an abstract access, but we need to specify a type and size.
+ Info.memVT = MVT::i32;
+ Info.size = 4;
+ Info.align = 4;
+
+ Info.flags = MachineMemOperand::MOStore;
+ if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
default:
return false;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp Wed Jun 19 12:55:27 2019
@@ -536,15 +536,19 @@ void WaitcntBrackets::updateByEvent(cons
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+ int AddrOpIdx =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
// All GDS operations must protect their address register (same as
// export.)
- if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
- Inst.getOpcode() != AMDGPU::DS_CONSUME) {
- setExpScore(
- &Inst, TII, TRI, MRI,
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
- CurrScore);
+ if (AddrOpIdx != -1) {
+ setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
+ } else {
+ assert(Inst.getOpcode() == AMDGPU::DS_APPEND ||
+ Inst.getOpcode() == AMDGPU::DS_CONSUME ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
+ Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER);
}
+
if (Inst.mayStore()) {
if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
AMDGPU::OpName::data0) != -1) {
@@ -1407,18 +1411,6 @@ bool SIInsertWaitcnts::insertWaitcntInBl
ScoreBrackets.dump();
});
- // Check to see if this is a GWS instruction. If so, and if this is CI or
- // VI, then the generated code sequence will include an S_WAITCNT 0.
- // TODO: Are these the only GWS instructions?
- if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
- Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
- Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
- // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
- ScoreBrackets.applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
- }
-
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
if (VCCZBugWorkAround) {
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Wed Jun 19 12:55:27 2019
@@ -2547,7 +2547,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhen
// given the typical code patterns.
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
- Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP)
+ Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
+ Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
return true;
if (MI.isCall() || MI.isInlineAsm())
Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h Wed Jun 19 12:55:27 2019
@@ -43,7 +43,8 @@ class AMDGPUPseudoSourceValue : public P
public:
enum AMDGPUPSVKind : unsigned {
PSVBuffer = PseudoSourceValue::TargetCustom,
- PSVImage
+ PSVImage,
+ GWSResource
};
protected:
@@ -87,6 +88,30 @@ public:
}
};
+class AMDGPUGWSResourcePseudoSourceValue final : public AMDGPUPseudoSourceValue {
+public:
+ explicit AMDGPUGWSResourcePseudoSourceValue(const TargetInstrInfo &TII)
+ : AMDGPUPseudoSourceValue(GWSResource, TII) {}
+
+ static bool classof(const PseudoSourceValue *V) {
+ return V->kind() == GWSResource;
+ }
+
+ // These are inaccessible memory from IR.
+ bool isAliased(const MachineFrameInfo *) const override {
+ return false;
+ }
+
+ // These are inaccessible memory from IR.
+ bool mayAlias(const MachineFrameInfo *) const override {
+ return false;
+ }
+
+ void printCustom(raw_ostream &OS) const override {
+ OS << "GWSResource";
+ }
+};
+
namespace yaml {
struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
@@ -188,6 +213,7 @@ class SIMachineFunctionInfo final : publ
std::unique_ptr<const AMDGPUBufferPseudoSourceValue>> BufferPSVs;
DenseMap<const Value *,
std::unique_ptr<const AMDGPUImagePseudoSourceValue>> ImagePSVs;
+ std::unique_ptr<const AMDGPUGWSResourcePseudoSourceValue> GWSResourcePSV;
private:
unsigned LDSWaveSpillSize = 0;
@@ -674,6 +700,15 @@ public:
return PSV.first->second.get();
}
+ const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
+ if (!GWSResourcePSV) {
+ GWSResourcePSV =
+ llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
+ }
+
+ return GWSResourcePSV.get();
+ }
+
unsigned getOccupancy() const {
return Occupancy;
}
Added: llvm/trunk/test/CodeGen/AMDGPU/gws-hazards.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/gws-hazards.mir?rev=363870&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/gws-hazards.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/gws-hazards.mir Wed Jun 19 12:55:27 2019
@@ -0,0 +1,103 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=VI %s
+# RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=CI %s
+# RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=SI %s
+
+---
+name: m0_gws_init0
+tracksRegLiveness: true
+body: |
+
+ bb.0:
+ liveins: $vgpr0
+ ; GFX9-LABEL: name: m0_gws_init0
+ ; GFX9: liveins: $vgpr0
+ ; GFX9: $m0 = S_MOV_B32 -1
+ ; GFX9: S_NOP 0
+ ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; VI-LABEL: name: m0_gws_init0
+ ; VI: liveins: $vgpr0
+ ; VI: $m0 = S_MOV_B32 -1
+ ; VI: S_NOP 0
+ ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; CI-LABEL: name: m0_gws_init0
+ ; CI: liveins: $vgpr0
+ ; CI: $m0 = S_MOV_B32 -1
+ ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; SI-LABEL: name: m0_gws_init0
+ ; SI: liveins: $vgpr0
+ ; SI: $m0 = S_MOV_B32 -1
+ ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ $m0 = S_MOV_B32 -1
+ DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+...
+
+---
+name: m0_gws_init1
+tracksRegLiveness: true
+body: |
+
+ bb.0:
+ ; GFX9-LABEL: name: m0_gws_init1
+ ; GFX9: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GFX9: $m0 = S_MOV_B32 -1
+ ; GFX9: S_NOP 0
+ ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; VI-LABEL: name: m0_gws_init1
+ ; VI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; VI: $m0 = S_MOV_B32 -1
+ ; VI: S_NOP 0
+ ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; CI-LABEL: name: m0_gws_init1
+ ; CI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CI: $m0 = S_MOV_B32 -1
+ ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; SI-LABEL: name: m0_gws_init1
+ ; SI: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; SI: $m0 = S_MOV_B32 -1
+ ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $m0 = S_MOV_B32 -1
+ DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+...
+
+# Test a typical situation where m0 needs to be set from a VGPR
+# through readfirstlane
+---
+name: m0_gws_readlane
+tracksRegLiveness: true
+body: |
+
+ bb.0:
+ liveins: $vgpr0, $vgpr1
+
+ ; GFX9-LABEL: name: m0_gws_readlane
+ ; GFX9: liveins: $vgpr0, $vgpr1
+ ; GFX9: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; GFX9: $m0 = S_MOV_B32 $sgpr0
+ ; GFX9: S_NOP 0
+ ; GFX9: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; VI-LABEL: name: m0_gws_readlane
+ ; VI: liveins: $vgpr0, $vgpr1
+ ; VI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; VI: $m0 = S_MOV_B32 $sgpr0
+ ; VI: S_NOP 0
+ ; VI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; CI-LABEL: name: m0_gws_readlane
+ ; CI: liveins: $vgpr0, $vgpr1
+ ; CI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; CI: $m0 = S_MOV_B32 $sgpr0
+ ; CI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; SI-LABEL: name: m0_gws_readlane
+ ; SI: liveins: $vgpr0, $vgpr1
+ ; SI: $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; SI: $m0 = S_MOV_B32 $sgpr0
+ ; SI: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ $sgpr0 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ $m0 = S_MOV_B32 $sgpr0
+ DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+...
Added: llvm/trunk/test/CodeGen/AMDGPU/insert-skips-gws.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/insert-skips-gws.mir?rev=363870&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/insert-skips-gws.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/insert-skips-gws.mir Wed Jun 19 12:55:27 2019
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-skips -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
+# Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
+
+---
+
+name: skip_gws_init
+body: |
+ ; CHECK-LABEL: name: skip_gws_init
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
+ ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; CHECK: bb.2:
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ SI_MASK_BRANCH %bb.2, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ DS_GWS_INIT $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+
+name: skip_gws_barrier
+body: |
+ ; CHECK-LABEL: name: skip_gws_barrier
+ ; CHECK: bb.0:
+ ; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK: SI_MASK_BRANCH %bb.2, implicit $exec
+ ; CHECK: S_CBRANCH_EXECZ %bb.2, implicit $exec
+ ; CHECK: bb.1:
+ ; CHECK: successors: %bb.2(0x80000000)
+ ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK: DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
+ ; CHECK: bb.2:
+ ; CHECK: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1, %bb.2
+ SI_MASK_BRANCH %bb.2, implicit $exec
+
+ bb.1:
+ successors: %bb.2
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ DS_GWS_BARRIER $vgpr0, 0, 1, implicit $m0, implicit $exec
+
+ bb.2:
+ S_ENDPGM 0
+...
Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll?rev=363870&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll Wed Jun 19 12:55:27 2019
@@ -0,0 +1,179 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -mattr=+flat-for-global -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIPLUS %s
+
+; Minimum offset
+; GCN-LABEL: {{^}}gws_barrier_offset0:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
+ ret void
+}
+
+; Maximum offset
+; GCN-LABEL: {{^}}gws_barrier_offset63:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:64 gds{{$}}
+define amdgpu_kernel void @gws_barrier_offset63(i32 %val) #0 {
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 63)
+ ret void
+}
+
+; FIXME: Should be able to shift directly into m0
+; GCN-LABEL: {{^}}gws_barrier_sgpr_offset:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_barrier v0 gds{{$}}
+define amdgpu_kernel void @gws_barrier_sgpr_offset(i32 %val, i32 %offset) #0 {
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
+ ret void
+}
+
+; Variable offset in SGPR with constant add
+; GCN-LABEL: {{^}}gws_barrier_sgpr_offset_add1:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+define amdgpu_kernel void @gws_barrier_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
+ %offset = add i32 %offset.base, 1
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
+ ret void
+}
+
+; GCN-LABEL: {{^}}gws_barrier_vgpr_offset:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 gds{{$}}
+define amdgpu_kernel void @gws_barrier_vgpr_offset(i32 %val) #0 {
+ %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
+ ret void
+}
+
+; Variable offset in VGPR with constant add
+; GCN-LABEL: {{^}}gws_barrier_vgpr_offset_add:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_barrier v0 offset:3 gds{{$}}
+define amdgpu_kernel void @gws_barrier_vgpr_offset_add(i32 %val) #0 {
+ %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
+ %vgpr.offset = add i32 %vgpr.offset.base, 3
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %vgpr.offset)
+ ret void
+}
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef
+
+; Check if m0 initialization is shared
+; GCN-LABEL: {{^}}gws_barrier_save_m0_barrier_constant_offset:
+; GCN: s_mov_b32 m0, -1
+; GCN-NOT: s_mov_b32 m0
+define amdgpu_kernel void @gws_barrier_save_m0_barrier_constant_offset(i32 %val) #0 {
+ store i32 1, i32 addrspace(3)* @lds
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 10)
+ store i32 2, i32 addrspace(3)* @lds
+ ret void
+}
+
+; Make sure this increments lgkmcnt
+; GCN-LABEL: {{^}}gws_barrier_lgkmcnt:
+; GCN: ds_gws_barrier v0 offset:1 gds{{$}}
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @gws_barrier_lgkmcnt(i32 %val) {
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 0)
+ ret void
+}
+
+; Does not imply memory fence on its own
+; GCN-LABEL: {{^}}gws_barrier_wait_before:
+; GCN: store_dword
+; CIPLUS-NOT: s_waitcnt
+; GCN: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_barrier_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
+ store i32 0, i32 addrspace(1)* %ptr
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+ ret void
+}
+
+; GCN-LABEL: {{^}}gws_barrier_wait_after:
+; GCN: ds_gws_barrier v0 offset:8 gds
+; GCN-NEXT: s_waitcnt expcnt(0){{$}}
+; GCN-NEXT: load_dword
+define amdgpu_kernel void @gws_barrier_wait_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+ %load = load volatile i32, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; Does not imply memory fence on its own
+; GCN-LABEL: {{^}}gws_barrier_fence_before:
+; GCN: store_dword
+; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_barrier_fence_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
+ store i32 0, i32 addrspace(1)* %ptr
+ fence release
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+ ret void
+}
+
+; GCN-LABEL: {{^}}gws_barrier_fence_after:
+; GCN: ds_gws_barrier v0 offset:8 gds
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: load_dword
+define amdgpu_kernel void @gws_barrier_fence_after(i32 %val, i32 addrspace(1)* %ptr) #0 {
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+ fence release
+ %load = load volatile i32, i32 addrspace(1)* %ptr
+ ret void
+}
+
+; FIXME: Should a wait be inserted here, or is an explicit fence needed?
+; GCN-LABEL: {{^}}gws_init_barrier:
+; GCN: s_mov_b32 m0, -1
+; GCN: ds_gws_init v0 offset:8 gds
+; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_init_barrier(i32 %val) #0 {
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+ ret void
+}
+
+; FIXME: Why vmcnt, not expcnt?
+; GCN-LABEL: {{^}}gws_init_fence_barrier:
+; GCN: s_mov_b32 m0, -1
+; GCN: ds_gws_init v0 offset:8 gds
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: ds_gws_barrier v0 offset:8 gds
+define amdgpu_kernel void @gws_init_fence_barrier(i32 %val) #0 {
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
+ fence release
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 7)
+ ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #1
+declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
+declare i32 @llvm.amdgcn.workitem.id.x() #3
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent inaccessiblememonly nounwind }
+attributes #2 = { convergent inaccessiblememonly nounwind writeonly }
+attributes #3 = { nounwind readnone speculatable }
Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll?rev=363870&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.init.ll Wed Jun 19 12:55:27 2019
@@ -0,0 +1,119 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+; Minimum offset
+; GCN-LABEL: {{^}}gws_init_offset0:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:1 gds{{$}}
+define amdgpu_kernel void @gws_init_offset0(i32 %val) #0 {
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
+ ret void
+}
+
+; Maximum offset
+; GCN-LABEL: {{^}}gws_init_offset63:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: s_mov_b32 m0, -1{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:64 gds{{$}}
+define amdgpu_kernel void @gws_init_offset63(i32 %val) #0 {
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 63)
+ ret void
+}
+
+; FIXME: Should be able to shift directly into m0
+; GCN-LABEL: {{^}}gws_init_sgpr_offset:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_init v0 gds{{$}}
+define amdgpu_kernel void @gws_init_sgpr_offset(i32 %val, i32 %offset) #0 {
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
+ ret void
+}
+
+; Variable offset in SGPR with constant add
+; GCN-LABEL: {{^}}gws_init_sgpr_offset_add1:
+; GCN-DAG: s_load_dwordx2 s{{\[}}[[BAR_NUM:[0-9]+]]:[[OFFSET:[0-9]+]]{{\]}}
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], s[[OFFSET]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, s[[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:1 gds{{$}}
+define amdgpu_kernel void @gws_init_sgpr_offset_add1(i32 %val, i32 %offset.base) #0 {
+ %offset = add i32 %offset.base, 1
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
+ ret void
+}
+
+; GCN-LABEL: {{^}}gws_init_vgpr_offset:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 gds{{$}}
+define amdgpu_kernel void @gws_init_vgpr_offset(i32 %val) #0 {
+ %vgpr.offset = call i32 @llvm.amdgcn.workitem.id.x()
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
+ ret void
+}
+
+; Variable offset in VGPR with constant add
+; GCN-LABEL: {{^}}gws_init_vgpr_offset_add:
+; GCN-DAG: s_load_dword [[BAR_NUM:s[0-9]+]]
+; GCN-DAG: v_readfirstlane_b32 [[READLANE:s[0-9]+]], v0
+; GCN-DAG: s_lshl_b32 [[SHL:s[0-9]+]], [[READLANE]], 16
+; GCN-DAG: s_mov_b32 m0, [[SHL]]{{$}}
+; GCN-DAG: v_mov_b32_e32 v0, [[BAR_NUM]]
+; GCN: ds_gws_init v0 offset:3 gds{{$}}
+define amdgpu_kernel void @gws_init_vgpr_offset_add(i32 %val) #0 {
+ %vgpr.offset.base = call i32 @llvm.amdgcn.workitem.id.x()
+ %vgpr.offset = add i32 %vgpr.offset.base, 3
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %vgpr.offset)
+ ret void
+}
+
+@lds = internal unnamed_addr addrspace(3) global i32 undef
+
+; Check if m0 initialization is shared.
+; GCN-LABEL: {{^}}gws_init_save_m0_init_constant_offset:
+; GCN: s_mov_b32 m0, -1
+; GCN-NOT: s_mov_b32 m0
+define amdgpu_kernel void @gws_init_save_m0_init_constant_offset(i32 %val) #0 {
+ store i32 1, i32 addrspace(3)* @lds
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 10)
+ store i32 2, i32 addrspace(3)* @lds
+ ret void
+}
+
+; GCN-LABEL: {{^}}gws_init_lgkmcnt:
+; GCN: ds_gws_init v0 offset:1 gds{{$}}
+; GCN-NEXT: s_waitcnt expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64
+define void @gws_init_lgkmcnt(i32 %val) {
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 0)
+ ret void
+}
+
+; Does not imply memory fence on its own
+; GCN-LABEL: {{^}}gws_init_wait_before:
+; GCN: store_dword
+; CIPLUS-NOT: s_waitcnt
+; GCN: ds_gws_init v0 offset:8 gds
+define amdgpu_kernel void @gws_init_wait_before(i32 %val, i32 addrspace(1)* %ptr) #0 {
+ store i32 0, i32 addrspace(1)* %ptr
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 7)
+ ret void
+}
+
+declare void @llvm.amdgcn.ds.gws.init(i32, i32) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+attributes #0 = { nounwind }
+attributes #1 = { convergent inaccessiblememonly nounwind writeonly }
+attributes #2 = { nounwind readnone speculatable }
Modified: llvm/trunk/test/CodeGen/AMDGPU/tail-duplication-convergent.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/tail-duplication-convergent.ll?rev=363870&r1=363869&r2=363870&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/tail-duplication-convergent.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/tail-duplication-convergent.ll Wed Jun 19 12:55:27 2019
@@ -6,6 +6,8 @@
declare void @nonconvergent_func() #0
declare void @convergent_func() #1
declare void @llvm.amdgcn.s.barrier() #1
+declare void @llvm.amdgcn.ds.gws.init(i32, i32) #2
+declare void @llvm.amdgcn.ds.gws.barrier(i32, i32) #2
; barrier shouldn't be duplicated.
@@ -100,6 +102,52 @@ call:
ret void
}
+; GCN-LABEL: {{^}}taildup_gws_init:
+; GCN: ds_gws_init
+; GCN-NOT: ds_gws_init
+define amdgpu_kernel void @taildup_gws_init(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
+entry:
+ br i1 %cond, label %bb1, label %bb2
+
+bb1:
+ store i32 0, i32 addrspace(1)* %a
+ br label %call
+
+bb2:
+ store i32 1, i32 addrspace(1)* %a
+ br label %call
+
+call:
+ call void @llvm.amdgcn.ds.gws.init(i32 %val, i32 %offset)
+ br label %ret
+
+ret:
+ ret void
+}
+
+; GCN-LABEL: {{^}}taildup_gws_barrier:
+; GCN: ds_gws_barrier
+; GCN-NOT: ds_gws_barrier
+define amdgpu_kernel void @taildup_gws_barrier(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i1 %cond, i32 %val, i32 %offset) #0 {
+entry:
+ br i1 %cond, label %bb1, label %bb2
+
+bb1:
+ store i32 0, i32 addrspace(1)* %a
+ br label %call
+
+bb2:
+ store i32 1, i32 addrspace(1)* %a
+ br label %call
+
+call:
+ call void @llvm.amdgcn.ds.gws.barrier(i32 %val, i32 %offset)
+ br label %ret
+
+ret:
+ ret void
+}
attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
+attributes #2 = { convergent inaccessiblememonly nounwind }