[llvm] r363185 - [AMDGPU] gfx1010 permlane instructions
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 12 10:52:51 PDT 2019
Author: rampitec
Date: Wed Jun 12 10:52:51 2019
New Revision: 363185
URL: http://llvm.org/viewvc/llvm-project?rev=363185&view=rev
Log:
[AMDGPU] gfx1010 permlane instructions
Differential Revision: https://reviews.llvm.org/D63202
Added:
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
llvm/trunk/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Wed Jun 12 10:52:51 2019
@@ -1437,6 +1437,18 @@ def int_amdgcn_ds_bpermute :
// GFX10 Intrinsics
//===----------------------------------------------------------------------===//
+// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
+def int_amdgcn_permlane16 :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+ [IntrNoMem, IntrConvergent]>;
+
+// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
+def int_amdgcn_permlanex16 :
+ Intrinsic<[llvm_i32_ty],
+ [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+ [IntrNoMem, IntrConvergent]>;
+
def int_amdgcn_s_get_waveid_in_workgroup :
GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
Intrinsic<[llvm_i32_ty], [], [IntrReadMem]>;
Modified: llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp Wed Jun 12 10:52:51 2019
@@ -1148,6 +1148,7 @@ private:
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst);
bool validateLdsDirect(const MCInst &Inst);
+ bool validateOpSel(const MCInst &Inst);
bool validateVOP3Literal(const MCInst &Inst) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -3003,6 +3004,19 @@ bool AMDGPUAsmParser::validateSOPLiteral
return NumLiterals <= 1;
}
+bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+
+ if (OpSel & ~3)
+ return false;
+ }
+ return true;
+}
+
// VOP3 literal is only allowed in GFX10+ and only one can be used
bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
unsigned Opcode = Inst.getOpcode();
@@ -3071,6 +3085,11 @@ bool AMDGPUAsmParser::validateInstructio
"integer clamping is not supported on this GPU");
return false;
}
+ if (!validateOpSel(Inst)) {
+ Error(IDLoc,
+ "invalid op_sel operand");
+ return false;
+ }
// For MUBUF/MTBUF d16 is a part of opcode, so there is nothing to validate.
if (!validateMIMGD16(Inst)) {
Error(IDLoc,
Modified: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.cpp Wed Jun 12 10:52:51 2019
@@ -115,6 +115,12 @@ static bool isSendMsgTraceDataOrGDS(cons
}
}
+static bool isPermlane(const MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ return Opcode == AMDGPU::V_PERMLANE16_B32 ||
+ Opcode == AMDGPU::V_PERMLANEX16_B32;
+}
+
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
AMDGPU::OpName::simm16);
@@ -835,11 +841,49 @@ int GCNHazardRecognizer::checkReadM0Haza
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
fixVMEMtoScalarWriteHazards(MI);
+ fixVcmpxPermlaneHazards(MI);
fixSMEMtoVectorWriteHazards(MI);
fixVcmpxExecWARHazard(MI);
fixLdsBranchVmemWARHazard(MI);
}
+bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
+ if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
+ return false;
+
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ auto IsHazardFn = [TII] (MachineInstr *MI) {
+ return TII->isVOPC(*MI);
+ };
+
+ auto IsExpiredFn = [] (MachineInstr *MI, int) {
+ if (!MI)
+ return false;
+ unsigned Opc = MI->getOpcode();
+ return SIInstrInfo::isVALU(*MI) &&
+ Opc != AMDGPU::V_NOP_e32 &&
+ Opc != AMDGPU::V_NOP_e64 &&
+ Opc != AMDGPU::V_NOP_sdwa;
+ };
+
+ if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
+ std::numeric_limits<int>::max())
+ return false;
+
+ // V_NOP will be discarded by SQ.
+ // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
+ // which is always a VGPR and available.
+ auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+ unsigned Reg = Src0->getReg();
+ bool IsUndef = Src0->isUndef();
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_MOV_B32_e32))
+ .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
+ .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
+
+ return true;
+}
+
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
if (!ST.hasVMEMtoScalarWriteHazard())
return false;
Modified: llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/GCNHazardRecognizer.h Wed Jun 12 10:52:51 2019
@@ -84,8 +84,8 @@ private:
int checkAnyInstHazards(MachineInstr *MI);
int checkReadM0Hazards(MachineInstr *SMovRel);
int checkNSAtoVMEMHazard(MachineInstr *MI);
-
void fixHazards(MachineInstr *MI);
+ bool fixVcmpxPermlaneHazards(MachineInstr *MI);
bool fixVMEMtoScalarWriteHazards(MachineInstr *MI);
bool fixSMEMtoVectorWriteHazards(MachineInstr *MI);
bool fixVcmpxExecWARHazard(MachineInstr *MI);
Modified: llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp Wed Jun 12 10:52:51 2019
@@ -1005,6 +1005,18 @@ void AMDGPUInstPrinter::printPackedModif
void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
const MCSubtargetInfo &STI,
raw_ostream &O) {
+ unsigned Opc = MI->getOpcode();
+ if (Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_gfx10) {
+ auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ unsigned FI = !!(MI->getOperand(FIN).getImm() & SISrcMods::OP_SEL_0);
+ unsigned BC = !!(MI->getOperand(BCN).getImm() & SISrcMods::OP_SEL_0);
+ if (FI || BC)
+ O << " op_sel:[" << FI << ',' << BC << ']';
+ return;
+ }
+
printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Jun 12 10:52:51 2019
@@ -9721,6 +9721,24 @@ SDNode *SITargetLowering::PostISelFoldin
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
+ case AMDGPU::V_PERMLANE16_B32:
+ case AMDGPU::V_PERMLANEX16_B32: {
+ ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
+ ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
+ if (!FI->getZExtValue() && !BC->getZExtValue())
+ break;
+ SDValue VDstIn = Node->getOperand(6);
+ if (VDstIn.isMachineOpcode()
+ && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
+ break;
+ MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+ SDLoc(Node), MVT::i32);
+ SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
+ SDValue(BC, 0), Node->getOperand(3),
+ Node->getOperand(4), Node->getOperand(5),
+ SDValue(ImpDef, 0), Node->getOperand(7) };
+ return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+ }
default:
break;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Wed Jun 12 10:52:51 2019
@@ -3790,6 +3790,26 @@ void SIInstrInfo::legalizeOperandsVOP3(M
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
};
+ if (Opc == AMDGPU::V_PERMLANE16_B32 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32) {
+ // src1 and src2 must be scalar
+ MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
+ MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src1);
+ Src1.ChangeToRegister(Reg, false);
+ }
+ if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
+ unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src2);
+ Src2.ChangeToRegister(Reg, false);
+ }
+ }
+
// Find the one SGPR operand we are allowed to use.
int ConstantBusLimit = ST.getConstantBusLimit(Opc);
int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
Modified: llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td?rev=363185&r1=363184&r2=363185&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/VOP3Instructions.td Wed Jun 12 10:52:51 2019
@@ -625,9 +625,35 @@ def : ThreeOp_i32_Pats<xor, add, V_XAD_U
} // End SubtargetPredicate = isGFX9Plus
+def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
+ let Src0RC64 = VRegSrc_32;
+ let Src1RC64 = SCSrc_b32;
+ let Src2RC64 = SCSrc_b32;
+ let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
+ IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1,
+ IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2,
+ VGPR_32:$vdst_in, op_sel:$op_sel);
+ let HasClamp = 0;
+ let HasOMod = 0;
+}
+
let SubtargetPredicate = isGFX10Plus in {
def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
+
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ def V_PERMLANE16_B32 : VOP3Inst <"v_permlane16_b32", VOP3_PERMLANE_Profile>;
+ def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+
+ def : GCNPat<
+ (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
+ def : GCNPat<
+ (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
+ >;
} // End SubtargetPredicate = isGFX10Plus
//===----------------------------------------------------------------------===//
@@ -790,6 +816,8 @@ defm V_MAX_I16 : VOP3_Real_gfx10
defm V_MIN_U16 : VOP3_Real_gfx10_with_name<0x30b, "V_MIN_U16_e64", "v_min_u16">;
defm V_MIN_I16 : VOP3_Real_gfx10_with_name<0x30c, "V_MIN_I16_e64", "v_min_i16">;
defm V_LSHLREV_B16 : VOP3_Real_gfx10_with_name<0x314, "V_LSHLREV_B16_e64", "v_lshlrev_b16">;
+defm V_PERMLANE16_B32 : VOP3OpSel_Real_gfx10<0x377>;
+defm V_PERMLANEX16_B32 : VOP3OpSel_Real_gfx10<0x378>;
//===----------------------------------------------------------------------===//
// GFX7, GFX10.
Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll?rev=363185&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll Wed Jun 12 10:52:51 2019
@@ -0,0 +1,311 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1
+declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vss:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vii:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vll:
+; FIXME-GFX10: It is allowed to have both immediates as literals
+; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
+; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vvv:
+; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
+; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vvs:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vsv:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vss_bc:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_vss_fi_bc:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vss:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vss(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vii:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vii(i32 addrspace(1)* %out, i32 %src0) #1 {
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 1, i32 2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vll:
+; FIXME-GFX10: It is allowed to have both immediates as literals
+; GFX10-DAG: s_movk_i32 [[SRC1:s[0-9]+]], 0x1234
+; GFX10-DAG: s_mov_b32 [[SRC2:s[0-9]+]], 0xc1d1
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vll(i32 addrspace(1)* %out, i32 %src0) #1 {
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vvv:
+; GFX10-DAG: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
+; GFX10-DAG: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], [[SRC2]]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vvv(i32 addrspace(1)* %out, i32 %src0) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vvs:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_readfirstlane_b32 [[SRC1:s[0-9]+]], v0
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[SRC1]], s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vvs(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vsv:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_readfirstlane_b32 [[SRC2:s[0-9]+]], v1
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, [[SRC2]]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vsv(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+ %tidy = call i32 @llvm.amdgcn.workitem.id.y()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vss_bc:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vss_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 0, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_vss_fi_bc:
+; GFX10-NOT: v_readfirstlane_b32
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 1, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_tid_tid:
+; GFX10: v_permlane16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlane16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_undef_tid:
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlane16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_i_tid:
+; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
+; GFX10: v_permlane16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlane16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi:
+; GFX10-NOT: 0x3039
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_bc:
+; GFX10-NOT: 0x3039
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlane16_b32_i_tid_fi_bc:
+; GFX10-NOT: 0x3039
+; GFX10: v_permlane16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
+define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_tid_tid:
+; GFX10: v_permlanex16_b32 v0, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_tid_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_undef_tid:
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_undef_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid:
+; GFX10: v_mov_b32_e32 [[OLD:v[0-9]+]], 0x3039
+; GFX10: v_permlanex16_b32 [[OLD]], v0, s{{[0-9]+}}, s{{[0-9]+}}{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_i_tid(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi:
+; GFX10-NOT: 0x3039
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,0]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_bc:
+; GFX10-NOT: 0x3039
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[0,1]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 0, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_permlanex16_b32_i_tid_fi_bc:
+; GFX10-NOT: 0x3039
+; GFX10: v_permlanex16_b32 v{{[0-9]+}}, v0, s{{[0-9]+}}, s{{[0-9]+}} op_sel:[1,1]{{$}}
+define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #1 {
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlanex16(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 1, i1 1)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+attributes #0 = { nounwind readnone convergent }
+attributes #1 = { nounwind }
Added: llvm/trunk/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir?rev=363185&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir Wed Jun 12 10:52:51 2019
@@ -0,0 +1,145 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-skips,post-RA-hazard-rec -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: hazard_vcmpx_permlane16
+# GCN: V_CMPX_LE_F32_nosdst_e32
+# GCN: S_ADD_U32
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: V_PERMLANE16_B32
+---
+name: hazard_vcmpx_permlane16
+body: |
+ bb.0:
+ successors: %bb.1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = IMPLICIT_DEF
+ $sgpr0 = IMPLICIT_DEF
+ $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: name: hazard_vcmpx_permlanex16
+# GCN: V_CMPX_LE_F32_nosdst_e32
+# GCN: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: V_PERMLANEX16_B32
+---
+name: hazard_vcmpx_permlanex16
+body: |
+ bb.0:
+ successors: %bb.1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = IMPLICIT_DEF
+ $sgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr1 = V_PERMLANEX16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: name: hazard_vcmpx_permlane16_v_nop
+# GCN: V_CMPX_LE_F32_nosdst_e32
+# GCN: V_NOP
+# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: V_PERMLANE16_B32
+---
+name: hazard_vcmpx_permlane16_v_nop
+body: |
+ bb.0:
+ successors: %bb.1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = IMPLICIT_DEF
+ $sgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ V_NOP_e32 implicit $exec
+ $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: name: hazard_vcmpx_permlane16_far
+# GCN: V_CMPX_LE_F32_nosdst_e32
+# GCN: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
+# GCN-NEXT: V_PERMLANE16_B32
+---
+name: hazard_vcmpx_permlane16_far
+body: |
+ bb.0:
+ successors: %bb.1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = IMPLICIT_DEF
+ $sgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ V_NOP_e32 implicit $exec
+ $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: name: hazard_vcmpx_permlane16_no_hazard
+# GCN: V_CMPX_LE_F32_nosdst_e32
+# GCN: V_ADD_F32
+# GCN-NEXT: V_PERMLANE16_B32
+---
+name: hazard_vcmpx_permlane16_no_hazard
+body: |
+ bb.0:
+ successors: %bb.1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = IMPLICIT_DEF
+ $sgpr0 = IMPLICIT_DEF
+ $sgpr1 = IMPLICIT_DEF
+ $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec
+ $vgpr1 = V_PERMLANE16_B32 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
+
+# GCN-LABEL: name: hazard_vcmpx_permlane16_undef_src
+# GCN: V_CMPX_LE_F32_nosdst_e32
+# GCN: S_ADD_U32
+# GCN-NEXT: dead $vgpr1 = V_MOV_B32_e32 undef $vgpr1, implicit $exec
+# GCN-NEXT: V_PERMLANE16_B32
+---
+name: hazard_vcmpx_permlane16_undef_src
+body: |
+ bb.0:
+ successors: %bb.1
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ SI_KILL_F32_COND_IMM_TERMINATOR $vgpr0, 0, 3, implicit-def $exec, implicit-def $vcc, implicit-def $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr2 = IMPLICIT_DEF
+ $sgpr0 = IMPLICIT_DEF
+ $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc
+ $vgpr1 = V_PERMLANE16_B32 0, undef $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, undef $vgpr1, 0, implicit $exec
+ S_ENDPGM 0
+...
More information about the llvm-commits
mailing list