[llvm] r359959 - [AMDGPU] gfx1010: use fmac instructions
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri May 3 21:20:37 PDT 2019
Author: rampitec
Date: Fri May 3 21:20:37 2019
New Revision: 359959
URL: http://llvm.org/viewvc/llvm-project?rev=359959&view=rev
Log:
[AMDGPU] gfx1010: use fmac instructions
Differential Revision: https://reviews.llvm.org/D61527
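
gfx1010 drops the v_mad f16 forms in favor of v_fmac/v_fma, so the pattern this patch repeats across SIISelLowering, SIInstrInfo, and SIPeepholeSDWA is: choose between the MAD and FMA opcode families when folding or converting, and verify via pseudoToMCOpcode() that the chosen pseudo actually has an encoding on the subtarget before rewriting. Below is a minimal standalone sketch of that selection, using a hypothetical enum and a string set in place of the real opcode tables (not LLVM code):

    #include <cstdio>
    #include <optional>
    #include <set>
    #include <string>

    enum class Opc { MADMK_F32, MADMK_F16, FMAMK_F32, FMAMK_F16 };

    static const char *name(Opc O) {
      switch (O) {
      case Opc::MADMK_F32: return "V_MADMK_F32";
      case Opc::MADMK_F16: return "V_MADMK_F16";
      case Opc::FMAMK_F32: return "V_FMAMK_F32";
      case Opc::FMAMK_F16: return "V_FMAMK_F16";
      }
      return "";
    }

    // Stand-in for SIInstrInfo::pseudoToMCOpcode(), which returns -1 when the
    // pseudo has no MC encoding on the current subtarget.
    static bool hasEncoding(Opc O, const std::set<std::string> &ISA) {
      return ISA.count(name(O)) != 0;
    }

    // Mirrors the new FoldImmediate logic: derive the immediate-folding opcode
    // from the instruction family and width, then refuse the fold when the
    // subtarget cannot encode it.
    static std::optional<Opc> pickMadMk(bool IsFMA, bool IsF32,
                                        const std::set<std::string> &ISA) {
      Opc NewOpc = IsFMA ? (IsF32 ? Opc::FMAMK_F32 : Opc::FMAMK_F16)
                         : (IsF32 ? Opc::MADMK_F32 : Opc::MADMK_F16);
      if (!hasEncoding(NewOpc, ISA))
        return std::nullopt;
      return NewOpc;
    }

    int main() {
      // Purely illustrative instruction set: FMA "mk" forms plus f32 MAD only.
      std::set<std::string> ISA = {"V_MADMK_F32", "V_FMAMK_F32", "V_FMAMK_F16"};
      for (bool IsFMA : {false, true})
        for (bool IsF32 : {false, true}) {
          auto O = pickMadMk(IsFMA, IsF32, ISA);
          std::printf("IsFMA=%d IsF32=%d -> %s\n", IsFMA, IsF32,
                      O ? name(*O) : "no fold");
        }
      return 0;
    }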
Added:
llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll
llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
llvm/trunk/test/CodeGen/AMDGPU/twoaddr-fma.mir
Modified:
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
llvm/trunk/test/CodeGen/AMDGPU/madak.ll
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Fri May 3 21:20:37 2019
@@ -521,7 +521,7 @@ SITargetLowering::SITargetLowering(const
// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);
- if (!Subtarget->hasFP16Denormals())
+ if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -8723,8 +8723,10 @@ unsigned SITargetLowering::getFusedOpcod
// Only do this if we are not trying to support denormals. v_mad_f32 does not
// support denormals ever.
- if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
- (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
+ if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+ (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+ getSubtarget()->hasMadF16())) &&
+ isOperationLegal(ISD::FMAD, VT))
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
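
In SIISelLowering.cpp, FMAD for f16 is now only marked legal (and only returned from getFusedOpcode) when denormals are flushed and the subtarget still has mad_f16; otherwise the combiner falls back to FMA. A rough paraphrase of the new predicate as a standalone constexpr function (parameter names are mine, not LLVM's):

    // Paraphrase of the updated getFusedOpcode() gate: fuse to ISD::FMAD only
    // if denormals are flushed for the type, the subtarget still has a MAD f16
    // instruction when the type is f16, and FMAD is legal for that type.
    constexpr bool useFMAD(bool IsF16, bool FP32Denorms, bool FP16Denorms,
                           bool HasMadF16, bool FMADLegal) {
      const bool FlushOK = IsF16 ? (!FP16Denorms && HasMadF16) : !FP32Denorms;
      return FlushOK && FMADLegal;
    }

    // gfx1010-like f16 case: denormals off but no v_mad_f16, so no FMAD and
    // the fused operation becomes FMA/FMAC instead.
    static_assert(!useFMAD(true, false, false, /*HasMadF16=*/false, false), "");
    // gfx9-like f32 case with flushed denormals can still use FMAD.
    static_assert(useFMAD(false, /*FP32Denorms=*/false, false, true, true), "");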
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri May 3 21:20:37 2019
@@ -2071,7 +2071,9 @@ bool SIInstrInfo::FoldImmediate(MachineI
}
if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
+ Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64) {
// Don't fold if we are using source or output modifiers. The new VOP2
// instructions don't have them.
if (hasAnyModifiersSet(UseMI))
@@ -2086,7 +2088,10 @@ bool SIInstrInfo::FoldImmediate(MachineI
if (isInlineConstant(UseMI, *Src0, *ImmOp))
return false;
- bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMA_F32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMA_F16 || Opc == AMDGPU::V_FMAC_F16_e64;
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
@@ -2099,6 +2104,12 @@ bool SIInstrInfo::FoldImmediate(MachineI
if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
+ unsigned NewOpc =
+ IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
+ : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return false;
+
// We need to swap operands 0 and 1 since madmk constant is at operand 1.
const int64_t Imm = ImmOp->getImm();
@@ -2119,14 +2130,16 @@ bool SIInstrInfo::FoldImmediate(MachineI
Src0->setIsKill(Src1->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAC_F16_e64)
+ Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
Src1->ChangeToImmediate(Imm);
removeModOperands(UseMI);
- UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
+ UseMI.setDesc(get(NewOpc));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@@ -2176,6 +2189,12 @@ bool SIInstrInfo::FoldImmediate(MachineI
// VGPR is okay as Src1 - fallthrough
}
+ unsigned NewOpc =
+ IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
+ : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return false;
+
const int64_t Imm = ImmOp->getImm();
// FIXME: This would be a lot easier if we could return a new instruction
@@ -2188,7 +2207,9 @@ bool SIInstrInfo::FoldImmediate(MachineI
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
if (Opc == AMDGPU::V_MAC_F32_e64 ||
- Opc == AMDGPU::V_MAC_F16_e64)
+ Opc == AMDGPU::V_MAC_F16_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64)
UseMI.untieRegOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
@@ -2197,7 +2218,7 @@ bool SIInstrInfo::FoldImmediate(MachineI
// These come before src2.
removeModOperands(UseMI);
- UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
+ UseMI.setDesc(get(NewOpc));
bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
if (DeleteDef)
@@ -2310,18 +2331,21 @@ MachineInstr *SIInstrInfo::convertToThre
LiveVariables *LV) const {
unsigned Opc = MI.getOpcode();
bool IsF16 = false;
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64;
switch (Opc) {
default:
return nullptr;
case AMDGPU::V_MAC_F16_e64:
+ case AMDGPU::V_FMAC_F16_e64:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_FMAC_F32_e64:
break;
case AMDGPU::V_MAC_F16_e32:
+ case AMDGPU::V_FMAC_F16_e32:
IsF16 = true;
LLVM_FALLTHROUGH;
case AMDGPU::V_MAC_F32_e32:
@@ -2350,32 +2374,38 @@ MachineInstr *SIInstrInfo::convertToThre
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
+ if (!Src0Mods && !Src1Mods && !Clamp && !Omod &&
// If we have an SGPR input, we will violate the constant bus restriction.
(ST.getConstantBusLimit(Opc) > 1 ||
!Src0->isReg() ||
!RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
if (auto Imm = getFoldableImm(Src2)) {
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
- .add(*Dst)
- .add(*Src0)
- .add(*Src1)
- .addImm(Imm);
+ unsigned NewOpc =
+ IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
+ : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
+ if (pseudoToMCOpcode(NewOpc) != -1)
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .add(*Src1)
+ .addImm(Imm);
}
+ unsigned NewOpc =
+ IsFMA ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
if (auto Imm = getFoldableImm(Src1)) {
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
- .add(*Dst)
- .add(*Src0)
- .addImm(Imm)
- .add(*Src2);
+ if (pseudoToMCOpcode(NewOpc) != -1)
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
+ .add(*Dst)
+ .add(*Src0)
+ .addImm(Imm)
+ .add(*Src2);
}
if (auto Imm = getFoldableImm(Src0)) {
- if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
+ if (pseudoToMCOpcode(NewOpc) != -1 &&
+ isOperandLegal(MI, AMDGPU::getNamedOperandIdx(NewOpc,
AMDGPU::OpName::src0), Src1))
- return BuildMI(*MBB, MI, MI.getDebugLoc(),
- get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
+ return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.add(*Src1)
.addImm(Imm)
@@ -2383,9 +2413,11 @@ MachineInstr *SIInstrInfo::convertToThre
}
}
- assert((!IsFMA || !IsF16) && "fmac only expected with f32");
- unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
- (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16 : AMDGPU::V_FMA_F32)
+ : (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
+ if (pseudoToMCOpcode(NewOpc) == -1)
+ return nullptr;
+
return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
.add(*Dst)
.addImm(Src0Mods ? Src0Mods->getImm() : 0)
@@ -2678,6 +2710,7 @@ bool SIInstrInfo::canShrink(const Machin
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_F16_e64:
if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
return false;
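
convertToThreeAddress() now also handles V_FMAC_F16_e32/_e64: when a source is a foldable literal it prefers the immediate forms (V_FMAAK_*/V_FMAMK_* for FMA, V_MADAK_*/V_MADMK_* for MAD), otherwise it emits the plain three-address V_FMA_*/V_MAD_*, and every candidate is first checked with pseudoToMCOpcode(); canShrink() likewise learns about V_FMAC_F16_e64. The sketch below approximates that fold order with a hypothetical name-based availability set (not LLVM code); the real code additionally requires the absence of source/clamp/omod modifiers and checks operand legality when swapping src0:

    #include <cstdio>
    #include <set>
    #include <string>

    // Fold order: "ak" form for a literal src2, "mk" form for a literal
    // src0/src1, else the plain three-address MAD/FMA; any candidate the
    // subtarget cannot encode is skipped.
    static std::string pick(bool IsFMA, bool IsF16, bool ImmOnSrc2,
                            bool ImmOnSrc0or1,
                            const std::set<std::string> &ISA) {
      auto name = [&](const char *Form) {
        return std::string("V_") + (IsFMA ? "FMA" : "MAD") + Form +
               (IsF16 ? "_F16" : "_F32");
      };
      if (ImmOnSrc2 && ISA.count(name("AK")))    // e.g. V_FMAAK_F16
        return name("AK");
      if (ImmOnSrc0or1 && ISA.count(name("MK"))) // e.g. V_FMAMK_F16
        return name("MK");
      return ISA.count(name("")) ? name("") : "<no 3-address form>";
    }

    int main() {
      // Illustrative gfx1010-like subset: only the FMA f16 forms exist.
      std::set<std::string> ISA = {"V_FMAAK_F16", "V_FMAMK_F16", "V_FMA_F16"};
      std::printf("%s\n", pick(/*IsFMA=*/true, /*IsF16=*/true,
                               /*ImmOnSrc2=*/false, /*ImmOnSrc0or1=*/true,
                               ISA).c_str()); // prints V_FMAMK_F16
      return 0;
    }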
@@ -3410,13 +3443,16 @@ void SIInstrInfo::legalizeOpWithMove(Mac
MachineBasicBlock *MBB = MI.getParent();
MachineOperand &MO = MI.getOperand(OpIdx);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ const SIRegisterInfo *TRI =
+ static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
const TargetRegisterClass *RC = RI.getRegClass(RCID);
- unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+ unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
if (MO.isReg())
Opcode = AMDGPU::COPY;
else if (RI.isSGPRClass(RC))
- Opcode = AMDGPU::S_MOV_B32;
+ Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
@@ -5332,6 +5368,12 @@ MachineOperand *SIInstrInfo::getNamedOpe
}
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
+ return (16ULL << 44) | // IMG_FORMAT_32_FLOAT
+ (1ULL << 56) | // RESOURCE_LEVEL = 1
+ (3ULL << 60); // OOB_SELECT = 3
+ }
+
uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
if (ST.isAmdHsaOS()) {
// Set ATC = 1. GFX9 doesn't have this bit.
@@ -5358,12 +5400,14 @@ uint64_t SIInstrInfo::getScratchRsrcWord
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
}
- // IndexStride = 64.
- Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
+ // IndexStride = 64 / 32.
+ uint64_t IndexStride = ST.getGeneration() <= AMDGPUSubtarget::GFX9 ? 3 : 2;
+ Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
// If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
// Clear them unless we want a huge stride.
- if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
+ ST.getGeneration() <= AMDGPUSubtarget::GFX9)
Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
return Rsrc23;
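
The resource-descriptor helpers also gain GFX10 handling: the default rsrc data format switches to the new descriptor layout (IMG_FORMAT_32_FLOAT plus RESOURCE_LEVEL and OOB_SELECT fields), the scratch descriptor's index-stride field drops from 3 (stride 64) to 2 (stride 32), and the DATA_FORMAT clearing is now limited to GFX8/GFX9. A tiny standalone reproduction of the bit math from the hunks above (field names taken from the in-diff comments; the real shift constants for index stride are omitted):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // GFX10 default rsrc data format, as built in getDefaultRsrcDataFormat():
      // IMG_FORMAT_32_FLOAT starting at bit 44, RESOURCE_LEVEL = 1, OOB_SELECT = 3.
      const uint64_t Gfx10Rsrc = (16ULL << 44) | (1ULL << 56) | (3ULL << 60);
      std::printf("gfx10 default rsrc bits: 0x%016llx\n",
                  (unsigned long long)Gfx10Rsrc);

      // Scratch descriptor index-stride field: 3 selects stride 64 on gfx9 and
      // earlier, 2 selects stride 32 on gfx10 (per the "IndexStride = 64 / 32"
      // comment); the real code then shifts it into RSRC_INDEX_STRIDE position.
      for (bool IsGfx10 : {false, true}) {
        const uint64_t IndexStride = IsGfx10 ? 2 : 3;
        std::printf("gfx10=%d index-stride field=%llu\n", IsGfx10,
                    (unsigned long long)IndexStride);
      }
      return 0;
    }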
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Fri May 3 21:20:37 2019
@@ -1462,7 +1462,7 @@ def : GCNPat<
def : GCNPat<
(fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 0, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+ (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
}
@@ -1523,6 +1523,14 @@ def : GCNPat <
>;
} // End OtherPredicates = [HasDLInsts]
+let SubtargetPredicate = isGFX10Plus in
+def : GCNPat <
+ (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
+ (f16 (VOP3NoMods f32:$src2))),
+ (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
+ SRCMODS.NONE, $src2, $clamp, $omod)
+>;
// Allow integer inputs
class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
Modified: llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIPeepholeSDWA.cpp Fri May 3 21:20:37 2019
@@ -418,7 +418,9 @@ bool SDWASrcOperand::convertToSDWA(Machi
}
assert(Src && Src->isReg());
- if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
!isSameReg(*Src, *getReplacedOperand())) {
// In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
@@ -460,7 +462,9 @@ MachineInstr *SDWADstOperand::potentialT
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
// Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
- if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+ if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
+ MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
+ MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
getDstSel() != AMDGPU::SDWA::DWORD) {
// v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
@@ -964,10 +968,16 @@ bool SIPeepholeSDWA::isConvertibleToSDWA
return false;
}
- if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
+ if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
+ Opc == AMDGPU::V_FMAC_F32_e32 ||
+ Opc == AMDGPU::V_MAC_F16_e32 ||
Opc == AMDGPU::V_MAC_F32_e32))
return false;
+ // Check if target supports this SDWA opcode
+ if (TII->pseudoToMCOpcode(Opc) == -1)
+ return false;
+
// FIXME: has SDWA but require handling of implicit VCC use
if (Opc == AMDGPU::V_CNDMASK_B32_e32)
return false;
@@ -1038,7 +1048,9 @@ bool SIPeepholeSDWA::convertToSDWA(Machi
SDWAInst.add(*Src1);
}
- if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+ if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
+ SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
+ SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
// v_mac_f16/32 has additional src2 operand tied to vdst
MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
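
SIPeepholeSDWA now treats V_FMAC_F16/F32 the same way as V_MAC_F16/F32 (same tied-src2 and dst_sel handling) and adds two early rejections in isConvertibleToSDWA: mac/fmac opcodes are skipped when the subtarget has no SDWA mac support, and any opcode is skipped when pseudoToMCOpcode() reports the target cannot encode it. A compact sketch of those gates with hypothetical string opcodes (not LLVM code):

    #include <cstdio>
    #include <set>
    #include <string>

    static bool isMacLike(const std::string &Opc) {
      return Opc == "V_MAC_F16_e32"  || Opc == "V_MAC_F32_e32" ||
             Opc == "V_FMAC_F16_e32" || Opc == "V_FMAC_F32_e32";
    }

    // Mirrors the two new early-outs in isConvertibleToSDWA(); `Encodable`
    // stands in for "pseudoToMCOpcode(Opc) != -1".
    static bool convertibleToSDWA(const std::string &Opc, bool HasSDWAMac,
                                  const std::set<std::string> &Encodable) {
      if (!HasSDWAMac && isMacLike(Opc))
        return false;              // target has no SDWA mac/fmac support
      if (!Encodable.count(Opc))
        return false;              // opcode does not exist on this subtarget
      return true;                 // remaining checks elided in this sketch
    }

    int main() {
      const std::set<std::string> Encodable = {"V_FMAC_F16_e32", "V_ADD_F16_e32"};
      std::printf("%d\n", convertibleToSDWA("V_FMAC_F16_e32", true, Encodable));
      std::printf("%d\n", convertibleToSDWA("V_MAC_F16_e32", true, Encodable));
      return 0;
    }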
Modified: llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fdiv.f16.ll Fri May 3 21:20:37 2019
@@ -1,7 +1,8 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s
; Make sure fdiv is promoted to f32.
@@ -21,17 +22,17 @@
; SI: v_div_fixup_f32
; SI: v_cvt_f16_f32
-; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
-; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
-; GFX8_9-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
+; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
+; GFX8_9_10-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
-; GFX8_9-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
-; GFX8_9: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
-; GFX8_9: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
-; GFX8_9: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10-DAG: v_rcp_f32_e32 [[RCP_RHS:v[0-9]+]], [[CVT_RHS]]
+; GFX8_9_10: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]]
+; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
+; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
@@ -50,11 +51,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16:
-; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
-; GFX8_9-NOT: [[VAL]]
-; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; GFX8_9-NOT: [[RESULT]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
+; GFX8_9_10-NOT: [[VAL]]
+; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; GFX8_9_10-NOT: [[RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -68,11 +69,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16_abs:
-; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
-; GFX8_9-NOT: [[VAL]]
-; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
-; GFX8_9-NOT: [RESULT]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
+; GFX8_9_10-NOT: [[VAL]]
+; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
+; GFX8_9_10-NOT: [RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -87,11 +88,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16_arcp:
-; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
-; GFX8_9-NOT: [[VAL]]
-; GFX8_9: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; GFX8_9-NOT: [[RESULT]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
+; GFX8_9_10-NOT: [[VAL]]
+; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; GFX8_9_10-NOT: [[RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -105,11 +106,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rcp_f16_neg:
-; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
-; GFX8_9-NOT: [[VAL]]
-; GFX8_9: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
-; GFX8_9-NOT: [RESULT]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
+; GFX8_9_10-NOT: [[VAL]]
+; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
+; GFX8_9_10-NOT: [RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -123,11 +124,11 @@ entry:
}
; GCN-LABEL: {{^}}v_rsq_f16:
-; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
-; GFX8_9-NOT: [[VAL]]
-; GFX8_9: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; GFX8_9-NOT: [RESULT]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
+; GFX8_9_10-NOT: [[VAL]]
+; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; GFX8_9_10-NOT: [RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -142,12 +143,12 @@ entry:
}
; GCN-LABEL: {{^}}v_rsq_f16_neg:
-; GFX8_9: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
-; GFX8_9-NOT: [[VAL]]
-; GFX8_9: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
-; GFX8_9-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
-; GFX8_9-NOT: [RESULT]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[VAL:v[0-9]+]]
+; GFX8_9_10-NOT: [[VAL]]
+; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
+; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
+; GFX8_9_10-NOT: [RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -162,13 +163,13 @@ entry:
}
; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
-; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
-; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
+; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
+; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -184,13 +185,13 @@ entry:
}
; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
-; GFX8_9: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[LHS:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_ushort [[RHS:v[0-9]+]]
-; GFX8_9: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
-; GFX8_9: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
+; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
+; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]]
-; GFX8_9: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -208,8 +209,8 @@ entry:
; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
-; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
-; GFX8_9: buffer_store_short [[MUL]]
+; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
+; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
%rcp = fdiv arcp half %x, 2.0
@@ -220,8 +221,8 @@ define amdgpu_kernel void @div_arcp_2_x_
; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dccc000, v{{[0-9]+}}
-; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
-; GFX8_9: buffer_store_short [[MUL]]
+; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
+; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
%rcp = fdiv arcp half %x, 10.0
@@ -232,8 +233,8 @@ define amdgpu_kernel void @div_arcp_k_x_
; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdccc000, v{{[0-9]+}}
-; GFX8_9: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
-; GFX8_9: buffer_store_short [[MUL]]
+; GFX8_9_10: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
+; GFX8_9_10: buffer_store_short [[MUL]]
define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
%x = load half, half addrspace(1)* undef
%rcp = fdiv arcp half %x, -10.0
Added: llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll?rev=359959&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmac.sdwa.ll Fri May 3 21:20:37 2019
@@ -0,0 +1,76 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX1010 %s
+
+; GCN-LABEL: {{^}}addMul2D:
+; GFX1010: v_fmac_f16
+; GFX1010: v_fmac_f16
+define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
+ %5 = extractelement <2 x i32> %2, i64 1
+ %6 = icmp sgt i32 %5, 0
+ br i1 %6, label %7, label %38
+
+7: ; preds = %4
+ %8 = extractelement <2 x i32> %2, i64 0
+ %9 = icmp sgt i32 %8, 0
+ br label %10
+
+10: ; preds = %34, %7
+ %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
+ %12 = phi i32 [ 0, %7 ], [ %36, %34 ]
+ br i1 %9, label %13, label %34
+
+13: ; preds = %10
+ %14 = mul nsw i32 %12, %3
+ %15 = mul nsw i32 %12, %8
+ br label %16
+
+16: ; preds = %16, %13
+ %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
+ %18 = phi i32 [ 0, %13 ], [ %32, %16 ]
+ %19 = add nsw i32 %18, %14
+ %20 = sext i32 %19 to i64
+ %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
+ %22 = load <4 x i8>, <4 x i8>* %21, align 4
+ %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8
+ %24 = add nsw i32 %18, %15
+ %25 = sext i32 %24 to i64
+ %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
+ %27 = load float, float addrspace(4)* %26, align 4
+ %28 = fptrunc float %27 to half
+ %29 = insertelement <4 x half> undef, half %28, i32 0
+ %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
+ %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
+ %32 = add nuw nsw i32 %18, 1
+ %33 = icmp eq i32 %32, %8
+ br i1 %33, label %34, label %16
+
+34: ; preds = %16, %10
+ %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
+ %36 = add nuw nsw i32 %12, 1
+ %37 = icmp eq i32 %36, %5
+ br i1 %37, label %38, label %10
+
+38: ; preds = %34, %4
+ %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
+ ret <4 x half> %39
+}
+
+define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
+ %2 = extractelement <4 x i8> %0, i64 0
+ %3 = uitofp i8 %2 to half
+ %4 = insertelement <4 x half> undef, half %3, i32 0
+ %5 = extractelement <4 x i8> %0, i64 1
+ %6 = uitofp i8 %5 to half
+ %7 = insertelement <4 x half> %4, half %6, i32 1
+ %8 = extractelement <4 x i8> %0, i64 2
+ %9 = uitofp i8 %8 to half
+ %10 = insertelement <4 x half> %7, half %9, i32 2
+ %11 = extractelement <4 x i8> %0, i64 3
+ %12 = uitofp i8 %11 to half
+ %13 = insertelement <4 x half> %10, half %12, i32 3
+ ret <4 x half> %13
+}
+
+declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)
+
+attributes #0 = { convergent nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" "target-features"="+16-bit-insts,+dl-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx10-insts,+gfx9-insts,+s-memrealtime,-code-object-v3,-sram-ecc,-xnack" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="64" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fp64-fp16-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
Modified: llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/fmuladd.f16.ll Fri May 3 21:20:37 2019
@@ -1,8 +1,13 @@
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare half @llvm.fmuladd.f16(half, half, half) #1
@@ -12,6 +17,11 @@ declare half @llvm.fabs.f16(half) #1
; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX10-FLUSH: v_mul_f16_e32
+; GFX10-FLUSH: v_add_f16_e32
+; GFX10-DENORM: v_fmac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
%r0 = load half, half addrspace(1)* %in1
@@ -23,13 +33,21 @@ define amdgpu_kernel void @fmuladd_f16(h
}
; GCN-LABEL: {{^}}fmuladd_2.0_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX10-DENORM: v_fmac_f16_e32 [[R2:v[0-9]+]], 2.0, [[R1]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
+
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -45,13 +63,21 @@ define amdgpu_kernel void @fmuladd_2.0_a
}
; GCN-LABEL: {{^}}fmuladd_a_2.0_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
+
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -67,17 +93,25 @@ define amdgpu_kernel void @fmuladd_a_2.0
}
; GCN-LABEL: {{^}}fadd_a_a_b_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
half addrspace(1)* %in1,
half addrspace(1)* %in2) #0 {
@@ -96,17 +130,25 @@ define amdgpu_kernel void @fadd_a_a_b_f1
}
; GCN-LABEL: {{^}}fadd_b_a_a_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+
define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
half addrspace(1)* %in1,
half addrspace(1)* %in2) #0 {
@@ -125,11 +167,17 @@ define amdgpu_kernel void @fadd_b_a_a_f1
}
; GCN-LABEL: {{^}}fmuladd_neg_2.0_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
-; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
+; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
+; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -145,13 +193,20 @@ define amdgpu_kernel void @fmuladd_neg_2
}
; GCN-LABEL: {{^}}fmuladd_neg_2.0_neg_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], 2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]]
+; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -169,13 +224,20 @@ define amdgpu_kernel void @fmuladd_neg_2
}
; GCN-LABEL: {{^}}fmuladd_2.0_neg_a_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
-; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
+; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -193,11 +255,14 @@ define amdgpu_kernel void @fmuladd_2.0_n
}
; GCN-LABEL: {{^}}fmuladd_2.0_a_neg_b_f16
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
-; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
+; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
+; GCN-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]]
+; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -215,18 +280,22 @@ define amdgpu_kernel void @fmuladd_2.0_a
}
; GCN-LABEL: {{^}}mad_sub_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
+; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -[[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@@ -246,17 +315,23 @@ define amdgpu_kernel void @mad_sub_f16(h
}
; GCN-LABEL: {{^}}mad_sub_inv_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
+; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@@ -276,17 +351,21 @@ define amdgpu_kernel void @mad_sub_inv_f
}
; GCN-LABEL: {{^}}mad_sub_fabs_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
+; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], -|[[REGC]]|
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
+; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
+; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@@ -307,18 +386,22 @@ define amdgpu_kernel void @mad_sub_fabs_
}
; GCN-LABEL: {{^}}mad_sub_fabs_inv_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
+; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], |[[REGC]]|
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
+; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
+; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@@ -339,18 +422,24 @@ define amdgpu_kernel void @mad_sub_fabs_
}
; GCN-LABEL: {{^}}neg_neg_mad_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], [[REGB]], [[REGC]]
+; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[REGC]], [[REGA]], [[REGB]]
-; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GCN-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
+; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]]
define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@@ -372,18 +461,22 @@ define amdgpu_kernel void @neg_neg_mad_f
}
; GCN-LABEL: {{^}}mad_fabs_sub_f16:
-; GCN: {{buffer|flat}}_load_ushort [[REGA:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGB:v[0-9]+]]
-; GCN: {{buffer|flat}}_load_ushort [[REGC:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGA:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGB:v[0-9]+]]
+; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
+; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[REGA]], |[[REGB]]|, -[[REGC]]
-; VI-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
-; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GCN-DENORM-STRICT: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
+; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]|
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
+; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64
@@ -404,17 +497,24 @@ define amdgpu_kernel void @mad_fabs_sub_
}
; GCN-LABEL: {{^}}fsub_c_fadd_a_a_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
; VI-FLUSH: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], -2.0, [[R2]]
+; GFX10-DENORM-CONTRACT: v_fmac_f16_e32 [[R2]], -2.0, [[R1]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
+; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -432,17 +532,21 @@ define amdgpu_kernel void @fsub_c_fadd_a
}
; GCN-LABEL: {{^}}fsub_fadd_a_a_c_f16:
-; GCN: {{buffer|flat}}_load_ushort [[R1:v[0-9]+]],
-; GCN: {{buffer|flat}}_load_ushort [[R2:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R1:v[0-9]+]],
+; GCN: {{buffer|flat|global}}_load_ushort [[R2:v[0-9]+]],
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
+; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
-; VI-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
-; VI-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GCN-DENORM-STRICT: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]]
+; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
+; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
Modified: llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll Fri May 3 21:20:37 2019
@@ -2,6 +2,8 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-FLUSH %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-DENORM %s
declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
@@ -23,6 +25,13 @@ declare <2 x half> @llvm.fmuladd.v2f16(<
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]
+; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], v[[A_F16]], v[[B_F16]]
+; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
+; GFX10-FLUSH: buffer_store_short [[ADD]]
+
+; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], v[[A_F16]], v[[B_F16]]
+; GFX10-DENORM: buffer_store_short v[[C_F16]],
+
; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16(
half addrspace(1)* %r,
@@ -53,6 +62,13 @@ define amdgpu_kernel void @fmuladd_f16(
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
; VI-DENORM: buffer_store_short [[RESULT]]
+; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[B_F16]]
+; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
+; GFX10-FLUSH: buffer_store_short [[ADD]]
+
+; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[B_F16]]
+; GFX10-DENORM: buffer_store_short v[[C_F16]],
+
; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_a(
half addrspace(1)* %r,
@@ -81,6 +97,12 @@ define amdgpu_kernel void @fmuladd_f16_i
; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
; VI-DENORM buffer_store_short [[RESULT]]
+; GFX10-FLUSH: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x4200, v[[A_F16]]
+; GFX10-FLUSH: v_add_f16_e32 [[ADD:v[0-9]+]], [[MUL]], v[[C_F16]]
+; GFX10-FLUSH: buffer_store_short [[ADD]]
+
+; GFX10-DENORM: v_fmac_f16_e32 v[[C_F16]], 0x4200, v[[A_F16]]
+; GFX10-DENORM: buffer_store_short v[[C_F16]],
; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_f16_imm_b(
@@ -107,6 +129,9 @@ define amdgpu_kernel void @fmuladd_f16_i
; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; GFX10: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GFX10: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GFX10: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@@ -116,7 +141,6 @@ define amdgpu_kernel void @fmuladd_f16_i
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
@@ -127,7 +151,6 @@ define amdgpu_kernel void @fmuladd_f16_i
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
-
; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
@@ -144,6 +167,11 @@ define amdgpu_kernel void @fmuladd_f16_i
; VI-DENORM-NOT: v_and_b32
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
+; GFX10-FLUSH: v_pk_mul_f16 [[MUL:v[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
+; GFX10-FLUSH: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[MUL]], v[[C_V2_F16]]
+
+; GFX10-DENORM: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+
; GCN: buffer_store_dword v[[R_V2_F16]]
define amdgpu_kernel void @fmuladd_v2f16(
<2 x half> addrspace(1)* %r,
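
The packed variant exercises the same selection logic. As a rough, self-contained sketch (illustrative names, not from the commit): calling @llvm.fmuladd.v2f16 on gfx1010 is expected to select v_pk_fma_f16 when f16 denormals are enabled, and to split into v_pk_mul_f16 plus v_pk_add_f16 when they are flushed, which is what the GFX10-DENORM and GFX10-FLUSH checks above encode.

; Hypothetical reduced input, fmuladd-v2f16-sketch.ll:
;   llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals < fmuladd-v2f16-sketch.ll
declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>)

define <2 x half> @fmuladd_v2f16_sketch(<2 x half> %a, <2 x half> %b, <2 x half> %c) {
  %r = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
  ret <2 x half> %r
}
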
Modified: llvm/trunk/test/CodeGen/AMDGPU/madak.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/madak.ll?rev=359959&r1=359958&r2=359959&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/madak.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/madak.ll Fri May 3 21:20:37 2019
@@ -1,6 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,MAD,GFX10-MAD %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.fabs.f32(float) nounwind readnone
@@ -12,7 +14,10 @@ declare float @llvm.fabs.f32(float) noun
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -33,17 +38,20 @@ define amdgpu_kernel void @madak_f32(flo
; it.
; GCN-LABEL: {{^}}madak_2_use_f32:
-; GFX8_9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
-; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
-; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
-; GFX8_9: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
-; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-; GCN-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
-; GCN: s_endpgm
+; GFX8_9_10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
+; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
+; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
+; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
+; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
+; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
+; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]]
+; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]]
+; GCN: s_endpgm
define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -70,7 +78,8 @@ define amdgpu_kernel void @madak_2_use_f
; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
; GCN: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
-; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
+; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
+; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -94,7 +103,10 @@ define amdgpu_kernel void @madak_m_inlin
; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
+; GFX10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
+; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -112,11 +124,13 @@ define amdgpu_kernel void @madak_inline_
; We can't use an SGPR when forming madak
; GCN-LABEL: {{^}}s_v_madak_f32:
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
-; GCN-NOT: v_madak_f32
-; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
+; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
+; GCN-NOT: v_madak_f32
+; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
+; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
+; FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000
define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -131,11 +145,13 @@ define amdgpu_kernel void @s_v_madak_f32
}
; GCN-LABEL: @v_s_madak_f32
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN-DAG: {{buffer|flat|global}}_load_dword [[VA:v[0-9]+]]
-; GCN-NOT: v_madak_f32
-; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]]
+; GFX6_8_9-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
+; GCN-DAG: {{buffer|flat|global}}_load_dword{{(_addtid)?}} [[VA:v[0-9]+]]
+; GFX6_8_9-NOT: v_madak_f32
+; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
+; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
+; FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000
define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -151,7 +167,9 @@ define amdgpu_kernel void @v_s_madak_f32
; GCN-LABEL: {{^}}s_s_madak_f32:
; GCN-NOT: v_madak_f32
-; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
+; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
%mul = fmul float %a, %b
%madak = fadd float %mul, 10.0
@@ -160,12 +178,14 @@ define amdgpu_kernel void @s_s_madak_f32
}
; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
-; GFX6: buffer_load_dword [[VA:v[0-9]+]]
-; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
-; GCN: s_endpgm
+; GFX6: buffer_load_dword [[VA:v[0-9]+]]
+; GFX6: buffer_load_dword [[VB:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
+; FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000
+; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -184,12 +204,14 @@ define amdgpu_kernel void @no_madak_src0
}
; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
-; GFX6: buffer_load_dword [[VA:v[0-9]+]]
-; GFX6: buffer_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]]
-; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]]
-; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
-; GCN: s_endpgm
+; GFX6: buffer_load_dword [[VA:v[0-9]+]]
+; GFX6: buffer_load_dword [[VB:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]]
+; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]]
+; GFX6_8_9: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
+; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
+; FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000
+; GCN: s_endpgm
define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
@@ -207,16 +229,18 @@ define amdgpu_kernel void @no_madak_src1
ret void
}
-; SIFoldOperands should not fold the SGPR copy into the instruction
+; SIFoldOperands should not fold the SGPR copy into the instruction before GFX10
; because the implicit immediate already uses the constant bus.
+; On GFX10+ we can use two scalar operands.
; GCN-LABEL: {{^}}madak_constant_bus_violation:
-; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
-; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
-; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
-; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
-; GFX6: buffer_store_dword [[MUL]]
-; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
+; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
+; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
+; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
+; MAD: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
+; FMA: v_fmaak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
+; GFX6: buffer_store_dword [[MUL]]
+; GFX8_9_10: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
bb:
%tmp = icmp eq i32 %arg1, 0
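
The simplest way to reproduce the madak/fmaak split checked throughout this file is an fmul of two values followed by an fadd of a 32-bit literal. As a minimal sketch (names illustrative; the exact encoding also depends on how SIFoldOperands folds the operands): the default gfx1010 run keeps the v_madak_f32 form, while -fp-contract=fast selects the fused v_fmaak_f32, matching the GFX10-MAD and FMA prefixes above.

; Hypothetical reduced input, madak-sketch.ll:
;   llc -march=amdgcn -mcpu=gfx1010 < madak-sketch.ll
;   llc -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < madak-sketch.ll
define float @madak_sketch(float %a, float %b) {
  %mul = fmul float %a, %b
  %madak = fadd float %mul, 10.0   ; 10.0 = 0x41200000, a non-inline literal
  ret float %madak
}
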
Added: llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir?rev=359959&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/sdwa-peephole-instr-gfx10.mir Fri May 3 21:20:37 2019
@@ -0,0 +1,293 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-peephole-sdwa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1010 -check-prefix=GCN %s
+
+# GCN-LABEL: {{^}}name: vop1_instructions
+
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_MOV_B32_sdwa 0, %{{[0-9]+}}, 0, 6, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 0, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, implicit $exec
+
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FRACT_F32_sdwa 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SIN_F32_sdwa 0, %{{[0-9]+}}, 1, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_U32_F32_sdwa 1, %{{[0-9]+}}, 0, 5, 0, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_CVT_F32_I32_sdwa 0, %{{[0-9]+}}, 0, 1, 5, 0, 5, implicit $exec
+
+---
+name: vop1_instructions
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vreg_64 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_32_xm0 }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: vgpr_32 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vgpr_32 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 29, class: vgpr_32 }
+ - { id: 30, class: vgpr_32 }
+ - { id: 31, class: vgpr_32 }
+ - { id: 32, class: vgpr_32 }
+ - { id: 33, class: vgpr_32 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+ - { id: 36, class: vgpr_32 }
+ - { id: 37, class: vgpr_32 }
+ - { id: 38, class: vgpr_32 }
+ - { id: 39, class: vgpr_32 }
+ - { id: 40, class: vgpr_32 }
+ - { id: 41, class: vgpr_32 }
+ - { id: 42, class: vgpr_32 }
+ - { id: 43, class: vgpr_32 }
+ - { id: 44, class: vgpr_32 }
+ - { id: 45, class: vgpr_32 }
+ - { id: 46, class: vgpr_32 }
+ - { id: 47, class: vgpr_32 }
+ - { id: 48, class: vgpr_32 }
+ - { id: 100, class: vgpr_32 }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+ %2 = COPY $sgpr30_sgpr31
+ %1 = COPY $vgpr2_vgpr3
+ %0 = COPY $vgpr0_vgpr1
+ %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+
+ %5 = S_MOV_B32 65535
+ %6 = S_MOV_B32 65535
+
+ %10 = V_LSHRREV_B32_e64 16, %3, implicit $exec
+ %11 = V_MOV_B32_e32 %10, implicit $exec
+ %12 = V_LSHLREV_B32_e64 16, %11, implicit $exec
+ %14 = V_FRACT_F32_e32 123, implicit $exec
+ %15 = V_LSHLREV_B32_e64 16, %14, implicit $exec
+ %16 = V_LSHRREV_B32_e64 16, %15, implicit $exec
+ %17 = V_SIN_F32_e32 %16, implicit $exec
+ %18 = V_LSHLREV_B32_e64 16, %17, implicit $exec
+ %19 = V_LSHRREV_B32_e64 16, %18, implicit $exec
+ %20 = V_CVT_U32_F32_e32 %19, implicit $exec
+ %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
+ %23 = V_CVT_F32_I32_e32 123, implicit $exec
+ %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
+
+ %25 = V_LSHRREV_B32_e64 16, %3, implicit $exec
+ %26 = V_MOV_B32_e64 %25, implicit $exec
+ %26 = V_LSHLREV_B32_e64 16, %26, implicit $exec
+ %27 = V_FRACT_F32_e64 0, %6, 0, 0, implicit $exec
+ %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
+ %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
+ %30 = V_SIN_F32_e64 0, %29, 0, 0, implicit $exec
+ %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
+ %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
+ %33 = V_CVT_U32_F32_e64 0, %32, 0, 0, implicit $exec
+ %34 = V_LSHLREV_B32_e64 16, %33, implicit $exec
+ %35 = V_CVT_F32_I32_e64 %6, 0, 0, implicit $exec
+ %36 = V_LSHLREV_B32_e64 16, %35, implicit $exec
+
+
+ %37 = V_LSHRREV_B32_e64 16, %36, implicit $exec
+ %38 = V_FRACT_F32_e64 1, %37, 0, 0, implicit $exec
+ %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
+ %40 = V_LSHRREV_B32_e64 16, %39, implicit $exec
+ %41 = V_SIN_F32_e64 0, %40, 1, 0, implicit $exec
+ %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
+ %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
+ %44 = V_CVT_U32_F32_e64 1, %43, 0, 0, implicit $exec
+ %45 = V_LSHLREV_B32_e64 16, %44, implicit $exec
+ %46 = V_LSHRREV_B32_e64 16, %45, implicit $exec
+ %47 = V_CVT_F32_I32_e64 %46, 0, 1, implicit $exec
+ %48 = V_LSHLREV_B32_e64 16, %47, implicit $exec
+
+
+ %100 = V_MOV_B32_e32 %48, implicit $exec
+
+ FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+ $sgpr30_sgpr31 = COPY %2
+ S_SETPC_B64_return $sgpr30_sgpr31
+
+...
+---
+# GCN-LABEL: {{^}}name: vop2_instructions
+
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 6, 0, 5, 1, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e32 %{{[0-9]+}}, %{{[0-9]+}}, %{{[0-9]+}}, implicit $exec
+
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_AND_B32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 5, 0, 6, 5, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 0, 23, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, %{{[0-9]+}}, 0, 0, implicit $exec
+
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_ADD_F32_sdwa 0, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 5, 1, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_SUB_F16_sdwa 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 0, 5, 0, 6, 1, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F32_e64 1, 23, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, 0, implicit $exec
+# GFX1010: %{{[0-9]+}}:vgpr_32 = V_FMAC_F16_e64 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 1, %{{[0-9]+}}, 0, 2, implicit $exec
+
+name: vop2_instructions
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vreg_64 }
+ - { id: 2, class: sreg_64 }
+ - { id: 3, class: vgpr_32 }
+ - { id: 4, class: sreg_32_xm0 }
+ - { id: 5, class: sreg_32_xm0 }
+ - { id: 6, class: sreg_32_xm0 }
+ - { id: 7, class: sreg_32_xm0 }
+ - { id: 8, class: sreg_32 }
+ - { id: 9, class: vgpr_32 }
+ - { id: 10, class: vgpr_32 }
+ - { id: 11, class: vgpr_32 }
+ - { id: 12, class: vgpr_32 }
+ - { id: 13, class: vgpr_32 }
+ - { id: 14, class: vgpr_32 }
+ - { id: 15, class: vgpr_32 }
+ - { id: 16, class: vgpr_32 }
+ - { id: 17, class: vgpr_32 }
+ - { id: 18, class: vgpr_32 }
+ - { id: 19, class: vgpr_32 }
+ - { id: 20, class: vgpr_32 }
+ - { id: 21, class: vgpr_32 }
+ - { id: 22, class: vgpr_32 }
+ - { id: 23, class: vgpr_32 }
+ - { id: 24, class: vgpr_32 }
+ - { id: 25, class: vgpr_32 }
+ - { id: 26, class: vgpr_32 }
+ - { id: 27, class: vgpr_32 }
+ - { id: 28, class: vgpr_32 }
+ - { id: 29, class: vgpr_32 }
+ - { id: 30, class: vgpr_32 }
+ - { id: 31, class: vgpr_32 }
+ - { id: 32, class: vgpr_32 }
+ - { id: 33, class: vgpr_32 }
+ - { id: 34, class: vgpr_32 }
+ - { id: 35, class: vgpr_32 }
+ - { id: 36, class: vgpr_32 }
+ - { id: 37, class: vgpr_32 }
+ - { id: 38, class: vgpr_32 }
+ - { id: 39, class: vgpr_32 }
+ - { id: 40, class: vgpr_32 }
+ - { id: 41, class: vgpr_32 }
+ - { id: 42, class: vgpr_32 }
+ - { id: 43, class: vgpr_32 }
+ - { id: 44, class: vgpr_32 }
+ - { id: 45, class: vgpr_32 }
+ - { id: 46, class: vgpr_32 }
+ - { id: 47, class: vgpr_32 }
+ - { id: 48, class: vgpr_32 }
+ - { id: 49, class: vgpr_32 }
+ - { id: 50, class: vgpr_32 }
+ - { id: 51, class: vgpr_32 }
+ - { id: 52, class: vgpr_32 }
+ - { id: 53, class: vgpr_32 }
+ - { id: 54, class: vgpr_32 }
+ - { id: 55, class: vgpr_32 }
+ - { id: 56, class: vgpr_32 }
+ - { id: 57, class: vgpr_32 }
+ - { id: 58, class: vgpr_32 }
+ - { id: 59, class: vgpr_32 }
+ - { id: 60, class: vgpr_32 }
+ - { id: 100, class: vgpr_32 }
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr30_sgpr31
+
+ %2 = COPY $sgpr30_sgpr31
+ %1 = COPY $vgpr2_vgpr3
+ %0 = COPY $vgpr0_vgpr1
+ %3 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 4)
+
+ %5 = S_MOV_B32 65535
+ %6 = S_MOV_B32 65535
+
+ %11 = V_LSHRREV_B32_e64 16, %3, implicit $exec
+ %12 = V_AND_B32_e32 %6, %11, implicit $exec
+ %13 = V_LSHLREV_B32_e64 16, %12, implicit $exec
+ %14 = V_LSHRREV_B32_e64 16, %13, implicit $exec
+ %15 = V_BFE_U32 %13, 8, 8, implicit $exec
+ %16 = V_ADD_F32_e32 %14, %15, implicit $exec
+ %17 = V_LSHLREV_B32_e64 16, %16, implicit $exec
+ %18 = V_LSHRREV_B32_e64 16, %17, implicit $exec
+ %19 = V_BFE_U32 %17, 8, 8, implicit $exec
+ %20 = V_SUB_F16_e32 %18, %19, implicit $exec
+ %21 = V_LSHLREV_B32_e64 16, %20, implicit $exec
+ %22 = V_BFE_U32 %20, 8, 8, implicit $exec
+ %23 = V_FMAC_F32_e32 %21, %22, %22, implicit $exec
+ %24 = V_LSHLREV_B32_e64 16, %23, implicit $exec
+ %25 = V_LSHRREV_B32_e64 16, %24, implicit $exec
+ %26 = V_BFE_U32 %24, 8, 8, implicit $exec
+ %27 = V_FMAC_F16_e32 %25, %26, %26, implicit $exec
+ %28 = V_LSHLREV_B32_e64 16, %27, implicit $exec
+
+ %29 = V_LSHRREV_B32_e64 16, %28, implicit $exec
+ %30 = V_AND_B32_e64 23, %29, implicit $exec
+ %31 = V_LSHLREV_B32_e64 16, %30, implicit $exec
+ %32 = V_LSHRREV_B32_e64 16, %31, implicit $exec
+ %33 = V_BFE_U32 %31, 8, 8, implicit $exec
+ %34 = V_ADD_F32_e64 0, %32, 0, %33, 0, 0, implicit $exec
+ %35 = V_LSHLREV_B32_e64 16, %34, implicit $exec
+ %37 = V_BFE_U32 %35, 8, 8, implicit $exec
+ %38 = V_SUB_F16_e64 0, 23, 0, %37, 0, 0, implicit $exec
+ %39 = V_LSHLREV_B32_e64 16, %38, implicit $exec
+ %40 = V_BFE_U32 %39, 8, 8, implicit $exec
+ %41 = V_FMAC_F32_e64 0, 23, 0, %40, 0, %40, 0, 0, implicit $exec
+ %42 = V_LSHLREV_B32_e64 16, %41, implicit $exec
+ %43 = V_LSHRREV_B32_e64 16, %42, implicit $exec
+ %44 = V_BFE_U32 %42, 8, 8, implicit $exec
+ %45 = V_FMAC_F16_e64 0, %43, 0, %44, 0, %44, 0, 0, implicit $exec
+ %46 = V_LSHLREV_B32_e64 16, %45, implicit $exec
+
+ %47 = V_LSHRREV_B32_e64 16, %46, implicit $exec
+ %48 = V_BFE_U32 %46, 8, 8, implicit $exec
+ %49 = V_ADD_F32_e64 0, %47, 1, %48, 0, 0, implicit $exec
+ %50 = V_LSHLREV_B32_e64 16, %49, implicit $exec
+ %51 = V_BFE_U32 %50, 8, 8, implicit $exec
+ %52 = V_SUB_F16_e64 1, 23, 1, %51, 0, 0, implicit $exec
+ %53 = V_LSHLREV_B32_e64 16, %52, implicit $exec
+ %54 = V_BFE_U32 %53, 8, 8, implicit $exec
+ %55 = V_FMAC_F32_e64 1, 23, 1, %54, 1, %54, 1, 0, implicit $exec
+ %56 = V_LSHLREV_B32_e64 16, %55, implicit $exec
+ %57 = V_LSHRREV_B32_e64 16, %56, implicit $exec
+ %58 = V_BFE_U32 %56, 8, 8, implicit $exec
+ %59 = V_FMAC_F16_e64 1, %57, 1, %58, 1, %58, 0, 2, implicit $exec
+ %60 = V_LSHLREV_B32_e64 16, %59, implicit $exec
+
+ %100 = V_MOV_B32_e32 %60, implicit $exec
+
+ FLAT_STORE_DWORD %0, %100, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 4)
+ $sgpr30_sgpr31 = COPY %2
+ S_SETPC_B64_return $sgpr30_sgpr31
+
+...
Added: llvm/trunk/test/CodeGen/AMDGPU/twoaddr-fma.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/twoaddr-fma.mir?rev=359959&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/twoaddr-fma.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/twoaddr-fma.mir Fri May 3 21:20:37 2019
@@ -0,0 +1,183 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 %s -run-pass twoaddressinstruction -verify-machineinstrs -o - | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: name: test_fmamk_reg_imm_f32
+# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
+---
+name: test_fmamk_reg_imm_f32
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = IMPLICIT_DEF
+ %1 = COPY %0.sub1
+ %2 = V_MOV_B32_e32 1078523331, implicit $exec
+ %3 = V_FMAC_F32_e32 killed %0.sub0, %2, killed %1, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmamk_imm_reg_f32
+# GCN: V_FMAMK_F32 killed %0.sub0, 1078523331, killed %1, implicit $exec
+---
+name: test_fmamk_imm_reg_f32
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = IMPLICIT_DEF
+ %1 = COPY %0.sub1
+ %2 = V_MOV_B32_e32 1078523331, implicit $exec
+ %3 = V_FMAC_F32_e32 %2, killed %0.sub0, killed %1, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_f32
+# GCN: V_FMAAK_F32 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
+---
+name: test_fmaak_f32
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = IMPLICIT_DEF
+ %1 = V_MOV_B32_e32 1078523331, implicit $exec
+ %2 = V_FMAC_F32_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmamk_reg_imm_f16
+# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
+---
+name: test_fmamk_reg_imm_f16
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = IMPLICIT_DEF
+ %1 = COPY %0.sub1
+ %2 = V_MOV_B32_e32 1078523331, implicit $exec
+ %3 = V_FMAC_F16_e32 killed %0.sub0, %2, killed %1, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmamk_imm_reg_f16
+# GCN: V_FMAMK_F16 killed %0.sub0, 1078523331, killed %1, implicit $exec
+---
+name: test_fmamk_imm_reg_f16
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = IMPLICIT_DEF
+ %1 = COPY %0.sub1
+ %2 = V_MOV_B32_e32 1078523331, implicit $exec
+ %3 = V_FMAC_F16_e32 %2, killed %0.sub0, killed %1, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_f16
+# GCN: V_FMAAK_F16 killed %0.sub0, %0.sub1, 1078523331, implicit $exec
+---
+name: test_fmaak_f16
+registers:
+ - { id: 0, class: vreg_64 }
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = IMPLICIT_DEF
+ %1 = V_MOV_B32_e32 1078523331, implicit $exec
+ %2 = V_FMAC_F16_e32 killed %0.sub0, %0.sub1, %1, implicit $exec
+...
+
+# GCN-LABEL: name: test_fmaak_sgpr_src0_f32
+# GCN: %2:vgpr_32 = V_FMAMK_F32 killed %0, 1078523331, %3:vgpr_32, implicit $exec
+
+---
+name: test_fmaak_sgpr_src0_f32
+registers:
+ - { id: 0, class: sreg_32_xm0 }
+ - { id: 1, class: vgpr_32}
+ - { id: 2, class: vgpr_32 }
+ - { id: 3, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = IMPLICIT_DEF
+ %1 = V_MOV_B32_e32 1078523331, implicit $exec
+ %2 = V_FMAC_F32_e32 killed %0, %1, %3, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_inlineimm_src0_f32
+# GCN: %1:vgpr_32 = V_FMAMK_F32 1073741824, 1078523331, %2:vgpr_32, implicit $exec
+
+---
+name: test_fmaak_inlineimm_src0_f32
+registers:
+ - { id: 0, class: vgpr_32}
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = V_MOV_B32_e32 1078523331, implicit $exec
+ %1 = V_FMAC_F32_e32 1073741824, %0, %2, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_otherimm_src0_f32
+# GCN: %1:vgpr_32 = V_FMAC_F32_e32 1120403456, %0, %1, implicit $exec
+
+---
+name: test_fmaak_otherimm_src0_f32
+registers:
+ - { id: 0, class: vgpr_32}
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+body: |
+ bb.0:
+
+ %0 = V_MOV_B32_e32 1078523331, implicit $exec
+ %1 = V_FMAC_F32_e32 1120403456, %0, %2, implicit $exec
+
+...
+
+# GCN-LABEL: name: test_fmaak_other_constantlike_src0_f32
+# GCN: %1:vgpr_32 = V_FMAC_F32_e32 %stack.0, %0, %1, implicit $exec
+---
+name: test_fmaak_other_constantlike_src0_f32
+registers:
+ - { id: 0, class: vgpr_32}
+ - { id: 1, class: vgpr_32 }
+ - { id: 2, class: vgpr_32 }
+stack:
+ - { id: 0, name: "", type: default, offset: 0, size: 128, alignment: 8,
+ callee-saved-register: '', local-offset: 0, debug-info-variable: '',
+ debug-info-expression: '', debug-info-location: '' }
+body: |
+ bb.0:
+
+ %0 = V_MOV_B32_e32 1078523331, implicit $exec
+ %1 = V_FMAC_F32_e32 %stack.0, %0, %2, implicit $exec
+
+...
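
For reference, V_FMAMK_F32/F16 compute src0 * K + src1 and V_FMAAK_F32/F16 compute src0 * src1 + K, with K a 32-bit literal, mirroring the existing madmk/madak forms; the cases above check that the two-address pass can rewrite V_FMAC_* into these untied shapes, folding a literal from a preceding v_mov_b32, the same way it already does for V_MAC_*. A rough end-to-end sketch (illustrative names; whether fmamk, fmaak, or v_fma with a literal is picked depends on operand folding) that usually reaches this path with fast contraction:

; Hypothetical reduced input, fmamk-sketch.ll:
;   llc -march=amdgcn -mcpu=gfx1010 -fp-contract=fast < fmamk-sketch.ll
define float @fmamk_sketch(float %a, float %b) {
  %mul = fmul float %a, 1234.5   ; literal multiplier, not an inline constant
  %r = fadd float %mul, %b
  ret float %r
}
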