[llvm] [AMDGPU] Select gfx1150 SALU Float instructions (PR #66885)
Mirko Brkušanin via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 21 02:25:46 PDT 2023
https://github.com/mbrkusanin updated https://github.com/llvm/llvm-project/pull/66885
From edb2ac1e775173cd61c5d4b6c68ce3441c688737 Mon Sep 17 00:00:00 2001
From: Mirko Brkusanin <Mirko.Brkusanin at amd.com>
Date: Wed, 20 Sep 2023 12:14:05 +0200
Subject: [PATCH] [AMDGPU] Select gfx1150 SALU Float instructions
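
Enable selection of the gfx1150 scalar (SALU) floating point
instructions in both SelectionDAG and GlobalISel. Uniform f16/f32
arithmetic, conversions and compares can now stay on the SGPR bank and
select SOP1/SOP2/SOPC encodings, with compares writing SCC instead of
VCC. As a minimal sketch (the function and value names below are
illustrative, not taken from the tests), a uniform add such as

  define amdgpu_ps float @uniform_fadd(float inreg %a, float inreg %b) {
    %sum = fadd float %a, %b
    ret float %sum
  }

is expected to select s_add_f32 rather than v_add_f32 on gfx1150.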
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 19 +
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 166 +++-
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 99 +-
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 69 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 220 ++++-
llvm/lib/Target/AMDGPU/SOPInstructions.td | 73 +-
.../inst-select-scalar-float-sop1.mir | 302 ++++++
.../inst-select-scalar-float-sop2.mir | 294 ++++++
.../inst-select-scalar-float-sopc.mir | 647 +++++++++++++
.../AMDGPU/GlobalISel/legalize-fcmp-s32.mir | 46 +
.../AMDGPU/GlobalISel/legalize-fcmp.mir | 1 +
.../AMDGPU/GlobalISel/regbankselect-fcmp.mir | 69 +-
.../GlobalISel/regbankselect-salu-float.mir | 246 +++++
.../test/CodeGen/AMDGPU/code-size-estimate.ll | 37 +
.../AMDGPU/commute-compares-scalar-float.ll | 515 ++++++++++
llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir | 99 ++
.../AMDGPU/fold-operands-scalar-fmac.mir | 238 +++++
llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll | 174 ++++
llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll | 212 +++++
llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll | 899 ++++++++++++++++++
23 files changed, 4282 insertions(+), 157 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp-s32.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-salu-float.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 05deb69b2bfc140..b5ceaaa14b4fd5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -663,6 +663,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BRCOND:
SelectBRCOND(N);
return;
+ case ISD::FP_EXTEND:
+ SelectFP_EXTEND(N);
+ return;
case AMDGPUISD::CVT_PKRTZ_F16_F32:
case AMDGPUISD::CVT_PKNORM_I16_F32:
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -2303,6 +2306,22 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
VCC.getValue(0));
}
+void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
+ if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
+ !N->isDivergent()) {
+ SDValue Src = N->getOperand(0);
+ if (Src.getValueType() == MVT::f16) {
+ if (isExtractHiElt(Src, Src)) {
+ CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
+ {Src});
+ return;
+ }
+ }
+ }
+
+ SelectCode(N);
+}
+
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
// be copied to an SGPR with readfirstlane.
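
The new SelectFP_EXTEND hook only takes over the uniform f16-to-f32
case where the source is the high half of a 32-bit value, so that
s_cvt_hi_f32_f16 can absorb the shift; everything else falls through to
SelectCode. A hypothetical IR input that should reach this path (the
function name is illustrative):

  define amdgpu_ps float @fpext_hi_half(<2 x half> inreg %v) {
    %hi = extractelement <2 x half> %v, i32 1
    %ext = fpext half %hi to float
    ret float %ext
  }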
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7b4a1a4aedaf7e5..a8a606f60a3faee 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -273,6 +273,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
+ void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 9f13f58f8a9f404..31d72fb8cadd8a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1211,36 +1211,104 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
}
}
- if (Size != 32)
- return -1;
+ if (Size == 32) {
+ switch (P) {
+ case CmpInst::ICMP_NE:
+ return AMDGPU::S_CMP_LG_U32;
+ case CmpInst::ICMP_EQ:
+ return AMDGPU::S_CMP_EQ_U32;
+ case CmpInst::ICMP_SGT:
+ return AMDGPU::S_CMP_GT_I32;
+ case CmpInst::ICMP_SGE:
+ return AMDGPU::S_CMP_GE_I32;
+ case CmpInst::ICMP_SLT:
+ return AMDGPU::S_CMP_LT_I32;
+ case CmpInst::ICMP_SLE:
+ return AMDGPU::S_CMP_LE_I32;
+ case CmpInst::ICMP_UGT:
+ return AMDGPU::S_CMP_GT_U32;
+ case CmpInst::ICMP_UGE:
+ return AMDGPU::S_CMP_GE_U32;
+ case CmpInst::ICMP_ULT:
+ return AMDGPU::S_CMP_LT_U32;
+ case CmpInst::ICMP_ULE:
+ return AMDGPU::S_CMP_LE_U32;
+ case CmpInst::FCMP_OEQ:
+ return AMDGPU::S_CMP_EQ_F32;
+ case CmpInst::FCMP_OGT:
+ return AMDGPU::S_CMP_GT_F32;
+ case CmpInst::FCMP_OGE:
+ return AMDGPU::S_CMP_GE_F32;
+ case CmpInst::FCMP_OLT:
+ return AMDGPU::S_CMP_LT_F32;
+ case CmpInst::FCMP_OLE:
+ return AMDGPU::S_CMP_LE_F32;
+ case CmpInst::FCMP_ONE:
+ return AMDGPU::S_CMP_LG_F32;
+ case CmpInst::FCMP_ORD:
+ return AMDGPU::S_CMP_O_F32;
+ case CmpInst::FCMP_UNO:
+ return AMDGPU::S_CMP_U_F32;
+ case CmpInst::FCMP_UEQ:
+ return AMDGPU::S_CMP_NLG_F32;
+ case CmpInst::FCMP_UGT:
+ return AMDGPU::S_CMP_NLE_F32;
+ case CmpInst::FCMP_UGE:
+ return AMDGPU::S_CMP_NLT_F32;
+ case CmpInst::FCMP_ULT:
+ return AMDGPU::S_CMP_NGE_F32;
+ case CmpInst::FCMP_ULE:
+ return AMDGPU::S_CMP_NGT_F32;
+ case CmpInst::FCMP_UNE:
+ return AMDGPU::S_CMP_NEQ_F32;
+ default:
+ llvm_unreachable("Unknown condition code!");
+ }
+ }
- switch (P) {
- case CmpInst::ICMP_NE:
- return AMDGPU::S_CMP_LG_U32;
- case CmpInst::ICMP_EQ:
- return AMDGPU::S_CMP_EQ_U32;
- case CmpInst::ICMP_SGT:
- return AMDGPU::S_CMP_GT_I32;
- case CmpInst::ICMP_SGE:
- return AMDGPU::S_CMP_GE_I32;
- case CmpInst::ICMP_SLT:
- return AMDGPU::S_CMP_LT_I32;
- case CmpInst::ICMP_SLE:
- return AMDGPU::S_CMP_LE_I32;
- case CmpInst::ICMP_UGT:
- return AMDGPU::S_CMP_GT_U32;
- case CmpInst::ICMP_UGE:
- return AMDGPU::S_CMP_GE_U32;
- case CmpInst::ICMP_ULT:
- return AMDGPU::S_CMP_LT_U32;
- case CmpInst::ICMP_ULE:
- return AMDGPU::S_CMP_LE_U32;
- default:
- llvm_unreachable("Unknown condition code!");
+ if (Size == 16) {
+ if (!STI.hasSALUFloatInsts())
+ return -1;
+
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ return AMDGPU::S_CMP_EQ_F16;
+ case CmpInst::FCMP_OGT:
+ return AMDGPU::S_CMP_GT_F16;
+ case CmpInst::FCMP_OGE:
+ return AMDGPU::S_CMP_GE_F16;
+ case CmpInst::FCMP_OLT:
+ return AMDGPU::S_CMP_LT_F16;
+ case CmpInst::FCMP_OLE:
+ return AMDGPU::S_CMP_LE_F16;
+ case CmpInst::FCMP_ONE:
+ return AMDGPU::S_CMP_LG_F16;
+ case CmpInst::FCMP_ORD:
+ return AMDGPU::S_CMP_O_F16;
+ case CmpInst::FCMP_UNO:
+ return AMDGPU::S_CMP_U_F16;
+ case CmpInst::FCMP_UEQ:
+ return AMDGPU::S_CMP_NLG_F16;
+ case CmpInst::FCMP_UGT:
+ return AMDGPU::S_CMP_NLE_F16;
+ case CmpInst::FCMP_UGE:
+ return AMDGPU::S_CMP_NLT_F16;
+ case CmpInst::FCMP_ULT:
+ return AMDGPU::S_CMP_NGE_F16;
+ case CmpInst::FCMP_ULE:
+ return AMDGPU::S_CMP_NGT_F16;
+ case CmpInst::FCMP_UNE:
+ return AMDGPU::S_CMP_NEQ_F16;
+ default:
+ llvm_unreachable("Unknown condition code!");
+ }
}
+
+ return -1;
}
-bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
+
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -1266,6 +1334,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
return Ret;
}
+ if (I.getOpcode() == AMDGPU::G_FCMP)
+ return false;
+
int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
if (Opcode == -1)
return false;
@@ -2439,6 +2510,42 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}
+static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
+ Register &Out) {
+ Register LShlSrc;
+ if (mi_match(In, MRI,
+ m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
+ Out = LShlSrc;
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
+ if (!Subtarget->hasSALUFloatInsts())
+ return false;
+
+ Register Dst = I.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register Src = I.getOperand(1).getReg();
+
+ if (MRI->getType(Dst) == LLT::scalar(32) &&
+ MRI->getType(Src) == LLT::scalar(16)) {
+ if (isExtractHiElt(*MRI, Src, Src)) {
+ MachineBasicBlock *BB = I.getParent();
+ BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
+ .addUse(Src);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
+ }
+ }
+
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineOperand &ImmOp = I.getOperand(1);
@@ -3471,7 +3578,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
case TargetOpcode::G_ICMP:
- if (selectG_ICMP(I))
+ case TargetOpcode::G_FCMP:
+ if (selectG_ICMP_or_FCMP(I))
return true;
return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_LOAD:
@@ -3508,6 +3616,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
+ case TargetOpcode::G_FPEXT:
+ if (selectG_FPEXT(I))
+ return true;
+ return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
case TargetOpcode::G_GLOBAL_VALUE:
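
G_FCMP now shares a selection path with G_ICMP: when the destination
has been assigned to the SGPR bank, getS_CMPOpcode returns one of the
S_CMP_*_F32/F16 opcodes, which write SCC; otherwise selection falls
back to selectImpl and the VALU compares. A minimal sketch of a uniform
compare that should take the SCC path (names are illustrative):

  define amdgpu_ps i32 @uniform_fcmp_olt(float inreg %a, float inreg %b) {
    %cmp = fcmp olt float %a, %b
    %r = zext i1 %cmp to i32
    ret i32 %r
  }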
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index c9afa4e9fcc2a59..93e45fcd8682f07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -90,6 +90,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
+ bool selectG_FPEXT(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_FNEG(MachineInstr &I) const;
bool selectG_FABS(MachineInstr &I) const;
@@ -129,7 +130,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
- bool selectG_ICMP(MachineInstr &I) const;
+ bool selectG_ICMP_or_FCMP(MachineInstr &I) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 31617eef562d99d..db226a302900160 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1132,8 +1132,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
- getActionDefinitionsBuilder(G_FCMP)
- .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
+ auto &FCmpBuilder =
+ getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
+ {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
+
+ if (ST.hasSALUFloatInsts())
+ FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
+
+ FCmpBuilder
.widenScalarToNextPow2(1)
.clampScalar(1, S32, S64)
.scalarize(0);
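
The extra legalForCartesianProduct entry makes G_FCMP with a 32-bit
scalar result legal for f16 and f32 operands on SALU-float targets, so
a uniform f16 compare no longer has to be widened before it reaches
regbankselect. A sketch of such an input (illustrative names, assuming
it ends up as s_cmp_gt_f16):

  define amdgpu_ps i32 @uniform_fcmp_f16(half inreg %a, half inreg %b) {
    %cmp = fcmp ogt half %a, %b
    %r = zext i1 %cmp to i32
    ret i32 %r
  }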
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 121768f039f4a5d..5b056bd9e5dba2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2178,6 +2178,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
+ case AMDGPU::G_FCMP:
+ if (!Subtarget.hasSALUFloatInsts())
+ break;
+ LLVM_FALLTHROUGH;
case AMDGPU::G_ICMP:
case AMDGPU::G_UADDO:
case AMDGPU::G_USUBO:
@@ -2185,7 +2189,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE: {
- unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
+ unsigned BoolDstOp =
+ (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
Register DstReg = MI.getOperand(BoolDstOp).getReg();
const RegisterBank *DstBank =
@@ -3706,40 +3711,59 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_UBFX:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
- [[fallthrough]];
-
- case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
- case AMDGPU::G_SSUBSAT:
- case AMDGPU::G_UADDSAT:
- case AMDGPU::G_USUBSAT:
+ return getDefaultMappingVOP(MI);
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
- case AMDGPU::G_FPTOSI:
- case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
- case AMDGPU::G_FMAD:
- case AMDGPU::G_FSQRT:
case AMDGPU::G_FFLOOR:
case AMDGPU::G_FCEIL:
case AMDGPU::G_FRINT:
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_STRICT_FADD:
+ case AMDGPU::G_STRICT_FSUB:
+ case AMDGPU::G_STRICT_FMUL:
+ case AMDGPU::G_STRICT_FMA: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
+ case AMDGPU::G_FPTOSI:
+ case AMDGPU::G_FPTOUI:
case AMDGPU::G_SITOFP:
- case AMDGPU::G_UITOFP:
+ case AMDGPU::G_UITOFP: {
+ unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_FPTRUNC:
- case AMDGPU::G_FPEXT:
+ case AMDGPU::G_FPEXT: {
+ unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
+ case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
+ case AMDGPU::G_SSUBSAT:
+ case AMDGPU::G_UADDSAT:
+ case AMDGPU::G_USUBSAT:
+ case AMDGPU::G_FMAD:
+ case AMDGPU::G_FSQRT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
case AMDGPU::G_FLDEXP:
- case AMDGPU::G_FMINNUM:
- case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FCANONICALIZE:
- case AMDGPU::G_INTRINSIC_TRUNC:
- case AMDGPU::G_STRICT_FADD:
- case AMDGPU::G_STRICT_FSUB:
- case AMDGPU::G_STRICT_FMUL:
- case AMDGPU::G_STRICT_FMA:
case AMDGPU::G_STRICT_FLDEXP:
case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
case AMDGPU::G_FSHR: // TODO: Expand for scalar
@@ -3959,14 +3983,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
SrcSize);
break;
}
- case AMDGPU::G_FCMP: {
- unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
- OpdsMapping[1] = nullptr; // Predicate Operand.
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
- break;
- }
case AMDGPU::G_IS_FPCLASS: {
Register SrcReg = MI.getOperand(1).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
@@ -3987,8 +4003,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
break;
}
- case AMDGPU::G_ICMP: {
- auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ case AMDGPU::G_ICMP:
+ case AMDGPU::G_FCMP: {
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
// See if the result register has already been constrained to vcc, which may
@@ -3998,12 +4014,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
+ auto canUseSCCICMP = [&]() {
+ auto Pred =
+ static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ return Size == 32 ||
+ (Size == 64 &&
+ (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
+ Subtarget.hasScalarCompareEq64());
+ };
+ auto canUseSCCFCMP = [&]() {
+ return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
+ };
+
+ bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID &&
- (Size == 32 || (Size == 64 &&
- (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
- Subtarget.hasScalarCompareEq64()));
+ (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
@@ -4013,6 +4040,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const unsigned ResultSize = 1;
OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
break;
@@ -4209,7 +4237,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp:
case Intrinsic::amdgcn_fract:
- case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
@@ -4276,6 +4303,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_permlane64:
return getDefaultMappingAllVGPR(MI);
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
case Intrinsic::amdgcn_groupstaticsize:
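
In the register bank mapping, uniform 16/32-bit FP arithmetic, 32-bit
int<->fp conversions, non-64-bit fptrunc/fpext and cvt_pkrtz now get
the SOP mapping when all operands are already SGPRs; 64-bit cases and
the remaining opcodes keep the VOP mapping. For example, a uniform
conversion like this hypothetical function should stay on the SGPR bank
and select s_cvt_f32_i32:

  define amdgpu_ps float @uniform_sitofp(i32 inreg %x) {
    %f = sitofp i32 %x to float
    ret float %f
  }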
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3adaa092aaefc4a..1032f7a95d791d6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -345,9 +345,44 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold) const {
+ const unsigned Opc = MI->getOpcode();
+
+ auto tryToFoldAsFMAAKorMK = [&]() {
+ if (!OpToFold->isImm())
+ return false;
+
+ const bool TryAK = OpNo == 3;
+ const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
+ MI->setDesc(TII->get(NewOpc));
+
+ // We have to fold into the operand that will hold the Imm, not into OpNo.
+ bool FoldAsFMAAKorMK =
+ tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
+ if (FoldAsFMAAKorMK) {
+ // Untie Src2 of fmac.
+ MI->untieRegOperand(3);
+ // For fmamk, swap operands 1 and 2 if OpToFold was meant for operand 1.
+ if (OpNo == 1) {
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ Register OldReg = Op1.getReg();
+ // Operand 2 might be an inlinable constant.
+ if (Op2.isImm()) {
+ Op1.ChangeToImmediate(Op2.getImm());
+ Op2.ChangeToRegister(OldReg, false);
+ } else {
+ Op1.setReg(Op2.getReg());
+ Op2.setReg(OldReg);
+ }
+ }
+ return true;
+ }
+ MI->setDesc(TII->get(Opc));
+ return false;
+ };
+
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
- unsigned Opc = MI->getOpcode();
unsigned NewOpc = macToMad(Opc);
if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
@@ -367,6 +402,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MI->setDesc(TII->get(Opc));
}
+ // Special case for s_fmac_f32 if we are trying to fold into Src2.
+ // By transforming into fmaak we can untie Src2 and make folding legal.
+ if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
+ if (tryToFoldAsFMAAKorMK())
+ return true;
+ }
+
// Special case for s_setreg_b32
if (OpToFold->isImm()) {
unsigned ImmOpc = 0;
@@ -447,6 +489,28 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
return true;
}
+ // An inlinable constant might already have been folded into the Imm operand
+ // of fmaak or fmamk while we are trying to fold a non-inlinable constant.
+ if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
+ !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
+ unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
+ MachineOperand &OpImm = MI->getOperand(ImmIdx);
+ if (!OpImm.isReg() &&
+ TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
+ return tryToFoldAsFMAAKorMK();
+ }
+
+ // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
+ // By changing into fmamk we can untie Src2.
+ // If folding for Src0 happens first and it is identical to Src1, we should
+ // avoid transforming into fmamk, which requires commuting, as it would cause
+ // the later fold into Src1 to fail due to the wrong OpNo being used.
+ if (Opc == AMDGPU::S_FMAC_F32 &&
+ (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
+ if (tryToFoldAsFMAAKorMK())
+ return true;
+ }
+
// Check the case where we might introduce a second constant operand to a
// scalar instruction
if (TII->isSALU(MI->getOpcode())) {
@@ -458,7 +522,8 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Otherwise check for another constant
for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
auto &Op = MI->getOperand(i);
- if (OpNo != i && !Op.isReg() && !TII->isInlineConstant(Op, OpInfo))
+ if (OpNo != i && !Op.isReg() &&
+ !TII->isInlineConstant(Op, InstDesc.operands()[i]))
return false;
}
}
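
s_fmac_f32 keeps src2 tied to the destination, so an immediate cannot
be folded into it directly; tryToFoldAsFMAAKorMK instead rewrites the
instruction to s_fmaak_f32 (literal as src2) or s_fmamk_f32 (literal as
src1), untying src2 first. A sketch of an input that should exercise
the fmaak path, where 1024.0 is chosen because it is not an inline
constant (names are illustrative):

  declare float @llvm.fma.f32(float, float, float)

  define amdgpu_ps float @fmaak_candidate(float inreg %a, float inreg %b) {
    %r = call float @llvm.fma.f32(float %a, float %b, float 1024.0)
    ret float %r
  }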
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 38b5e0114903cdf..cf391856bf733fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4990,6 +4990,64 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
+ case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
+ case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
+ case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
+ case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
+ case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
+ case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
+ case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
+ case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
+ case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
+ case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
+ case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
+ case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64;
+ case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64;
+ case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64;
+ case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64;
+ case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
+ case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
+ case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
+ case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_t16_e64;
+ case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_t16_e64;
+ case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_t16_e64;
+ case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_t16_e64;
+ case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_t16_e64;
+ case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
+ case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
+ case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
+ case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
+ case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
+ case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
+ case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
+ case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
+ case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
+ case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
+ case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
+ case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
+ case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
+ case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
+ case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
+ case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
+ case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
+ case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
+ case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
+ case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
+ case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
+ case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
+ case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
+ case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
+ case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
+ case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
+ case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
+ case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
+ case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
+ case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
+ case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
+ case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
+ case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
@@ -5383,6 +5441,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
legalizeOpWithMove(MI, Src1Idx);
+ // Special case: V_FMAC_F32 and V_FMAC_F16 have src2, which must be a VGPR.
+ if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
+ legalizeOpWithMove(MI, Src2Idx);
+ }
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
@@ -5532,6 +5597,11 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
// legalize it.
legalizeOpWithMove(MI, Idx);
}
+
+ // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
+ if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
+ !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
+ legalizeOpWithMove(MI, VOP3Idx[2]);
}
Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
@@ -6665,21 +6735,78 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::S_CMP_LT_U32:
case AMDGPU::S_CMP_LE_U32:
case AMDGPU::S_CMP_EQ_U64:
- case AMDGPU::S_CMP_LG_U64: {
- const MCInstrDesc &NewDesc = get(NewOpcode);
+ case AMDGPU::S_CMP_LG_U64:
+ case AMDGPU::S_CMP_LT_F32:
+ case AMDGPU::S_CMP_EQ_F32:
+ case AMDGPU::S_CMP_LE_F32:
+ case AMDGPU::S_CMP_GT_F32:
+ case AMDGPU::S_CMP_LG_F32:
+ case AMDGPU::S_CMP_GE_F32:
+ case AMDGPU::S_CMP_O_F32:
+ case AMDGPU::S_CMP_U_F32:
+ case AMDGPU::S_CMP_NGE_F32:
+ case AMDGPU::S_CMP_NLG_F32:
+ case AMDGPU::S_CMP_NGT_F32:
+ case AMDGPU::S_CMP_NLE_F32:
+ case AMDGPU::S_CMP_NEQ_F32:
+ case AMDGPU::S_CMP_NLT_F32:
+ case AMDGPU::S_CMP_LT_F16:
+ case AMDGPU::S_CMP_EQ_F16:
+ case AMDGPU::S_CMP_LE_F16:
+ case AMDGPU::S_CMP_GT_F16:
+ case AMDGPU::S_CMP_LG_F16:
+ case AMDGPU::S_CMP_GE_F16:
+ case AMDGPU::S_CMP_O_F16:
+ case AMDGPU::S_CMP_U_F16:
+ case AMDGPU::S_CMP_NGE_F16:
+ case AMDGPU::S_CMP_NLG_F16:
+ case AMDGPU::S_CMP_NGT_F16:
+ case AMDGPU::S_CMP_NLE_F16:
+ case AMDGPU::S_CMP_NEQ_F16:
+ case AMDGPU::S_CMP_NLT_F16: {
Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
- MachineInstr *NewInstr =
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
- .add(Inst.getOperand(0))
- .add(Inst.getOperand(1));
+ auto NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
+ .setMIFlags(Inst.getFlags());
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src0_modifiers) >= 0) {
+ NewInstr
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(0)) // src0
+ .addImm(0) // src1_modifiers
+ .add(Inst.getOperand(1)) // src1
+ .addImm(0); // clamp
+ } else {
+ NewInstr
+ .add(Inst.getOperand(0))
+ .add(Inst.getOperand(1));
+ }
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
+ return;
}
+ case AMDGPU::S_CVT_HI_F32_F16: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Inst.getOperand(1));
+ BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .addReg(TmpReg)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
return;
}
+ }
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
@@ -6723,8 +6850,61 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Use the new VALU Opcode.
auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
.setMIFlags(Inst.getFlags());
- for (const MachineOperand &Op : Inst.explicit_operands())
- NewInstr->addOperand(Op);
+ if (isVOP3(NewOpcode)) {
+ // Intersperse VOP3 modifiers among the SALU operands.
+ NewInstr->addOperand(Inst.getOperand(0));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src0_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0)
+ NewInstr->addOperand(Inst.getOperand(1));
+
+ if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+ // We are converting these to a BFE, so we need to add the missing
+ // operands for the size and offset.
+ unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+ NewInstr.addImm(0);
+ NewInstr.addImm(Size);
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ NewInstr.addImm(0);
+ } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second
+ // operand back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+ NewInstr.addImm(Offset);
+ NewInstr.addImm(BitWidth);
+ } else {
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src1_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
+ NewInstr->addOperand(Inst.getOperand(2));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src2_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
+ NewInstr->addOperand(Inst.getOperand(3));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
+ NewInstr.addImm(0);
+ }
+ } else {
+ // Just copy the SALU operands.
+ for (const MachineOperand &Op : Inst.explicit_operands())
+ NewInstr->addOperand(Op);
+ }
+
// Remove any references to SCC. Vector instructions can't read from it, and
// We're just about to add the implicit use / defs of VCC, and we don't want
// both.
@@ -6748,30 +6928,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
}
- if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
- // We are converting these to a BFE, so we need to add the missing
- // operands for the size and offset.
- unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- NewInstr.addImm(0);
- NewInstr.addImm(Size);
- } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
- // The VALU version adds the second operand to the result, so insert an
- // extra 0 operand.
- NewInstr.addImm(0);
- }
- if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
- // If we need to move this to VGPRs, we need to unpack the second operand
- // back into the 2 separate ones for bit offset and width.
- assert(OffsetWidthOp.isImm() &&
- "Scalar BFE is only implemented for constant width and offset");
- uint32_t Imm = OffsetWidthOp.getImm();
- uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
- uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- NewInstr->removeOperand(2);
- NewInstr.addImm(Offset);
- NewInstr.addImm(BitWidth);
- }
fixImplicitOperands(*NewInstr);
// Legalize the operands
legalizeOperands(*NewInstr, MDT);
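
When one of these scalar float instructions has to be moved to the
VALU, the VOP3 forms need the interleaved source-modifier, clamp and
omod operands, which is why the operand list is now built up field by
field. S_CVT_HI_F32_F16 has no single VALU equivalent and is expanded
into a 16-bit shift followed by v_cvt_f32_f16; its semantics can be
sketched in IR as (illustrative only):

  define float @cvt_hi_f32_f16_semantics(i32 %packed) {
    %shr = lshr i32 %packed, 16
    %lo = trunc i32 %shr to i16
    %h = bitcast i16 %lo to half
    %f = fpext half %h to float
    ret float %f
  }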
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index fa43b094c6549e5..90b89e9ed4055ae 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -401,29 +401,33 @@ let SubtargetPredicate = isGFX11Plus in {
}
} // End SubtargetPredicate = isGFX11Plus
+class SOP1_F32_Inst<string opName, SDPatternOperator Op, ValueType vt0=f32,
+ ValueType vt1=vt0> :
+ SOP1_32<opName, [(set vt0:$sdst, (UniformUnaryFrag<Op> vt1:$src0))]>;
+
let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
SchedRW = [WriteSFPU], isReMaterializable = 1 in {
- def S_CVT_F32_I32 : SOP1_32<"s_cvt_f32_i32">;
- def S_CVT_F32_U32 : SOP1_32<"s_cvt_f32_u32">;
+ def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>;
+ def S_CVT_F32_U32 : SOP1_F32_Inst<"s_cvt_f32_u32", uint_to_fp, f32, i32>;
let mayRaiseFPException = 1 in {
- def S_CVT_I32_F32 : SOP1_32<"s_cvt_i32_f32">;
- def S_CVT_U32_F32 : SOP1_32<"s_cvt_u32_f32">;
- def S_CVT_F32_F16 : SOP1_32<"s_cvt_f32_f16">;
+ def S_CVT_I32_F32 : SOP1_F32_Inst<"s_cvt_i32_f32", fp_to_sint, i32, f32>;
+ def S_CVT_U32_F32 : SOP1_F32_Inst<"s_cvt_u32_f32", fp_to_uint, i32, f32>;
+ def S_CVT_F32_F16 : SOP1_F32_Inst<"s_cvt_f32_f16", fpextend, f32, f16>;
def S_CVT_HI_F32_F16 : SOP1_32<"s_cvt_hi_f32_f16">;
- def S_CEIL_F32 : SOP1_32<"s_ceil_f32">;
- def S_FLOOR_F32 : SOP1_32<"s_floor_f32">;
- def S_TRUNC_F32 : SOP1_32<"s_trunc_f32">;
- def S_RNDNE_F32 : SOP1_32<"s_rndne_f32">;
+ def S_CEIL_F32 : SOP1_F32_Inst<"s_ceil_f32", fceil>;
+ def S_FLOOR_F32 : SOP1_F32_Inst<"s_floor_f32", ffloor>;
+ def S_TRUNC_F32 : SOP1_F32_Inst<"s_trunc_f32", ftrunc>;
+ def S_RNDNE_F32 : SOP1_F32_Inst<"s_rndne_f32", frint>;
let FPDPRounding = 1 in
- def S_CVT_F16_F32 : SOP1_32<"s_cvt_f16_f32">;
+ def S_CVT_F16_F32 : SOP1_F32_Inst<"s_cvt_f16_f32", fpround, f16, f32>;
- def S_CEIL_F16 : SOP1_32<"s_ceil_f16">;
- def S_FLOOR_F16 : SOP1_32<"s_floor_f16">;
- def S_TRUNC_F16 : SOP1_32<"s_trunc_f16">;
- def S_RNDNE_F16 : SOP1_32<"s_rndne_f16">;
+ def S_CEIL_F16 : SOP1_F32_Inst<"s_ceil_f16", fceil, f16>;
+ def S_FLOOR_F16 : SOP1_F32_Inst<"s_floor_f16", ffloor, f16>;
+ def S_TRUNC_F16 : SOP1_F32_Inst<"s_trunc_f16", ftrunc, f16>;
+ def S_RNDNE_F16 : SOP1_F32_Inst<"s_rndne_f16", frint, f16>;
} // End mayRaiseFPException = 1
} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
// SchedRW = [WriteSFPU], isReMaterializable = 1
@@ -756,14 +760,22 @@ let SubtargetPredicate = isGFX11Plus in {
def S_PACK_HL_B32_B16 : SOP2_32<"s_pack_hl_b32_b16">;
} // End SubtargetPredicate = isGFX11Plus
+class SOP2_F32_Inst<string opName, SDPatternOperator Op, ValueType dstVt=f32> :
+ SOP2_F32<opName,
+ [(set dstVt:$sdst, (UniformBinFrag<Op> SSrc_f32:$src0, SSrc_f32:$src1))]>;
+
+class SOP2_F16_Inst<string opName, SDPatternOperator Op> :
+ SOP2_F16<opName,
+ [(set f16:$sdst, (UniformBinFrag<Op> SSrc_f16:$src0, SSrc_f16:$src1))]>;
+
let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
Uses = [MODE], SchedRW = [WriteSFPU] in {
let isReMaterializable = 1 in {
let isCommutable = 1 in {
- def S_ADD_F32 : SOP2_F32<"s_add_f32">;
- def S_MIN_F32 : SOP2_F32<"s_min_f32">;
- def S_MAX_F32 : SOP2_F32<"s_max_f32">;
- def S_MUL_F32 : SOP2_F32<"s_mul_f32">;
+ def S_ADD_F32 : SOP2_F32_Inst<"s_add_f32", any_fadd>;
+ def S_MIN_F32 : SOP2_F32_Inst<"s_min_f32", fminnum_like>;
+ def S_MAX_F32 : SOP2_F32_Inst<"s_max_f32", fmaxnum_like>;
+ def S_MUL_F32 : SOP2_F32_Inst<"s_mul_f32", any_fmul>;
let FixedSize = 1 in
def S_FMAAK_F32 : SOP2_Pseudo<
@@ -773,19 +785,20 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
>;
let FPDPRounding = 1 in {
- def S_ADD_F16 : SOP2_F16<"s_add_f16">;
- def S_MUL_F16 : SOP2_F16<"s_mul_f16">;
+ def S_ADD_F16 : SOP2_F16_Inst<"s_add_f16", any_fadd>;
+ def S_MUL_F16 : SOP2_F16_Inst<"s_mul_f16", any_fmul>;
} // End FPDPRounding
- def S_MIN_F16 : SOP2_F16<"s_min_f16">;
- def S_MAX_F16 : SOP2_F16<"s_max_f16">;
+ def S_MIN_F16 : SOP2_F16_Inst<"s_min_f16", fminnum_like>;
+ def S_MAX_F16 : SOP2_F16_Inst<"s_max_f16", fmaxnum_like>;
} // End isCommutable = 1
let FPDPRounding = 1 in
- def S_SUB_F16 : SOP2_F16<"s_sub_f16">;
+ def S_SUB_F16 : SOP2_F16_Inst<"s_sub_f16", any_fsub>;
- def S_SUB_F32 : SOP2_F32<"s_sub_f32">;
- def S_CVT_PK_RTZ_F16_F32 : SOP2_F32<"s_cvt_pk_rtz_f16_f32">;
+ def S_SUB_F32 : SOP2_F32_Inst<"s_sub_f32", any_fsub>;
+ def S_CVT_PK_RTZ_F16_F32 : SOP2_F32_Inst<"s_cvt_pk_rtz_f16_f32",
+ AMDGPUpkrtz_f16_f32, v2f16>;
let FixedSize = 1 in
def S_FMAMK_F32 : SOP2_Pseudo<
@@ -796,20 +809,22 @@ let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
} // End isReMaterializable = 1
let Constraints = "$sdst = $src2", DisableEncoding="$src2",
- isCommutable = 1 in {
+ isCommutable = 1, AddedComplexity = 20 in {
def S_FMAC_F32 : SOP2_Pseudo<
"s_fmac_f32", (outs SReg_32:$sdst),
(ins SSrc_f32:$src0, SSrc_f32:$src1, SReg_32:$src2),
- "$sdst, $src0, $src1"
+ "$sdst, $src0, $src1",
+ [(set f32:$sdst, (UniformTernaryFrag<any_fma> SSrc_f32:$src0, SSrc_f32:$src1, SReg_32:$src2))]
>;
def S_FMAC_F16 : SOP2_Pseudo<
"s_fmac_f16", (outs SReg_32:$sdst),
(ins SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2),
- "$sdst, $src0, $src1"
+ "$sdst, $src0, $src1",
+ [(set f16:$sdst, (UniformTernaryFrag<any_fma> SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2))]
>;
} // End Constraints = "$sdst = $src2", DisableEncoding="$src2",
- // isCommutable = 1
+ // isCommutable = 1, AddedComplexity = 20
} // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
// Uses = [MODE], SchedRW = [WriteSFPU]
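
The new pattern classes gate on UniformUnaryFrag/UniformBinFrag (and
UniformTernaryFrag for the fmac pseudos), so the SALU patterns only
fire for uniform values and divergent operations still select the VALU
forms. As a hypothetical example, a uniform minnum should select
s_min_f32:

  declare float @llvm.minnum.f32(float, float)

  define amdgpu_ps float @uniform_fmin(float inreg %a, float inreg %b) {
    %m = call float @llvm.minnum.f32(float %a, float %b)
    ret float %m
  }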
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir
new file mode 100644
index 000000000000000..bb86413964098ba
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop1.mir
@@ -0,0 +1,302 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s
+
+---
+name: sitofp_i32_to_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: sitofp_i32_to_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[S_CVT_F32_I32_:%[0-9]+]]:sreg_32 = S_CVT_F32_I32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY [[S_CVT_F32_I32_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_SITOFP %0(s32)
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: uitofp_u32_to_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: uitofp_u32_to_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[S_CVT_F32_U32_:%[0-9]+]]:sreg_32 = S_CVT_F32_U32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY [[S_CVT_F32_U32_]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_UITOFP %0(s32)
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: fptosi_f32_to_i32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fptosi_f32_to_i32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %1:sreg_32 = nofpexcept S_CVT_I32_F32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_FPTOSI %0(s32)
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: fptoui_f32_to_u32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fptoui_f32_to_u32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %1:sreg_32 = nofpexcept S_CVT_U32_F32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_FPTOUI %0(s32)
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: fpext_f16_to_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fpext_f16_to_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_CVT_F32_F16 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = G_FPEXT %1(s16)
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: fpext_hif16_to_32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fpext_hif16_to_32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[S_CVT_HI_F32_F16_:%[0-9]+]]:sreg_32 = S_CVT_HI_F32_F16 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY [[S_CVT_HI_F32_F16_]]
+ %0:sgpr(<2 x s16>) = COPY $sgpr0
+ %2:sgpr(s32) = G_BITCAST %0(<2 x s16>)
+ %3:sgpr(s32) = G_CONSTANT i32 16
+ %4:sgpr(s32) = G_LSHR %2, %3(s32)
+ %5:sgpr(s16) = G_TRUNC %4(s32)
+ %6:sgpr(s32) = G_FPEXT %5(s16)
+ $sgpr0 = COPY %6(s32)
+
+...
+---
+name: fptrunc_f32_to_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fptrunc_f32_to_f16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %1:sreg_32 = nofpexcept S_CVT_F16_F32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_FPTRUNC %0(s32)
+ %2:sgpr(s32) = G_ANYEXT %1(s16)
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: fceil_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fceil_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %1:sreg_32 = nofpexcept S_CEIL_F32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_FCEIL %0
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: ffloor_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: ffloor_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %1:sreg_32 = nofpexcept S_FLOOR_F32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_FFLOOR %0
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: ftrunc_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: ftrunc_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %1:sreg_32 = nofpexcept S_TRUNC_F32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_INTRINSIC_TRUNC %0
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: frint_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: frint_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %1:sreg_32 = nofpexcept S_RNDNE_F32 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %1
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = G_FRINT %0
+ $sgpr0 = COPY %1(s32)
+
+...
+---
+name: fceil_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fceil_f16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_CEIL_F16 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_FCEIL %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $sgpr0 = COPY %3(s32)
+
+...
+---
+name: ffloor_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: ffloor_f16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_FLOOR_F16 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_FFLOOR %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $sgpr0 = COPY %3(s32)
+
+...
+---
+name: ftrunc_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: ftrunc_f16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_TRUNC_F16 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_INTRINSIC_TRUNC %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $sgpr0 = COPY %3(s32)
+
+...
+---
+name: frint_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: frint_f16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_RNDNE_F16 [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s16) = G_FRINT %1
+ %3:sgpr(s32) = G_ANYEXT %2(s16)
+ $sgpr0 = COPY %3(s32)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir
new file mode 100644
index 000000000000000..48b4534c871ee0c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sop2.mir
@@ -0,0 +1,294 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s
+
+---
+name: fadd_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fadd_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_ADD_F32 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FADD %0, %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: fsub_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fsub_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_SUB_F32 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FSUB %0, %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: fmul_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fmul_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_MUL_F32 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FMUL %0, %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: fmin_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fmin_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_MIN_F32 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FMINNUM %0, %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: fmax_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fmax_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_MAX_F32 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FMAXNUM %0, %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: fadd_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fadd_f16
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %4:sreg_32 = nofpexcept S_ADD_F16 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %4
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s16) = G_FADD %1, %3
+ %5:sgpr(s32) = G_ANYEXT %4(s16)
+ $sgpr0 = COPY %5(s32)
+
+...
+---
+name: fsub_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fsub_f16
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %4:sreg_32 = nofpexcept S_SUB_F16 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %4
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s16) = G_FSUB %1, %3
+ %5:sgpr(s32) = G_ANYEXT %4(s16)
+ $sgpr0 = COPY %5(s32)
+
+...
+---
+name: fmul_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fmul_f16
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %4:sreg_32 = nofpexcept S_MUL_F16 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %4
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s16) = G_FMUL %1, %3
+ %5:sgpr(s32) = G_ANYEXT %4(s16)
+ $sgpr0 = COPY %5(s32)
+
+...
+---
+name: fmin_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fmin_f16
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %4:sreg_32 = nofpexcept S_MIN_F16 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %4
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s16) = G_FMINNUM %1, %3
+ %5:sgpr(s32) = G_ANYEXT %4(s16)
+ $sgpr0 = COPY %5(s32)
+
+...
+---
+name: fmax_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fmax_f16
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s16) = G_FMAXNUM %1, %3
+ %5:sgpr(s32) = G_ANYEXT %4(s16)
+
+...
+---
+name: s_cvt_pkrtz_v2f16_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: s_cvt_pkrtz_v2f16_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: %2:sreg_32 = nofpexcept S_CVT_PK_RTZ_F16_F32 [[COPY]], [[COPY1]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %2
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(<2 x s16>) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pkrtz), %0(s32), %1(s32)
+ $sgpr0 = COPY %2(<2 x s16>)
+
+...
+---
+name: fmac_f32
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+
+ ; GFX1150-LABEL: name: fmac_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1150-NEXT: %3:sreg_32 = nofpexcept S_FMAC_F32 [[COPY1]], [[COPY2]], [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %3
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = COPY $sgpr2
+ %3:sgpr(s32) = G_FMA %1, %2, %0
+ $sgpr0 = COPY %3(s32)
+
+...
+---
+name: fmac_f16
+legalized: true
+regBankSelected: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+
+ ; GFX1150-LABEL: name: fmac_f16
+ ; GFX1150: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GFX1150-NEXT: %6:sreg_32 = nofpexcept S_FMAC_F16 [[COPY1]], [[COPY2]], [[COPY]], implicit $mode
+ ; GFX1150-NEXT: $sgpr0 = COPY %6
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = COPY $sgpr2
+ %5:sgpr(s16) = G_TRUNC %4(s32)
+ %6:sgpr(s16) = G_FMA %3, %5, %1
+ %7:sgpr(s32) = G_ANYEXT %6(s16)
+ $sgpr0 = COPY %7(s32)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir
new file mode 100644
index 000000000000000..3ef974135d2f156
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-scalar-float-sopc.mir
@@ -0,0 +1,647 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s
+
+---
+name: f32_olt
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_olt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_LT_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(olt), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_oeq
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_oeq
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_EQ_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(oeq), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_ole
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_ole
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_LE_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(ole), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_ogt
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_ogt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_GT_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(ogt), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_one
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_one
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_LG_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(one), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_oge
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_oge
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_GE_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(oge), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_ord
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_ord
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_O_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(ord), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_uno
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_uno
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_U_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(uno), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_ult
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_ult
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NGE_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(ult), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_ueq
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_ueq
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NLG_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(ueq), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_ule
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_ule
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NGT_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(ule), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_ugt
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_ugt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NLE_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(ugt), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_une
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_une
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NEQ_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(une), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f32_uge
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_uge
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NLT_F32 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s32) = COPY $sgpr1
+ %2:sgpr(s32) = G_FCMP floatpred(uge), %0(s32), %1
+ $sgpr0 = COPY %2(s32)
+
+...
+---
+name: f16_olt
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_olt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_LT_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(olt), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_oeq
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_oeq
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_EQ_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(oeq), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_ole
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_ole
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_LE_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(ole), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_ogt
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_ogt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_GT_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(ogt), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_one
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_one
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_LG_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(one), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_oge
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_oge
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_GE_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(oge), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_ord
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_ord
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_O_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(ord), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_uno
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_uno
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_U_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(uno), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_ult
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_ult
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NGE_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(ult), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_ueq
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_ueq
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NLG_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(ueq), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_ule
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_ule
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NGT_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(ule), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_ugt
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_ugt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NLE_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(ugt), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_une
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_une
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NEQ_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(une), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
+---
+name: f16_uge
+legalized: true
+regBankSelected: true
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_uge
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GFX1150-NEXT: S_CMP_NLT_F16 [[COPY]], [[COPY1]], implicit-def $scc, implicit $mode
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $scc
+ ; GFX1150-NEXT: $sgpr0 = COPY [[COPY2]]
+ %0:sgpr(s32) = COPY $sgpr0
+ %1:sgpr(s16) = G_TRUNC %0(s32)
+ %2:sgpr(s32) = COPY $sgpr1
+ %3:sgpr(s16) = G_TRUNC %2(s32)
+ %4:sgpr(s32) = G_FCMP floatpred(uge), %1(s16), %3
+ $sgpr0 = COPY %4(s32)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp-s32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp-s32.mir
new file mode 100644
index 000000000000000..bd4acfe00a0b283
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp-s32.mir
@@ -0,0 +1,46 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -march=amdgcn -mcpu=gfx1150 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX1150 %s
+
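+# With SALU floating-point instructions available, a G_FCMP that produces a
+# full 32-bit result should already be legal for f32 and f16 operands, so the
+# legalizer is expected to leave both tests below unchanged.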
+---
+name: f32_olt
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f32_olt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; GFX1150-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(olt), [[COPY]](s32), [[COPY1]]
+ ; GFX1150-NEXT: $sgpr0 = COPY [[FCMP]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s32) = G_FCMP floatpred(olt), %0(s32), %1
+ $sgpr0 = COPY %2
+
+...
+
+---
+name: f16_olt
+body: |
+ bb.1.entry:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: f16_olt
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+ ; GFX1150-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
+ ; GFX1150-NEXT: [[FCMP:%[0-9]+]]:_(s32) = G_FCMP floatpred(olt), [[TRUNC]](s16), [[TRUNC1]]
+ ; GFX1150-NEXT: $sgpr0 = COPY [[FCMP]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0(s32)
+ %2:_(s32) = COPY $sgpr1
+ %3:_(s16) = G_TRUNC %2(s32)
+ %4:_(s32) = G_FCMP floatpred(olt), %1(s16), %3
+ $sgpr0 = COPY %4
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir
index bf93a3d242e0150..d25a3fdfa3c3098 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fcmp.mir
@@ -4,6 +4,7 @@
# RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -O0 -march=amdgcn -mcpu=gfx1010 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
# RUN: llc -O0 -march=amdgcn -mcpu=gfx1100 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -O0 -march=amdgcn -mcpu=gfx1150 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
---
name: test_fcmp_s32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fcmp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fcmp.mir
index f599c869d114d36..8f09618207aa118 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fcmp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fcmp.mir
@@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck --check-prefixes=GCN,GFX803 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck --check-prefixes=GCN,GFX803 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck --check-prefixes=GCN,GFX1150 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=amdgpu-regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck --check-prefixes=GCN,GFX1150 %s
---
name: fcmp_ss
@@ -9,14 +11,21 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0, $sgpr1
- ; CHECK-LABEL: name: fcmp_ss
- ; CHECK: liveins: $sgpr0, $sgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
- ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(uge), [[COPY2]](s32), [[COPY3]]
+ ; GFX803-LABEL: name: fcmp_ss
+ ; GFX803: liveins: $sgpr0, $sgpr1
+ ; GFX803-NEXT: {{ $}}
+ ; GFX803-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX803-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; GFX803-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GFX803-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+ ; GFX803-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(uge), [[COPY2]](s32), [[COPY3]]
+ ; GFX1150-LABEL: name: fcmp_ss
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; GFX1150-NEXT: [[FCMP:%[0-9]+]]:sgpr(s32) = G_FCMP floatpred(uge), [[COPY]](s32), [[COPY1]]
+ ; GFX1150-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s1) = G_TRUNC [[FCMP]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $sgpr1
%2:_(s1) = G_FCMP floatpred(uge), %0(s32), %1
@@ -29,13 +38,13 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0, $vgpr0
- ; CHECK-LABEL: name: fcmp_sv
- ; CHECK: liveins: $sgpr0, $vgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(uge), [[COPY2]](s32), [[COPY1]]
+ ; GCN-LABEL: name: fcmp_sv
+ ; GCN: liveins: $sgpr0, $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GCN-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(uge), [[COPY2]](s32), [[COPY1]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $vgpr0
%2:_(s1) = G_FCMP floatpred(uge), %0, %1
@@ -48,13 +57,13 @@ legalized: true
body: |
bb.0:
liveins: $sgpr0, $vgpr0
- ; CHECK-LABEL: name: fcmp_vs
- ; CHECK: liveins: $sgpr0, $vgpr0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(uge), [[COPY1]](s32), [[COPY2]]
+ ; GCN-LABEL: name: fcmp_vs
+ ; GCN: liveins: $sgpr0, $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GCN-NEXT: [[FCMP:%[0-9]+]]:vcc(s1) = G_FCMP floatpred(uge), [[COPY1]](s32), [[COPY2]]
%0:_(s32) = COPY $sgpr0
%1:_(s32) = COPY $vgpr0
%2:_(s1) = G_FCMP floatpred(uge), %1, %0
@@ -67,12 +76,12 @@ legalized: true
body: |
bb.0:
liveins: $vgpr0, $vgpr1
- ; CHECK-LABEL: name: fcmp_vv
- ; CHECK: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
- ; CHECK-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP floatpred(uge), [[COPY]](s32), [[COPY1]]
+ ; GCN-LABEL: name: fcmp_vv
+ ; GCN: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+ ; GCN-NEXT: [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP floatpred(uge), [[COPY]](s32), [[COPY1]]
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s1) = G_ICMP floatpred(uge), %0, %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-salu-float.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-salu-float.mir
new file mode 100644
index 000000000000000..dba206af69fe77c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-salu-float.mir
@@ -0,0 +1,246 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck --check-prefixes=GFX1150 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck --check-prefixes=GFX1150 %s
+
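+# On a subtarget with SALU float instructions, uniform f32 arithmetic and the
+# f16<->f32 and i32/u32<->f32 conversions below are expected to stay on the
+# sgpr bank instead of being copied to VGPRs.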
+---
+name: fadd_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fadd_f32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr1
+ ; GFX1150-NEXT: [[FADD:%[0-9]+]]:sgpr(s32) = G_FADD [[COPY]], [[COPY1]]
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = COPY $sgpr1
+ %2:_(s32) = G_FADD %0, %1
+...
+
+---
+name: fptosi_f32_to_i32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fptosi_f32_to_i32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[FPTOSI:%[0-9]+]]:sgpr(s32) = G_FPTOSI [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_FPTOSI %0(s32)
+...
+
+---
+name: fptoui_f32_to_u32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fptoui_f32_to_u32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[FPTOUI:%[0-9]+]]:sgpr(s32) = G_FPTOUI [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_FPTOUI %0(s32)
+...
+
+---
+name: sitofp_i32_to_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: sitofp_i32_to_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[SITOFP:%[0-9]+]]:sgpr(s32) = G_SITOFP [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_SITOFP %0(s32)
+...
+
+---
+name: uitofp_u32_to_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: uitofp_u32_to_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[UITOFP:%[0-9]+]]:sgpr(s32) = G_UITOFP [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_UITOFP %0(s32)
+...
+
+---
+name: fptrunc_f32_to_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fptrunc_f32_to_f16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[FPTRUNC:%[0-9]+]]:sgpr(s16) = G_FPTRUNC [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_FPTRUNC %0(s32)
+...
+
+---
+name: fpext_f16_to_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fpext_f16_to_f32
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+  ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+  ; GFX1150-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+  ; GFX1150-NEXT: [[FPEXT:%[0-9]+]]:sgpr(s32) = G_FPEXT [[TRUNC]](s16)
+    %1:_(s32) = COPY $sgpr0
+    %0:_(s16) = G_TRUNC %1(s32)
+    %2:_(s32) = G_FPEXT %0(s16)
+...
+
+# Tests below should not select the scalar register bank
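+# There are no 64-bit SALU float instructions and no scalar f16<->i16
+# conversions, so regbankselect is expected to put these operations on the
+# vgpr bank, inserting copies for the uniform inputs.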
+
+---
+name: fadd_f64
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+
+ ; GFX1150-LABEL: name: fadd_f64
+ ; GFX1150: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr2_sgpr3
+ ; GFX1150-NEXT: [[COPY2:%[0-9]+]]:vgpr(s64) = COPY [[COPY]](s64)
+ ; GFX1150-NEXT: [[COPY3:%[0-9]+]]:vgpr(s64) = COPY [[COPY1]](s64)
+ ; GFX1150-NEXT: [[FADD:%[0-9]+]]:vgpr(s64) = G_FADD [[COPY2]], [[COPY3]]
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s64) = COPY $sgpr2_sgpr3
+ %2:_(s64) = G_FADD %0, %1
+...
+
+---
+name: fptosi_f64_to_i32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; GFX1150-LABEL: name: fptosi_f64_to_i32
+ ; GFX1150: liveins: $sgpr0, $sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[COPY]](s64)
+ ; GFX1150-NEXT: [[FPTOSI:%[0-9]+]]:vgpr(s32) = G_FPTOSI [[COPY1]](s64)
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_FPTOSI %0(s64)
+...
+
+---
+name: fptoui_f16_to_u16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fptoui_f16_to_u16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16)
+ ; GFX1150-NEXT: [[FPTOUI:%[0-9]+]]:vgpr(s16) = G_FPTOUI [[COPY1]](s16)
+ %1:_(s32) = COPY $sgpr0
+ %0:_(s16) = G_TRUNC %1(s32)
+ %2:_(s16) = G_FPTOUI %0(s16)
+...
+
+---
+name: sitofp_i32_to_f64
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
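+# Uniform f32/f16 compares are expected to select to the SOPC forms that write
+# $scc. The ordered predicates map directly (olt->S_CMP_LT, oeq->EQ, ole->LE,
+# ogt->GT, one->LG, oge->GE, ord->O) and the unordered ones to the negated
+# compares (uno->U, ult->NGE, ueq->NLG, ule->NGT, ugt->NLE, une->NEQ,
+# uge->NLT).
+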
+
+ ; GFX1150-LABEL: name: sitofp_i32_to_f64
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GFX1150-NEXT: [[SITOFP:%[0-9]+]]:vgpr(s64) = G_SITOFP [[COPY1]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s64) = G_SITOFP %0(s32)
+...
+
+---
+name: uitofp_u16_to_f16
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: uitofp_u16_to_f16
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[TRUNC:%[0-9]+]]:sgpr(s16) = G_TRUNC [[COPY]](s32)
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:vgpr(s16) = COPY [[TRUNC]](s16)
+ ; GFX1150-NEXT: [[UITOFP:%[0-9]+]]:vgpr(s16) = G_UITOFP [[COPY1]](s16)
+ %1:_(s32) = COPY $sgpr0
+ %0:_(s16) = G_TRUNC %1(s32)
+ %2:_(s16) = G_UITOFP %0(s16)
+...
+
+---
+name: fptrunc_f64_to_f32
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1
+
+ ; GFX1150-LABEL: name: fptrunc_f64_to_f32
+ ; GFX1150: liveins: $sgpr0_sgpr1
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY [[COPY]](s64)
+ ; GFX1150-NEXT: [[FPTRUNC:%[0-9]+]]:vgpr(s32) = G_FPTRUNC [[COPY1]](s64)
+ %0:_(s64) = COPY $sgpr0_sgpr1
+ %1:_(s32) = G_FPTRUNC %0(s64)
+...
+
+---
+name: fpext_f32_to_f64
+legalized: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GFX1150-LABEL: name: fpext_f32_to_f64
+ ; GFX1150: liveins: $sgpr0
+ ; GFX1150-NEXT: {{ $}}
+ ; GFX1150-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; GFX1150-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+ ; GFX1150-NEXT: [[FPEXT:%[0-9]+]]:vgpr(s64) = G_FPEXT [[COPY1]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s64) = G_FPEXT %0(s32)
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 7e76569606ef991..aa24cc32047292a 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -1,6 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -show-mc-encoding < %s | FileCheck -check-prefixes=CHECK,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -show-mc-encoding < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=CHECK,GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=CHECK,GFX11,GFX1100 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -show-mc-encoding < %s | FileCheck -check-prefixes=CHECK,GFX11,GFX1150 %s
declare float @llvm.fabs.f32(float)
declare float @llvm.fma.f32(float, float, float)
@@ -311,3 +312,39 @@ define float @v_fma_k_f32_src_mods(float %x, float %y) {
; GFX9: codeLenInByte = 24
; GFX10: codeLenInByte = 20
; GFX11: codeLenInByte = 20
+
+define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
+; GFX9-LABEL: s_fmaak_f32:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; encoding: [0x01,0x02,0x00,0x7e]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0x43800000 ; encoding: [0xff,0x02,0x02,0x7e,0x00,0x00,0x80,0x43]
+; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 ; encoding: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x06,0x04]
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fmaak_f32:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
+; GFX10-NEXT: v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX1100-LABEL: s_fmaak_f32:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: v_mov_b32_e32 v0, 0x43800000 ; encoding: [0xff,0x02,0x00,0x7e,0x00,0x00,0x80,0x43]
+; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) ; encoding: [0x01,0x00,0x87,0xbf]
+; GFX1100-NEXT: v_fmac_f32_e64 v0, s0, s1 ; encoding: [0x00,0x00,0x2b,0xd5,0x00,0x02,0x00,0x00]
+; GFX1100-NEXT: ; return to shader part epilog
+;
+; GFX1150-LABEL: s_fmaak_f32:
+; GFX1150: ; %bb.0:
+; GFX1150-NEXT: s_fmaak_f32 s0, s0, s1, 0x43800000 ; encoding: [0x00,0x01,0x80,0xa2,0x00,0x00,0x80,0x43]
+; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; encoding: [0x0b,0x00,0x87,0xbf]
+; GFX1150-NEXT: v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e]
+; GFX1150-NEXT: ; return to shader part epilog
+ %fma = call float @llvm.fma.f32(float %x, float %y, float 256.0)
+ ret float %fma
+}
+
+; GFX9: codeLenInByte = 20
+; GFX10: codeLenInByte = 16
+; GFX1100: codeLenInByte = 20
+; GFX1150: codeLenInByte = 16
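+
+; The gfx1150 sequence is 4 bytes smaller than gfx1100's: the 32-bit literal
+; is folded into a single 8-byte s_fmaak_f32, and the remaining v_mov_b32 uses
+; the short VOP1 encoding instead of carrying the literal.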
diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
new file mode 100644
index 000000000000000..e996fda4c9fd6ca
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/commute-compares-scalar-float.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1150 -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s
+
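+; Each compare below has its constant on the LHS; instruction selection is
+; expected to commute it (swapping the predicate, e.g. olt -> ogt) so that the
+; register ends up in src0 and the constant becomes the src1 immediate of the
+; s_cmp_*_f32/f16 instruction.
+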
+define amdgpu_vs void @fcmp_f32_olt_to_ogt(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_olt_to_ogt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_gt_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_olt_to_ogt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_gt_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp olt float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f32_ogt_to_olt(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_ogt_to_olt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_lt_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_ogt_to_olt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_lt_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ogt float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f32_ole_to_oge(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_ole_to_oge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ge_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_ole_to_oge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ge_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ole float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f32_oge_to_ole(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_oge_to_ole:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_le_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_oge_to_ole:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_le_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp oge float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f32_ult_to_ugt(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_ult_to_ugt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nle_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_ult_to_ugt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nle_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ult float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f32_ugt_to_ult(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_ugt_to_ult:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nge_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_ugt_to_ult:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nge_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ugt float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f32_ule_to_uge(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_ule_to_uge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nlt_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_ule_to_uge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nlt_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ule float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f32_uge_to_ule(ptr addrspace(1) inreg %out, float inreg %a) {
+; SDAG-LABEL: fcmp_f32_uge_to_ule:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ngt_f32 s2, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f32_uge_to_ule:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ngt_f32 s2, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp uge float 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_olt_to_ogt(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_olt_to_ogt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_gt_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_olt_to_ogt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_gt_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp olt half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_ogt_to_olt(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_ogt_to_olt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_lt_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_ogt_to_olt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_lt_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ogt half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_ole_to_oge(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_ole_to_oge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ge_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_ole_to_oge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ge_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ole half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_oge_to_ole(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_oge_to_ole:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_le_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_oge_to_ole:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_le_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp oge half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_ult_to_ugt(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_ult_to_ugt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nle_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_ult_to_ugt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nle_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ult half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_ugt_to_ult(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_ugt_to_ult:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nge_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_ugt_to_ult:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nge_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ugt half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_ule_to_uge(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_ule_to_uge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nlt_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_ule_to_uge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nlt_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ule half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @fcmp_f16_uge_to_ule(ptr addrspace(1) inreg %out, half inreg %a) {
+; SDAG-LABEL: fcmp_f16_uge_to_ule:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ngt_f16 s2, 0x4000
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: fcmp_f16_uge_to_ule:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ngt_f16 s2, 0x4000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp uge half 2.0, %a
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
index 1315c227ecde385..36fa95c4c3ab5bf 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies.mir
@@ -126,3 +126,102 @@ body: |
%3:sreg_32 = COPY %1:vgpr_32
%4:sreg_32 = S_CSELECT_B32 killed %2:sreg_32, killed %3:sreg_32, implicit undef $scc
---
+
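+# If a source of a scalar float compare is actually produced by a VALU
+# instruction, the pass is expected to rewrite S_CMP_*_F32/F16 into the
+# corresponding V_CMP_*_e64, yielding the result in a condition register
+# rather than $scc.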
+---
+name: cmp_f32
+body: |
+ bb.0:
+ ; GCN-LABEL: name: cmp_f32
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: %6:sreg_64_xexec = nofpexcept V_CMP_LT_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %6, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3:sreg_32 = COPY %2:vgpr_32
+ nofpexcept S_CMP_LT_F32 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
+ %4:sreg_64_xexec = COPY $scc
+ %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+...
+
+---
+name: cmp_f16
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: cmp_f16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:vgpr_32 = V_CVT_F16_U16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %3:sreg_32 = COPY %2:vgpr_32
+ nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
+ %4:sreg_64_xexec = COPY $scc
+ %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+...
+
+# Needs an extra shift instruction to select the hi 16 bits
+---
+name: cvt_hi_f32_f16
+body: |
+ bb.0:
+
+ ; GCN-LABEL: name: cvt_hi_f32_f16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CVT_F16_U16_e64_]], implicit $exec
+ ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[V_LSHRREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = V_CVT_F16_U16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %2:sreg_32 = COPY %1:vgpr_32
+ %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
+...
+
+# Test to ensure that src2 of the fmac is moved to a VGPR
+---
+name: fmac_f32
+body: |
+ bb.0:
+ ; GCN-LABEL: name: fmac_f32
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[V_CVT_F32_U32_e64_]], 0, [[DEF1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %4:sreg_32 = COPY %3:vgpr_32
+ %5:sreg_32 = nofpexcept S_FMAC_F32 killed %4:sreg_32, %1:sreg_32, %2:sreg_32, implicit $mode
+...
+
+---
+name: fmac_f16
+body: |
+ bb.0:
+ ; GCN-LABEL: name: fmac_f16
+ ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
+ ; GCN-NEXT: %6:vgpr_32 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[COPY]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, 0, implicit $mode, implicit $exec
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sreg_32 = IMPLICIT_DEF
+ %2:sreg_32 = IMPLICIT_DEF
+ %3:vgpr_32 = V_CVT_F32_U32_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+ %4:sreg_32 = COPY %3:vgpr_32
+ %5:sreg_32 = nofpexcept S_FMAC_F16 killed %1:sreg_32, %2:sreg_32, %4:sreg_32, implicit $mode
+...
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
new file mode 100644
index 000000000000000..39511a95cebc7de
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir
@@ -0,0 +1,238 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs -run-pass=si-fold-operands %s -o - | FileCheck %s
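+
+# Throughout this file: 1056964608 is 0x3F000000 (0.5, an inline constant)
+# and 1234567890 is 0x499602D2 (not an inline constant).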
+
+---
+name: fmac_fold_inlinable_src0_to_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: fmac_fold_inlinable_src0_to_fmamk
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = COPY $sgpr1
+ %inlinable:sreg_32 = S_MOV_B32 1056964608
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %inlinable, %0, %1, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_inlinable_src1_to_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: fmac_fold_inlinable_src1_to_fmamk
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, [[COPY1]], implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = COPY $sgpr1
+ %inlinable:sreg_32 = S_MOV_B32 1056964608
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %inlinable, %1, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_inlinable_src2_to_fmaak
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: fmac_fold_inlinable_src2_to_fmaak
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], [[COPY1]], 1056964608, implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = COPY $sgpr1
+ %inlinable:sreg_32 = S_MOV_B32 1056964608
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %1, %inlinable, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_noninlinable_src0_to_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: fmac_fold_noninlinable_src0_to_fmamk
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, [[COPY1]], implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = COPY $sgpr1
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %noninlinable, %0, %1, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_noninlinable_src1_to_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: fmac_fold_noninlinable_src1_to_fmamk
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, [[COPY1]], implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = COPY $sgpr1
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %noninlinable, %1, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_noninlinable_src2_to_fmaak
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1
+
+ ; CHECK-LABEL: name: fmac_fold_noninlinable_src2_to_fmaak
+ ; CHECK: liveins: $sgpr0, $sgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], [[COPY1]], 1234567890, implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %1:sreg_32 = COPY $sgpr1
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %1, %noninlinable, implicit $mode
+ $sgpr0 = COPY %fma
+...
+
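+# When an inline constant and a non-inlinable constant both fold, the
+# non-inlinable one takes the literal (K) slot and the inline constant is
+# folded as a plain operand.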
+---
+name: fmac_fold_inlinable_src2_to_fmaak_noninlinable_src0_to_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: fmac_fold_inlinable_src2_to_fmaak_noninlinable_src0_to_fmamk
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %inlinable:sreg_32 = S_MOV_B32 1056964608
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %noninlinable, %0, %inlinable, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_inlinable_src2_to_fmaak_noninlinable_src1_to_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: fmac_fold_inlinable_src2_to_fmaak_noninlinable_src1_to_fmamk
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %inlinable:sreg_32 = S_MOV_B32 1056964608
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %noninlinable, %inlinable, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_noninlinable_src2_to_fmaak_inlinable_src1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: fmac_fold_noninlinable_src2_to_fmaak_inlinable_src1
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], 1056964608, 1234567890, implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %inlinable:sreg_32 = S_MOV_B32 1056964608
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %inlinable, %noninlinable, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_noninlinable_src2_to_fmaak_dont_fold_other_noninlinable
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: fmac_fold_noninlinable_src2_to_fmaak_dont_fold_other_noninlinable
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: %noninlinable2:sreg_32 = S_MOV_B32 1234567891
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], %noninlinable2, 1234567890, implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %noninlinable2:sreg_32 = S_MOV_B32 1234567891
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %noninlinable2, %noninlinable, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_inlinable_src1_to_fmamk_noninlinable_src2_to_fmaak
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: fmac_fold_inlinable_src1_to_fmamk_noninlinable_src2_to_fmaak
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], 1056964608, 1234567890, implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %inlinable:sreg_32 = S_MOV_B32 1056964608
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %0, %inlinable, %noninlinable, implicit $mode
+ $sgpr0 = COPY %fma
+...
+---
+name: fmac_fold_same_noninlinable_src0_and_src1_to_fmamk
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; CHECK-LABEL: name: fmac_fold_same_noninlinable_src0_and_src1_to_fmamk
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 1234567890, 1234567890, [[COPY]], implicit $mode
+ ; CHECK-NEXT: $sgpr0 = COPY %fma
+ %0:sreg_32 = COPY $sgpr0
+ %noninlinable:sreg_32 = S_MOV_B32 1234567890
+ %fma:sreg_32 = nofpexcept S_FMAC_F32 %noninlinable, %noninlinable, %0, implicit $mode
+ $sgpr0 = COPY %fma
+...
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll
new file mode 100644
index 000000000000000..5fb5a8b1b0350ab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop1.ll
@@ -0,0 +1,174 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck %s
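+
+; Each SALU float op here is a multi-cycle (SALU_CYCLE_3) instruction, hence
+; the s_delay_alu before the dependent v_mov reads its result.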
+
+define amdgpu_vs float @sitofp_i32_to_f32(i32 inreg %val) {
+; CHECK-LABEL: sitofp_i32_to_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_f32_i32 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = sitofp i32 %val to float
+ ret float %res
+}
+
+define amdgpu_vs float @uitofp_u32_to_f32(i32 inreg %val) {
+; CHECK-LABEL: uitofp_u32_to_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_f32_u32 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = uitofp i32 %val to float
+ ret float %res
+}
+
+define amdgpu_vs i32 @fptosi_f32_to_i32(float inreg %val) {
+; CHECK-LABEL: fptosi_f32_to_i32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_i32_f32 s0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = fptosi float %val to i32
+ ret i32 %res
+}
+
+define amdgpu_vs i32 @fptoui_f32_to_u32(float inreg %val) {
+; CHECK-LABEL: fptoui_f32_to_u32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_u32_f32 s0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = fptoui float %val to i32
+ ret i32 %res
+}
+
+define amdgpu_vs float @fpext_f16_to_f32(half inreg %val) {
+; CHECK-LABEL: fpext_f16_to_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_f32_f16 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = fpext half %val to float
+ ret float %res
+}
+
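+; fpext of the high half of a <2 x half> selects s_cvt_hi_f32_f16 directly,
+; with no shift needed to extract the element.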
+define amdgpu_vs float @fpext_hif16_to_32(<2 x half> inreg %val) {
+; CHECK-LABEL: fpext_hif16_to_32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_hi_f32_f16 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %hielt = extractelement <2 x half> %val, i32 1
+ %res = fpext half %hielt to float
+ ret float %res
+}
+
+define amdgpu_vs half @fptrunc_f32_to_f16(float inreg %val) {
+; CHECK-LABEL: fptrunc_f32_to_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_f16_f32 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = fptrunc float %val to half
+ ret half %res
+}
+
+define amdgpu_vs float @fceil_f32(float inreg %val) {
+; CHECK-LABEL: fceil_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_ceil_f32 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call float @llvm.ceil.f32(float %val)
+ ret float %res
+}
+
+define amdgpu_vs float @ffloor_f32(float inreg %val) {
+; CHECK-LABEL: ffloor_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_floor_f32 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call float @llvm.floor.f32(float %val)
+ ret float %res
+}
+
+define amdgpu_vs float @ftrunc_f32(float inreg %val) {
+; CHECK-LABEL: ftrunc_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_trunc_f32 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call float @llvm.trunc.f32(float %val)
+ ret float %res
+}
+
+define amdgpu_vs float @frint_f32(float inreg %val) {
+; CHECK-LABEL: frint_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_rndne_f32 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call float @llvm.rint.f32(float %val)
+ ret float %res
+}
+
+define amdgpu_vs half @fceil_f16(half inreg %val) {
+; CHECK-LABEL: fceil_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_ceil_f16 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call half @llvm.ceil.f16(half %val)
+ ret half %res
+}
+
+define amdgpu_vs half @ffloor_f16(half inreg %val) {
+; CHECK-LABEL: ffloor_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_floor_f16 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call half @llvm.floor.f16(half %val)
+ ret half %res
+}
+
+define amdgpu_vs half @ftrunc_f16(half inreg %val) {
+; CHECK-LABEL: ftrunc_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_trunc_f16 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call half @llvm.trunc.f16(half %val)
+ ret half %res
+}
+
+define amdgpu_vs half @frint_f16(half inreg %val) {
+; CHECK-LABEL: frint_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_rndne_f16 s0, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call half @llvm.rint.f16(half %val)
+ ret half %res
+}
+
+declare float @llvm.ceil.f32(float)
+declare float @llvm.floor.f32(float)
+declare float @llvm.trunc.f32(float)
+declare float @llvm.rint.f32(float)
+declare half @llvm.ceil.f16(half)
+declare half @llvm.floor.f16(half)
+declare half @llvm.trunc.f16(half)
+declare half @llvm.rint.f16(half)
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
new file mode 100644
index 000000000000000..d736606a2aaa567
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sop2.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck %s
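+
+; SDAG and GlobalISel produce identical code for these tests, so both RUN
+; lines share the default CHECK prefix.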
+
+define amdgpu_vs float @fadd_f32(float inreg %a, float inreg %b) {
+; CHECK-LABEL: fadd_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_f32 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %add = fadd float %a, %b
+ ret float %add
+}
+
+define amdgpu_vs float @fsub_f32(float inreg %a, float inreg %b) {
+; CHECK-LABEL: fsub_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sub_f32 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %sub = fsub float %a, %b
+ ret float %sub
+}
+
+define amdgpu_vs float @fmul_f32(float inreg %a, float inreg %b) {
+; CHECK-LABEL: fmul_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mul_f32 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %mul = fmul float %a, %b
+ ret float %mul
+}
+
+define amdgpu_vs float @fmin_f32(float inreg %a, float inreg %b) {
+; CHECK-LABEL: fmin_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_min_f32 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %min = call float @llvm.minnum.f32(float %a, float %b)
+ ret float %min
+}
+
+define amdgpu_vs float @fmax_f32(float inreg %a, float inreg %b) {
+; CHECK-LABEL: fmax_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_max_f32 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %max = call float @llvm.maxnum.f32(float %a, float %b)
+ ret float %max
+}
+
+define amdgpu_vs half @fadd_f16(half inreg %a, half inreg %b) {
+; CHECK-LABEL: fadd_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_add_f16 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %add = fadd half %a, %b
+ ret half %add
+}
+
+define amdgpu_vs half @fsub_f16(half inreg %a, half inreg %b) {
+; CHECK-LABEL: fsub_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_sub_f16 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %sub = fsub half %a, %b
+ ret half %sub
+}
+
+define amdgpu_vs half @fmul_f16(half inreg %a, half inreg %b) {
+; CHECK-LABEL: fmul_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mul_f16 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %mul = fmul half %a, %b
+ ret half %mul
+}
+
+define amdgpu_vs half @fmin_f16(half inreg %a, half inreg %b) {
+; CHECK-LABEL: fmin_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_min_f16 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %min = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %min
+}
+
+define amdgpu_vs half @fmax_f16(half inreg %a, half inreg %b) {
+; CHECK-LABEL: fmax_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_max_f16 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %max = call half @llvm.maxnum.f16(half %a, half %b)
+ ret half %max
+}
+
+define amdgpu_vs <2 x half> @s_cvt_pkrtz_v2f16_f32(float inreg %x, float inreg %y) {
+; CHECK-LABEL: s_cvt_pkrtz_v2f16_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_cvt_pk_rtz_f16_f32 s0, s0, s1
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+ ret <2 x half> %result
+}
+
+define amdgpu_vs float @fmac_f32(float inreg %a, float inreg %b, float inreg %c) {
+; CHECK-LABEL: fmac_f32:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_fmac_f32 s0, s1, s2
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call float @llvm.fma.f32(float %b, float %c, float %a)
+ ret float %res
+}
+
+; Check selection of mov + fmac when src2 of the fmac has a later use
+define amdgpu_vs float @fmac_f32_with_mov(float inreg %a, float inreg %b, float inreg %c) {
+; CHECK-LABEL: fmac_f32_with_mov:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s3, s2
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-NEXT: s_fmac_f32 s3, s0, s1
+; CHECK-NEXT: s_add_f32 s0, s3, s2
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %fma = call float @llvm.fma.f32(float %a, float %b, float %c)
+ %res = fadd float %fma, %c
+ ret float %res
+}
+
+define amdgpu_vs half @fmac_f16(half inreg %a, half inreg %b, half inreg %c) {
+; CHECK-LABEL: fmac_f16:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_fmac_f16 s0, s1, s2
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %res = call half @llvm.fma.f16(half %b, half %c, half %a)
+ ret half %res
+}
+
+; Check selection of mov + fmac when src2 of the fmac has a later use
+define amdgpu_vs half @fmac_f16_with_mov(half inreg %a, half inreg %b, half inreg %c) {
+; CHECK-LABEL: fmac_f16_with_mov:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 s3, s2
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; CHECK-NEXT: s_fmac_f16 s3, s0, s1
+; CHECK-NEXT: s_add_f16 s0, s3, s2
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+ %fma = call half @llvm.fma.f16(half %a, half %b, half %c)
+ %res = fadd half %fma, %c
+ ret half %res
+}
+
+; Regression test for a crash in SIFoldOperands
+define amdgpu_ps float @_amdgpu_ps_main() {
+; CHECK-LABEL: _amdgpu_ps_main:
+; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_mov_b32 s1, s0
+; CHECK-NEXT: s_mov_b32 s2, s0
+; CHECK-NEXT: s_mov_b32 s3, s0
+; CHECK-NEXT: s_buffer_load_b64 s[0:1], s[0:3], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: s_fmamk_f32 s0, s1, 0x40800000, s0
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
+; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: ; return to shader part epilog
+bb:
+ %i = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 0, i32 0)
+ %i1 = bitcast i32 %i to float
+ %i2 = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> zeroinitializer, i32 4, i32 0)
+ %i3 = bitcast i32 %i2 to float
+ %i4 = fmul contract float %i3, 4.0
+ %i5 = fadd contract float %i4, %i1
+ ret float %i5
+}
+
+declare i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32>, i32, i32 immarg)
+declare float @llvm.minnum.f32(float, float)
+declare float @llvm.maxnum.f32(float, float)
+declare half @llvm.minnum.f16(half, half)
+declare half @llvm.maxnum.f16(half, half)
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float)
+declare float @llvm.fma.f32(float, float, float) nounwind readnone
+declare half @llvm.fma.f16(half, half, half) nounwind readnone
diff --git a/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
new file mode 100644
index 000000000000000..19e50be155a9646
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scalar-float-sopc.ll
@@ -0,0 +1,899 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -march=amdgcn -mcpu=gfx1150 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL %s
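+
+; The two selectors sign-extend the compare result differently: SDAG selects
+; s_cselect -1/0 plus v_cndmask, while GlobalISel selects s_cselect 1/0
+; followed by an s_bfe_i32 sign extension.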
+
+define amdgpu_vs void @f32_olt(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_olt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_lt_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_olt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_lt_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp olt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_oeq(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_oeq:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_eq_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_oeq:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_eq_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp oeq float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_ole(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_ole:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_le_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_ole:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_le_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ole float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_ogt(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_ogt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_gt_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_ogt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_gt_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ogt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_one(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_one:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_lg_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_one:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_lg_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp one float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_oge(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_oge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ge_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_oge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ge_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp oge float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_ord(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_ord:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_o_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_ord:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_o_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ord float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_uno(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_uno:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_u_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_uno:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_u_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp uno float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_ult(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_ult:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nge_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_ult:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nge_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ult float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_ueq(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_ueq:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nlg_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_ueq:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nlg_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ueq float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_ule(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_ule:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ngt_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_ule:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ngt_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ule float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_ugt(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_ugt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nle_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_ugt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nle_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ugt float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_une(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_une:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_neq_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_une:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_neq_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp une float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f32_uge(ptr addrspace(1) inreg %out, float inreg %a, float inreg %b) {
+; SDAG-LABEL: f32_uge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nlt_f32 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f32_uge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nlt_f32 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp uge float %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_olt(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_olt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_lt_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_olt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_lt_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp olt half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_oeq(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_oeq:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_eq_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_oeq:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_eq_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp oeq half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_ole(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_ole:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_le_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_ole:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_le_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ole half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_ogt(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_ogt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_gt_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_ogt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_gt_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ogt half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_one(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_one:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_lg_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_one:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_lg_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp one half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_oge(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_oge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ge_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_oge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ge_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp oge half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_ord(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_ord:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_o_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_ord:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_o_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ord half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_uno(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_uno:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_u_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_uno:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_u_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp uno half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_ult(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_ult:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nge_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_ult:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nge_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ult half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_ueq(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_ueq:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nlg_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_ueq:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nlg_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ueq half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_ule(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_ule:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_ngt_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_ule:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_ngt_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ule half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_ugt(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_ugt:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nle_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_ugt:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nle_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp ugt half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_une(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_une:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_neq_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_une:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_neq_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp une half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_vs void @f16_uge(ptr addrspace(1) inreg %out, half inreg %a, half inreg %b) {
+; SDAG-LABEL: f16_uge:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_nlt_f16 s2, s3
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
+; SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: f16_uge:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_nlt_f16 s2, s3
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+entry:
+ %0 = fcmp uge half %a, %b
+ %1 = sext i1 %0 to i32
+ store i32 %1, ptr addrspace(1) %out
+ ret void
+}