[llvm] 94a7136 - [Hexagon] Implement [SU]INT_TO_FP and FP_TO_[SU]INT for HVX
Krzysztof Parzyszek via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 19 11:11:40 PDT 2022
Author: Krzysztof Parzyszek
Date: 2022-09-19T11:11:20-07:00
New Revision: 94a71361d6ada8a0e25817fe8ebe443092677e6c
URL: https://github.com/llvm/llvm-project/commit/94a71361d6ada8a0e25817fe8ebe443092677e6c
DIFF: https://github.com/llvm/llvm-project/commit/94a71361d6ada8a0e25817fe8ebe443092677e6c.diff
LOG: [Hexagon] Implement [SU]INT_TO_FP and FP_TO_[SU]INT for HVX
Added:
llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
Modified:
llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
llvm/lib/Target/Hexagon/HexagonISelLowering.h
llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
llvm/lib/Target/Hexagon/HexagonPatterns.td
llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index f3e12391935a..331e1cb19f36 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1899,6 +1899,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
case HexagonISD::VASL: return "HexagonISD::VASL";
case HexagonISD::VASR: return "HexagonISD::VASR";
case HexagonISD::VLSR: return "HexagonISD::VLSR";
+ case HexagonISD::SSAT: return "HexagonISD::SSAT";
+ case HexagonISD::USAT: return "HexagonISD::USAT";
case HexagonISD::VEXTRACTW: return "HexagonISD::VEXTRACTW";
case HexagonISD::VINSERTW0: return "HexagonISD::VINSERTW0";
case HexagonISD::VROR: return "HexagonISD::VROR";
@@ -3290,13 +3292,25 @@ HexagonTargetLowering::LowerOperationWrapper(SDNode *N,
return;
}
- // We are only custom-lowering stores to verify the alignment of the
- // address if it is a compile-time constant. Since a store can be modified
- // during type-legalization (the value being stored may need legalization),
- // return empty Results here to indicate that we don't really make any
- // changes in the custom lowering.
- if (N->getOpcode() != ISD::STORE)
- return TargetLowering::LowerOperationWrapper(N, Results, DAG);
+ SDValue Op(N, 0);
+ unsigned Opc = N->getOpcode();
+
+ switch (Opc) {
+ case HexagonISD::SSAT:
+ case HexagonISD::USAT:
+ Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG));
+ break;
+ case ISD::STORE:
+ // We are only custom-lowering stores to verify the alignment of the
+ // address if it is a compile-time constant. Since a store can be
+ // modified during type-legalization (the value being stored may need
+ // legalization), return empty Results here to indicate that we don't
+ // really make any changes in the custom lowering.
+ return;
+ default:
+ TargetLowering::LowerOperationWrapper(N, Results, DAG);
+ break;
+ }
}
void
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 7e776dc6f99b..59b6a40feb19 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -57,6 +57,8 @@ enum NodeType : unsigned {
VASR,
VLSR,
+ SSAT, // Signed saturate.
+ USAT, // Unsigned saturate.
TSTBIT,
INSERT,
EXTRACTU,
@@ -405,6 +407,9 @@ class HexagonTargetLowering : public TargetLowering {
TypePair typeSplit(MVT Ty) const;
MVT typeExtElem(MVT VecTy, unsigned Factor) const;
MVT typeTruncElem(MVT VecTy, unsigned Factor) const;
+ TypePair typeExtendToWider(MVT Ty0, MVT Ty1) const;
+ TypePair typeWidenToWider(MVT Ty0, MVT Ty1) const;
+ MVT typeLegalize(MVT Ty, SelectionDAG &DAG) const;
SDValue opJoin(const VectorPair &Ops, const SDLoc &dl,
SelectionDAG &DAG) const;
@@ -453,6 +458,12 @@ class HexagonTargetLowering : public TargetLowering {
bool ZeroExt, SelectionDAG &DAG) const;
SDValue compressHvxPred(SDValue VecQ, const SDLoc &dl, MVT ResTy,
SelectionDAG &DAG) const;
+ SDValue resizeToWidth(SDValue VecV, MVT ResTy, bool Signed, const SDLoc &dl,
+ SelectionDAG &DAG) const;
+ VectorPair emitHvxAddWithOverflow(SDValue A, SDValue B, const SDLoc &dl,
+ bool Signed, SelectionDAG &DAG) const;
+ VectorPair emitHvxShiftRightRnd(SDValue Val, unsigned Amt, bool Signed,
+ SelectionDAG &DAG) const;
SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) const;
@@ -474,7 +485,10 @@ class HexagonTargetLowering : public TargetLowering {
SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxFpExtend(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxFpToInt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
VectorPair SplitVectorOp(SDValue Op, SelectionDAG &DAG) const;
@@ -484,11 +498,15 @@ class HexagonTargetLowering : public TargetLowering {
SDValue WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
SDValue WidenHvxExtend(SDValue Op, SelectionDAG &DAG) const;
SDValue WidenHvxTruncate(SDValue Op, SelectionDAG &DAG) const;
+ SDValue WidenHvxFpIntConv(SDValue Op, SelectionDAG &DAG) const;
+ SDValue ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG) const;
+ SDValue EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG) const;
std::pair<const TargetRegisterClass*, uint8_t>
findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT)
const override;
+ bool shouldSplitToHvx(MVT Ty, SelectionDAG &DAG) const;
bool shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const;
bool isHvxOperation(SDNode *N, SelectionDAG &DAG) const;
SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 1729bf0185a1..10dc9f618790 100755
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -24,6 +24,22 @@ static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
+static std::tuple<unsigned, unsigned, unsigned> getIEEEProperties(MVT Ty) {
+ // For a float scalar type, return (exp-bits, exp-bias, fraction-bits)
+ MVT ElemTy = Ty.getScalarType();
+ switch (ElemTy.SimpleTy) {
+ case MVT::f16:
+ return std::make_tuple(5, 15, 10);
+ case MVT::f32:
+ return std::make_tuple(8, 127, 23);
+ case MVT::f64:
+ return std::make_tuple(11, 1023, 52);
+ default:
+ break;
+ }
+ llvm_unreachable(("Unexpected type: " + EVT(ElemTy).getEVTString()).c_str());
+}
+
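For reference, the tuple for f32 is (8, 127, 23). A minimal standalone C sketch (illustrative only, not part of the patch) of how those widths carve up an encoded value:

  #include <stdint.h>
  #include <string.h>

  /* Decompose an f32 using ExpWidth=8, ExpBias=127, FracWidth=23. */
  static void split_f32(float x, uint32_t *sign, int32_t *exp, uint32_t *frac) {
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
    *sign = bits >> 31;                            /* 1 sign bit              */
    *exp  = (int32_t)((bits >> 23) & 0xFF) - 127;  /* 8 exp bits, minus bias  */
    *frac = bits & 0x7FFFFF;                       /* 23 fraction bits        */
  }
  /* split_f32(1.0f, ...) (encoding 0x3F800000) gives sign 0, exp 0, frac 0. */
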
void
HexagonTargetLowering::initializeHVXLowering() {
if (Subtarget.useHVX64BOps()) {
@@ -214,12 +230,8 @@ HexagonTargetLowering::initializeHVXLowering() {
setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
}
- if (Subtarget.useHVXQFloatOps()) {
- setOperationAction(ISD::SINT_TO_FP, T, Expand);
- setOperationAction(ISD::UINT_TO_FP, T, Expand);
- setOperationAction(ISD::FP_TO_SINT, T, Expand);
- setOperationAction(ISD::FP_TO_UINT, T, Expand);
- } else if (Subtarget.useHVXIEEEFPOps()) {
+ if (Subtarget.useHVXFloatingPoint()) {
+ // Same action for both QFloat and IEEE.
setOperationAction(ISD::SINT_TO_FP, T, Custom);
setOperationAction(ISD::UINT_TO_FP, T, Custom);
setOperationAction(ISD::FP_TO_SINT, T, Custom);
@@ -289,10 +301,13 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::UMAX, T, Custom);
}
- setOperationAction(ISD::SINT_TO_FP, T, Custom);
- setOperationAction(ISD::UINT_TO_FP, T, Custom);
- setOperationAction(ISD::FP_TO_SINT, T, Custom);
- setOperationAction(ISD::FP_TO_UINT, T, Custom);
+ if (Subtarget.useHVXFloatingPoint()) {
+ // Same action for both QFloat and IEEE.
+ setOperationAction(ISD::SINT_TO_FP, T, Custom);
+ setOperationAction(ISD::UINT_TO_FP, T, Custom);
+ setOperationAction(ISD::FP_TO_SINT, T, Custom);
+ setOperationAction(ISD::FP_TO_UINT, T, Custom);
+ }
}
setCondCodeAction(ISD::SETNE, MVT::v64f16, Expand);
@@ -380,6 +395,12 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::ANY_EXTEND, VecTy, Custom);
setOperationAction(ISD::SIGN_EXTEND, VecTy, Custom);
setOperationAction(ISD::ZERO_EXTEND, VecTy, Custom);
+ if (Subtarget.useHVXFloatingPoint()) {
+ setOperationAction(ISD::FP_TO_SINT, VecTy, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VecTy, Custom);
+ setOperationAction(ISD::SINT_TO_FP, VecTy, Custom);
+ setOperationAction(ISD::UINT_TO_FP, VecTy, Custom);
+ }
MVT BoolTy = MVT::getVectorVT(MVT::i1, N);
if (!isTypeLegal(BoolTy))
@@ -419,10 +440,13 @@ HexagonTargetLowering::getPreferredHvxVectorAction(MVT VecTy) const {
// any scientific way.
if (llvm::is_contained(Tys, ElemTy)) {
unsigned VecWidth = VecTy.getSizeInBits();
+ unsigned HwWidth = 8*HwLen;
+ if (VecWidth > 2*HwWidth)
+ return TargetLoweringBase::TypeSplitVector;
+
bool HaveThreshold = HvxWidenThreshold.getNumOccurrences() > 0;
if (HaveThreshold && 8*HvxWidenThreshold <= VecWidth)
return TargetLoweringBase::TypeWidenVector;
- unsigned HwWidth = 8*HwLen;
if (VecWidth >= HwWidth/2 && VecWidth < HwWidth)
return TargetLoweringBase::TypeWidenVector;
}
@@ -1404,6 +1428,34 @@ HexagonTargetLowering::compressHvxPred(SDValue VecQ, const SDLoc &dl,
return DAG.getBitcast(ResTy, Collect);
}
+SDValue
+HexagonTargetLowering::resizeToWidth(SDValue VecV, MVT ResTy, bool Signed,
+ const SDLoc &dl, SelectionDAG &DAG) const {
+ // Take a vector and resize the element type to match the given type.
+ MVT InpTy = ty(VecV);
+ if (InpTy == ResTy)
+ return VecV;
+
+ unsigned InpWidth = InpTy.getSizeInBits();
+ unsigned ResWidth = ResTy.getSizeInBits();
+
+ if (InpTy.isFloatingPoint()) {
+ return InpWidth < ResWidth ? DAG.getNode(ISD::FP_EXTEND, dl, ResTy, VecV)
+ : DAG.getNode(ISD::FP_ROUND, dl, ResTy, VecV,
+ getZero(dl, MVT::i32, DAG));
+ }
+
+ assert(InpTy.isInteger());
+
+ if (InpWidth < ResWidth) {
+ unsigned ExtOpc = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, dl, ResTy, VecV);
+ } else {
+ unsigned NarOpc = Signed ? HexagonISD::SSAT : HexagonISD::USAT;
+ return DAG.getNode(NarOpc, dl, ResTy, VecV, DAG.getValueType(ResTy));
+ }
+}
+
SDValue
HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
const {
@@ -1488,7 +1540,7 @@ HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG)
SDValue V = Elems[i];
MVT Ty = ty(V);
if (!isTypeLegal(Ty)) {
- EVT NTy = getTypeToTransformTo(*DAG.getContext(), Ty);
+ MVT NTy = typeLegalize(Ty, DAG);
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
Elems[i] = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NTy,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NTy,
@@ -1934,7 +1986,7 @@ HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const {
SDValue
HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const {
- const SDLoc &dl(Op);
+ const SDLoc &dl(Op);
MVT ResTy = ty(Op);
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2026,7 +2078,8 @@ HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const {
SDValue HexagonTargetLowering::LowerHvxFpExtend(SDValue Op,
SelectionDAG &DAG) const {
- // This conversion only applies to QFloat.
+ // This conversion only applies to QFloat. IEEE extension from f16 to f32
+ // is legal (done via a pattern).
assert(Subtarget.useHVXQFloatOps());
assert(Op->getOpcode() == ISD::FP_EXTEND);
@@ -2060,42 +2113,429 @@ SDValue HexagonTargetLowering::LowerHvxFpExtend(SDValue Op,
}
SDValue
-HexagonTargetLowering::LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG)
- const {
- // This conversion only applies to IEEE.
- assert(Subtarget.useHVXIEEEFPOps());
+HexagonTargetLowering::LowerHvxFpToInt(SDValue Op, SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ // Catch invalid conversion ops (just in case).
+ assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT);
+ MVT ResTy = ty(Op);
+ MVT FpTy = ty(Op.getOperand(0)).getVectorElementType();
+ MVT IntTy = ResTy.getVectorElementType();
+
+ if (Subtarget.useHVXIEEEFPOps()) {
+ // There are only conversions from f16.
+ if (FpTy == MVT::f16) {
+ // Other int types aren't legal in HVX, so we shouldn't see them here.
+ assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32);
+ // Conversions to i8 and i16 are legal.
+ if (IntTy == MVT::i8 || IntTy == MVT::i16)
+ return Op;
+ }
+ }
+
+ if (IntTy.getSizeInBits() != FpTy.getSizeInBits())
+ return EqualizeFpIntConversion(Op, DAG);
+
+ return ExpandHvxFpToInt(Op, DAG);
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
// Catch invalid conversion ops (just in case).
+ assert(Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP);
+
+ MVT ResTy = ty(Op);
+ MVT IntTy = ty(Op.getOperand(0)).getVectorElementType();
+ MVT FpTy = ResTy.getVectorElementType();
+
+ if (Subtarget.useHVXIEEEFPOps()) {
+ // There are only conversions to f16.
+ if (FpTy == MVT::f16) {
+ // Other int types aren't legal in HVX, so we shouldn't see them here.
+ assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32);
+ // i8, i16 -> f16 is legal.
+ if (IntTy == MVT::i8 || IntTy == MVT::i16)
+ return Op;
+ }
+ }
+
+ if (IntTy.getSizeInBits() != FpTy.getSizeInBits())
+ return EqualizeFpIntConversion(Op, DAG);
+
+ return ExpandHvxIntToFp(Op, DAG);
+}
+
+HexagonTargetLowering::TypePair
+HexagonTargetLowering::typeExtendToWider(MVT Ty0, MVT Ty1) const {
+ // Compare the widths of elements of the two types, and extend the narrower
+ // type to match the width of the wider type. For vector types, apply this
+ // to the element type.
+ assert(Ty0.isVector() == Ty1.isVector());
+
+ MVT ElemTy0 = Ty0.getScalarType();
+ MVT ElemTy1 = Ty1.getScalarType();
+
+ unsigned Width0 = ElemTy0.getSizeInBits();
+ unsigned Width1 = ElemTy1.getSizeInBits();
+ unsigned MaxWidth = std::max(Width0, Width1);
+
+ auto getScalarWithWidth = [](MVT ScalarTy, unsigned Width) {
+ if (ScalarTy.isInteger())
+ return MVT::getIntegerVT(Width);
+ assert(ScalarTy.isFloatingPoint());
+ return MVT::getFloatingPointVT(Width);
+ };
+
+ MVT WideETy0 = getScalarWithWidth(ElemTy0, MaxWidth);
+ MVT WideETy1 = getScalarWithWidth(ElemTy1, MaxWidth);
+
+ if (!Ty0.isVector()) {
+ // Both types are scalars.
+ return {WideETy0, WideETy1};
+ }
+
+ // Vector types.
+ unsigned NumElem = Ty0.getVectorNumElements();
+ assert(NumElem == Ty1.getVectorNumElements());
+
+ return {MVT::getVectorVT(WideETy0, NumElem),
+ MVT::getVectorVT(WideETy1, NumElem)};
+}
+
+HexagonTargetLowering::TypePair
+HexagonTargetLowering::typeWidenToWider(MVT Ty0, MVT Ty1) const {
+ // Compare the numbers of elements of two vector types, and widen the
+ // narrower one to match the number of elements in the wider one.
+ assert(Ty0.isVector() && Ty1.isVector());
+
+ unsigned Len0 = Ty0.getVectorNumElements();
+ unsigned Len1 = Ty1.getVectorNumElements();
+ if (Len0 == Len1)
+ return {Ty0, Ty1};
+
+ unsigned MaxLen = std::max(Len0, Len1);
+ return {MVT::getVectorVT(Ty0.getVectorElementType(), MaxLen),
+ MVT::getVectorVT(Ty1.getVectorElementType(), MaxLen)};
+}
+
+MVT
+HexagonTargetLowering::typeLegalize(MVT Ty, SelectionDAG &DAG) const {
+ EVT LegalTy = getTypeToTransformTo(*DAG.getContext(), Ty);
+ assert(LegalTy.isSimple());
+ return LegalTy.getSimpleVT();
+}
+
+HexagonTargetLowering::VectorPair
+HexagonTargetLowering::emitHvxAddWithOverflow(SDValue A, SDValue B,
+ const SDLoc &dl, bool Signed, SelectionDAG &DAG) const {
+ // Compute A+B, return {A+B, O}, where O = vector predicate indicating
+ // whether an overflow has occurred.
+ MVT ResTy = ty(A);
+ assert(ResTy == ty(B));
+ MVT PredTy = MVT::getVectorVT(MVT::i1, ResTy.getVectorNumElements());
+
+ if (!Signed) {
+ // V62+ has V6_vaddcarry, but it requires input predicate, so it doesn't
+ // save any instructions.
+ SDValue Add = DAG.getNode(ISD::ADD, dl, ResTy, {A, B});
+ SDValue Ovf = DAG.getSetCC(dl, PredTy, Add, A, ISD::SETULT);
+ return {Add, Ovf};
+ }
+
+ // Signed overflow has happened, if:
+ // (A, B have the same sign) and (A+B has a different sign from either)
+ // i.e. (~A xor B) & ((A+B) xor B), then check the sign bit
+ SDValue Add = DAG.getNode(ISD::ADD, dl, ResTy, {A, B});
+ SDValue NotA =
+ DAG.getNode(ISD::XOR, dl, ResTy, {A, DAG.getConstant(-1, dl, ResTy)});
+ SDValue Xor0 = DAG.getNode(ISD::XOR, dl, ResTy, {NotA, B});
+ SDValue Xor1 = DAG.getNode(ISD::XOR, dl, ResTy, {Add, B});
+ SDValue And = DAG.getNode(ISD::AND, dl, ResTy, {Xor0, Xor1});
+ SDValue MSB =
+ DAG.getSetCC(dl, PredTy, And, getZero(dl, ResTy, DAG), ISD::SETLT);
+ return {Add, MSB};
+}
+
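A scalar model of the signed-overflow test above (illustrative only; the DAG code builds the same expression from vector XOR/AND/SETCC nodes):

  #include <stdint.h>

  static int adds_overflows(int32_t a, int32_t b) {
    uint32_t ua = (uint32_t)a, ub = (uint32_t)b;
    uint32_t sum  = ua + ub;            /* wrap-around add, like the vector ADD       */
    uint32_t same = ~(ua ^ ub);         /* MSB set iff a and b have the same sign     */
    uint32_t flip = sum ^ ub;           /* MSB set iff the sum's sign differs from b  */
    return (int32_t)(same & flip) < 0;  /* check the sign bit */
  }
  /* adds_overflows(INT32_MAX, 1) == 1, adds_overflows(1, 1) == 0. */
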
+HexagonTargetLowering::VectorPair
+HexagonTargetLowering::emitHvxShiftRightRnd(SDValue Val, unsigned Amt,
+ bool Signed, SelectionDAG &DAG) const {
+ // Shift Val right by Amt bits, round the result to the nearest integer,
+ // tie-break by rounding halves to even integer.
+
+ const SDLoc &dl(Val);
+ MVT ValTy = ty(Val);
+
+ // This should also work for signed integers.
+ //
+ // uint tmp0 = inp + ((1 << (Amt-1)) - 1);
+ // bool ovf = (inp > tmp0);
+ // uint rup = (inp >> Amt) & 1;  // LSB of the unrounded result (for ties)
+ //
+ // uint tmp1 = inp >> (Amt-1); // tmp1 == tmp2 iff
+ // uint tmp2 = tmp0 >> (Amt-1); // the Amt-1 lower bits were all 0
+ // uint tmp3 = tmp2 + rup;
+ // uint frac = (tmp1 != tmp2) ? tmp2 >> 1 : tmp3 >> 1;
+ unsigned ElemWidth = ValTy.getVectorElementType().getSizeInBits();
+ MVT ElemTy = MVT::getIntegerVT(ElemWidth);
+ MVT IntTy = tyVector(ValTy, ElemTy);
+ MVT PredTy = MVT::getVectorVT(MVT::i1, IntTy.getVectorNumElements());
+ unsigned ShRight = Signed ? ISD::SRA : ISD::SRL;
+
+ SDValue Inp = DAG.getBitcast(IntTy, Val);
+ SDValue LowBits = DAG.getConstant((1u << (Amt - 1)) - 1, dl, IntTy);
+
+ SDValue AmtP1 = DAG.getConstant(1u << Amt, dl, IntTy);
+ SDValue And = DAG.getNode(ISD::AND, dl, IntTy, {Inp, AmtP1});
+ SDValue Zero = getZero(dl, IntTy, DAG);
+ SDValue Bit = DAG.getSetCC(dl, PredTy, And, Zero, ISD::SETNE);
+ SDValue Rup = DAG.getZExtOrTrunc(Bit, dl, IntTy);
+ auto [Tmp0, Ovf] = emitHvxAddWithOverflow(Inp, LowBits, dl, Signed, DAG);
+
+ SDValue AmtM1 = DAG.getConstant(Amt - 1, dl, IntTy);
+ SDValue Tmp1 = DAG.getNode(ShRight, dl, IntTy, Inp, AmtM1);
+ SDValue Tmp2 = DAG.getNode(ShRight, dl, IntTy, Tmp0, AmtM1);
+ SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, IntTy, Tmp2, Rup);
+
+ SDValue Eq = DAG.getSetCC(dl, PredTy, Tmp1, Tmp2, ISD::SETEQ);
+ SDValue One = DAG.getConstant(1, dl, IntTy);
+ SDValue Tmp4 = DAG.getNode(ShRight, dl, IntTy, {Tmp2, One});
+ SDValue Tmp5 = DAG.getNode(ShRight, dl, IntTy, {Tmp3, One});
+ SDValue Mux = DAG.getNode(ISD::VSELECT, dl, IntTy, {Eq, Tmp5, Tmp4});
+ return {Mux, Ovf};
+}
+
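A scalar model of the rounding scheme above for unsigned 32-bit lanes (illustrative; assumes 1 <= amt < 32, which holds for the use further down where the amount is ExpWidth + 1):

  #include <stdint.h>

  static uint32_t shr_round_even(uint32_t inp, unsigned amt, int *ovf) {
    uint32_t low  = (1u << (amt - 1)) - 1;  /* just under half an output ULP     */
    uint32_t tmp0 = inp + low;
    *ovf = tmp0 < inp;                      /* carry out of the rounding add     */
    uint32_t rup  = (inp >> amt) & 1;       /* LSB of the unrounded result       */
    uint32_t tmp1 = inp  >> (amt - 1);
    uint32_t tmp2 = tmp0 >> (amt - 1);      /* tmp1 == tmp2 iff low bits were 0  */
    uint32_t tmp3 = tmp2 + rup;
    return tmp1 != tmp2 ? tmp2 >> 1 : tmp3 >> 1;
  }
  /* shr_round_even(22, 2, &o) == 6 and shr_round_even(18, 2, &o) == 4: both are
     exact halves (5.5 and 4.5) and round to the nearest even integer. */
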
+SDValue
+HexagonTargetLowering::EqualizeFpIntConversion(SDValue Op, SelectionDAG &DAG)
+ const {
+ // Rewrite conversion between integer and floating-point in such a way that
+ // the integer type is extended/narrowed to match the bitwidth of the
+ // floating-point type, combined with additional integer-integer extensions
+ // or narrowings to match the original input/result types.
+ // E.g. f32 -> i8 ==> f32 -> i32 -> i8
+ //
+ // The input/result types are not required to be legal, but if they are
+ // legal, this function should not introduce illegal types.
+
+ unsigned Opc = Op.getOpcode();
assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT ||
Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP);
+
+ SDValue Inp = Op.getOperand(0);
+ MVT InpTy = ty(Inp);
MVT ResTy = ty(Op);
- if (Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT) {
- MVT FpTy = ty(Op.getOperand(0)).getVectorElementType();
- // There are only conversions of f16.
- if (FpTy != MVT::f16)
- return SDValue();
-
- MVT IntTy = ResTy.getVectorElementType();
- // Other int types aren't legal in HVX, so we shouldn't see them here.
- assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32);
- // Conversions to i8 and i16 are legal.
- if (IntTy == MVT::i8 || IntTy == MVT::i16)
- return Op;
+ if (InpTy == ResTy)
+ return Op;
+
+ const SDLoc &dl(Op);
+ bool Signed = Opc == ISD::FP_TO_SINT || Opc == ISD::SINT_TO_FP;
+
+ auto [WInpTy, WResTy] = typeExtendToWider(InpTy, ResTy);
+ SDValue WInp = resizeToWidth(Inp, WInpTy, Signed, dl, DAG);
+ SDValue Conv = DAG.getNode(Opc, dl, WResTy, WInp);
+ SDValue Res = resizeToWidth(Conv, ResTy, Signed, dl, DAG);
+ return Res;
+}
+
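As a concrete illustration (assuming a v64f32 source and a v64i8 result): fptosi v64f32 -> v64i8 becomes fptosi v64f32 -> v64i32 followed by a single HexagonISD::SSAT v64i32 -> v64i8; that SSAT is later broken into factor-of-two steps by ExpandHvxResizeIntoSteps below.
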
+SDValue
+HexagonTargetLowering::ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT);
+
+ const SDLoc &dl(Op);
+ SDValue Op0 = Op.getOperand(0);
+ MVT InpTy = ty(Op0);
+ MVT ResTy = ty(Op);
+ assert(InpTy.changeTypeToInteger() == ResTy);
+
+ // int32_t conv_f32_to_i32(uint32_t inp) {
+ // // s | exp8 | frac23
+ //
+ // int neg = (int32_t)inp < 0;
+ //
+ // // "expm1" is the actual exponent minus 1: instead of "bias", subtract
+ // // "bias+1". When the encoded exp is "all-1" (i.e. inf/nan), this will
+ // // produce a large positive "expm1", which will result in max u/int.
+ // // In all IEEE formats, bias is the largest positive number that can be
+ // // represented in bias-width bits (i.e. 011..1).
+ // int32_t expm1 = (inp << 1) - 0x80000000;
+ // expm1 >>= 24;
+ //
+ // // Always insert the "implicit 1". Subnormal numbers will become 0
+ // // regardless.
+ // uint32_t frac = (inp << 8) | 0x80000000;
+ //
+ // // "frac" is the fraction part represented as Q1.31. If it was
+ // // interpreted as uint32_t, it would be the fraction part multiplied
+ // // by 2^31.
+ //
+ // // Calculate the amount of right shift, since shifting further to the
+ // // left would lose significant bits. Limit it to 32, because we want
+ // // shifts by 32+ to produce 0, whereas V6_vlsrwv treats the shift
+ // // amount as a 6-bit signed value (so 33 is same as -31, i.e. shift
+ // // left by 31). "rsh" can be negative.
+ // int32_t rsh = min(31 - (expm1 + 1), 32);
+ //
+ // frac >>= rsh; // rsh == 32 will produce 0
+ //
+ // // Everything up to this point is the same for conversion to signed
+ // // unsigned integer.
+ //
+ // if (neg) // Only for signed int
+ // frac = -frac; //
+ // if (rsh <= 0 && neg) // bound = neg ? 0x80000000 : 0x7fffffff
+ // frac = 0x80000000; // frac = rsh <= 0 ? bound : frac
+ // if (rsh <= 0 && !neg) //
+ // frac = 0x7fffffff; //
+ //
+ // if (neg) // Only for unsigned int
+ // frac = 0; //
+ // if (rsh < 0 && !neg) // frac = rsh < 0 ? 0x7fffffff : frac;
+ // frac = 0x7fffffff; // frac = neg ? 0 : frac;
+ //
+ // return frac;
+ // }
+
+ MVT PredTy = MVT::getVectorVT(MVT::i1, ResTy.getVectorElementCount());
+
+ // Zero = V6_vd0();
+ // Neg = V6_vgtw(Zero, Inp);
+ // One = V6_lvsplatw(1);
+ // M80 = V6_lvsplatw(0x80000000);
+ // Exp00 = V6_vaslwv(Inp, One);
+ // Exp01 = V6_vsubw(Exp00, M80);
+ // ExpM1 = V6_vasrw(Exp01, 24);
+ // Frc00 = V6_vaslw(Inp, 8);
+ // Frc01 = V6_vor(Frc00, M80);
+ // Rsh00 = V6_vsubw(V6_lvsplatw(30), ExpM1);
+ // Rsh01 = V6_vminw(Rsh00, V6_lvsplatw(32));
+ // Frc02 = V6_vlsrwv(Frc01, Rsh01);
+
+ // if signed int:
+ // Bnd = V6_vmux(Neg, M80, V6_lvsplatw(0x7fffffff))
+ // Pos = V6_vgtw(Rsh01, Zero);
+ // Frc13 = V6_vsubw(Zero, Frc02);
+ // Frc14 = V6_vmux(Neg, Frc13, Frc02);
+ // Int = V6_vmux(Pos, Frc14, Bnd);
+ //
+ // if unsigned int:
+ // Rsn = V6_vgtw(Zero, Rsh01)
+ // Frc23 = V6_vmux(Rsn, V6_lvsplatw(0x7fffffff), Frc02)
+ // Int = V6_vmux(Neg, Zero, Frc23)
+
+ auto [ExpWidth, ExpBias, FracWidth] = getIEEEProperties(InpTy);
+ unsigned ElemWidth = 1 + ExpWidth + FracWidth;
+ assert(1u << (ExpWidth - 1) == 1 + ExpBias);
+
+ SDValue Inp = DAG.getBitcast(ResTy, Op0);
+ SDValue Zero = getZero(dl, ResTy, DAG);
+ SDValue Neg = DAG.getSetCC(dl, PredTy, Inp, Zero, ISD::SETLT);
+ SDValue M80 = DAG.getConstant(1u << (ElemWidth - 1), dl, ResTy);
+ SDValue M7F = DAG.getConstant((1u << (ElemWidth - 1)) - 1, dl, ResTy);
+ SDValue One = DAG.getConstant(1, dl, ResTy);
+ SDValue Exp00 = DAG.getNode(ISD::SHL, dl, ResTy, {Inp, One});
+ SDValue Exp01 = DAG.getNode(ISD::SUB, dl, ResTy, {Exp00, M80});
+ SDValue MNE = DAG.getConstant(ElemWidth - ExpWidth, dl, ResTy);
+ SDValue ExpM1 = DAG.getNode(ISD::SRA, dl, ResTy, {Exp01, MNE});
+
+ SDValue ExpW = DAG.getConstant(ExpWidth, dl, ResTy);
+ SDValue Frc00 = DAG.getNode(ISD::SHL, dl, ResTy, {Inp, ExpW});
+ SDValue Frc01 = DAG.getNode(ISD::OR, dl, ResTy, {Frc00, M80});
+
+ SDValue MN2 = DAG.getConstant(ElemWidth - 2, dl, ResTy);
+ SDValue Rsh00 = DAG.getNode(ISD::SUB, dl, ResTy, {MN2, ExpM1});
+ SDValue MW = DAG.getConstant(ElemWidth, dl, ResTy);
+ SDValue Rsh01 = DAG.getNode(ISD::SMIN, dl, ResTy, {Rsh00, MW});
+ SDValue Frc02 = DAG.getNode(ISD::SRL, dl, ResTy, {Frc01, Rsh01});
+
+ SDValue Int;
+
+ if (Opc == ISD::FP_TO_SINT) {
+ SDValue Bnd = DAG.getNode(ISD::VSELECT, dl, ResTy, {Neg, M80, M7F});
+ SDValue Pos = DAG.getSetCC(dl, PredTy, Rsh01, Zero, ISD::SETGT);
+ SDValue Frc13 = DAG.getNode(ISD::SUB, dl, ResTy, {Zero, Frc02});
+ SDValue Frc14 = DAG.getNode(ISD::VSELECT, dl, ResTy, {Neg, Frc13, Frc02});
+ Int = DAG.getNode(ISD::VSELECT, dl, ResTy, {Pos, Frc14, Bnd});
} else {
- // Converting int -> fp.
- if (ResTy.getVectorElementType() != MVT::f16)
- return SDValue();
- MVT IntTy = ty(Op.getOperand(0)).getVectorElementType();
- // Other int types aren't legal in HVX, so we shouldn't see them here.
- assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32);
- // i8, i16 -> f16 is legal.
- if (IntTy == MVT::i8 || IntTy == MVT::i16)
- return Op;
+ assert(Opc == ISD::FP_TO_UINT);
+ SDValue Rsn = DAG.getSetCC(dl, PredTy, Rsh01, Zero, ISD::SETLT);
+ SDValue Frc23 = DAG.getNode(ISD::VSELECT, dl, ResTy, Rsn, M7F, Frc02);
+ Int = DAG.getNode(ISD::VSELECT, dl, ResTy, Neg, Zero, Frc23);
}
- return SDValue();
+ return Int;
+}
+
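A worked example of the reference routine in the comment above (illustrative), for inp = 0x40400000, i.e. 3.0f:

  neg   = 0
  expm1 = ((0x40400000 << 1) - 0x80000000) >> 24 = 0x00800000 >> 24 = 0
  frac  = (0x40400000 << 8) | 0x80000000 = 0xC0000000      (1.5 in Q1.31)
  rsh   = min(31 - (0 + 1), 32) = 30
  frac >>= 30                               -> 3
  rsh > 0 and !neg, so the result is 3.
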
+SDValue
+HexagonTargetLowering::ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ assert(Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP);
+
+ const SDLoc &dl(Op);
+ SDValue Op0 = Op.getOperand(0);
+ MVT InpTy = ty(Op0);
+ MVT ResTy = ty(Op);
+ assert(ResTy.changeTypeToInteger() == InpTy);
+
+ // uint32_t vnoc1_rnd(int32_t w) {
+ // int32_t iszero = w == 0;
+ // int32_t isneg = w < 0;
+ // uint32_t u = __builtin_HEXAGON_A2_abs(w);
+ //
+ // uint32_t norm_left = __builtin_HEXAGON_S2_cl0(u) + 1;
+ // uint32_t frac0 = (uint64_t)u << norm_left;
+ //
+ // // Rounding:
+ // uint32_t frac1 = frac0 + ((1 << 8) - 1);
+ // uint32_t renorm = (frac0 > frac1);
+ // uint32_t rup = (int)(frac0 << 22) < 0;
+ //
+ // uint32_t frac2 = frac0 >> 8;
+ // uint32_t frac3 = frac1 >> 8;
+ // uint32_t frac = (frac2 != frac3) ? frac3 >> 1 : (frac3 + rup) >> 1;
+ //
+ // int32_t exp = 32 - norm_left + renorm + 127;
+ // exp <<= 23;
+ //
+ // uint32_t sign = 0x80000000 * isneg;
+ // uint32_t f = sign | exp | frac;
+ // return iszero ? 0 : f;
+ // }
+
+ MVT PredTy = MVT::getVectorVT(MVT::i1, InpTy.getVectorElementCount());
+ bool Signed = Opc == ISD::SINT_TO_FP;
+
+ auto [ExpWidth, ExpBias, FracWidth] = getIEEEProperties(ResTy);
+ unsigned ElemWidth = 1 + ExpWidth + FracWidth;
+
+ SDValue Zero = getZero(dl, InpTy, DAG);
+ SDValue One = DAG.getConstant(1, dl, InpTy);
+ SDValue IsZero = DAG.getSetCC(dl, PredTy, Op0, Zero, ISD::SETEQ);
+ SDValue Abs = Signed ? DAG.getNode(ISD::ABS, dl, InpTy, Op0) : Op0;
+ SDValue Clz = DAG.getNode(ISD::CTLZ, dl, InpTy, Abs);
+ SDValue NLeft = DAG.getNode(ISD::ADD, dl, InpTy, {Clz, One});
+ SDValue Frac0 = DAG.getNode(ISD::SHL, dl, InpTy, {Abs, NLeft});
+
+ auto [Frac, Ovf] = emitHvxShiftRightRnd(Frac0, ExpWidth + 1, false, DAG);
+ if (Signed) {
+ SDValue IsNeg = DAG.getSetCC(dl, PredTy, Op0, Zero, ISD::SETLT);
+ SDValue M80 = DAG.getConstant(1 << (ElemWidth - 1), dl, InpTy);
+ SDValue Sign = DAG.getNode(ISD::VSELECT, dl, InpTy, {IsNeg, M80, Zero});
+ Frac = DAG.getNode(ISD::OR, dl, InpTy, {Sign, Frac});
+ }
+
+ SDValue Rnrm = DAG.getZExtOrTrunc(Ovf, dl, InpTy);
+ SDValue Exp0 = DAG.getConstant(ElemWidth + ExpBias, dl, InpTy);
+ SDValue Exp1 = DAG.getNode(ISD::ADD, dl, InpTy, {Rnrm, Exp0});
+ SDValue Exp2 = DAG.getNode(ISD::SUB, dl, InpTy, {Exp1, NLeft});
+ SDValue Exp3 = DAG.getNode(ISD::SHL, dl, InpTy,
+ {Exp2, DAG.getConstant(FracWidth, dl, InpTy)});
+ SDValue Flt0 = DAG.getNode(ISD::OR, dl, InpTy, {Frac, Exp3});
+ SDValue Flt1 = DAG.getNode(ISD::VSELECT, dl, InpTy, {IsZero, Zero, Flt0});
+ SDValue Flt = DAG.getBitcast(ResTy, Flt1);
+
+ return Flt;
}
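A worked example of the reference routine above (illustrative), for w = 3:

  iszero = 0, isneg = 0, u = 3
  norm_left = cl0(3) + 1 = 31
  frac0 = (uint64_t)3 << 31, truncated to 32 bits = 0x80000000
  frac1 = 0x800000FF, renorm = 0, rup = 0
  frac2 = frac3 = 0x00800000, so frac = (frac3 + rup) >> 1 = 0x00400000
  exp = 32 - 31 + 0 + 127 = 128;  exp << 23 = 0x40000000
  f = 0 | 0x40000000 | 0x00400000 = 0x40400000 = 3.0f
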
HexagonTargetLowering::VectorPair
@@ -2116,6 +2556,8 @@ HexagonTargetLowering::SplitVectorOp(SDValue Op, SelectionDAG &DAG) const {
// Special case for type operand.
switch (Op.getOpcode()) {
case ISD::SIGN_EXTEND_INREG:
+ case HexagonISD::SSAT:
+ case HexagonISD::USAT:
if (const auto *N = dyn_cast<const VTSDNode>(A.getNode()))
std::tie(Lo, Hi) = SplitVTNode(N);
break;
@@ -2298,7 +2740,7 @@ HexagonTargetLowering::WidenHvxSetCC(SDValue Op, SelectionDAG &DAG) const {
SDValue SetCC = DAG.getNode(ISD::SETCC, dl, ResTy,
{WideOp0, WideOp1, Op.getOperand(2)});
- EVT RetTy = getTypeToTransformTo(*DAG.getContext(), ty(Op));
+ EVT RetTy = typeLegalize(ty(Op), DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RetTy,
{SetCC, getZero(dl, MVT::i32, DAG)});
}
@@ -2472,9 +2914,9 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::LOAD: return SDValue();
case ISD::FP_EXTEND: return LowerHvxFpExtend(Op, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT:
+ case ISD::FP_TO_UINT: return LowerHvxFpToInt(Op, DAG);
case ISD::SINT_TO_FP:
- case ISD::UINT_TO_FP: return LowerHvxConvertFpInt(Op, DAG);
+ case ISD::UINT_TO_FP: return LowerHvxIntToFp(Op, DAG);
}
#ifndef NDEBUG
Op.dumpr(&DAG);
@@ -2482,29 +2924,96 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Unhandled HVX operation");
}
+SDValue
+HexagonTargetLowering::ExpandHvxResizeIntoSteps(SDValue Op, SelectionDAG &DAG)
+ const {
+ // Rewrite the extension/truncation/saturation op into steps where each
+ // step changes the type widths by a factor of 2.
+ // E.g. i8 -> i16 remains unchanged, but i8 -> i32 ==> i8 -> i16 -> i32.
+ //
+ // Some of the vector types in Op may not be legal.
+
+ bool NeedVT = false;
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case HexagonISD::SSAT:
+ case HexagonISD::USAT:
+ NeedVT = true;
+ [[fallthrough]];
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::TRUNCATE:
+ break;
+#ifndef NDEBUG
+ Op.dump(&DAG);
+#endif
+ llvm_unreachable("Unexpected operation");
+ }
+
+ SDValue Inp = Op.getOperand(0);
+ MVT InpTy = ty(Inp);
+ MVT ResTy = ty(Op);
+
+ unsigned InpWidth = InpTy.getVectorElementType().getSizeInBits();
+ unsigned ResWidth = ResTy.getVectorElementType().getSizeInBits();
+ assert(InpWidth != ResWidth);
+
+ if (InpWidth == 2 * ResWidth || ResWidth == 2 * InpWidth)
+ return Op;
+
+ const SDLoc &dl(Op);
+ unsigned NumElems = InpTy.getVectorNumElements();
+ assert(NumElems == ResTy.getVectorNumElements());
+
+ auto repeatOp = [&](unsigned NewWidth, SDValue Arg) {
+ MVT Ty = MVT::getVectorVT(MVT::getIntegerVT(NewWidth), NumElems);
+ SmallVector<SDValue, 2> Args = {Arg};
+ if (NeedVT)
+ Args.push_back(DAG.getValueType(Ty));
+ return DAG.getNode(Opc, dl, Ty, Args);
+ };
+
+ SDValue S = Inp;
+ if (InpWidth < ResWidth) {
+ assert(ResWidth % InpWidth == 0 && isPowerOf2_32(ResWidth / InpWidth));
+ while (InpWidth * 2 <= ResWidth)
+ S = repeatOp(InpWidth *= 2, S);
+ } else {
+ // InpWidth > ResWidth
+ assert(InpWidth % ResWidth == 0 && isPowerOf2_32(InpWidth / ResWidth));
+ while (InpWidth / 2 >= ResWidth)
+ S = repeatOp(InpWidth /= 2, S);
+ }
+ return S;
+}
+
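For example (illustrative), a HexagonISD::SSAT from v64i32 to v64i8 changes the element width by a factor of four, so the loop above rewrites it as an ssat to v64i16 followed by an ssat to v64i8; each remaining factor-of-two step can then be matched by the saturating vpack patterns added to HexagonPatternsHVX.td below.
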
void
HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
unsigned Opc = N->getOpcode();
SDValue Op(N, 0);
+ SDValue Inp0; // Optional first argument.
+ if (N->getNumOperands() > 0)
+ Inp0 = Op.getOperand(0);
switch (Opc) {
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
- if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) {
+ if (shouldWidenToHvx(ty(Inp0), DAG)) {
if (SDValue T = WidenHvxExtend(Op, DAG))
Results.push_back(T);
}
break;
case ISD::SETCC:
- if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) {
+ if (shouldWidenToHvx(ty(Inp0), DAG)) {
if (SDValue T = WidenHvxSetCC(Op, DAG))
Results.push_back(T);
}
break;
case ISD::TRUNCATE:
- if (shouldWidenToHvx(ty(Op.getOperand(0)), DAG)) {
+ if (shouldWidenToHvx(ty(Inp0), DAG)) {
if (SDValue T = WidenHvxTruncate(Op, DAG))
Results.push_back(T);
}
@@ -2530,6 +3039,29 @@ HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
Results.push_back(S);
}
break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (ty(Op).getSizeInBits() != ty(Inp0).getSizeInBits()) {
+ SDValue T = EqualizeFpIntConversion(Op, DAG);
+ Results.push_back(T);
+ }
+ break;
+ case HexagonISD::SSAT:
+ case HexagonISD::USAT:
+ if (SDValue T = ExpandHvxResizeIntoSteps(Op, DAG); T != Op) {
+ Results.push_back(T);
+ } else if (shouldWidenToHvx(ty(Op), DAG)) {
+ SDValue W = appendUndef(Inp0, typeJoin({ty(Inp0), ty(Inp0)}), DAG);
+ MVT WideTy = typeJoin({ty(Op), ty(Op)});
+ SDValue T =
+ DAG.getNode(Opc, SDLoc(Op), WideTy, W, DAG.getValueType(WideTy));
+ Results.push_back(T);
+ } else if (shouldSplitToHvx(ty(Inp0), DAG)) {
+ Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG));
+ }
+ break;
default:
break;
}
@@ -2540,6 +3072,10 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
unsigned Opc = N->getOpcode();
SDValue Op(N, 0);
+ SDValue Inp0; // Optional first argument.
+ if (N->getNumOperands() > 0)
+ Inp0 = Op.getOperand(0);
+
switch (Opc) {
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
@@ -2571,12 +3107,43 @@ HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
break;
}
case ISD::BITCAST:
- if (isHvxBoolTy(ty(N->getOperand(0)))) {
- SDValue Op(N, 0);
+ if (isHvxBoolTy(ty(Inp0))) {
SDValue C = LowerHvxBitcast(Op, DAG);
Results.push_back(C);
}
break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (ty(Op).getSizeInBits() != ty(Inp0).getSizeInBits()) {
+ SDValue T = EqualizeFpIntConversion(Op, DAG);
+ Results.push_back(T);
+ }
+ break;
+ case HexagonISD::SSAT:
+ case HexagonISD::USAT:
+ if (shouldWidenToHvx(ty(Op), DAG)) {
+ MVT InpTy = ty(Inp0);
+ MVT WResTy = typeLegalize(ty(Op), DAG);
+ if (Subtarget.isHVXVectorType(InpTy, true)) {
+ // If the input is legal it won't be auto-legalized, so we
+ // need to pad it explicitly.
+ MVT WInpTy = typeWidenToWider(InpTy, WResTy).first;
+ Inp0 = appendUndef(Inp0, WInpTy, DAG);
+ }
+ SDValue S = DAG.getNode(Opc, SDLoc(Op), WResTy, Inp0,
+ DAG.getValueType(WResTy));
+ SDValue T = ExpandHvxResizeIntoSteps(S, DAG);
+ Results.push_back(T);
+ } else {
+ // Check if we need to split (for example when scalarizing).
+ MVT LResTy = typeLegalize(ty(Op), DAG);
+ if (!Subtarget.isHVXVectorType(LResTy, true)) {
+ Results.push_back(opJoin(SplitVectorOp(Op, DAG), SDLoc(Op), DAG));
+ } else {
+ llvm_unreachable("");
+ }
+ }
+ break;
default:
break;
}
@@ -2637,14 +3204,23 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return SDValue();
}
+bool
+HexagonTargetLowering::shouldSplitToHvx(MVT Ty, SelectionDAG &DAG) const {
+ if (Subtarget.isHVXVectorType(Ty, true))
+ return false;
+ auto Action = getPreferredHvxVectorAction(Ty);
+ if (Action == TargetLoweringBase::TypeSplitVector)
+ return Subtarget.isHVXVectorType(typeLegalize(Ty, DAG), true);
+ return false;
+}
+
bool
HexagonTargetLowering::shouldWidenToHvx(MVT Ty, SelectionDAG &DAG) const {
+ if (Subtarget.isHVXVectorType(Ty, true))
+ return false;
auto Action = getPreferredHvxVectorAction(Ty);
- if (Action == TargetLoweringBase::TypeWidenVector) {
- EVT WideTy = getTypeToTransformTo(*DAG.getContext(), Ty);
- assert(WideTy.isSimple());
- return Subtarget.isHVXVectorType(WideTy.getSimpleVT(), true);
- }
+ if (Action == TargetLoweringBase::TypeWidenVector)
+ return Subtarget.isHVXVectorType(typeLegalize(Ty, DAG), true);
return false;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index 80fbf33d83b7..cbb437c43431 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -10,7 +10,7 @@
// (0) Definitions
// (1) Immediates
// (2) Type casts
-// (3) Extend/truncate
+// (3) Extend/truncate/saturate
// (4) Logical
// (5) Compare
// (6) Select
@@ -98,6 +98,11 @@ def HexagonPFALSE: SDNode<"HexagonISD::PFALSE", SDTVecLeaf>;
def HexagonVALIGN: SDNode<"HexagonISD::VALIGN", SDTVecVecIntOp>;
def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>;
+def SDTSaturate:
+ SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>;
+def HexagonSSAT: SDNode<"HexagonISD::SSAT", SDTSaturate>;
+def HexagonUSAT: SDNode<"HexagonISD::USAT", SDTSaturate>;
+
def ptrue: PatFrag<(ops), (HexagonPTRUE)>;
def pfalse: PatFrag<(ops), (HexagonPFALSE)>;
def pnot: PatFrag<(ops node:$Pu), (xor node:$Pu, ptrue)>;
@@ -106,6 +111,9 @@ def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru),
(HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>;
def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>;
+def ssat: PatFrag<(ops node:$V, node:$Ty), (HexagonSSAT node:$V, node:$Ty)>;
+def usat: PatFrag<(ops node:$V, node:$Ty), (HexagonUSAT node:$V, node:$Ty)>;
+
// Pattern fragments to extract the low and high subregisters from a
// 64-bit value.
def LoReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_lo)>;
@@ -477,7 +485,7 @@ defm: NopCast_pat<v2i32, v8i8, DoubleRegs>;
defm: NopCast_pat<v4i16, v8i8, DoubleRegs>;
-// --(3) Extend/truncate -------------------------------------------------
+// --(3) Extend/truncate/saturate ----------------------------------------
//
def: Pat<(sext_inreg I32:$Rs, i8), (A2_sxtb I32:$Rs)>;
@@ -553,6 +561,22 @@ def: Pat<(v4i8 (trunc V4I16:$Rs)),
def: Pat<(v2i16 (trunc V2I32:$Rs)),
(A2_combine_ll (HiReg $Rs), (LoReg $Rs))>;
+// Saturation:
+// Note: saturation assumes the same signed-ness for the input and the
+// output.
+def: Pat<(i32 (ssat I32:$Rs, i8)), (A2_satb I32:$Rs)>;
+def: Pat<(i32 (ssat I32:$Rs, i16)), (A2_sath I32:$Rs)>;
+def: Pat<(i32 (ssat I64:$Rs, i32)), (A2_sat I64:$Rs)>;
+def: Pat<(i32 (usat I32:$Rs, i8)), (A2_satub I32:$Rs)>;
+def: Pat<(i32 (usat I32:$Rs, i16)), (A2_satuh I32:$Rs)>;
+def: Pat<(i32 (usat I64:$Rs, i32)),
+ (C2_mux (C2_cmpeqi (HiReg $Rs), (i32 0)), (LoReg $Rs), (i32 -1))>;
+
+def: Pat<(v4i8 (ssat V4I16:$Rs, v4i8)), (S2_vsathb V4I16:$Rs)>;
+def: Pat<(v2i16 (ssat V2I32:$Rs, v2i16)), (S2_vsatwh V2I32:$Rs)>;
+def: Pat<(v4i8 (usat V4I16:$Rs, v4i8)), (S2_vsathub V4I16:$Rs)>;
+def: Pat<(v2i16 (usat V2I32:$Rs, v2i16)), (S2_vsatwuh V2I32:$Rs)>;
+
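For reference, the scalar semantics these patterns assume (an illustrative C model, not part of the patch; valid for 0 < bits < 64):

  #include <stdint.h>

  /* Saturate a signed value to a signed 'bits'-wide result. */
  static int64_t ssat_to(int64_t v, unsigned bits) {
    int64_t hi = (INT64_C(1) << (bits - 1)) - 1;  /* e.g.  127 for i8 */
    int64_t lo = -hi - 1;                         /* e.g. -128 for i8 */
    return v > hi ? hi : (v < lo ? lo : v);
  }

  /* Saturate an unsigned value to an unsigned 'bits'-wide result. */
  static uint64_t usat_to(uint64_t v, unsigned bits) {
    uint64_t hi = (UINT64_C(1) << bits) - 1;      /* e.g.  255 for i8 */
    return v > hi ? hi : v;
  }
  /* ssat_to(300, 8) == 127, ssat_to(-300, 8) == -128, usat_to(300, 8) == 255. */
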
// --(4) Logical ---------------------------------------------------------
//
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index 119330ef6de9..e961612e5819 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -904,3 +904,28 @@ let Predicates = [UseHVXV62], AddedComplexity = 20 in {
def: Pat<(VecI16 (abs HVI16:$Vs)), (V6_vabsh HvxVR:$Vs)>;
def: Pat<(VecI32 (abs HVI32:$Vs)), (V6_vabsw HvxVR:$Vs)>;
+// If a node takes an MVT type as a parameter, the argument must be
+// a name of a member of MVT.
+multiclass Saturates<ValueType HvxTy_i8, ValueType HvxTy_i16> {
+ def: Pat<(VecI8 (ssat HWI16:$Vss, HvxTy_i8)),
+ (V6_vpackhb_sat (HiVec $Vss), (LoVec $Vss))>;
+ def: Pat<(VecI8 (ssat (concat_vectors HWI32:$Vss, HWI32:$Vtt), HvxTy_i8)),
+ (V6_vpackhb_sat (V6_vpackwh_sat (HiVec $Vtt), (LoVec $Vtt)),
+ (V6_vpackwh_sat (HiVec $Vss), (LoVec $Vss)))>;
+ def: Pat<(VecI16 (ssat HWI32:$Vss, HvxTy_i16)),
+ (V6_vpackwh_sat (HiVec $Vss), (LoVec $Vss))>;
+
+ def: Pat<(VecI8 (usat HWI16:$Vss, HvxTy_i8)),
+ (V6_vpackhub_sat (HiVec $Vss), (LoVec $Vss))>;
+ def: Pat<(VecI8 (usat (concat_vectors HWI32:$Vss, HWI32:$Vtt), HvxTy_i8)),
+ (V6_vpackhub_sat (V6_vpackwuh_sat (HiVec $Vtt), (LoVec $Vtt)),
+ (V6_vpackwuh_sat (HiVec $Vss), (LoVec $Vss)))>;
+ def: Pat<(VecI16 (usat HWI32:$Vss, HvxTy_i16)),
+ (V6_vpackwuh_sat (HiVec $Vss), (LoVec $Vss))>;
+}
+let Predicates = [UseHVX64B] in {
+ defm: Saturates<v64i8, v32i16>;
+}
+let Predicates = [UseHVX128B] in {
+ defm: Saturates<v128i8, v64i16>;
+}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
new file mode 100644
index 000000000000..9ea5d11f89c2
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
@@ -0,0 +1,2100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; f16 -> s8
+; No widening
+define void @f16s8_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16s8_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##32768,#1)
+; CHECK-NEXT: r4 = #14
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r3)
+; CHECK-NEXT: r6 = #5
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r4)
+; CHECK-NEXT: v8.h = vasl(v1.h,r2)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = combine(#11,##32767)
+; CHECK-NEXT: v7 = vxor(v7,v7)
+; CHECK-NEXT: v8.h = vsub(v8.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = #16
+; CHECK-NEXT: v5.h = vasl(v0.h,r6)
+; CHECK-NEXT: q1 = vcmp.gt(v7.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vsplat(r3)
+; CHECK-NEXT: v28.h = vasr(v3.h,r5)
+; CHECK-NEXT: v5 = vor(v5,v2)
+; CHECK-NEXT: q0 = vcmp.gt(v7.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.h = vsplat(r4)
+; CHECK-NEXT: v8.h = vasr(v8.h,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.h = vasl(v1.h,r6)
+; CHECK-NEXT: v1.h = vsub(v4.h,v28.h)
+; CHECK-NEXT: v4.h = vsub(v4.h,v8.h)
+; CHECK-NEXT: v29 = vmux(q0,v2,v9)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vmin(v1.h,v6.h)
+; CHECK-NEXT: v0 = vor(v27,v2)
+; CHECK-NEXT: v4.h = vmin(v4.h,v6.h)
+; CHECK-NEXT: v2 = vmux(q1,v2,v9)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q2 = vcmp.gt(v1.h,v7.h)
+; CHECK-NEXT: q3 = vcmp.gt(v4.h,v7.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vlsr(v5.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vlsr(v0.h,v4.h)
+; CHECK-NEXT: v30.h = vsub(v7.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.h = vsub(v7.h,v0.h)
+; CHECK-NEXT: v5 = vmux(q0,v30,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v31,v0)
+; CHECK-NEXT: v1 = vmux(q2,v5,v29)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v0,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.b = vpack(v0.h,v1.h):sat
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <128 x half>, ptr %a0, align 128
+ %v1 = fptosi <128 x half> %v0 to <128 x i8>
+ store <128 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result
+define void @f16s8_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16s8_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##32768
+; CHECK-NEXT: r3:2 = combine(#5,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: r6 = #14
+; CHECK-NEXT: r5 = #11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r6)
+; CHECK-NEXT: r4 = #16
+; CHECK-NEXT: v6.h = vasl(v0.h,r3)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r4)
+; CHECK-NEXT: r3 = #32767
+; CHECK-NEXT: v29 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.h = vsplat(r3)
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.h = vasr(v3.h,r5)
+; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
+; CHECK-NEXT: v2 = vmux(q0,v2,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q2 = vcmp.gt(v3.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.h = vsub(v1.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v0,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.b = vpack(v0.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <64 x half>, ptr %a0, align 128
+ %v1 = fptosi <64 x half> %v0 to <64 x i8>
+ store <64 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f16 -> s16
+; No widening
+define void @f16s16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16s16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##32768
+; CHECK-NEXT: r3:2 = combine(#5,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: r6 = #14
+; CHECK-NEXT: r5 = #11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r6)
+; CHECK-NEXT: r4 = #16
+; CHECK-NEXT: v6.h = vasl(v0.h,r3)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r4)
+; CHECK-NEXT: r2 = #32767
+; CHECK-NEXT: v29 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.h = vsplat(r2)
+; CHECK-NEXT: v3.h = vasr(v3.h,r5)
+; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
+; CHECK-NEXT: v2 = vmux(q0,v2,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vcmp.gt(v3.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.h = vsub(v1.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v0,v2)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x half>, ptr %a0, align 128
+ %v1 = fptosi <64 x half> %v0 to <64 x i16>
+ store <64 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @f16s16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16s16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##32768
+; CHECK-NEXT: r3:2 = combine(#5,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: r6 = #14
+; CHECK-NEXT: r5 = #11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r6)
+; CHECK-NEXT: r4 = #16
+; CHECK-NEXT: v6.h = vasl(v0.h,r3)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r4)
+; CHECK-NEXT: r3 = #32767
+; CHECK-NEXT: v29 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.h = vsplat(r3)
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.h = vasr(v3.h,r5)
+; CHECK-NEXT: q0 = vcmp.gt(v1.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
+; CHECK-NEXT: v2 = vmux(q0,v2,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q1 = vcmp.gt(v3.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vlsr(v29.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.h = vsub(v1.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v0,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x half>, ptr %a0, align 128
+ %v1 = fptosi <32 x half> %v0 to <32 x i16>
+ store <32 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f16 -> s32
+; No widening
+define void @f16s32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16s32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #15360
+; CHECK-NEXT: r7 = #-4
+; CHECK-NEXT: v1 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vsplat(r2)
+; CHECK-NEXT: r6 = ##-2147483648
+; CHECK-NEXT: r2 = #1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r6)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: r5 = #8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: v24 = vxor(v24,v24)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0.qf32 = vmpy(v1.hf,v0.hf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.sf = v0.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.sf = v1.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = combine(#30,#24)
+; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r7)
+; CHECK-NEXT: q0 = vcmp.gt(v24.w,v1.w)
+; CHECK-NEXT: q1 = vcmp.gt(v24.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: v4.w = vasl(v1.w,r2)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25 = vsplat(r2)
+; CHECK-NEXT: v5.w = vasl(v0.w,r5)
+; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasr(v3.w,r6)
+; CHECK-NEXT: v5 = vor(v5,v2)
+; CHECK-NEXT: v28 = vmux(q0,v2,v25)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vasr(v4.w,r6)
+; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasl(v1.w,r5)
+; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
+; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8 = vor(v8,v2)
+; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
+; CHECK-NEXT: v2 = vmux(q1,v2,v25)
+; CHECK-NEXT: q3 = vcmp.gt(v3.w,v24.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v26.w = vlsr(v5.w,v3.w)
+; CHECK-NEXT: q2 = vcmp.gt(v4.w,v24.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.w = vlsr(v8.w,v4.w)
+; CHECK-NEXT: v29.w = vsub(v24.w,v26.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.w = vsub(v24.w,v27.w)
+; CHECK-NEXT: v1 = vmux(q1,v29,v26)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vmux(q0,v9,v27)
+; CHECK-NEXT: v31 = vmux(q3,v1,v2)
+; CHECK-NEXT: vmem(r1+#0) = v31.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v30,v28)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#1) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x half>, ptr %a0, align 128
+ %v1 = fptosi <64 x half> %v0 to <64 x i32>
+ store <64 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input
+define void @f16s32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16s32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #15360
+; CHECK-NEXT: r7 = #-4
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vsplat(r4)
+; CHECK-NEXT: r2 = ##-2147483648
+; CHECK-NEXT: r3 = #1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r2)
+; CHECK-NEXT: r5:4 = combine(#8,#30)
+; CHECK-NEXT: r6 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r4)
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf)
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v30 = vsplat(r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.sf = v0.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.sf = v1.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v2.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v0.w,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v0.w,r5)
+; CHECK-NEXT: v1.w = vsub(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29 = vor(v6,v3)
+; CHECK-NEXT: v3 = vmux(q0,v3,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasr(v1.w,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v4.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vmin(v1.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vcmp.gt(v1.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vlsr(v29.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.w = vsub(v2.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v0,v3)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x half>, ptr %a0, align 128
+ %v1 = fptosi <32 x half> %v0 to <32 x i32>
+ store <32 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f32 -> s8
+; No widening
+define void @f32s8_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32s8_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(#1,#8)
+; CHECK-NEXT: r4 = ##-2147483648
+; CHECK-NEXT: v6 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vsplat(r4)
+; CHECK-NEXT: r7 = #30
+; CHECK-NEXT: r6 = #24
+; CHECK-NEXT: v4 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v10 = vsplat(r7)
+; CHECK-NEXT: r5 = #32
+; CHECK-NEXT: v9.w = vasl(v6.w,r3)
+; CHECK-NEXT: v1 = vmem(r0+#3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasl(v4.w,r3)
+; CHECK-NEXT: v14 = vxor(v14,v14)
+; CHECK-NEXT: v9.w = vsub(v9.w,v0.w)
+; CHECK-NEXT: v2 = vmem(r0+#2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v13 = vsplat(r5)
+; CHECK-NEXT: v11.w = vasl(v2.w,r3)
+; CHECK-NEXT: v8.w = vsub(v8.w,v0.w)
+; CHECK-NEXT: q1 = vcmp.gt(v14.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.w = vasl(v1.w,r3)
+; CHECK-NEXT: q0 = vcmp.gt(v14.w,v4.w)
+; CHECK-NEXT: v11.w = vsub(v11.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = ##2147483647
+; CHECK-NEXT: r7 = #64
+; CHECK-NEXT: v9.w = vasr(v9.w,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20 = vsplat(r3)
+; CHECK-NEXT: v7.w = vasl(v6.w,r2)
+; CHECK-NEXT: v21.w = vsub(v12.w,v0.w)
+; CHECK-NEXT: v9.w = vsub(v10.w,v9.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasr(v8.w,r6)
+; CHECK-NEXT: v27 = vmux(q1,v0,v20)
+; CHECK-NEXT: v25 = vmux(q0,v0,v20)
+; CHECK-NEXT: v9.w = vmin(v9.w,v13.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vasl(v4.w,r2)
+; CHECK-NEXT: v7 = vor(v7,v0)
+; CHECK-NEXT: v8.w = vsub(v10.w,v8.w)
+; CHECK-NEXT: q3 = vcmp.gt(v9.w,v14.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.w = vasr(v11.w,r6)
+; CHECK-NEXT: v8.w = vmin(v8.w,v13.w)
+; CHECK-NEXT: v5 = vor(v5,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasr(v21.w,r6)
+; CHECK-NEXT: v11.w = vsub(v10.w,v11.w)
+; CHECK-NEXT: q2 = vcmp.gt(v8.w,v14.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v1.w,r2)
+; CHECK-NEXT: v6.w = vsub(v10.w,v6.w)
+; CHECK-NEXT: v23.w = vmin(v11.w,v13.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v22.w = vasl(v2.w,r2)
+; CHECK-NEXT: v3 = vor(v3,v0)
+; CHECK-NEXT: v6.w = vmin(v6.w,v13.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w)
+; CHECK-NEXT: v12 = vor(v22,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vlsr(v5.w,v8.w)
+; CHECK-NEXT: v26.w = vsub(v14.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.w = vlsr(v12.w,v23.w)
+; CHECK-NEXT: v24.w = vsub(v14.w,v5.w)
+; CHECK-NEXT: v7 = vmux(q1,v26,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vlsr(v3.w,v6.w)
+; CHECK-NEXT: v5 = vmux(q0,v24,v5)
+; CHECK-NEXT: q0 = vcmp.gt(v14.w,v2.w)
+; CHECK-NEXT: v29.w = vsub(v14.w,v28.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vmux(q3,v7,v27)
+; CHECK-NEXT: q3 = vcmp.gt(v14.w,v1.w)
+; CHECK-NEXT: v31.w = vsub(v14.w,v3.w)
+; CHECK-NEXT: v5 = vmux(q2,v5,v25)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vmux(q0,v0,v20)
+; CHECK-NEXT: v30 = vmux(q0,v29,v28)
+; CHECK-NEXT: q2 = vcmp.gt(v23.w,v14.w)
+; CHECK-NEXT: v3 = vmux(q3,v31,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vpack(v2.w,v5.w):sat
+; CHECK-NEXT: v0 = vmux(q3,v0,v20)
+; CHECK-NEXT: q3 = vcmp.gt(v6.w,v14.w)
+; CHECK-NEXT: v1 = vmux(q2,v30,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v3,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.b = vpack(v3.h,v2.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.b = vpack(v3.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0 = vshuff(v0,v1,r7)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <128 x float>, ptr %a0, align 128
+ %v1 = fptosi <128 x float> %v0 to <128 x i8>
+ store <128 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result #1
+define void @f32s8_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32s8_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##-2147483648,#8)
+; CHECK-NEXT: r4 = #1
+; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r3)
+; CHECK-NEXT: r5 = #30
+; CHECK-NEXT: v4.w = vasl(v0.w,r4)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vasl(v1.w,r4)
+; CHECK-NEXT: v4.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: r6 = #24
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r5)
+; CHECK-NEXT: v8 = vsplat(r4)
+; CHECK-NEXT: v2.w = vasl(v1.w,r2)
+; CHECK-NEXT: v5.w = vsub(v5.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vasr(v4.w,r6)
+; CHECK-NEXT: v27 = vxor(v27,v27)
+; CHECK-NEXT: v2 = vor(v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = ##2147483647
+; CHECK-NEXT: v5.w = vasr(v5.w,r6)
+; CHECK-NEXT: q0 = vcmp.gt(v27.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28 = vsplat(r3)
+; CHECK-NEXT: v6.w = vasl(v0.w,r2)
+; CHECK-NEXT: v4.w = vsub(v7.w,v4.w)
+; CHECK-NEXT: q2 = vcmp.gt(v27.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vsub(v7.w,v5.w)
+; CHECK-NEXT: v4.w = vmin(v4.w,v8.w)
+; CHECK-NEXT: v31 = vmux(q0,v3,v28)
+; CHECK-NEXT: v6 = vor(v6,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vmin(v5.w,v8.w)
+; CHECK-NEXT: q1 = vcmp.gt(v4.w,v27.w)
+; CHECK-NEXT: v0 = vmux(q2,v3,v28)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v6.w = vlsr(v6.w,v4.w)
+; CHECK-NEXT: q3 = vcmp.gt(v5.w,v27.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v5.w)
+; CHECK-NEXT: v29.w = vsub(v27.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.w = vsub(v27.w,v2.w)
+; CHECK-NEXT: v1 = vmux(q0,v29,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vmux(q2,v30,v2)
+; CHECK-NEXT: v1 = vmux(q1,v1,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v0 = vmux(q3,v2,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.b = vpack(v2.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <64 x float>, ptr %a0, align 128
+ %v1 = fptosi <64 x float> %v0 to <64 x i8>
+ store <64 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result #2
+define void @f32s8_2(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32s8_2:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#30,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: r5:4 = combine(#8,#24)
+; CHECK-NEXT: r6 = #32
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r3)
+; CHECK-NEXT: v5 = vsplat(r6)
+; CHECK-NEXT: v6.w = vasl(v0.w,r5)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: v29 = vor(v6,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r6)
+; CHECK-NEXT: v3.w = vasr(v3.w,r4)
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = ##2147483647
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r4)
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vmux(q0,v2,v30)
+; CHECK-NEXT: q2 = vcmp.gt(v3.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v0,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vpack(v0.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.b = vpack(v1.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x float>, ptr %a0, align 128
+ %v1 = fptosi <32 x float> %v0 to <32 x i8>
+ store <32 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f32 -> s16
+; No widening
+define void @f32s16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32s16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
+; CHECK-NEXT: r4 = #30
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r3)
+; CHECK-NEXT: r6 = #8
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r4)
+; CHECK-NEXT: v8.w = vasl(v1.w,r2)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = combine(#24,##2147483647)
+; CHECK-NEXT: v7 = vxor(v7,v7)
+; CHECK-NEXT: v8.w = vsub(v8.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = #32
+; CHECK-NEXT: v5.w = vasl(v0.w,r6)
+; CHECK-NEXT: q1 = vcmp.gt(v7.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r3)
+; CHECK-NEXT: v28.w = vasr(v3.w,r5)
+; CHECK-NEXT: v5 = vor(v5,v2)
+; CHECK-NEXT: q0 = vcmp.gt(v7.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9 = vsplat(r4)
+; CHECK-NEXT: v8.w = vasr(v8.w,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.w = vasl(v1.w,r6)
+; CHECK-NEXT: v1.w = vsub(v4.w,v28.w)
+; CHECK-NEXT: v4.w = vsub(v4.w,v8.w)
+; CHECK-NEXT: v29 = vmux(q0,v2,v9)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vmin(v1.w,v6.w)
+; CHECK-NEXT: v0 = vor(v27,v2)
+; CHECK-NEXT: v4.w = vmin(v4.w,v6.w)
+; CHECK-NEXT: v2 = vmux(q1,v2,v9)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q2 = vcmp.gt(v1.w,v7.w)
+; CHECK-NEXT: q3 = vcmp.gt(v4.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vlsr(v5.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.w = vlsr(v0.w,v4.w)
+; CHECK-NEXT: v30.w = vsub(v7.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.w = vsub(v7.w,v0.w)
+; CHECK-NEXT: v5 = vmux(q0,v30,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v31,v0)
+; CHECK-NEXT: v1 = vmux(q2,v5,v29)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v0,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x float>, ptr %a0, align 128
+ %v1 = fptosi <64 x float> %v0 to <64 x i16>
+ store <64 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result
+define void @f32s16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32s16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: r6 = #30
+; CHECK-NEXT: r5 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: v6.w = vasl(v0.w,r3)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v29 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = ##2147483647
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.w = vasr(v3.w,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r3)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vmux(q0,v2,v30)
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q2 = vcmp.gt(v3.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v0,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vpack(v0.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x float>, ptr %a0, align 128
+ %v1 = fptosi <32 x float> %v0 to <32 x i16>
+ store <32 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f32 -> s32
+; No widening
+define void @f32s32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32s32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: r6 = #30
+; CHECK-NEXT: r5 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: v6.w = vasl(v0.w,r3)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v29 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: v3.w = vasr(v3.w,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r2)
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vmux(q0,v2,v30)
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vcmp.gt(v3.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v0,v2)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x float>, ptr %a0, align 128
+ %v1 = fptosi <32 x float> %v0 to <32 x i32>
+ store <32 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @f32s32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32s32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: r6 = #30
+; CHECK-NEXT: r5 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: v6.w = vasl(v0.w,r3)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v29 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = ##2147483647
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.w = vasr(v3.w,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r3)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vmux(q0,v2,v30)
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q1 = vcmp.gt(v3.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vlsr(v29.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.w = vsub(v1.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q0,v31,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v0,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <16 x float>, ptr %a0, align 128
+ %v1 = fptosi <16 x float> %v0 to <16 x i32>
+ store <16 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; f16 -> u8
+; No widening
+define void @f16u8_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16u8_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##32768,#1)
+; CHECK-NEXT: r4 = #14
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r3)
+; CHECK-NEXT: r7:6 = combine(#11,#16)
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vsplat(r4)
+; CHECK-NEXT: r5 = #5
+; CHECK-NEXT: v4.h = vasl(v1.h,r2)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vsplat(r6)
+; CHECK-NEXT: v5.h = vasl(v0.h,r5)
+; CHECK-NEXT: v4.h = vsub(v4.h,v2.h)
+; CHECK-NEXT: v28 = vxor(v28,v28)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #32767
+; CHECK-NEXT: v3.h = vasr(v3.h,r7)
+; CHECK-NEXT: v5 = vor(v5,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.h = vsplat(r2)
+; CHECK-NEXT: v4.h = vasr(v4.h,r7)
+; CHECK-NEXT: q2 = vcmp.gt(v28.h,v0.h)
+; CHECK-NEXT: v3.h = vsub(v6.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.h = vasl(v1.h,r5)
+; CHECK-NEXT: q3 = vcmp.gt(v28.h,v1.h)
+; CHECK-NEXT: v4.h = vsub(v6.h,v4.h)
+; CHECK-NEXT: v3.h = vmin(v3.h,v7.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vmin(v4.h,v7.h)
+; CHECK-NEXT: v2 = vor(v8,v2)
+; CHECK-NEXT: q0 = vcmp.gt(v28.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vlsr(v5.h,v3.h)
+; CHECK-NEXT: q1 = vcmp.gt(v28.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vlsr(v2.h,v4.h)
+; CHECK-NEXT: v30 = vmux(q0,v29,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q1,v29,v2)
+; CHECK-NEXT: v0 = vmux(q2,v28,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vmux(q3,v28,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.ub = vpack(v1.h,v0.h):sat
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <128 x half>, ptr %a0, align 128
+ %v1 = fptoui <128 x half> %v0 to <128 x i8>
+ store <128 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result
+define void @f16u8_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16u8_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##32768
+; CHECK-NEXT: r3:2 = combine(#5,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: r6 = #14
+; CHECK-NEXT: r5 = #11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r6)
+; CHECK-NEXT: r4 = #16
+; CHECK-NEXT: v6.h = vasl(v0.h,r3)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r4)
+; CHECK-NEXT: r3 = #32767
+; CHECK-NEXT: v2 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.h = vsplat(r3)
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.h = vasr(v3.h,r5)
+; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v1,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.ub = vpack(v0.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <64 x half>, ptr %a0, align 128
+ %v1 = fptoui <64 x half> %v0 to <64 x i8>
+ store <64 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f16 -> u16
+; No widening
+define void @f16u16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16u16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##32768
+; CHECK-NEXT: r3:2 = combine(#5,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: r6 = #14
+; CHECK-NEXT: r5 = #11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r6)
+; CHECK-NEXT: r4 = #16
+; CHECK-NEXT: v6.h = vasl(v0.h,r3)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r4)
+; CHECK-NEXT: r2 = #32767
+; CHECK-NEXT: v2 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.h = vsplat(r2)
+; CHECK-NEXT: v3.h = vasr(v3.h,r5)
+; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v1,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x half>, ptr %a0, align 128
+ %v1 = fptoui <64 x half> %v0 to <64 x i16>
+ store <64 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @f16u16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16u16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##32768
+; CHECK-NEXT: r3:2 = combine(#5,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v3.h = vasl(v0.h,r2)
+; CHECK-NEXT: r6 = #14
+; CHECK-NEXT: r5 = #11
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r6)
+; CHECK-NEXT: r4 = #16
+; CHECK-NEXT: v6.h = vasl(v0.h,r3)
+; CHECK-NEXT: v3.h = vsub(v3.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r4)
+; CHECK-NEXT: r3 = #32767
+; CHECK-NEXT: v2 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.h = vsplat(r3)
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.h = vasr(v3.h,r5)
+; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v3.h = vsub(v4.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vmin(v3.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v1.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v1,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x half>, ptr %a0, align 128
+ %v1 = fptoui <32 x half> %v0 to <32 x i16>
+ store <32 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f16 -> u32
+; No widening
+define void @f16u32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16u32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #15360
+; CHECK-NEXT: r7 = #-4
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vsplat(r2)
+; CHECK-NEXT: r4 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#30,#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r4)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: r6 = #24
+; CHECK-NEXT: r0 = #8
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r3)
+; CHECK-NEXT: v26 = vxor(v26,v26)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.sf = v0.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.sf = v1.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q1 = vcmp.gt(v26.w,v1.w)
+; CHECK-NEXT: q3 = vcmp.gt(v26.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v1.w,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: v4.w = vasl(v0.w,r2)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27 = vsplat(r2)
+; CHECK-NEXT: v5.w = vasl(v1.w,r0)
+; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasr(v3.w,r6)
+; CHECK-NEXT: v5 = vor(v5,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vasr(v4.w,r6)
+; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasl(v0.w,r0)
+; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
+; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
+; CHECK-NEXT: v2 = vor(v8,v2)
+; CHECK-NEXT: q0 = vcmp.gt(v26.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vlsr(v5.w,v3.w)
+; CHECK-NEXT: q2 = vcmp.gt(v26.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.w = vlsr(v2.w,v4.w)
+; CHECK-NEXT: v29 = vmux(q0,v27,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vmux(q2,v27,v28)
+; CHECK-NEXT: v31 = vmux(q1,v26,v29)
+; CHECK-NEXT: vmem(r1+#1) = v31.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v26,v30)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x half>, ptr %a0, align 128
+ %v1 = fptoui <64 x half> %v0 to <64 x i32>
+ store <64 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input
+define void @f16u32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f16u32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #15360
+; CHECK-NEXT: r7 = #-4
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vsplat(r4)
+; CHECK-NEXT: r2 = ##-2147483648
+; CHECK-NEXT: r3 = #1
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r2)
+; CHECK-NEXT: r5:4 = combine(#8,#30)
+; CHECK-NEXT: r6 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r4)
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0.qf32 = vmpy(v0.hf,v1.hf)
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v30 = vsplat(r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.sf = v0.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.sf = v1.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q1 = vcmp.gt(v2.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v0.w,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v0.w,r5)
+; CHECK-NEXT: v1.w = vsub(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vor(v6,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasr(v1.w,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v4.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vmin(v1.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v2.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vlsr(v3.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v2,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x half>, ptr %a0, align 128
+ %v1 = fptoui <32 x half> %v0 to <32 x i32>
+ store <32 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f32 -> u8
+; No widening
+define void @f32u8_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32u8_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##-2147483648,#8)
+; CHECK-NEXT: r4 = #1
+; CHECK-NEXT: v5 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r3)
+; CHECK-NEXT: r5 = #30
+; CHECK-NEXT: r6 = #24
+; CHECK-NEXT: v2 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v14 = vsplat(r5)
+; CHECK-NEXT: v8.w = vasl(v5.w,r4)
+; CHECK-NEXT: v13 = vxor(v13,v13)
+; CHECK-NEXT: v0 = vmem(r0+#2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = #64
+; CHECK-NEXT: v9.w = vasl(v2.w,r4)
+; CHECK-NEXT: v8.w = vsub(v8.w,v4.w)
+; CHECK-NEXT: v1 = vmem(r0+#3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.w = vasl(v0.w,r4)
+; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w)
+; CHECK-NEXT: v9.w = vsub(v9.w,v4.w)
+; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: v12.w = vasl(v1.w,r4)
+; CHECK-NEXT: v11.w = vsub(v11.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24 = vsplat(r4)
+; CHECK-NEXT: v8.w = vasr(v8.w,r6)
+; CHECK-NEXT: v12.w = vsub(v12.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.w = vasr(v9.w,r6)
+; CHECK-NEXT: v8.w = vsub(v14.w,v8.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v5.w,r2)
+; CHECK-NEXT: v9.w = vsub(v14.w,v9.w)
+; CHECK-NEXT: v8.w = vmin(v8.w,v24.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.w = vasl(v2.w,r2)
+; CHECK-NEXT: v6 = vor(v6,v4)
+; CHECK-NEXT: v9.w = vmin(v9.w,v24.w)
+; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.w = vasr(v11.w,r6)
+; CHECK-NEXT: v7 = vor(v7,v4)
+; CHECK-NEXT: q2 = vcmp.gt(v13.w,v9.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.w = vasr(v12.w,r6)
+; CHECK-NEXT: v5.w = vsub(v14.w,v23.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v1.w,r2)
+; CHECK-NEXT: v25.w = vsub(v14.w,v12.w)
+; CHECK-NEXT: v5.w = vmin(v5.w,v24.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: v10.w = vasl(v0.w,r2)
+; CHECK-NEXT: v3 = vor(v3,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v26 = vsplat(r2)
+; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w)
+; CHECK-NEXT: v10 = vor(v10,v4)
+; CHECK-NEXT: v4.w = vmin(v25.w,v24.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w)
+; CHECK-NEXT: v6 = vmux(q1,v26,v6)
+; CHECK-NEXT: q1 = vcmp.gt(v13.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.w = vlsr(v10.w,v5.w)
+; CHECK-NEXT: v7 = vmux(q2,v26,v7)
+; CHECK-NEXT: q2 = vcmp.gt(v13.w,v4.w)
+; CHECK-NEXT: v28 = vmux(q0,v13,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vlsr(v3.w,v4.w)
+; CHECK-NEXT: v29 = vmux(q3,v13,v7)
+; CHECK-NEXT: v2 = vmux(q1,v26,v27)
+; CHECK-NEXT: q1 = vcmp.gt(v13.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vcmp.gt(v13.w,v1.w)
+; CHECK-NEXT: v0 = vmux(q2,v26,v3)
+; CHECK-NEXT: v1 = vmux(q1,v13,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uh = vpack(v29.w,v28.w):sat
+; CHECK-NEXT: v0 = vmux(q3,v13,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.uh = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.uh = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.ub = vpack(v31.h,v30.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.ub = vpack(v31.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0 = vshuff(v0,v1,r7)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <128 x float>, ptr %a0, align 128
+ %v1 = fptoui <128 x float> %v0 to <128 x i8>
+ store <128 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result #1
+define void @f32u8_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32u8_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
+; CHECK-NEXT: r4 = #30
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r3)
+; CHECK-NEXT: r7:6 = combine(#24,#32)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r4)
+; CHECK-NEXT: r5 = #8
+; CHECK-NEXT: v4.w = vasl(v1.w,r2)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r6)
+; CHECK-NEXT: v5.w = vasl(v0.w,r5)
+; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
+; CHECK-NEXT: v27 = vxor(v27,v27)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = ##2147483647
+; CHECK-NEXT: v3.w = vasr(v3.w,r7)
+; CHECK-NEXT: v5 = vor(v5,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28 = vsplat(r3)
+; CHECK-NEXT: v4.w = vasr(v4.w,r7)
+; CHECK-NEXT: q2 = vcmp.gt(v27.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v8.w = vasl(v1.w,r5)
+; CHECK-NEXT: q3 = vcmp.gt(v27.w,v1.w)
+; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
+; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
+; CHECK-NEXT: v2 = vor(v8,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v27.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v27.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w)
+; CHECK-NEXT: v29 = vmux(q0,v28,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vmux(q1,v28,v2)
+; CHECK-NEXT: v0 = vmux(q2,v27,v29)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v1 = vmux(q3,v27,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.uh = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.uh = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.ub = vpack(v31.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <64 x float>, ptr %a0, align 128
+ %v1 = fptoui <64 x float> %v0 to <64 x i8>
+ store <64 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result #2
+define void @f32u8_2(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32u8_2:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#30,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: r5:4 = combine(#8,#24)
+; CHECK-NEXT: r6 = #32
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r3)
+; CHECK-NEXT: v5 = vsplat(r6)
+; CHECK-NEXT: v6.w = vasl(v0.w,r5)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: v2 = vor(v6,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r6)
+; CHECK-NEXT: v3.w = vasr(v3.w,r4)
+; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = ##2147483647
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r4)
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v1,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.uh = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.uh = vpack(v0.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.ub = vpack(v1.h,v0.h):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x float>, ptr %a0, align 128
+ %v1 = fptoui <32 x float> %v0 to <32 x i8>
+ store <32 x i8> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f32 -> u16
+; No widening
+define void @f32u16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32u16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
+; CHECK-NEXT: r4 = #30
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r3)
+; CHECK-NEXT: r7:6 = combine(#24,#32)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r4)
+; CHECK-NEXT: r5 = #8
+; CHECK-NEXT: v4.w = vasl(v1.w,r2)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r6)
+; CHECK-NEXT: v5.w = vasl(v0.w,r5)
+; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
+; CHECK-NEXT: v28 = vxor(v28,v28)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: v3.w = vasr(v3.w,r7)
+; CHECK-NEXT: v5 = vor(v5,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29 = vsplat(r2)
+; CHECK-NEXT: v4.w = vasr(v4.w,r7)
+; CHECK-NEXT: q2 = vcmp.gt(v28.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasl(v1.w,r5)
+; CHECK-NEXT: q3 = vcmp.gt(v28.w,v1.w)
+; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
+; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
+; CHECK-NEXT: v2 = vor(v8,v2)
+; CHECK-NEXT: q0 = vcmp.gt(v28.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v28.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w)
+; CHECK-NEXT: v30 = vmux(q0,v29,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q1,v29,v2)
+; CHECK-NEXT: v0 = vmux(q2,v28,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vmux(q3,v28,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.uh = vpack(v1.w,v0.w):sat
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x float>, ptr %a0, align 128
+ %v1 = fptoui <64 x float> %v0 to <64 x i16>
+ store <64 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result
+define void @f32u16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32u16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: r6 = #30
+; CHECK-NEXT: r5 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: v6.w = vasl(v0.w,r3)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v2 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = ##2147483647
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.w = vasr(v3.w,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r3)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v1,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.uh = vpack(v0.w,v0.w):sat
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x float>, ptr %a0, align 128
+ %v1 = fptoui <32 x float> %v0 to <32 x i16>
+ store <32 x i16> %v1, ptr %a1, align 128
+ ret void
+}
+
+; f32 -> u32
+; No widening
+define void @f32u32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32u32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: r6 = #30
+; CHECK-NEXT: r5 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: v6.w = vasl(v0.w,r3)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v2 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##2147483647
+; CHECK-NEXT: v3.w = vasr(v3.w,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r2)
+; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v1,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x float>, ptr %a0, align 128
+ %v1 = fptoui <32 x float> %v0 to <32 x i32>
+ store <32 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @f32u32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: f32u32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: v3.w = vasl(v0.w,r2)
+; CHECK-NEXT: r6 = #30
+; CHECK-NEXT: r5 = #24
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r4 = #32
+; CHECK-NEXT: v6.w = vasl(v0.w,r3)
+; CHECK-NEXT: v3.w = vsub(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r4)
+; CHECK-NEXT: v2 = vor(v6,v2)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = ##2147483647
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.w = vasr(v3.w,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vsplat(r3)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: q1 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: v3.w = vsub(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vmin(v3.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vmux(q0,v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q1,v1,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <16 x float>, ptr %a0, align 128
+ %v1 = fptoui <16 x float> %v0 to <16 x i32>
+ store <16 x i32> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" }
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
new file mode 100644
index 000000000000..260bee823240
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
@@ -0,0 +1,2744 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon -hexagon-hvx-widen=32 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; s8 -> f16
+; No widening
+define void @s8f16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s8f16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##.LCPI0_0
+; CHECK-NEXT: v1:0.h = vunpack(v2.b)
+; CHECK-NEXT: v2.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = #1
+; CHECK-NEXT: v4.h = vabs(v0.h)
+; CHECK-NEXT: v1 = vmem(r2+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r7)
+; CHECK-NEXT: r5:4 = combine(#31,#5)
+; CHECK-NEXT: v1 = vdelta(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v21.h = vsplat(r5)
+; CHECK-NEXT: r6 = #64
+; CHECK-NEXT: v6.uh = vcl0(v4.uh)
+; CHECK-NEXT: v10 = vxor(v10,v10)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vsplat(r6)
+; CHECK-NEXT: r5 = ##32768
+; CHECK-NEXT: v3:2.h = vunpack(v1.b)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.h = vsplat(r5)
+; CHECK-NEXT: v20.h = vadd(v6.h,v5.h)
+; CHECK-NEXT: v3.h = vabs(v2.h)
+; CHECK-NEXT: q1 = vcmp.gt(v10.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vasl(v4.h,v20.h)
+; CHECK-NEXT: v29 = vmux(q1,v28,v10)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.uh = vcl0(v3.uh)
+; CHECK-NEXT: v9.h = vadd(v4.h,v21.h)
+; CHECK-NEXT: v11 = vand(v4,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.uh = vlsr(v4.uh,r4)
+; CHECK-NEXT: v8.h = vadd(v8.h,v5.h)
+; CHECK-NEXT: q2 = vcmp.gt(v4.uh,v9.uh)
+; CHECK-NEXT: q0 = vcmp.eq(v11.h,v10.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v22.uh = vlsr(v9.uh,r4)
+; CHECK-NEXT: v25 = vmux(q2,v5,v10)
+; CHECK-NEXT: v13 = vmux(q0,v10,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vasl(v3.h,v8.h)
+; CHECK-NEXT: v13.h = vadd(v22.h,v13.h)
+; CHECK-NEXT: q0 = vcmp.eq(v12.h,v22.h)
+; CHECK-NEXT: v12.h = vadd(v25.h,v21.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uh = vlsr(v22.uh,r7)
+; CHECK-NEXT: v23.h = vadd(v3.h,v21.h)
+; CHECK-NEXT: v7 = vand(v3,v7)
+; CHECK-NEXT: v1.h = vsub(v12.h,v20.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24.uh = vlsr(v3.uh,r4)
+; CHECK-NEXT: q2 = vcmp.eq(v7.h,v10.h)
+; CHECK-NEXT: q3 = vcmp.gt(v3.uh,v23.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uh = vlsr(v23.uh,r4)
+; CHECK-NEXT: v7 = vmux(q2,v10,v5)
+; CHECK-NEXT: v5 = vmux(q3,v5,v10)
+; CHECK-NEXT: q3 = vcmp.gt(v10.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v26.uh = vlsr(v13.uh,r7)
+; CHECK-NEXT: v7.h = vadd(v4.h,v7.h)
+; CHECK-NEXT: v5.h = vadd(v5.h,v21.h)
+; CHECK-NEXT: q2 = vcmp.eq(v24.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #10
+; CHECK-NEXT: v4.uh = vlsr(v4.uh,r7)
+; CHECK-NEXT: v5.h = vsub(v5.h,v8.h)
+; CHECK-NEXT: v30 = vmux(q3,v28,v10)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uh = vlsr(v7.uh,r7)
+; CHECK-NEXT: v3 = vmux(q0,v26,v27)
+; CHECK-NEXT: q3 = vcmp.eq(v2.h,v10.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vasl(v5.h,r4)
+; CHECK-NEXT: v4 = vmux(q2,v7,v4)
+; CHECK-NEXT: v3 = vor(v30,v3)
+; CHECK-NEXT: q2 = vcmp.eq(v0.h,v10.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v1.h,r4)
+; CHECK-NEXT: v4 = vor(v29,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vor(v3,v1)
+; CHECK-NEXT: v31 = vor(v4,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vmux(q2,v10,v1)
+; CHECK-NEXT: v0 = vmux(q3,v10,v31)
+; CHECK-NEXT: vmem(r1+#0) = v1.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#1) = v0
+; CHECK-NEXT: }
+ %v0 = load <128 x i8>, ptr %a0, align 128
+ %v1 = sitofp <128 x i8> %v0 to <128 x half>
+ store <128 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input
+define void @s8f16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s8f16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = #1
+; CHECK-NEXT: r3:2 = combine(#64,#31)
+; CHECK-NEXT: v1:0.h = vunpack(v0.b)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vsplat(r6)
+; CHECK-NEXT: v4.h = vsplat(r2)
+; CHECK-NEXT: v2.h = vabs(v0.h)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vsplat(r3)
+; CHECK-NEXT: r5:4 = combine(##32768,#5)
+; CHECK-NEXT: r2 = #10
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.h = vsplat(r5)
+; CHECK-NEXT: v5.uh = vcl0(v2.uh)
+; CHECK-NEXT: q3 = vcmp.eq(v0.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vadd(v5.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vasl(v2.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v2.h,v4.h)
+; CHECK-NEXT: v6 = vand(v2,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.uh = vlsr(v2.uh,r4)
+; CHECK-NEXT: q0 = vcmp.eq(v6.h,v1.h)
+; CHECK-NEXT: q1 = vcmp.gt(v2.uh,v7.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4)
+; CHECK-NEXT: v26 = vmux(q0,v1,v3)
+; CHECK-NEXT: v3 = vmux(q1,v3,v1)
+; CHECK-NEXT: q1 = vcmp.gt(v1.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
+; CHECK-NEXT: v3.h = vadd(v3.h,v4.h)
+; CHECK-NEXT: q2 = vcmp.eq(v2.h,v25.h)
+; CHECK-NEXT: v30 = vmux(q1,v8,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6)
+; CHECK-NEXT: v28.h = vsub(v3.h,v5.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vasl(v28.h,r2)
+; CHECK-NEXT: v3 = vmux(q2,v29,v27)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vor(v30,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v3,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v1,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i8>, ptr %a0, align 128
+ %v1 = sitofp <64 x i8> %v0 to <64 x half>
+ store <64 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; s8 -> f32
+; No widening
+define void @s8f32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s8f32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##.LCPI2_0,#8)
+; CHECK-NEXT: v3:2.h = vunpack(v1.b)
+; CHECK-NEXT: v1.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: r7 = #512
+; CHECK-NEXT: r4 = #255
+; CHECK-NEXT: v3 = vmem(r3+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vsplat(r0)
+; CHECK-NEXT: v13 = vsplat(r7)
+; CHECK-NEXT: v4 = vdelta(v1,v3)
+; CHECK-NEXT: v0 = vxor(v0,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v10 = vsplat(r4)
+; CHECK-NEXT: r6 = ##-2147483648
+; CHECK-NEXT: v3:2.w = vunpack(v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v15 = vsplat(r6)
+; CHECK-NEXT: r5 = #159
+; CHECK-NEXT: v5:4.h = vunpack(v4.b)
+; CHECK-NEXT: v6.w = vabs(v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v17 = vsplat(r5)
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: v8.w = vabs(v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5:4.w = vunpack(v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.uw = vcl0(v6.uw)
+; CHECK-NEXT: v7.w = vabs(v4.w)
+; CHECK-NEXT: v11.w = vabs(v5.w)
+; CHECK-NEXT: q0 = vcmp.gt(v0.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v14.uw = vcl0(v8.uw)
+; CHECK-NEXT: v9.w = vadd(v9.w,v1.w)
+; CHECK-NEXT: v18 = vmux(q0,v15,v0)
+; CHECK-NEXT: q1 = vcmp.gt(v0.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.uw = vcl0(v7.uw)
+; CHECK-NEXT: v14.w = vadd(v14.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v16.uw = vcl0(v11.uw)
+; CHECK-NEXT: v12.w = vadd(v12.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v6.w,v9.w)
+; CHECK-NEXT: v16.w = vadd(v16.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.w = vasl(v7.w,v12.w)
+; CHECK-NEXT: v19 = vand(v6,v13)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.w = vasl(v11.w,v16.w)
+; CHECK-NEXT: v21 = vand(v7,v13)
+; CHECK-NEXT: v31.w = vadd(v7.w,v10.w)
+; CHECK-NEXT: q0 = vcmp.eq(v19.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasl(v8.w,v14.w)
+; CHECK-NEXT: v22.w = vadd(v11.w,v10.w)
+; CHECK-NEXT: q3 = vcmp.eq(v21.w,v0.w)
+; CHECK-NEXT: v24 = vand(v11,v13)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uw = vlsr(v31.uw,r2)
+; CHECK-NEXT: v29 = vmux(q3,v0,v1)
+; CHECK-NEXT: q3 = vcmp.eq(v24.w,v0.w)
+; CHECK-NEXT: q2 = vcmp.gt(v7.uw,v31.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v11.uw,r2)
+; CHECK-NEXT: v27 = vmux(q3,v0,v1)
+; CHECK-NEXT: v19.w = vadd(v23.w,v29.w)
+; CHECK-NEXT: v31 = vmux(q2,v1,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v22.uw,r2)
+; CHECK-NEXT: v13 = vand(v8,v13)
+; CHECK-NEXT: v26 = vmux(q0,v0,v1)
+; CHECK-NEXT: v12.w = vsub(v31.w,v12.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20.uw = vlsr(v7.uw,r2)
+; CHECK-NEXT: q3 = vcmp.eq(v28.w,v30.w)
+; CHECK-NEXT: v28.w = vadd(v30.w,v27.w)
+; CHECK-NEXT: v31 = vmux(q1,v15,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uw = vlsr(v30.uw,r0)
+; CHECK-NEXT: v30.w = vadd(v6.w,v10.w)
+; CHECK-NEXT: q2 = vcmp.eq(v20.w,v23.w)
+; CHECK-NEXT: v10.w = vadd(v8.w,v10.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uw = vlsr(v28.uw,r0)
+; CHECK-NEXT: q0 = vcmp.gt(v8.uw,v10.uw)
+; CHECK-NEXT: v12.w = vadd(v12.w,v17.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uw = vlsr(v23.uw,r0)
+; CHECK-NEXT: v7 = vmux(q3,v7,v29)
+; CHECK-NEXT: q3 = vcmp.eq(v13.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v19.uw = vlsr(v19.uw,r0)
+; CHECK-NEXT: v29 = vmux(q3,v0,v1)
+; CHECK-NEXT: v7 = vor(v31,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uw = vlsr(v30.uw,r2)
+; CHECK-NEXT: v19 = vmux(q2,v19,v23)
+; CHECK-NEXT: q2 = vcmp.gt(v11.uw,v22.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v10.uw = vlsr(v10.uw,r2)
+; CHECK-NEXT: v27 = vmux(q2,v1,v0)
+; CHECK-NEXT: q2 = vcmp.gt(v6.uw,v30.uw)
+; CHECK-NEXT: v28.w = vadd(v25.w,v26.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
+; CHECK-NEXT: v31 = vmux(q2,v1,v0)
+; CHECK-NEXT: v1 = vmux(q0,v1,v0)
+; CHECK-NEXT: v30.w = vadd(v10.w,v29.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24.uw = vlsr(v8.uw,r2)
+; CHECK-NEXT: v1.w = vsub(v1.w,v14.w)
+; CHECK-NEXT: q3 = vcmp.eq(v6.w,v25.w)
+; CHECK-NEXT: v21.w = vsub(v31.w,v9.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.uw = vlsr(v28.uw,r0)
+; CHECK-NEXT: v6.w = vadd(v21.w,v17.w)
+; CHECK-NEXT: v1.w = vadd(v1.w,v17.w)
+; CHECK-NEXT: q0 = vcmp.eq(v24.w,v10.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v22.uw = vlsr(v25.uw,r0)
+; CHECK-NEXT: v13.w = vsub(v27.w,v16.w)
+; CHECK-NEXT: q2 = vcmp.gt(v0.w,v3.w)
+; CHECK-NEXT: v18 = vor(v18,v19)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uw = vlsr(v30.uw,r0)
+; CHECK-NEXT: v8 = vmux(q3,v8,v22)
+; CHECK-NEXT: q3 = vcmp.gt(v0.w,v2.w)
+; CHECK-NEXT: v26 = vmux(q2,v15,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24.uw = vlsr(v10.uw,r0)
+; CHECK-NEXT: v25.w = vadd(v13.w,v17.w)
+; CHECK-NEXT: v27 = vmux(q3,v15,v0)
+; CHECK-NEXT: v8 = vor(v26,v8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v6.w,r4)
+; CHECK-NEXT: v9 = vmux(q0,v23,v24)
+; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w)
+; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r4)
+; CHECK-NEXT: v9 = vor(v27,v9)
+; CHECK-NEXT: v6 = vor(v8,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.w = vasl(v12.w,r4)
+; CHECK-NEXT: v1 = vor(v9,v1)
+; CHECK-NEXT: v29 = vmux(q2,v0,v6)
+; CHECK-NEXT: vmem(r1+#1) = v29.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.w = vasl(v25.w,r4)
+; CHECK-NEXT: v1 = vmux(q3,v0,v1)
+; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w)
+; CHECK-NEXT: vmem(r1+#0) = v1.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vor(v7,v28)
+; CHECK-NEXT: v31 = vor(v18,v12)
+; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vmux(q2,v0,v30)
+; CHECK-NEXT: v0 = vmux(q3,v0,v31)
+; CHECK-NEXT: vmem(r1+#3) = v2.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#2) = v0
+; CHECK-NEXT: }
+ %v0 = load <128 x i8>, ptr %a0, align 128
+ %v1 = sitofp <128 x i8> %v0 to <128 x float>
+ store <128 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input #1
+define void @s8f32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s8f32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: v3:2.h = vunpack(v0.b)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vsplat(r0)
+; CHECK-NEXT: r3:2 = combine(##255,#8)
+; CHECK-NEXT: r6 = #512
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r3)
+; CHECK-NEXT: v3:2.w = vunpack(v2.h)
+; CHECK-NEXT: v22 = vxor(v22,v22)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v10 = vsplat(r6)
+; CHECK-NEXT: r7 = ##-2147483648
+; CHECK-NEXT: r5 = #159
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9 = vsplat(r7)
+; CHECK-NEXT: v4.w = vabs(v2.w)
+; CHECK-NEXT: v5.w = vabs(v3.w)
+; CHECK-NEXT: q0 = vcmp.gt(v22.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12 = vsplat(r5)
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: v11 = vmux(q0,v9,v22)
+; CHECK-NEXT: q0 = vcmp.gt(v22.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vcl0(v4.uw)
+; CHECK-NEXT: v30 = vmux(q0,v9,v22)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.uw = vcl0(v5.uw)
+; CHECK-NEXT: v6.w = vadd(v6.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vadd(v8.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vasl(v4.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vasl(v5.w,v8.w)
+; CHECK-NEXT: v13 = vand(v4,v10)
+; CHECK-NEXT: v14.w = vadd(v4.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v10 = vand(v5,v10)
+; CHECK-NEXT: v7.w = vadd(v5.w,v7.w)
+; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v14.uw)
+; CHECK-NEXT: q1 = vcmp.eq(v13.w,v22.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v14.uw = vlsr(v14.uw,r2)
+; CHECK-NEXT: q3 = vcmp.eq(v10.w,v22.w)
+; CHECK-NEXT: v25 = vmux(q2,v1,v22)
+; CHECK-NEXT: q2 = vcmp.gt(v5.uw,v7.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2)
+; CHECK-NEXT: v26 = vmux(q1,v22,v1)
+; CHECK-NEXT: v27 = vmux(q3,v22,v1)
+; CHECK-NEXT: v1 = vmux(q2,v1,v22)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24.uw = vlsr(v5.uw,r2)
+; CHECK-NEXT: v5.w = vadd(v14.w,v26.w)
+; CHECK-NEXT: v29.w = vadd(v7.w,v27.w)
+; CHECK-NEXT: v6.w = vsub(v25.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uw = vlsr(v4.uw,r2)
+; CHECK-NEXT: v1.w = vsub(v1.w,v8.w)
+; CHECK-NEXT: v6.w = vadd(v6.w,v12.w)
+; CHECK-NEXT: q3 = vcmp.eq(v24.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v14.uw,r0)
+; CHECK-NEXT: v1.w = vadd(v1.w,v12.w)
+; CHECK-NEXT: q1 = vcmp.eq(v23.w,v14.w)
+; CHECK-NEXT: q2 = vcmp.eq(v3.w,v22.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vlsr(v5.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uw = vlsr(v7.uw,r0)
+; CHECK-NEXT: v5 = vmux(q1,v5,v28)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vlsr(v29.uw,r0)
+; CHECK-NEXT: v5 = vor(v11,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v6.w,r4)
+; CHECK-NEXT: v4 = vmux(q3,v4,v7)
+; CHECK-NEXT: q3 = vcmp.eq(v2.w,v22.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r4)
+; CHECK-NEXT: v4 = vor(v30,v4)
+; CHECK-NEXT: v31 = vor(v5,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vor(v4,v1)
+; CHECK-NEXT: v0 = vmux(q3,v22,v31)
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vmux(q2,v22,v1)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#1) = v1.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i8>, ptr %a0, align 128
+ %v1 = sitofp <64 x i8> %v0 to <64 x float>
+ store <64 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input #2
+define void @s8f32_2(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s8f32_2:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: r3 = #512
+; CHECK-NEXT: v1:0.h = vunpack(v0.b)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r0)
+; CHECK-NEXT: v4 = vsplat(r3)
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v1:0.w = vunpack(v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vsplat(r2)
+; CHECK-NEXT: v8 = vsplat(r4)
+; CHECK-NEXT: v5.w = vabs(v0.w)
+; CHECK-NEXT: q2 = vcmp.gt(v3.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r7)
+; CHECK-NEXT: r2 = #23
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vcl0(v5.uw)
+; CHECK-NEXT: v30 = vmux(q2,v7,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vadd(v6.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vasl(v5.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vadd(v5.w,v1.w)
+; CHECK-NEXT: v4 = vand(v5,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
+; CHECK-NEXT: q0 = vcmp.eq(v4.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v5.uw,v1.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
+; CHECK-NEXT: v4 = vmux(q0,v3,v2)
+; CHECK-NEXT: v2 = vmux(q1,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vadd(v1.w,v4.w)
+; CHECK-NEXT: v2.w = vsub(v2.w,v6.w)
+; CHECK-NEXT: q3 = vcmp.eq(v5.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v1.uw,r0)
+; CHECK-NEXT: v2.w = vadd(v2.w,v8.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uw = vlsr(v4.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vasl(v2.w,r2)
+; CHECK-NEXT: v1 = vmux(q3,v29,v28)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vor(v30,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v1,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v3,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x i8>, ptr %a0, align 128
+ %v1 = sitofp <32 x i8> %v0 to <32 x float>
+ store <32 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; s16 -> f16
+; No widening
+define void @s16f16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s16f16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = #1
+; CHECK-NEXT: r3:2 = combine(#64,#31)
+; CHECK-NEXT: v1.h = vabs(v0.h)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vsplat(r6)
+; CHECK-NEXT: v5.h = vsplat(r2)
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vsplat(r3)
+; CHECK-NEXT: r5:4 = combine(##32768,#5)
+; CHECK-NEXT: v4.uh = vcl0(v1.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.h = vsplat(r5)
+; CHECK-NEXT: r2 = #10
+; CHECK-NEXT: v4.h = vadd(v4.h,v3.h)
+; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v1.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v1.h,v5.h)
+; CHECK-NEXT: v6 = vand(v1,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.uh = vlsr(v1.uh,r4)
+; CHECK-NEXT: q0 = vcmp.eq(v6.h,v2.h)
+; CHECK-NEXT: q1 = vcmp.gt(v1.uh,v7.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uh = vlsr(v7.uh,r4)
+; CHECK-NEXT: v26 = vmux(q0,v2,v3)
+; CHECK-NEXT: v3 = vmux(q1,v3,v2)
+; CHECK-NEXT: q1 = vcmp.gt(v2.h,v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
+; CHECK-NEXT: v3.h = vadd(v3.h,v5.h)
+; CHECK-NEXT: q2 = vcmp.eq(v1.h,v25.h)
+; CHECK-NEXT: v30 = vmux(q1,v8,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uh = vlsr(v25.uh,r6)
+; CHECK-NEXT: v28.h = vsub(v3.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uh = vlsr(v7.uh,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v28.h,r2)
+; CHECK-NEXT: v3 = vmux(q2,v29,v27)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vor(v30,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v3,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v2,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i16>, ptr %a0, align 128
+ %v1 = sitofp <64 x i16> %v0 to <64 x half>
+ store <64 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @s16f16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s16f16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = #1
+; CHECK-NEXT: r3:2 = combine(#31,#64)
+; CHECK-NEXT: v1.h = vabs(v0.h)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r7)
+; CHECK-NEXT: v5.h = vsplat(r3)
+; CHECK-NEXT: r6 = #5
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vsplat(r2)
+; CHECK-NEXT: r4 = ##32768
+; CHECK-NEXT: v4.uh = vcl0(v1.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.h = vsplat(r4)
+; CHECK-NEXT: r3 = #10
+; CHECK-NEXT: q2 = vcmp.gt(v3.h,v0.h)
+; CHECK-NEXT: v4.h = vadd(v4.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vmux(q2,v8,v3)
+; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v1.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v1.h,v5.h)
+; CHECK-NEXT: v6 = vand(v1,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.uh = vlsr(v1.uh,r6)
+; CHECK-NEXT: q1 = vcmp.eq(v6.h,v3.h)
+; CHECK-NEXT: q0 = vcmp.gt(v1.uh,v7.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uh = vlsr(v7.uh,r6)
+; CHECK-NEXT: v26 = vmux(q1,v3,v2)
+; CHECK-NEXT: v2 = vmux(q0,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v25.h,v26.h)
+; CHECK-NEXT: v2.h = vadd(v2.h,v5.h)
+; CHECK-NEXT: q3 = vcmp.eq(v1.h,v25.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uh = vlsr(v25.uh,r7)
+; CHECK-NEXT: v28.h = vsub(v2.h,v4.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uh = vlsr(v7.uh,r7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v28.h,r3)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v2 = vmux(q3,v29,v27)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vor(v30,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v3,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x i16>, ptr %a0, align 128
+ %v1 = sitofp <32 x i16> %v0 to <32 x half>
+ store <32 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; s16 -> f32
+; No widening
+define void @s16f32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s16f32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##.LCPI7_0
+; CHECK-NEXT: v1:0.w = vunpack(v2.h)
+; CHECK-NEXT: v2.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: v4.w = vabs(v0.w)
+; CHECK-NEXT: v1 = vmem(r2+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r0)
+; CHECK-NEXT: r5:4 = combine(##255,#8)
+; CHECK-NEXT: v1 = vdelta(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20 = vsplat(r5)
+; CHECK-NEXT: r7 = #512
+; CHECK-NEXT: v6.uw = vcl0(v4.uw)
+; CHECK-NEXT: v10 = vxor(v10,v10)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r7)
+; CHECK-NEXT: r6 = #159
+; CHECK-NEXT: r5 = ##-2147483648
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23 = vsplat(r6)
+; CHECK-NEXT: v3:2.w = vunpack(v1.h)
+; CHECK-NEXT: v19.w = vadd(v6.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28 = vsplat(r5)
+; CHECK-NEXT: v3.w = vabs(v2.w)
+; CHECK-NEXT: q0 = vcmp.gt(v10.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vasl(v4.w,v19.w)
+; CHECK-NEXT: v29 = vmux(q0,v28,v10)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.uw = vcl0(v3.uw)
+; CHECK-NEXT: v9.w = vadd(v4.w,v20.w)
+; CHECK-NEXT: v11 = vand(v4,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.uw = vlsr(v4.uw,r4)
+; CHECK-NEXT: v8.w = vadd(v8.w,v5.w)
+; CHECK-NEXT: q2 = vcmp.gt(v4.uw,v9.uw)
+; CHECK-NEXT: q1 = vcmp.eq(v11.w,v10.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v21.uw = vlsr(v9.uw,r4)
+; CHECK-NEXT: v9 = vmux(q2,v5,v10)
+; CHECK-NEXT: v22 = vmux(q1,v10,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v3.w,v8.w)
+; CHECK-NEXT: v4.w = vadd(v21.w,v22.w)
+; CHECK-NEXT: v1.w = vsub(v9.w,v19.w)
+; CHECK-NEXT: q1 = vcmp.eq(v12.w,v21.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uw = vlsr(v21.uw,r0)
+; CHECK-NEXT: v6.w = vadd(v3.w,v20.w)
+; CHECK-NEXT: v7 = vand(v3,v7)
+; CHECK-NEXT: v1.w = vadd(v1.w,v23.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24.uw = vlsr(v3.uw,r4)
+; CHECK-NEXT: q2 = vcmp.eq(v7.w,v10.w)
+; CHECK-NEXT: q3 = vcmp.gt(v3.uw,v6.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uw = vlsr(v6.uw,r4)
+; CHECK-NEXT: v26 = vmux(q2,v10,v5)
+; CHECK-NEXT: v5 = vmux(q3,v5,v10)
+; CHECK-NEXT: q3 = vcmp.gt(v10.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vlsr(v4.uw,r0)
+; CHECK-NEXT: v6.w = vadd(v25.w,v26.w)
+; CHECK-NEXT: v5.w = vsub(v5.w,v8.w)
+; CHECK-NEXT: q2 = vcmp.eq(v24.w,v25.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: v3.uw = vlsr(v25.uw,r0)
+; CHECK-NEXT: v5.w = vadd(v5.w,v23.w)
+; CHECK-NEXT: v30 = vmux(q3,v28,v10)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r0)
+; CHECK-NEXT: v4 = vmux(q1,v4,v27)
+; CHECK-NEXT: q3 = vcmp.eq(v2.w,v10.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vasl(v5.w,r4)
+; CHECK-NEXT: v3 = vmux(q2,v6,v3)
+; CHECK-NEXT: v4 = vor(v30,v4)
+; CHECK-NEXT: q2 = vcmp.eq(v0.w,v10.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r4)
+; CHECK-NEXT: v3 = vor(v29,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vor(v4,v1)
+; CHECK-NEXT: v31 = vor(v3,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vmux(q2,v10,v1)
+; CHECK-NEXT: v0 = vmux(q3,v10,v31)
+; CHECK-NEXT: vmem(r1+#0) = v1.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#1) = v0
+; CHECK-NEXT: }
+ %v0 = load <64 x i16>, ptr %a0, align 128
+ %v1 = sitofp <64 x i16> %v0 to <64 x float>
+ store <64 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input
+define void @s16f32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s16f32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: v1:0.w = vunpack(v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r0)
+; CHECK-NEXT: v4 = vsplat(r2)
+; CHECK-NEXT: r3 = #512
+; CHECK-NEXT: v2.w = vabs(v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r3)
+; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
+; CHECK-NEXT: v1 = vxor(v1,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v5.uw = vcl0(v2.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: v29 = vsplat(r7)
+; CHECK-NEXT: q2 = vcmp.gt(v1.w,v0.w)
+; CHECK-NEXT: v5.w = vadd(v5.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #23
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vasl(v2.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vadd(v2.w,v4.w)
+; CHECK-NEXT: v6 = vand(v2,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.uw = vlsr(v2.uw,r6)
+; CHECK-NEXT: q0 = vcmp.eq(v6.w,v1.w)
+; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v4.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vlsr(v4.uw,r6)
+; CHECK-NEXT: v6 = vmux(q0,v1,v3)
+; CHECK-NEXT: v3 = vmux(q1,v3,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vadd(v4.w,v6.w)
+; CHECK-NEXT: v27.w = vsub(v3.w,v5.w)
+; CHECK-NEXT: q3 = vcmp.eq(v2.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v4.uw,r0)
+; CHECK-NEXT: v2.w = vadd(v27.w,v7.w)
+; CHECK-NEXT: v4 = vmux(q2,v29,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vasl(v2.w,r2)
+; CHECK-NEXT: v3 = vmux(q3,v30,v28)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vor(v4,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v3,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v1,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x i16>, ptr %a0, align 128
+ %v1 = sitofp <32 x i16> %v0 to <32 x float>
+ store <32 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; s32 -> f16
+; No widening
+define void @s32f16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s32f16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: r6 = #255
+; CHECK-NEXT: v2.w = vabs(v1.w)
+; CHECK-NEXT: v1.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r2)
+; CHECK-NEXT: r4 = #512
+; CHECK-NEXT: v3.w = vabs(v0.w)
+; CHECK-NEXT: v0.cur = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9 = vsplat(r4)
+; CHECK-NEXT: v8 = vsplat(r6)
+; CHECK-NEXT: v5.uw = vcl0(v2.uw)
+; CHECK-NEXT: v7 = vxor(v7,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v6.uw = vcl0(v3.uw)
+; CHECK-NEXT: v5.w = vadd(v5.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28 = vsplat(r4)
+; CHECK-NEXT: r5 = ##-2147483648
+; CHECK-NEXT: v6.w = vadd(v6.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v13 = vsplat(r5)
+; CHECK-NEXT: v2.w = vasl(v2.w,v5.w)
+; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v3.w,v6.w)
+; CHECK-NEXT: v27 = vmux(q0,v13,v7)
+; CHECK-NEXT: v10.w = vadd(v2.w,v8.w)
+; CHECK-NEXT: v11 = vand(v2,v9)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9 = vand(v3,v9)
+; CHECK-NEXT: q1 = vcmp.eq(v11.w,v7.w)
+; CHECK-NEXT: v8.w = vadd(v3.w,v8.w)
+; CHECK-NEXT: q2 = vcmp.gt(v2.uw,v10.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.uw = vlsr(v2.uw,r3)
+; CHECK-NEXT: q3 = vcmp.eq(v9.w,v7.w)
+; CHECK-NEXT: v23 = vmux(q1,v7,v4)
+; CHECK-NEXT: q1 = vcmp.gt(v3.uw,v8.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.uw = vlsr(v10.uw,r3)
+; CHECK-NEXT: v25 = vmux(q3,v7,v4)
+; CHECK-NEXT: v24 = vmux(q2,v4,v7)
+; CHECK-NEXT: v4 = vmux(q1,v4,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3)
+; CHECK-NEXT: v9.w = vadd(v2.w,v23.w)
+; CHECK-NEXT: v5.w = vsub(v24.w,v5.w)
+; CHECK-NEXT: v4.w = vsub(v4.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.uw = vlsr(v3.uw,r3)
+; CHECK-NEXT: v26.w = vadd(v8.w,v25.w)
+; CHECK-NEXT: q3 = vcmp.eq(v12.w,v2.w)
+; CHECK-NEXT: v5.w = vadd(v5.w,v28.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = #23
+; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2)
+; CHECK-NEXT: q2 = vcmp.eq(v3.w,v8.w)
+; CHECK-NEXT: v4.w = vadd(v4.w,v28.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uw = vlsr(v26.uw,r2)
+; CHECK-NEXT: v2 = vmux(q3,v9,v2)
+; CHECK-NEXT: q3 = vcmp.gt(v7.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.uw = vlsr(v8.uw,r2)
+; CHECK-NEXT: v30 = vmux(q3,v13,v7)
+; CHECK-NEXT: v2 = vor(v27,v2)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vasl(v5.w,r3)
+; CHECK-NEXT: v3 = vmux(q2,v29,v3)
+; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v4.w,r3)
+; CHECK-NEXT: v31 = vor(v30,v3)
+; CHECK-NEXT: v2 = vor(v2,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vor(v31,v3)
+; CHECK-NEXT: v2 = vmux(q2,v7,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v7,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.qf32 = vadd(v2.sf,v7.sf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v7.sf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.hf = v3:2.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vdeal(v0.h)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i32>, ptr %a0, align 128
+ %v1 = sitofp <64 x i32> %v0 to <64 x half>
+ store <64 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result
+define void @s32f16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s32f16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = #1
+; CHECK-NEXT: v1.w = vabs(v0.w)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r6)
+; CHECK-NEXT: r3:2 = combine(##255,#8)
+; CHECK-NEXT: r4 = #512
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r3)
+; CHECK-NEXT: v6 = vsplat(r4)
+; CHECK-NEXT: v4.uw = vcl0(v1.uw)
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = #159
+; CHECK-NEXT: r4 = ##-2147483648
+; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28 = vsplat(r5)
+; CHECK-NEXT: v29 = vsplat(r4)
+; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = #23
+; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
+; CHECK-NEXT: v31 = vmux(q3,v29,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
+; CHECK-NEXT: v6 = vand(v1,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uw = vlsr(v1.uw,r2)
+; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v1.uw = vlsr(v5.uw,r2)
+; CHECK-NEXT: v27 = vmux(q0,v3,v2)
+; CHECK-NEXT: v2 = vmux(q1,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v5.w = vadd(v1.w,v27.w)
+; CHECK-NEXT: v2.w = vsub(v2.w,v4.w)
+; CHECK-NEXT: q2 = vcmp.eq(v7.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
+; CHECK-NEXT: v2.w = vadd(v2.w,v28.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v5.uw,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vasl(v2.w,r3)
+; CHECK-NEXT: v1 = vmux(q2,v30,v1)
+; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vor(v31,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.qf32 = vadd(v3.sf,v3.sf)
+; CHECK-NEXT: v0 = vor(v1,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v3,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v3.sf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.hf = v1:0.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vdeal(v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x i32>, ptr %a0, align 128
+ %v1 = sitofp <32 x i32> %v0 to <32 x half>
+ store <32 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; s32 -> f32
+; No widening
+define void @s32f32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s32f32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: v1.w = vabs(v0.w)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r0)
+; CHECK-NEXT: v5 = vsplat(r2)
+; CHECK-NEXT: r3 = #512
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r3)
+; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
+; CHECK-NEXT: v4.uw = vcl0(v1.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v4.w = vadd(v4.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: v29 = vsplat(r7)
+; CHECK-NEXT: r2 = #23
+; CHECK-NEXT: q2 = vcmp.gt(v2.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
+; CHECK-NEXT: v6 = vand(v1,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
+; CHECK-NEXT: q0 = vcmp.eq(v6.w,v2.w)
+; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
+; CHECK-NEXT: v6 = vmux(q0,v2,v3)
+; CHECK-NEXT: v3 = vmux(q1,v3,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vadd(v5.w,v6.w)
+; CHECK-NEXT: v27.w = vsub(v3.w,v4.w)
+; CHECK-NEXT: q3 = vcmp.eq(v1.w,v5.w)
+; CHECK-NEXT: v4 = vmux(q2,v29,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0)
+; CHECK-NEXT: v1.w = vadd(v27.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r2)
+; CHECK-NEXT: v3 = vmux(q3,v30,v28)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vor(v4,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v3,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v2,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x i32>, ptr %a0, align 128
+ %v1 = sitofp <32 x i32> %v0 to <32 x float>
+ store <32 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @s32f32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: s32f32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: v1.w = vabs(v0.w)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r0)
+; CHECK-NEXT: v5 = vsplat(r2)
+; CHECK-NEXT: r3 = #512
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r3)
+; CHECK-NEXT: r7:6 = combine(##-2147483648,#8)
+; CHECK-NEXT: v4.uw = vcl0(v1.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: v29 = vsplat(r7)
+; CHECK-NEXT: r3 = #23
+; CHECK-NEXT: q3 = vcmp.gt(v3.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v1.w = vasl(v1.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vadd(v1.w,v5.w)
+; CHECK-NEXT: v6 = vand(v1,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.uw = vlsr(v1.uw,r6)
+; CHECK-NEXT: q0 = vcmp.eq(v6.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v1.uw,v5.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vlsr(v5.uw,r6)
+; CHECK-NEXT: v6 = vmux(q0,v3,v2)
+; CHECK-NEXT: v2 = vmux(q1,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vadd(v5.w,v6.w)
+; CHECK-NEXT: v27.w = vsub(v2.w,v4.w)
+; CHECK-NEXT: q2 = vcmp.eq(v1.w,v5.w)
+; CHECK-NEXT: v4 = vmux(q3,v29,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v5.uw,r0)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v1.w = vadd(v27.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v6.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r3)
+; CHECK-NEXT: v2 = vmux(q2,v30,v28)
+; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vor(v4,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v3,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <16 x i32>, ptr %a0, align 128
+ %v1 = sitofp <16 x i32> %v0 to <16 x float>
+ store <16 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; u8 -> f16
+; No widening
+define void @u8f16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u8f16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##.LCPI13_0
+; CHECK-NEXT: v1:0.uh = vunpack(v2.ub)
+; CHECK-NEXT: v2.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: v1 = vmem(r2+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vsplat(r0)
+; CHECK-NEXT: r7:6 = combine(#31,#5)
+; CHECK-NEXT: r4 = #64
+; CHECK-NEXT: v1 = vdelta(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vsplat(r4)
+; CHECK-NEXT: v6.h = vsplat(r7)
+; CHECK-NEXT: v4.uh = vcl0(v0.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #10
+; CHECK-NEXT: v19:18.uh = vunpack(v1.ub)
+; CHECK-NEXT: v17.h = vadd(v4.h,v3.h)
+; CHECK-NEXT: v8 = vxor(v8,v8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v19.h = vasl(v0.h,v17.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uh = vcl0(v18.uh)
+; CHECK-NEXT: v9.h = vadd(v19.h,v6.h)
+; CHECK-NEXT: v10 = vand(v19,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.uh = vlsr(v19.uh,r6)
+; CHECK-NEXT: v5.h = vadd(v5.h,v3.h)
+; CHECK-NEXT: q0 = vcmp.eq(v10.h,v8.h)
+; CHECK-NEXT: q1 = vcmp.gt(v19.uh,v9.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v21.uh = vlsr(v9.uh,r6)
+; CHECK-NEXT: v13 = vmux(q1,v3,v8)
+; CHECK-NEXT: v22 = vmux(q0,v8,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20.h = vasl(v18.h,v5.h)
+; CHECK-NEXT: v9.h = vadd(v21.h,v22.h)
+; CHECK-NEXT: v13.h = vadd(v13.h,v6.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uh = vlsr(v21.uh,r0)
+; CHECK-NEXT: v12.h = vadd(v20.h,v6.h)
+; CHECK-NEXT: v7 = vand(v20,v7)
+; CHECK-NEXT: v2.h = vsub(v13.h,v17.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v14.uh = vlsr(v20.uh,r6)
+; CHECK-NEXT: q3 = vcmp.eq(v7.h,v8.h)
+; CHECK-NEXT: q2 = vcmp.gt(v20.uh,v12.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v15.uh = vlsr(v12.uh,r6)
+; CHECK-NEXT: v24 = vmux(q3,v8,v3)
+; CHECK-NEXT: v3 = vmux(q2,v3,v8)
+; CHECK-NEXT: q3 = vcmp.eq(v11.h,v21.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uh = vlsr(v9.uh,r0)
+; CHECK-NEXT: v3.h = vadd(v3.h,v6.h)
+; CHECK-NEXT: v26.h = vadd(v15.h,v24.h)
+; CHECK-NEXT: q2 = vcmp.eq(v14.h,v15.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uh = vlsr(v15.uh,r0)
+; CHECK-NEXT: v3.h = vsub(v3.h,v5.h)
+; CHECK-NEXT: v29 = vmux(q3,v23,v25)
+; CHECK-NEXT: q3 = vcmp.eq(v18.h,v8.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uh = vlsr(v26.uh,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vasl(v2.h,r4)
+; CHECK-NEXT: v1 = vmux(q2,v28,v27)
+; CHECK-NEXT: q2 = vcmp.eq(v0.h,v8.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vasl(v3.h,r4)
+; CHECK-NEXT: v2 = vor(v29,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vor(v1,v3)
+; CHECK-NEXT: v31 = vmux(q2,v8,v2)
+; CHECK-NEXT: vmem(r1+#0) = v31.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v8,v30)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#1) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <128 x i8>, ptr %a0, align 128
+ %v1 = uitofp <128 x i8> %v0 to <128 x half>
+ store <128 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input
+define void @u8f16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u8f16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = #1
+; CHECK-NEXT: r3:2 = combine(#64,#31)
+; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vsplat(r6)
+; CHECK-NEXT: v4.h = vsplat(r2)
+; CHECK-NEXT: r5 = #5
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r3)
+; CHECK-NEXT: r4 = #10
+; CHECK-NEXT: v3.uh = vcl0(v0.uh)
+; CHECK-NEXT: q3 = vcmp.eq(v0.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.h = vadd(v3.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vasl(v0.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
+; CHECK-NEXT: v5 = vand(v6,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5)
+; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh)
+; CHECK-NEXT: q1 = vcmp.eq(v5.h,v2.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v26.uh = vlsr(v7.uh,r5)
+; CHECK-NEXT: v27 = vmux(q1,v2,v1)
+; CHECK-NEXT: v1 = vmux(q0,v1,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vadd(v1.h,v4.h)
+; CHECK-NEXT: v28.h = vadd(v26.h,v27.h)
+; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uh = vlsr(v26.uh,r6)
+; CHECK-NEXT: v1.h = vsub(v1.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uh = vlsr(v28.uh,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v1.h,r4)
+; CHECK-NEXT: v3 = vmux(q2,v30,v29)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v3,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v2,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i8>, ptr %a0, align 128
+ %v1 = uitofp <64 x i8> %v0 to <64 x half>
+ store <64 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; u8 -> f32
+; No widening
+define void @u8f32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u8f32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##.LCPI15_0,#8)
+; CHECK-NEXT: v3:2.uh = vunpack(v1.ub)
+; CHECK-NEXT: v1.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: r6 = #512
+; CHECK-NEXT: r7 = #255
+; CHECK-NEXT: v3 = vmem(r3+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vsplat(r0)
+; CHECK-NEXT: v16 = vsplat(r6)
+; CHECK-NEXT: v3 = vdelta(v1,v3)
+; CHECK-NEXT: v0 = vxor(v0,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v10 = vsplat(r7)
+; CHECK-NEXT: r5 = #159
+; CHECK-NEXT: v5:4.uw = vunpack(v2.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v19 = vsplat(r5)
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: v31:30.uh = vunpack(v3.ub)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vcl0(v4.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3:2.uw = vunpack(v30.uh)
+; CHECK-NEXT: v6.w = vadd(v6.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uw = vcl0(v5.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.uw = vcl0(v2.uw)
+; CHECK-NEXT: v7.w = vadd(v7.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.uw = vcl0(v3.uw)
+; CHECK-NEXT: v11.w = vadd(v11.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasl(v4.w,v6.w)
+; CHECK-NEXT: v12.w = vadd(v12.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.w = vasl(v5.w,v7.w)
+; CHECK-NEXT: v20 = vand(v8,v16)
+; CHECK-NEXT: v17.w = vadd(v8.w,v10.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v18.w = vasl(v2.w,v11.w)
+; CHECK-NEXT: v22 = vand(v9,v16)
+; CHECK-NEXT: q1 = vcmp.eq(v20.w,v0.w)
+; CHECK-NEXT: v13.w = vadd(v9.w,v10.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v21.w = vasl(v3.w,v12.w)
+; CHECK-NEXT: v28.w = vadd(v18.w,v10.w)
+; CHECK-NEXT: q2 = vcmp.eq(v22.w,v0.w)
+; CHECK-NEXT: v25 = vand(v18,v16)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29 = vmux(q1,v0,v1)
+; CHECK-NEXT: v24 = vmux(q2,v0,v1)
+; CHECK-NEXT: v16 = vand(v21,v16)
+; CHECK-NEXT: q1 = vcmp.eq(v25.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v26.uw = vlsr(v28.uw,r2)
+; CHECK-NEXT: v10.w = vadd(v21.w,v10.w)
+; CHECK-NEXT: q2 = vcmp.gt(v18.uw,v28.uw)
+; CHECK-NEXT: q3 = vcmp.eq(v16.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20.uw = vlsr(v18.uw,r2)
+; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v13.uw)
+; CHECK-NEXT: v18 = vmux(q2,v1,v0)
+; CHECK-NEXT: v30 = vmux(q1,v0,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v15.uw = vlsr(v13.uw,r2)
+; CHECK-NEXT: q2 = vcmp.gt(v8.uw,v17.uw)
+; CHECK-NEXT: v13.w = vadd(v26.w,v30.w)
+; CHECK-NEXT: v27 = vmux(q3,v0,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uw = vlsr(v17.uw,r2)
+; CHECK-NEXT: v30 = vmux(q0,v1,v0)
+; CHECK-NEXT: q3 = vcmp.gt(v21.uw,v10.uw)
+; CHECK-NEXT: v11.w = vsub(v18.w,v11.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uw = vlsr(v10.uw,r2)
+; CHECK-NEXT: v7.w = vsub(v30.w,v7.w)
+; CHECK-NEXT: v22.w = vadd(v23.w,v29.w)
+; CHECK-NEXT: v29.w = vadd(v15.w,v24.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v16.uw = vlsr(v21.uw,r2)
+; CHECK-NEXT: v21 = vmux(q2,v1,v0)
+; CHECK-NEXT: v31.w = vadd(v25.w,v27.w)
+; CHECK-NEXT: v1 = vmux(q3,v1,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v14.uw = vlsr(v8.uw,r2)
+; CHECK-NEXT: v6.w = vsub(v21.w,v6.w)
+; CHECK-NEXT: v7.w = vadd(v7.w,v19.w)
+; CHECK-NEXT: v1.w = vsub(v1.w,v12.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2)
+; CHECK-NEXT: v6.w = vadd(v6.w,v19.w)
+; CHECK-NEXT: v11.w = vadd(v11.w,v19.w)
+; CHECK-NEXT: v1.w = vadd(v1.w,v19.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v18.uw = vlsr(v31.uw,r0)
+; CHECK-NEXT: q1 = vcmp.eq(v20.w,v26.w)
+; CHECK-NEXT: q0 = vcmp.eq(v16.w,v25.w)
+; CHECK-NEXT: q2 = vcmp.eq(v14.w,v23.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uw = vlsr(v25.uw,r0)
+; CHECK-NEXT: q3 = vcmp.eq(v9.w,v15.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20.uw = vlsr(v22.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.uw = vlsr(v23.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.uw = vlsr(v29.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24.uw = vlsr(v15.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v26.uw,r0)
+; CHECK-NEXT: v26 = vmux(q0,v18,v27)
+; CHECK-NEXT: v8 = vmux(q3,v8,v24)
+; CHECK-NEXT: v27 = vmux(q2,v20,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.w = vasl(v7.w,r4)
+; CHECK-NEXT: q2 = vcmp.eq(v5.w,v0.w)
+; CHECK-NEXT: q3 = vcmp.eq(v4.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v13.uw = vlsr(v13.uw,r0)
+; CHECK-NEXT: v7 = vor(v8,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v6.w,r4)
+; CHECK-NEXT: v25 = vmux(q1,v13,v28)
+; CHECK-NEXT: v29 = vmux(q2,v0,v7)
+; CHECK-NEXT: vmem(r1+#1) = v29.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r4)
+; CHECK-NEXT: v28 = vor(v27,v6)
+; CHECK-NEXT: q2 = vcmp.eq(v3.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.w = vasl(v11.w,r4)
+; CHECK-NEXT: v1 = vor(v26,v1)
+; CHECK-NEXT: v30 = vmux(q3,v0,v28)
+; CHECK-NEXT: vmem(r1+#0) = v30.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v25,v11)
+; CHECK-NEXT: q3 = vcmp.eq(v2.w,v0.w)
+; CHECK-NEXT: v1 = vmux(q2,v0,v1)
+; CHECK-NEXT: vmem(r1+#3) = v1.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v0,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#2) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <128 x i8>, ptr %a0, align 128
+ %v1 = uitofp <128 x i8> %v0 to <128 x float>
+ store <128 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input #1
+define void @u8f32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u8f32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r7 = #1
+; CHECK-NEXT: r6 = #512
+; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r7)
+; CHECK-NEXT: v8 = vsplat(r6)
+; CHECK-NEXT: r3:2 = combine(##255,#8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6 = vsplat(r3)
+; CHECK-NEXT: r5 = #159
+; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v13 = vsplat(r5)
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vcl0(v0.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vcl0(v1.uw)
+; CHECK-NEXT: v4.w = vadd(v4.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vadd(v5.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.w = vasl(v0.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v9.w = vasl(v1.w,v5.w)
+; CHECK-NEXT: v11 = vand(v7,v8)
+; CHECK-NEXT: v10.w = vadd(v7.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vadd(v9.w,v6.w)
+; CHECK-NEXT: q0 = vcmp.eq(v11.w,v3.w)
+; CHECK-NEXT: v8 = vand(v9,v8)
+; CHECK-NEXT: q1 = vcmp.gt(v7.uw,v10.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v19.uw = vlsr(v10.uw,r2)
+; CHECK-NEXT: v21 = vmux(q0,v3,v2)
+; CHECK-NEXT: q3 = vcmp.eq(v8.w,v3.w)
+; CHECK-NEXT: q0 = vcmp.gt(v9.uw,v6.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20.uw = vlsr(v6.uw,r2)
+; CHECK-NEXT: v22 = vmux(q1,v2,v3)
+; CHECK-NEXT: v24 = vmux(q3,v3,v2)
+; CHECK-NEXT: v2 = vmux(q0,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vsub(v22.w,v4.w)
+; CHECK-NEXT: v2.w = vsub(v2.w,v5.w)
+; CHECK-NEXT: v10.w = vadd(v19.w,v21.w)
+; CHECK-NEXT: v25.w = vadd(v20.w,v24.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.uw = vlsr(v7.uw,r2)
+; CHECK-NEXT: v4.w = vadd(v4.w,v13.w)
+; CHECK-NEXT: v2.w = vadd(v2.w,v13.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uw = vlsr(v9.uw,r2)
+; CHECK-NEXT: q2 = vcmp.eq(v12.w,v19.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.uw = vlsr(v19.uw,r7)
+; CHECK-NEXT: q3 = vcmp.eq(v23.w,v20.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uw = vlsr(v10.uw,r7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v26.uw = vlsr(v20.uw,r7)
+; CHECK-NEXT: v5 = vmux(q2,v27,v11)
+; CHECK-NEXT: q2 = vcmp.eq(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v25.uw,r7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vasl(v4.w,r4)
+; CHECK-NEXT: v6 = vmux(q3,v6,v26)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vasl(v2.w,r4)
+; CHECK-NEXT: v29 = vor(v5,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28 = vor(v6,v2)
+; CHECK-NEXT: v31 = vmux(q3,v3,v29)
+; CHECK-NEXT: vmem(r1+#0) = v31.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vmux(q2,v3,v28)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#1) = v30.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i8>, ptr %a0, align 128
+ %v1 = uitofp <64 x i8> %v0 to <64 x float>
+ store <64 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input #2
+define void @u8f32_2(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u8f32_2:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = #1
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: v1:0.uh = vunpack(v0.ub)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vsplat(r6)
+; CHECK-NEXT: v29 = vsplat(r2)
+; CHECK-NEXT: r3 = #512
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r3)
+; CHECK-NEXT: r5:4 = combine(##159,#8)
+; CHECK-NEXT: v5:4.uw = vunpack(v0.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r5)
+; CHECK-NEXT: q3 = vcmp.eq(v4.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vcl0(v4.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.w = vadd(v5.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v4.w,v5.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.w = vadd(v6.w,v29.w)
+; CHECK-NEXT: v3 = vand(v6,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4)
+; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v0.uw)
+; CHECK-NEXT: q1 = vcmp.eq(v3.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: v0.uw = vlsr(v0.uw,r4)
+; CHECK-NEXT: v3 = vmux(q1,v2,v1)
+; CHECK-NEXT: v1 = vmux(q0,v1,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v1.w,v5.w)
+; CHECK-NEXT: v3.w = vadd(v0.w,v3.w)
+; CHECK-NEXT: q2 = vcmp.eq(v6.w,v0.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v0.uw,r6)
+; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.uw = vlsr(v3.uw,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r4)
+; CHECK-NEXT: v0 = vmux(q2,v31,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vor(v0,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v2,v0)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x i8>, ptr %a0, align 128
+ %v1 = uitofp <32 x i8> %v0 to <32 x float>
+ store <32 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; u16 -> f16
+; No widening
+define void @u16f16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u16f16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(#64,#1)
+; CHECK-NEXT: r5 = #31
+; CHECK-NEXT: v1.uh = vcl0(v0.uh)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r2)
+; CHECK-NEXT: v5.h = vsplat(r3)
+; CHECK-NEXT: r4 = #5
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.h = vsplat(r5)
+; CHECK-NEXT: r3 = #10
+; CHECK-NEXT: v1.h = vadd(v1.h,v2.h)
+; CHECK-NEXT: q3 = vcmp.eq(v0.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vasl(v0.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
+; CHECK-NEXT: v5 = vand(v6,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uh = vlsr(v6.uh,r4)
+; CHECK-NEXT: q0 = vcmp.eq(v5.h,v3.h)
+; CHECK-NEXT: q1 = vcmp.gt(v6.uh,v7.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v26.uh = vlsr(v7.uh,r4)
+; CHECK-NEXT: v27 = vmux(q0,v3,v2)
+; CHECK-NEXT: v2 = vmux(q1,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vadd(v2.h,v4.h)
+; CHECK-NEXT: v28.h = vadd(v26.h,v27.h)
+; CHECK-NEXT: q2 = vcmp.eq(v6.h,v26.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uh = vlsr(v26.uh,r2)
+; CHECK-NEXT: v1.h = vsub(v2.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v1.h,r3)
+; CHECK-NEXT: v2 = vmux(q2,v30,v29)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v3,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i16>, ptr %a0, align 128
+ %v1 = uitofp <64 x i16> %v0 to <64 x half>
+ store <64 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @u16f16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u16f16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(#31,#1)
+; CHECK-NEXT: r6 = #64
+; CHECK-NEXT: v1.uh = vcl0(v0.uh)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vsplat(r2)
+; CHECK-NEXT: v4.h = vsplat(r3)
+; CHECK-NEXT: r5 = #5
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.h = vsplat(r6)
+; CHECK-NEXT: r4 = #10
+; CHECK-NEXT: v1.h = vadd(v1.h,v2.h)
+; CHECK-NEXT: q2 = vcmp.eq(v0.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q3 = vsetq(r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.h = vasl(v0.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.h = vadd(v6.h,v4.h)
+; CHECK-NEXT: v5 = vand(v6,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uh = vlsr(v6.uh,r5)
+; CHECK-NEXT: q1 = vcmp.eq(v5.h,v3.h)
+; CHECK-NEXT: q0 = vcmp.gt(v6.uh,v7.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uh = vlsr(v7.uh,r5)
+; CHECK-NEXT: v5 = vmux(q1,v3,v2)
+; CHECK-NEXT: v2 = vmux(q0,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.h = vadd(v2.h,v4.h)
+; CHECK-NEXT: v28.h = vadd(v7.h,v5.h)
+; CHECK-NEXT: q1 = vcmp.eq(v6.h,v7.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.uh = vlsr(v7.uh,r2)
+; CHECK-NEXT: v1.h = vsub(v2.h,v1.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uh = vlsr(v28.uh,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.h = vasl(v1.h,r4)
+; CHECK-NEXT: v2 = vmux(q1,v30,v29)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v3,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x i16>, ptr %a0, align 128
+ %v1 = uitofp <32 x i16> %v0 to <32 x half>
+ store <32 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; u16 -> f32
+; No widening
+define void @u16f32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u16f32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = ##.LCPI20_0
+; CHECK-NEXT: v1:0.uw = vunpack(v2.uh)
+; CHECK-NEXT: v2.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = #1
+; CHECK-NEXT: v1 = vmem(r2+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r0)
+; CHECK-NEXT: r7:6 = combine(##255,#8)
+; CHECK-NEXT: r4 = #512
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: v6 = vsplat(r7)
+; CHECK-NEXT: v1 = vdelta(v2,v1)
+; CHECK-NEXT: v4.uw = vcl0(v0.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5 = #159
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: v17.w = vadd(v4.w,v3.w)
+; CHECK-NEXT: v8 = vxor(v8,v8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v14 = vsplat(r5)
+; CHECK-NEXT: v19:18.uw = vunpack(v1.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v19.w = vasl(v0.w,v17.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vcl0(v18.uw)
+; CHECK-NEXT: v9.w = vadd(v19.w,v6.w)
+; CHECK-NEXT: v10 = vand(v19,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v11.uw = vlsr(v19.uw,r6)
+; CHECK-NEXT: v5.w = vadd(v5.w,v3.w)
+; CHECK-NEXT: q0 = vcmp.eq(v10.w,v8.w)
+; CHECK-NEXT: q1 = vcmp.gt(v19.uw,v9.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v21.uw = vlsr(v9.uw,r6)
+; CHECK-NEXT: v22 = vmux(q0,v8,v3)
+; CHECK-NEXT: v12 = vmux(q1,v3,v8)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v20.w = vasl(v18.w,v5.w)
+; CHECK-NEXT: v2.w = vsub(v12.w,v17.w)
+; CHECK-NEXT: v9.w = vadd(v21.w,v22.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v25.uw = vlsr(v21.uw,r0)
+; CHECK-NEXT: v6.w = vadd(v20.w,v6.w)
+; CHECK-NEXT: v7 = vand(v20,v7)
+; CHECK-NEXT: v2.w = vadd(v2.w,v14.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v13.uw = vlsr(v20.uw,r6)
+; CHECK-NEXT: q3 = vcmp.eq(v7.w,v8.w)
+; CHECK-NEXT: q2 = vcmp.gt(v20.uw,v6.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v23.uw = vlsr(v6.uw,r6)
+; CHECK-NEXT: v7 = vmux(q3,v8,v3)
+; CHECK-NEXT: v3 = vmux(q2,v3,v8)
+; CHECK-NEXT: q3 = vcmp.eq(v11.w,v21.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v24.uw = vlsr(v9.uw,r0)
+; CHECK-NEXT: v3.w = vsub(v3.w,v5.w)
+; CHECK-NEXT: v26.w = vadd(v23.w,v7.w)
+; CHECK-NEXT: q2 = vcmp.eq(v13.w,v23.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uw = vlsr(v23.uw,r0)
+; CHECK-NEXT: v3.w = vadd(v3.w,v14.w)
+; CHECK-NEXT: v29 = vmux(q3,v24,v25)
+; CHECK-NEXT: q3 = vcmp.eq(v18.w,v8.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v28.uw = vlsr(v26.uw,r0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vasl(v2.w,r4)
+; CHECK-NEXT: v1 = vmux(q2,v28,v27)
+; CHECK-NEXT: q2 = vcmp.eq(v0.w,v8.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v3.w,r4)
+; CHECK-NEXT: v2 = vor(v29,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30 = vor(v1,v3)
+; CHECK-NEXT: v31 = vmux(q2,v8,v2)
+; CHECK-NEXT: vmem(r1+#0) = v31.new
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v8,v30)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#1) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i16>, ptr %a0, align 128
+ %v1 = uitofp <64 x i16> %v0 to <64 x float>
+ store <64 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input
+define void @u16f32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u16f32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r6 = #1
+; CHECK-NEXT: r2 = #255
+; CHECK-NEXT: v1:0.uw = vunpack(v0.uh)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1 = vsplat(r6)
+; CHECK-NEXT: v4 = vsplat(r2)
+; CHECK-NEXT: r3 = #512
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5 = vsplat(r3)
+; CHECK-NEXT: r5:4 = combine(##159,#8)
+; CHECK-NEXT: v3.uw = vcl0(v0.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r5)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v2.w)
+; CHECK-NEXT: v3.w = vadd(v3.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v0.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
+; CHECK-NEXT: v5 = vand(v6,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r4)
+; CHECK-NEXT: q0 = vcmp.gt(v6.uw,v4.uw)
+; CHECK-NEXT: q1 = vcmp.eq(v5.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #23
+; CHECK-NEXT: v4.uw = vlsr(v4.uw,r4)
+; CHECK-NEXT: v5 = vmux(q1,v2,v1)
+; CHECK-NEXT: v1 = vmux(q0,v1,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v1.w,v3.w)
+; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
+; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v4.uw,r6)
+; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.uw = vlsr(v29.uw,r6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r4)
+; CHECK-NEXT: v3 = vmux(q2,v3,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v3,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v2,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x i16>, ptr %a0, align 128
+ %v1 = uitofp <32 x i16> %v0 to <32 x float>
+ store <32 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+; u32 -> f16
+; No widening
+define void @u32f16_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u32f16_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(#8,#1)
+; CHECK-NEXT: r6 = #255
+; CHECK-NEXT: v1.uw = vcl0(v0.uw)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r2)
+; CHECK-NEXT: r4 = #512
+; CHECK-NEXT: v3.uw = vcl0(v2.uw)
+; CHECK-NEXT: v2.cur = vmem(r0+#1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: v6 = vsplat(r6)
+; CHECK-NEXT: v1.w = vadd(v1.w,v4.w)
+; CHECK-NEXT: v3.w = vadd(v3.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v9 = vxor(v9,v9)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v10 = vsplat(r4)
+; CHECK-NEXT: v5.w = vasl(v0.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v8.w = vasl(v2.w,v3.w)
+; CHECK-NEXT: v11.w = vadd(v5.w,v6.w)
+; CHECK-NEXT: v13 = vand(v5,v7)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vadd(v8.w,v6.w)
+; CHECK-NEXT: v7 = vand(v8,v7)
+; CHECK-NEXT: q0 = vcmp.gt(v5.uw,v11.uw)
+; CHECK-NEXT: q1 = vcmp.eq(v13.w,v9.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3)
+; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw)
+; CHECK-NEXT: q2 = vcmp.eq(v7.w,v9.w)
+; CHECK-NEXT: v29 = vmux(q0,v4,v9)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3)
+; CHECK-NEXT: v28 = vmux(q1,v9,v4)
+; CHECK-NEXT: v30 = vmux(q3,v4,v9)
+; CHECK-NEXT: v4 = vmux(q2,v9,v4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v29.w,v1.w)
+; CHECK-NEXT: v7.w = vadd(v27.w,v28.w)
+; CHECK-NEXT: v3.w = vsub(v30.w,v3.w)
+; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3)
+; CHECK-NEXT: v1.w = vadd(v1.w,v10.w)
+; CHECK-NEXT: v3.w = vadd(v3.w,v10.w)
+; CHECK-NEXT: q2 = vcmp.eq(v0.w,v9.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r3 = #23
+; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3)
+; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2)
+; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7.uw = vlsr(v7.uw,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vlsr(v4.uw,r2)
+; CHECK-NEXT: v5 = vmux(q3,v7,v5)
+; CHECK-NEXT: q3 = vcmp.eq(v2.w,v9.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r3)
+; CHECK-NEXT: v31 = vmux(q1,v4,v6)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.w = vasl(v3.w,r3)
+; CHECK-NEXT: v1 = vor(v5,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vor(v31,v3)
+; CHECK-NEXT: v1 = vmux(q2,v9,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v9,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.qf32 = vadd(v1.sf,v9.sf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v9.sf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.hf = v3:2.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vdeal(v0.h)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <64 x i32>, ptr %a0, align 128
+ %v1 = uitofp <64 x i32> %v0 to <64 x half>
+ store <64 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen result
+define void @u32f16_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u32f16_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##512,#1)
+; CHECK-NEXT: v1.uw = vcl0(v0.uw)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3 = vsplat(r2)
+; CHECK-NEXT: v5 = vsplat(r3)
+; CHECK-NEXT: r6 = #255
+; CHECK-NEXT: v2 = vxor(v2,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r5 = #8
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v1.w = vadd(v1.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: r3 = #23
+; CHECK-NEXT: q2 = vcmp.eq(v0.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
+; CHECK-NEXT: v5 = vand(v6,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
+; CHECK-NEXT: q0 = vcmp.eq(v5.w,v2.w)
+; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
+; CHECK-NEXT: v5 = vmux(q0,v2,v3)
+; CHECK-NEXT: v3 = vmux(q1,v3,v2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v3.w,v1.w)
+; CHECK-NEXT: v30.w = vadd(v4.w,v5.w)
+; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31.uw = vlsr(v4.uw,r2)
+; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v3.uw = vlsr(v30.uw,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r3)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v3 = vmux(q1,v3,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.qf32 = vadd(v2.sf,v2.sf)
+; CHECK-NEXT: v0 = vor(v3,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v2,v0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.qf32 = vadd(v0.sf,v2.sf)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.hf = v1:0.qf32
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.h = vdeal(v0.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <32 x i32>, ptr %a0, align 128
+ %v1 = uitofp <32 x i32> %v0 to <32 x half>
+ store <32 x half> %v1, ptr %a1, align 128
+ ret void
+}
+
+; u32 -> f32
+; No widening
+define void @u32f32_0(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u32f32_0:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##512,#1)
+; CHECK-NEXT: v1.uw = vcl0(v0.uw)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r2)
+; CHECK-NEXT: v5 = vsplat(r3)
+; CHECK-NEXT: r6 = #255
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r5 = #8
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v1.w = vadd(v1.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: r3 = #23
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
+; CHECK-NEXT: v5 = vand(v6,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
+; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
+; CHECK-NEXT: v5 = vmux(q0,v3,v2)
+; CHECK-NEXT: v2 = vmux(q1,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v2.w,v1.w)
+; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
+; CHECK-NEXT: q2 = vcmp.eq(v6.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2)
+; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r3)
+; CHECK-NEXT: v2 = vmux(q2,v2,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q3,v3,v31)
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: vmem(r1+#0) = v0.new
+; CHECK-NEXT: }
+ %v0 = load <32 x i32>, ptr %a0, align 128
+ %v1 = uitofp <32 x i32> %v0 to <32 x float>
+ store <32 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+; Widen input and result
+define void @u32f32_1(ptr %a0, ptr %a1) #0 {
+; CHECK-LABEL: u32f32_1:
+; CHECK: .cfi_startproc
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: {
+; CHECK-NEXT: r3:2 = combine(##512,#1)
+; CHECK-NEXT: v1.uw = vcl0(v0.uw)
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2 = vsplat(r2)
+; CHECK-NEXT: v5 = vsplat(r3)
+; CHECK-NEXT: r6 = #255
+; CHECK-NEXT: v3 = vxor(v3,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4 = vsplat(r6)
+; CHECK-NEXT: r5 = #8
+; CHECK-NEXT: r4 = #159
+; CHECK-NEXT: v1.w = vadd(v1.w,v2.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v7 = vsplat(r4)
+; CHECK-NEXT: r3 = #23
+; CHECK-NEXT: q2 = vcmp.eq(v0.w,v3.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.w = vasl(v0.w,v1.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
+; CHECK-NEXT: v5 = vand(v6,v5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v6.uw = vlsr(v6.uw,r5)
+; CHECK-NEXT: q0 = vcmp.eq(v5.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v6.uw,v4.uw)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v4.uw = vlsr(v4.uw,r5)
+; CHECK-NEXT: v5 = vmux(q0,v3,v2)
+; CHECK-NEXT: v2 = vmux(q1,v2,v3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vsub(v2.w,v1.w)
+; CHECK-NEXT: v29.w = vadd(v4.w,v5.w)
+; CHECK-NEXT: q1 = vcmp.eq(v6.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.uw = vlsr(v4.uw,r2)
+; CHECK-NEXT: v1.w = vadd(v1.w,v7.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r2 = #64
+; CHECK-NEXT: v2.uw = vlsr(v29.uw,r2)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1.w = vasl(v1.w,r3)
+; CHECK-NEXT: q3 = vsetq(r2)
+; CHECK-NEXT: v2 = vmux(q1,v2,v30)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31 = vor(v2,v1)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0 = vmux(q2,v3,v31)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jumpr r31
+; CHECK-NEXT: if (q3) vmem(r1+#0) = v0
+; CHECK-NEXT: }
+ %v0 = load <16 x i32>, ptr %a0, align 128
+ %v1 = uitofp <16 x i32> %v0 to <16 x float>
+ store <16 x float> %v1, ptr %a1, align 128
+ ret void
+}
+
+
+attributes #0 = { "target-features"="+v68,+hvxv68,+hvx-length128b,+hvx-qfloat" }
+
diff --git a/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll b/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll
index 726ee8f06171..699d6219ebe7 100644
--- a/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll
+++ b/llvm/test/CodeGen/Hexagon/vector-sint-to-fp.ll
@@ -1,12 +1,10 @@
; RUN: llc -march=hexagon < %s | FileCheck %s
-; Test that code is generated for the vector sint_to_fp node. The compiler
-; asserts with a cannot select message if the node is not expanded. When
-; expanded, the generated code is very inefficient, so iwe need to find a more
-; efficient code sequence to generate.
+; Test that code is generated for the vector sint_to_fp node.
-; CHECK: convert_w2sf
+; The floor builtin is still scalarized.
; CHECK: call floorf
+; CHECK: vmem
target triple = "hexagon"