[llvm] d9ed93d - [Hexagon] Don't lower legal EXTRACT_SUBVECTOR to EXTRACT_SUBREG
Krzysztof Parzyszek via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 24 18:11:12 PST 2022
Author: Krzysztof Parzyszek
Date: 2022-11-24T18:10:42-08:00
New Revision: d9ed93da86a3ffd663f6b19481551855d5cc86be
URL: https://github.com/llvm/llvm-project/commit/d9ed93da86a3ffd663f6b19481551855d5cc86be
DIFF: https://github.com/llvm/llvm-project/commit/d9ed93da86a3ffd663f6b19481551855d5cc86be.diff
LOG: [Hexagon] Don't lower legal EXTRACT_SUBVECTOR to EXTRACT_SUBREG
EXTRACT_SUBREG is a machine opcode and cannot be part of an input
selection pattern. Keep legal EXTRACT_SUBVECTOR nodes intact through
lowering, and map them to the corresponding subregister extracts during
instruction selection instead.
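For example, where the HVX lowering used to create the machine node
directly:

  // before: an EXTRACT_SUBREG machine node, opaque as a pattern input
  V0 = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, VecV);

it now goes through the LoHalf/HiHalf helpers added below, which for
vector types emit a target-independent node (a simplified sketch; the
surrounding lowering context is assumed):

  // after: ISD::EXTRACT_SUBVECTOR at index 0 resp. NumElts/2, which
  // selection patterns can still match as an input
  SDValue Lo = LoHalf(VecV, DAG);
  SDValue Hi = HiHalf(VecV, DAG);

The new SelectExtractSubvector/SelectHvxExtractSubvector then turn such
a node back into the subregister extract at selection time:

  // Idx is either 0 or ResLen, per the asserts in the diff
  unsigned SubReg = Idx == 0 ? Hexagon::vsub_lo : Hexagon::vsub_hi;
  SDValue Ext = DAG.getTargetExtractSubreg(SubReg, SDLoc(N), ResTy, Inp);
  ReplaceNode(N, Ext.getNode());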
Added:
Modified:
llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
llvm/lib/Target/Hexagon/HexagonISelLowering.h
llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 49fd21f7629eb..d4ad9e52eb6e5 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -692,6 +692,24 @@ void HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
SelectCode(N);
}
+void HexagonDAGToDAGISel::SelectExtractSubvector(SDNode *N) {
+ SDValue Inp = N->getOperand(0);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ MVT InpTy = Inp.getValueType().getSimpleVT();
+ assert(InpTy.getVectorElementType() == ResTy.getVectorElementType());
+ unsigned ResLen = ResTy.getVectorNumElements();
+ assert(2 * ResLen == InpTy.getVectorNumElements());
+ assert(ResTy.getSizeInBits() == 32);
+
+ auto IdxN = cast<ConstantSDNode>(N->getOperand(1));
+ unsigned Idx = IdxN->getZExtValue();
+ assert(Idx == 0 || Idx == ResLen);
+ unsigned SubReg = Idx == 0 ? Hexagon::isub_lo : Hexagon::isub_hi;
+ SDValue Ext = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(N), ResTy, Inp);
+
+ ReplaceNode(N, Ext.getNode());
+}
+
//
// Map floating point constant values.
//
@@ -884,6 +902,28 @@ void HexagonDAGToDAGISel::Select(SDNode *N) {
if (N->isMachineOpcode())
return N->setNodeId(-1); // Already selected.
+ auto isHvxOp = [this](SDNode *N) {
+ auto &HST = MF->getSubtarget<HexagonSubtarget>();
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ if (HST.isHVXVectorType(N->getValueType(i), true))
+ return true;
+ }
+ for (SDValue I : N->ops()) {
+ if (HST.isHVXVectorType(I.getValueType(), true))
+ return true;
+ }
+ return false;
+ };
+
+ if (HST->useHVXOps() && isHvxOp(N)) {
+ switch (N->getOpcode()) {
+ case ISD::EXTRACT_SUBVECTOR: return SelectHvxExtractSubvector(N);
+ case ISD::VECTOR_SHUFFLE: return SelectHvxShuffle(N);
+
+ case HexagonISD::VROR: return SelectHvxRor(N);
+ }
+ }
+
switch (N->getOpcode()) {
case ISD::Constant: return SelectConstant(N);
case ISD::ConstantFP: return SelectConstantFP(N);
@@ -893,6 +933,7 @@ void HexagonDAGToDAGISel::Select(SDNode *N) {
case ISD::STORE: return SelectStore(N);
case ISD::INTRINSIC_W_CHAIN: return SelectIntrinsicWChain(N);
case ISD::INTRINSIC_WO_CHAIN: return SelectIntrinsicWOChain(N);
+ case ISD::EXTRACT_SUBVECTOR: return SelectExtractSubvector(N);
case HexagonISD::ADDC:
case HexagonISD::SUBC: return SelectAddSubCarry(N);
@@ -905,13 +946,6 @@ void HexagonDAGToDAGISel::Select(SDNode *N) {
case HexagonISD::V2Q: return SelectV2Q(N);
}
- if (HST->useHVXOps()) {
- switch (N->getOpcode()) {
- case ISD::VECTOR_SHUFFLE: return SelectHvxShuffle(N);
- case HexagonISD::VROR: return SelectHvxRor(N);
- }
- }
-
SelectCode(N);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
index 505c90f66f43b..50605377e7111 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -97,6 +97,7 @@ class HexagonDAGToDAGISel : public SelectionDAGISel {
void SelectSHL(SDNode *N);
void SelectIntrinsicWChain(SDNode *N);
void SelectIntrinsicWOChain(SDNode *N);
+ void SelectExtractSubvector(SDNode *N);
void SelectConstant(SDNode *N);
void SelectConstantFP(SDNode *N);
void SelectV65Gather(SDNode *N);
@@ -126,6 +127,7 @@ class HexagonDAGToDAGISel : public SelectionDAGISel {
return SDValue(U, 0);
}
+ void SelectHvxExtractSubvector(SDNode *N);
void SelectHvxShuffle(SDNode *N);
void SelectHvxRor(SDNode *N);
void SelectHvxVAlign(SDNode *N);
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 95bbdb61a2e55..0c7419fef9549 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -833,6 +833,7 @@ namespace llvm {
return MVT::getVectorVT(MVT::i1, HwLen);
}
+ void selectExtractSubvector(SDNode *N);
void selectShuffle(SDNode *N);
void selectRor(SDNode *N);
void selectVAlign(SDNode *N);
@@ -2281,6 +2282,22 @@ SDValue HvxSelector::getVectorConstant(ArrayRef<uint8_t> Data,
return DAG.getNode(HexagonISD::ISEL, dl, VecTy, LV);
}
+void HvxSelector::selectExtractSubvector(SDNode *N) {
+ SDValue Inp = N->getOperand(0);
+ MVT ResTy = N->getValueType(0).getSimpleVT();
+ MVT InpTy = Inp.getValueType().getSimpleVT();
+ assert(InpTy.getVectorElementType() == ResTy.getVectorElementType());
+ unsigned ResLen = ResTy.getVectorNumElements();
+ assert(2 * ResLen == InpTy.getVectorNumElements());
+ auto IdxN = cast<ConstantSDNode>(N->getOperand(1));
+ unsigned Idx = IdxN->getZExtValue();
+ assert(Idx == 0 || Idx == ResLen);
+ unsigned SubReg = Idx == 0 ? Hexagon::vsub_lo : Hexagon::vsub_hi;
+ SDValue Ext = DAG.getTargetExtractSubreg(SubReg, SDLoc(N), ResTy, Inp);
+
+ ISel.ReplaceNode(N, Ext.getNode());
+}
+
void HvxSelector::selectShuffle(SDNode *N) {
DEBUG_WITH_TYPE("isel", {
dbgs() << "Starting " << __func__ << " on node:\n";
@@ -2390,6 +2407,10 @@ void HvxSelector::selectVAlign(SDNode *N) {
DAG.RemoveDeadNode(N);
}
+void HexagonDAGToDAGISel::SelectHvxExtractSubvector(SDNode *N) {
+ HvxSelector(*this, *CurDAG).selectExtractSubvector(N);
+}
+
void HexagonDAGToDAGISel::SelectHvxShuffle(SDNode *N) {
HvxSelector(*this, *CurDAG).selectShuffle(N);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 5e6e0238438df..1ce7199efe447 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2661,8 +2661,7 @@ HexagonTargetLowering::extractVector(SDValue VecV, SDValue IdxV,
unsigned Off = IdxN->getZExtValue() * ElemWidth;
if (VecWidth == 64 && ValWidth == 32) {
assert(Off == 0 || Off == 32);
- unsigned SubIdx = Off == 0 ? Hexagon::isub_lo : Hexagon::isub_hi;
- ExtV = DAG.getTargetExtractSubreg(SubIdx, dl, MVT::i32, VecV);
+ ExtV = Off == 0 ? LoHalf(VecV, DAG) : HiHalf(VecV, DAG);
} else if (Off == 0 && (ValWidth % 8) == 0) {
ExtV = DAG.getZeroExtendInReg(VecV, dl, tyScalar(ValTy));
} else {
@@ -2734,7 +2733,7 @@ HexagonTargetLowering::extractVectorPred(SDValue VecV, SDValue IdxV,
while (Scale > 1) {
// The longest possible subvector is at most 32 bits, so it is always
// contained in the low subregister.
- T1 = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, T1);
+ T1 = LoHalf(T1, DAG);
T1 = expandPredicate(T1, dl, DAG);
Scale /= 2;
}
@@ -2994,7 +2993,7 @@ HexagonTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
W = contractPredicate(W, dl, DAG);
W = getCombine(DAG.getUNDEF(MVT::i32), W, dl, MVT::i64, DAG);
}
- W = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, W);
+ W = LoHalf(W, DAG);
Words[IdxW].push_back(W);
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 1387f0c1b355c..afa4f6647844a 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -447,6 +447,29 @@ class HexagonTargetLowering : public TargetLowering {
VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
+ SDValue LoHalf(SDValue V, SelectionDAG &DAG) const {
+ MVT Ty = ty(V);
+ const SDLoc &dl(V);
+ if (!Ty.isVector()) {
+ assert(Ty.getSizeInBits() == 64);
+ return DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, V);
+ }
+ MVT HalfTy = typeSplit(Ty).first;
+ SDValue Idx = getZero(dl, MVT::i32, DAG);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfTy, V, Idx);
+ }
+ SDValue HiHalf(SDValue V, SelectionDAG &DAG) const {
+ MVT Ty = ty(V);
+ const SDLoc &dl(V);
+ if (!Ty.isVector()) {
+ assert(Ty.getSizeInBits() == 64);
+ return DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, V);
+ }
+ MVT HalfTy = typeSplit(Ty).first;
+ SDValue Idx = DAG.getConstant(HalfTy.getVectorNumElements(), dl, MVT::i32);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfTy, V, Idx);
+ }
+
bool allowsHvxMemoryAccess(MVT VecTy, MachineMemOperand::Flags Flags,
unsigned *Fast) const;
bool allowsHvxMisalignedMemoryAccesses(MVT VecTy,
@@ -478,8 +501,9 @@ class HexagonTargetLowering : public TargetLowering {
const SDLoc &dl, SelectionDAG &DAG) const;
SDValue insertHvxElementPred(SDValue VecV, SDValue IdxV, SDValue ValV,
const SDLoc &dl, SelectionDAG &DAG) const;
- SDValue extractHvxSubvectorReg(SDValue VecV, SDValue IdxV, const SDLoc &dl,
- MVT ResTy, SelectionDAG &DAG) const;
+ SDValue extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV, SDValue IdxV,
+ const SDLoc &dl, MVT ResTy, SelectionDAG &DAG)
+ const;
SDValue extractHvxSubvectorPred(SDValue VecV, SDValue IdxV, const SDLoc &dl,
MVT ResTy, SelectionDAG &DAG) const;
SDValue insertHvxSubvectorReg(SDValue VecV, SDValue SubV, SDValue IdxV,
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 2d50b621cdd10..8fd9ab4fb1f28 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -909,9 +909,7 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
SDValue S = DAG.getVectorShuffle(ExtTy, dl, ExtVec,
DAG.getUNDEF(ExtTy), Mask);
- if (ExtLen == VecLen)
- return S;
- return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, S);
+ return ExtLen == VecLen ? S : LoHalf(S, DAG);
}
}
@@ -1033,18 +1031,11 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
SmallVector<SDValue,4> Words[2];
unsigned IdxW = 0;
- auto Lo32 = [&DAG, &dl] (SDValue P) {
- return DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, P);
- };
- auto Hi32 = [&DAG, &dl] (SDValue P) {
- return DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, P);
- };
-
SDValue W0 = isUndef(PredV)
? DAG.getUNDEF(MVT::i64)
: DAG.getNode(HexagonISD::P2D, dl, MVT::i64, PredV);
- Words[IdxW].push_back(Hi32(W0));
- Words[IdxW].push_back(Lo32(W0));
+ Words[IdxW].push_back(HiHalf(W0, DAG));
+ Words[IdxW].push_back(LoHalf(W0, DAG));
while (Bytes < BitBytes) {
IdxW ^= 1;
@@ -1053,8 +1044,8 @@ HexagonTargetLowering::createHvxPrefixPred(SDValue PredV, const SDLoc &dl,
if (Bytes < 4) {
for (const SDValue &W : Words[IdxW ^ 1]) {
SDValue T = expandPredicate(W, dl, DAG);
- Words[IdxW].push_back(Hi32(T));
- Words[IdxW].push_back(Lo32(T));
+ Words[IdxW].push_back(HiHalf(T, DAG));
+ Words[IdxW].push_back(LoHalf(T, DAG));
}
} else {
for (const SDValue &W : Words[IdxW ^ 1]) {
@@ -1255,8 +1246,8 @@ HexagonTargetLowering::insertHvxElementPred(SDValue VecV, SDValue IdxV,
}
SDValue
-HexagonTargetLowering::extractHvxSubvectorReg(SDValue VecV, SDValue IdxV,
- const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
+HexagonTargetLowering::extractHvxSubvectorReg(SDValue OrigOp, SDValue VecV,
+ SDValue IdxV, const SDLoc &dl, MVT ResTy, SelectionDAG &DAG) const {
MVT VecTy = ty(VecV);
unsigned HwLen = Subtarget.getVectorLength();
unsigned Idx = cast<ConstantSDNode>(IdxV.getNode())->getZExtValue();
@@ -1267,16 +1258,11 @@ HexagonTargetLowering::extractHvxSubvectorReg(SDValue VecV, SDValue IdxV,
// the subvector of interest. The subvector will never overlap two single
// vectors.
if (isHvxPairTy(VecTy)) {
- unsigned SubIdx;
- if (Idx * ElemWidth >= 8*HwLen) {
- SubIdx = Hexagon::vsub_hi;
+ if (Idx * ElemWidth >= 8*HwLen)
Idx -= VecTy.getVectorNumElements() / 2;
- } else {
- SubIdx = Hexagon::vsub_lo;
- }
- VecTy = typeSplit(VecTy).first;
- VecV = DAG.getTargetExtractSubreg(SubIdx, dl, VecTy, VecV);
- if (VecTy == ResTy)
+
+ VecV = OrigOp;
+ if (typeSplit(VecTy).first == ResTy)
return VecV;
}
@@ -1380,8 +1366,8 @@ HexagonTargetLowering::insertHvxSubvectorReg(SDValue VecV, SDValue SubV,
SDValue PickHi;
if (IsPair) {
- V0 = DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, SingleTy, VecV);
- V1 = DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, SingleTy, VecV);
+ V0 = LoHalf(VecV, DAG);
+ V1 = HiHalf(VecV, DAG);
SDValue HalfV = DAG.getConstant(SingleTy.getVectorNumElements(),
dl, MVT::i32);
@@ -1427,8 +1413,8 @@ HexagonTargetLowering::insertHvxSubvectorReg(SDValue VecV, SDValue SubV,
SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, V);
} else {
SDValue V = DAG.getBitcast(MVT::i64, SubV);
- SDValue R0 = DAG.getTargetExtractSubreg(Hexagon::isub_lo, dl, MVT::i32, V);
- SDValue R1 = DAG.getTargetExtractSubreg(Hexagon::isub_hi, dl, MVT::i32, V);
+ SDValue R0 = LoHalf(V, DAG);
+ SDValue R1 = HiHalf(V, DAG);
SingleV = DAG.getNode(HexagonISD::VINSERTW0, dl, SingleTy, SingleV, R0);
SingleV = DAG.getNode(HexagonISD::VROR, dl, SingleTy, SingleV,
DAG.getConstant(4, dl, MVT::i32));
@@ -1818,7 +1804,7 @@ HexagonTargetLowering::LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG)
if (ElemTy == MVT::i1)
return extractHvxSubvectorPred(SrcV, IdxV, dl, DstTy, DAG);
- return extractHvxSubvectorReg(SrcV, IdxV, dl, DstTy, DAG);
+ return extractHvxSubvectorReg(Op, SrcV, IdxV, dl, DstTy, DAG);
}
SDValue
@@ -2490,13 +2476,6 @@ HexagonTargetLowering::emitHvxMulHsV60(SDValue A, SDValue B, const SDLoc &dl,
SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
- auto LoVec = [&DAG, VecTy, dl](SDValue Pair) {
- return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair);
- };
- auto HiVec = [&DAG, VecTy, dl](SDValue Pair) {
- return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair);
- };
-
// mulhs(A,B) =
// = [(Hi(A)*2^16 + Lo(A)) *s (Hi(B)*2^16 + Lo(B))] >> 32
// = [Hi(A)*2^16 *s Hi(B)*2^16 + Hi(A) *su Lo(B)*2^16
@@ -2524,7 +2503,7 @@ HexagonTargetLowering::emitHvxMulHsV60(SDValue A, SDValue B, const SDLoc &dl,
// P0 = interleaved T1.h*B.uh (full precision product)
SDValue P0 = getInstr(Hexagon::V6_vmpyhus, dl, PairTy, {T1, B}, DAG);
// T2 = T1.even(h) * B.even(uh), i.e. Hi(A)*Lo(B)
- SDValue T2 = LoVec(P0);
+ SDValue T2 = LoHalf(P0, DAG);
// We need to add T0+T2, recording the carry-out, which will be 1<<16
// added to the final sum.
// P1 = interleaved even/odd 32-bit (unsigned) sums of 16-bit halves
@@ -2534,12 +2513,12 @@ HexagonTargetLowering::emitHvxMulHsV60(SDValue A, SDValue B, const SDLoc &dl,
// T3 = full-precision(T0+T2) >> 16
// The low halves are added-unsigned, the high ones are added-signed.
SDValue T3 = getInstr(Hexagon::V6_vasrw_acc, dl, VecTy,
- {HiVec(P2), LoVec(P1), S16}, DAG);
+ {HiHalf(P2, DAG), LoHalf(P1, DAG), S16}, DAG);
SDValue T4 = getInstr(Hexagon::V6_vasrw, dl, VecTy, {B, S16}, DAG);
// P3 = interleaved Hi(B)*Hi(A) (full precision),
// which is now Lo(T1)*Lo(T4), so we want to keep the even product.
SDValue P3 = getInstr(Hexagon::V6_vmpyhv, dl, PairTy, {T1, T4}, DAG);
- SDValue T5 = LoVec(P3);
+ SDValue T5 = LoHalf(P3, DAG);
// Add:
SDValue T6 = DAG.getNode(ISD::ADD, dl, VecTy, {T3, T5});
return T6;
@@ -2555,13 +2534,6 @@ HexagonTargetLowering::emitHvxMulLoHiV60(SDValue A, bool SignedA, SDValue B,
SDValue S16 = DAG.getConstant(16, dl, MVT::i32);
- auto LoVec = [&DAG, VecTy, dl](SDValue Pair) {
- return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair);
- };
- auto HiVec = [&DAG, VecTy, dl](SDValue Pair) {
- return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair);
- };
-
if (SignedA && !SignedB) {
// Make A:unsigned, B:signed.
std::swap(A, B);
@@ -2588,20 +2560,21 @@ HexagonTargetLowering::emitHvxMulLoHiV60(SDValue A, bool SignedA, SDValue B,
// P2:lo = low halves of P1:lo + P1:hi,
// P2:hi = high halves of P1:lo + P1:hi.
- SDValue P2 =
- getInstr(Hexagon::V6_vadduhw, dl, PairTy, {HiVec(P1), LoVec(P1)}, DAG);
+ SDValue P2 = getInstr(Hexagon::V6_vadduhw, dl, PairTy,
+ {HiHalf(P1, DAG), LoHalf(P1, DAG)}, DAG);
// Still need to add the high halves of P0:lo to P2:lo
- SDValue T2 = getInstr(Hexagon::V6_vlsrw, dl, VecTy, {LoVec(P0), S16}, DAG);
- SDValue T3 = DAG.getNode(ISD::ADD, dl, VecTy, {LoVec(P2), T2});
+ SDValue T2 =
+ getInstr(Hexagon::V6_vlsrw, dl, VecTy, {LoHalf(P0, DAG), S16}, DAG);
+ SDValue T3 = DAG.getNode(ISD::ADD, dl, VecTy, {LoHalf(P2, DAG), T2});
// The high halves of T3 will contribute to the HI part of LOHI.
- SDValue T4 =
- getInstr(Hexagon::V6_vasrw_acc, dl, VecTy, {HiVec(P2), T3, S16}, DAG);
+ SDValue T4 = getInstr(Hexagon::V6_vasrw_acc, dl, VecTy,
+ {HiHalf(P2, DAG), T3, S16}, DAG);
// The low halves of P2 need to be added to high halves of the LO part.
- Lo = getInstr(Hexagon::V6_vaslw_acc, dl, VecTy, {LoVec(P0), LoVec(P2), S16},
- DAG);
- Hi = DAG.getNode(ISD::ADD, dl, VecTy, {HiVec(P0), T4});
+ Lo = getInstr(Hexagon::V6_vaslw_acc, dl, VecTy,
+ {LoHalf(P0, DAG), LoHalf(P2, DAG), S16}, DAG);
+ Hi = DAG.getNode(ISD::ADD, dl, VecTy, {HiHalf(P0, DAG), T4});
if (SignedA) {
assert(SignedB && "Signed A and unsigned B should have been inverted");
@@ -2628,20 +2601,14 @@ HexagonTargetLowering::emitHvxMulLoHiV60(SDValue A, bool SignedA, SDValue B,
}
SDValue
-HexagonTargetLowering::emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B,
- bool SignedB, const SDLoc &dl,
+HexagonTargetLowering::emitHvxMulLoHiV62(SDValue A, bool SignedA,
+ SDValue B, bool SignedB,
+ const SDLoc &dl,
SelectionDAG &DAG) const {
MVT VecTy = ty(A);
MVT PairTy = typeJoin({VecTy, VecTy});
assert(VecTy.getVectorElementType() == MVT::i32);
- auto LoVec = [&DAG, VecTy, dl](SDValue Pair) {
- return DAG.getTargetExtractSubreg(Hexagon::vsub_lo, dl, VecTy, Pair);
- };
- auto HiVec = [&DAG, VecTy, dl](SDValue Pair) {
- return DAG.getTargetExtractSubreg(Hexagon::vsub_hi, dl, VecTy, Pair);
- };
-
if (SignedA && !SignedB) {
// Make A:unsigned, B:signed.
std::swap(A, B);
@@ -2652,8 +2619,8 @@ HexagonTargetLowering::emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B,
SDValue P0 = getInstr(Hexagon::V6_vmpyewuh_64, dl, PairTy, {A, B}, DAG);
SDValue P1 =
getInstr(Hexagon::V6_vmpyowh_64_acc, dl, PairTy, {P0, A, B}, DAG);
- SDValue Lo = LoVec(P1);
- SDValue Hi = HiVec(P1);
+ SDValue Lo = LoHalf(P1, DAG);
+ SDValue Hi = HiHalf(P1, DAG);
if (!SignedB) {
assert(!SignedA && "Signed A and unsigned B should have been inverted");
@@ -2662,7 +2629,7 @@ HexagonTargetLowering::emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B,
// Mulhu(X, Y) = Mulhs(X, Y) + (X, if Y < 0) + (Y, if X < 0).
// def: Pat<(VecI32 (mulhu HVI32:$A, HVI32:$B)),
- // (V6_vaddw (HiVec (Muls64O $A, $B)),
+ // (V6_vaddw (HiHalf (Muls64O $A, $B)),
// (V6_vaddwq (V6_vgtw (V6_vd0), $B),
// (V6_vandvqv (V6_vgtw (V6_vd0), $A), $B),
// $A))>;
@@ -2678,7 +2645,7 @@ HexagonTargetLowering::emitHvxMulLoHiV62(SDValue A, bool SignedA, SDValue B,
// Mulhus(unsigned X, signed Y) = Mulhs(X, Y) + (Y, if X < 0).
// def: Pat<(VecI32 (HexagonMULHUS HVI32:$A, HVI32:$B)),
// (V6_vaddwq (V6_vgtw (V6_vd0), $A),
- // (HiVec (Muls64O $A, $B)),
+ // (HiHalf (Muls64O $A, $B)),
// $B)>;
SDValue Q0 = DAG.getSetCC(dl, PredTy, A, Zero, ISD::SETLT);
Hi = getInstr(Hexagon::V6_vaddwq, dl, VecTy, {Q0, Hi, B}, DAG);
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
index 9ea5d11f89c20..595568bd9e055 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/fp-to-int.ll
@@ -13,13 +13,13 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##32768,#1)
; CHECK-NEXT: r4 = #14
-; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: v1 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r3)
; CHECK-NEXT: r6 = #5
; CHECK-NEXT: v3.h = vasl(v0.h,r2)
-; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vsplat(r4)
@@ -33,55 +33,55 @@ define void @f16s8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #16
-; CHECK-NEXT: v5.h = vasl(v0.h,r6)
-; CHECK-NEXT: q1 = vcmp.gt(v7.h,v1.h)
+; CHECK-NEXT: v5.h = vasl(v1.h,r6)
+; CHECK-NEXT: q1 = vcmp.gt(v7.h,v0.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r3)
-; CHECK-NEXT: v28.h = vasr(v3.h,r5)
+; CHECK-NEXT: v27.h = vasr(v3.h,r5)
; CHECK-NEXT: v5 = vor(v5,v2)
-; CHECK-NEXT: q0 = vcmp.gt(v7.h,v0.h)
+; CHECK-NEXT: q0 = vcmp.gt(v7.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.h = vsplat(r4)
; CHECK-NEXT: v8.h = vasr(v8.h,r5)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v27.h = vasl(v1.h,r6)
-; CHECK-NEXT: v1.h = vsub(v4.h,v28.h)
+; CHECK-NEXT: v26.h = vasl(v0.h,r6)
+; CHECK-NEXT: v0.h = vsub(v4.h,v27.h)
; CHECK-NEXT: v4.h = vsub(v4.h,v8.h)
-; CHECK-NEXT: v29 = vmux(q0,v2,v9)
+; CHECK-NEXT: v28 = vmux(q0,v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1.h = vmin(v1.h,v6.h)
-; CHECK-NEXT: v0 = vor(v27,v2)
; CHECK-NEXT: v4.h = vmin(v4.h,v6.h)
+; CHECK-NEXT: v1 = vor(v26,v2)
+; CHECK-NEXT: v0.h = vmin(v0.h,v6.h)
; CHECK-NEXT: v2 = vmux(q1,v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: q2 = vcmp.gt(v1.h,v7.h)
-; CHECK-NEXT: q3 = vcmp.gt(v4.h,v7.h)
+; CHECK-NEXT: q2 = vcmp.gt(v4.h,v7.h)
+; CHECK-NEXT: q3 = vcmp.gt(v0.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.h = vlsr(v5.h,v1.h)
+; CHECK-NEXT: v5.h = vlsr(v5.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.h = vlsr(v0.h,v4.h)
-; CHECK-NEXT: v30.h = vsub(v7.h,v5.h)
+; CHECK-NEXT: v1.h = vlsr(v1.h,v0.h)
+; CHECK-NEXT: v29.h = vsub(v7.h,v5.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v31.h = vsub(v7.h,v0.h)
-; CHECK-NEXT: v5 = vmux(q0,v30,v5)
+; CHECK-NEXT: v30.h = vsub(v7.h,v1.h)
+; CHECK-NEXT: v5 = vmux(q0,v29,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmux(q1,v31,v0)
-; CHECK-NEXT: v1 = vmux(q2,v5,v29)
+; CHECK-NEXT: v1 = vmux(q1,v30,v1)
+; CHECK-NEXT: v31 = vmux(q2,v5,v28)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmux(q3,v0,v2)
+; CHECK-NEXT: v1 = vmux(q3,v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.b = vpack(v0.h,v1.h):sat
+; CHECK-NEXT: v0.b = vpack(v1.h,v31.h):sat
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
@@ -491,127 +491,127 @@ define void @f32s8_0(ptr %a0, ptr %a1) #0 {
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: {
-; CHECK-NEXT: r3:2 = combine(#1,#8)
; CHECK-NEXT: r4 = ##-2147483648
-; CHECK-NEXT: v6 = vmem(r0+#1)
+; CHECK-NEXT: r3:2 = combine(#1,#8)
+; CHECK-NEXT: v4 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vsplat(r4)
+; CHECK-NEXT: v1 = vsplat(r4)
; CHECK-NEXT: r7 = #30
; CHECK-NEXT: r6 = #24
-; CHECK-NEXT: v4 = vmem(r0+#0)
+; CHECK-NEXT: v2 = vmem(r0+#2)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r7)
; CHECK-NEXT: r5 = #32
-; CHECK-NEXT: v9.w = vasl(v6.w,r3)
-; CHECK-NEXT: v1 = vmem(r0+#3)
+; CHECK-NEXT: v8.w = vasl(v6.w,r3)
+; CHECK-NEXT: v6.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v8.w = vasl(v4.w,r3)
-; CHECK-NEXT: v14 = vxor(v14,v14)
-; CHECK-NEXT: v9.w = vsub(v9.w,v0.w)
-; CHECK-NEXT: v2 = vmem(r0+#2)
+; CHECK-NEXT: v7.w = vasl(v4.w,r3)
+; CHECK-NEXT: v12 = vxor(v12,v12)
+; CHECK-NEXT: v8.w = vsub(v8.w,v1.w)
+; CHECK-NEXT: v0 = vmem(r0+#3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
-; CHECK-NEXT: v11.w = vasl(v2.w,r3)
-; CHECK-NEXT: v8.w = vsub(v8.w,v0.w)
-; CHECK-NEXT: q1 = vcmp.gt(v14.w,v6.w)
+; CHECK-NEXT: v11.w = vasl(v0.w,r3)
+; CHECK-NEXT: v7.w = vsub(v7.w,v1.w)
+; CHECK-NEXT: q0 = vcmp.gt(v12.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v12.w = vasl(v1.w,r3)
-; CHECK-NEXT: q0 = vcmp.gt(v14.w,v4.w)
-; CHECK-NEXT: v11.w = vsub(v11.w,v0.w)
+; CHECK-NEXT: v9.w = vasl(v2.w,r3)
+; CHECK-NEXT: q1 = vcmp.gt(v12.w,v6.w)
+; CHECK-NEXT: v11.w = vsub(v11.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = ##2147483647
; CHECK-NEXT: r7 = #64
-; CHECK-NEXT: v9.w = vasr(v9.w,r6)
+; CHECK-NEXT: v8.w = vasr(v8.w,r6)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v20 = vsplat(r3)
-; CHECK-NEXT: v7.w = vasl(v6.w,r2)
-; CHECK-NEXT: v21.w = vsub(v12.w,v0.w)
-; CHECK-NEXT: v9.w = vsub(v10.w,v9.w)
+; CHECK-NEXT: v23 = vsplat(r3)
+; CHECK-NEXT: v7.w = vasr(v7.w,r6)
+; CHECK-NEXT: v20.w = vsub(v9.w,v1.w)
+; CHECK-NEXT: v8.w = vsub(v10.w,v8.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v8.w = vasr(v8.w,r6)
-; CHECK-NEXT: v27 = vmux(q1,v0,v20)
-; CHECK-NEXT: v25 = vmux(q0,v0,v20)
-; CHECK-NEXT: v9.w = vmin(v9.w,v13.w)
+; CHECK-NEXT: v21.w = vasl(v6.w,r2)
+; CHECK-NEXT: v28 = vmux(q1,v1,v23)
+; CHECK-NEXT: v26 = vmux(q0,v1,v23)
+; CHECK-NEXT: v7.w = vsub(v10.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v4.w,r2)
-; CHECK-NEXT: v7 = vor(v7,v0)
-; CHECK-NEXT: v8.w = vsub(v10.w,v8.w)
-; CHECK-NEXT: q3 = vcmp.gt(v9.w,v14.w)
+; CHECK-NEXT: v8.w = vmin(v8.w,v13.w)
+; CHECK-NEXT: v9 = vor(v21,v1)
+; CHECK-NEXT: v22.w = vmin(v7.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v11.w = vasr(v11.w,r6)
-; CHECK-NEXT: v8.w = vmin(v8.w,v13.w)
-; CHECK-NEXT: v5 = vor(v5,v0)
+; CHECK-NEXT: v4.w = vasr(v20.w,r6)
+; CHECK-NEXT: q3 = vcmp.gt(v8.w,v12.w)
+; CHECK-NEXT: v5 = vor(v5,v1)
+; CHECK-NEXT: q2 = vcmp.gt(v22.w,v12.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v6.w = vasr(v21.w,r6)
-; CHECK-NEXT: v11.w = vsub(v10.w,v11.w)
-; CHECK-NEXT: q2 = vcmp.gt(v8.w,v14.w)
+; CHECK-NEXT: v11.w = vasr(v11.w,r6)
+; CHECK-NEXT: v4.w = vsub(v10.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vasl(v1.w,r2)
-; CHECK-NEXT: v6.w = vsub(v10.w,v6.w)
-; CHECK-NEXT: v23.w = vmin(v11.w,v13.w)
+; CHECK-NEXT: v3.w = vasl(v2.w,r2)
+; CHECK-NEXT: v10.w = vsub(v10.w,v11.w)
+; CHECK-NEXT: v4.w = vmin(v4.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v22.w = vasl(v2.w,r2)
-; CHECK-NEXT: v3 = vor(v3,v0)
-; CHECK-NEXT: v6.w = vmin(v6.w,v13.w)
+; CHECK-NEXT: v24.w = vasl(v0.w,r2)
+; CHECK-NEXT: v3 = vor(v3,v1)
+; CHECK-NEXT: v10.w = vmin(v10.w,v13.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w)
-; CHECK-NEXT: v12 = vor(v22,v0)
+; CHECK-NEXT: v8.w = vlsr(v9.w,v8.w)
+; CHECK-NEXT: v6 = vor(v24,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vlsr(v5.w,v8.w)
-; CHECK-NEXT: v26.w = vsub(v14.w,v7.w)
+; CHECK-NEXT: v5.w = vlsr(v5.w,v22.w)
+; CHECK-NEXT: v27.w = vsub(v12.w,v8.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v28.w = vlsr(v12.w,v23.w)
-; CHECK-NEXT: v24.w = vsub(v14.w,v5.w)
-; CHECK-NEXT: v7 = vmux(q1,v26,v7)
+; CHECK-NEXT: v3.w = vlsr(v3.w,v4.w)
+; CHECK-NEXT: v25.w = vsub(v12.w,v5.w)
+; CHECK-NEXT: v8 = vmux(q1,v27,v8)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vlsr(v3.w,v6.w)
-; CHECK-NEXT: v5 = vmux(q0,v24,v5)
-; CHECK-NEXT: q0 = vcmp.gt(v14.w,v2.w)
-; CHECK-NEXT: v29.w = vsub(v14.w,v28.w)
+; CHECK-NEXT: v6.w = vlsr(v6.w,v10.w)
+; CHECK-NEXT: v5 = vmux(q0,v25,v5)
+; CHECK-NEXT: q0 = vcmp.gt(v12.w,v2.w)
+; CHECK-NEXT: v29.w = vsub(v12.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2 = vmux(q3,v7,v27)
-; CHECK-NEXT: q3 = vcmp.gt(v14.w,v1.w)
-; CHECK-NEXT: v31.w = vsub(v14.w,v3.w)
-; CHECK-NEXT: v5 = vmux(q2,v5,v25)
+; CHECK-NEXT: v2 = vmux(q3,v8,v28)
+; CHECK-NEXT: q3 = vcmp.gt(v12.w,v0.w)
+; CHECK-NEXT: v30.w = vsub(v12.w,v6.w)
+; CHECK-NEXT: v5 = vmux(q2,v5,v26)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1 = vmux(q0,v0,v20)
-; CHECK-NEXT: v30 = vmux(q0,v29,v28)
-; CHECK-NEXT: q2 = vcmp.gt(v23.w,v14.w)
-; CHECK-NEXT: v3 = vmux(q3,v31,v3)
+; CHECK-NEXT: v0 = vmux(q0,v1,v23)
+; CHECK-NEXT: v3 = vmux(q0,v29,v3)
+; CHECK-NEXT: q2 = vcmp.gt(v4.w,v12.w)
+; CHECK-NEXT: v31 = vmux(q3,v30,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vpack(v2.w,v5.w):sat
-; CHECK-NEXT: v0 = vmux(q3,v0,v20)
-; CHECK-NEXT: q3 = vcmp.gt(v6.w,v14.w)
-; CHECK-NEXT: v1 = vmux(q2,v30,v1)
+; CHECK-NEXT: v1 = vmux(q3,v1,v23)
+; CHECK-NEXT: q3 = vcmp.gt(v10.w,v12.w)
+; CHECK-NEXT: v0 = vmux(q2,v3,v0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmux(q3,v3,v0)
+; CHECK-NEXT: v1 = vmux(q3,v31,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.h = vpack(v1.w,v0.w):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: v0.h = vpack(v1.w,v0.w):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.b = vpack(v3.h,v2.h):sat
@@ -638,13 +638,13 @@ define void @f32s8_1(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##-2147483648,#8)
; CHECK-NEXT: r4 = #1
-; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: v1 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3 = vsplat(r3)
; CHECK-NEXT: r5 = #30
; CHECK-NEXT: v4.w = vasl(v0.w,r4)
-; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v5.w = vasl(v1.w,r4)
@@ -653,64 +653,64 @@ define void @f32s8_1(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: r4 = #32
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v7 = vsplat(r5)
-; CHECK-NEXT: v8 = vsplat(r4)
+; CHECK-NEXT: v6 = vsplat(r5)
+; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v2.w = vasl(v1.w,r2)
; CHECK-NEXT: v5.w = vsub(v5.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vasr(v4.w,r6)
-; CHECK-NEXT: v27 = vxor(v27,v27)
+; CHECK-NEXT: v26 = vxor(v26,v26)
; CHECK-NEXT: v2 = vor(v2,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = ##2147483647
; CHECK-NEXT: v5.w = vasr(v5.w,r6)
-; CHECK-NEXT: q0 = vcmp.gt(v27.w,v0.w)
+; CHECK-NEXT: q0 = vcmp.gt(v26.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v28 = vsplat(r3)
-; CHECK-NEXT: v6.w = vasl(v0.w,r2)
-; CHECK-NEXT: v4.w = vsub(v7.w,v4.w)
-; CHECK-NEXT: q2 = vcmp.gt(v27.w,v1.w)
+; CHECK-NEXT: v27 = vsplat(r3)
+; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
+; CHECK-NEXT: q2 = vcmp.gt(v26.w,v0.w)
+; CHECK-NEXT: v5.w = vsub(v6.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vsub(v7.w,v5.w)
-; CHECK-NEXT: v4.w = vmin(v4.w,v8.w)
-; CHECK-NEXT: v31 = vmux(q0,v3,v28)
-; CHECK-NEXT: v6 = vor(v6,v3)
+; CHECK-NEXT: v8.w = vasl(v0.w,r2)
+; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
+; CHECK-NEXT: v30 = vmux(q0,v3,v27)
+; CHECK-NEXT: v5.w = vmin(v5.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vmin(v5.w,v8.w)
-; CHECK-NEXT: q1 = vcmp.gt(v4.w,v27.w)
-; CHECK-NEXT: v0 = vmux(q2,v3,v28)
+; CHECK-NEXT: v25 = vor(v8,v3)
+; CHECK-NEXT: v1 = vmux(q2,v3,v27)
+; CHECK-NEXT: q3 = vcmp.gt(v4.w,v26.w)
+; CHECK-NEXT: q1 = vcmp.gt(v5.w,v26.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
-; CHECK-NEXT: v6.w = vlsr(v6.w,v4.w)
-; CHECK-NEXT: q3 = vcmp.gt(v5.w,v27.w)
+; CHECK-NEXT: v2.w = vlsr(v2.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.w = vlsr(v2.w,v5.w)
-; CHECK-NEXT: v29.w = vsub(v27.w,v6.w)
+; CHECK-NEXT: v28.w = vlsr(v25.w,v4.w)
+; CHECK-NEXT: v29.w = vsub(v26.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v30.w = vsub(v27.w,v2.w)
-; CHECK-NEXT: v1 = vmux(q0,v29,v6)
+; CHECK-NEXT: v6.w = vsub(v26.w,v28.w)
+; CHECK-NEXT: v0 = vmux(q0,v29,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2 = vmux(q2,v30,v2)
-; CHECK-NEXT: v1 = vmux(q1,v1,v31)
+; CHECK-NEXT: v31 = vmux(q2,v6,v28)
+; CHECK-NEXT: v0 = vmux(q1,v0,v30)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: q3 = vsetq(r2)
-; CHECK-NEXT: v0 = vmux(q3,v2,v0)
+; CHECK-NEXT: v1 = vmux(q3,v31,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vpack(v1.w,v0.w):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: v0.h = vpack(v1.w,v0.w):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0.b = vpack(v2.h,v0.h):sat
@@ -808,13 +808,13 @@ define void @f32s16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
; CHECK-NEXT: r4 = #30
-; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: v1 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r3)
; CHECK-NEXT: r6 = #8
; CHECK-NEXT: v3.w = vasl(v0.w,r2)
-; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r4)
@@ -828,55 +828,55 @@ define void @f32s16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #32
-; CHECK-NEXT: v5.w = vasl(v0.w,r6)
-; CHECK-NEXT: q1 = vcmp.gt(v7.w,v1.w)
+; CHECK-NEXT: v5.w = vasl(v1.w,r6)
+; CHECK-NEXT: q1 = vcmp.gt(v7.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r3)
-; CHECK-NEXT: v28.w = vasr(v3.w,r5)
+; CHECK-NEXT: v27.w = vasr(v3.w,r5)
; CHECK-NEXT: v5 = vor(v5,v2)
-; CHECK-NEXT: q0 = vcmp.gt(v7.w,v0.w)
+; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vsplat(r4)
; CHECK-NEXT: v8.w = vasr(v8.w,r5)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v27.w = vasl(v1.w,r6)
-; CHECK-NEXT: v1.w = vsub(v4.w,v28.w)
+; CHECK-NEXT: v26.w = vasl(v0.w,r6)
+; CHECK-NEXT: v0.w = vsub(v4.w,v27.w)
; CHECK-NEXT: v4.w = vsub(v4.w,v8.w)
-; CHECK-NEXT: v29 = vmux(q0,v2,v9)
+; CHECK-NEXT: v28 = vmux(q0,v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1.w = vmin(v1.w,v6.w)
-; CHECK-NEXT: v0 = vor(v27,v2)
; CHECK-NEXT: v4.w = vmin(v4.w,v6.w)
+; CHECK-NEXT: v1 = vor(v26,v2)
+; CHECK-NEXT: v0.w = vmin(v0.w,v6.w)
; CHECK-NEXT: v2 = vmux(q1,v2,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: q2 = vcmp.gt(v1.w,v7.w)
-; CHECK-NEXT: q3 = vcmp.gt(v4.w,v7.w)
+; CHECK-NEXT: q2 = vcmp.gt(v4.w,v7.w)
+; CHECK-NEXT: q3 = vcmp.gt(v0.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vlsr(v5.w,v1.w)
+; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.w = vlsr(v0.w,v4.w)
-; CHECK-NEXT: v30.w = vsub(v7.w,v5.w)
+; CHECK-NEXT: v1.w = vlsr(v1.w,v0.w)
+; CHECK-NEXT: v29.w = vsub(v7.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v31.w = vsub(v7.w,v0.w)
-; CHECK-NEXT: v5 = vmux(q0,v30,v5)
+; CHECK-NEXT: v30.w = vsub(v7.w,v1.w)
+; CHECK-NEXT: v5 = vmux(q0,v29,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmux(q1,v31,v0)
-; CHECK-NEXT: v1 = vmux(q2,v5,v29)
+; CHECK-NEXT: v1 = vmux(q1,v30,v1)
+; CHECK-NEXT: v31 = vmux(q2,v5,v28)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmux(q3,v0,v2)
+; CHECK-NEXT: v1 = vmux(q3,v1,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.h = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: v0.h = vpack(v1.w,v31.w):sat
; CHECK-NEXT: jumpr r31
; CHECK-NEXT: vmem(r1+#0) = v0.new
; CHECK-NEXT: }
@@ -1097,13 +1097,13 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##32768,#1)
; CHECK-NEXT: r4 = #14
-; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: v0 = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2.h = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(#11,#16)
; CHECK-NEXT: v3.h = vasl(v0.h,r2)
-; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: v1 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.h = vsplat(r4)
@@ -1113,7 +1113,7 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.h = vsplat(r6)
-; CHECK-NEXT: v5.h = vasl(v0.h,r5)
+; CHECK-NEXT: v5.h = vasl(v1.h,r5)
; CHECK-NEXT: v4.h = vsub(v4.h,v2.h)
; CHECK-NEXT: v28 = vxor(v28,v28)
; CHECK-NEXT: }
@@ -1125,26 +1125,28 @@ define void @f16u8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: v29.h = vsplat(r2)
; CHECK-NEXT: v4.h = vasr(v4.h,r7)
-; CHECK-NEXT: q2 = vcmp.gt(v28.h,v0.h)
+; CHECK-NEXT: q2 = vcmp.gt(v28.h,v1.h)
; CHECK-NEXT: v3.h = vsub(v6.h,v3.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v8.h = vasl(v1.h,r5)
-; CHECK-NEXT: q3 = vcmp.gt(v28.h,v1.h)
+; CHECK-NEXT: v8.h = vasl(v0.h,r5)
+; CHECK-NEXT: q3 = vcmp.gt(v28.h,v0.h)
; CHECK-NEXT: v4.h = vsub(v6.h,v4.h)
; CHECK-NEXT: v3.h = vmin(v3.h,v7.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.h = vmin(v4.h,v7.h)
; CHECK-NEXT: v2 = vor(v8,v2)
-; CHECK-NEXT: q0 = vcmp.gt(v28.h,v3.h)
+; CHECK-NEXT: q1 = vcmp.gt(v28.h,v3.h)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: q0 = vcmp.gt(v28.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.h = vlsr(v5.h,v3.h)
-; CHECK-NEXT: q1 = vcmp.gt(v28.h,v4.h)
+; CHECK-NEXT: v5.h = vlsr(v5.h,v4.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.h = vlsr(v2.h,v4.h)
+; CHECK-NEXT: v2.h = vlsr(v2.h,v3.h)
; CHECK-NEXT: v30 = vmux(q0,v29,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
@@ -1550,7 +1552,7 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: v5 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v4 = vsplat(r3)
+; CHECK-NEXT: v3 = vsplat(r3)
; CHECK-NEXT: r5 = #30
; CHECK-NEXT: r6 = #24
; CHECK-NEXT: v2 = vmem(r0+#1)
@@ -1559,29 +1561,29 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: v14 = vsplat(r5)
; CHECK-NEXT: v8.w = vasl(v5.w,r4)
; CHECK-NEXT: v13 = vxor(v13,v13)
-; CHECK-NEXT: v0 = vmem(r0+#2)
+; CHECK-NEXT: v0 = vmem(r0+#3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r7 = #64
; CHECK-NEXT: v9.w = vasl(v2.w,r4)
-; CHECK-NEXT: v8.w = vsub(v8.w,v4.w)
-; CHECK-NEXT: v1 = vmem(r0+#3)
+; CHECK-NEXT: v8.w = vsub(v8.w,v3.w)
+; CHECK-NEXT: v1 = vmem(r0+#2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v11.w = vasl(v0.w,r4)
+; CHECK-NEXT: v12.w = vasl(v0.w,r4)
; CHECK-NEXT: q0 = vcmp.gt(v13.w,v5.w)
-; CHECK-NEXT: v9.w = vsub(v9.w,v4.w)
+; CHECK-NEXT: v9.w = vsub(v9.w,v3.w)
; CHECK-NEXT: q3 = vcmp.gt(v13.w,v2.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #32
-; CHECK-NEXT: v12.w = vasl(v1.w,r4)
-; CHECK-NEXT: v11.w = vsub(v11.w,v4.w)
+; CHECK-NEXT: v11.w = vasl(v1.w,r4)
+; CHECK-NEXT: v12.w = vsub(v12.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v24 = vsplat(r4)
+; CHECK-NEXT: v23 = vsplat(r4)
; CHECK-NEXT: v8.w = vasr(v8.w,r6)
-; CHECK-NEXT: v12.w = vsub(v12.w,v4.w)
+; CHECK-NEXT: v11.w = vsub(v11.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.w = vasr(v9.w,r6)
@@ -1590,70 +1592,70 @@ define void @f32u8_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: v6.w = vasl(v5.w,r2)
; CHECK-NEXT: v9.w = vsub(v14.w,v9.w)
-; CHECK-NEXT: v8.w = vmin(v8.w,v24.w)
+; CHECK-NEXT: v8.w = vmin(v8.w,v23.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vasl(v2.w,r2)
-; CHECK-NEXT: v6 = vor(v6,v4)
-; CHECK-NEXT: v9.w = vmin(v9.w,v24.w)
+; CHECK-NEXT: v6 = vor(v6,v3)
+; CHECK-NEXT: v9.w = vmin(v9.w,v23.w)
; CHECK-NEXT: q1 = vcmp.gt(v13.w,v8.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v23.w = vasr(v11.w,r6)
-; CHECK-NEXT: v7 = vor(v7,v4)
+; CHECK-NEXT: v22.w = vasr(v11.w,r6)
+; CHECK-NEXT: v7 = vor(v7,v3)
; CHECK-NEXT: q2 = vcmp.gt(v13.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.w = vasr(v12.w,r6)
-; CHECK-NEXT: v5.w = vsub(v14.w,v23.w)
+; CHECK-NEXT: v5.w = vsub(v14.w,v22.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vasl(v1.w,r2)
-; CHECK-NEXT: v25.w = vsub(v14.w,v12.w)
-; CHECK-NEXT: v5.w = vmin(v5.w,v24.w)
+; CHECK-NEXT: v4.w = vasl(v1.w,r2)
+; CHECK-NEXT: v24.w = vsub(v14.w,v12.w)
+; CHECK-NEXT: v5.w = vmin(v5.w,v23.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = ##2147483647
; CHECK-NEXT: v10.w = vasl(v0.w,r2)
-; CHECK-NEXT: v3 = vor(v3,v4)
+; CHECK-NEXT: v4 = vor(v4,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v26 = vsplat(r2)
+; CHECK-NEXT: v25 = vsplat(r2)
; CHECK-NEXT: v6.w = vlsr(v6.w,v8.w)
-; CHECK-NEXT: v10 = vor(v10,v4)
-; CHECK-NEXT: v4.w = vmin(v25.w,v24.w)
+; CHECK-NEXT: v3 = vor(v10,v3)
+; CHECK-NEXT: v10.w = vmin(v24.w,v23.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7.w = vlsr(v7.w,v9.w)
-; CHECK-NEXT: v6 = vmux(q1,v26,v6)
+; CHECK-NEXT: v27 = vmux(q1,v25,v6)
; CHECK-NEXT: q1 = vcmp.gt(v13.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v27.w = vlsr(v10.w,v5.w)
-; CHECK-NEXT: v7 = vmux(q2,v26,v7)
-; CHECK-NEXT: q2 = vcmp.gt(v13.w,v4.w)
-; CHECK-NEXT: v28 = vmux(q0,v13,v6)
+; CHECK-NEXT: v26.w = vlsr(v4.w,v5.w)
+; CHECK-NEXT: v28 = vmux(q2,v25,v7)
+; CHECK-NEXT: q2 = vcmp.gt(v13.w,v10.w)
+; CHECK-NEXT: v4 = vmux(q0,v13,v27)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vlsr(v3.w,v4.w)
-; CHECK-NEXT: v29 = vmux(q3,v13,v7)
-; CHECK-NEXT: v2 = vmux(q1,v26,v27)
-; CHECK-NEXT: q1 = vcmp.gt(v13.w,v0.w)
+; CHECK-NEXT: v3.w = vlsr(v3.w,v10.w)
+; CHECK-NEXT: v29 = vmux(q3,v13,v28)
+; CHECK-NEXT: v2 = vmux(q1,v25,v26)
+; CHECK-NEXT: q1 = vcmp.gt(v13.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: q3 = vcmp.gt(v13.w,v1.w)
-; CHECK-NEXT: v0 = vmux(q2,v26,v3)
-; CHECK-NEXT: v1 = vmux(q1,v13,v2)
+; CHECK-NEXT: q3 = vcmp.gt(v13.w,v0.w)
+; CHECK-NEXT: v1 = vmux(q2,v25,v3)
+; CHECK-NEXT: v0 = vmux(q1,v13,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v30.uh = vpack(v29.w,v28.w):sat
-; CHECK-NEXT: v0 = vmux(q3,v13,v0)
+; CHECK-NEXT: v30.uh = vpack(v29.w,v4.w):sat
+; CHECK-NEXT: v1 = vmux(q3,v13,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v31.uh = vpack(v1.w,v0.w):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.uh = vpack(v0.w,v1.w):sat
+; CHECK-NEXT: v0.uh = vpack(v1.w,v0.w):sat
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v1.ub = vpack(v31.h,v30.h):sat
@@ -1680,13 +1682,13 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
; CHECK-NEXT: r4 = #30
-; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: v0 = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(#24,#32)
; CHECK-NEXT: v3.w = vasl(v0.w,r2)
-; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: v1 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r4)
@@ -1696,7 +1698,7 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r6)
-; CHECK-NEXT: v5.w = vasl(v0.w,r5)
+; CHECK-NEXT: v5.w = vasl(v1.w,r5)
; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
; CHECK-NEXT: v27 = vxor(v27,v27)
; CHECK-NEXT: }
@@ -1708,13 +1710,13 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: v28 = vsplat(r3)
; CHECK-NEXT: v4.w = vasr(v4.w,r7)
-; CHECK-NEXT: q2 = vcmp.gt(v27.w,v0.w)
+; CHECK-NEXT: q2 = vcmp.gt(v27.w,v1.w)
; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = #64
-; CHECK-NEXT: v8.w = vasl(v1.w,r5)
-; CHECK-NEXT: q3 = vcmp.gt(v27.w,v1.w)
+; CHECK-NEXT: v8.w = vasl(v0.w,r5)
+; CHECK-NEXT: q3 = vcmp.gt(v27.w,v0.w)
; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
@@ -1723,14 +1725,14 @@ define void @f32u8_1(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: v2 = vor(v8,v2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: q0 = vcmp.gt(v27.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v27.w,v3.w)
+; CHECK-NEXT: q0 = vcmp.gt(v27.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w)
-; CHECK-NEXT: q1 = vcmp.gt(v27.w,v4.w)
+; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w)
+; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
; CHECK-NEXT: v29 = vmux(q0,v28,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
@@ -1839,13 +1841,13 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(##-2147483648,#1)
; CHECK-NEXT: r4 = #30
-; CHECK-NEXT: v0 = vmem(r0+#0)
+; CHECK-NEXT: v0 = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v2 = vsplat(r3)
; CHECK-NEXT: r7:6 = combine(#24,#32)
; CHECK-NEXT: v3.w = vasl(v0.w,r2)
-; CHECK-NEXT: v1 = vmem(r0+#1)
+; CHECK-NEXT: v1 = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6 = vsplat(r4)
@@ -1855,7 +1857,7 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r6)
-; CHECK-NEXT: v5.w = vasl(v0.w,r5)
+; CHECK-NEXT: v5.w = vasl(v1.w,r5)
; CHECK-NEXT: v4.w = vsub(v4.w,v2.w)
; CHECK-NEXT: v28 = vxor(v28,v28)
; CHECK-NEXT: }
@@ -1867,26 +1869,28 @@ define void @f32u16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: v29 = vsplat(r2)
; CHECK-NEXT: v4.w = vasr(v4.w,r7)
-; CHECK-NEXT: q2 = vcmp.gt(v28.w,v0.w)
+; CHECK-NEXT: q2 = vcmp.gt(v28.w,v1.w)
; CHECK-NEXT: v3.w = vsub(v6.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v8.w = vasl(v1.w,r5)
-; CHECK-NEXT: q3 = vcmp.gt(v28.w,v1.w)
+; CHECK-NEXT: v8.w = vasl(v0.w,r5)
+; CHECK-NEXT: q3 = vcmp.gt(v28.w,v0.w)
; CHECK-NEXT: v4.w = vsub(v6.w,v4.w)
; CHECK-NEXT: v3.w = vmin(v3.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4.w = vmin(v4.w,v7.w)
; CHECK-NEXT: v2 = vor(v8,v2)
-; CHECK-NEXT: q0 = vcmp.gt(v28.w,v3.w)
+; CHECK-NEXT: q1 = vcmp.gt(v28.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vlsr(v5.w,v3.w)
-; CHECK-NEXT: q1 = vcmp.gt(v28.w,v4.w)
+; CHECK-NEXT: q0 = vcmp.gt(v28.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.w = vlsr(v2.w,v4.w)
+; CHECK-NEXT: v5.w = vlsr(v5.w,v4.w)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v2.w = vlsr(v2.w,v3.w)
; CHECK-NEXT: v30 = vmux(q0,v29,v5)
; CHECK-NEXT: }
; CHECK-NEXT: {
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
index 4dcb8cd55b5b7..3f754aa55a14c 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/int-to-fp.ll
@@ -1041,111 +1041,111 @@ define void @s32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: r3:2 = combine(#8,#1)
; CHECK-NEXT: r6 = #255
-; CHECK-NEXT: v2.w = vabs(v1.w)
+; CHECK-NEXT: v3.w = vabs(v1.w)
; CHECK-NEXT: v1.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r4 = #512
-; CHECK-NEXT: v3.w = vabs(v0.w)
+; CHECK-NEXT: v2.w = vabs(v0.w)
; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9 = vsplat(r4)
; CHECK-NEXT: v8 = vsplat(r6)
-; CHECK-NEXT: v5.uw = vcl0(v2.uw)
+; CHECK-NEXT: v6.uw = vcl0(v3.uw)
; CHECK-NEXT: v7 = vxor(v7,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
-; CHECK-NEXT: v6.uw = vcl0(v3.uw)
-; CHECK-NEXT: v5.w = vadd(v5.w,v4.w)
+; CHECK-NEXT: v5.uw = vcl0(v2.uw)
+; CHECK-NEXT: v6.w = vadd(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v28 = vsplat(r4)
+; CHECK-NEXT: v27 = vsplat(r4)
; CHECK-NEXT: r5 = ##-2147483648
-; CHECK-NEXT: v6.w = vadd(v6.w,v4.w)
+; CHECK-NEXT: v5.w = vadd(v5.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v13 = vsplat(r5)
-; CHECK-NEXT: v2.w = vasl(v2.w,v5.w)
+; CHECK-NEXT: v3.w = vasl(v3.w,v6.w)
; CHECK-NEXT: q0 = vcmp.gt(v7.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vasl(v3.w,v6.w)
-; CHECK-NEXT: v27 = vmux(q0,v13,v7)
-; CHECK-NEXT: v10.w = vadd(v2.w,v8.w)
-; CHECK-NEXT: v11 = vand(v2,v9)
+; CHECK-NEXT: v2.w = vasl(v2.w,v5.w)
+; CHECK-NEXT: v26 = vmux(q0,v13,v7)
+; CHECK-NEXT: v10.w = vadd(v3.w,v8.w)
+; CHECK-NEXT: v11 = vand(v3,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v9 = vand(v3,v9)
+; CHECK-NEXT: v9 = vand(v2,v9)
; CHECK-NEXT: q1 = vcmp.eq(v11.w,v7.w)
-; CHECK-NEXT: v8.w = vadd(v3.w,v8.w)
-; CHECK-NEXT: q2 = vcmp.gt(v2.uw,v10.uw)
+; CHECK-NEXT: v8.w = vadd(v2.w,v8.w)
+; CHECK-NEXT: q2 = vcmp.gt(v3.uw,v10.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v12.uw = vlsr(v2.uw,r3)
+; CHECK-NEXT: v12.uw = vlsr(v3.uw,r3)
; CHECK-NEXT: q3 = vcmp.eq(v9.w,v7.w)
-; CHECK-NEXT: v23 = vmux(q1,v7,v4)
-; CHECK-NEXT: q1 = vcmp.gt(v3.uw,v8.uw)
+; CHECK-NEXT: v22 = vmux(q1,v7,v4)
+; CHECK-NEXT: q1 = vcmp.gt(v2.uw,v8.uw)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.uw = vlsr(v10.uw,r3)
-; CHECK-NEXT: v25 = vmux(q3,v7,v4)
-; CHECK-NEXT: v24 = vmux(q2,v4,v7)
+; CHECK-NEXT: v3.uw = vlsr(v10.uw,r3)
+; CHECK-NEXT: v24 = vmux(q3,v7,v4)
+; CHECK-NEXT: v23 = vmux(q2,v4,v7)
; CHECK-NEXT: v4 = vmux(q1,v4,v7)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v8.uw = vlsr(v8.uw,r3)
-; CHECK-NEXT: v9.w = vadd(v2.w,v23.w)
-; CHECK-NEXT: v5.w = vsub(v24.w,v5.w)
-; CHECK-NEXT: v4.w = vsub(v4.w,v6.w)
+; CHECK-NEXT: v9.w = vadd(v3.w,v22.w)
+; CHECK-NEXT: v6.w = vsub(v23.w,v6.w)
+; CHECK-NEXT: v4.w = vsub(v4.w,v5.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.uw = vlsr(v3.uw,r3)
-; CHECK-NEXT: v26.w = vadd(v8.w,v25.w)
-; CHECK-NEXT: q3 = vcmp.eq(v12.w,v2.w)
-; CHECK-NEXT: v5.w = vadd(v5.w,v28.w)
+; CHECK-NEXT: v2.uw = vlsr(v2.uw,r3)
+; CHECK-NEXT: v25.w = vadd(v8.w,v24.w)
+; CHECK-NEXT: q3 = vcmp.eq(v12.w,v3.w)
+; CHECK-NEXT: v6.w = vadd(v6.w,v27.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
-; CHECK-NEXT: v2.uw = vlsr(v2.uw,r2)
-; CHECK-NEXT: q2 = vcmp.eq(v3.w,v8.w)
-; CHECK-NEXT: v4.w = vadd(v4.w,v28.w)
+; CHECK-NEXT: v3.uw = vlsr(v3.uw,r2)
+; CHECK-NEXT: q2 = vcmp.eq(v2.w,v8.w)
+; CHECK-NEXT: v4.w = vadd(v4.w,v27.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v9.uw = vlsr(v9.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v29.uw = vlsr(v26.uw,r2)
-; CHECK-NEXT: v2 = vmux(q3,v9,v2)
+; CHECK-NEXT: v28.uw = vlsr(v25.uw,r2)
+; CHECK-NEXT: v3 = vmux(q3,v9,v3)
; CHECK-NEXT: q3 = vcmp.gt(v7.w,v0.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.uw = vlsr(v8.uw,r2)
+; CHECK-NEXT: v2.uw = vlsr(v8.uw,r2)
; CHECK-NEXT: v30 = vmux(q3,v13,v7)
-; CHECK-NEXT: v2 = vor(v27,v2)
+; CHECK-NEXT: v3 = vor(v26,v3)
; CHECK-NEXT: q3 = vcmp.eq(v0.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.w = vasl(v5.w,r3)
-; CHECK-NEXT: v3 = vmux(q2,v29,v3)
+; CHECK-NEXT: v29.w = vasl(v6.w,r3)
+; CHECK-NEXT: v2 = vmux(q2,v28,v2)
; CHECK-NEXT: q2 = vcmp.eq(v1.w,v7.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vasl(v4.w,r3)
-; CHECK-NEXT: v31 = vor(v30,v3)
-; CHECK-NEXT: v2 = vor(v2,v5)
+; CHECK-NEXT: v2.w = vasl(v4.w,r3)
+; CHECK-NEXT: v31 = vor(v30,v2)
+; CHECK-NEXT: v3 = vor(v3,v29)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1 = vor(v31,v3)
-; CHECK-NEXT: v2 = vmux(q2,v7,v2)
+; CHECK-NEXT: v1 = vor(v31,v2)
+; CHECK-NEXT: v3 = vmux(q2,v7,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v0 = vmux(q3,v7,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.qf32 = vadd(v2.sf,v7.sf)
+; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v7.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v7.sf)
@@ -2372,19 +2372,19 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: r3:2 = combine(#8,#1)
; CHECK-NEXT: r6 = #255
; CHECK-NEXT: v1.uw = vcl0(v0.uw)
-; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: v0.cur = vmem(r0+#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v4 = vsplat(r2)
; CHECK-NEXT: r4 = #512
; CHECK-NEXT: v3.uw = vcl0(v2.uw)
-; CHECK-NEXT: v2.cur = vmem(r0+#1)
+; CHECK-NEXT: v2.cur = vmem(r0+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v7 = vsplat(r4)
; CHECK-NEXT: v6 = vsplat(r6)
-; CHECK-NEXT: v1.w = vadd(v1.w,v4.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v4.w)
+; CHECK-NEXT: v1.w = vadd(v1.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r4 = #159
@@ -2392,10 +2392,10 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v10 = vsplat(r4)
-; CHECK-NEXT: v5.w = vasl(v0.w,v1.w)
+; CHECK-NEXT: v5.w = vasl(v2.w,v3.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v8.w = vasl(v2.w,v3.w)
+; CHECK-NEXT: v8.w = vasl(v0.w,v1.w)
; CHECK-NEXT: v11.w = vadd(v5.w,v6.w)
; CHECK-NEXT: v13 = vand(v5,v7)
; CHECK-NEXT: }
@@ -2406,36 +2406,36 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: q1 = vcmp.eq(v13.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v27.uw = vlsr(v11.uw,r3)
+; CHECK-NEXT: v28.uw = vlsr(v11.uw,r3)
; CHECK-NEXT: q3 = vcmp.gt(v8.uw,v6.uw)
; CHECK-NEXT: q2 = vcmp.eq(v7.w,v9.w)
-; CHECK-NEXT: v29 = vmux(q0,v4,v9)
+; CHECK-NEXT: v30 = vmux(q0,v4,v9)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r3)
-; CHECK-NEXT: v28 = vmux(q1,v9,v4)
-; CHECK-NEXT: v30 = vmux(q3,v4,v9)
+; CHECK-NEXT: v29 = vmux(q1,v9,v4)
+; CHECK-NEXT: v31 = vmux(q3,v4,v9)
; CHECK-NEXT: v4 = vmux(q2,v9,v4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1.w = vsub(v29.w,v1.w)
-; CHECK-NEXT: v7.w = vadd(v27.w,v28.w)
; CHECK-NEXT: v3.w = vsub(v30.w,v3.w)
+; CHECK-NEXT: v7.w = vadd(v28.w,v29.w)
+; CHECK-NEXT: v1.w = vsub(v31.w,v1.w)
; CHECK-NEXT: v4.w = vadd(v6.w,v4.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v12.uw = vlsr(v5.uw,r3)
-; CHECK-NEXT: v1.w = vadd(v1.w,v10.w)
; CHECK-NEXT: v3.w = vadd(v3.w,v10.w)
-; CHECK-NEXT: q2 = vcmp.eq(v0.w,v9.w)
+; CHECK-NEXT: v1.w = vadd(v1.w,v10.w)
+; CHECK-NEXT: q2 = vcmp.eq(v2.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #23
; CHECK-NEXT: v14.uw = vlsr(v8.uw,r3)
-; CHECK-NEXT: q3 = vcmp.eq(v12.w,v27.w)
+; CHECK-NEXT: q3 = vcmp.eq(v12.w,v28.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v5.uw = vlsr(v27.uw,r2)
+; CHECK-NEXT: v5.uw = vlsr(v28.uw,r2)
; CHECK-NEXT: q1 = vcmp.eq(v14.w,v6.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
@@ -2444,28 +2444,28 @@ define void @u32f16_0(ptr %a0, ptr %a1) #0 {
; CHECK-NEXT: {
; CHECK-NEXT: v4.uw = vlsr(v4.uw,r2)
; CHECK-NEXT: v5 = vmux(q3,v7,v5)
-; CHECK-NEXT: q3 = vcmp.eq(v2.w,v9.w)
+; CHECK-NEXT: q3 = vcmp.eq(v0.w,v9.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v6.uw = vlsr(v6.uw,r2)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1.w = vasl(v1.w,r3)
-; CHECK-NEXT: v31 = vmux(q1,v4,v6)
+; CHECK-NEXT: v3.w = vasl(v3.w,r3)
+; CHECK-NEXT: v2 = vmux(q1,v4,v6)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v3.w = vasl(v3.w,r3)
-; CHECK-NEXT: v1 = vor(v5,v1)
+; CHECK-NEXT: v1.w = vasl(v1.w,r3)
+; CHECK-NEXT: v3 = vor(v5,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vor(v31,v3)
-; CHECK-NEXT: v1 = vmux(q2,v9,v1)
+; CHECK-NEXT: v1 = vor(v2,v1)
+; CHECK-NEXT: v3 = vmux(q2,v9,v3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0 = vmux(q3,v9,v0)
+; CHECK-NEXT: v0 = vmux(q3,v9,v1)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v2.qf32 = vadd(v1.sf,v9.sf)
+; CHECK-NEXT: v2.qf32 = vadd(v3.sf,v9.sf)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: v3.qf32 = vadd(v0.sf,v9.sf)