[llvm] 5767497 - Add Extend shuffle pattern to vNf32 shuffles.
Noah Goldstein via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 24 13:22:25 PST 2023
Author: Noah Goldstein
Date: 2023-02-24T15:22:08-06:00
New Revision: 5767497943c719e7b9fd319d73f9a33b4e0f8595
URL: https://github.com/llvm/llvm-project/commit/5767497943c719e7b9fd319d73f9a33b4e0f8595
DIFF: https://github.com/llvm/llvm-project/commit/5767497943c719e7b9fd319d73f9a33b4e0f8595.diff
LOG: Add Extend shuffle pattern to vNf32 shuffles.
There are some cases where it's useful for float types; not quite as hot as
in the integer case, but still better than the alternatives.
Differential Revision: https://reviews.llvm.org/D143785
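For illustration, this is the kind of float shuffle the change affects, taken
verbatim from the updated vector-shuffle-256-v8.ll test below. The %a half of
the mask is the widening pattern <0,u,1,u,2,u,3,u>, so on AVX2 the lowering can
now bitcast to the integer type and use vpmovzxdq plus vpblendd instead of
materializing a vpermps index constant:

; %a contributes elements 0..3 to the even lanes (an extend pattern); %b fills
; the odd lanes, so the extend + blend sequence covers the whole mask.
define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
  %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
  ret <8 x float> %shuffle
}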
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/pr43866.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll
llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6c93e6aea3722..49b79e1dc67a0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -14322,6 +14322,7 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
+ InputV = DAG.getBitcast(VT, InputV);
InputV = ShuffleOffset(InputV);
InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
DL, ExtVT, InputV, DAG);
@@ -14329,6 +14330,7 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
}
assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+ InputV = DAG.getBitcast(VT, InputV);
// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
@@ -15488,6 +15490,13 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ if (Subtarget.hasSSE2())
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
+ ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
+ return ZExt;
+ }
+
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
@@ -16872,7 +16881,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// AVX vector shuffle types.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG, bool SimpleOnly) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -16900,11 +16909,10 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
std::tie(LoV2, HiV2) = SplitVector(V2);
// Now create two 4-way blends of these half-width vectors.
- auto HalfBlend = [&](ArrayRef<int> HalfMask) {
- bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
- SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
- SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
- SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+ auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
+ bool &UseHiV1, bool &UseLoV2,
+ bool &UseHiV2) {
+ UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
@@ -16912,22 +16920,49 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
UseHiV2 = true;
else
UseLoV2 = true;
- V2BlendMask[i] = M - NumElements;
- BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
+ }
+ }
+ };
+
+ auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
+ if (!SimpleOnly)
+ return true;
+
+ bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+ GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
+ return !(UseHiV1 || UseHiV2);
+ };
+
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+ for (int i = 0; i < SplitNumElements; ++i) {
+ int M = HalfMask[i];
+ if (M >= NumElements) {
+ V2BlendMask[i] = M - NumElements;
+ BlendMask[i] = SplitNumElements + i;
+ } else if (M >= 0) {
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}
+ bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+ GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.
+ assert(!SimpleOnly || (!UseHiV1 && !UseHiV2) && "Shuffle won't be simple");
+
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
@@ -16938,8 +16973,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
- V1Blend =
- DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
@@ -16948,8 +16982,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
- V2Blend =
- DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
@@ -16959,6 +16992,10 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
+
+ if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
+ return SDValue();
+
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
@@ -17015,7 +17052,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
// Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
// requires that the decomposed single-input shuffles don't end up here.
@@ -17163,6 +17201,20 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}
+/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
+static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
+ SmallVector<int> &InLaneMask) {
+ int Size = Mask.size();
+ InLaneMask.assign(Mask.begin(), Mask.end());
+ for (int i = 0; i < Size; ++i) {
+ int &M = InLaneMask[i];
+ if (M < 0)
+ continue;
+ if (((M % Size) / LaneSize) != (i / LaneSize))
+ M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+ }
+}
+
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
@@ -17207,21 +17259,17 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
- SmallVector<int, 32> InLaneMask(Mask);
- for (int i = 0; i < Size; ++i) {
- int &M = InLaneMask[i];
- if (M < 0)
- continue;
- if (((M % Size) / LaneSize) != (i / LaneSize))
- M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
- }
+ SmallVector<int> InLaneMask;
+ computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected");
// If we're not using both lanes in each lane and the inlane mask is not
// repeating, then we're better off splitting.
if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);
// Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
@@ -18356,6 +18404,19 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Broadcast;
+ if (!Subtarget.hasAVX2()) {
+ SmallVector<int> InLaneMask;
+ computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
+ if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
+ if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ true))
+ return R;
+ }
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return DAG.getBitcast(MVT::v8f32, ZExt);
+
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
@@ -18848,7 +18909,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}
MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -19087,6 +19148,10 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return DAG.getBitcast(MVT::v16f32, ZExt);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -19404,7 +19469,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
- return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}
/// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -19449,7 +19514,7 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}
if (VT == MVT::v32f16) {
diff --git a/llvm/test/CodeGen/X86/pr43866.ll b/llvm/test/CodeGen/X86/pr43866.ll
index f00c7ec96bc76..20eedbc942277 100644
--- a/llvm/test/CodeGen/X86/pr43866.ll
+++ b/llvm/test/CodeGen/X86/pr43866.ll
@@ -15,12 +15,9 @@ define dso_local void @test() {
; CHECK-NEXT: subq $64, %rsp
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,0],xmm0[1,0]
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[0,0]
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4]
; CHECK-NEXT: vmovaps %ymm0, (%rsp)
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 1546a58bc8401..ac7cc726aee6b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -491,16 +491,27 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) {
;
; AVX2-LABEL: shuffle_v8f32_091b2d3f:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-NEXT: retq
;
-; AVX512VL-LABEL: shuffle_v8f32_091b2d3f:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
-; AVX512VL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
-; AVX512VL-NEXT: retq
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,9,1,11,2,13,3,15]
+; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_091b2d3f:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512VL-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 9, i32 1, i32 11, i32 2, i32 13, i32 3, i32 15>
ret <8 x float> %shuffle
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 91f2a6715ff7b..56b48ea40b637 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2820,17 +2820,17 @@ define <4 x float> @PR30264(<4 x float> %x) {
; SSE2-LABEL: PR30264:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
+; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: PR30264:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSSE3-NEXT: shufpd {{.*#+}} xmm1 = xmm1[0],mem[1]
+; SSSE3-NEXT: movapd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: PR30264:
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index 3d9d30e107db5..86737f28e28cc 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -4677,20 +4677,16 @@ define void @vec384_v12i32_to_v6i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 16(%rcx)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: vec384_v12i32_to_v6i64_factor2:
@@ -7005,26 +7001,19 @@ define void @vec512_v16i32_to_v8i64_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpaddb 48(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 48(%rcx)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm4, %xmm3
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
; AVX-NEXT: retq
;
; AVX2-LABEL: vec512_v16i32_to_v8i64_factor2: