[llvm] a5d0122 - [DAG] Canonicalize non-inlane shuffle -> AND if all non-inlane referenced elements are known zero
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sat Jul 16 03:38:43 PDT 2022
Author: Simon Pilgrim
Date: 2022-07-16T11:38:24+01:00
New Revision: a5d0122f7591d249b14ab473c85c79defa146d38
URL: https://github.com/llvm/llvm-project/commit/a5d0122f7591d249b14ab473c85c79defa146d38
DIFF: https://github.com/llvm/llvm-project/commit/a5d0122f7591d249b14ab473c85c79defa146d38.diff
LOG: [DAG] Canonicalize non-inlane shuffle -> AND if all non-inlane referenced elements are known zero
As mentioned on D127115, this patch attempts to recognise shuffle masks that could be simplified to an AND mask - we already have a similar transform that will fold AND -> 'clear mask' shuffle, but this patch handles cases where the referenced elements are not from the same lane indices but are known to be zero.
Differential Revision: https://reviews.llvm.org/D129150
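
For illustration, here is a minimal IR sketch distilled from the vselect_equivalent_shuffle_v8i8_zero test updated below (the function name is made up for this example). Lanes 1 and 3 pull zeros from RHS lanes 0 and 1 rather than from their own lane indices, so this is not a select/blend mask; but because every out-of-lane referenced element is known zero, the new combine records DemandedRHS = {0,1}, proves those elements zero via MaskedVectorIsZero, and rewrites the shuffle as an AND with a constant mask:

; Not an in-lane blend: lanes 1 and 3 reference elements 8 and 9 (RHS lanes
; 0 and 1) instead of elements 9 and 11, but the RHS is all zeros.
define <8 x i8> @shuffle_as_and(<8 x i8> %a) {
  %c = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer,
                     <8 x i32> <i32 0, i32 8, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7>
  ; After the combine, the DAG computes the equivalent of:
  ;   %c = and <8 x i8> %a, <i8 -1, i8 0, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 -1>
  ret <8 x i8> %c
}

On AArch64 this now lowers to a movi + and pair instead of a tbl with a constant-pool index vector, as the neon-bitwise-instructions.ll diff below shows.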
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/test/CodeGen/AArch64/build-vector-extract.ll
llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
llvm/test/CodeGen/ARM/vector-DAGCombine.ll
llvm/test/CodeGen/X86/sad.ll
llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 73d0f91fba841..ed1dc3e8b1db9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22641,6 +22641,56 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
}
+  // If we're not performing a select/blend shuffle, see if we can convert the
+  // shuffle into an AND node, where all out-of-lane elements are known zero.
+  if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
+    bool IsInLaneMask = true;
+    ArrayRef<int> Mask = SVN->getMask();
+    SmallVector<int, 16> ClearMask(NumElts, -1);
+    APInt DemandedLHS = APInt::getNullValue(NumElts);
+    APInt DemandedRHS = APInt::getNullValue(NumElts);
+    for (int I = 0; I != (int)NumElts; ++I) {
+      int M = Mask[I];
+      if (M < 0)
+        continue;
+      ClearMask[I] = M == I ? I : (I + NumElts);
+      IsInLaneMask &= (M == I) || (M == (I + NumElts));
+      if (M != I) {
+        APInt &Demanded = M < (int)NumElts ? DemandedLHS : DemandedRHS;
+        Demanded.setBit(M % NumElts);
+      }
+    }
+    // TODO: Should we try to mask with N1 as well?
+    if (!IsInLaneMask &&
+        (!DemandedLHS.isNullValue() || !DemandedRHS.isNullValue()) &&
+        (DemandedLHS.isNullValue() ||
+         DAG.MaskedVectorIsZero(N0, DemandedLHS)) &&
+        (DemandedRHS.isNullValue() ||
+         DAG.MaskedVectorIsZero(N1, DemandedRHS))) {
+      SDLoc DL(N);
+      EVT IntVT = VT.changeVectorElementTypeToInteger();
+      EVT IntSVT = VT.getVectorElementType().changeTypeToInteger();
+      SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT);
+      SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT);
+      SmallVector<SDValue, 16> AndMask(NumElts, DAG.getUNDEF(IntSVT));
+      for (int I = 0; I != (int)NumElts; ++I)
+        if (0 <= Mask[I])
+          AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt;
+
+      // See if a clear mask is legal instead of going via
+      // XformToShuffleWithZero which loses UNDEF mask elements.
+      if (TLI.isVectorClearMaskLegal(ClearMask, IntVT))
+        return DAG.getBitcast(
+            VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0),
+                                     DAG.getConstant(0, DL, IntVT), ClearMask));
+
+      if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT))
+        return DAG.getBitcast(
+            VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0),
+                            DAG.getBuildVector(IntVT, DL, AndMask)));
+    }
+  }
+
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 447ad10ddf228..831254340062d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11811,6 +11811,12 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
+bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
+                                                   EVT VT) const {
+  // Just delegate to the generic legality check; clear masks aren't special.
+  return isShuffleMaskLegal(M, VT);
+}
+
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e02b5e56fd2e9..fcff5e04df948 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -549,6 +549,10 @@ class AArch64TargetLowering : public TargetLowering {
/// should be stack expanded.
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+  /// Similar to isShuffleMaskLegal. Return true if the given 'select with
+  /// zero' shuffle mask can be codegen'd directly.
+  bool isVectorClearMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
/// Return the ISD::SETCC ValueType.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
index 998a5304a19a8..7b60a398fa7b6 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
@@ -4,8 +4,7 @@
define <2 x i64> @extract0_i32_zext_insert0_i64_undef(<4 x i32> %x) {
; CHECK-LABEL: extract0_i32_zext_insert0_i64_undef:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: mov v0.s[1], wzr
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 0
%z = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 069767da7b495..13143041101c7 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -907,23 +907,11 @@ define <8 x i8> @vselect_equivalent_shuffle_v8i8(<8 x i8> %a, <8 x i8> %b) {
ret <8 x i8> %c
}
-; CHECK-LABEL: .LCPI90_0:
-; CHECK-NEXT: .byte 0
-; CHECK-NEXT: .byte 255
-; CHECK-NEXT: .byte 2
-; CHECK-NEXT: .byte 255
-; CHECK-NEXT: .byte 4
-; CHECK-NEXT: .byte 5
-; CHECK-NEXT: .byte 6
-; CHECK-NEXT: .byte 7
define <8 x i8> @vselect_equivalent_shuffle_v8i8_zero(<8 x i8> %a) {
; CHECK-LABEL: vselect_equivalent_shuffle_v8i8_zero:
; CHECK: // %bb.0:
-; CHECK-NEXT: adrp x8, .LCPI90_0
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov v0.d[1], v0.d[0]
-; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI90_0]
-; CHECK-NEXT: tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-NEXT: movi d1, #0xffffffff00ff00ff
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: ret
%c = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7>
ret <8 x i8> %c
@@ -982,28 +970,20 @@ define <8 x i16> @vselect_equivalent_shuffle_v8i16(<8 x i16> %a, <8 x i16> %b) {
}
; CHECK-LABEL: .LCPI93_0:
-; CHECK-NEXT: .byte 0
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 255
-; CHECK-NEXT: .byte 255
-; CHECK-NEXT: .byte 4
-; CHECK-NEXT: .byte 5
-; CHECK-NEXT: .byte 255
-; CHECK-NEXT: .byte 255
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .byte 9
-; CHECK-NEXT: .byte 10
-; CHECK-NEXT: .byte 11
-; CHECK-NEXT: .byte 12
-; CHECK-NEXT: .byte 13
-; CHECK-NEXT: .byte 14
-; CHECK-NEXT: .byte 15
+; CHECK-NEXT: .hword 65535 // 0xffff
+; CHECK-NEXT: .hword 0 // 0x0
+; CHECK-NEXT: .hword 65535 // 0xffff
+; CHECK-NEXT: .hword 0 // 0x0
+; CHECK-NEXT: .hword 65535 // 0xffff
+; CHECK-NEXT: .hword 65535 // 0xffff
+; CHECK-NEXT: .hword 65535 // 0xffff
+; CHECK-NEXT: .hword 65535 // 0xffff
define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) {
; CHECK-LABEL: vselect_equivalent_shuffle_v8i16_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI93_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI93_0]
-; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%c = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7>
ret <8 x i16> %c
diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
index f274f331a5073..1a9f44bd52e99 100644
--- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -56,7 +56,6 @@ define <4 x i32> @test_vmovrrd_combine() nounwind {
; CHECK-NEXT: bne .LBB3_2
; CHECK-NEXT: @ %bb.1: @ %bb1.preheader
; CHECK-NEXT: vmov.i32 q8, #0x0
-; CHECK-NEXT: vext.8 q8, q8, q8, #4
; CHECK-NEXT: .LBB3_2: @ %bb2
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 45dc6395b56d1..36d23e7c96d25 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -1058,47 +1058,19 @@ define dso_local i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX1-LABEL: sad_double_reduction:
-; AVX1: # %bb.0: # %bb
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu (%rdx), %xmm1
-; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sad_double_reduction:
-; AVX2: # %bb.0: # %bb
-; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu (%rdx), %xmm1
-; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sad_double_reduction:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu (%rdx), %xmm1
-; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: sad_double_reduction:
+; AVX: # %bb.0: # %bb
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%rdx), %xmm1
+; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
bb:
%tmp = load <16 x i8>, <16 x i8>* %arg, align 1
%tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
@@ -1148,47 +1120,19 @@ define dso_local i32 @sad_double_reduction_abs(<16 x i8>* %arg, <16 x i8>* %arg1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: retq
;
-; AVX1-LABEL: sad_double_reduction_abs:
-; AVX1: # %bb.0: # %bb
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu (%rdx), %xmm1
-; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sad_double_reduction_abs:
-; AVX2: # %bb.0: # %bb
-; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu (%rdx), %xmm1
-; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sad_double_reduction_abs:
-; AVX512: # %bb.0: # %bb
-; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu (%rdx), %xmm1
-; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
+; AVX-LABEL: sad_double_reduction_abs:
+; AVX: # %bb.0: # %bb
+; AVX-NEXT: vmovdqu (%rdi), %xmm0
+; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vmovdqu (%rdx), %xmm1
+; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, %eax
+; AVX-NEXT: retq
bb:
%tmp = load <16 x i8>, <16 x i8>* %arg, align 1
%tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index f3df1baaa9624..4730f5ea724b5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1013,21 +1013,21 @@ define <4 x float> @shuffle_v4f32_0z2z(<4 x float> %v) {
; SSE2-LABEL: shuffle_v4f32_0z2z:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE2-NEXT: retq
;
; SSE3-LABEL: shuffle_v4f32_0z2z:
; SSE3: # %bb.0:
; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: shuffle_v4f32_0z2z:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 7e7c456a28db9..3f2a8098a6564 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2249,9 +2249,7 @@ define <2 x i64> @test_v8i64_2_5 (<8 x i64> %v) {
define <8 x i64> @test_v8i64_insert_zero_128(<8 x i64> %a) {
; ALL-LABEL: test_v8i64_insert_zero_128:
; ALL: # %bb.0:
-; ALL-NEXT: movb $3, %al
-; ALL-NEXT: kmovw %eax, %k1
-; ALL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT: vmovaps %xmm0, %xmm0
; ALL-NEXT: ret{{[l|q]}}
%res = shufflevector <8 x i64> %a, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 8, i32 9, i32 8, i32 9>
ret <8 x i64> %res