[llvm] 51a4c61 - [X86] Add test cases for failures to form vbroadcastw due to isTypeDesirableForOp preventing load shrinking to i16.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 00:12:20 PDT 2020
Author: Craig Topper
Date: 2020-03-12T23:20:05-07:00
New Revision: 51a4c6125ca6f25cff39c82a62878556b430d7f1
URL: https://github.com/llvm/llvm-project/commit/51a4c6125ca6f25cff39c82a62878556b430d7f1
DIFF: https://github.com/llvm/llvm-project/commit/51a4c6125ca6f25cff39c82a62878556b430d7f1.diff
LOG: [X86] Add test cases for failures to form vbroadcastw due to isTypeDesirableForOp preventing load shrinking to i16.
These are based on existing test cases but use i64 instead of i32.
Some of these end up with i64 zextloads/extloads from i16 that we
don't have isel patterns for.
Some of the other cases fail because isTypeDesirableForOp prevents
shrinking the (trunc (i64 (srl (load)))) directly, so we instead try
to shrink based on the (i64 (srl (load))) alone; that shrink requires
64 - shift_amount to be a power of 2.
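As an illustration (a hypothetical minimal reduction, not part of the
committed tests), the failing elt1 pattern boils down to extracting the
i16 at bit 16 of an i64 load; shrinking that would need 64 - 16 = 48 to
be a power of 2, so the load stays i64 and we get movq plus shrq:

define i16 @srl_trunc_elt1(i64* %ptr) {
  %ld = load i64, i64* %ptr, align 4   ; i64 load that we fail to shrink
  %sh = lshr i64 %ld, 16               ; 64 - 16 = 48, not a power of 2
  %tr = trunc i64 %sh to i16
  ret i16 %tr
}

By contrast, a shift amount of 48 (the elt3/elt7 cases) leaves
64 - 48 = 16, a power of 2, so those cases do shrink to a
movzwl 6(%rdi), as the checks below show.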
Added:
Modified:
llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 2688bde878ee..a09deb9a527a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -3226,3 +3226,253 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
%tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
ret <8 x i16> %tmp4
}
+
+define <8 x i16> @insert_dup_mem_v8i16_i64(i64* %ptr) {
+; SSE-LABEL: insert_dup_mem_v8i16_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: insert_dup_mem_v8i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i64:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %xmm0
+; AVX2OR512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v8i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v8i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw (%rdi), %xmm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(i64* %ptr) {
+; SSE-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: shrq $16, %rax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: shrq $16, %rax
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movq (%rdi), %rax
+; XOPAVX2-NEXT: shrq $16, %rax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
+; SSE-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movzwl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movzwl 6(%rdi), %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
+; SSE2-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movzwl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movzwl 6(%rdi), %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
+; SSE-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; SSE: # %bb.0:
+; SSE-NEXT: movzwl (%rdi), %eax
+; SSE-NEXT: movq %rax, %xmm0
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movzwl (%rdi), %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: movzwl (%rdi), %eax
+; XOPAVX1-NEXT: vmovq %rax, %xmm0
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movzwl (%rdi), %eax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i16, i16* %ptr, align 2
+ %tmp1 = sext i16 %tmp to i64
+ %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %tmp1, i32 0
+ %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
+ %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %tmp4
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index d92db2e15c58..103de84f0c38 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -7458,6 +7458,215 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
ret <16 x i16> %tmp3
}
+define <16 x i16> @insert_dup_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_mem_v16i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_i64:
+; AVX2OR512VL: # %bb.0:
+; AVX2OR512VL-NEXT: vpbroadcastw (%rdi), %ymm0
+; AVX2OR512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v16i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v16i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: vpbroadcastw (%rdi), %ymm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: shrq $16, %rax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movq (%rdi), %rax
+; AVX512VL-NEXT: shrq $16, %rax
+; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movq (%rdi), %rax
+; XOPAVX2-NEXT: shrq $16, %rax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movzwl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movzwl 6(%rdi), %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movzwl 6(%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movzwl 6(%rdi), %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
+; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: movzwl (%rdi), %eax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movzwl (%rdi), %eax
+; AVX2-NEXT: vmovd %eax, %xmm0
+; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: movzwl (%rdi), %eax
+; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT: retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; XOPAVX1: # %bb.0:
+; XOPAVX1-NEXT: movzwl (%rdi), %eax
+; XOPAVX1-NEXT: vmovq %rax, %xmm0
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; XOPAVX2: # %bb.0:
+; XOPAVX2-NEXT: movzwl (%rdi), %eax
+; XOPAVX2-NEXT: vmovd %eax, %xmm0
+; XOPAVX2-NEXT: vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT: retq
+ %tmp = load i16, i16* %ptr, align 2
+ %tmp1 = sext i16 %tmp to i64
+ %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %tmp1, i32 0
+ %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
+ %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %tmp4
+}
+
define <16 x i16> @unpckh_v16i16(<16 x i16> %x, <16 x i16> %y) {
; AVX1-LABEL: unpckh_v16i16:
; AVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 54266b12864f..11085c945914 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -289,6 +289,111 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
ret <32 x i16> %tmp3
}
+define <32 x i16> @insert_dup_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_mem_v16i16_i64:
+; KNL: ## %bb.0:
+; KNL-NEXT: vpbroadcastw (%rdi), %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_mem_v16i16_i64:
+; SKX: ## %bb.0:
+; SKX-NEXT: vpbroadcastw (%rdi), %zmm0
+; SKX-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> zeroinitializer
+ ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; KNL: ## %bb.0:
+; KNL-NEXT: movq (%rdi), %rax
+; KNL-NEXT: shrq $16, %rax
+; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; SKX: ## %bb.0:
+; SKX-NEXT: movq (%rdi), %rax
+; SKX-NEXT: shrq $16, %rax
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; KNL: ## %bb.0:
+; KNL-NEXT: movzwl 6(%rdi), %eax
+; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; SKX: ## %bb.0:
+; SKX-NEXT: movzwl 6(%rdi), %eax
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; KNL: ## %bb.0:
+; KNL-NEXT: movzwl 6(%rdi), %eax
+; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; SKX: ## %bb.0:
+; SKX-NEXT: movzwl 6(%rdi), %eax
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: retq
+ %tmp = load i64, i64* %ptr, align 4
+ %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
+ %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+ %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+ ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
+; KNL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; KNL: ## %bb.0:
+; KNL-NEXT: movzwl (%rdi), %eax
+; KNL-NEXT: vmovd %eax, %xmm0
+; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT: retq
+;
+; SKX-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; SKX: ## %bb.0:
+; SKX-NEXT: movzwl (%rdi), %eax
+; SKX-NEXT: vpbroadcastw %eax, %zmm0
+; SKX-NEXT: retq
+ %tmp = load i16, i16* %ptr, align 2
+ %tmp1 = sext i16 %tmp to i64
+ %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %tmp1, i32 0
+ %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
+ %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <32 x i32> zeroinitializer
+ ret <32 x i16> %tmp4
+}
+
define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
; KNL: ## %bb.0: