[llvm] b58a58c - [X86] vector-bo-select.ll - add SSE test coverage
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 11 03:26:19 PDT 2023
Author: Simon Pilgrim
Date: 2023-04-11T11:26:00+01:00
New Revision: b58a58c3b1c0dbaf75b00899a0db42a62c8f58e1
URL: https://github.com/llvm/llvm-project/commit/b58a58c3b1c0dbaf75b00899a0db42a62c8f58e1
DIFF: https://github.com/llvm/llvm-project/commit/b58a58c3b1c0dbaf75b00899a0db42a62c8f58e1.diff
LOG: [X86] vector-bo-select.ll - add SSE test coverage
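For context, the functions in this file exercise a vector select between a binop operand and that operation's identity constant, feeding the FP/integer op. The following is a minimal illustrative sketch of the kind of IR covered, reconstructed from the checked SSE output below (the function body is an assumption, not copied from the test file):

define <4 x float> @fadd_v4f32_sketch(<4 x i1> %b, <4 x float> %x, <4 x float> %y) {
  ; Select %y where the mask bit is set, otherwise the fadd identity -0.0,
  ; so masked-off lanes leave %x unchanged after the add.
  %s = select <4 x i1> %b, <4 x float> %y, <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>
  %r = fadd <4 x float> %x, %s
  ret <4 x float> %r
}

On SSE4.2 this lowers to the pslld/blendvps/addps sequence seen in the fadd_v4f32 checks; on SSE2 the blend is emulated with pand/pandn/por.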
Added:
Modified:
llvm/test/CodeGen/X86/vector-bo-select.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index 8653835ca917..e1d10688a422 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -1,9 +1,30 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
define <4 x float> @fadd_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) {
+; SSE2-LABEL: fadd_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fadd_v4f32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE42-NEXT: addps %xmm1, %xmm3
+; SSE42-NEXT: movaps %xmm3, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fadd_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -36,6 +57,45 @@ define <4 x float> @fadd_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float>
}
define <8 x float> @fadd_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE2-LABEL: fadd_v8f32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: addps %xmm2, %xmm5
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fadd_v8f32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6
+; SSE42-NEXT: addps %xmm1, %xmm7
+; SSE42-NEXT: addps %xmm2, %xmm6
+; SSE42-NEXT: movaps %xmm7, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fadd_v8f32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -70,6 +130,76 @@ define <8 x float> @fadd_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x
}
define <16 x float> @fadd_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fadd_v16f32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm10, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: por %xmm8, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm10, %xmm8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm10, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: addps %xmm2, %xmm7
+; SSE2-NEXT: addps %xmm3, %xmm8
+; SSE2-NEXT: addps %xmm4, %xmm9
+; SSE2-NEXT: movaps %xmm7, %xmm1
+; SSE2-NEXT: movaps %xmm8, %xmm2
+; SSE2-NEXT: movaps %xmm9, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fadd_v16f32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm3, %xmm8
+; SSE42-NEXT: movdqa %xmm0, %xmm9
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5
+; SSE42-NEXT: addps %xmm1, %xmm5
+; SSE42-NEXT: addps %xmm2, %xmm6
+; SSE42-NEXT: addps %xmm8, %xmm7
+; SSE42-NEXT: addps %xmm4, %xmm3
+; SSE42-NEXT: movaps %xmm5, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: movaps %xmm7, %xmm2
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fadd_v16f32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -99,6 +229,76 @@ define <16 x float> @fadd_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16
}
define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fadd_v16f32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm10, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: movdqa %xmm8, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: por %xmm8, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm10, %xmm8
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: movdqa %xmm10, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm10, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm6, %xmm0
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: addps %xmm2, %xmm7
+; SSE2-NEXT: addps %xmm3, %xmm8
+; SSE2-NEXT: addps %xmm4, %xmm9
+; SSE2-NEXT: movaps %xmm7, %xmm1
+; SSE2-NEXT: movaps %xmm8, %xmm2
+; SSE2-NEXT: movaps %xmm9, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fadd_v16f32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm3, %xmm8
+; SSE42-NEXT: movdqa %xmm0, %xmm9
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5
+; SSE42-NEXT: addps %xmm1, %xmm5
+; SSE42-NEXT: addps %xmm2, %xmm6
+; SSE42-NEXT: addps %xmm8, %xmm7
+; SSE42-NEXT: addps %xmm4, %xmm3
+; SSE42-NEXT: movaps %xmm5, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: movaps %xmm7, %xmm2
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fadd_v16f32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -128,6 +328,15 @@ define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef
}
define <4 x float> @fsub_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) {
+; SSE-LABEL: fsub_v4f32:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: subps %xmm0, %xmm1
+; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: retq
+;
; AVX2-LABEL: fsub_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -161,6 +370,38 @@ define <4 x float> @fsub_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float>
; negative test - fsub is not commutative; there is no identity constant for operand 0
define <8 x float> @fsub_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE2-LABEL: fsub_v8f32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: subps %xmm1, %xmm0
+; SSE2-NEXT: subps %xmm2, %xmm5
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fsub_v8f32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm3, %xmm0
+; SSE42-NEXT: subps %xmm1, %xmm0
+; SSE42-NEXT: subps %xmm2, %xmm5
+; SSE42-NEXT: movaps %xmm5, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fsub_v8f32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -194,6 +435,70 @@ define <8 x float> @fsub_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x
}
define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fsub_v16f32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn %xmm7, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: subps %xmm0, %xmm1
+; SSE2-NEXT: subps %xmm7, %xmm2
+; SSE2-NEXT: subps %xmm9, %xmm3
+; SSE2-NEXT: subps %xmm8, %xmm4
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fsub_v16f32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm8
+; SSE42-NEXT: psrad $31, %xmm8
+; SSE42-NEXT: pandn %xmm7, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: subps %xmm6, %xmm1
+; SSE42-NEXT: subps %xmm7, %xmm2
+; SSE42-NEXT: subps %xmm8, %xmm3
+; SSE42-NEXT: subps %xmm0, %xmm4
+; SSE42-NEXT: movaps %xmm1, %xmm0
+; SSE42-NEXT: movaps %xmm2, %xmm1
+; SSE42-NEXT: movaps %xmm3, %xmm2
+; SSE42-NEXT: movaps %xmm4, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fsub_v16f32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -226,6 +531,69 @@ define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16
; negative test - fsub is not commutative; there is no identity constant for operand 0
define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fsub_v16f32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: subps %xmm1, %xmm0
+; SSE2-NEXT: subps %xmm8, %xmm7
+; SSE2-NEXT: subps %xmm3, %xmm2
+; SSE2-NEXT: subps %xmm4, %xmm9
+; SSE2-NEXT: movaps %xmm7, %xmm1
+; SSE2-NEXT: movaps %xmm9, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fsub_v16f32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm2, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm2
+; SSE42-NEXT: psrad $31, %xmm2
+; SSE42-NEXT: pandn %xmm7, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: subps %xmm1, %xmm6
+; SSE42-NEXT: subps %xmm8, %xmm7
+; SSE42-NEXT: subps %xmm3, %xmm2
+; SSE42-NEXT: subps %xmm4, %xmm5
+; SSE42-NEXT: movaps %xmm6, %xmm0
+; SSE42-NEXT: movaps %xmm7, %xmm1
+; SSE42-NEXT: movaps %xmm5, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fsub_v16f32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -256,6 +624,25 @@ define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef
}
define <4 x float> @fmul_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) {
+; SSE2-LABEL: fmul_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fmul_v4f32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE42-NEXT: mulps %xmm1, %xmm3
+; SSE42-NEXT: movaps %xmm3, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fmul_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -288,6 +675,45 @@ define <4 x float> @fmul_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float>
}
define <8 x float> @fmul_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE2-LABEL: fmul_v8f32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: mulps %xmm2, %xmm5
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fmul_v8f32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6
+; SSE42-NEXT: mulps %xmm1, %xmm7
+; SSE42-NEXT: mulps %xmm2, %xmm6
+; SSE42-NEXT: movaps %xmm7, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fmul_v8f32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -322,6 +748,80 @@ define <8 x float> @fmul_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x
}
define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fmul_v16f32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: psrad $31, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: pand %xmm10, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: por %xmm9, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: pand %xmm2, %xmm9
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: por %xmm9, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: movdqa %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm9
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm11
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm11, %xmm0
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: mulps %xmm8, %xmm9
+; SSE2-NEXT: mulps %xmm3, %xmm2
+; SSE2-NEXT: mulps %xmm4, %xmm10
+; SSE2-NEXT: movaps %xmm9, %xmm1
+; SSE2-NEXT: movaps %xmm10, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fmul_v16f32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm3, %xmm8
+; SSE42-NEXT: movdqa %xmm0, %xmm9
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5
+; SSE42-NEXT: mulps %xmm1, %xmm5
+; SSE42-NEXT: mulps %xmm2, %xmm6
+; SSE42-NEXT: mulps %xmm8, %xmm7
+; SSE42-NEXT: mulps %xmm4, %xmm3
+; SSE42-NEXT: movaps %xmm5, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: movaps %xmm7, %xmm2
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fmul_v16f32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -351,6 +851,80 @@ define <16 x float> @fmul_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16
}
define <16 x float> @fmul_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fmul_v16f32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: psrad $31, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: pand %xmm10, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: por %xmm9, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: pand %xmm2, %xmm9
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: por %xmm9, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: movdqa %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm9
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm11
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm11, %xmm0
+; SSE2-NEXT: mulps %xmm1, %xmm0
+; SSE2-NEXT: mulps %xmm8, %xmm9
+; SSE2-NEXT: mulps %xmm3, %xmm2
+; SSE2-NEXT: mulps %xmm4, %xmm10
+; SSE2-NEXT: movaps %xmm9, %xmm1
+; SSE2-NEXT: movaps %xmm10, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fmul_v16f32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm3, %xmm8
+; SSE42-NEXT: movdqa %xmm0, %xmm9
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5
+; SSE42-NEXT: mulps %xmm1, %xmm5
+; SSE42-NEXT: mulps %xmm2, %xmm6
+; SSE42-NEXT: mulps %xmm8, %xmm7
+; SSE42-NEXT: mulps %xmm4, %xmm3
+; SSE42-NEXT: movaps %xmm5, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: movaps %xmm7, %xmm2
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fmul_v16f32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -380,6 +954,26 @@ define <16 x float> @fmul_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef
}
define <4 x float> @fdiv_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float> noundef %y) {
+; SSE2-LABEL: fdiv_v4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: divps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fdiv_v4f32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE42-NEXT: divps %xmm3, %xmm1
+; SSE42-NEXT: movaps %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fdiv_v4f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -412,6 +1006,45 @@ define <4 x float> @fdiv_v4f32(<4 x i1> %b, <4 x float> noundef %x, <4 x float>
}
define <8 x float> @fdiv_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE2-LABEL: fdiv_v8f32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pandn %xmm6, %xmm0
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: divps %xmm1, %xmm0
+; SSE2-NEXT: divps %xmm2, %xmm5
+; SSE2-NEXT: movaps %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fdiv_v8f32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6
+; SSE42-NEXT: divps %xmm1, %xmm7
+; SSE42-NEXT: divps %xmm2, %xmm6
+; SSE42-NEXT: movaps %xmm7, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fdiv_v8f32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -447,6 +1080,81 @@ define <8 x float> @fdiv_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x
}
define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fdiv_v16f32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm8, %xmm11
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: por %xmm11, %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm9, %xmm11
+; SSE2-NEXT: pandn %xmm7, %xmm9
+; SSE2-NEXT: por %xmm11, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pand %xmm7, %xmm11
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: por %xmm11, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm10
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm10, %xmm0
+; SSE2-NEXT: divps %xmm0, %xmm1
+; SSE2-NEXT: divps %xmm7, %xmm2
+; SSE2-NEXT: divps %xmm9, %xmm3
+; SSE2-NEXT: divps %xmm8, %xmm4
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: movaps %xmm2, %xmm1
+; SSE2-NEXT: movaps %xmm3, %xmm2
+; SSE2-NEXT: movaps %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fdiv_v16f32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm8
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm9
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5
+; SSE42-NEXT: divps %xmm5, %xmm1
+; SSE42-NEXT: divps %xmm6, %xmm2
+; SSE42-NEXT: divps %xmm7, %xmm3
+; SSE42-NEXT: divps %xmm9, %xmm4
+; SSE42-NEXT: movaps %xmm1, %xmm0
+; SSE42-NEXT: movaps %xmm2, %xmm1
+; SSE42-NEXT: movaps %xmm3, %xmm2
+; SSE42-NEXT: movaps %xmm4, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fdiv_v16f32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -476,6 +1184,80 @@ define <16 x float> @fdiv_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16
}
define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) {
+; SSE2-LABEL: fdiv_v16f32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movaps %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: psrad $31, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: pand %xmm10, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm10
+; SSE2-NEXT: por %xmm9, %xmm10
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: movdqa %xmm11, %xmm9
+; SSE2-NEXT: pand %xmm2, %xmm9
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: por %xmm9, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: movdqa %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm9
+; SSE2-NEXT: por %xmm7, %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm11
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: por %xmm11, %xmm0
+; SSE2-NEXT: divps %xmm1, %xmm0
+; SSE2-NEXT: divps %xmm8, %xmm9
+; SSE2-NEXT: divps %xmm3, %xmm2
+; SSE2-NEXT: divps %xmm4, %xmm10
+; SSE2-NEXT: movaps %xmm9, %xmm1
+; SSE2-NEXT: movaps %xmm10, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fdiv_v16f32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm3, %xmm8
+; SSE42-NEXT: movdqa %xmm0, %xmm9
+; SSE42-NEXT: movaps {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm10 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm7
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm6
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm10, %xmm5
+; SSE42-NEXT: divps %xmm1, %xmm5
+; SSE42-NEXT: divps %xmm2, %xmm6
+; SSE42-NEXT: divps %xmm8, %xmm7
+; SSE42-NEXT: divps %xmm4, %xmm3
+; SSE42-NEXT: movaps %xmm5, %xmm0
+; SSE42-NEXT: movaps %xmm6, %xmm1
+; SSE42-NEXT: movaps %xmm7, %xmm2
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fdiv_v16f32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -505,6 +1287,50 @@ define <16 x float> @fdiv_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef
}
define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE2-LABEL: fadd_v8f32_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: addps %xmm4, %xmm0
+; SSE2-NEXT: addps %xmm6, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fadd_v8f32_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm0, %xmm4
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8]
+; SSE42-NEXT: pand %xmm0, %xmm5
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6
+; SSE42-NEXT: addps %xmm4, %xmm6
+; SSE42-NEXT: addps %xmm7, %xmm1
+; SSE42-NEXT: movaps %xmm6, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fadd_v8f32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -538,6 +1364,88 @@ define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
}
define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
+; SSE2-LABEL: fadd_v8f64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm8, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [-0.0E+0,-0.0E+0]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pandn %xmm10, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm7, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pandn %xmm10, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm6, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: addpd %xmm5, %xmm0
+; SSE2-NEXT: addpd %xmm6, %xmm1
+; SSE2-NEXT: addpd %xmm7, %xmm2
+; SSE2-NEXT: addpd %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fadd_v8f64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movapd %xmm0, %xmm8
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm9, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm9, %xmm0
+; SSE42-NEXT: movapd {{.*#+}} xmm9 = [-0.0E+0,-0.0E+0]
+; SSE42-NEXT: movapd %xmm9, %xmm11
+; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm7, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
+; SSE42-NEXT: movapd %xmm9, %xmm7
+; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm0
+; SSE42-NEXT: movapd %xmm9, %xmm6
+; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE42-NEXT: pand %xmm0, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm0, %xmm10
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9
+; SSE42-NEXT: addpd %xmm8, %xmm9
+; SSE42-NEXT: addpd %xmm6, %xmm1
+; SSE42-NEXT: addpd %xmm7, %xmm2
+; SSE42-NEXT: addpd %xmm11, %xmm3
+; SSE42-NEXT: movapd %xmm9, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fadd_v8f64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -567,6 +1475,23 @@ define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n
}
define <8 x float> @fsub_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE-LABEL: fsub_v8f32_cast_cond:
+; SSE: # %bb.0:
+; SSE-NEXT: movd %edi, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: subps %xmm4, %xmm0
+; SSE-NEXT: subps %xmm6, %xmm1
+; SSE-NEXT: retq
+;
; AVX2-LABEL: fsub_v8f32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -598,6 +1523,72 @@ define <8 x float> @fsub_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
}
define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
+; SSE2-LABEL: fsub_v8f64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: subpd %xmm9, %xmm0
+; SSE2-NEXT: subpd %xmm10, %xmm1
+; SSE2-NEXT: subpd %xmm7, %xmm2
+; SSE2-NEXT: subpd %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fsub_v8f64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE42-NEXT: movdqa %xmm9, %xmm8
+; SSE42-NEXT: pand %xmm10, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm8
+; SSE42-NEXT: pand %xmm7, %xmm8
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE42-NEXT: movdqa %xmm9, %xmm10
+; SSE42-NEXT: pand %xmm7, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm7, %xmm10
+; SSE42-NEXT: pand %xmm6, %xmm10
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm9, %xmm7
+; SSE42-NEXT: pand %xmm6, %xmm7
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm7
+; SSE42-NEXT: pand %xmm5, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE42-NEXT: pand %xmm5, %xmm9
+; SSE42-NEXT: pcmpeqq %xmm5, %xmm9
+; SSE42-NEXT: pand %xmm4, %xmm9
+; SSE42-NEXT: subpd %xmm9, %xmm0
+; SSE42-NEXT: subpd %xmm7, %xmm1
+; SSE42-NEXT: subpd %xmm10, %xmm2
+; SSE42-NEXT: subpd %xmm8, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fsub_v8f64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -626,6 +1617,50 @@ define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n
}
define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE2-LABEL: fmul_v8f32_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: mulps %xmm4, %xmm0
+; SSE2-NEXT: mulps %xmm6, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fmul_v8f32_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm0, %xmm4
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8]
+; SSE42-NEXT: pand %xmm0, %xmm5
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6
+; SSE42-NEXT: mulps %xmm4, %xmm6
+; SSE42-NEXT: mulps %xmm7, %xmm1
+; SSE42-NEXT: movaps %xmm6, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fmul_v8f32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -659,6 +1694,88 @@ define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
}
define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
+; SSE2-LABEL: fmul_v8f64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm8, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pandn %xmm10, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm7, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pandn %xmm10, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm6, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: mulpd %xmm5, %xmm0
+; SSE2-NEXT: mulpd %xmm6, %xmm1
+; SSE2-NEXT: mulpd %xmm7, %xmm2
+; SSE2-NEXT: mulpd %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fmul_v8f64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movapd %xmm0, %xmm8
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm9, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm9, %xmm0
+; SSE42-NEXT: movapd {{.*#+}} xmm9 = [1.0E+0,1.0E+0]
+; SSE42-NEXT: movapd %xmm9, %xmm11
+; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm7, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
+; SSE42-NEXT: movapd %xmm9, %xmm7
+; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm0
+; SSE42-NEXT: movapd %xmm9, %xmm6
+; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE42-NEXT: pand %xmm0, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm0, %xmm10
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9
+; SSE42-NEXT: mulpd %xmm8, %xmm9
+; SSE42-NEXT: mulpd %xmm6, %xmm1
+; SSE42-NEXT: mulpd %xmm7, %xmm2
+; SSE42-NEXT: mulpd %xmm11, %xmm3
+; SSE42-NEXT: movapd %xmm9, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fmul_v8f64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -688,6 +1805,50 @@ define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n
}
define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> noundef %x, <8 x float> noundef %y) {
+; SSE2-LABEL: fdiv_v8f32_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pandn %xmm5, %xmm6
+; SSE2-NEXT: por %xmm3, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm2, %xmm4
+; SSE2-NEXT: divps %xmm4, %xmm0
+; SSE2-NEXT: divps %xmm6, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fdiv_v8f32_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movaps %xmm0, %xmm4
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8]
+; SSE42-NEXT: pand %xmm0, %xmm5
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6
+; SSE42-NEXT: divps %xmm6, %xmm4
+; SSE42-NEXT: divps %xmm7, %xmm1
+; SSE42-NEXT: movaps %xmm4, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fdiv_v8f32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -721,6 +1882,88 @@ define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
}
define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
+; SSE2-LABEL: fdiv_v8f64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm8, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1.0E+0,1.0E+0]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pandn %xmm10, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm7, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pandn %xmm10, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm6, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: divpd %xmm5, %xmm0
+; SSE2-NEXT: divpd %xmm6, %xmm1
+; SSE2-NEXT: divpd %xmm7, %xmm2
+; SSE2-NEXT: divpd %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: fdiv_v8f64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movapd %xmm0, %xmm8
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE42-NEXT: movdqa %xmm9, %xmm0
+; SSE42-NEXT: pand %xmm10, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm0
+; SSE42-NEXT: movapd {{.*#+}} xmm11 = [1.0E+0,1.0E+0]
+; SSE42-NEXT: movapd %xmm11, %xmm10
+; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm10
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE42-NEXT: movdqa %xmm9, %xmm0
+; SSE42-NEXT: pand %xmm7, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
+; SSE42-NEXT: movapd %xmm11, %xmm7
+; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm9, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm0
+; SSE42-NEXT: movapd %xmm11, %xmm6
+; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE42-NEXT: pand %xmm0, %xmm9
+; SSE42-NEXT: pcmpeqq %xmm0, %xmm9
+; SSE42-NEXT: movdqa %xmm9, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm11
+; SSE42-NEXT: divpd %xmm11, %xmm8
+; SSE42-NEXT: divpd %xmm6, %xmm1
+; SSE42-NEXT: divpd %xmm7, %xmm2
+; SSE42-NEXT: divpd %xmm10, %xmm3
+; SSE42-NEXT: movapd %xmm8, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: fdiv_v8f64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -750,6 +1993,14 @@ define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> n
}
define <4 x i32> @add_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) {
+; SSE-LABEL: add_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: paddd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
; AVX2-LABEL: add_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -781,6 +2032,38 @@ define <4 x i32> @add_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef
}
define <8 x i32> @add_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: add_v8i32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: paddd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: add_v8i32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm3, %xmm0
+; SSE42-NEXT: paddd %xmm1, %xmm0
+; SSE42-NEXT: paddd %xmm2, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: add_v8i32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -814,6 +2097,23 @@ define <8 x i32> @add_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32>
}
define <8 x i32> @add_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE-LABEL: add_v8i32_cast_cond:
+; SSE: # %bb.0:
+; SSE-NEXT: movd %edi, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: paddd %xmm4, %xmm0
+; SSE-NEXT: paddd %xmm6, %xmm1
+; SSE-NEXT: retq
+;
; AVX2-LABEL: add_v8i32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -845,6 +2145,72 @@ define <8 x i32> @add_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
}
define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
+; SSE2-LABEL: add_v8i64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: paddq %xmm9, %xmm0
+; SSE2-NEXT: paddq %xmm10, %xmm1
+; SSE2-NEXT: paddq %xmm7, %xmm2
+; SSE2-NEXT: paddq %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: add_v8i64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE42-NEXT: movdqa %xmm9, %xmm8
+; SSE42-NEXT: pand %xmm10, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm8
+; SSE42-NEXT: pand %xmm7, %xmm8
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE42-NEXT: movdqa %xmm9, %xmm10
+; SSE42-NEXT: pand %xmm7, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm7, %xmm10
+; SSE42-NEXT: pand %xmm6, %xmm10
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm9, %xmm7
+; SSE42-NEXT: pand %xmm6, %xmm7
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm7
+; SSE42-NEXT: pand %xmm5, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE42-NEXT: pand %xmm5, %xmm9
+; SSE42-NEXT: pcmpeqq %xmm5, %xmm9
+; SSE42-NEXT: pand %xmm4, %xmm9
+; SSE42-NEXT: paddq %xmm9, %xmm0
+; SSE42-NEXT: paddq %xmm7, %xmm1
+; SSE42-NEXT: paddq %xmm10, %xmm2
+; SSE42-NEXT: paddq %xmm8, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: add_v8i64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -873,6 +2239,15 @@ define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
}
define <4 x i32> @sub_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) {
+; SSE-LABEL: sub_v4i32:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $31, %xmm0
+; SSE-NEXT: psrad $31, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psubd %xmm0, %xmm1
+; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: retq
+;
; AVX2-LABEL: sub_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -906,6 +2281,38 @@ define <4 x i32> @sub_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef
; negative test - sub is not commutative; there is no identity constant for operand 0
define <8 x i32> @sub_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: sub_v8i32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm2, %xmm5
+; SSE2-NEXT: movdqa %xmm5, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: sub_v8i32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm3, %xmm0
+; SSE42-NEXT: psubd %xmm1, %xmm0
+; SSE42-NEXT: psubd %xmm2, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: sub_v8i32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -939,6 +2346,70 @@ define <8 x i32> @sub_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32>
}
define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: sub_v16i32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn %xmm7, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm1
+; SSE2-NEXT: psubd %xmm7, %xmm2
+; SSE2-NEXT: psubd %xmm9, %xmm3
+; SSE2-NEXT: psubd %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: sub_v16i32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm8
+; SSE42-NEXT: psrad $31, %xmm8
+; SSE42-NEXT: pandn %xmm7, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT: psubd %xmm6, %xmm1
+; SSE42-NEXT: psubd %xmm7, %xmm2
+; SSE42-NEXT: psubd %xmm8, %xmm3
+; SSE42-NEXT: psubd %xmm0, %xmm4
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: movdqa %xmm2, %xmm1
+; SSE42-NEXT: movdqa %xmm3, %xmm2
+; SSE42-NEXT: movdqa %xmm4, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: sub_v16i32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -971,6 +2442,69 @@ define <16 x i32> @sub_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3
; negative test - sub is not commutative; there is no identity constant for operand 0
define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: sub_v16i32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm2, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: psubd %xmm1, %xmm0
+; SSE2-NEXT: psubd %xmm8, %xmm7
+; SSE2-NEXT: psubd %xmm3, %xmm2
+; SSE2-NEXT: psubd %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm1
+; SSE2-NEXT: movdqa %xmm9, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: sub_v16i32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm2, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm2
+; SSE42-NEXT: psrad $31, %xmm2
+; SSE42-NEXT: pandn %xmm7, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: psubd %xmm1, %xmm6
+; SSE42-NEXT: psubd %xmm8, %xmm7
+; SSE42-NEXT: psubd %xmm3, %xmm2
+; SSE42-NEXT: psubd %xmm4, %xmm5
+; SSE42-NEXT: movdqa %xmm6, %xmm0
+; SSE42-NEXT: movdqa %xmm7, %xmm1
+; SSE42-NEXT: movdqa %xmm5, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: sub_v16i32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1001,6 +2535,23 @@ define <16 x i32> @sub_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x,
}
define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE-LABEL: sub_v8i32_cast_cond:
+; SSE: # %bb.0:
+; SSE-NEXT: movd %edi, %xmm4
+; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE-NEXT: movdqa %xmm4, %xmm6
+; SSE-NEXT: pand %xmm5, %xmm6
+; SSE-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE-NEXT: pand %xmm3, %xmm6
+; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE-NEXT: pand %xmm3, %xmm4
+; SSE-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE-NEXT: pand %xmm2, %xmm4
+; SSE-NEXT: psubd %xmm4, %xmm0
+; SSE-NEXT: psubd %xmm6, %xmm1
+; SSE-NEXT: retq
+;
; AVX2-LABEL: sub_v8i32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -1032,6 +2583,72 @@ define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
}
define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
+; SSE2-LABEL: sub_v8i64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: psubq %xmm9, %xmm0
+; SSE2-NEXT: psubq %xmm10, %xmm1
+; SSE2-NEXT: psubq %xmm7, %xmm2
+; SSE2-NEXT: psubq %xmm8, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: sub_v8i64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE42-NEXT: movdqa %xmm9, %xmm8
+; SSE42-NEXT: pand %xmm10, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm8
+; SSE42-NEXT: pand %xmm7, %xmm8
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE42-NEXT: movdqa %xmm9, %xmm10
+; SSE42-NEXT: pand %xmm7, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm7, %xmm10
+; SSE42-NEXT: pand %xmm6, %xmm10
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm9, %xmm7
+; SSE42-NEXT: pand %xmm6, %xmm7
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm7
+; SSE42-NEXT: pand %xmm5, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE42-NEXT: pand %xmm5, %xmm9
+; SSE42-NEXT: pcmpeqq %xmm5, %xmm9
+; SSE42-NEXT: pand %xmm4, %xmm9
+; SSE42-NEXT: psubq %xmm9, %xmm0
+; SSE42-NEXT: psubq %xmm7, %xmm1
+; SSE42-NEXT: psubq %xmm10, %xmm2
+; SSE42-NEXT: psubq %xmm8, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: sub_v8i64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -1060,6 +2677,32 @@ define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
}
define <4 x i32> @mul_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) {
+; SSE2-LABEL: mul_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: psubd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mul_v4i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm3 = [1,1,1,1]
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm3
+; SSE42-NEXT: pmulld %xmm1, %xmm3
+; SSE42-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: mul_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -1092,6 +2735,56 @@ define <4 x i32> @mul_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef
}
define <8 x i32> @mul_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: mul_v8i32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: psubd %xmm5, %xmm4
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: paddd %xmm0, %xmm3
+; SSE2-NEXT: psubd %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mul_v8i32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm4, %xmm6
+; SSE42-NEXT: pmulld %xmm1, %xmm7
+; SSE42-NEXT: pmulld %xmm2, %xmm6
+; SSE42-NEXT: movdqa %xmm7, %xmm0
+; SSE42-NEXT: movdqa %xmm6, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: mul_v8i32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1126,6 +2819,62 @@ define <8 x i32> @mul_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32>
}
define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: mul_v8i32_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE2-NEXT: movdqa %xmm4, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE2-NEXT: psubd %xmm5, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [1,2,4,8]
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm2
+; SSE2-NEXT: paddd %xmm4, %xmm2
+; SSE2-NEXT: psubd %xmm5, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mul_v8i32_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,0,0]
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm6, %xmm0
+; SSE42-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1]
+; SSE42-NEXT: movaps %xmm6, %xmm7
+; SSE42-NEXT: blendvps %xmm0, %xmm3, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2,4,8]
+; SSE42-NEXT: pand %xmm0, %xmm5
+; SSE42-NEXT: pcmpeqd %xmm0, %xmm5
+; SSE42-NEXT: movdqa %xmm5, %xmm0
+; SSE42-NEXT: blendvps %xmm0, %xmm2, %xmm6
+; SSE42-NEXT: pmulld %xmm4, %xmm6
+; SSE42-NEXT: pmulld %xmm7, %xmm1
+; SSE42-NEXT: movdqa %xmm6, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: mul_v8i32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -1159,6 +2908,160 @@ define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
}
define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
+; SSE2-LABEL: mul_v8i64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm8, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm8, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [1,1]
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pandn %xmm10, %xmm8
+; SSE2-NEXT: por %xmm7, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm7, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm7
+; SSE2-NEXT: pand %xmm7, %xmm6
+; SSE2-NEXT: pandn %xmm10, %xmm7
+; SSE2-NEXT: por %xmm6, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm11
+; SSE2-NEXT: pand %xmm6, %xmm11
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm11[1,0,3,2]
+; SSE2-NEXT: pand %xmm11, %xmm6
+; SSE2-NEXT: pand %xmm6, %xmm5
+; SSE2-NEXT: pandn %xmm10, %xmm6
+; SSE2-NEXT: por %xmm5, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm9, %xmm5
+; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm10, %xmm5
+; SSE2-NEXT: por %xmm4, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm5, %xmm4
+; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: psrlq $32, %xmm9
+; SSE2-NEXT: pmuludq %xmm0, %xmm9
+; SSE2-NEXT: paddq %xmm4, %xmm9
+; SSE2-NEXT: psllq $32, %xmm9
+; SSE2-NEXT: pmuludq %xmm5, %xmm0
+; SSE2-NEXT: paddq %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm6, %xmm4
+; SSE2-NEXT: movdqa %xmm6, %xmm5
+; SSE2-NEXT: psrlq $32, %xmm5
+; SSE2-NEXT: pmuludq %xmm1, %xmm5
+; SSE2-NEXT: paddq %xmm4, %xmm5
+; SSE2-NEXT: psllq $32, %xmm5
+; SSE2-NEXT: pmuludq %xmm6, %xmm1
+; SSE2-NEXT: paddq %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm7, %xmm4
+; SSE2-NEXT: movdqa %xmm7, %xmm5
+; SSE2-NEXT: psrlq $32, %xmm5
+; SSE2-NEXT: pmuludq %xmm2, %xmm5
+; SSE2-NEXT: paddq %xmm4, %xmm5
+; SSE2-NEXT: psllq $32, %xmm5
+; SSE2-NEXT: pmuludq %xmm7, %xmm2
+; SSE2-NEXT: paddq %xmm5, %xmm2
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlq $32, %xmm4
+; SSE2-NEXT: pmuludq %xmm8, %xmm4
+; SSE2-NEXT: movdqa %xmm8, %xmm5
+; SSE2-NEXT: psrlq $32, %xmm5
+; SSE2-NEXT: pmuludq %xmm3, %xmm5
+; SSE2-NEXT: paddq %xmm4, %xmm5
+; SSE2-NEXT: psllq $32, %xmm5
+; SSE2-NEXT: pmuludq %xmm8, %xmm3
+; SSE2-NEXT: paddq %xmm5, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: mul_v8i64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm8
+; SSE42-NEXT: movd %edi, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [64,128]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm9, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm9, %xmm0
+; SSE42-NEXT: movapd {{.*#+}} xmm9 = [1,1]
+; SSE42-NEXT: movapd %xmm9, %xmm11
+; SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm11
+; SSE42-NEXT: movdqa {{.*#+}} xmm7 = [16,32]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm7, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm7, %xmm0
+; SSE42-NEXT: movapd %xmm9, %xmm7
+; SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: pand %xmm6, %xmm0
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm0
+; SSE42-NEXT: movapd %xmm9, %xmm6
+; SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [1,2]
+; SSE42-NEXT: pand %xmm0, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm0, %xmm10
+; SSE42-NEXT: movdqa %xmm10, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm9
+; SSE42-NEXT: movdqa %xmm8, %xmm0
+; SSE42-NEXT: psrlq $32, %xmm0
+; SSE42-NEXT: pmuludq %xmm9, %xmm0
+; SSE42-NEXT: movdqa %xmm9, %xmm4
+; SSE42-NEXT: psrlq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm8, %xmm4
+; SSE42-NEXT: paddq %xmm0, %xmm4
+; SSE42-NEXT: psllq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm8, %xmm9
+; SSE42-NEXT: paddq %xmm4, %xmm9
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: psrlq $32, %xmm0
+; SSE42-NEXT: pmuludq %xmm6, %xmm0
+; SSE42-NEXT: movdqa %xmm6, %xmm4
+; SSE42-NEXT: psrlq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm1, %xmm4
+; SSE42-NEXT: paddq %xmm0, %xmm4
+; SSE42-NEXT: psllq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm6, %xmm1
+; SSE42-NEXT: paddq %xmm4, %xmm1
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: psrlq $32, %xmm0
+; SSE42-NEXT: pmuludq %xmm7, %xmm0
+; SSE42-NEXT: movdqa %xmm7, %xmm4
+; SSE42-NEXT: psrlq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm2, %xmm4
+; SSE42-NEXT: paddq %xmm0, %xmm4
+; SSE42-NEXT: psllq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm7, %xmm2
+; SSE42-NEXT: paddq %xmm4, %xmm2
+; SSE42-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NEXT: psrlq $32, %xmm0
+; SSE42-NEXT: pmuludq %xmm11, %xmm0
+; SSE42-NEXT: movdqa %xmm11, %xmm4
+; SSE42-NEXT: psrlq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm3, %xmm4
+; SSE42-NEXT: paddq %xmm0, %xmm4
+; SSE42-NEXT: psllq $32, %xmm4
+; SSE42-NEXT: pmuludq %xmm11, %xmm3
+; SSE42-NEXT: paddq %xmm4, %xmm3
+; SSE42-NEXT: movdqa %xmm9, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: mul_v8i64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -1209,6 +3112,34 @@ define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
}
define <4 x i32> @shl_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) {
+; SSE2-LABEL: shl_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pslld $23, %xmm0
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: cvttps2dq %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shl_v4i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm2, %xmm0
+; SSE42-NEXT: pslld $23, %xmm0
+; SSE42-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE42-NEXT: pmulld %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: shl_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -1242,6 +3173,62 @@ define <4 x i32> @shl_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef
; negative test - shl is not commutative; there is no identity constant for operand 0
define <8 x i32> @shl_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: shl_v8i32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm5
+; SSE2-NEXT: psrad $31, %xmm5
+; SSE2-NEXT: pand %xmm4, %xmm5
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd %xmm3, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shl_v8i32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm4, %xmm0
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pand %xmm3, %xmm5
+; SSE42-NEXT: pslld $23, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
+; SSE42-NEXT: paddd %xmm4, %xmm1
+; SSE42-NEXT: cvttps2dq %xmm1, %xmm3
+; SSE42-NEXT: pmulld %xmm5, %xmm3
+; SSE42-NEXT: pslld $23, %xmm2
+; SSE42-NEXT: paddd %xmm4, %xmm2
+; SSE42-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE42-NEXT: pmulld %xmm0, %xmm1
+; SSE42-NEXT: movdqa %xmm3, %xmm0
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: shl_v8i32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1275,6 +3262,112 @@ define <8 x i32> @shl_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32>
}
define <16 x i32> @shl_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: shl_v16i32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn %xmm7, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: pslld $23, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm5, %xmm0
+; SSE2-NEXT: cvttps2dq %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm10, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pslld $23, %xmm7
+; SSE2-NEXT: paddd %xmm5, %xmm7
+; SSE2-NEXT: cvttps2dq %xmm7, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm7, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: pslld $23, %xmm9
+; SSE2-NEXT: paddd %xmm5, %xmm9
+; SSE2-NEXT: cvttps2dq %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm7, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: pslld $23, %xmm8
+; SSE2-NEXT: paddd %xmm5, %xmm8
+; SSE2-NEXT: cvttps2dq %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shl_v16i32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm8
+; SSE42-NEXT: psrad $31, %xmm8
+; SSE42-NEXT: pandn %xmm7, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pslld $23, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216]
+; SSE42-NEXT: paddd %xmm9, %xmm6
+; SSE42-NEXT: cvttps2dq %xmm6, %xmm0
+; SSE42-NEXT: pmulld %xmm1, %xmm0
+; SSE42-NEXT: pslld $23, %xmm7
+; SSE42-NEXT: paddd %xmm9, %xmm7
+; SSE42-NEXT: cvttps2dq %xmm7, %xmm1
+; SSE42-NEXT: pmulld %xmm2, %xmm1
+; SSE42-NEXT: pslld $23, %xmm8
+; SSE42-NEXT: paddd %xmm9, %xmm8
+; SSE42-NEXT: cvttps2dq %xmm8, %xmm2
+; SSE42-NEXT: pmulld %xmm3, %xmm2
+; SSE42-NEXT: pslld $23, %xmm5
+; SSE42-NEXT: paddd %xmm9, %xmm5
+; SSE42-NEXT: cvttps2dq %xmm5, %xmm3
+; SSE42-NEXT: pmulld %xmm4, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: shl_v16i32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1307,6 +3400,112 @@ define <16 x i32> @shl_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i3
; negative test - shl is not commutative; there is no identity constant for operand 0
define <16 x i32> @shl_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: shl_v16i32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm9
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn %xmm7, %xmm9
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: pslld $23, %xmm1
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pslld $23, %xmm2
+; SSE2-NEXT: paddd %xmm5, %xmm2
+; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: pslld $23, %xmm3
+; SSE2-NEXT: paddd %xmm5, %xmm3
+; SSE2-NEXT: cvttps2dq %xmm3, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm6, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: pslld $23, %xmm4
+; SSE2-NEXT: paddd %xmm5, %xmm4
+; SSE2-NEXT: cvttps2dq %xmm4, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm4, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shl_v16i32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm8
+; SSE42-NEXT: psrad $31, %xmm8
+; SSE42-NEXT: pandn %xmm7, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pslld $23, %xmm1
+; SSE42-NEXT: movdqa {{.*#+}} xmm9 = [1065353216,1065353216,1065353216,1065353216]
+; SSE42-NEXT: paddd %xmm9, %xmm1
+; SSE42-NEXT: cvttps2dq %xmm1, %xmm0
+; SSE42-NEXT: pmulld %xmm6, %xmm0
+; SSE42-NEXT: pslld $23, %xmm2
+; SSE42-NEXT: paddd %xmm9, %xmm2
+; SSE42-NEXT: cvttps2dq %xmm2, %xmm1
+; SSE42-NEXT: pmulld %xmm7, %xmm1
+; SSE42-NEXT: pslld $23, %xmm3
+; SSE42-NEXT: paddd %xmm9, %xmm3
+; SSE42-NEXT: cvttps2dq %xmm3, %xmm2
+; SSE42-NEXT: pmulld %xmm8, %xmm2
+; SSE42-NEXT: pslld $23, %xmm4
+; SSE42-NEXT: paddd %xmm9, %xmm4
+; SSE42-NEXT: cvttps2dq %xmm4, %xmm3
+; SSE42-NEXT: pmulld %xmm5, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: shl_v16i32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1337,6 +3536,66 @@ define <16 x i32> @shl_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x,
}
define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: shl_v8i32_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pslld $23, %xmm5
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; SSE2-NEXT: paddd %xmm2, %xmm5
+; SSE2-NEXT: cvttps2dq %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT: pslld $23, %xmm4
+; SSE2-NEXT: paddd %xmm2, %xmm4
+; SSE2-NEXT: cvttps2dq %xmm4, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shl_v8i32_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [16,32,64,128]
+; SSE42-NEXT: movdqa %xmm4, %xmm6
+; SSE42-NEXT: pand %xmm5, %xmm6
+; SSE42-NEXT: pcmpeqd %xmm5, %xmm6
+; SSE42-NEXT: pand %xmm3, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE42-NEXT: pand %xmm3, %xmm4
+; SSE42-NEXT: pcmpeqd %xmm3, %xmm4
+; SSE42-NEXT: pand %xmm2, %xmm4
+; SSE42-NEXT: pslld $23, %xmm4
+; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
+; SSE42-NEXT: paddd %xmm2, %xmm4
+; SSE42-NEXT: cvttps2dq %xmm4, %xmm3
+; SSE42-NEXT: pmulld %xmm3, %xmm0
+; SSE42-NEXT: pslld $23, %xmm6
+; SSE42-NEXT: paddd %xmm2, %xmm6
+; SSE42-NEXT: cvttps2dq %xmm6, %xmm2
+; SSE42-NEXT: pmulld %xmm2, %xmm1
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: shl_v8i32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -1368,6 +3627,104 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
}
define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
+; SSE2-LABEL: shl_v8i64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psllq %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE2-NEXT: psllq %xmm5, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psllq %xmm10, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE2-NEXT: psllq %xmm5, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psllq %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; SSE2-NEXT: psllq %xmm5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psllq %xmm8, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE2-NEXT: psllq %xmm5, %xmm3
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: shl_v8i64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE42-NEXT: movdqa %xmm9, %xmm8
+; SSE42-NEXT: pand %xmm10, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm8
+; SSE42-NEXT: pand %xmm7, %xmm8
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE42-NEXT: movdqa %xmm9, %xmm7
+; SSE42-NEXT: pand %xmm10, %xmm7
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm7
+; SSE42-NEXT: pand %xmm6, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm9, %xmm10
+; SSE42-NEXT: pand %xmm6, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm10
+; SSE42-NEXT: pand %xmm5, %xmm10
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE42-NEXT: pand %xmm5, %xmm9
+; SSE42-NEXT: pcmpeqq %xmm5, %xmm9
+; SSE42-NEXT: pand %xmm4, %xmm9
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: psllq %xmm9, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE42-NEXT: psllq %xmm5, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm4
+; SSE42-NEXT: psllq %xmm10, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE42-NEXT: psllq %xmm5, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm4
+; SSE42-NEXT: psllq %xmm7, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; SSE42-NEXT: psllq %xmm5, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm4
+; SSE42-NEXT: psllq %xmm8, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE42-NEXT: psllq %xmm5, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: shl_v8i64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -1396,6 +3753,51 @@ define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
}
define <4 x i32> @lshr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) {
+; SSE2-LABEL: lshr_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrld %xmm2, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrld %xmm4, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrld %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm0, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: lshr_v4i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm2, %xmm0
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm3
+; SSE42-NEXT: psrld %xmm2, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm5
+; SSE42-NEXT: psrld %xmm4, %xmm5
+; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: psrld %xmm3, %xmm0
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm2, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: lshr_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -1429,6 +3831,93 @@ define <4 x i32> @lshr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> nounde
; negative test - lshr is not commutative; there is no identity constant for operand 0
define <8 x i32> @lshr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: lshr_v8i32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrld %xmm4, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrld %xmm3, %xmm4
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrld %xmm3, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm3, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrld %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrld %xmm3, %xmm4
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrld %xmm3, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm2, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[0,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: lshr_v8i32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm3, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm6
+; SSE42-NEXT: psrld %xmm4, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm7
+; SSE42-NEXT: psrld %xmm4, %xmm7
+; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: psrld %xmm3, %xmm4
+; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm1, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm5, %xmm4
+; SSE42-NEXT: psrld %xmm3, %xmm4
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm5, %xmm6
+; SSE42-NEXT: psrld %xmm3, %xmm6
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm5, %xmm1
+; SSE42-NEXT: psrld %xmm3, %xmm1
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm2, %xmm5
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: lshr_v8i32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1462,6 +3951,175 @@ define <8 x i32> @lshr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32
}
define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: lshr_v16i32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm10, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: psrad $31, %xmm10
+; SSE2-NEXT: pandn %xmm7, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn %xmm5, %xmm8
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrld %xmm0, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrld %xmm6, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrld %xmm6, %xmm8
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrld %xmm1, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrld %xmm6, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm2
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrld %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: psrld %xmm6, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm3
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psrld %xmm3, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psrld %xmm6, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm4
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: lshr_v16i32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm8
+; SSE42-NEXT: psrad $31, %xmm8
+; SSE42-NEXT: pandn %xmm7, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm9
+; SSE42-NEXT: psrld %xmm0, %xmm9
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm11
+; SSE42-NEXT: psrld %xmm0, %xmm11
+; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: psrld %xmm6, %xmm0
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm6, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm6
+; SSE42-NEXT: psrld %xmm1, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm10
+; SSE42-NEXT: psrld %xmm1, %xmm10
+; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm1
+; SSE42-NEXT: psrld %xmm6, %xmm1
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm6, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm6
+; SSE42-NEXT: psrld %xmm2, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm9
+; SSE42-NEXT: psrld %xmm2, %xmm9
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm2
+; SSE42-NEXT: psrld %xmm6, %xmm2
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm6, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm4, %xmm6
+; SSE42-NEXT: psrld %xmm3, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm4, %xmm8
+; SSE42-NEXT: psrld %xmm3, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm4, %xmm3
+; SSE42-NEXT: psrld %xmm5, %xmm3
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm5, %xmm4
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: lshr_v16i32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1494,6 +4152,181 @@ define <16 x i32> @lshr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i
; negative test - lshr is not commutative; there is no identity constant for operand 0
define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: lshr_v16i32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrld %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrld %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrld %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrld %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrld %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrld %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm5, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psrld %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrld %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psrld %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm4, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: lshr_v16i32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm3, %xmm10
+; SSE42-NEXT: movdqa %xmm2, %xmm9
+; SSE42-NEXT: movdqa %xmm1, %xmm8
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm2
+; SSE42-NEXT: psrad $31, %xmm2
+; SSE42-NEXT: pandn %xmm7, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm1
+; SSE42-NEXT: psrad $31, %xmm1
+; SSE42-NEXT: pandn %xmm6, %xmm1
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pandn %xmm5, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm3
+; SSE42-NEXT: psrad $31, %xmm3
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm7
+; SSE42-NEXT: psrld %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm11
+; SSE42-NEXT: psrld %xmm6, %xmm11
+; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm6
+; SSE42-NEXT: psrld %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm5, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm7
+; SSE42-NEXT: psrld %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm8
+; SSE42-NEXT: psrld %xmm6, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm6
+; SSE42-NEXT: psrld %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm5, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm7
+; SSE42-NEXT: psrld %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm8
+; SSE42-NEXT: psrld %xmm6, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm6
+; SSE42-NEXT: psrld %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm5, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm7
+; SSE42-NEXT: psrld %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm8
+; SSE42-NEXT: psrld %xmm6, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm6
+; SSE42-NEXT: psrld %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm4, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: lshr_v16i32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1524,6 +4357,98 @@ define <16 x i32> @lshr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x,
}
define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: lshr_v8i32_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrld %xmm2, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrld %xmm6, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrld %xmm5, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm3, %xmm0
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrld %xmm0, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrld %xmm0, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrld %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrld %xmm0, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: lshr_v8i32_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE42-NEXT: movdqa %xmm5, %xmm4
+; SSE42-NEXT: pand %xmm6, %xmm4
+; SSE42-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE42-NEXT: pand %xmm3, %xmm4
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE42-NEXT: pand %xmm3, %xmm5
+; SSE42-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE42-NEXT: pand %xmm2, %xmm5
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: psrld %xmm2, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm7
+; SSE42-NEXT: psrld %xmm6, %xmm7
+; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: psrld %xmm3, %xmm5
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm2, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm3
+; SSE42-NEXT: psrld %xmm2, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm6
+; SSE42-NEXT: psrld %xmm5, %xmm6
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm4
+; SSE42-NEXT: psrld %xmm3, %xmm4
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrld %xmm2, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: lshr_v8i32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -1555,6 +4480,104 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
}
define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
+; SSE2-LABEL: lshr_v8i64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm10[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm6, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrlq %xmm9, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm5, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrlq %xmm10, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm5, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
+; SSE2-NEXT: movdqa %xmm2, %xmm4
+; SSE2-NEXT: psrlq %xmm7, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm5, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: psrlq %xmm8, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm5, %xmm3
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: lshr_v8i64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE42-NEXT: movdqa %xmm9, %xmm8
+; SSE42-NEXT: pand %xmm10, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm8
+; SSE42-NEXT: pand %xmm7, %xmm8
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE42-NEXT: movdqa %xmm9, %xmm7
+; SSE42-NEXT: pand %xmm10, %xmm7
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm7
+; SSE42-NEXT: pand %xmm6, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [4,8]
+; SSE42-NEXT: movdqa %xmm9, %xmm10
+; SSE42-NEXT: pand %xmm6, %xmm10
+; SSE42-NEXT: pcmpeqq %xmm6, %xmm10
+; SSE42-NEXT: pand %xmm5, %xmm10
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE42-NEXT: pand %xmm5, %xmm9
+; SSE42-NEXT: pcmpeqq %xmm5, %xmm9
+; SSE42-NEXT: pand %xmm4, %xmm9
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: psrlq %xmm9, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE42-NEXT: psrlq %xmm5, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm4
+; SSE42-NEXT: psrlq %xmm10, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE42-NEXT: psrlq %xmm5, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm4
+; SSE42-NEXT: psrlq %xmm7, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; SSE42-NEXT: psrlq %xmm5, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm4
+; SSE42-NEXT: psrlq %xmm8, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE42-NEXT: psrlq %xmm5, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: lshr_v8i64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -1583,6 +4606,51 @@ define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
}
define <4 x i32> @ashr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> noundef %y) {
+; SSE2-LABEL: ashr_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrad %xmm2, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: psrad %xmm4, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad %xmm3, %xmm4
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm0, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: ashr_v4i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm2, %xmm0
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm3
+; SSE42-NEXT: psrad %xmm2, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm5
+; SSE42-NEXT: psrad %xmm4, %xmm5
+; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: psrad %xmm3, %xmm0
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm2, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: ashr_v4i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
@@ -1616,6 +4684,93 @@ define <4 x i32> @ashr_v4i32(<4 x i1> %b, <4 x i32> noundef %x, <4 x i32> nounde
; negative test - ashr is not commutative; there is no identity constant for operand 0
define <8 x i32> @ashr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: ashr_v8i32_commute:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pand %xmm4, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrad %xmm4, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: psrad %xmm3, %xmm4
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm6[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrad %xmm3, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm3, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrad %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm4
+; SSE2-NEXT: psrad %xmm3, %xmm4
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm5[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrad %xmm3, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm2, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm4[0,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: ashr_v8i32_commute:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE42-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pand %xmm4, %xmm5
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pand %xmm3, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm6
+; SSE42-NEXT: psrad %xmm4, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm7
+; SSE42-NEXT: psrad %xmm4, %xmm7
+; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm4
+; SSE42-NEXT: psrad %xmm3, %xmm4
+; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm1, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm5, %xmm4
+; SSE42-NEXT: psrad %xmm3, %xmm4
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm5, %xmm6
+; SSE42-NEXT: psrad %xmm3, %xmm6
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm5, %xmm1
+; SSE42-NEXT: psrad %xmm3, %xmm1
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm2, %xmm5
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: ashr_v8i32_commute:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
@@ -1649,6 +4804,175 @@ define <8 x i32> @ashr_v8i32_commute(<8 x i1> %b, <8 x i32> noundef %x, <8 x i32
}
define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: ashr_v16i32_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm8
+; SSE2-NEXT: movdqa %xmm0, %xmm10
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm10, %xmm9
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm9
+; SSE2-NEXT: psrad $31, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm9
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm10
+; SSE2-NEXT: psrad $31, %xmm10
+; SSE2-NEXT: pandn %xmm7, %xmm10
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm8, %xmm7
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pandn %xmm6, %xmm7
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pandn %xmm5, %xmm8
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrad %xmm0, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: psrad %xmm6, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm8
+; SSE2-NEXT: psrad %xmm6, %xmm8
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm5, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm8[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrad %xmm1, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm1
+; SSE2-NEXT: psrad %xmm6, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrad %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm5, %xmm2
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrad %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm2
+; SSE2-NEXT: psrad %xmm6, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psrad %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm5, %xmm3
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm9[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psrad %xmm3, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm4, %xmm3
+; SSE2-NEXT: psrad %xmm6, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: psrad %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm5, %xmm4
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm7[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: ashr_v16i32_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm8
+; SSE42-NEXT: psrad $31, %xmm8
+; SSE42-NEXT: pandn %xmm7, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm7
+; SSE42-NEXT: psrad $31, %xmm7
+; SSE42-NEXT: pandn %xmm6, %xmm7
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm6
+; SSE42-NEXT: psrad $31, %xmm6
+; SSE42-NEXT: pandn %xmm5, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm5
+; SSE42-NEXT: psrad $31, %xmm5
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm9
+; SSE42-NEXT: psrad %xmm0, %xmm9
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm11
+; SSE42-NEXT: psrad %xmm0, %xmm11
+; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm9[0,1,2,3],xmm11[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: psrad %xmm6, %xmm0
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm6, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm6
+; SSE42-NEXT: psrad %xmm1, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm7[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm10
+; SSE42-NEXT: psrad %xmm1, %xmm10
+; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm6[0,1,2,3],xmm10[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm1
+; SSE42-NEXT: psrad %xmm6, %xmm1
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm6, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3],xmm1[4,5],xmm10[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm6
+; SSE42-NEXT: psrad %xmm2, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm9
+; SSE42-NEXT: psrad %xmm2, %xmm9
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm6[0,1,2,3],xmm9[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm2
+; SSE42-NEXT: psrad %xmm6, %xmm2
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm6, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5],xmm9[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm4, %xmm6
+; SSE42-NEXT: psrad %xmm3, %xmm6
+; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm4, %xmm8
+; SSE42-NEXT: psrad %xmm3, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm4, %xmm3
+; SSE42-NEXT: psrad %xmm5, %xmm3
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm5, %xmm4
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: ashr_v16i32_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1681,6 +5005,181 @@ define <16 x i32> @ashr_v16i32_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i
; negative test - ashr is not commutative; there is no identity constant for operand 0
define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x, <16 x i32> noundef %y) {
+; SSE2-LABEL: ashr_v16i32_commute_swap:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: movdqa %xmm2, %xmm9
+; SSE2-NEXT: movdqa %xmm1, %xmm10
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; SSE2-NEXT: movdqa %xmm2, %xmm3
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm3
+; SSE2-NEXT: psrad $31, %xmm3
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm2
+; SSE2-NEXT: psrad $31, %xmm2
+; SSE2-NEXT: pandn %xmm7, %xmm2
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: pslld $31, %xmm1
+; SSE2-NEXT: psrad $31, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: pslld $31, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrad %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrad %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm7
+; SSE2-NEXT: psrad %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm5, %xmm0
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrad %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: psrad %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm7
+; SSE2-NEXT: psrad %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm5, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrad %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm6
+; SSE2-NEXT: psrad %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm2, %xmm7
+; SSE2-NEXT: psrad %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm5, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm6[0,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psrad %xmm6, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm6
+; SSE2-NEXT: psrad %xmm5, %xmm6
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm7[1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm3, %xmm7
+; SSE2-NEXT: psrad %xmm5, %xmm7
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm4, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm6[0,3]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: ashr_v16i32_commute_swap:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm3, %xmm10
+; SSE42-NEXT: movdqa %xmm2, %xmm9
+; SSE42-NEXT: movdqa %xmm1, %xmm8
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm2
+; SSE42-NEXT: psrad $31, %xmm2
+; SSE42-NEXT: pandn %xmm7, %xmm2
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm1
+; SSE42-NEXT: psrad $31, %xmm1
+; SSE42-NEXT: pandn %xmm6, %xmm1
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm0
+; SSE42-NEXT: psrad $31, %xmm0
+; SSE42-NEXT: pandn %xmm5, %xmm0
+; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSE42-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE42-NEXT: pslld $31, %xmm3
+; SSE42-NEXT: psrad $31, %xmm3
+; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm7
+; SSE42-NEXT: psrad %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm8[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm11
+; SSE42-NEXT: psrad %xmm6, %xmm11
+; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm6
+; SSE42-NEXT: psrad %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm5, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5],xmm11[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm9[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm7
+; SSE42-NEXT: psrad %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm8
+; SSE42-NEXT: psrad %xmm6, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm6
+; SSE42-NEXT: psrad %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm5, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm10[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm7
+; SSE42-NEXT: psrad %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm8
+; SSE42-NEXT: psrad %xmm6, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm6
+; SSE42-NEXT: psrad %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm10[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm5, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm7
+; SSE42-NEXT: psrad %xmm6, %xmm7
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm8
+; SSE42-NEXT: psrad %xmm6, %xmm8
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm6
+; SSE42-NEXT: psrad %xmm5, %xmm6
+; SSE42-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm4, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3],xmm3[4,5],xmm8[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: ashr_v16i32_commute_swap:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
@@ -1711,6 +5210,98 @@ define <16 x i32> @ashr_v16i32_commute_swap(<16 x i1> %b, <16 x i32> noundef %x,
}
define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef %x, <8 x i32> noundef %y) {
+; SSE2-LABEL: ashr_v8i32_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
+; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE2-NEXT: movdqa %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm6, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE2-NEXT: pand %xmm3, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE2-NEXT: pand %xmm2, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: psrad %xmm2, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: psrad %xmm6, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm0, %xmm6
+; SSE2-NEXT: psrad %xmm5, %xmm6
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm3, %xmm0
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm6[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrad %xmm0, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: psrad %xmm0, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,3,3,3,4,5,6,7]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrad %xmm4, %xmm5
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; SSE2-NEXT: psrad %xmm0, %xmm1
+; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm5[1]
+; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: ashr_v8i32_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm4
+; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
+; SSE42-NEXT: movdqa {{.*#+}} xmm6 = [16,32,64,128]
+; SSE42-NEXT: movdqa %xmm5, %xmm4
+; SSE42-NEXT: pand %xmm6, %xmm4
+; SSE42-NEXT: pcmpeqd %xmm6, %xmm4
+; SSE42-NEXT: pand %xmm3, %xmm4
+; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8]
+; SSE42-NEXT: pand %xmm3, %xmm5
+; SSE42-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE42-NEXT: pand %xmm2, %xmm5
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm3
+; SSE42-NEXT: psrad %xmm2, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm7
+; SSE42-NEXT: psrad %xmm6, %xmm7
+; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm3[0,1,2,3],xmm7[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: psrad %xmm3, %xmm5
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm2, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm3
+; SSE42-NEXT: psrad %xmm2, %xmm3
+; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,3,3,3,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm6
+; SSE42-NEXT: psrad %xmm5, %xmm6
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm3[0,1,2,3],xmm6[4,5,6,7]
+; SSE42-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm4
+; SSE42-NEXT: psrad %xmm3, %xmm4
+; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
+; SSE42-NEXT: psrad %xmm2, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: ashr_v8i32_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm2
@@ -1742,6 +5333,160 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
}
define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
+; SSE2-LABEL: ashr_v8i64_cast_cond:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movd %edi, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE2-NEXT: movdqa %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm8[1,0,3,2]
+; SSE2-NEXT: pand %xmm7, %xmm8
+; SSE2-NEXT: pand %xmm10, %xmm8
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE2-NEXT: movdqa %xmm9, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[1,0,3,2]
+; SSE2-NEXT: pand %xmm6, %xmm7
+; SSE2-NEXT: pand %xmm10, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [4,8]
+; SSE2-NEXT: movdqa %xmm9, %xmm6
+; SSE2-NEXT: pand %xmm10, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm10, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,0,3,2]
+; SSE2-NEXT: pand %xmm5, %xmm6
+; SSE2-NEXT: pand %xmm10, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,0,3,2]
+; SSE2-NEXT: pand %xmm4, %xmm9
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psrlq %xmm9, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm4, %xmm11
+; SSE2-NEXT: psrlq %xmm10, %xmm11
+; SSE2-NEXT: movsd {{.*#+}} xmm11 = xmm5[0],xmm11[1]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: psrlq %xmm9, %xmm5
+; SSE2-NEXT: psrlq %xmm10, %xmm0
+; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
+; SSE2-NEXT: xorpd %xmm11, %xmm0
+; SSE2-NEXT: psubq %xmm11, %xmm0
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psrlq %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm4, %xmm10
+; SSE2-NEXT: psrlq %xmm9, %xmm10
+; SSE2-NEXT: movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: psrlq %xmm6, %xmm5
+; SSE2-NEXT: psrlq %xmm9, %xmm1
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
+; SSE2-NEXT: xorpd %xmm10, %xmm1
+; SSE2-NEXT: psubq %xmm10, %xmm1
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psrlq %xmm7, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3]
+; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: psrlq %xmm6, %xmm9
+; SSE2-NEXT: movsd {{.*#+}} xmm9 = xmm5[0],xmm9[1]
+; SSE2-NEXT: movdqa %xmm2, %xmm5
+; SSE2-NEXT: psrlq %xmm7, %xmm5
+; SSE2-NEXT: psrlq %xmm6, %xmm2
+; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
+; SSE2-NEXT: xorpd %xmm9, %xmm2
+; SSE2-NEXT: psubq %xmm9, %xmm2
+; SSE2-NEXT: movdqa %xmm4, %xmm5
+; SSE2-NEXT: psrlq %xmm8, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3]
+; SSE2-NEXT: psrlq %xmm6, %xmm4
+; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
+; SSE2-NEXT: movdqa %xmm3, %xmm5
+; SSE2-NEXT: psrlq %xmm8, %xmm5
+; SSE2-NEXT: psrlq %xmm6, %xmm3
+; SSE2-NEXT: movsd {{.*#+}} xmm3 = xmm5[0],xmm3[1]
+; SSE2-NEXT: xorpd %xmm4, %xmm3
+; SSE2-NEXT: psubq %xmm4, %xmm3
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: ashr_v8i64_cast_cond:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movd %edi, %xmm8
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,1,0,1]
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [64,128]
+; SSE42-NEXT: movdqa %xmm9, %xmm8
+; SSE42-NEXT: pand %xmm10, %xmm8
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm8
+; SSE42-NEXT: pand %xmm7, %xmm8
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [16,32]
+; SSE42-NEXT: movdqa %xmm9, %xmm7
+; SSE42-NEXT: pand %xmm10, %xmm7
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm7
+; SSE42-NEXT: pand %xmm6, %xmm7
+; SSE42-NEXT: movdqa {{.*#+}} xmm10 = [4,8]
+; SSE42-NEXT: movdqa %xmm9, %xmm6
+; SSE42-NEXT: pand %xmm10, %xmm6
+; SSE42-NEXT: pcmpeqq %xmm10, %xmm6
+; SSE42-NEXT: pand %xmm5, %xmm6
+; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [1,2]
+; SSE42-NEXT: pand %xmm5, %xmm9
+; SSE42-NEXT: pcmpeqq %xmm5, %xmm9
+; SSE42-NEXT: pand %xmm4, %xmm9
+; SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: movdqa %xmm4, %xmm5
+; SSE42-NEXT: psrlq %xmm9, %xmm5
+; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; SSE42-NEXT: movdqa %xmm4, %xmm11
+; SSE42-NEXT: psrlq %xmm10, %xmm11
+; SSE42-NEXT: pblendw {{.*#+}} xmm11 = xmm5[0,1,2,3],xmm11[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm0, %xmm5
+; SSE42-NEXT: psrlq %xmm9, %xmm5
+; SSE42-NEXT: psrlq %xmm10, %xmm0
+; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
+; SSE42-NEXT: pxor %xmm11, %xmm0
+; SSE42-NEXT: psubq %xmm11, %xmm0
+; SSE42-NEXT: movdqa %xmm4, %xmm5
+; SSE42-NEXT: psrlq %xmm6, %xmm5
+; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm6[2,3,2,3]
+; SSE42-NEXT: movdqa %xmm4, %xmm10
+; SSE42-NEXT: psrlq %xmm9, %xmm10
+; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm5[0,1,2,3],xmm10[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm1, %xmm5
+; SSE42-NEXT: psrlq %xmm6, %xmm5
+; SSE42-NEXT: psrlq %xmm9, %xmm1
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
+; SSE42-NEXT: pxor %xmm10, %xmm1
+; SSE42-NEXT: psubq %xmm10, %xmm1
+; SSE42-NEXT: movdqa %xmm4, %xmm5
+; SSE42-NEXT: psrlq %xmm7, %xmm5
+; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3]
+; SSE42-NEXT: movdqa %xmm4, %xmm9
+; SSE42-NEXT: psrlq %xmm6, %xmm9
+; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm5[0,1,2,3],xmm9[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm2, %xmm5
+; SSE42-NEXT: psrlq %xmm7, %xmm5
+; SSE42-NEXT: psrlq %xmm6, %xmm2
+; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
+; SSE42-NEXT: pxor %xmm9, %xmm2
+; SSE42-NEXT: psubq %xmm9, %xmm2
+; SSE42-NEXT: movdqa %xmm4, %xmm5
+; SSE42-NEXT: psrlq %xmm8, %xmm5
+; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3]
+; SSE42-NEXT: psrlq %xmm6, %xmm4
+; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
+; SSE42-NEXT: movdqa %xmm3, %xmm5
+; SSE42-NEXT: psrlq %xmm8, %xmm5
+; SSE42-NEXT: psrlq %xmm6, %xmm3
+; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; SSE42-NEXT: pxor %xmm4, %xmm3
+; SSE42-NEXT: psubq %xmm4, %xmm3
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: ashr_v8i64_cast_cond:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd %edi, %xmm4
@@ -1777,6 +5522,164 @@ define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef
}
define <8 x i64> @select_sdiv_neutral_constant_v8i64(<8 x i1> %b, <8 x i64> %x, <8 x i64> %y) {
+; SSE2-LABEL: select_sdiv_neutral_constant_v8i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: psllq $63, %xmm8
+; SSE2-NEXT: psrad $31, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm8, %xmm10
+; SSE2-NEXT: pandn %xmm7, %xmm10
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [1,1]
+; SSE2-NEXT: pand %xmm9, %xmm8
+; SSE2-NEXT: por %xmm10, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,1,1]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: psllq $63, %xmm7
+; SSE2-NEXT: psrad $31, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm7, %xmm10
+; SSE2-NEXT: pandn %xmm6, %xmm10
+; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: por %xmm10, %xmm7
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: psllq $63, %xmm6
+; SSE2-NEXT: psrad $31, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT: movdqa %xmm6, %xmm10
+; SSE2-NEXT: pandn %xmm5, %xmm10
+; SSE2-NEXT: pand %xmm9, %xmm6
+; SSE2-NEXT: por %xmm10, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; SSE2-NEXT: psllq $63, %xmm0
+; SSE2-NEXT: psrad $31, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
+; SSE2-NEXT: pand %xmm5, %xmm9
+; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: por %xmm9, %xmm5
+; SSE2-NEXT: movq %xmm6, %rcx
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; SSE2-NEXT: movq %xmm6, %rcx
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movq %xmm7, %rcx
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[2,3,2,3]
+; SSE2-NEXT: movq %xmm6, %rcx
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm2
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movq %xmm8, %rcx
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm8[2,3,2,3]
+; SSE2-NEXT: movq %xmm6, %rcx
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm3
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE2-NEXT: movq %xmm5, %rcx
+; SSE2-NEXT: movq %xmm4, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
+; SSE2-NEXT: movq %xmm5, %rcx
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; SSE2-NEXT: movq %xmm4, %rax
+; SSE2-NEXT: cqto
+; SSE2-NEXT: idivq %rcx
+; SSE2-NEXT: movq %rax, %xmm4
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: select_sdiv_neutral_constant_v8i64:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movdqa %xmm0, %xmm8
+; SSE42-NEXT: movapd {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: movapd {{.*#+}} xmm10 = [1,1]
+; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm9
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm7
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm6
+; SSE42-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; SSE42-NEXT: psllq $63, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm10, %xmm5
+; SSE42-NEXT: pextrq $1, %xmm5, %rcx
+; SSE42-NEXT: pextrq $1, %xmm1, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm8
+; SSE42-NEXT: movq %xmm5, %rcx
+; SSE42-NEXT: movq %xmm1, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm0
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm8[0]
+; SSE42-NEXT: pextrq $1, %xmm6, %rcx
+; SSE42-NEXT: pextrq $1, %xmm2, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm5
+; SSE42-NEXT: movq %xmm6, %rcx
+; SSE42-NEXT: movq %xmm2, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm1
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE42-NEXT: pextrq $1, %xmm7, %rcx
+; SSE42-NEXT: pextrq $1, %xmm3, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm5
+; SSE42-NEXT: movq %xmm7, %rcx
+; SSE42-NEXT: movq %xmm3, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm2
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
+; SSE42-NEXT: pextrq $1, %xmm9, %rcx
+; SSE42-NEXT: pextrq $1, %xmm4, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm5
+; SSE42-NEXT: movq %xmm9, %rcx
+; SSE42-NEXT: movq %xmm4, %rax
+; SSE42-NEXT: cqto
+; SSE42-NEXT: idivq %rcx
+; SSE42-NEXT: movq %rax, %xmm3
+; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE42-NEXT: retq
+;
; AVX2-LABEL: select_sdiv_neutral_constant_v8i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero