[llvm] 52855ed - [X86] Add back support for matching VPTERNLOG from back to back logic ops.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 2 22:12:41 PDT 2020
Author: Craig Topper
Date: 2020-07-02T22:11:52-07:00
New Revision: 52855ed099fa8caed908584675c892c27445c1dd
URL: https://github.com/llvm/llvm-project/commit/52855ed099fa8caed908584675c892c27445c1dd
DIFF: https://github.com/llvm/llvm-project/commit/52855ed099fa8caed908584675c892c27445c1dd.diff
LOG: [X86] Add back support for matching VPTERNLOG from back to back logic ops.
I think this mostly looks OK. The only weird thing I noticed was that
a couple of rotate vXi8 tests picked up an extra logic op where we have
(and (or (and), (andn)), X). Previously we matched the (or (and), (andn))
to vpternlog, but now we match the (and (or), X) and leave the and/andn
unmatched.
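
(Not part of the commit; a minimal sketch for readers of the diff below.) The imm8
values in the switch inside tryVPTERNLOG follow the usual VPTERNLOG truth-table
encoding: the three sources contribute the constant bit patterns 0xF0, 0xCC and
0xAA, and the immediate is simply the two selected logic ops applied to those
patterns as (Opc1 A, (Opc2 B, C)). A standalone C++ sketch (the helper name
ternlogImm is made up for illustration):

#include <cstdint>
#include <cstdio>

// Sketch only: reproduces the imm8 table used by tryVPTERNLOG below.
// VPTERNLOG evaluates imm8 as a truth table over its three sources,
// which contribute the bit patterns A=0xF0, B=0xCC, C=0xAA.
static uint8_t ternlogImm(char Opc1, char Opc2) {
  const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;
  auto Apply = [](char Op, uint8_t X, uint8_t Y) -> uint8_t {
    switch (Op) {
    case '&': return X & Y;
    case '|': return X | Y;
    default:  return X ^ Y;
    }
  };
  // Matches (Opc1 A, (Opc2 B, C)).
  return Apply(Opc1, A, Apply(Opc2, B, C));
}

int main() {
  // e.g. (or A, (and B, C)) -> 0xf8 and (xor A, (xor B, C)) -> 0x96,
  // matching the entries in the switch below.
  printf("0x%02x 0x%02x\n", (unsigned)ternlogImm('|', '&'),
         (unsigned)ternlogImm('^', '^'));
  return 0;
}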
Added:
Modified:
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/test/CodeGen/X86/avx512-cvt.ll
llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
llvm/test/CodeGen/X86/avx512-mask-op.ll
llvm/test/CodeGen/X86/fp-round.ll
llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
llvm/test/CodeGen/X86/min-legal-vector-width.ll
llvm/test/CodeGen/X86/sadd_sat_vec.ll
llvm/test/CodeGen/X86/ssub_sat_vec.ll
llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
llvm/test/CodeGen/X86/vector-fshl-256.ll
llvm/test/CodeGen/X86/vector-fshl-512.ll
llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
llvm/test/CodeGen/X86/vector-fshr-256.ll
llvm/test/CodeGen/X86/vector-fshr-512.ll
llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
llvm/test/CodeGen/X86/vector-rotate-128.ll
llvm/test/CodeGen/X86/vector-rotate-256.ll
llvm/test/CodeGen/X86/vector-rotate-512.ll
llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 410547390c91..409da74bb74f 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -503,6 +503,7 @@ namespace {
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
bool tryShrinkShlLogicImm(SDNode *N);
+ bool tryVPTERNLOG(SDNode *N);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
bool tryMatchBitSelect(SDNode *N);
@@ -3929,6 +3930,82 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
return true;
}
+// Try to match two logic ops to a VPTERNLOG.
+// FIXME: Handle inverted inputs?
+// FIXME: Handle more complex patterns that use an operand more than once?
+// FIXME: Support X86ISD::ANDNP
+bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
+ MVT NVT = N->getSimpleValueType(0);
+
+ // Make sure we support VPTERNLOG.
+ if (!NVT.isVector() || !Subtarget->hasAVX512() ||
+ NVT.getVectorElementType() == MVT::i1)
+ return false;
+
+ // We need VLX for 128/256-bit.
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
+ return false;
+
+ unsigned Opc1 = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ auto isLogicOp = [](unsigned Opc) {
+ return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR;
+ };
+
+ SDValue A, B, C;
+ unsigned Opc2;
+ if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) {
+ Opc2 = N1.getOpcode();
+ A = N0;
+ B = N1.getOperand(0);
+ C = N1.getOperand(1);
+ } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
+ Opc2 = N0.getOpcode();
+ A = N1;
+ B = N0.getOperand(0);
+ C = N0.getOperand(1);
+ } else
+ return false;
+
+ uint64_t Imm;
+ switch (Opc1) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0x80; break;
+ case ISD::OR: Imm = 0xe0; break;
+ case ISD::XOR: Imm = 0x60; break;
+ }
+ break;
+ case ISD::OR:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0xf8; break;
+ case ISD::OR: Imm = 0xfe; break;
+ case ISD::XOR: Imm = 0xf6; break;
+ }
+ break;
+ case ISD::XOR:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0x78; break;
+ case ISD::OR: Imm = 0x1e; break;
+ case ISD::XOR: Imm = 0x96; break;
+ }
+ break;
+ }
+
+ SDLoc DL(N);
+ SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C,
+ CurDAG->getTargetConstant(Imm, DL, MVT::i8));
+ ReplaceNode(N, New.getNode());
+ SelectCode(New.getNode());
+ return true;
+}
+
/// If the high bits of an 'and' operand are known zero, try setting the
/// high bits of an 'and' constant operand to produce a smaller encoding by
/// creating a small, sign-extended negative immediate rather than a large
@@ -4432,9 +4509,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case ISD::XOR:
if (tryShrinkShlLogicImm(Node))
return;
-
if (Opcode == ISD::OR && tryMatchBitSelect(Node))
return;
+ if (tryVPTERNLOG(Node))
+ return;
LLVM_FALLTHROUGH;
case ISD::ADD:
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index 5362e2b262a4..e92528b9bbcc 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -321,8 +321,8 @@ define <4 x float> @ulto4f32(<4 x i64> %a) {
define <8 x double> @ulto8f64(<8 x i64> %a) {
; NODQ-LABEL: ulto8f64:
; NODQ: # %bb.0:
-; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; NODQ-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1
; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0
; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
@@ -346,21 +346,20 @@ define <16 x double> @ulto16f64(<16 x i64> %a) {
; NODQ-LABEL: ulto16f64:
; NODQ: # %bb.0:
; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
-; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3
+; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; NODQ-NEXT: vmovdqa64 %zmm3, %zmm4
+; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm0, %zmm4
; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0
; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0
; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0
-; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0
-; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2
-; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2
+; NODQ-NEXT: vaddpd %zmm0, %zmm4, %zmm0
+; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm3
; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1
; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1
; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1
-; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; NODQ-NEXT: vaddpd %zmm1, %zmm3, %zmm1
; NODQ-NEXT: retq
;
; VLDQ-LABEL: ulto16f64:
diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
index 241a6ea9995e..65fcc7091a7a 100644
--- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll
@@ -7,21 +7,21 @@ define <16 x i8> @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, <
; X86-LABEL: test_vgf2p8affineinvqb_128:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X86-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x03]
+; X86-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xd9,0x04]
; X86-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3]
-; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0]
+; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96]
+; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_vgf2p8affineinvqb_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03]
-; X64-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x04]
+; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x03]
+; X64-NEXT: vgf2p8affineinvqb $4, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xd9,0x04]
; X64-NEXT: vgf2p8affineinvqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x05]
-; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3]
-; X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0]
+; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96]
+; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
; X64-NEXT: retq # encoding: [0xc3]
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3)
@@ -39,21 +39,21 @@ define <32 x i8> @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, <
; X86-LABEL: test_vgf2p8affineinvqb_256:
; X86: # %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X86-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x03]
+; X86-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xd9,0x04]
; X86-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3]
-; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0]
+; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96]
+; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_vgf2p8affineinvqb_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03]
-; X64-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x04]
+; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x03]
+; X64-NEXT: vgf2p8affineinvqb $4, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xd9,0x04]
; X64-NEXT: vgf2p8affineinvqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x05]
-; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3]
-; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0]
+; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96]
+; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
; X64-NEXT: retq # encoding: [0xc3]
%1 = bitcast i32 %mask to <32 x i1>
%2 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3)
@@ -71,21 +71,21 @@ define <64 x i8> @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, <
; X86-LABEL: test_vgf2p8affineinvqb_512:
; X86: # %bb.0:
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X86-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x03]
+; X86-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xd9,0x04]
; X86-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3]
-; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0]
+; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96]
+; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_vgf2p8affineinvqb_512:
; X64: # %bb.0:
; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03]
-; X64-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x04]
+; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x03]
+; X64-NEXT: vgf2p8affineinvqb $4, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xd9,0x04]
; X64-NEXT: vgf2p8affineinvqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x05]
-; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3]
-; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0]
+; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96]
+; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
; X64-NEXT: retq # encoding: [0xc3]
%1 = bitcast i64 %mask to <64 x i1>
%2 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3)
@@ -103,21 +103,21 @@ define <16 x i8> @test_vgf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2, <16
; X86-LABEL: test_vgf2p8affineqb_128:
; X86: # %bb.0:
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X86-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x03]
+; X86-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xd9,0x04]
; X86-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3]
-; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0]
+; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96]
+; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_vgf2p8affineqb_128:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03]
-; X64-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x04]
+; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x03]
+; X64-NEXT: vgf2p8affineqb $4, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xd9,0x04]
; X64-NEXT: vgf2p8affineqb $5, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x05]
-; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3]
-; X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0]
+; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96]
+; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3]
; X64-NEXT: retq # encoding: [0xc3]
%1 = bitcast i16 %mask to <16 x i1>
%2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3)
@@ -135,21 +135,21 @@ define <32 x i8> @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32
; X86-LABEL: test_vgf2p8affineqb_256:
; X86: # %bb.0:
; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X86-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x03]
+; X86-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xd9,0x04]
; X86-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3]
-; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0]
+; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96]
+; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_vgf2p8affineqb_256:
; X64: # %bb.0:
; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03]
-; X64-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x04]
+; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x03]
+; X64-NEXT: vgf2p8affineqb $4, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xd9,0x04]
; X64-NEXT: vgf2p8affineqb $5, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x05]
-; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3]
-; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0]
+; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96]
+; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3]
; X64-NEXT: retq # encoding: [0xc3]
%1 = bitcast i32 %mask to <32 x i1>
%2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3)
@@ -167,21 +167,21 @@ define <64 x i8> @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64
; X86-LABEL: test_vgf2p8affineqb_512:
; X86: # %bb.0:
; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X86-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x03]
+; X86-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xd9,0x04]
; X86-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3]
-; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0]
+; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96]
+; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
; X86-NEXT: retl # encoding: [0xc3]
;
; X64-LABEL: test_vgf2p8affineqb_512:
; X64: # %bb.0:
; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
-; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03]
-; X64-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x04]
+; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x03]
+; X64-NEXT: vgf2p8affineqb $4, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xd9,0x04]
; X64-NEXT: vgf2p8affineqb $5, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x05]
-; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3]
-; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0]
+; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96]
+; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3]
; X64-NEXT: retq # encoding: [0xc3]
%1 = bitcast i64 %mask to <64 x i1>
%2 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3)
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 879ea9146be6..e67b81581396 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -4840,12 +4840,11 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; KNL-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; KNL-NEXT: vporq %zmm2, %zmm1, %zmm1
-; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0
+; KNL-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2
+; KNL-NEXT: vpmovsxwd %ymm2, %zmm0
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %ecx
@@ -4928,12 +4927,11 @@ define void @ktest_6(<32 x i16> %w, <32 x i16> %x, <32 x i16> %y, <32 x i16> %z)
; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqw %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT: vporq %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k0
+; AVX512DQ-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpmovsxwd %ymm2, %zmm0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %eax
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512DQ-NEXT: kmovw %k0, %ecx
@@ -5013,10 +5011,9 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2
; KNL-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3
; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; KNL-NEXT: vporq %zmm2, %zmm1, %zmm1
-; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; KNL-NEXT: vpmovmskb %ymm0, %eax
-; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; KNL-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2
+; KNL-NEXT: vpmovmskb %ymm2, %eax
+; KNL-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; KNL-NEXT: vpmovmskb %ymm0, %ecx
; KNL-NEXT: shlq $32, %rcx
; KNL-NEXT: orq %rax, %rcx
@@ -5097,10 +5094,9 @@ define void @ktest_7(<64 x i8> %w, <64 x i8> %x, <64 x i8> %y, <64 x i8> %z) {
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2
; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm3, %ymm3
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT: vporq %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovmskb %ymm0, %eax
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpternlogq $200, %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vpmovmskb %ymm2, %eax
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0
; AVX512DQ-NEXT: vpmovmskb %ymm0, %ecx
; AVX512DQ-NEXT: shlq $32, %rcx
; AVX512DQ-NEXT: orq %rax, %rcx
diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll
index 4defa872fc0a..f67ddb65be57 100644
--- a/llvm/test/CodeGen/X86/fp-round.ll
+++ b/llvm/test/CodeGen/X86/fp-round.ll
@@ -455,8 +455,8 @@ define <16 x float> @round_v16f32(<16 x float> %x) {
;
; AVX512-LABEL: round_v16f32:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1
-; AVX512-NEXT: vpord {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm1 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1]
+; AVX512-NEXT: vpternlogd $248, {{.*}}(%rip){1to16}, %zmm0, %zmm1
; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vrndscaleps $11, %zmm0, %zmm0
; AVX512-NEXT: retq
@@ -554,8 +554,8 @@ define <8 x double> @round_v8f64(<8 x double> %x) {
;
; AVX512-LABEL: round_v8f64:
; AVX512: ## %bb.0:
-; AVX512-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; AVX512-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1]
+; AVX512-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1
; AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vrndscalepd $11, %zmm0, %zmm0
; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
index e4420be56b46..4e07b4abde4b 100644
--- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -13,12 +13,18 @@ define <4 x i32> @reassociate_and_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: reassociate_and_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm3, %xmm2, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: reassociate_and_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm1
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reassociate_and_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogd $128, %xmm2, %xmm3, %xmm0
+; AVX512-NEXT: retq
%t0 = add <4 x i32> %x0, %x1
%t1 = and <4 x i32> %x2, %t0
@@ -34,12 +40,18 @@ define <4 x i32> @reassociate_or_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: reassociate_or_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: reassociate_or_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm1
+; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reassociate_or_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogd $254, %xmm2, %xmm3, %xmm0
+; AVX512-NEXT: retq
%t0 = add <4 x i32> %x0, %x1
%t1 = or <4 x i32> %x2, %t0
@@ -55,12 +67,18 @@ define <4 x i32> @reassociate_xor_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32>
; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: reassociate_xor_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: reassociate_xor_v4i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reassociate_xor_v4i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogd $150, %xmm2, %xmm3, %xmm0
+; AVX512-NEXT: retq
%t0 = add <4 x i32> %x0, %x1
%t1 = xor <4 x i32> %x2, %t0
@@ -81,12 +99,18 @@ define <8 x i32> @reassociate_and_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>
; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: reassociate_and_v8i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpand %ymm3, %ymm2, %ymm1
-; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX2-LABEL: reassociate_and_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reassociate_and_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogd $128, %ymm2, %ymm3, %ymm0
+; AVX512-NEXT: retq
%t0 = add <8 x i32> %x0, %x1
%t1 = and <8 x i32> %x2, %t0
@@ -105,12 +129,18 @@ define <8 x i32> @reassociate_or_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: reassociate_or_v8i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpor %ymm3, %ymm2, %ymm1
-; AVX-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX2-LABEL: reassociate_or_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reassociate_or_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogd $254, %ymm2, %ymm3, %ymm0
+; AVX512-NEXT: retq
%t0 = add <8 x i32> %x0, %x1
%t1 = or <8 x i32> %x2, %t0
@@ -129,12 +159,18 @@ define <8 x i32> @reassociate_xor_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32>
; SSE-NEXT: pxor %xmm5, %xmm1
; SSE-NEXT: retq
;
-; AVX-LABEL: reassociate_xor_v8i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpxor %ymm3, %ymm2, %ymm1
-; AVX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX-NEXT: retq
+; AVX2-LABEL: reassociate_xor_v8i32:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reassociate_xor_v8i32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogd $150, %ymm2, %ymm3, %ymm0
+; AVX512-NEXT: retq
%t0 = add <8 x i32> %x0, %x1
%t1 = xor <8 x i32> %x2, %t0
@@ -175,8 +211,7 @@ define <16 x i32> @reassociate_and_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x
; AVX512-LABEL: reassociate_and_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm1
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogd $128, %zmm2, %zmm3, %zmm0
; AVX512-NEXT: retq
%t0 = add <16 x i32> %x0, %x1
@@ -215,8 +250,7 @@ define <16 x i32> @reassociate_or_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i
; AVX512-LABEL: reassociate_or_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpord %zmm3, %zmm2, %zmm1
-; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogd $254, %zmm2, %zmm3, %zmm0
; AVX512-NEXT: retq
%t0 = add <16 x i32> %x0, %x1
@@ -255,8 +289,7 @@ define <16 x i32> @reassociate_xor_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x
; AVX512-LABEL: reassociate_xor_v16i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpxord %zmm3, %zmm2, %zmm1
-; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogd $150, %zmm2, %zmm3, %zmm0
; AVX512-NEXT: retq
%t0 = add <16 x i32> %x0, %x1
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
index 9c2768792e00..dcb5806c51fb 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll
@@ -2261,12 +2261,12 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3
-; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3
+; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %xmm3, %xmm4
; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
-; AVX512VL-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1
+; AVX512VL-FALLBACK-NEXT: vpmullw %xmm4, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -3121,14 +3121,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3
-; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3
+; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %xmm3, %xmm4
; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
+; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 0710e953d6d4..f7f1c8bac87b 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1855,12 +1855,12 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm3, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
;
@@ -2789,21 +2789,21 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512VL-FALLBACK: # %bb.0:
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm3, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index cb9951ccb494..3cd81a0c9707 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -418,10 +418,9 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5
; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm7, %ymm7, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm6, %ymm1, %ymm1
@@ -429,10 +428,10 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm4, %ymm7, %ymm6
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpor %ymm4, %ymm5, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm4, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm4, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -877,45 +876,44 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm4
; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpminub %ymm4, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm3
; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm6
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm6, %ymm0, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm7, %ymm7, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm8, %ymm8, %ymm8
+; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm4, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm4, %ymm4
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm7, %ymm7
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm8, %ymm4
-; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm6, %ymm7
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm9, %ymm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm9, %ymm5
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm8, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm4, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm8, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpand %ymm1, %ymm9, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm8, %ymm6, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm9, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm8, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm4, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm9, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 849d4ac9770c..18076468f352 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1674,12 +1674,11 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "min-leg
; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; CHECK-NEXT: vpsrlw $7, %ymm0, %ymm3
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
-; CHECK-NEXT: vpor %ymm3, %ymm2, %ymm2
+; CHECK-NEXT: vpsrlw $7, %ymm0, %ymm2
+; CHECK-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; CHECK-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3
; CHECK-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; CHECK-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; CHECK-NEXT: retq
%b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
%shl = shl <32 x i8> %a, %b
@@ -1695,19 +1694,18 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind "mi
; CHECK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vpsllw %xmm2, %ymm0, %ymm3
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; CHECK-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; CHECK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; CHECK-NEXT: vpbroadcastb %xmm2, %ymm2
-; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; CHECK-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
-; CHECK-NEXT: vpsrlw $8, %xmm1, %xmm1
-; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1
-; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpor %ymm0, %ymm2, %ymm0
+; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm5
+; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm2
+; CHECK-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; CHECK-NEXT: vpsrlw $8, %xmm0, %xmm0
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0
+; CHECK-NEXT: vpternlogq $236, %ymm5, %ymm2, %ymm0
; CHECK-NEXT: retq
%splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
%splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
@@ -1788,10 +1786,12 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v
define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: splatconstant_rotate_mask_v32i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllw $4, %ymm0, %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; CHECK-NEXT: vpsllw $4, %ymm0, %ymm2
+; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm2
; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0
-; CHECK-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
-; CHECK-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; CHECK-NEXT: vpandn %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm2, %ymm0
; CHECK-NEXT: retq
%shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index 3096d4fb9472..f5fbd3915c12 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -514,20 +514,64 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: v16i4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v16i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16i4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddsb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
%z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
}
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 59a18bfa85f6..4126656fd805 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -510,20 +510,64 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind {
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: v16i4:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: v16i4:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: v16i4:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: v16i4:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: v16i4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $4, %xmm1, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsubsb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: retq
%z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y)
ret <16 x i4> %z
}
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
index be1d15ba68b3..328f3c15fc48 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -362,8 +362,8 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
; NODQ-32-LABEL: uitofp_v8i64_v8f64:
; NODQ-32: # %bb.0:
-; NODQ-32-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm1
-; NODQ-32-NEXT: vporq {{\.LCPI.*}}, %zmm1, %zmm1
+; NODQ-32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200,0,1127219200]
+; NODQ-32-NEXT: vpternlogq $248, {{\.LCPI.*}}, %zmm0, %zmm1
; NODQ-32-NEXT: vpsrlq $32, %zmm0, %zmm0
; NODQ-32-NEXT: vporq {{\.LCPI.*}}, %zmm0, %zmm0
; NODQ-32-NEXT: vsubpd {{\.LCPI.*}}{1to8}, %zmm0, %zmm0
@@ -372,8 +372,8 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
;
; NODQ-64-LABEL: uitofp_v8i64_v8f64:
; NODQ-64: # %bb.0:
-; NODQ-64-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; NODQ-64-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; NODQ-64-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
+; NODQ-64-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1
; NODQ-64-NEXT: vpsrlq $32, %zmm0, %zmm0
; NODQ-64-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
; NODQ-64-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll
index 860c2d576c72..678ae18b5f33 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll
@@ -1513,12 +1513,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VL-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4
; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpternlogq $236, %ymm1, %ymm3, %ymm4
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm1
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll
index 83a74c657dca..09a29fdbaad4 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll
@@ -893,12 +893,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512BW-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
+; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
@@ -917,12 +916,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512VBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
@@ -941,12 +939,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512VLBW-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
@@ -965,12 +962,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
index e8fb824076f2..655b6e4c2504 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -446,12 +446,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i8:
@@ -833,12 +832,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
-; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
index 8d11e104bf8d..49f229bf1d67 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll
@@ -550,12 +550,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512BW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
@@ -573,12 +572,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0
; AVX512VLBW-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
%res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index e40e3cdfbd65..ad29afec0958 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1502,25 +1502,24 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VL-LABEL: splatvar_funnnel_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vpsllw %xmm3, %xmm4, %xmm3
+; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm4
-; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
+; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm5
+; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3
-; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4
-; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm0
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll
index aa7a0e63f1e7..3337ebe22fed 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll
@@ -872,21 +872,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512BW-NEXT: vpsllw %xmm3, %xmm4, %xmm3
; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5
+; AVX512BW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512BW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
@@ -895,21 +894,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3
; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5
+; AVX512VBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0
+; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VBMI2-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
@@ -918,21 +916,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm4, %xmm3
; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5
+; AVX512VLBW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
@@ -941,21 +938,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
-; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 930795283a24..61c45a118e47 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -487,12 +487,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind {
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v32i8:
@@ -908,12 +907,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
-; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
index 79ffaaf53859..d642e513c49b 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll
@@ -534,47 +534,45 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm4
; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
-; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
-; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsllw %xmm2, %xmm5, %xmm2
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512BW-NEXT: vpandq %zmm2, %zmm4, %zmm2
+; AVX512BW-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4
+; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3
-; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3
-; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
-; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1
-; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm5, %xmm2
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
+; AVX512VLBW-NEXT: vpandq %zmm2, %zmm4, %zmm2
+; AVX512VLBW-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat)
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
index b48a67c129f0..c3fa89445c2f 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll
@@ -185,9 +185,8 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxorq %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm2, %zmm1
; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0
@@ -539,9 +538,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind {
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1
; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxorq %zmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm3, %zmm2
; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 48b5777ecd2d..67dd15ee87ab 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1968,13 +1968,35 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: splatconstant_rotate_mask_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $11, %xmm0, %xmm1
-; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsrlw $11, %xmm0, %xmm1
+; AVX512F-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsrlw $11, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsllw $5, %xmm0, %xmm0
+; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm1
+; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v8i16:
; XOP: # %bb.0:
@@ -2020,14 +2042,39 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
;
-; AVX512-LABEL: splatconstant_rotate_mask_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1
-; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1
+; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm2
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0
+; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0
+; AVX512VLBW-NEXT: retq
;
; XOP-LABEL: splatconstant_rotate_mask_v16i8:
; XOP: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index ad92c1c67111..d27d39849299 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -442,12 +442,11 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2
-; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3
-; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2
+; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3
+; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3
; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1
-; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_rotate_v32i8:
@@ -823,12 +822,11 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
-; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1
-; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3
+; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_rotate_v32i8:
@@ -1707,13 +1705,35 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: retq
;
-; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm1
-; AVX512-NEXT: vpsllw $5, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: retq
+; AVX512F-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm1
+; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm1
+; AVX512BW-NEXT: vpsllw $5, %ymm0, %ymm0
+; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512BW-NEXT: retq
+;
+; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1
+; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1: # %bb.0:
@@ -1782,9 +1802,11 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: vpandn %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8:
@@ -1799,10 +1821,9 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1
-; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm0
+; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm0
; AVX512VLBW-NEXT: retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index 2acabc352481..05d989ebaa30 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -488,12 +488,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
@@ -507,12 +506,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1
-; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1
-; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0
+; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
@@ -919,10 +917,9 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $11, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
@@ -933,28 +930,25 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
; AVX512VL-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $11, %ymm2, %ymm2
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
%shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
%lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
@@ -979,10 +973,9 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm1
-; AVX512F-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm0
+; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
@@ -999,28 +992,25 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0
; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm1
-; AVX512VL-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm0
+; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
; AVX512VLBW-NEXT: retq
%shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
%lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index dc0d0a1168b7..5764d19f4c7f 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1443,9 +1443,8 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; AVX512VL-LABEL: splatconstant_shift_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 708ab48479b0..358f9b8cc4de 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -944,15 +944,14 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX512DQVL: # %bb.0:
; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512DQVL-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
-; AVX512DQVL-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpternlogq $108, %ymm0, %ymm2, %ymm1
+; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm1, %ymm0
; AVX512DQVL-NEXT: retq
;
; AVX512BWVL-LABEL: splatvar_shift_v32i8:
@@ -1630,9 +1629,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
; AVX512VL-LABEL: splatconstant_shift_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %ymm1, %ymm0
; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index a71d257bfe55..0acdbac821da 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -212,15 +212,14 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT: vpternlogq $108, %zmm0, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT: retq
%splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
%shift = ashr <64 x i8> %a, %splat
@@ -375,9 +374,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm1, %zmm0
; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%shift = ashr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index 0e4016e83594..11d118bf31c3 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -2354,9 +2354,8 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; AVX512VL-LABEL: splatconstant_shift_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
@@ -2408,9 +2407,8 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; AVX512VL-LABEL: splatconstant_shift_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
@@ -2462,9 +2460,8 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; AVX512VL-LABEL: splatconstant_shift_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;