[llvm] fdc9027 - [DAG][AMDGPU][X86] Add SimplifyMultipleUseDemandedBits handling for SIGN/ZERO_EXTEND + SIGN/ZERO_EXTEND_VECTOR_INREG
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 29 10:11:13 PDT 2020
Author: Simon Pilgrim
Date: 2020-07-29T18:10:59+01:00
New Revision: fdc902774e7ae0dae0833216500da047c7c4bec6
URL: https://github.com/llvm/llvm-project/commit/fdc902774e7ae0dae0833216500da047c7c4bec6
DIFF: https://github.com/llvm/llvm-project/commit/fdc902774e7ae0dae0833216500da047c7c4bec6.diff
LOG: [DAG][AMDGPU][X86] Add SimplifyMultipleUseDemandedBits handling for SIGN/ZERO_EXTEND + SIGN/ZERO_EXTEND_VECTOR_INREG
Peek through multiple-use ops, as we already do for ANY_EXTEND/ANY_EXTEND_VECTOR_INREG.
Differential Revision: https://reviews.llvm.org/D84863
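As a rough illustration of what this enables (a hypothetical example, not taken from the commit or its tests, and assuming the pattern survives to instruction selection in this form): when the source of a ZERO_EXTEND has other uses, the usual single-use demanded-bits shrinking cannot rewrite it, but SimplifyMultipleUseDemandedBits can return an existing value that already supplies the demanded bits, and the extend is rebuilt on that value while the multi-use source is left untouched for its other users.

; hypothetical IR, illustration only - not part of the patch's tests
define i32 @peek_through(i16 %a, i16 %b, i16* %p) {
  %hi = shl i16 %b, 8
  %t  = or i16 %a, %hi           ; %t has a second use (the store below)
  store i16 %t, i16* %p
  %z  = zext i16 %t to i32
  %r  = and i32 %z, 255          ; only the low 8 bits of %z are demanded
  ret i32 %r
}

Only the low 8 bits of %t are demanded through the zext, and %hi contributes nothing to them, so the DAG can in principle rebuild this one use as (and (zext %a), 255) without disturbing the store of %t. The SIGN_EXTEND and *_EXTEND_VECTOR_INREG cases added in the same patch follow the same pattern with their respective demanded bits.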
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/test/CodeGen/AMDGPU/bswap.ll
llvm/test/CodeGen/AMDGPU/fshr.ll
llvm/test/CodeGen/AMDGPU/idot8s.ll
llvm/test/CodeGen/AMDGPU/idot8u.ll
llvm/test/CodeGen/AMDGPU/saddsat.ll
llvm/test/CodeGen/AMDGPU/ssubsat.ll
llvm/test/CodeGen/AMDGPU/uaddsat.ll
llvm/test/CodeGen/AMDGPU/usubsat.ll
llvm/test/CodeGen/X86/vector-fshl-128.ll
llvm/test/CodeGen/X86/vector-fshr-128.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index d140b15067a6..4c7c46218c78 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1858,6 +1858,11 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(Known.getBitWidth() == InBits && "Src width has changed?");
Known = Known.zext(BitWidth);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
break;
}
case ISD::SIGN_EXTEND:
@@ -1906,6 +1911,11 @@ bool TargetLowering::SimplifyDemandedBits(
if (!TLO.LegalOperations() || isOperationLegal(Opc, VT))
return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT, Src));
}
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
break;
}
case ISD::ANY_EXTEND:
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index 1cdd6f4e3710..884457b319f4 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -493,11 +493,11 @@ define <3 x i16> @v_bswap_v3i16(<3 x i16> %src) {
; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
; SI-NEXT: v_bfi_b32 v0, s4, v0, v4
; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
-; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_or_b32_e32 v0, v0, v3
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_bswap_v3i16:
@@ -515,27 +515,27 @@ define <4 x i16> @v_bswap_v4i16(<4 x i16> %src) {
; SI-LABEL: v_bswap_v4i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_alignbit_b32 v4, v3, v3, 8
-; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
+; SI-NEXT: v_alignbit_b32 v4, v1, v1, 8
+; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: s_mov_b32 s4, 0xff00ff
; SI-NEXT: s_mov_b32 s5, 0xffff0000
-; SI-NEXT: v_alignbit_b32 v5, v2, v2, 8
-; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
-; SI-NEXT: v_alignbit_b32 v6, v1, v1, 8
-; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
-; SI-NEXT: v_alignbit_b32 v7, v0, v0, 8
+; SI-NEXT: v_alignbit_b32 v5, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
-; SI-NEXT: v_bfi_b32 v3, s4, v3, v4
-; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
-; SI-NEXT: v_bfi_b32 v1, s4, v1, v6
-; SI-NEXT: v_bfi_b32 v0, s4, v0, v7
+; SI-NEXT: v_alignbit_b32 v6, v3, v3, 8
+; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
+; SI-NEXT: v_alignbit_b32 v7, v2, v2, 8
+; SI-NEXT: v_alignbit_b32 v2, v2, v2, 24
+; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
+; SI-NEXT: v_bfi_b32 v0, s4, v0, v5
+; SI-NEXT: v_bfi_b32 v3, s4, v3, v6
+; SI-NEXT: v_bfi_b32 v2, s4, v2, v7
+; SI-NEXT: v_and_b32_e32 v4, s5, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_and_b32_e32 v3, s5, v3
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_and_b32_e32 v1, s5, v1
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 2dadb5e80dfd..83b3b56172c9 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -767,21 +767,21 @@ define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2
; SI-NEXT: v_lshl_b32_e32 v0, v0, v7
; SI-NEXT: v_or_b32_e32 v0, v0, v6
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
-; SI-NEXT: v_mov_b32_e32 v9, 0xffff
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-NEXT: v_and_b32_e32 v3, 15, v8
+; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3
+; SI-NEXT: v_and_b32_e32 v10, s4, v5
+; SI-NEXT: v_lshr_b32_e32 v4, v10, v3
+; SI-NEXT: v_lshl_b32_e32 v2, v2, v6
+; SI-NEXT: v_mov_b32_e32 v9, 0xffff
+; SI-NEXT: v_or_b32_e32 v2, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; SI-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, v9, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_and_b32_e32 v1, 15, v8
-; SI-NEXT: v_sub_i32_e32 v4, vcc, 16, v1
-; SI-NEXT: v_and_b32_e32 v10, s4, v5
-; SI-NEXT: v_lshr_b32_e32 v3, v10, v1
-; SI-NEXT: v_lshl_b32_e32 v2, v2, v4
-; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; SI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
-; SI-NEXT: v_and_b32_e32 v2, v9, v1
-; SI-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; SI-NEXT: v_and_b32_e32 v2, v9, v3
+; SI-NEXT: v_alignbit_b32 v1, v3, v1, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v3i16:
@@ -865,46 +865,46 @@ define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0xffff
-; SI-NEXT: v_and_b32_e32 v11, 15, v11
-; SI-NEXT: v_and_b32_e32 v16, s4, v7
-; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v11
-; SI-NEXT: v_lshr_b32_e32 v16, v16, v11
-; SI-NEXT: v_lshl_b32_e32 v3, v3, v17
-; SI-NEXT: v_or_b32_e32 v3, v3, v16
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11
+; SI-NEXT: v_and_b32_e32 v9, 15, v9
+; SI-NEXT: v_and_b32_e32 v16, s4, v5
+; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v9
+; SI-NEXT: v_lshr_b32_e32 v16, v16, v9
+; SI-NEXT: v_lshl_b32_e32 v1, v1, v17
+; SI-NEXT: v_or_b32_e32 v1, v1, v16
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; SI-NEXT: v_and_b32_e32 v5, 15, v8
+; SI-NEXT: v_sub_i32_e32 v9, vcc, 16, v5
+; SI-NEXT: v_and_b32_e32 v15, s4, v4
+; SI-NEXT: v_lshr_b32_e32 v8, v15, v5
+; SI-NEXT: v_lshl_b32_e32 v0, v0, v9
+; SI-NEXT: v_or_b32_e32 v0, v0, v8
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT: v_and_b32_e32 v4, 15, v11
+; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v4
+; SI-NEXT: v_and_b32_e32 v14, s4, v7
+; SI-NEXT: v_lshr_b32_e32 v5, v14, v4
+; SI-NEXT: v_lshl_b32_e32 v3, v3, v8
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; SI-NEXT: v_or_b32_e32 v3, v3, v5
+; SI-NEXT: v_and_b32_e32 v4, 15, v10
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; SI-NEXT: v_and_b32_e32 v7, 15, v10
-; SI-NEXT: v_sub_i32_e32 v11, vcc, 16, v7
-; SI-NEXT: v_and_b32_e32 v15, s4, v6
-; SI-NEXT: v_lshr_b32_e32 v10, v15, v7
-; SI-NEXT: v_lshl_b32_e32 v2, v2, v11
-; SI-NEXT: v_or_b32_e32 v2, v2, v10
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4
+; SI-NEXT: v_and_b32_e32 v13, s4, v6
+; SI-NEXT: v_lshr_b32_e32 v5, v13, v4
+; SI-NEXT: v_lshl_b32_e32 v2, v2, v7
+; SI-NEXT: v_or_b32_e32 v2, v2, v5
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
; SI-NEXT: v_mov_b32_e32 v12, 0xffff
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_and_b32_e32 v2, v12, v2
; SI-NEXT: v_or_b32_e32 v2, v2, v3
-; SI-NEXT: v_and_b32_e32 v3, 15, v9
-; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v3
-; SI-NEXT: v_and_b32_e32 v14, s4, v5
-; SI-NEXT: v_lshr_b32_e32 v6, v14, v3
-; SI-NEXT: v_lshl_b32_e32 v1, v1, v7
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT: v_or_b32_e32 v1, v1, v6
-; SI-NEXT: v_and_b32_e32 v3, 15, v8
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3
-; SI-NEXT: v_and_b32_e32 v13, s4, v4
-; SI-NEXT: v_lshr_b32_e32 v5, v13, v3
-; SI-NEXT: v_lshl_b32_e32 v0, v0, v6
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; SI-NEXT: v_or_b32_e32 v0, v0, v5
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_and_b32_e32 v0, v12, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16
+; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 9a88c82b5a85..e7d48de6ed60 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -1470,21 +1470,21 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018
; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014
; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010
-; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000
-; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004
-; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008
+; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40008
+; GFX7-NEXT: s_bfe_i32 s19, s5, 0x4000c
+; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40000
; GFX7-NEXT: s_ashr_i32 s14, s5, 28
-; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c
+; GFX7-NEXT: s_bfe_i32 s5, s5, 0x40004
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010
-; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40000
+; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v4, s18
-; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40004
+; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c
; GFX7-NEXT: v_mov_b32_e32 v3, s19
-; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008
+; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40000
; GFX7-NEXT: v_mov_b32_e32 v2, s20
-; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c
+; GFX7-NEXT: s_bfe_i32 s4, s4, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2
@@ -1494,17 +1494,17 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_and_b32_e32 v2, s8, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_and_b32_e32 v4, s8, v4
-; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v1
+; GFX7-NEXT: v_alignbit_b32 v1, v3, v1, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_mov_b32_e32 v5, s17
; GFX7-NEXT: v_mov_b32_e32 v6, s16
; GFX7-NEXT: v_mov_b32_e32 v7, s15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 171cb6ac6ea7..ef0825120f20 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -1988,17 +1988,17 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40004
-; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40004
-; GFX7-NEXT: s_bfe_u32 s19, s5, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c
+; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v4, s17
; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018
; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014
; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010
-; GFX7-NEXT: s_and_b32 s18, s5, 15
+; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008
; GFX7-NEXT: s_lshr_b32 s13, s5, 28
-; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008
-; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c
+; GFX7-NEXT: s_and_b32 s5, s5, 15
+; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004
; GFX7-NEXT: v_mov_b32_e32 v2, s19
; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2
; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4
@@ -2006,25 +2006,25 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010
-; GFX7-NEXT: s_and_b32 s11, s4, 15
+; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v3, s18
-; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008
+; GFX7-NEXT: s_and_b32 s4, s4, 15
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v4
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v3, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; GFX7-NEXT: v_mov_b32_e32 v5, s16
; GFX7-NEXT: v_mov_b32_e32 v6, s15
; GFX7-NEXT: v_mov_b32_e32 v7, s14
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v2, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0
diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll
index 73ceea294186..e78889e349dc 100644
--- a/llvm/test/CodeGen/AMDGPU/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll
@@ -170,24 +170,24 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: s_movk_i32 s4, 0x7fff
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
; GFX6-NEXT: s_movk_i32 s5, 0x8000
; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
+; GFX6-NEXT: v_max_i32_e32 v3, s5, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v5
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_saddsat_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
index f6048f6a357c..138488ece997 100644
--- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll
@@ -170,25 +170,25 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
+; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; GFX6-NEXT: s_movk_i32 s4, 0x7fff
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
; GFX6-NEXT: s_movk_i32 s5, 0x8000
; GFX6-NEXT: v_min_i32_e32 v0, s4, v0
; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
+; GFX6-NEXT: v_min_i32_e32 v2, s4, v2
; GFX6-NEXT: v_max_i32_e32 v0, s5, v0
; GFX6-NEXT: s_mov_b32 s6, 0xffff
-; GFX6-NEXT: v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX6-NEXT: v_max_i32_e32 v3, s5, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, s6, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v2, v5
-; GFX6-NEXT: v_min_i32_e32 v1, s4, v1
-; GFX6-NEXT: v_max_i32_e32 v1, s5, v1
-; GFX6-NEXT: v_and_b32_e32 v2, s6, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_and_b32_e32 v2, s6, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ssubsat_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
index 48349cbed903..56e4123e182a 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddsat.ll
@@ -128,19 +128,19 @@ define <3 x i16> @v_uaddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_and_b32_e32 v4, s4, v4
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
; GFX6-NEXT: v_and_b32_e32 v3, s4, v3
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
-; GFX6-NEXT: v_and_b32_e32 v5, s4, v5
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GFX6-NEXT: v_min_u32_e32 v3, s4, v2
; GFX6-NEXT: v_min_u32_e32 v0, s4, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v5
-; GFX6-NEXT: v_min_u32_e32 v1, s4, v1
-; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_or_b32_e32 v2, 0xffff0000, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uaddsat_v3i16:
diff --git a/llvm/test/CodeGen/AMDGPU/usubsat.ll b/llvm/test/CodeGen/AMDGPU/usubsat.ll
index 55275450b1b4..97bfdc479e09 100644
--- a/llvm/test/CodeGen/AMDGPU/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubsat.ll
@@ -131,17 +131,17 @@ define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) {
; GFX6-NEXT: v_and_b32_e32 v1, s4, v1
; GFX6-NEXT: v_max_u32_e32 v1, v1, v8
; GFX6-NEXT: v_max_u32_e32 v0, v0, v7
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
-; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
; GFX6-NEXT: v_and_b32_e32 v6, s4, v5
; GFX6-NEXT: v_and_b32_e32 v2, s4, v2
+; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
+; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; GFX6-NEXT: v_max_u32_e32 v2, v2, v6
+; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v2, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT: v_and_b32_e32 v0, s4, v0
; GFX6-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT: v_max_u32_e32 v1, v2, v6
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
-; GFX6-NEXT: v_and_b32_e32 v2, s4, v1
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_and_b32_e32 v2, s4, v3
+; GFX6-NEXT: v_alignbit_b32 v1, v3, v1, 16
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_usubsat_v3i16:
diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll
index 00c06564d225..b8755a23e976 100644
--- a/llvm/test/CodeGen/X86/vector-fshl-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll
@@ -1947,18 +1947,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512F-LABEL: splatvar_funnnel_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpslld %xmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1967,18 +1967,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VL-LABEL: splatvar_funnnel_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm1
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpslld %xmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
@@ -1988,18 +1988,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512BW-LABEL: splatvar_funnnel_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
@@ -2009,18 +2009,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
@@ -2029,18 +2029,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
@@ -2049,18 +2049,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll
index bad82a424797..43b8a49d1776 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll
@@ -1967,18 +1967,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpsrld %xmm5, %zmm3, %zmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512F-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT: vpsrld %xmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512F-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512F-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2
+; AVX512F-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -1987,18 +1987,18 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT: vpsrld %xmm5, %zmm3, %zmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VL-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT: vpsrld %xmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512VL-NEXT: vpslld %xmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpcmpeqb %xmm2, %xmm4, %xmm2
+; AVX512VL-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2008,17 +2008,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm4, %k1
+; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -2029,17 +2029,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm4, %k1
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -2049,17 +2049,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmb %xmm4, %xmm4, %k1
+; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
@@ -2068,17 +2068,17 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %ymm3, %ymm3
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm5, %xmm2
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm2, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm4, %xmm4, %k1
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq