[llvm] [DAG] visitFREEZE - enable SRA/SRL handling (PR #148252)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Jul 11 08:41:43 PDT 2025


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/148252

Remove the visitFREEZE restriction that blocked pushing freeze through SRA/SRL, so freeze(sra/srl(x, ...)) now folds to sra/srl(freeze(x), ...) like the other ops handled by the generic freeze(op(x, ...)) -> op(freeze(x), ...) combine.
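
A minimal IR sketch of the fold this enables, adapted from the freeze_ashr_exact /
freeze_lshr_exact tests in llvm/test/CodeGen/X86/freeze-binary.ll (only the first shift
and the freeze are visible in the hunk context below; the second shift amounts are
inferred from the old/new x86 CHECK lines, so treat them as illustrative). Once the
freeze is pushed through the first shift, the two constant shifts combine and x86 emits
a single shift:

  ; llc -mtriple=x86_64-- : previously "sarl $3" + "sarl $6", now a single "sarl $9"
  define i32 @freeze_ashr_exact(i32 %a0) nounwind {
    %x = ashr exact i32 %a0, 3
    %y = freeze i32 %x
    %z = ashr i32 %y, 6
    ret i32 %z
  }

  ; llc -mtriple=x86_64-- : previously "shrl $3" + "shrl $5", now a single "shrl $8"
  define i32 @freeze_lshr_exact(i32 %a0) nounwind {
    %x = lshr exact i32 %a0, 3
    %y = freeze i32 %x
    %z = lshr i32 %y, 5
    ret i32 %z
  }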

From 8387d028c64c30f1a5796faa1acd2a19dea43aaa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 1 Jul 2025 08:19:53 +0100
Subject: [PATCH] [DAG] visitFREEZE - enable SRA/SRL handling

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   6 -
 .../AMDGPU/divergence-driven-trunc-to-i1.ll   |  14 +-
 llvm/test/CodeGen/AMDGPU/freeze.ll            |  75 +--
 llvm/test/CodeGen/AMDGPU/srem64.ll            |  40 +-
 llvm/test/CodeGen/RISCV/rv64xtheadbb.ll       |  15 +-
 llvm/test/CodeGen/RISCV/rv64zbb.ll            |  15 +-
 .../CodeGen/RISCV/rvv/vec3-setcc-crash.ll     |  20 +-
 llvm/test/CodeGen/X86/freeze-binary.ll        |  18 +-
 llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 474 +++++++++---------
 9 files changed, 306 insertions(+), 371 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 231184587d682..f0df94a69ff9f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16715,12 +16715,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
     return N0;
 
-  // We currently avoid folding freeze over SRA/SRL, due to the problems seen
-  // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
-  // example https://reviews.llvm.org/D136529#4120959.
-  if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
-    return SDValue();
-
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
   // poison as far as possible. If an operand of freeze follows three
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 3303cb86c874e..e703caf4724d8 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -14,15 +14,13 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
   ; GCN-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
-  ; GCN-NEXT:   [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]]
-  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; GCN-NEXT:   [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
-  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc
-  ; GCN-NEXT:   S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
+  ; GCN-NEXT:   [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[COPY3]]
+  ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[COPY3]], implicit-def dead $scc
+  ; GCN-NEXT:   S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
   ; GCN-NEXT:   [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc
-  ; GCN-NEXT:   [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc
+  ; GCN-NEXT:   [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; GCN-NEXT:   S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc
   ; GCN-NEXT:   [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
   ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index ac438062ae208..da12f7d83509f 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -5692,10 +5692,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX6-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX6-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
@@ -5725,10 +5721,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
 ; GFX7-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -6351,10 +6343,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX6-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX6-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX6-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
@@ -6384,10 +6372,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX7-SDAG-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT:    v_or_b32_e32 v0, v4, v0
 ; GFX7-SDAG-NEXT:    buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
 ; GFX7-SDAG-NEXT:    buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -12355,14 +12339,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX6-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX6-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX6-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX6-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX6-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX6-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX6-SDAG-NEXT:    buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX6-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT:    buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
 ; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12400,14 +12379,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX7-SDAG-NEXT:    s_mov_b32 s5, s6
 ; GFX7-SDAG-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v4, 0xff, v4
 ; GFX7-SDAG-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-SDAG-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX7-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-SDAG-NEXT:    v_or_b32_e32 v0, v0, v4
-; GFX7-SDAG-NEXT:    buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX7-SDAG-NEXT:    buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT:    buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
 ; GFX7-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12482,11 +12456,7 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-SDAG-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT:    v_lshrrev_b16 v1, 8, v0
-; GFX10-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-SDAG-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SDAG-NEXT:    global_store_byte v[2:3], v4, off offset:2
+; GFX10-SDAG-NEXT:    global_store_byte_d16_hi v[2:3], v0, off offset:2
 ; GFX10-SDAG-NEXT:    global_store_short v[2:3], v0, off
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -12507,36 +12477,15 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
 ; GFX10-GISEL-NEXT:    global_store_byte_d16_hi v[2:3], v0, off offset:2
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-TRUE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-TRUE16:       ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    global_load_b32 v1, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.h, 0
-; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b16 v0.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT:    v_lshlrev_b16 v0.l, 8, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_or_b16 v0.l, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_clause 0x1
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b8 v[2:3], v4, off offset:2
-; GFX11-SDAG-TRUE16-NEXT:    global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-FAKE16:       ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    global_load_b32 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b16 v1, 8, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-FAKE16-NEXT:    v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX11-SDAG-FAKE16-NEXT:    s_clause 0x1
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b8 v[2:3], v0, off offset:2
-; GFX11-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v1, off
-; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: freeze_v3i8:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v0, v[0:1], off
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    s_clause 0x1
+; GFX11-SDAG-NEXT:    global_store_d16_hi_b8 v[2:3], v0, off offset:2
+; GFX11-SDAG-NEXT:    global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: freeze_v3i8:
 ; GFX11-GISEL:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 47dfa9f4fc2d3..8791070d5b229 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -653,11 +653,11 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) {
 define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem25_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[4:5], 0xe
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s0, s0, 7
+; GCN-NEXT:    s_ashr_i32 s0, s1, 7
 ; GCN-NEXT:    s_abs_i32 s8, s0
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -694,11 +694,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %
 ;
 ; GCN-IR-LABEL: s_test_srem25_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s0, s[4:5], 0xe
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 7
+; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 7
 ; GCN-IR-NEXT:    s_abs_i32 s8, s0
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -742,11 +742,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %
 define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem31_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[4:5], 0xe
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s0, s0, 1
+; GCN-NEXT:    s_ashr_i32 s0, s1, 1
 ; GCN-NEXT:    s_abs_i32 s8, s0
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -783,11 +783,11 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %
 ;
 ; GCN-IR-LABEL: s_test_srem31_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s0, s[4:5], 0xe
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i32 s0, s0, 1
+; GCN-IR-NEXT:    s_ashr_i32 s0, s1, 1
 ; GCN-IR-NEXT:    s_abs_i32 s8, s0
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -832,11 +832,11 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %
 define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem32_64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s0, s[4:5], 0xe
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_abs_i32 s8, s0
+; GCN-NEXT:    s_abs_i32 s8, s1
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    s_sub_i32 s0, 0, s8
 ; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -871,11 +871,11 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %
 ;
 ; GCN-IR-LABEL: s_test_srem32_64:
 ; GCN-IR:       ; %bb.0:
-; GCN-IR-NEXT:    s_load_dword s0, s[4:5], 0xe
+; GCN-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-IR-NEXT:    s_mov_b32 s6, -1
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_abs_i32 s8, s0
+; GCN-IR-NEXT:    s_abs_i32 s8, s1
 ; GCN-IR-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-IR-NEXT:    s_sub_i32 s0, 0, s8
 ; GCN-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
@@ -1053,25 +1053,25 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-LABEL: s_test_srem33_64:
 ; GCN-IR:       ; %bb.0: ; %_udiv-special-cases
 ; GCN-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IR-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-IR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
 ; GCN-IR-NEXT:    s_mov_b32 s13, 0
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT:    s_ashr_i64 s[2:3], s[2:3], 31
-; GCN-IR-NEXT:    s_ashr_i64 s[8:9], s[4:5], 31
 ; GCN-IR-NEXT:    s_ashr_i32 s4, s3, 31
+; GCN-IR-NEXT:    s_ashr_i64 s[6:7], s[2:3], 31
 ; GCN-IR-NEXT:    s_mov_b32 s5, s4
-; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GCN-IR-NEXT:    s_ashr_i64 s[10:11], s[8:9], 31
+; GCN-IR-NEXT:    s_xor_b64 s[2:3], s[6:7], s[4:5]
 ; GCN-IR-NEXT:    s_sub_u32 s6, s2, s4
 ; GCN-IR-NEXT:    s_subb_u32 s7, s3, s4
 ; GCN-IR-NEXT:    s_ashr_i32 s2, s9, 31
 ; GCN-IR-NEXT:    s_mov_b32 s3, s2
-; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
+; GCN-IR-NEXT:    s_xor_b64 s[8:9], s[10:11], s[2:3]
 ; GCN-IR-NEXT:    s_sub_u32 s8, s8, s2
 ; GCN-IR-NEXT:    s_subb_u32 s9, s9, s2
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[8:9], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[2:3], s[6:7], 0
+; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
 ; GCN-IR-NEXT:    s_flbit_i32_b64 s12, s[8:9]
-; GCN-IR-NEXT:    s_or_b64 s[10:11], s[2:3], s[10:11]
+; GCN-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[2:3]
 ; GCN-IR-NEXT:    s_flbit_i32_b64 s20, s[6:7]
 ; GCN-IR-NEXT:    s_sub_u32 s14, s12, s20
 ; GCN-IR-NEXT:    s_subb_u32 s15, 0, 0
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 00f7b462f68db..3acd025897a28 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -210,7 +210,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 16
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    srliw a1, a1, 24
+; RV64I-NEXT:    slli a1, a1, 34
+; RV64I-NEXT:    srli a1, a1, 58
 ; RV64I-NEXT:    xori a1, a1, 31
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    or a0, a0, a1
@@ -239,17 +240,19 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a0, a0, 1
 ; RV64I-NEXT:    beqz a0, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
-; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    addi a1, a2, 1365
-; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    srli a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    srli a2, a0, 4
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    slli a2, a0, 33
+; RV64I-NEXT:    srli a2, a2, 41
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    slli a2, a0, 33
+; RV64I-NEXT:    srli a2, a2, 49
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    srli a2, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index e6407279870db..72b1c155596b2 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -204,7 +204,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
 ; RV64I-NEXT:    add a1, a1, a2
 ; RV64I-NEXT:    slli a2, a1, 16
 ; RV64I-NEXT:    add a1, a1, a2
-; RV64I-NEXT:    srliw a1, a1, 24
+; RV64I-NEXT:    slli a1, a1, 34
+; RV64I-NEXT:    srli a1, a1, 58
 ; RV64I-NEXT:    xori a1, a1, 31
 ; RV64I-NEXT:    addi a0, a0, -1
 ; RV64I-NEXT:    or a0, a0, a1
@@ -231,17 +232,19 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
 ; RV64I-NEXT:    srliw a0, a0, 1
 ; RV64I-NEXT:    beqz a0, .LBB4_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
-; RV64I-NEXT:    srliw a1, a0, 1
+; RV64I-NEXT:    srli a1, a0, 1
 ; RV64I-NEXT:    lui a2, 349525
 ; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    addi a1, a2, 1365
-; RV64I-NEXT:    srliw a2, a0, 2
+; RV64I-NEXT:    srli a2, a0, 2
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    srliw a2, a0, 4
+; RV64I-NEXT:    srli a2, a0, 4
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    srliw a2, a0, 8
+; RV64I-NEXT:    slli a2, a0, 33
+; RV64I-NEXT:    srli a2, a2, 41
 ; RV64I-NEXT:    or a0, a0, a2
-; RV64I-NEXT:    srliw a2, a0, 16
+; RV64I-NEXT:    slli a2, a0, 33
+; RV64I-NEXT:    srli a2, a2, 49
 ; RV64I-NEXT:    or a0, a0, a2
 ; RV64I-NEXT:    not a0, a0
 ; RV64I-NEXT:    srli a2, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index 3740737ba2989..d8ba5df9414cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -13,22 +13,22 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    lw a0, 0(a0)
 ; RV32-NEXT:    srli a2, a0, 16
-; RV32-NEXT:    slli a3, a0, 16
-; RV32-NEXT:    slli a4, a0, 24
-; RV32-NEXT:    slli a5, a0, 8
-; RV32-NEXT:    srli a6, a3, 24
-; RV32-NEXT:    srai a3, a3, 24
+; RV32-NEXT:    srli a3, a0, 8
+; RV32-NEXT:    slli a4, a0, 16
+; RV32-NEXT:    slli a5, a0, 24
+; RV32-NEXT:    slli a6, a0, 8
 ; RV32-NEXT:    srai a4, a4, 24
 ; RV32-NEXT:    srai a5, a5, 24
+; RV32-NEXT:    srai a6, a6, 24
+; RV32-NEXT:    sgtz a6, a6
 ; RV32-NEXT:    sgtz a5, a5
 ; RV32-NEXT:    sgtz a4, a4
-; RV32-NEXT:    sgtz a3, a3
-; RV32-NEXT:    neg a3, a3
 ; RV32-NEXT:    neg a4, a4
 ; RV32-NEXT:    neg a5, a5
-; RV32-NEXT:    and a3, a3, a6
-; RV32-NEXT:    and a0, a4, a0
-; RV32-NEXT:    and a2, a5, a2
+; RV32-NEXT:    neg a6, a6
+; RV32-NEXT:    and a3, a4, a3
+; RV32-NEXT:    and a0, a5, a0
+; RV32-NEXT:    and a2, a6, a2
 ; RV32-NEXT:    slli a3, a3, 8
 ; RV32-NEXT:    zext.b a0, a0
 ; RV32-NEXT:    or a0, a0, a3
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index 189de051011d2..716c9dd849216 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -452,8 +452,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
 ; X86-LABEL: freeze_ashr:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sarl $3, %eax
-; X86-NEXT:    sarl $3, %eax
+; X86-NEXT:    sarl $6, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_ashr:
@@ -471,15 +470,13 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind {
 ; X86-LABEL: freeze_ashr_exact:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sarl $3, %eax
-; X86-NEXT:    sarl $6, %eax
+; X86-NEXT:    sarl $9, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_ashr_exact:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    sarl $3, %eax
-; X64-NEXT:    sarl $6, %eax
+; X64-NEXT:    sarl $9, %eax
 ; X64-NEXT:    retq
   %x = ashr exact i32 %a0, 3
   %y = freeze i32 %x
@@ -565,8 +562,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
 ; X86-LABEL: freeze_lshr:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrl $2, %eax
-; X86-NEXT:    shrl %eax
+; X86-NEXT:    shrl $3, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_lshr:
@@ -584,15 +580,13 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind {
 ; X86-LABEL: freeze_lshr_exact:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shrl $3, %eax
-; X86-NEXT:    shrl $5, %eax
+; X86-NEXT:    shrl $8, %eax
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_lshr_exact:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shrl $3, %eax
-; X64-NEXT:    shrl $5, %eax
+; X64-NEXT:    shrl $8, %eax
 ; X64-NEXT:    retq
   %x = lshr exact i32 %a0, 3
   %y = freeze i32 %x
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 5f6337e29d685..582cc9c3e8055 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -426,58 +426,58 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
 define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounwind {
 ; AVX512F-LABEL: vec512_i16_signed_mem_reg:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpminsw %ymm0, %ymm1, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5
 ; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
 ; AVX512F-NEXT:    vpsubw %ymm0, %ymm6, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm0, %ymm1, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm0, %ymm1, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5
 ; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm0, %ymm6, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec512_i16_signed_mem_reg:
@@ -507,56 +507,56 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw
 define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind {
 ; AVX512F-LABEL: vec512_i16_signed_reg_mem:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpminsw %ymm3, %ymm2, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm5
 ; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpsubw %ymm3, %ymm6, %ymm3
 ; AVX512F-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm3, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm3, %ymm3
 ; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm5
 ; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm3, %ymm6, %ymm3
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -588,60 +588,60 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw
 define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; AVX512F-LABEL: vec512_i16_signed_mem_mem:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
 ; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
 ; AVX512F-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT:    vpsubw %ymm0, %ymm6, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rsi), %zmm1
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpminsw %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
 ; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm2, %ymm6, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm1, %ymm6, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubw %ymm0, %ymm6, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec512_i16_signed_mem_mem:
@@ -849,66 +849,64 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
 define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind {
 ; AVX512F-LABEL: vec512_i8_signed_mem_reg:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    vpminsb %ymm0, %ymm1, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT:    vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm0, %ymm1, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm6, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
@@ -939,64 +937,62 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
 define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind {
 ; AVX512F-LABEL: vec512_i8_signed_reg_mem:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm4
 ; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vpminsb %ymm3, %ymm2, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT:    vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm5, %ymm3, %ymm3
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm5
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
 ; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm4
 ; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm3, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm3, %ymm3
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm3, %ymm6, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
 ; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm3, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
@@ -1029,68 +1025,66 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
 define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
 ; AVX512F-LABEL: vec512_i8_signed_mem_mem:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 (%rsi), %zmm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512F-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT:    vpand %ymm5, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT:    vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
 ; AVX512VL-FALLBACK:       # %bb.0:
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rsi), %ymm0
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rsi), %ymm1
-; AVX512VL-FALLBACK-NEXT:    vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT:    vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rdi), %zmm0
+; AVX512VL-FALLBACK-NEXT:    vmovdqa64 (%rsi), %zmm1
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm5
 ; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm1
 ; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT:    vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm2, %ymm2
 ; AVX512VL-FALLBACK-NEXT:    vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT:    vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm0, %ymm7, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT:    vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT:    vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT:    vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512VL-FALLBACK-NEXT:    retq
 ;
 ; AVX512BW-LABEL: vec512_i8_signed_mem_mem:


