[llvm] [DAG] visitFREEZE - enable SRA/SRL handling (PR #148252)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Jul 11 08:41:43 PDT 2025
RKSimon (https://github.com/RKSimon) created https://github.com/llvm/llvm-project/pull/148252
None
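This removes the DAGCombiner bail-out that kept visitFREEZE from pushing freeze through SRA/SRL nodes, so shift-by-constant chains separated by a freeze can now be simplified like the other handled opcodes. A rough LLVM IR sketch of the kind of pattern affected, modelled on the existing freeze_ashr test in llvm/test/CodeGen/X86/freeze-binary.ll (the function name here is illustrative):

define i32 @freeze_ashr_example(i32 %a0) {
  ; freeze previously blocked merging the two shifts; with the fold enabled,
  ; (ashr (freeze (ashr %a0, 3)), 3) simplifies to a single shift by 6
  %x = ashr i32 %a0, 3
  %y = freeze i32 %x
  %z = ashr i32 %y, 3
  ret i32 %z
}

On X86 this now lowers to a single sarl $6 instead of two sarl $3 instructions, as the updated checks in this patch show.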
From 8387d028c64c30f1a5796faa1acd2a19dea43aaa Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 1 Jul 2025 08:19:53 +0100
Subject: [PATCH] [DAG] visitFREEZE - enable SRA/SRL handling
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 -
.../AMDGPU/divergence-driven-trunc-to-i1.ll | 14 +-
llvm/test/CodeGen/AMDGPU/freeze.ll | 75 +--
llvm/test/CodeGen/AMDGPU/srem64.ll | 40 +-
llvm/test/CodeGen/RISCV/rv64xtheadbb.ll | 15 +-
llvm/test/CodeGen/RISCV/rv64zbb.ll | 15 +-
.../CodeGen/RISCV/rvv/vec3-setcc-crash.ll | 20 +-
llvm/test/CodeGen/X86/freeze-binary.ll | 18 +-
llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 474 +++++++++---------
9 files changed, 306 insertions(+), 371 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 231184587d682..f0df94a69ff9f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16715,12 +16715,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
return N0;
- // We currently avoid folding freeze over SRA/SRL, due to the problems seen
- // with (freeze (assert ext)) blocking simplifications of SRA/SRL. See for
- // example https://reviews.llvm.org/D136529#4120959.
- if (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)
- return SDValue();
-
// Fold freeze(op(x, ...)) -> op(freeze(x), ...).
// Try to push freeze through instructions that propagate but don't produce
// poison as far as possible. If an operand of freeze follows three
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
index 3303cb86c874e..e703caf4724d8 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -14,15 +14,13 @@ define amdgpu_kernel void @uniform_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %x
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY2]], %subreg.sub0, killed [[COPY1]], %subreg.sub1, killed [[S_MOV_B32_1]], %subreg.sub2, killed [[S_MOV_B32_]], %subreg.sub3
- ; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[S_LOAD_DWORD_IMM]]
- ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 16
- ; GCN-NEXT: [[S_LSHR_B32_:%[0-9]+]]:sreg_32 = S_LSHR_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_2]], implicit-def dead $scc
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LSHR_B32_]]
- ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY3]], implicit-def dead $scc
- ; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], 1, implicit-def $scc
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
+ ; GCN-NEXT: [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[COPY3]]
+ ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 65536, [[COPY3]], implicit-def dead $scc
+ ; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], 0, implicit-def $scc
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $scc
- ; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 0
- ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_3]], implicit-def $scc
+ ; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+ ; GCN-NEXT: S_CMP_LT_I32 killed [[S_SEXT_I32_I16_]], killed [[S_MOV_B32_2]], implicit-def $scc
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_64 = COPY $scc
; GCN-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[COPY5]], killed [[COPY4]], implicit-def dead $scc
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll
index ac438062ae208..da12f7d83509f 100644
--- a/llvm/test/CodeGen/AMDGPU/freeze.ll
+++ b/llvm/test/CodeGen/AMDGPU/freeze.ll
@@ -5692,10 +5692,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -5725,10 +5721,6 @@ define void @freeze_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6351,10 +6343,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
; GFX6-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
; GFX6-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX6-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -6384,10 +6372,6 @@ define void @freeze_v3f16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
; GFX7-SDAG-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v4, v0
; GFX7-SDAG-NEXT: buffer_store_short v1, v[2:3], s[4:7], 0 addr64 offset:4
; GFX7-SDAG-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -12355,14 +12339,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-NEXT: s_mov_b32 s5, s6
; GFX6-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX6-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX6-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX6-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX6-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX6-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -12400,14 +12379,9 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-SDAG-NEXT: s_mov_b32 s5, s6
; GFX7-SDAG-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 8, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v0
-; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v4, 8, v4
-; GFX7-SDAG-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX7-SDAG-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; GFX7-SDAG-NEXT: buffer_store_byte v1, v[2:3], s[4:7], 0 addr64 offset:2
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -12482,11 +12456,7 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: global_load_dword v0, v[0:1], off
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: v_lshrrev_b16 v1, 8, v0
-; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-SDAG-NEXT: global_store_byte v[2:3], v4, off offset:2
+; GFX10-SDAG-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2
; GFX10-SDAG-NEXT: global_store_short v[2:3], v0, off
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -12507,36 +12477,15 @@ define void @freeze_v3i8(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX10-GISEL-NEXT: global_store_byte_d16_hi v[2:3], v0, off offset:2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-SDAG-TRUE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-TRUE16: ; %bb.0:
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: global_load_b32 v1, v[0:1], off
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
-; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b16 v0.l, 8, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
-; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_clause 0x1
-; GFX11-SDAG-TRUE16-NEXT: global_store_b8 v[2:3], v4, off offset:2
-; GFX11-SDAG-TRUE16-NEXT: global_store_b16 v[2:3], v0, off
-; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-FAKE16-LABEL: freeze_v3i8:
-; GFX11-SDAG-FAKE16: ; %bb.0:
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b16 v1, 8, v0
-; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v0
-; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-SDAG-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
-; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX11-SDAG-FAKE16-NEXT: s_clause 0x1
-; GFX11-SDAG-FAKE16-NEXT: global_store_b8 v[2:3], v0, off offset:2
-; GFX11-SDAG-FAKE16-NEXT: global_store_b16 v[2:3], v1, off
-; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX11-SDAG-LABEL: freeze_v3i8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: global_store_d16_hi_b8 v[2:3], v0, off offset:2
+; GFX11-SDAG-NEXT: global_store_b16 v[2:3], v0, off
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: freeze_v3i8:
; GFX11-GISEL: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 47dfa9f4fc2d3..8791070d5b229 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -653,11 +653,11 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) {
define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_srem25_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[4:5], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_ashr_i32 s0, s0, 7
+; GCN-NEXT: s_ashr_i32 s0, s1, 7
; GCN-NEXT: s_abs_i32 s8, s0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -694,11 +694,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %
;
; GCN-IR-LABEL: s_test_srem25_64:
; GCN-IR: ; %bb.0:
-; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_ashr_i32 s0, s0, 7
+; GCN-IR-NEXT: s_ashr_i32 s0, s1, 7
; GCN-IR-NEXT: s_abs_i32 s8, s0
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -742,11 +742,11 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %
define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_srem31_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[4:5], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_ashr_i32 s0, s0, 1
+; GCN-NEXT: s_ashr_i32 s0, s1, 1
; GCN-NEXT: s_abs_i32 s8, s0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -783,11 +783,11 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %
;
; GCN-IR-LABEL: s_test_srem31_64:
; GCN-IR: ; %bb.0:
-; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_ashr_i32 s0, s0, 1
+; GCN-IR-NEXT: s_ashr_i32 s0, s1, 1
; GCN-IR-NEXT: s_abs_i32 s8, s0
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -832,11 +832,11 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %
define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-LABEL: s_test_srem32_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s0, s[4:5], 0xe
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_abs_i32 s8, s0
+; GCN-NEXT: s_abs_i32 s8, s1
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: s_sub_i32 s0, 0, s8
; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -871,11 +871,11 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %
;
; GCN-IR-LABEL: s_test_srem32_64:
; GCN-IR: ; %bb.0:
-; GCN-IR-NEXT: s_load_dword s0, s[4:5], 0xe
+; GCN-IR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_abs_i32 s8, s0
+; GCN-IR-NEXT: s_abs_i32 s8, s1
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-IR-NEXT: s_sub_i32 s0, 0, s8
; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -1053,25 +1053,25 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-LABEL: s_test_srem33_64:
; GCN-IR: ; %bb.0: ; %_udiv-special-cases
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd
+; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GCN-IR-NEXT: s_mov_b32 s13, 0
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 31
-; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 31
; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31
+; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[2:3], 31
; GCN-IR-NEXT: s_mov_b32 s5, s4
-; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GCN-IR-NEXT: s_ashr_i64 s[10:11], s[8:9], 31
+; GCN-IR-NEXT: s_xor_b64 s[2:3], s[6:7], s[4:5]
; GCN-IR-NEXT: s_sub_u32 s6, s2, s4
; GCN-IR-NEXT: s_subb_u32 s7, s3, s4
; GCN-IR-NEXT: s_ashr_i32 s2, s9, 31
; GCN-IR-NEXT: s_mov_b32 s3, s2
-; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3]
+; GCN-IR-NEXT: s_xor_b64 s[8:9], s[10:11], s[2:3]
; GCN-IR-NEXT: s_sub_u32 s8, s8, s2
; GCN-IR-NEXT: s_subb_u32 s9, s9, s2
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0
-; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[8:9], 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], 0
+; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0
; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9]
-; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11]
+; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[2:3]
; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[6:7]
; GCN-IR-NEXT: s_sub_u32 s14, s12, s20
; GCN-IR-NEXT: s_subb_u32 s15, 0, 0
diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
index 00f7b462f68db..3acd025897a28 100644
--- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll
@@ -210,7 +210,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a1, 16
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: srliw a1, a1, 24
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: srli a1, a1, 58
; RV64I-NEXT: xori a1, a1, 31
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: or a0, a0, a1
@@ -239,17 +240,19 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: srliw a0, a0, 1
; RV64I-NEXT: beqz a0, .LBB4_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: srliw a1, a0, 1
+; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: addi a1, a2, 1365
-; RV64I-NEXT: srliw a2, a0, 2
+; RV64I-NEXT: srli a2, a0, 2
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: srliw a2, a0, 4
+; RV64I-NEXT: srli a2, a0, 4
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: srliw a2, a0, 8
+; RV64I-NEXT: slli a2, a0, 33
+; RV64I-NEXT: srli a2, a2, 41
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: srliw a2, a0, 16
+; RV64I-NEXT: slli a2, a0, 33
+; RV64I-NEXT: srli a2, a2, 49
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: not a0, a0
; RV64I-NEXT: srli a2, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index e6407279870db..72b1c155596b2 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -204,7 +204,8 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind {
; RV64I-NEXT: add a1, a1, a2
; RV64I-NEXT: slli a2, a1, 16
; RV64I-NEXT: add a1, a1, a2
-; RV64I-NEXT: srliw a1, a1, 24
+; RV64I-NEXT: slli a1, a1, 34
+; RV64I-NEXT: srli a1, a1, 58
; RV64I-NEXT: xori a1, a1, 31
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: or a0, a0, a1
@@ -231,17 +232,19 @@ define i32 @ctlz_lshr_i32(i32 signext %a) {
; RV64I-NEXT: srliw a0, a0, 1
; RV64I-NEXT: beqz a0, .LBB4_2
; RV64I-NEXT: # %bb.1: # %cond.false
-; RV64I-NEXT: srliw a1, a0, 1
+; RV64I-NEXT: srli a1, a0, 1
; RV64I-NEXT: lui a2, 349525
; RV64I-NEXT: or a0, a0, a1
; RV64I-NEXT: addi a1, a2, 1365
-; RV64I-NEXT: srliw a2, a0, 2
+; RV64I-NEXT: srli a2, a0, 2
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: srliw a2, a0, 4
+; RV64I-NEXT: srli a2, a0, 4
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: srliw a2, a0, 8
+; RV64I-NEXT: slli a2, a0, 33
+; RV64I-NEXT: srli a2, a2, 41
; RV64I-NEXT: or a0, a0, a2
-; RV64I-NEXT: srliw a2, a0, 16
+; RV64I-NEXT: slli a2, a0, 33
+; RV64I-NEXT: srli a2, a2, 49
; RV64I-NEXT: or a0, a0, a2
; RV64I-NEXT: not a0, a0
; RV64I-NEXT: srli a2, a0, 1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
index 3740737ba2989..d8ba5df9414cf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vec3-setcc-crash.ll
@@ -13,22 +13,22 @@ define void @vec3_setcc_crash(ptr %in, ptr %out) {
; RV32: # %bb.0:
; RV32-NEXT: lw a0, 0(a0)
; RV32-NEXT: srli a2, a0, 16
-; RV32-NEXT: slli a3, a0, 16
-; RV32-NEXT: slli a4, a0, 24
-; RV32-NEXT: slli a5, a0, 8
-; RV32-NEXT: srli a6, a3, 24
-; RV32-NEXT: srai a3, a3, 24
+; RV32-NEXT: srli a3, a0, 8
+; RV32-NEXT: slli a4, a0, 16
+; RV32-NEXT: slli a5, a0, 24
+; RV32-NEXT: slli a6, a0, 8
; RV32-NEXT: srai a4, a4, 24
; RV32-NEXT: srai a5, a5, 24
+; RV32-NEXT: srai a6, a6, 24
+; RV32-NEXT: sgtz a6, a6
; RV32-NEXT: sgtz a5, a5
; RV32-NEXT: sgtz a4, a4
-; RV32-NEXT: sgtz a3, a3
-; RV32-NEXT: neg a3, a3
; RV32-NEXT: neg a4, a4
; RV32-NEXT: neg a5, a5
-; RV32-NEXT: and a3, a3, a6
-; RV32-NEXT: and a0, a4, a0
-; RV32-NEXT: and a2, a5, a2
+; RV32-NEXT: neg a6, a6
+; RV32-NEXT: and a3, a4, a3
+; RV32-NEXT: and a0, a5, a0
+; RV32-NEXT: and a2, a6, a2
; RV32-NEXT: slli a3, a3, 8
; RV32-NEXT: zext.b a0, a0
; RV32-NEXT: or a0, a0, a3
diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll
index 189de051011d2..716c9dd849216 100644
--- a/llvm/test/CodeGen/X86/freeze-binary.ll
+++ b/llvm/test/CodeGen/X86/freeze-binary.ll
@@ -452,8 +452,7 @@ define i32 @freeze_ashr(i32 %a0) nounwind {
; X86-LABEL: freeze_ashr:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $3, %eax
-; X86-NEXT: sarl $3, %eax
+; X86-NEXT: sarl $6, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr:
@@ -471,15 +470,13 @@ define i32 @freeze_ashr_exact(i32 %a0) nounwind {
; X86-LABEL: freeze_ashr_exact:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: sarl $3, %eax
-; X86-NEXT: sarl $6, %eax
+; X86-NEXT: sarl $9, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_ashr_exact:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: sarl $3, %eax
-; X64-NEXT: sarl $6, %eax
+; X64-NEXT: sarl $9, %eax
; X64-NEXT: retq
%x = ashr exact i32 %a0, 3
%y = freeze i32 %x
@@ -565,8 +562,7 @@ define i32 @freeze_lshr(i32 %a0) nounwind {
; X86-LABEL: freeze_lshr:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: shrl %eax
+; X86-NEXT: shrl $3, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr:
@@ -584,15 +580,13 @@ define i32 @freeze_lshr_exact(i32 %a0) nounwind {
; X86-LABEL: freeze_lshr_exact:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: shrl $5, %eax
+; X86-NEXT: shrl $8, %eax
; X86-NEXT: retl
;
; X64-LABEL: freeze_lshr_exact:
; X64: # %bb.0:
; X64-NEXT: movl %edi, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: shrl $5, %eax
+; X64-NEXT: shrl $8, %eax
; X64-NEXT: retq
%x = lshr exact i32 %a0, 3
%y = freeze i32 %x
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
index 5f6337e29d685..582cc9c3e8055 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll
@@ -426,58 +426,58 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n
define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounwind {
; AVX512F-LABEL: vec512_i16_signed_mem_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsw %ymm0, %ymm1, %ymm5
+; AVX512F-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5
; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_mem_reg:
@@ -507,56 +507,56 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw
define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i16_signed_reg_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsw %ymm3, %ymm2, %ymm5
+; AVX512F-NEXT: vpmaxsw %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubw %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpaddw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm3, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5
; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm3, %ymm6, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -588,60 +588,60 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw
define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i16_signed_mem_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsw %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpminsw %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubw %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpminsw %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpminsw %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsubw %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i16_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rsi), %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpminsw %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i16_signed_mem_mem:
@@ -849,66 +849,64 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw
define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_reg:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT: vpminsb %ymm0, %ymm1, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm0, %ymm6, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_reg:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm6, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_reg:
@@ -939,64 +937,62 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind
define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_reg_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vpminsb %ymm3, %ymm2, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3
+; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_reg_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm4
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm3, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm3, %ymm2, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm7, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm3, %ymm6, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -1029,68 +1025,66 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind
define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind {
; AVX512F-LABEL: vec512_i8_signed_mem_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512F-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vpminsb %ymm1, %ymm0, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512F-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512F-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512F-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpsubb %ymm5, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-FALLBACK-LABEL: vec512_i8_signed_mem_mem:
; AVX512VL-FALLBACK: # %bb.0:
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rsi), %ymm1
-; AVX512VL-FALLBACK-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512VL-FALLBACK-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm3, %ymm4
-; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm5
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512VL-FALLBACK-NEXT: vmovdqa64 (%rsi), %zmm1
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm2, %ymm3, %ymm4
+; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm5
; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm3, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm3, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpminsb %ymm0, %ymm2, %ymm5
-; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm2, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm0, %ymm0
+; AVX512VL-FALLBACK-NEXT: vpminsb %ymm2, %ymm3, %ymm5
+; AVX512VL-FALLBACK-NEXT: vpmaxsb %ymm2, %ymm3, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpbroadcastd {{.*#+}} zmm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-FALLBACK-NEXT: vpandq %zmm6, %zmm5, %zmm5
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5))
-; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0
-; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
+; AVX512VL-FALLBACK-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5))
+; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512VL-FALLBACK-NEXT: retq
;
; AVX512BW-LABEL: vec512_i8_signed_mem_mem: